xref: /netbsd/external/gpl3/gcc/dist/gcc/config/gcn/gcn.cc (revision f0fbc68b)
1 /* Copyright (C) 2016-2022 Free Software Foundation, Inc.
2 
3    This file is free software; you can redistribute it and/or modify it under
4    the terms of the GNU General Public License as published by the Free
5    Software Foundation; either version 3 of the License, or (at your option)
6    any later version.
7 
8    This file is distributed in the hope that it will be useful, but WITHOUT
9    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11    for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with GCC; see the file COPYING3.  If not see
15    <http://www.gnu.org/licenses/>.  */
16 
17 /* {{{ Includes.  */
18 
19 /* We want GET_MODE_SIZE et al to return integers, please.  */
20 #define IN_TARGET_CODE 1
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "memmodel.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic-core.h"
38 #include "insn-attr.h"
39 #include "fold-const.h"
40 #include "calls.h"
41 #include "explow.h"
42 #include "expr.h"
43 #include "output.h"
44 #include "cfgrtl.h"
45 #include "langhooks.h"
46 #include "builtins.h"
47 #include "omp-general.h"
48 #include "print-rtl.h"
49 #include "attribs.h"
50 #include "varasm.h"
51 #include "intl.h"
52 #include "rtl-iter.h"
53 #include "dwarf2.h"
54 #include "gimple.h"
55 
56 /* This file should be included last.  */
57 #include "target-def.h"
58 
59 /* }}}  */
60 /* {{{ Global variables.  */
61 
62 /* Constants used by FP instructions.  */
63 
64 static REAL_VALUE_TYPE dconst4, dconst1over2pi;
65 static bool ext_gcn_constants_init = 0;
66 
67 /* Holds the ISA variant, derived from the command line parameters.  */
68 
69 int gcn_isa = 3;		/* Default to GCN3.  */
70 
71 /* Reserve this much space for LDS (for propagating variables from
72    worker-single mode to worker-partitioned mode), per workgroup.  Global
73    analysis could calculate an exact bound, but we don't do that yet.
74 
75    We want to permit full occupancy, so size accordingly.  */
76 
77 /* Use this as a default, but allow it to grow if the user requests a large
78    amount of gang-private shared-memory space.  */
79 static int acc_lds_size = 0x600;
80 
81 #define OMP_LDS_SIZE 0x600    /* 0x600 is 1/40 total, rounded down.  */
82 #define ACC_LDS_SIZE acc_lds_size
83 #define OTHER_LDS_SIZE 65536  /* If in doubt, reserve all of it.  */
84 
85 #define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
86 		  : flag_openmp ? OMP_LDS_SIZE \
87 		  : OTHER_LDS_SIZE)
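/* Illustrative arithmetic, not part of the upstream file: 64 KiB of LDS
   divided among 40 workgroups is 1638.4 bytes, which rounds down to 0x600
   (1536).  So with the defaults above LDS_SIZE is 1536 bytes for OpenACC
   (unless gcn_option_override grows acc_lds_size) and for OpenMP, and the
   full 65536 bytes otherwise.  */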
88 
89 static int gang_private_hwm = 32;
90 static hash_map<tree, int> lds_allocs;
91 
92 /* The number of registers usable by normal non-kernel functions.
93    The SGPR count includes any special extra registers such as VCC.  */
94 
95 #define MAX_NORMAL_SGPR_COUNT	62  // i.e. 64 with VCC
96 #define MAX_NORMAL_VGPR_COUNT	24
97 
98 /* }}}  */
99 /* {{{ Initialization and options.  */
100 
101 /* Initialize machine_function.  */
102 
103 static struct machine_function *
104 gcn_init_machine_status (void)
105 {
106   struct machine_function *f;
107 
108   f = ggc_cleared_alloc<machine_function> ();
109 
110   if (TARGET_GCN3)
111     f->use_flat_addressing = true;
112 
113   return f;
114 }
115 
116 /* Implement TARGET_OPTION_OVERRIDE.
117 
118    Override option settings where defaults are variable, or we have specific
119    needs to consider.  */
120 
121 static void
122 gcn_option_override (void)
123 {
124   init_machine_status = gcn_init_machine_status;
125 
126   /* The HSA runtime does not respect ELF load addresses, so force PIE.  */
127   if (!flag_pie)
128     flag_pie = 2;
129   if (!flag_pic)
130     flag_pic = flag_pie;
131 
132   gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
133 
134   /* The default stack size needs to be small for offload kernels because
135      there may be many, many threads.  Also, a smaller stack gives a
136      measurable performance boost.  But a small stack is insufficient
137      for running the testsuite, so we use a larger default for the
138      stand-alone case.  */
139   if (stack_size_opt == -1)
140     {
141       if (flag_openacc || flag_openmp)
142 	/* 512 bytes per work item = 32kB total.  */
143 	stack_size_opt = 512 * 64;
144       else
145 	/* 1MB total.  */
146 	stack_size_opt = 1048576;
147     }
148 
149   /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and
150      worker broadcasts.  */
151   if (gang_private_size_opt == -1)
152     gang_private_size_opt = 512;
153   else if (gang_private_size_opt < gang_private_hwm)
154     gang_private_size_opt = gang_private_hwm;
155   else if (gang_private_size_opt >= acc_lds_size - 1024)
156     {
157       /* We need some space for reductions and worker broadcasting.  If the
158 	 user requests a large amount of gang-private LDS space, we might not
159 	 have enough left for the former.  Increase the LDS allocation in that
160 	 case, although this may reduce the maximum occupancy on the
161 	 hardware.  */
162       acc_lds_size = gang_private_size_opt + 1024;
163       if (acc_lds_size > 32768)
164 	acc_lds_size = 32768;
165     }
166 
167   /* The xnack option is a placeholder, for now.  */
168   if (flag_xnack)
169     sorry ("XNACK support");
170 }
171 
172 /* }}}  */
173 /* {{{ Attributes.  */
174 
175 /* This table defines the arguments that are permitted in
176    __attribute__ ((amdgpu_hsa_kernel (...))).
177 
178    The names and values correspond to the HSA metadata that is encoded
179    into the assembler file and binary.  */
180 
181 static const struct gcn_kernel_arg_type
182 {
183   const char *name;
184   const char *header_pseudo;
185   machine_mode mode;
186 
187   /* This should be set to -1 or -2 for a dynamically allocated register
188      number.  Use -1 if this argument contributes to the user_sgpr_count,
189      -2 otherwise.  */
190   int fixed_regno;
191 } gcn_kernel_arg_types[] = {
192   {"exec", NULL, DImode, EXEC_REG},
193 #define PRIVATE_SEGMENT_BUFFER_ARG 1
194   {"private_segment_buffer",
195     ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
196 #define DISPATCH_PTR_ARG 2
197   {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
198 #define QUEUE_PTR_ARG 3
199   {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
200 #define KERNARG_SEGMENT_PTR_ARG 4
201   {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
202   {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
203 #define FLAT_SCRATCH_INIT_ARG 6
204   {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
205 #define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
206   {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
207 #define WORKGROUP_ID_X_ARG 8
208   {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
209   {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
210   {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
211   {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
212 #define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
213   {"private_segment_wave_offset",
214     ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
215 #define WORK_ITEM_ID_X_ARG 13
216   {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
217 #define WORK_ITEM_ID_Y_ARG 14
218   {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
219 #define WORK_ITEM_ID_Z_ARG 15
220   {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
221 };
222 
223 static const long default_requested_args
224 	= (1 << PRIVATE_SEGMENT_BUFFER_ARG)
225 	  | (1 << DISPATCH_PTR_ARG)
226 	  | (1 << QUEUE_PTR_ARG)
227 	  | (1 << KERNARG_SEGMENT_PTR_ARG)
228 	  | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
229 	  | (1 << WORKGROUP_ID_X_ARG)
230 	  | (1 << WORK_ITEM_ID_X_ARG)
231 	  | (1 << WORK_ITEM_ID_Y_ARG)
232 	  | (1 << WORK_ITEM_ID_Z_ARG);
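/* Usage sketch, for illustration only (my_kernel is a hypothetical name,
   not something defined elsewhere in GCC): a kernel can request arguments
   beyond default_requested_args by listing their names, e.g.

     __attribute__ ((amdgpu_hsa_kernel ("flat_scratch_init",
					 "workgroup_id_Y")))
     void my_kernel (void);

   Each string must match a "name" field in gcn_kernel_arg_types; naming an
   argument that is already in default_requested_args is rejected as a
   duplicate by the parser below.  */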
233 
234 /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
235    This function also sets the default values for some arguments.
236 
237    Return true if an error was diagnosed; ARGS is populated regardless.  */
238 
239 static bool
240 gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
241 				       tree list)
242 {
243   bool err = false;
244   args->requested = default_requested_args;
245   args->nargs = 0;
246 
247   for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
248     args->reg[a] = -1;
249 
250   for (; list; list = TREE_CHAIN (list))
251     {
252       const char *str;
253       if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
254 	{
255 	  error ("%<amdgpu_hsa_kernel%> attribute requires string constant "
256 		 "arguments");
257 	  break;
258 	}
259       str = TREE_STRING_POINTER (TREE_VALUE (list));
260       int a;
261       for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
262 	{
263 	  if (!strcmp (str, gcn_kernel_arg_types[a].name))
264 	    break;
265 	}
266       if (a == GCN_KERNEL_ARG_TYPES)
267 	{
268 	  error ("unknown specifier %qs in %<amdgpu_hsa_kernel%> attribute",
269 		 str);
270 	  err = true;
271 	  break;
272 	}
273       if (args->requested & (1 << a))
274 	{
275 	  error ("duplicated parameter specifier %qs in %<amdgpu_hsa_kernel%> "
276 		 "attribute", str);
277 	  err = true;
278 	  break;
279 	}
280       args->requested |= (1 << a);
281       args->order[args->nargs++] = a;
282     }
283 
284   /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
285      WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
286      requesting WORK_ITEM_ID_X_ARG.  */
287   if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
288     args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
289   if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
290     args->requested |= (1 << WORK_ITEM_ID_X_ARG);
291 
292   int sgpr_regno = FIRST_SGPR_REG;
293   args->nsgprs = 0;
294   for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
295     {
296       if (!(args->requested & (1 << a)))
297 	continue;
298 
299       if (gcn_kernel_arg_types[a].fixed_regno >= 0)
300 	args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
301       else
302 	{
303 	  int reg_count;
304 
305 	  switch (gcn_kernel_arg_types[a].mode)
306 	    {
307 	    case E_SImode:
308 	      reg_count = 1;
309 	      break;
310 	    case E_DImode:
311 	      reg_count = 2;
312 	      break;
313 	    case E_TImode:
314 	      reg_count = 4;
315 	      break;
316 	    default:
317 	      gcc_unreachable ();
318 	    }
319 	  args->reg[a] = sgpr_regno;
320 	  sgpr_regno += reg_count;
321 	  if (gcn_kernel_arg_types[a].fixed_regno == -1)
322 	    args->nsgprs += reg_count;
323 	}
324     }
325   if (sgpr_regno > FIRST_SGPR_REG + 16)
326     {
327       error ("too many arguments passed in sgpr registers");
328     }
329   return err;
330 }
331 
332 /* Referenced by TARGET_ATTRIBUTE_TABLE.
333 
334    Validates target specific attributes.  */
335 
336 static tree
337 gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
338 					tree args, int, bool *no_add_attrs)
339 {
340   if (!FUNC_OR_METHOD_TYPE_P (*node))
341     {
342       warning (OPT_Wattributes, "%qE attribute only applies to functions",
343 	       name);
344       *no_add_attrs = true;
345       return NULL_TREE;
346     }
347 
348   /* Can combine regparm with all attributes but fastcall, and thiscall.  */
349   if (is_attribute_p ("gcnhsa_kernel", name))
350     {
351       struct gcn_kernel_args kernelarg;
352 
353       if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
354 	*no_add_attrs = true;
355 
356       return NULL_TREE;
357     }
358 
359   return NULL_TREE;
360 }
361 
362 /* Implement TARGET_ATTRIBUTE_TABLE.
363 
364    Create target-specific __attribute__ types.  */
365 
366 static const struct attribute_spec gcn_attribute_table[] = {
367   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
368      affects_type_identity } */
369   {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
370    true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
371   /* End element.  */
372   {NULL, 0, 0, false, false, false, false, NULL, NULL}
373 };
374 
375 /* }}}  */
376 /* {{{ Registers and modes.  */
377 
378 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P.  */
379 
380 bool
381 gcn_scalar_mode_supported_p (scalar_mode mode)
382 {
383   return (mode == BImode
384 	  || mode == QImode
385 	  || mode == HImode /* || mode == HFmode  */
386 	  || mode == SImode || mode == SFmode
387 	  || mode == DImode || mode == DFmode
388 	  || mode == TImode);
389 }
390 
391 /* Implement TARGET_CLASS_MAX_NREGS.
392 
393    Return the number of hard registers needed to hold a value of MODE in
394    a register of class RCLASS.  */
395 
396 static unsigned char
397 gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
398 {
399   /* Scalar registers are 32-bit; vector registers are in fact tuples of
400      64 lanes.  */
401   if (rclass == VGPR_REGS)
402     {
403       if (vgpr_1reg_mode_p (mode))
404 	return 1;
405       if (vgpr_2reg_mode_p (mode))
406 	return 2;
407       /* TImode is used by DImode compare_and_swap.  */
408       if (mode == TImode)
409 	return 4;
410     }
411   else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
412     return 2;
413   return CEIL (GET_MODE_SIZE (mode), 4);
414 }
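/* For illustration, applying the rules above: SImode needs 1 SGPR and
   DImode needs 2; a V64SImode value occupies a single VGPR (one 32-bit
   register per lane) and V64DImode a VGPR pair; TImode takes 4 VGPRs; and
   BImode in VCC_CONDITIONAL_REG still counts as 2 because VCC is a 64-bit
   register pair.  */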
415 
416 /* Implement TARGET_HARD_REGNO_NREGS.
417 
418    Return the number of hard registers needed to hold a value of MODE in
419    REGNO.  */
420 
421 unsigned int
422 gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
423 {
424   return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
425 }
426 
427 /* Implement TARGET_HARD_REGNO_MODE_OK.
428 
429    Return true if REGNO can hold value in MODE.  */
430 
431 bool
432 gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
433 {
434   /* Treat a complex mode as if it were a scalar mode of the same overall
435      size for the purposes of allocating hard registers.  */
436   if (COMPLEX_MODE_P (mode))
437     switch (mode)
438       {
439       case E_CQImode:
440       case E_CHImode:
441 	mode = SImode;
442 	break;
443       case E_CSImode:
444 	mode = DImode;
445 	break;
446       case E_CDImode:
447 	mode = TImode;
448 	break;
449       case E_HCmode:
450 	mode = SFmode;
451 	break;
452       case E_SCmode:
453 	mode = DFmode;
454 	break;
455       default:
456 	/* Not supported.  */
457 	return false;
458       }
459 
460   switch (regno)
461     {
462     case FLAT_SCRATCH_LO_REG:
463     case XNACK_MASK_LO_REG:
464     case TBA_LO_REG:
465     case TMA_LO_REG:
466       return (mode == SImode || mode == DImode);
467     case VCC_LO_REG:
468     case EXEC_LO_REG:
469       return (mode == BImode || mode == SImode || mode == DImode);
470     case M0_REG:
471     case FLAT_SCRATCH_HI_REG:
472     case XNACK_MASK_HI_REG:
473     case TBA_HI_REG:
474     case TMA_HI_REG:
475       return mode == SImode;
476     case VCC_HI_REG:
477       return false;
478     case EXEC_HI_REG:
479       return mode == SImode /*|| mode == V32BImode */ ;
480     case SCC_REG:
481     case VCCZ_REG:
482     case EXECZ_REG:
483       return mode == BImode;
484     }
485   if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
486     return true;
487   if (SGPR_REGNO_P (regno))
488     /* We restrict double register values to aligned registers.  */
489     return (sgpr_1reg_mode_p (mode)
490 	    || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
491 	    || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
492   if (VGPR_REGNO_P (regno))
493     /* Vector instructions do not care about the alignment of register
494        pairs, but where there is no 64-bit instruction, many of the
495        define_split do not work if the input and output registers partially
496        overlap.  We tried to fix this with early clobber and match
497        constraints, but it was bug prone, added complexity, and conflicts
498        with the 'U0' constraints on vec_merge.
499        Therefore, we restrict ourselves to aligned registers.  */
500     return (vgpr_1reg_mode_p (mode)
501 	    || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
502 	    /* TImode is used by DImode compare_and_swap.  */
503 	    || (mode == TImode
504 		&& !((regno - FIRST_VGPR_REG) & 3)));
505   return false;
506 }
507 
508 /* Implement REGNO_REG_CLASS via gcn.h.
509 
510    Return smallest class containing REGNO.  */
511 
512 enum reg_class
513 gcn_regno_reg_class (int regno)
514 {
515   switch (regno)
516     {
517     case SCC_REG:
518       return SCC_CONDITIONAL_REG;
519     case VCC_LO_REG:
520     case VCC_HI_REG:
521       return VCC_CONDITIONAL_REG;
522     case VCCZ_REG:
523       return VCCZ_CONDITIONAL_REG;
524     case EXECZ_REG:
525       return EXECZ_CONDITIONAL_REG;
526     case EXEC_LO_REG:
527     case EXEC_HI_REG:
528       return EXEC_MASK_REG;
529     }
530   if (VGPR_REGNO_P (regno))
531     return VGPR_REGS;
532   if (SGPR_REGNO_P (regno))
533     return SGPR_REGS;
534   if (regno < FIRST_VGPR_REG)
535     return GENERAL_REGS;
536   if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
537     return AFP_REGS;
538   return ALL_REGS;
539 }
540 
541 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.
542 
543    GCC assumes that lowpart contains first part of value as stored in memory.
544    This is not the case for vector registers.  */
545 
546 bool
547 gcn_can_change_mode_class (machine_mode from, machine_mode to,
548 			   reg_class_t regclass)
549 {
550   if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
551     return true;
552   return (gcn_class_max_nregs (regclass, from)
553 	  == gcn_class_max_nregs (regclass, to));
554 }
555 
556 /* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
557 
558    When this hook returns true for MODE, the compiler allows
559    registers explicitly used in the rtl to be used as spill registers
560    but prevents the compiler from extending the lifetime of these
561    registers.  */
562 
563 bool
564 gcn_small_register_classes_for_mode_p (machine_mode mode)
565 {
566   /* We allocate into exec and vcc regs; those form small register classes.  */
567   return mode == DImode || mode == SImode;
568 }
569 
570 /* Implement TARGET_CLASS_LIKELY_SPILLED_P.
571 
572    Returns true if pseudos that have been assigned to registers of class RCLASS
573    would likely be spilled because registers of RCLASS are needed for spill
574    registers.  */
575 
576 static bool
577 gcn_class_likely_spilled_p (reg_class_t rclass)
578 {
579   return (rclass == EXEC_MASK_REG
580 	  || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
581 }
582 
583 /* Implement TARGET_MODES_TIEABLE_P.
584 
585    Returns true if a value of MODE1 is accessible in MODE2 without
586    copying.  */
587 
588 bool
589 gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
590 {
591   return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
592 	  && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
593 }
594 
595 /* Implement TARGET_TRULY_NOOP_TRUNCATION.
596 
597    Returns true if it is safe to “convert” a value of INPREC bits to one of
598    OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
599    it as if it had only OUTPREC bits.  */
600 
601 bool
602 gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
603 {
604   return ((inprec <= 32) && (outprec <= inprec));
605 }
606 
607 /* Return N-th part of value occupying multiple registers.  */
608 
609 rtx
610 gcn_operand_part (machine_mode mode, rtx op, int n)
611 {
612   if (GET_MODE_SIZE (mode) >= 256)
613     {
614       /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0);  */
615 
616       if (REG_P (op))
617 	{
618 	  gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
619 	  return gen_rtx_REG (V64SImode, REGNO (op) + n);
620 	}
621       if (GET_CODE (op) == CONST_VECTOR)
622 	{
623 	  int units = GET_MODE_NUNITS (mode);
624 	  rtvec v = rtvec_alloc (units);
625 
626 	  for (int i = 0; i < units; ++i)
627 	    RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
628 						 CONST_VECTOR_ELT (op, i), n);
629 
630 	  return gen_rtx_CONST_VECTOR (V64SImode, v);
631 	}
632       if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
633 	return gcn_gen_undef (V64SImode);
634       gcc_unreachable ();
635     }
636   else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
637     {
638       gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
639       return gen_rtx_REG (SImode, REGNO (op) + n);
640     }
641   else
642     {
643       if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
644 	return gcn_gen_undef (SImode);
645 
646       /* If it's a constant then let's assume it is of the largest mode
647 	 available, otherwise simplify_gen_subreg will fail.  */
648       if (mode == VOIDmode && CONST_INT_P (op))
649 	mode = DImode;
650       return simplify_gen_subreg (SImode, op, mode, n * 4);
651     }
652 }
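/* Usage sketch (illustrative): for a DImode value living in an SGPR pair,
   part 0 is the low SImode register and part 1 the high one; for a
   256-byte vector mode such as V64DImode each part is a whole V64SImode
   register.  Constants without a mode are treated as DImode and handed to
   simplify_gen_subreg.  */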
653 
654 /* Return the N-th DImode part of a value occupying multiple registers.  */
655 
656 rtx
657 gcn_operand_doublepart (machine_mode mode, rtx op, int n)
658 {
659   return simplify_gen_subreg (DImode, op, mode, n * 8);
660 }
661 
662 /* Return true if OP can be split into subregs or high/low parts.
663    This is always true for scalars, but not normally true for vectors.
664    However, for vectors in hardregs we can use the low and high registers.  */
665 
666 bool
667 gcn_can_split_p (machine_mode, rtx op)
668 {
669   if (vgpr_vector_mode_p (GET_MODE (op)))
670     {
671       if (GET_CODE (op) == SUBREG)
672 	op = SUBREG_REG (op);
673       if (!REG_P (op))
674 	return true;
675       return REGNO (op) <= FIRST_PSEUDO_REGISTER;
676     }
677   return true;
678 }
679 
680 /* Implement TARGET_SPILL_CLASS.
681 
682    Return class of registers which could be used for pseudo of MODE
683    and of class RCLASS for spilling instead of memory.  Return NO_REGS
684    if it is not possible or non-profitable.  */
685 
686 static reg_class_t
687 gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
688 {
689   if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
690       || c == VCC_CONDITIONAL_REG)
691     return SGPR_REGS;
692   else
693     return NO_REGS;
694 }
695 
696 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
697 
698    Change allocno class for given pseudo from allocno and best class
699    calculated by IRA.  */
700 
701 static reg_class_t
702 gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
703 				     reg_class_t best_cl)
704 {
705   /* Avoid returning classes that contain both vgpr and sgpr registers.  */
706   if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
707     return cl;
708   if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
709       && best_cl != ALL_GPR_REGS)
710     return best_cl;
711 
712   machine_mode mode = PSEUDO_REGNO_MODE (regno);
713   if (vgpr_vector_mode_p (mode))
714     return VGPR_REGS;
715 
716   return GENERAL_REGS;
717 }
718 
719 /* Create a new DImode pseudo reg and emit an instruction to initialize
720    it to VAL.  */
721 
722 static rtx
723 get_exec (int64_t val)
724 {
725   rtx reg = gen_reg_rtx (DImode);
726   emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
727   return reg;
728 }
729 
730 /* Return value of scalar exec register.  */
731 
732 rtx
733 gcn_scalar_exec ()
734 {
735   return const1_rtx;
736 }
737 
738 /* Return pseudo holding scalar exec register.  */
739 
740 rtx
741 gcn_scalar_exec_reg ()
742 {
743   return get_exec (1);
744 }
745 
746 /* Return value of full exec register.  */
747 
748 rtx
749 gcn_full_exec ()
750 {
751   return constm1_rtx;
752 }
753 
754 /* Return pseudo holding full exec register.  */
755 
756 rtx
757 gcn_full_exec_reg ()
758 {
759   return get_exec (-1);
760 }
761 
762 /* }}}  */
763 /* {{{ Immediate constants.  */
764 
765 /* Initialize shared numeric constants.  */
766 
767 static void
768 init_ext_gcn_constants (void)
769 {
770   real_from_integer (&dconst4, DFmode, 4, SIGNED);
771 
772   /* FIXME: this constant probably does not match what hardware really loads.
773      Reality check it eventually.  */
774   real_from_string (&dconst1over2pi,
775 		    "0.1591549430918953357663423455968866839");
776   real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
777 
778   ext_gcn_constants_init = 1;
779 }
780 
781 /* Return non-zero if X is a constant that can appear as an inline operand.
782    This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi),
783    or a vector of those.
784    The value returned should be the encoding of this constant.  */
785 
786 int
787 gcn_inline_fp_constant_p (rtx x, bool allow_vector)
788 {
789   machine_mode mode = GET_MODE (x);
790 
791   if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
792       && allow_vector)
793     {
794       int n;
795       if (GET_CODE (x) != CONST_VECTOR)
796 	return 0;
797       n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
798       if (!n)
799 	return 0;
800       for (int i = 1; i < 64; i++)
801 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
802 	  return 0;
803       return 1;
804     }
805 
806   if (mode != HFmode && mode != SFmode && mode != DFmode)
807     return 0;
808 
809   const REAL_VALUE_TYPE *r;
810 
811   if (x == CONST0_RTX (mode))
812     return 128;
813   if (x == CONST1_RTX (mode))
814     return 242;
815 
816   r = CONST_DOUBLE_REAL_VALUE (x);
817 
818   if (real_identical (r, &dconstm1))
819     return 243;
820 
821   if (real_identical (r, &dconsthalf))
822     return 240;
823   if (real_identical (r, &dconstm1))
824     return 243;
825   if (real_identical (r, &dconst2))
826     return 244;
827   if (real_identical (r, &dconst4))
828     return 246;
829   if (real_identical (r, &dconst1over2pi))
830     return 248;
831   if (!ext_gcn_constants_init)
832     init_ext_gcn_constants ();
833   real_value_negate (r);
834   if (real_identical (r, &dconsthalf))
835     return 241;
836   if (real_identical (r, &dconst2))
837     return 245;
838   if (real_identical (r, &dconst4))
839     return 247;
840 
841   /* FIXME: add 4, -4 and 1/(2*PI).  */
842 
843   return 0;
844 }
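/* For reference, the encodings returned above are:
     128 = 0.0    240 = 0.5    241 = -0.5   242 = 1.0    243 = -1.0
     244 = 2.0    245 = -2.0   246 = 4.0    247 = -4.0   248 = 1/(2*pi)
   and 0 means "not an inline constant" (uniform vectors return 1).  */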
845 
846 /* Return true if X is a floating-point constant usable as an immediate
847    operand: any of the inline constants above, any HFmode or SFmode
848    constant (these can be encoded as a 32-bit literal), or a uniform
849    vector of such a constant.  */
850 
851 bool
852 gcn_fp_constant_p (rtx x, bool allow_vector)
853 {
854   machine_mode mode = GET_MODE (x);
855 
856   if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
857       && allow_vector)
858     {
859       int n;
860       if (GET_CODE (x) != CONST_VECTOR)
861 	return false;
862       n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
863       if (!n)
864 	return false;
865       for (int i = 1; i < 64; i++)
866 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
867 	  return false;
868       return true;
869     }
870   if (mode != HFmode && mode != SFmode && mode != DFmode)
871     return false;
872 
873   if (gcn_inline_fp_constant_p (x, false))
874     return true;
875   /* FIXME: It is not clear how 32bit immediates are interpreted here.  */
876   return (mode != DFmode);
877 }
878 
879 /* Return true if X is a constant representable as an inline immediate
880    constant in a 32-bit instruction encoding.  */
881 
882 bool
883 gcn_inline_constant_p (rtx x)
884 {
885   if (GET_CODE (x) == CONST_INT)
886     return INTVAL (x) >= -16 && INTVAL (x) <= 64;
887   if (GET_CODE (x) == CONST_DOUBLE)
888     return gcn_inline_fp_constant_p (x, false);
889   if (GET_CODE (x) == CONST_VECTOR)
890     {
891       int n;
892       if (!vgpr_vector_mode_p (GET_MODE (x)))
893 	return false;
894       n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
895       if (!n)
896 	return false;
897       for (int i = 1; i < 64; i++)
898 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
899 	  return false;
900       return 1;
901     }
902   return false;
903 }
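/* Illustrative examples of the rule above: (const_int 64) and
   (const_int -16) are inline immediates, while (const_int 65) and
   (const_int -17) are not; a CONST_VECTOR qualifies only when every lane
   repeats a single qualifying value.  */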
904 
905 /* Return true if X is a constant representable as an immediate constant
906    in a 32 or 64-bit instruction encoding.  */
907 
908 bool
909 gcn_constant_p (rtx x)
910 {
911   switch (GET_CODE (x))
912     {
913     case CONST_INT:
914       return true;
915 
916     case CONST_DOUBLE:
917       return gcn_fp_constant_p (x, false);
918 
919     case CONST_VECTOR:
920       {
921 	int n;
922 	if (!vgpr_vector_mode_p (GET_MODE (x)))
923 	  return false;
924 	n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
925 	if (!n)
926 	  return false;
927 	for (int i = 1; i < 64; i++)
928 	  if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
929 	    return false;
930 	return true;
931       }
932 
933     case SYMBOL_REF:
934     case LABEL_REF:
935       return true;
936 
937     default:
938       ;
939     }
940 
941   return false;
942 }
943 
944 /* Return true if X is a constant representable as two inline immediate
945    constants in a 64-bit instruction that is split into two 32-bit
946    instructions.
947    When MIXED is set, the low-part is permitted to use the full 32-bits.  */
948 
949 bool
950 gcn_inline_constant64_p (rtx x, bool mixed)
951 {
952   if (GET_CODE (x) == CONST_VECTOR)
953     {
954       if (!vgpr_vector_mode_p (GET_MODE (x)))
955 	return false;
956       if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
957 	return false;
958       for (int i = 1; i < 64; i++)
959 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
960 	  return false;
961 
962       return true;
963     }
964 
965   if (GET_CODE (x) != CONST_INT)
966     return false;
967 
968   rtx val_lo = gcn_operand_part (DImode, x, 0);
969   rtx val_hi = gcn_operand_part (DImode, x, 1);
970   return ((mixed || gcn_inline_constant_p (val_lo))
971 	  && gcn_inline_constant_p (val_hi));
972 }
973 
974 /* Return true if X is a constant representable as an immediate constant
975    in a 32 or 64-bit instruction encoding where the hardware will
976    extend the immediate to 64-bits.  */
977 
978 bool
979 gcn_constant64_p (rtx x)
980 {
981   if (!gcn_constant_p (x))
982     return false;
983 
984   if (GET_CODE (x) != CONST_INT)
985     return true;
986 
987   /* Negative numbers are only allowed if they can be encoded within src0,
988      because the 32-bit immediates do not get sign-extended.
989      Unsigned numbers must not be encodable as 32-bit -1..-16, because the
990      assembler will use a src0 inline immediate and that will get
991      sign-extended.  */
992   HOST_WIDE_INT val = INTVAL (x);
993   return (((val & 0xffffffff) == val	/* Positive 32-bit.  */
994 	   && (val & 0xfffffff0) != 0xfffffff0)	/* Not -1..-16.  */
995 	  || gcn_inline_constant_p (x));	/* Src0.  */
996 }
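/* Illustrative examples: 0x12345678 is accepted (a positive 32-bit
   literal) and -5 is accepted because it fits the inline src0 range, but
   0xfffffff5 is rejected because the assembler would emit it as the
   inline immediate -11 and sign-extend it to 64 bits.  */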
997 
998 /* Implement TARGET_LEGITIMATE_CONSTANT_P.
999 
1000    Returns true if X is a legitimate constant for a MODE immediate operand.  */
1001 
1002 bool
1003 gcn_legitimate_constant_p (machine_mode, rtx x)
1004 {
1005   return gcn_constant_p (x);
1006 }
1007 
1008 /* Return true if X is a CONST_VECTOR of a single repeated constant.  */
1009 
1010 static bool
1011 single_cst_vector_p (rtx x)
1012 {
1013   if (GET_CODE (x) != CONST_VECTOR)
1014     return false;
1015   for (int i = 1; i < 64; i++)
1016     if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
1017       return false;
1018   return true;
1019 }
1020 
1021 /* Create a CONST_VECTOR of duplicated value A.  */
1022 
1023 rtx
1024 gcn_vec_constant (machine_mode mode, int a)
1025 {
1026   /*if (!a)
1027     return CONST0_RTX (mode);
1028   if (a == -1)
1029     return CONSTM1_RTX (mode);
1030   if (a == 1)
1031     return CONST1_RTX (mode);
1032   if (a == 2)
1033     return CONST2_RTX (mode);*/
1034 
1035   int units = GET_MODE_NUNITS (mode);
1036   machine_mode innermode = GET_MODE_INNER (mode);
1037 
1038   rtx tem;
1039   if (FLOAT_MODE_P (innermode))
1040     {
1041       REAL_VALUE_TYPE rv;
1042       real_from_integer (&rv, NULL, a, SIGNED);
1043       tem = const_double_from_real_value (rv, innermode);
1044     }
1045   else
1046     tem = gen_int_mode (a, innermode);
1047 
1048   rtvec v = rtvec_alloc (units);
1049   for (int i = 0; i < units; ++i)
1050     RTVEC_ELT (v, i) = tem;
1051 
1052   return gen_rtx_CONST_VECTOR (mode, v);
1053 }
1054 
1055 /* Create a CONST_VECTOR of duplicated value A.  */
1056 
1057 rtx
1058 gcn_vec_constant (machine_mode mode, rtx a)
1059 {
1060   int units = GET_MODE_NUNITS (mode);
1061   rtvec v = rtvec_alloc (units);
1062 
1063   for (int i = 0; i < units; ++i)
1064     RTVEC_ELT (v, i) = a;
1065 
1066   return gen_rtx_CONST_VECTOR (mode, v);
1067 }
1068 
1069 /* Create an undefined vector value, used where an insn operand is
1070    optional.  */
1071 
1072 rtx
1073 gcn_gen_undef (machine_mode mode)
1074 {
1075   return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
1076 }
1077 
1078 /* }}}  */
1079 /* {{{ Addresses, pointers and moves.  */
1080 
1081 /* Return true if REG is a valid place to store a pointer,
1082    for instructions that require an SGPR.
1083    FIXME rename. */
1084 
1085 static bool
1086 gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
1087 {
1088   if (GET_CODE (reg) == SUBREG)
1089     reg = SUBREG_REG (reg);
1090 
1091   if (!REG_P (reg))
1092     return false;
1093 
1094   if (GET_MODE (reg) != mode)
1095     return false;
1096 
1097   int regno = REGNO (reg);
1098 
1099   if (regno >= FIRST_PSEUDO_REGISTER)
1100     {
1101       if (!strict)
1102 	return true;
1103 
1104       if (!reg_renumber)
1105 	return false;
1106 
1107       regno = reg_renumber[regno];
1108     }
1109 
1110   return (SGPR_REGNO_P (regno) || regno == M0_REG
1111 	  || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1112 }
1113 
1114 /* Return true if REG is a valid place to store a pointer,
1115    for instructions that require a VGPR.  */
1116 
1117 static bool
1118 gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
1119 {
1120   if (GET_CODE (reg) == SUBREG)
1121     reg = SUBREG_REG (reg);
1122 
1123   if (!REG_P (reg))
1124     return false;
1125 
1126   if (GET_MODE (reg) != mode)
1127     return false;
1128 
1129   int regno = REGNO (reg);
1130 
1131   if (regno >= FIRST_PSEUDO_REGISTER)
1132     {
1133       if (!strict)
1134 	return true;
1135 
1136       if (!reg_renumber)
1137 	return false;
1138 
1139       regno = reg_renumber[regno];
1140     }
1141 
1142   return VGPR_REGNO_P (regno);
1143 }
1144 
1145 /* Return true if X would be valid inside a MEM using the Flat address
1146    space.  */
1147 
1148 bool
1149 gcn_flat_address_p (rtx x, machine_mode mode)
1150 {
1151   bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1152 		   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1153 
1154   if (vec_mode && gcn_address_register_p (x, DImode, false))
1155     return true;
1156 
1157   if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
1158     return true;
1159 
1160   if (TARGET_GCN5_PLUS
1161       && GET_CODE (x) == PLUS
1162       && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
1163       && CONST_INT_P (XEXP (x, 1)))
1164     return true;
1165 
1166   return false;
1167 }
1168 
1169 /* Return true if X would be valid inside a MEM using the Scalar Flat
1170    address space.  */
1171 
1172 bool
1173 gcn_scalar_flat_address_p (rtx x)
1174 {
1175   if (gcn_address_register_p (x, DImode, false))
1176     return true;
1177 
1178   if (GET_CODE (x) == PLUS
1179       && gcn_address_register_p (XEXP (x, 0), DImode, false)
1180       && CONST_INT_P (XEXP (x, 1)))
1181     return true;
1182 
1183   return false;
1184 }
1185 
1186 /* Return true if MEM X would be valid for the Scalar Flat address space.  */
1187 
1188 bool
1189 gcn_scalar_flat_mem_p (rtx x)
1190 {
1191   if (!MEM_P (x))
1192     return false;
1193 
1194   if (GET_MODE_SIZE (GET_MODE (x)) < 4)
1195     return false;
1196 
1197   return gcn_scalar_flat_address_p (XEXP (x, 0));
1198 }
1199 
1200 /* Return true if X would be valid inside a MEM using the LDS or GDS
1201    address spaces.  */
1202 
1203 bool
1204 gcn_ds_address_p (rtx x)
1205 {
1206   if (gcn_vec_address_register_p (x, SImode, false))
1207     return true;
1208 
1209   if (GET_CODE (x) == PLUS
1210       && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
1211       && CONST_INT_P (XEXP (x, 1)))
1212     return true;
1213 
1214   return false;
1215 }
1216 
1217 /* Return true if ADDR would be valid inside a MEM using the Global
1218    address space.  */
1219 
1220 bool
1221 gcn_global_address_p (rtx addr)
1222 {
1223   if (gcn_address_register_p (addr, DImode, false)
1224       || gcn_vec_address_register_p (addr, DImode, false))
1225     return true;
1226 
1227   if (GET_CODE (addr) == PLUS)
1228     {
1229       rtx base = XEXP (addr, 0);
1230       rtx offset = XEXP (addr, 1);
1231       bool immediate_p = (CONST_INT_P (offset)
1232 			  && INTVAL (offset) >= -(1 << 12)
1233 			  && INTVAL (offset) < (1 << 12));
1234 
1235       if ((gcn_address_register_p (base, DImode, false)
1236 	   || gcn_vec_address_register_p (base, DImode, false))
1237 	  && immediate_p)
1238 	/* SGPR + CONST or VGPR + CONST  */
1239 	return true;
1240 
1241       if (gcn_address_register_p (base, DImode, false)
1242 	  && gcn_vgpr_register_operand (offset, SImode))
1243 	/* SPGR + VGPR  */
1244 	return true;
1245 
1246       if (GET_CODE (base) == PLUS
1247 	  && gcn_address_register_p (XEXP (base, 0), DImode, false)
1248 	  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1249 	  && immediate_p)
1250 	/* (SGPR + VGPR) + CONST  */
1251 	return true;
1252     }
1253 
1254   return false;
1255 }
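/* Illustrative address shapes accepted above ("sgpr" and "vgpr" stand for
   any suitable hard or pseudo register):
     (reg:DI sgpr)
     (plus:DI (reg:DI sgpr) (const_int 1024))
     (plus:DI (reg:DI sgpr) (reg:SI vgpr))
     (plus:DI (plus:DI (reg:DI sgpr) (reg:SI vgpr)) (const_int 8))
   Constant offsets must lie in the signed 13-bit range [-4096, 4095].  */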
1256 
1257 /* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
1258 
1259    Recognizes RTL expressions that are valid memory addresses for an
1260    instruction.  The MODE argument is the machine mode for the MEM
1261    expression that wants to use this address.
1262 
1263    It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
1264    convert common non-canonical forms to canonical form so that they will
1265    be recognized.  */
1266 
1267 static bool
1268 gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
1269 				     addr_space_t as)
1270 {
1271   /* All vector instructions need to work on addresses in registers.  */
1272   if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
1273     return false;
1274 
1275   if (AS_SCALAR_FLAT_P (as))
1276     {
1277       if (mode == QImode || mode == HImode)
1278 	return 0;
1279 
1280       switch (GET_CODE (x))
1281 	{
1282 	case REG:
1283 	  return gcn_address_register_p (x, DImode, strict);
1284 	/* Addresses are in the form BASE+OFFSET
1285 	   OFFSET is either a 20-bit unsigned immediate, an SGPR or M0.
1286 	   Writes and atomics do not accept SGPR.  */
1287 	case PLUS:
1288 	  {
1289 	    rtx x0 = XEXP (x, 0);
1290 	    rtx x1 = XEXP (x, 1);
1291 	    if (!gcn_address_register_p (x0, DImode, strict))
1292 	      return false;
1293 	    /* FIXME: This is disabled because of the mode mismatch between
1294 	       SImode (for the address or m0 register) and the DImode PLUS.
1295 	       We'll need a zero_extend or similar.
1296 
1297 	    if (gcn_m0_register_p (x1, SImode, strict)
1298 		|| gcn_address_register_p (x1, SImode, strict))
1299 	      return true;
1300 	    else*/
1301 	    if (GET_CODE (x1) == CONST_INT)
1302 	      {
1303 		if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
1304 		    /* The low bits of the offset are ignored, even when
1305 		       they're meant to realign the pointer.  */
1306 		    && !(INTVAL (x1) & 0x3))
1307 		  return true;
1308 	      }
1309 	    return false;
1310 	  }
1311 
1312 	default:
1313 	  break;
1314 	}
1315     }
1316   else if (AS_SCRATCH_P (as))
1317     return gcn_address_register_p (x, SImode, strict);
1318   else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
1319     {
1320       if (TARGET_GCN3 || GET_CODE (x) == REG)
1321        return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1322 		|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1323 	       ? gcn_address_register_p (x, DImode, strict)
1324 	       : gcn_vec_address_register_p (x, DImode, strict));
1325       else
1326 	{
1327 	  gcc_assert (TARGET_GCN5_PLUS);
1328 
1329 	  if (GET_CODE (x) == PLUS)
1330 	    {
1331 	      rtx x1 = XEXP (x, 1);
1332 
1333 	      if (VECTOR_MODE_P (mode)
1334 		  ? !gcn_address_register_p (x, DImode, strict)
1335 		  : !gcn_vec_address_register_p (x, DImode, strict))
1336 		return false;
1337 
1338 	      if (GET_CODE (x1) == CONST_INT)
1339 		{
1340 		  if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
1341 		      /* The low bits of the offset are ignored, even when
1342 		         they're meant to realign the pointer.  */
1343 		      && !(INTVAL (x1) & 0x3))
1344 		    return true;
1345 		}
1346 	    }
1347 	  return false;
1348 	}
1349     }
1350   else if (AS_GLOBAL_P (as))
1351     {
1352       gcc_assert (TARGET_GCN5_PLUS);
1353 
1354       if (GET_CODE (x) == REG)
1355        return (gcn_address_register_p (x, DImode, strict)
1356 	       || (!VECTOR_MODE_P (mode)
1357 		   && gcn_vec_address_register_p (x, DImode, strict)));
1358       else if (GET_CODE (x) == PLUS)
1359 	{
1360 	  rtx base = XEXP (x, 0);
1361 	  rtx offset = XEXP (x, 1);
1362 
1363 	  bool immediate_p = (GET_CODE (offset) == CONST_INT
1364 			      /* Signed 13-bit immediate.  */
1365 			      && INTVAL (offset) >= -(1 << 12)
1366 			      && INTVAL (offset) < (1 << 12)
1367 			      /* The low bits of the offset are ignored, even
1368 			         when they're meant to realign the pointer.  */
1369 			      && !(INTVAL (offset) & 0x3));
1370 
1371 	  if (!VECTOR_MODE_P (mode))
1372 	    {
1373 	      if ((gcn_address_register_p (base, DImode, strict)
1374 		   || gcn_vec_address_register_p (base, DImode, strict))
1375 		  && immediate_p)
1376 		/* SGPR + CONST or VGPR + CONST  */
1377 		return true;
1378 
1379 	      if (gcn_address_register_p (base, DImode, strict)
1380 		  && gcn_vgpr_register_operand (offset, SImode))
1381 		/* SGPR + VGPR  */
1382 		return true;
1383 
1384 	      if (GET_CODE (base) == PLUS
1385 		  && gcn_address_register_p (XEXP (base, 0), DImode, strict)
1386 		  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1387 		  && immediate_p)
1388 		/* (SGPR + VGPR) + CONST  */
1389 		return true;
1390 	    }
1391 	  else
1392 	    {
1393 	      if (gcn_address_register_p (base, DImode, strict)
1394 		  && immediate_p)
1395 		/* SGPR + CONST  */
1396 		return true;
1397 	    }
1398 	}
1399       else
1400 	return false;
1401     }
1402   else if (AS_ANY_DS_P (as))
1403     switch (GET_CODE (x))
1404       {
1405       case REG:
1406 	return (VECTOR_MODE_P (mode)
1407 		? gcn_address_register_p (x, SImode, strict)
1408 		: gcn_vec_address_register_p (x, SImode, strict));
1409       /* Addresses are in the form BASE+OFFSET
1410 	 OFFSET is either a 20-bit unsigned immediate, an SGPR or M0.
1411 	 Writes and atomics do not accept SGPR.  */
1412       case PLUS:
1413 	{
1414 	  rtx x0 = XEXP (x, 0);
1415 	  rtx x1 = XEXP (x, 1);
1416 	  if (!gcn_vec_address_register_p (x0, DImode, strict))
1417 	    return false;
1418 	  if (GET_CODE (x1) == REG)
1419 	    {
1420 	      if (GET_CODE (x1) != REG
1421 		  || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
1422 		      && !gcn_ssrc_register_operand (x1, DImode)))
1423 		return false;
1424 	    }
1425 	  else if (GET_CODE (x1) == CONST_VECTOR
1426 		   && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
1427 		   && single_cst_vector_p (x1))
1428 	    {
1429 	      x1 = CONST_VECTOR_ELT (x1, 0);
1430 	      if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
1431 		return true;
1432 	    }
1433 	  return false;
1434 	}
1435 
1436       default:
1437 	break;
1438       }
1439   else
1440     gcc_unreachable ();
1441   return false;
1442 }
1443 
1444 /* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1445 
1446    Return the appropriate mode for a named address pointer.  */
1447 
1448 static scalar_int_mode
1449 gcn_addr_space_pointer_mode (addr_space_t addrspace)
1450 {
1451   switch (addrspace)
1452     {
1453     case ADDR_SPACE_SCRATCH:
1454     case ADDR_SPACE_LDS:
1455     case ADDR_SPACE_GDS:
1456       return SImode;
1457     case ADDR_SPACE_DEFAULT:
1458     case ADDR_SPACE_FLAT:
1459     case ADDR_SPACE_FLAT_SCRATCH:
1460     case ADDR_SPACE_SCALAR_FLAT:
1461       return DImode;
1462     default:
1463       gcc_unreachable ();
1464     }
1465 }
1466 
1467 /* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1468 
1469    Return the appropriate mode for a named address space address.  */
1470 
1471 static scalar_int_mode
1472 gcn_addr_space_address_mode (addr_space_t addrspace)
1473 {
1474   return gcn_addr_space_pointer_mode (addrspace);
1475 }
1476 
1477 /* Implement TARGET_ADDR_SPACE_SUBSET_P.
1478 
1479    Determine if one named address space is a subset of another.  */
1480 
1481 static bool
1482 gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1483 {
1484   if (subset == superset)
1485     return true;
1486   /* FIXME is this true?  */
1487   if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1488     return true;
1489   return false;
1490 }
1491 
1492 /* Convert from one address space to another.  */
1493 
1494 static rtx
1495 gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1496 {
1497   gcc_assert (POINTER_TYPE_P (from_type));
1498   gcc_assert (POINTER_TYPE_P (to_type));
1499 
1500   addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1501   addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1502 
1503   if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1504     {
1505       rtx queue = gen_rtx_REG (DImode,
1506 			       cfun->machine->args.reg[QUEUE_PTR_ARG]);
1507       rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1508 				     gen_rtx_PLUS (DImode, queue,
1509 						   gen_int_mode (64, SImode)));
1510       rtx tmp = gen_reg_rtx (DImode);
1511 
1512       emit_move_insn (gen_lowpart (SImode, tmp), op);
1513       emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1514 		      group_seg_aperture_hi);
1515 
1516       return tmp;
1517     }
1518   else if (as_from == as_to)
1519     return op;
1520   else
1521     gcc_unreachable ();
1522 }
1523 
1524 /* Implement TARGET_ADDR_SPACE_DEBUG.
1525 
1526    Return the dwarf address space class for each hardware address space.  */
1527 
1528 static int
1529 gcn_addr_space_debug (addr_space_t as)
1530 {
1531   switch (as)
1532     {
1533       case ADDR_SPACE_DEFAULT:
1534       case ADDR_SPACE_FLAT:
1535       case ADDR_SPACE_SCALAR_FLAT:
1536       case ADDR_SPACE_FLAT_SCRATCH:
1537 	return DW_ADDR_none;
1538       case ADDR_SPACE_GLOBAL:
1539 	return 1;      // DW_ADDR_LLVM_global
1540       case ADDR_SPACE_LDS:
1541 	return 3;      // DW_ADDR_LLVM_group
1542       case ADDR_SPACE_SCRATCH:
1543 	return 4;      // DW_ADDR_LLVM_private
1544       case ADDR_SPACE_GDS:
1545 	return 0x8000; // DW_ADDR_AMDGPU_region
1546     }
1547   gcc_unreachable ();
1548 }
1549 
1550 
1551 /* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
1552 
1553    Return true if REGNO is OK for memory addressing.  */
1554 
1555 bool
1556 gcn_regno_mode_code_ok_for_base_p (int regno,
1557 				   machine_mode, addr_space_t as, int, int)
1558 {
1559   if (regno >= FIRST_PSEUDO_REGISTER)
1560     {
1561       if (reg_renumber)
1562 	regno = reg_renumber[regno];
1563       else
1564 	return true;
1565     }
1566   if (AS_FLAT_P (as))
1567     return (VGPR_REGNO_P (regno)
1568 	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1569   else if (AS_SCALAR_FLAT_P (as))
1570     return (SGPR_REGNO_P (regno)
1571 	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1572   else if (AS_GLOBAL_P (as))
1573     {
1574       return (SGPR_REGNO_P (regno)
1575 	      || VGPR_REGNO_P (regno)
1576 	      || regno == ARG_POINTER_REGNUM
1577 	      || regno == FRAME_POINTER_REGNUM);
1578     }
1579   else
1580     /* For now.  */
1581     return false;
1582 }
1583 
1584 /* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1585 
1586    Return a suitable register class for memory addressing.  */
1587 
1588 reg_class
1589 gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1590 			      int ic)
1591 {
1592   switch (as)
1593     {
1594     case ADDR_SPACE_DEFAULT:
1595       return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1596     case ADDR_SPACE_SCALAR_FLAT:
1597     case ADDR_SPACE_SCRATCH:
1598       return SGPR_REGS;
1599       break;
1600     case ADDR_SPACE_FLAT:
1601     case ADDR_SPACE_FLAT_SCRATCH:
1602     case ADDR_SPACE_LDS:
1603     case ADDR_SPACE_GDS:
1604       return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1605 	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1606 	      ? SGPR_REGS : VGPR_REGS);
1607     case ADDR_SPACE_GLOBAL:
1608       return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1609 	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1610 	      ? SGPR_REGS : ALL_GPR_REGS);
1611     }
1612   gcc_unreachable ();
1613 }
1614 
1615 /* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
1616 
1617    Return true if REGNO is OK for index of memory addressing.  */
1618 
1619 bool
1620 regno_ok_for_index_p (int regno)
1621 {
1622   if (regno >= FIRST_PSEUDO_REGISTER)
1623     {
1624       if (reg_renumber)
1625 	regno = reg_renumber[regno];
1626       else
1627 	return true;
1628     }
1629   return regno == M0_REG || VGPR_REGNO_P (regno);
1630 }
1631 
1632 /* Generate move which uses the exec flags.  If EXEC is NULL, then it is
1633    assumed that all lanes normally relevant to the mode of the move are
1634    affected.  If PREV is NULL, then a sensible default is supplied for
1635    the inactive lanes.  */
1636 
1637 static rtx
1638 gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
1639 {
1640   machine_mode mode = GET_MODE (op0);
1641 
1642   if (vgpr_vector_mode_p (mode))
1643     {
1644       if (exec && exec != CONSTM1_RTX (DImode))
1645 	{
1646 	  if (!prev)
1647 	    prev = op0;
1648 	}
1649       else
1650 	{
1651 	  if (!prev)
1652 	    prev = gcn_gen_undef (mode);
1653 	  exec = gcn_full_exec_reg ();
1654 	}
1655 
1656       rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
1657 
1658       return gen_rtx_PARALLEL (VOIDmode,
1659 	       gen_rtvec (2, set,
1660 			 gen_rtx_CLOBBER (VOIDmode,
1661 					  gen_rtx_SCRATCH (V64DImode))));
1662     }
1663 
1664   return (gen_rtx_PARALLEL
1665 	  (VOIDmode,
1666 	   gen_rtvec (2, gen_rtx_SET (op0, op1),
1667 		      gen_rtx_USE (VOIDmode,
1668 				   exec ? exec : gcn_scalar_exec ()))));
1669 }
1670 
1671 /* Generate masked move.  */
1672 
1673 static rtx
1674 gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
1675 {
1676   if (exec)
1677     return (gen_rtx_SET (op0,
1678 			 gen_rtx_VEC_MERGE (GET_MODE (op0),
1679 					    gen_rtx_VEC_DUPLICATE (GET_MODE
1680 								   (op0), op1),
1681 					    op2, exec)));
1682   else
1683     return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
1684 }
1685 
1686 /* Expand vector init of OP0 by VEC.
1687    Implements vec_init instruction pattern.  */
1688 
1689 void
1690 gcn_expand_vector_init (rtx op0, rtx vec)
1691 {
1692   int64_t initialized_mask = 0;
1693   int64_t curr_mask = 1;
1694   machine_mode mode = GET_MODE (op0);
1695 
1696   rtx val = XVECEXP (vec, 0, 0);
1697 
1698   for (int i = 1; i < 64; i++)
1699     if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
1700       curr_mask |= (int64_t) 1 << i;
1701 
1702   if (gcn_constant_p (val))
1703     emit_move_insn (op0, gcn_vec_constant (mode, val));
1704   else
1705     {
1706       val = force_reg (GET_MODE_INNER (mode), val);
1707       emit_insn (gen_duplicate_load (op0, val));
1708     }
1709   initialized_mask |= curr_mask;
1710   for (int i = 1; i < 64; i++)
1711     if (!(initialized_mask & ((int64_t) 1 << i)))
1712       {
1713 	curr_mask = (int64_t) 1 << i;
1714 	rtx val = XVECEXP (vec, 0, i);
1715 
1716 	for (int j = i + 1; j < 64; j++)
1717 	  if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
1718 	    curr_mask |= (int64_t) 1 << j;
1719 	if (gcn_constant_p (val))
1720 	  emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
1721 					get_exec (curr_mask)));
1722 	else
1723 	  {
1724 	    val = force_reg (GET_MODE_INNER (mode), val);
1725 	    emit_insn (gen_duplicate_load (op0, val, op0,
1726 					   get_exec (curr_mask)));
1727 	  }
1728 	initialized_mask |= curr_mask;
1729       }
1730 }
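/* Worked example (illustrative): initialising the vector {a, b, a, b, ...}
   first broadcasts `a' across all 64 lanes, then emits a single masked
   move of `b' with exec mask 0xaaaaaaaaaaaaaaaa for the odd lanes; lanes
   that already hold the right value are never rewritten.  */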
1731 
1732 /* Load vector constant where n-th lane contains BASE+n*VAL.  */
1733 
1734 static rtx
1735 strided_constant (machine_mode mode, int base, int val)
1736 {
1737   rtx x = gen_reg_rtx (mode);
1738   emit_move_insn (x, gcn_vec_constant (mode, base));
1739   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1740 				 x, get_exec (0xffffffff00000000)));
1741   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1742 				 x, get_exec (0xffff0000ffff0000)));
1743   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1744 				 x, get_exec (0xff00ff00ff00ff00)));
1745   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1746 				 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1747   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1748 				 x, get_exec (0xcccccccccccccccc)));
1749   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1750 				 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1751   return x;
1752 }
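/* Illustrative walk-through: strided_constant (V64SImode, 0, 4) produces
   the lane values 0, 4, 8, ..., 252 using six masked additions.  Step k
   adds val * 2^k to exactly the lanes whose lane number has bit k set, so
   after all six steps lane n holds base + n * val.  */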
1753 
1754 /* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS.  */
1755 
1756 static rtx
1757 gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1758 				   addr_space_t as)
1759 {
1760   switch (as)
1761     {
1762     case ADDR_SPACE_DEFAULT:
1763       return gcn_addr_space_legitimize_address (x, old, mode,
1764 						DEFAULT_ADDR_SPACE);
1765     case ADDR_SPACE_SCALAR_FLAT:
1766     case ADDR_SPACE_SCRATCH:
1767       /* Instructions working on vectors need the address to be in
1768          a register.  */
1769       if (vgpr_vector_mode_p (mode))
1770 	return force_reg (GET_MODE (x), x);
1771 
1772       return x;
1773     case ADDR_SPACE_FLAT:
1774     case ADDR_SPACE_FLAT_SCRATCH:
1775     case ADDR_SPACE_GLOBAL:
1776       return TARGET_GCN3 ? force_reg (DImode, x) : x;
1777     case ADDR_SPACE_LDS:
1778     case ADDR_SPACE_GDS:
1779 	      /* FIXME: LDS supports offsets, handle them!  */
1780       if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1781 	{
1782 	  rtx addrs = gen_reg_rtx (V64SImode);
1783 	  rtx base = force_reg (SImode, x);
1784 	  rtx offsets = strided_constant (V64SImode, 0,
1785 					  GET_MODE_UNIT_SIZE (mode));
1786 
1787 	  emit_insn (gen_vec_duplicatev64si (addrs, base));
1788 	  emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1789 	  return addrs;
1790 	}
1791       return x;
1792     }
1793   gcc_unreachable ();
1794 }
1795 
1796 /* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1797    proper vector of stepped addresses.
1798 
1799    MEM will be a DImode address of a vector in an SGPR.
1800    TMP will be a V64DImode VGPR pair or (scratch:V64DI).  */
1801 
1802 rtx
1803 gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1804 				     rtx tmp)
1805 {
1806   gcc_assert (MEM_P (mem));
1807   rtx mem_base = XEXP (mem, 0);
1808   rtx mem_index = NULL_RTX;
1809 
1810   if (!TARGET_GCN5_PLUS)
1811     {
1812       /* gcn_addr_space_legitimize_address should have put the address in a
1813          register.  If not, it is too late to do anything about it.  */
1814       gcc_assert (REG_P (mem_base));
1815     }
1816 
1817   if (GET_CODE (mem_base) == PLUS)
1818     {
1819       mem_index = XEXP (mem_base, 1);
1820       mem_base = XEXP (mem_base, 0);
1821     }
1822 
1823   /* RF and RM base registers for vector modes should always be an SGPR.  */
1824   gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1825 	      || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1826 
1827   machine_mode inner = GET_MODE_INNER (mode);
1828   int shift = exact_log2 (GET_MODE_SIZE (inner));
1829   rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1830   rtx undef_v64si = gcn_gen_undef (V64SImode);
1831   rtx new_base = NULL_RTX;
1832   addr_space_t as = MEM_ADDR_SPACE (mem);
1833 
1834   rtx tmplo = (REG_P (tmp)
1835 	       ? gcn_operand_part (V64DImode, tmp, 0)
1836 	       : gen_reg_rtx (V64SImode));
1837 
1838   /* tmplo[:] = ramp[:] << shift  */
1839   if (exec)
1840     emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1841 				    gen_int_mode (shift, SImode),
1842 				    undef_v64si, exec));
1843   else
1844     emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1845 
1846   if (AS_FLAT_P (as))
1847     {
1848       rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1849 
1850       if (REG_P (tmp))
1851 	{
1852 	  rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1853 	  rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1854 	  rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1855 
1856 	  /* tmphi[:] = mem_base_hi  */
1857 	  if (exec)
1858 	    emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1859 						    undef_v64si, exec));
1860 	  else
1861 	    emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1862 
1863 	  /* tmp[:] += zext (mem_base)  */
1864 	  if (exec)
1865 	    {
1866 	      emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1867 						     vcc, undef_v64si, exec));
1868 	      emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1869 					      vcc, vcc, undef_v64si, exec));
1870 	    }
1871 	  else
1872 	    emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
1873 	}
1874       else
1875 	{
1876 	  tmp = gen_reg_rtx (V64DImode);
1877 	  if (exec)
1878 	    emit_insn (gen_addv64di3_vcc_zext_dup2_exec
1879 		       (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
1880 			exec));
1881 	  else
1882 	    emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
1883 	}
1884 
1885       new_base = tmp;
1886     }
1887   else if (AS_ANY_DS_P (as))
1888     {
1889       if (!exec)
1890 	emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1891       else
1892         emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1893 					   gcn_gen_undef (V64SImode), exec));
1894       new_base = tmplo;
1895     }
1896   else
1897     {
1898       mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1899       new_base = gen_rtx_PLUS (V64DImode, mem_base,
1900 			       gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1901     }
1902 
1903   return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1904 		       gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1905 					      (mem_index ? mem_index
1906 					       : const0_rtx)));
1907 }
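/* Summary sketch (editorial): VGPR1 is assumed to hold the lane-index ramp
   0..63 (it is used that way by move_callee_saved_registers below), so the
   RTX returned above describes, for each lane L, roughly

     address[L] = mem_base + (mem_index ? mem_index : 0)
		  + (L << log2 (GET_MODE_SIZE (inner)))

   i.e. one contiguous element per lane starting at the scalar base.  */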
1908 
1909 /* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
1910    suitable for the given address space.  This is intended for use in
1911    gather/scatter patterns.
1912 
1913    The offsets may be signed or unsigned, according to UNSIGNED_P.
1914    If EXEC is set then _exec patterns will be used, otherwise plain.
1915 
1916    Return values.
1917      ADDR_SPACE_FLAT   - return V64DImode vector of absolute addresses.
1918      ADDR_SPACE_GLOBAL - return V64SImode vector of offsets.  */
1919 
1920 rtx
1921 gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1922 			   bool unsigned_p, rtx exec)
1923 {
1924   rtx tmpsi = gen_reg_rtx (V64SImode);
1925   rtx tmpdi = gen_reg_rtx (V64DImode);
1926   rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1927   rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1928 
1929   if (CONST_INT_P (scale)
1930       && INTVAL (scale) > 0
1931       && exact_log2 (INTVAL (scale)) >= 0)
1932     emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1933 			       GEN_INT (exact_log2 (INTVAL (scale)))));
1934   else
1935     (exec
1936      ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1937 					  exec))
1938      : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1939 
1940   /* "Global" instructions do not support negative register offsets.  */
1941   if (as == ADDR_SPACE_FLAT || !unsigned_p)
1942     {
1943       if (unsigned_p)
1944 	(exec
1945 	 ?  emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1946 						    undefdi, exec))
1947 	 :  emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1948       else
1949 	(exec
1950 	 ?  emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1951 						     undefdi, exec))
1952 	 :  emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1953       return tmpdi;
1954     }
1955   else if (as == ADDR_SPACE_GLOBAL)
1956     return tmpsi;
1957 
1958   gcc_unreachable ();
1959 }
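/* Per-lane effect (editorial summary): offsets[L] is first multiplied by
   SCALE (reduced to a shift when SCALE is a power of two).  For
   ADDR_SPACE_FLAT, or for signed offsets, the scaled value is zero- or
   sign-extended and added to BASE, giving a V64DImode vector of absolute
   addresses; for ADDR_SPACE_GLOBAL with unsigned offsets the V64SImode
   scaled offsets are returned as-is for the base+offset addressing form.  */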
1960 
1961 /* Return true if move from OP0 to OP1 is known to be executed in vector
1962    unit.  */
1963 
1964 bool
1965 gcn_vgpr_move_p (rtx op0, rtx op1)
1966 {
1967   if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1968     return true;
1969   if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1970     return true;
1971   return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1972 	  || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1973 	  || vgpr_vector_mode_p (GET_MODE (op0)));
1974 }
1975 
1976 /* Return true if move from OP0 to OP1 is known to be executed in scalar
1977    unit.  Used in the machine description.  */
1978 
1979 bool
1980 gcn_sgpr_move_p (rtx op0, rtx op1)
1981 {
1982   if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1983     return true;
1984   if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1985     return true;
1986   if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1987       || VGPR_REGNO_P (REGNO (op0)))
1988     return false;
1989   if (REG_P (op1)
1990       && REGNO (op1) < FIRST_PSEUDO_REGISTER
1991       && !VGPR_REGNO_P (REGNO (op1)))
1992     return true;
1993   return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
1994 }
1995 
1996 /* Implement TARGET_SECONDARY_RELOAD.
1997 
1998    The address space determines which registers can be used for loads and
1999    stores.  */
2000 
2001 static reg_class_t
2002 gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
2003 		      machine_mode reload_mode, secondary_reload_info *sri)
2004 {
2005   reg_class_t result = NO_REGS;
2006   bool spilled_pseudo =
2007     (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
2008 
2009   if (dump_file && (dump_flags & TDF_DETAILS))
2010     {
2011       fprintf (dump_file, "gcn_secondary_reload: ");
2012       dump_value_slim (dump_file, x, 1);
2013       fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
2014 	       reg_class_names[rclass], GET_MODE_NAME (reload_mode));
2015       if (REG_P (x) || GET_CODE (x) == SUBREG)
2016 	fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
2017 		 (true_regnum (x) >= 0
2018 		  && true_regnum (x) < FIRST_PSEUDO_REGISTER
2019 		  ? reg_names[true_regnum (x)]
2020 		  : (spilled_pseudo ? "stack spill" : "??")));
2021       fprintf (dump_file, "\n");
2022     }
2023 
2024   /* Some callers don't use or initialize icode.  */
2025   sri->icode = CODE_FOR_nothing;
2026 
2027   if (MEM_P (x) || spilled_pseudo)
2028     {
2029       addr_space_t as = DEFAULT_ADDR_SPACE;
2030 
2031       /* If we have a spilled pseudo, we can't find the address space
2032 	 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
2033 	 ADDR_SPACE_GLOBAL for GCN5.  */
2034       if (MEM_P (x))
2035 	as = MEM_ADDR_SPACE (x);
2036 
2037       if (as == ADDR_SPACE_DEFAULT)
2038 	as = DEFAULT_ADDR_SPACE;
2039 
2040       switch (as)
2041 	{
2042 	case ADDR_SPACE_SCALAR_FLAT:
2043 	  result =
2044 	    ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
2045 	  break;
2046 	case ADDR_SPACE_FLAT:
2047 	case ADDR_SPACE_FLAT_SCRATCH:
2048 	case ADDR_SPACE_GLOBAL:
2049 	  if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
2050 	      || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
2051 	    {
2052 	      if (in_p)
2053 		switch (reload_mode)
2054 		  {
2055 		  case E_V64SImode:
2056 		    sri->icode = CODE_FOR_reload_inv64si;
2057 		    break;
2058 		  case E_V64SFmode:
2059 		    sri->icode = CODE_FOR_reload_inv64sf;
2060 		    break;
2061 		  case E_V64HImode:
2062 		    sri->icode = CODE_FOR_reload_inv64hi;
2063 		    break;
2064 		  case E_V64HFmode:
2065 		    sri->icode = CODE_FOR_reload_inv64hf;
2066 		    break;
2067 		  case E_V64QImode:
2068 		    sri->icode = CODE_FOR_reload_inv64qi;
2069 		    break;
2070 		  case E_V64DImode:
2071 		    sri->icode = CODE_FOR_reload_inv64di;
2072 		    break;
2073 		  case E_V64DFmode:
2074 		    sri->icode = CODE_FOR_reload_inv64df;
2075 		    break;
2076 		  default:
2077 		    gcc_unreachable ();
2078 		  }
2079 	      else
2080 		switch (reload_mode)
2081 		  {
2082 		  case E_V64SImode:
2083 		    sri->icode = CODE_FOR_reload_outv64si;
2084 		    break;
2085 		  case E_V64SFmode:
2086 		    sri->icode = CODE_FOR_reload_outv64sf;
2087 		    break;
2088 		  case E_V64HImode:
2089 		    sri->icode = CODE_FOR_reload_outv64hi;
2090 		    break;
2091 		  case E_V64HFmode:
2092 		    sri->icode = CODE_FOR_reload_outv64hf;
2093 		    break;
2094 		  case E_V64QImode:
2095 		    sri->icode = CODE_FOR_reload_outv64qi;
2096 		    break;
2097 		  case E_V64DImode:
2098 		    sri->icode = CODE_FOR_reload_outv64di;
2099 		    break;
2100 		  case E_V64DFmode:
2101 		    sri->icode = CODE_FOR_reload_outv64df;
2102 		    break;
2103 		  default:
2104 		    gcc_unreachable ();
2105 		  }
2106 	      break;
2107 	    }
2108 	  /* Fallthrough.  */
2109 	case ADDR_SPACE_LDS:
2110 	case ADDR_SPACE_GDS:
2111 	case ADDR_SPACE_SCRATCH:
2112 	  result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2113 	  break;
2114 	}
2115     }
2116 
2117   if (dump_file && (dump_flags & TDF_DETAILS))
2118     fprintf (dump_file, "   <= %s (icode: %s)\n", reg_class_names[result],
2119 	     get_insn_name (sri->icode));
2120 
2121   return result;
2122 }
2123 
2124 /* Update register usage after having seen the compiler flags and kernel
2125    attributes.  We typically want to fix registers that contain values
2126    set by the HSA runtime.  */
2127 
2128 static void
2129 gcn_conditional_register_usage (void)
2130 {
2131   if (!cfun || !cfun->machine)
2132     return;
2133 
2134   if (cfun->machine->normal_function)
2135     {
2136       /* Restrict the set of SGPRs and VGPRs used by non-kernel functions.  */
2137       for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
2138 	   i <= LAST_SGPR_REG; i++)
2139 	fixed_regs[i] = 1, call_used_regs[i] = 1;
2140 
2141       for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
2142 	   i <= LAST_VGPR_REG; i++)
2143 	fixed_regs[i] = 1, call_used_regs[i] = 1;
2144 
2145       return;
2146     }
2147 
2148   /* If the set of requested args is the default set, nothing more needs to
2149      be done.  */
2150   if (cfun->machine->args.requested == default_requested_args)
2151     return;
2152 
2153   /* Requesting a set of args different from the default violates the ABI.  */
2154   if (!leaf_function_p ())
2155     warning (0, "A non-default set of initial values has been requested, "
2156 		"which violates the ABI");
2157 
2158   for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
2159     fixed_regs[i] = 0;
2160 
2161   /* Fix the runtime argument register containing values that may be
2162      needed later.  DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2163      needed after the prologue so there's no need to fix them.  */
2164   if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2165     fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2166   if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2167     {
2168       /* The upper 32-bits of the 64-bit descriptor are not used, so allow
2169 	the containing registers to be used for other purposes.  */
2170       fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2171       fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2172     }
2173   if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2174     {
2175       fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2176       fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2177     }
2178   if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2179     {
2180       fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2181       fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2182     }
2183   if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2184     fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2185   if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2186     fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2187   if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2188     fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2189   if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2190     fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2191 }
2192 
2193 /* Determine if a load or store is valid, according to the register classes
2194    and address space.  Used primarily by the machine description to decide
2195    when to split a move into two steps.  */
2196 
2197 bool
2198 gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2199 {
2200   if (!MEM_P (dest) && !MEM_P (src))
2201     return true;
2202 
2203   if (MEM_P (dest)
2204       && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2205       && (gcn_flat_address_p (XEXP (dest, 0), mode)
2206 	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2207 	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2208       && gcn_vgpr_register_operand (src, mode))
2209     return true;
2210   else if (MEM_P (src)
2211 	   && AS_FLAT_P (MEM_ADDR_SPACE (src))
2212 	   && (gcn_flat_address_p (XEXP (src, 0), mode)
2213 	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2214 	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2215 	   && gcn_vgpr_register_operand (dest, mode))
2216     return true;
2217 
2218   if (MEM_P (dest)
2219       && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2220       && (gcn_global_address_p (XEXP (dest, 0))
2221 	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2222 	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2223       && gcn_vgpr_register_operand (src, mode))
2224     return true;
2225   else if (MEM_P (src)
2226 	   && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2227 	   && (gcn_global_address_p (XEXP (src, 0))
2228 	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2229 	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2230 	   && gcn_vgpr_register_operand (dest, mode))
2231     return true;
2232 
2233   if (MEM_P (dest)
2234       && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2235       && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2236 	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2237 	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2238       && gcn_ssrc_register_operand (src, mode))
2239     return true;
2240   else if (MEM_P (src)
2241 	   && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2242 	   && (gcn_scalar_flat_address_p (XEXP (src, 0))
2243 	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2244 	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2245 	   && gcn_sdst_register_operand (dest, mode))
2246     return true;
2247 
2248   if (MEM_P (dest)
2249       && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2250       && gcn_ds_address_p (XEXP (dest, 0))
2251       && gcn_vgpr_register_operand (src, mode))
2252     return true;
2253   else if (MEM_P (src)
2254 	   && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2255 	   && gcn_ds_address_p (XEXP (src, 0))
2256 	   && gcn_vgpr_register_operand (dest, mode))
2257     return true;
2258 
2259   return false;
2260 }
2261 
2262 /* }}}  */
2263 /* {{{ Functions and ABI.  */
2264 
2265 /* Implement TARGET_FUNCTION_VALUE.
2266 
2267    Define how to find the value returned by a function.
2268    The register location is always the same, but the mode depends on
2269    VALTYPE.  */
2270 
2271 static rtx
2272 gcn_function_value (const_tree valtype, const_tree, bool)
2273 {
2274   machine_mode mode = TYPE_MODE (valtype);
2275 
2276   if (INTEGRAL_TYPE_P (valtype)
2277       && GET_MODE_CLASS (mode) == MODE_INT
2278       && GET_MODE_SIZE (mode) < 4)
2279     mode = SImode;
2280 
2281   return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
2282 }
2283 
2284 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2285 
2286    Return true if N is a possible register number for the function return
2287    value.  */
2288 
2289 static bool
2290 gcn_function_value_regno_p (const unsigned int n)
2291 {
2292   return n == RETURN_VALUE_REG;
2293 }
2294 
2295 /* Calculate the number of registers required to hold function argument
2296    ARG.  */
2297 
2298 static int
2299 num_arg_regs (const function_arg_info &arg)
2300 {
2301   if (targetm.calls.must_pass_in_stack (arg))
2302     return 0;
2303 
2304   int size = arg.promoted_size_in_bytes ();
2305   return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
2306 }
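/* Example (illustrative): assuming UNITS_PER_WORD is 4 on this target, a
   12-byte argument passed in registers occupies (12 + 3) / 4 = 3
   consecutive registers; arguments forced onto the stack count as 0.  */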
2307 
2308 /* Implement TARGET_STRICT_ARGUMENT_NAMING.
2309 
2310    Return true if the location where a function argument is passed
2311    depends on whether or not it is a named argument
2312 
2313    For gcn, we know how to handle functions declared as stdarg: by
2314    passing an extra pointer to the unnamed arguments.  However, the
2315    Fortran frontend can produce a different situation, where a
2316    function pointer is declared with no arguments, but the actual
2317    function and calls to it take more arguments.  In that case, we
2318    want to ensure the call matches the definition of the function.  */
2319 
2320 static bool
2321 gcn_strict_argument_naming (cumulative_args_t cum_v)
2322 {
2323   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2324 
2325   return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2326 }
2327 
2328 /* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2329 
2330    See comment on gcn_strict_argument_naming.  */
2331 
2332 static bool
2333 gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2334 {
2335   return !gcn_strict_argument_naming (cum_v);
2336 }
2337 
2338 /* Implement TARGET_FUNCTION_ARG.
2339 
2340    Return an RTX indicating whether a function argument is passed in a register
2341    and if so, which register.  */
2342 
2343 static rtx
2344 gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
2345 {
2346   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2347   if (cum->normal_function)
2348     {
2349       if (!arg.named || arg.end_marker_p ())
2350 	return 0;
2351 
2352       if (targetm.calls.must_pass_in_stack (arg))
2353 	return 0;
2354 
2355       /* Vector parameters are not supported yet.  */
2356       if (VECTOR_MODE_P (arg.mode))
2357 	return 0;
2358 
2359       int reg_num = FIRST_PARM_REG + cum->num;
2360       int num_regs = num_arg_regs (arg);
2361       if (num_regs > 0)
2362 	while (reg_num % num_regs != 0)
2363 	  reg_num++;
2364       if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
2365 	return gen_rtx_REG (arg.mode, reg_num);
2366     }
2367   else
2368     {
2369       if (cum->num >= cum->args.nargs)
2370 	{
2371 	  cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2372 	    & -(TYPE_ALIGN (arg.type) / 8);
2373 	  cfun->machine->kernarg_segment_alignment
2374 	    = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
2375 		   TYPE_ALIGN (arg.type) / 8);
2376 	  rtx addr = gen_rtx_REG (DImode,
2377 				  cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2378 	  if (cum->offset)
2379 	    addr = gen_rtx_PLUS (DImode, addr,
2380 				 gen_int_mode (cum->offset, DImode));
2381 	  rtx mem = gen_rtx_MEM (arg.mode, addr);
2382 	  set_mem_attributes (mem, arg.type, 1);
2383 	  set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2384 	  MEM_READONLY_P (mem) = 1;
2385 	  return mem;
2386 	}
2387 
2388       int a = cum->args.order[cum->num];
2389       if (arg.mode != gcn_kernel_arg_types[a].mode)
2390 	{
2391 	  error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2392 	  return 0;
2393 	}
2394       return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2395 			  cum->args.reg[a]);
2396     }
2397   return 0;
2398 }
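/* Note (editorial): the alignment loop above rounds the absolute register
   number up to a multiple of NUM_REGS, so e.g. a 2-register (8-byte)
   argument always starts in an even-numbered hard register; the skipped
   register is simply left unused, mirroring gcn_function_arg_advance
   below.  */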
2399 
2400 /* Implement TARGET_FUNCTION_ARG_ADVANCE.
2401 
2402    Updates the summarizer variable pointed to by CUM_V to advance past an
2403    argument in the argument list.  */
2404 
2405 static void
2406 gcn_function_arg_advance (cumulative_args_t cum_v,
2407 			  const function_arg_info &arg)
2408 {
2409   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2410 
2411   if (cum->normal_function)
2412     {
2413       if (!arg.named)
2414 	return;
2415 
2416       int num_regs = num_arg_regs (arg);
2417       if (num_regs > 0)
2418 	while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
2419 	  cum->num++;
2420       cum->num += num_regs;
2421     }
2422   else
2423     {
2424       if (cum->num < cum->args.nargs)
2425 	cum->num++;
2426       else
2427 	{
2428 	  cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
2429 	  cfun->machine->kernarg_segment_byte_size = cum->offset;
2430 	}
2431     }
2432 }
2433 
2434 /* Implement TARGET_ARG_PARTIAL_BYTES.
2435 
2436    Returns the number of bytes at the beginning of an argument that must be put
2437    in registers.  The value must be zero for arguments that are passed entirely
2438    in registers or that are entirely pushed on the stack.  */
2439 
2440 static int
2441 gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
2442 {
2443   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2444 
2445   if (!arg.named)
2446     return 0;
2447 
2448   if (targetm.calls.must_pass_in_stack (arg))
2449     return 0;
2450 
2451   if (cum->num >= NUM_PARM_REGS)
2452     return 0;
2453 
2454   /* If the argument fits entirely in registers, return 0.  */
2455   if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
2456     return 0;
2457 
2458   return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
2459 }
2460 
2461 /* A normal function which takes a pointer argument may be passed a pointer to
2462    LDS space (via a high-bits-set aperture), and that only works with FLAT
2463    addressing, not GLOBAL.  Force FLAT addressing if the function has an
2464    incoming pointer parameter.  NOTE: This is a heuristic that works in the
2465    offloading case, but in general, a function might read global pointer
2466    variables, etc. that may refer to LDS space or other special memory areas
2467    not supported by GLOBAL instructions, and then this argument check would not
2468    suffice.  */
2469 
2470 static void
2471 gcn_detect_incoming_pointer_arg (tree fndecl)
2472 {
2473   gcc_assert (cfun && cfun->machine);
2474 
2475   for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2476        arg;
2477        arg = TREE_CHAIN (arg))
2478     if (POINTER_TYPE_P (TREE_VALUE (arg)))
2479       cfun->machine->use_flat_addressing = true;
2480 }
2481 
2482 /* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2483 
2484    Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2485    whose data type is FNTYPE.  For a library call, FNTYPE is 0.  */
2486 
2487 void
2488 gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2489 			  tree fntype /* tree ptr for function decl */ ,
2490 			  rtx libname /* SYMBOL_REF of library name or 0 */ ,
2491 			  tree fndecl, int caller)
2492 {
2493   memset (cum, 0, sizeof (*cum));
2494   cum->fntype = fntype;
2495   if (libname)
2496     {
2497       gcc_assert (cfun && cfun->machine);
2498       cum->normal_function = true;
2499       if (!caller)
2500 	{
2501 	  cfun->machine->normal_function = true;
2502 	  gcn_detect_incoming_pointer_arg (fndecl);
2503 	}
2504       return;
2505     }
2506   tree attr = NULL;
2507   if (fndecl)
2508     attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2509   if (fndecl && !attr)
2510     attr = lookup_attribute ("amdgpu_hsa_kernel",
2511 			     TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2512   if (!attr && fntype)
2513     attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2514   /* Handle main () as a kernel, so we can run the testsuite.
2515      Handle OpenACC kernels similarly to main.  */
2516   if (!attr && !caller && fndecl
2517       && (MAIN_NAME_P (DECL_NAME (fndecl))
2518 	  || lookup_attribute ("omp target entrypoint",
2519 			       DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2520     gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2521   else
2522     {
2523       if (!attr || caller)
2524 	{
2525 	  gcc_assert (cfun && cfun->machine);
2526 	  cum->normal_function = true;
2527 	  if (!caller)
2528 	    cfun->machine->normal_function = true;
2529 	}
2530       gcn_parse_amdgpu_hsa_kernel_attribute
2531 	(&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2532     }
2533   cfun->machine->args = cum->args;
2534   if (!caller && cfun->machine->normal_function)
2535     gcn_detect_incoming_pointer_arg (fndecl);
2536 
2537   reinit_regs ();
2538 }
2539 
2540 static bool
2541 gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2542 {
2543   machine_mode mode = TYPE_MODE (type);
2544   HOST_WIDE_INT size = int_size_in_bytes (type);
2545 
2546   if (AGGREGATE_TYPE_P (type))
2547     return true;
2548 
2549   /* Vector return values are not supported yet.  */
2550   if (VECTOR_TYPE_P (type))
2551     return true;
2552 
2553   if (mode == BLKmode)
2554     return true;
2555 
2556   if (size > 2 * UNITS_PER_WORD)
2557     return true;
2558 
2559   return false;
2560 }
2561 
2562 /* Implement TARGET_PROMOTE_FUNCTION_MODE.
2563 
2564    Return the mode to use for outgoing function arguments.  */
2565 
2566 machine_mode
2567 gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2568 			   int *ARG_UNUSED (punsignedp),
2569 			   const_tree ARG_UNUSED (funtype),
2570 			   int ARG_UNUSED (for_return))
2571 {
2572   if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2573     return SImode;
2574 
2575   return mode;
2576 }
2577 
2578 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2579 
2580    Derived from hppa_gimplify_va_arg_expr.  The generic routine doesn't handle
2581    ARGS_GROW_DOWNWARDS.  */
2582 
2583 static tree
2584 gcn_gimplify_va_arg_expr (tree valist, tree type,
2585 			  gimple_seq *ARG_UNUSED (pre_p),
2586 			  gimple_seq *ARG_UNUSED (post_p))
2587 {
2588   tree ptr = build_pointer_type (type);
2589   tree valist_type;
2590   tree t, u;
2591   bool indirect;
2592 
2593   indirect = pass_va_arg_by_reference (type);
2594   if (indirect)
2595     {
2596       type = ptr;
2597       ptr = build_pointer_type (type);
2598     }
2599   valist_type = TREE_TYPE (valist);
2600 
2601   /* Args grow down.  Not handled by generic routines.  */
2602 
2603   u = fold_convert (sizetype, size_in_bytes (type));
2604   u = fold_build1 (NEGATE_EXPR, sizetype, u);
2605   t = fold_build_pointer_plus (valist, u);
2606 
2607   /* Align to 8 byte boundary.  */
2608 
2609   u = build_int_cst (TREE_TYPE (t), -8);
2610   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
2611   t = fold_convert (valist_type, t);
2612 
2613   t = build2 (MODIFY_EXPR, valist_type, valist, t);
2614 
2615   t = fold_convert (ptr, t);
2616   t = build_va_arg_indirect_ref (t);
2617 
2618   if (indirect)
2619     t = build_va_arg_indirect_ref (t);
2620 
2621   return t;
2622 }
2623 
2624 /* Return 1 if TRAIT NAME is present in the OpenMP context's
2625    device trait set, return 0 if not present in any OpenMP context in the
2626    whole translation unit, or -1 if not present in the current OpenMP context
2627    but might be present in another OpenMP context in the same TU.  */
2628 
2629 int
2630 gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
2631 			      const char *name)
2632 {
2633   switch (trait)
2634     {
2635     case omp_device_kind:
2636       return strcmp (name, "gpu") == 0;
2637     case omp_device_arch:
2638       return strcmp (name, "gcn") == 0;
2639     case omp_device_isa:
2640       if (strcmp (name, "fiji") == 0)
2641 	return gcn_arch == PROCESSOR_FIJI;
2642       if (strcmp (name, "gfx900") == 0)
2643 	return gcn_arch == PROCESSOR_VEGA10;
2644       if (strcmp (name, "gfx906") == 0)
2645 	return gcn_arch == PROCESSOR_VEGA20;
2646       if (strcmp (name, "gfx908") == 0)
2647 	return gcn_arch == PROCESSOR_GFX908;
2648       return 0;
2649     default:
2650       gcc_unreachable ();
2651     }
2652 }
2653 
2654 /* Calculate stack offsets needed to create prologues and epilogues.  */
2655 
2656 static struct machine_function *
2657 gcn_compute_frame_offsets (void)
2658 {
2659   machine_function *offsets = cfun->machine;
2660 
2661   if (reload_completed)
2662     return offsets;
2663 
2664   offsets->need_frame_pointer = frame_pointer_needed;
2665 
2666   offsets->outgoing_args_size = crtl->outgoing_args_size;
2667   offsets->pretend_size = crtl->args.pretend_args_size;
2668 
2669   offsets->local_vars = get_frame_size ();
2670 
2671   offsets->lr_needs_saving = (!leaf_function_p ()
2672 			      || df_regs_ever_live_p (LR_REGNUM)
2673 			      || df_regs_ever_live_p (LR_REGNUM + 1));
2674 
2675   offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2676 
2677   for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2678     if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2679 	|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2680 	    && frame_pointer_needed))
2681       offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2682 
2683   /* Round up to 64-bit boundary to maintain stack alignment.  */
2684   offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
2685 
2686   return offsets;
2687 }
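/* Example (illustrative): a function that must save the link register (8
   bytes) and one other scalar register (4 bytes) starts with
   callee_saves = 12, which the rounding above turns into 16; each live
   callee-saved vector register adds 256 bytes (64 lanes of 4 bytes).  */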
2688 
2689 /* Insert code into the prologue or epilogue to store or load any
2690    callee-save register to/from the stack.
2691 
2692    Helper function for gcn_expand_prologue and gcn_expand_epilogue.  */
2693 
2694 static void
2695 move_callee_saved_registers (rtx sp, machine_function *offsets,
2696 			     bool prologue)
2697 {
2698   int regno, offset, saved_scalars;
2699   rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2700   rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2701   rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2702   rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2703   HOST_WIDE_INT exec_set = 0;
2704   int offreg_set = 0;
2705   auto_vec<int> saved_sgprs;
2706 
2707   start_sequence ();
2708 
2709   /* Move scalars into two vector registers.  */
2710   for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
2711     if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2712 	|| ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2713 	|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2714 	    && offsets->need_frame_pointer))
2715       {
2716 	rtx reg = gen_rtx_REG (SImode, regno);
2717 	rtx vreg = gen_rtx_REG (V64SImode,
2718 				VGPR_REGNO (6 + (saved_scalars / 64)));
2719 	int lane = saved_scalars % 64;
2720 
2721 	if (prologue)
2722 	  {
2723 	    emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2724 	    saved_sgprs.safe_push (regno);
2725 	  }
2726 	else
2727 	  emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2728 
2729 	saved_scalars++;
2730       }
2731 
2732   rtx move_scalars = get_insns ();
2733   end_sequence ();
2734   start_sequence ();
2735 
2736   /* Ensure that all vector lanes are moved.  */
2737   exec_set = -1;
2738   emit_move_insn (exec, GEN_INT (exec_set));
2739 
2740   /* Set up a vector stack pointer.  */
2741   rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2742   rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2743   emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2744 				  gcn_gen_undef (V64SImode), exec));
2745   rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2746   emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2747 					  exec));
2748   emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2749 				     gcn_operand_part (V64SImode, vsp, 0),
2750 				     _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2751 				     exec));
2752   emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2753 				  gcn_operand_part (V64SImode, vsp, 1),
2754 				  const0_rtx, vcc, vcc,
2755 				  gcn_gen_undef (V64SImode), exec));
2756 
2757   /* Move vectors.  */
2758   for (regno = FIRST_VGPR_REG, offset = 0;
2759        regno < FIRST_PSEUDO_REGISTER; regno++)
2760     if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2761 	|| (regno == VGPR_REGNO (6) && saved_scalars > 0)
2762 	|| (regno == VGPR_REGNO (7) && saved_scalars > 63))
2763       {
2764 	rtx reg = gen_rtx_REG (V64SImode, regno);
2765 	int size = 256;
2766 
2767 	if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2768 	  size = saved_scalars * 4;
2769 	else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2770 	  size = (saved_scalars - 64) * 4;
2771 
2772 	if (size != 256 || exec_set != -1)
2773 	  {
2774 	    exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2775 	    emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2776 	  }
2777 
2778 	if (prologue)
2779 	  {
2780 	    rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec
2781 				  (vsp, const0_rtx, reg, as, const0_rtx,
2782 				   exec));
2783 
2784 	    /* Add CFI metadata.  */
2785 	    rtx note;
2786 	    if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7))
2787 	      {
2788 		int start = (regno == VGPR_REGNO (7) ? 64 : 0);
2789 		int count = MIN (saved_scalars - start, 64);
2790 		int add_lr = (regno == VGPR_REGNO (6)
2791 			      && offsets->lr_needs_saving);
2792 		int lrdest = -1;
2793 		rtvec seq = rtvec_alloc (count + add_lr);
2794 
2795 		/* Add a REG_FRAME_RELATED_EXPR entry for each scalar
2796 		   register that was saved in this batch.  */
2797 		for (int idx = 0; idx < count; idx++)
2798 		  {
2799 		    int stackaddr = offset + idx * 4;
2800 		    rtx dest = gen_rtx_MEM (SImode,
2801 					    gen_rtx_PLUS
2802 					    (DImode, sp,
2803 					     GEN_INT (stackaddr)));
2804 		    rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]);
2805 		    rtx set = gen_rtx_SET (dest, src);
2806 		    RTX_FRAME_RELATED_P (set) = 1;
2807 		    RTVEC_ELT (seq, idx) = set;
2808 
2809 		    if (saved_sgprs[start + idx] == LINK_REGNUM)
2810 		      lrdest = stackaddr;
2811 		  }
2812 
2813 		/* Add an additional expression for DWARF_LINK_REGISTER if
2814 		   LINK_REGNUM was saved.  */
2815 		if (lrdest != -1)
2816 		  {
2817 		    rtx dest = gen_rtx_MEM (DImode,
2818 					    gen_rtx_PLUS
2819 					    (DImode, sp,
2820 					     GEN_INT (lrdest)));
2821 		    rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER);
2822 		    rtx set = gen_rtx_SET (dest, src);
2823 		    RTX_FRAME_RELATED_P (set) = 1;
2824 		    RTVEC_ELT (seq, count) = set;
2825 		  }
2826 
2827 		note = gen_rtx_SEQUENCE (VOIDmode, seq);
2828 	      }
2829 	    else
2830 	      {
2831 		rtx dest = gen_rtx_MEM (V64SImode,
2832 					gen_rtx_PLUS (DImode, sp,
2833 						      GEN_INT (offset)));
2834 		rtx src = gen_rtx_REG (V64SImode, regno);
2835 		note = gen_rtx_SET (dest, src);
2836 	      }
2837 	    RTX_FRAME_RELATED_P (insn) = 1;
2838 	    add_reg_note (insn, REG_FRAME_RELATED_EXPR, note);
2839 	  }
2840 	else
2841 	  emit_insn (gen_gatherv64si_insn_1offset_exec
2842 		     (reg, vsp, const0_rtx, as, const0_rtx,
2843 		      gcn_gen_undef (V64SImode), exec));
2844 
2845 	/* Move our VSP to the next stack entry.  */
2846 	if (offreg_set != size)
2847 	  {
2848 	    offreg_set = size;
2849 	    emit_move_insn (offreg, GEN_INT (size));
2850 	  }
2851 	if (exec_set != -1)
2852 	  {
2853 	    exec_set = -1;
2854 	    emit_move_insn (exec, GEN_INT (exec_set));
2855 	  }
2856 	emit_insn (gen_addv64si3_vcc_dup_exec
2857 		   (gcn_operand_part (V64SImode, vsp, 0),
2858 		    offreg, gcn_operand_part (V64SImode, vsp, 0),
2859 		    vcc, gcn_gen_undef (V64SImode), exec));
2860 	emit_insn (gen_addcv64si3_exec
2861 		   (gcn_operand_part (V64SImode, vsp, 1),
2862 		    gcn_operand_part (V64SImode, vsp, 1),
2863 		    const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2864 
2865 	offset += size;
2866       }
2867 
2868   rtx move_vectors = get_insns ();
2869   end_sequence ();
2870 
2871   if (prologue)
2872     {
2873       emit_insn (move_scalars);
2874       emit_insn (move_vectors);
2875     }
2876   else
2877     {
2878       emit_insn (move_vectors);
2879       emit_insn (move_scalars);
2880     }
2881 }
2882 
2883 /* Generate prologue.  Called from gen_prologue during pro_and_epilogue pass.
2884 
2885    For a non-kernel function, the stack layout looks like this (interim),
2886    growing *upwards*:
2887 
2888  hi | + ...
2889     |__________________| <-- current SP
2890     | outgoing args    |
2891     |__________________|
2892     | (alloca space)   |
2893     |__________________|
2894     | local vars       |
2895     |__________________| <-- FP/hard FP
2896     | callee-save regs |
2897     |__________________| <-- soft arg pointer
2898     | pretend args     |
2899     |__________________| <-- incoming SP
2900     | incoming args    |
2901  lo |..................|
2902 
2903    This implies arguments (beyond the first N in registers) must grow
2904    downwards (as, apparently, PA has them do).
2905 
2906    For a kernel function we have the simpler:
2907 
2908  hi | + ...
2909     |__________________| <-- current SP
2910     | outgoing args    |
2911     |__________________|
2912     | (alloca space)   |
2913     |__________________|
2914     | local vars       |
2915  lo |__________________| <-- FP/hard FP
2916 
2917 */
2918 
2919 void
2920 gcn_expand_prologue ()
2921 {
2922   machine_function *offsets = gcn_compute_frame_offsets ();
2923 
2924   if (!cfun || !cfun->machine || cfun->machine->normal_function)
2925     {
2926       rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2927       rtx sp_hi = gcn_operand_part (Pmode, sp, 1);
2928       rtx sp_lo = gcn_operand_part (Pmode, sp, 0);
2929       rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2930       rtx fp_hi = gcn_operand_part (Pmode, fp, 1);
2931       rtx fp_lo = gcn_operand_part (Pmode, fp, 0);
2932 
2933       start_sequence ();
2934 
2935       if (offsets->pretend_size > 0)
2936 	{
2937 	  /* FIXME: Do the actual saving of register pretend args to the stack.
2938 	     Register order needs consideration.  */
2939 	}
2940 
2941       /* Save callee-save regs.  */
2942       move_callee_saved_registers (sp, offsets, true);
2943 
2944       HOST_WIDE_INT sp_adjust = offsets->pretend_size
2945 	+ offsets->callee_saves
2946 	+ offsets->local_vars + offsets->outgoing_args_size;
2947       if (sp_adjust > 0)
2948 	{
2949 	  /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2950 	     we use a split add explicitly, and specify the DImode add in
2951 	     the note.  */
2952 	  rtx scc = gen_rtx_REG (BImode, SCC_REG);
2953 	  rtx adjustment = gen_int_mode (sp_adjust, SImode);
2954 	  rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo,
2955 							 adjustment, scc));
2956 	  if (!offsets->need_frame_pointer)
2957 	    {
2958 	      RTX_FRAME_RELATED_P (insn) = 1;
2959 	      add_reg_note (insn, REG_FRAME_RELATED_EXPR,
2960 			    gen_rtx_SET (sp,
2961 					 gen_rtx_PLUS (DImode, sp,
2962 						       adjustment)));
2963 	    }
2964 	  emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc));
2965 	}
2966 
2967       if (offsets->need_frame_pointer)
2968 	{
2969 	  /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2970 	     we use a split add explicitly, and specify the DImode add in
2971 	     the note.  */
2972 	  rtx scc = gen_rtx_REG (BImode, SCC_REG);
2973 	  int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size);
2974 	  rtx adjustment = gen_int_mode (fp_adjust, SImode);
2975 	  rtx insn = emit_insn (gen_addsi3_scalar_carry(fp_lo, sp_lo,
2976 							adjustment, scc));
2977 	  emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi,
2978 					 (fp_adjust < 0 ? GEN_INT (-1)
2979 					  : const0_rtx),
2980 					 scc, scc));
2981 
2982 	  /* Set the CFA to the entry stack address, as an offset from the
2983 	     frame pointer.  This is preferred because the frame pointer is
2984 	     saved in each frame, whereas the stack pointer is not.  */
2985 	  RTX_FRAME_RELATED_P (insn) = 1;
2986 	  add_reg_note (insn, REG_CFA_DEF_CFA,
2987 			gen_rtx_PLUS (DImode, fp,
2988 				      GEN_INT (-(offsets->pretend_size
2989 						 + offsets->callee_saves))));
2990 	}
2991 
2992       rtx_insn *seq = get_insns ();
2993       end_sequence ();
2994 
2995       emit_insn (seq);
2996     }
2997   else
2998     {
2999       rtx wave_offset = gen_rtx_REG (SImode,
3000 				     cfun->machine->args.
3001 				     reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
3002 
3003       if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
3004 	{
3005 	  rtx fs_init_lo =
3006 	    gen_rtx_REG (SImode,
3007 			 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
3008 	  rtx fs_init_hi =
3009 	    gen_rtx_REG (SImode,
3010 			 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
3011 	  rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
3012 	  rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
3013 
3014 	  /*rtx queue = gen_rtx_REG(DImode,
3015 				  cfun->machine->args.reg[QUEUE_PTR_ARG]);
3016 	  rtx aperture = gen_rtx_MEM (SImode,
3017 				      gen_rtx_PLUS (DImode, queue,
3018 						    gen_int_mode (68, SImode)));
3019 	  set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
3020 
3021 	  /* Set up flat_scratch.  */
3022 	  emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
3023 	  emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
3024 				      gen_int_mode (8, SImode)));
3025 	  emit_move_insn (fs_reg_lo, fs_init_hi);
3026 	}
3027 
3028       /* Set up frame pointer and stack pointer.  */
3029       rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
3030       rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4);
3031       rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0);
3032       rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
3033       rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
3034       rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
3035 
3036       HOST_WIDE_INT sp_adjust = (offsets->local_vars
3037 				 + offsets->outgoing_args_size);
3038 
3039       /* Initialise FP and SP from the buffer descriptor in s[0:3].  */
3040       emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
3041       emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
3042 				 gen_int_mode (0xffff, SImode)));
3043       rtx scc = gen_rtx_REG (BImode, SCC_REG);
3044       emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
3045       emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
3046 
3047       /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we use
3048 	 a split add explicitly, and specify the DImode add in the note.
3049          The DWARF info expects that the callee-save data is in the frame,
3050          even though it isn't (because this is the entry point), so we
3051          make a notional adjustment to the DWARF frame offset here.  */
3052       rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves,
3053 					 DImode);
3054       rtx insn;
3055       if (sp_adjust > 0)
3056 	{
3057 	  rtx scc = gen_rtx_REG (BImode, SCC_REG);
3058 	  rtx adjustment = gen_int_mode (sp_adjust, DImode);
3059 	  insn = emit_insn (gen_addsi3_scalar_carry(sp_lo, fp_lo, adjustment,
3060 						    scc));
3061 	  emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc));
3062 	}
3063       else
3064 	insn = emit_move_insn (sp, fp);
3065       RTX_FRAME_RELATED_P (insn) = 1;
3066       add_reg_note (insn, REG_FRAME_RELATED_EXPR,
3067 		    gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp,
3068 						   dbg_adjustment)));
3069 
3070       if (offsets->need_frame_pointer)
3071 	{
3072 	  /* Set the CFA to the entry stack address, as an offset from the
3073 	     frame pointer.  This is necessary when alloca is used, and
3074 	     harmless otherwise.  */
3075 	  rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode);
3076 	  add_reg_note (insn, REG_CFA_DEF_CFA,
3077 			gen_rtx_PLUS (DImode, fp, neg_adjust));
3078 	}
3079 
3080       /* Make sure the flat scratch reg doesn't get optimised away.  */
3081       emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
3082     }
3083 
3084   /* Ensure that the scheduler doesn't do anything unexpected.  */
3085   emit_insn (gen_blockage ());
3086 
3087   /* m0 is initialized for the usual LDS DS and FLAT memory case.
3088      The low-part is the address of the topmost addressable byte, which is
3089      size-1.  The high-part is an offset and should be zero.  */
3090   emit_move_insn (gen_rtx_REG (SImode, M0_REG),
3091 		  gen_int_mode (LDS_SIZE, SImode));
3092 
3093   emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
3094 
3095   if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
3096     {
3097       /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel.  */
3098       rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3099       emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
3100 						  "gomp_gcn_enter_kernel"));
3101       emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3102     }
3103 }
3104 
3105 /* Generate epilogue.  Called from gen_epilogue during pro_and_epilogue pass.
3106 
3107    See gcn_expand_prologue for stack details.  */
3108 
3109 void
3110 gcn_expand_epilogue (void)
3111 {
3112   /* Ensure that the scheduler doesn't do anything unexpected.  */
3113   emit_insn (gen_blockage ());
3114 
3115   if (!cfun || !cfun->machine || cfun->machine->normal_function)
3116     {
3117       machine_function *offsets = gcn_compute_frame_offsets ();
3118       rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
3119       rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
3120 
3121       HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
3122 
3123       if (offsets->need_frame_pointer)
3124 	{
3125 	  /* Restore old SP from the frame pointer.  */
3126 	  if (sp_adjust > 0)
3127 	    emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
3128 	  else
3129 	    emit_move_insn (sp, fp);
3130 	}
3131       else
3132 	{
3133 	  /* Restore old SP from current SP.  */
3134 	  sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
3135 
3136 	  if (sp_adjust > 0)
3137 	    emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
3138 	}
3139 
3140       move_callee_saved_registers (sp, offsets, false);
3141 
3142       /* There's no explicit use of the link register on the return insn.  Emit
3143          one here instead.  */
3144       if (offsets->lr_needs_saving)
3145 	emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
3146 
3147       /* Similar for frame pointer.  */
3148       if (offsets->need_frame_pointer)
3149 	emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
3150     }
3151   else if (flag_openmp)
3152     {
3153       /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel.  */
3154       rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3155       emit_move_insn (fn_reg,
3156 		      gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
3157       emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3158     }
3159   else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
3160     {
3161       /* Assume that an exit value compatible with gcn-run is expected.
3162          That is, the third input parameter is an int*.
3163 
3164          We can't allocate any new registers, but the kernarg_reg is
3165          dead after this, so we'll use that.  */
3166       rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
3167 				     [KERNARG_SEGMENT_PTR_ARG]);
3168       rtx retptr_mem = gen_rtx_MEM (DImode,
3169 				    gen_rtx_PLUS (DImode, kernarg_reg,
3170 						  GEN_INT (16)));
3171       set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
3172       emit_move_insn (kernarg_reg, retptr_mem);
3173 
3174       rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
3175       set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
3176       emit_move_insn (retval_mem,
3177 		      gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
3178     }
3179 
3180   emit_jump_insn (gen_gcn_return ());
3181 }
3182 
3183 /* Implement TARGET_FRAME_POINTER_REQUIRED.
3184 
3185    Return true if the frame pointer should not be eliminated.  */
3186 
3187 bool
3188 gcn_frame_pointer_rqd (void)
3189 {
3190   /* GDB needs the frame pointer in order to unwind properly,
3191      but that's not important for the entry point, unless alloca is used.
3192      It's not important for code execution, so we should respect the
3193      -fomit-frame-pointer flag.  */
3194   return (!flag_omit_frame_pointer
3195 	  && cfun
3196 	  && (cfun->calls_alloca
3197 	      || (cfun->machine && cfun->machine->normal_function)));
3198 }
3199 
3200 /* Implement TARGET_CAN_ELIMINATE.
3201 
3202    Return true if the compiler is allowed to try to replace register number
3203    FROM_REG with register number TO_REG.
3204 
3205    FIXME: is the default "true" not enough? Should this be a negative set?  */
3206 
3207 bool
3208 gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
3209 {
3210   return (to_reg == HARD_FRAME_POINTER_REGNUM
3211 	  || to_reg == STACK_POINTER_REGNUM);
3212 }
3213 
3214 /* Implement INITIAL_ELIMINATION_OFFSET.
3215 
3216    Returns the initial difference between the specified pair of registers, in
3217    terms of stack position.  */
3218 
3219 HOST_WIDE_INT
3220 gcn_initial_elimination_offset (int from, int to)
3221 {
3222   machine_function *offsets = gcn_compute_frame_offsets ();
3223 
3224   switch (from)
3225     {
3226     case ARG_POINTER_REGNUM:
3227       if (to == STACK_POINTER_REGNUM)
3228 	return -(offsets->callee_saves + offsets->local_vars
3229 		 + offsets->outgoing_args_size);
3230       else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
3231 	return -offsets->callee_saves;
3232       else
3233 	gcc_unreachable ();
3234       break;
3235 
3236     case FRAME_POINTER_REGNUM:
3237       if (to == STACK_POINTER_REGNUM)
3238 	return -(offsets->local_vars + offsets->outgoing_args_size);
3239       else if (to == HARD_FRAME_POINTER_REGNUM)
3240 	return 0;
3241       else
3242 	gcc_unreachable ();
3243       break;
3244 
3245     default:
3246       gcc_unreachable ();
3247     }
3248 }
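/* Worked example (illustrative figures): with callee_saves = 16,
   local_vars = 32 and outgoing_args_size = 64, eliminating the argument
   pointer to the stack pointer yields -(16 + 32 + 64) = -112, while
   eliminating it to the (hard) frame pointer yields -16, matching the
   stack layout pictured above gcn_expand_prologue.  */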
3249 
3250 /* Implement HARD_REGNO_RENAME_OK.
3251 
3252    Return true if it is permissible to rename a hard register from
3253    FROM_REG to TO_REG.  */
3254 
3255 bool
3256 gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3257 {
3258   if (from_reg == SCC_REG
3259       || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3260       || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3261       || to_reg == SCC_REG
3262       || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3263       || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3264     return false;
3265 
3266   /* Allow the link register to be used if it was saved.  */
3267   if ((to_reg & ~1) == LINK_REGNUM)
3268     return !cfun || cfun->machine->lr_needs_saving;
3269 
3270   /* Allow the registers used for the static chain to be used if the chain is
3271      not in active use.  */
3272   if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3273     return !cfun
3274 	|| !(cfun->static_chain_decl
3275 	     && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3276 	     && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3277 
3278   return true;
3279 }
3280 
3281 /* Implement HARD_REGNO_CALLER_SAVE_MODE.
3282 
3283    Which mode is required for saving NREGS of a pseudo-register in
3284    call-clobbered hard register REGNO.  */
3285 
3286 machine_mode
3287 gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3288 				 machine_mode regmode)
3289 {
3290   machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
3291 
3292   if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3293     result = (nregs == 1 ? SImode : DImode);
3294 
3295   return result;
3296 }
3297 
3298 /* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3299 
3300    Output assembler code for a block containing the constant parts
3301    of a trampoline, leaving space for the variable parts.  */
3302 
3303 static void
3304 gcn_asm_trampoline_template (FILE *f)
3305 {
3306   /* The source operand of the move instructions must be a 32-bit
3307      constant following the opcode.  */
3308   asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3309   asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3310   asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3311   asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3312   asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3313 }
3314 
3315 /* Implement TARGET_TRAMPOLINE_INIT.
3316 
3317    Emit RTL insns to initialize the variable parts of a trampoline.
3318    FNDECL is the decl of the target address, M_TRAMP is a MEM for
3319    the trampoline, and CHAIN_VALUE is an RTX for the static chain
3320    to be passed to the target function.  */
3321 
3322 static void
3323 gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3324 {
3325   if (TARGET_GCN5_PLUS)
3326     sorry ("nested function trampolines not supported on GCN5 due to"
3327            " non-executable stacks");
3328 
3329   emit_block_move (m_tramp, assemble_trampoline_template (),
3330 		   GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3331 
3332   rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3333   rtx chain_value_reg = copy_to_reg (chain_value);
3334   rtx fnaddr_reg = copy_to_reg (fnaddr);
3335 
3336   for (int i = 0; i < 4; i++)
3337     {
3338       rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3339       rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3340       emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3341     }
3342 
3343   rtx tramp_addr = XEXP (m_tramp, 0);
3344   emit_insn (gen_clear_icache (tramp_addr,
3345 			       plus_constant (ptr_mode, tramp_addr,
3346 					      TRAMPOLINE_SIZE)));
3347 }
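
/* Illustrative sketch only: the template above emits four s_mov_b32
   instructions (8 bytes each: opcode plus a 32-bit literal) followed by
   s_setpc_b64, and gcn_trampoline_init patches the literals at byte
   offsets 4, 12, 20 and 28:

     offset  4: low  32 bits of the static chain value
     offset 12: high 32 bits of the static chain value
     offset 20: low  32 bits of the target function address
     offset 28: high 32 bits of the target function address

   before flushing the instruction cache over the whole trampoline.  */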
3348 
3349 /* }}}  */
3350 /* {{{ Miscellaneous.  */
3351 
3352 /* Implement TARGET_CANNOT_COPY_INSN_P.
3353 
3354    Return true if INSN must not be duplicated.  */
3355 
3356 static bool
3357 gcn_cannot_copy_insn_p (rtx_insn *insn)
3358 {
3359   if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3360     return true;
3361 
3362   return false;
3363 }
3364 
3365 /* Implement TARGET_DEBUG_UNWIND_INFO.
3366 
3367    Defines the mechanism that will be used for describing frame unwind
3368    information to the debugger.  */
3369 
3370 static enum unwind_info_type
3371 gcn_debug_unwind_info ()
3372 {
3373   return UI_DWARF2;
3374 }
3375 
3376 /* Determine if there is a suitable hardware conversion instruction.
3377    Used primarily by the machine description.  */
3378 
3379 bool
3380 gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3381 {
3382   if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3383     return false;
3384 
3385   if (VECTOR_MODE_P (from))
3386     {
3387       from = GET_MODE_INNER (from);
3388       to = GET_MODE_INNER (to);
3389     }
3390 
3391   switch (op)
3392     {
3393     case fix_trunc_cvt:
3394     case fixuns_trunc_cvt:
3395       if (GET_MODE_CLASS (from) != MODE_FLOAT
3396 	  || GET_MODE_CLASS (to) != MODE_INT)
3397 	return false;
3398       break;
3399     case float_cvt:
3400     case floatuns_cvt:
3401       if (GET_MODE_CLASS (from) != MODE_INT
3402 	  || GET_MODE_CLASS (to) != MODE_FLOAT)
3403 	return false;
3404       break;
3405     case extend_cvt:
3406       if (GET_MODE_CLASS (from) != MODE_FLOAT
3407 	  || GET_MODE_CLASS (to) != MODE_FLOAT
3408 	  || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3409 	return false;
3410       break;
3411     case trunc_cvt:
3412       if (GET_MODE_CLASS (from) != MODE_FLOAT
3413 	  || GET_MODE_CLASS (to) != MODE_FLOAT
3414 	  || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3415 	return false;
3416       break;
3417     }
3418 
3419   return ((to == HImode && from == HFmode)
3420 	  || (to == SImode && (from == SFmode || from == DFmode))
3421 	  || (to == HFmode && (from == HImode || from == SFmode))
3422 	  || (to == SFmode && (from == SImode || from == HFmode
3423 			       || from == DFmode))
3424 	  || (to == DFmode && (from == SImode || from == SFmode)));
3425 }
3426 
3427 /* Implement TARGET_EMUTLS_VAR_INIT.
3428 
3429    Disable emutls (gthr-gcn.h does not support it, yet).  */
3430 
3431 tree
3432 gcn_emutls_var_init (tree, tree decl, tree)
3433 {
3434   sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3435   return NULL_TREE;
3436 }
3437 
3438 /* }}}  */
3439 /* {{{ Costs.  */
3440 
3441 /* Implement TARGET_RTX_COSTS.
3442 
3443    Compute a (partial) cost for rtx X.  Return true if the complete
3444    cost has been computed, and false if subexpressions should be
3445    scanned.  In either case, *TOTAL contains the cost result.  */
3446 
3447 static bool
3448 gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3449 {
3450   enum rtx_code code = GET_CODE (x);
3451   switch (code)
3452     {
3453     case CONST:
3454     case CONST_DOUBLE:
3455     case CONST_VECTOR:
3456     case CONST_INT:
3457       if (gcn_inline_constant_p (x))
3458 	*total = 0;
3459       else if (code == CONST_INT
3460 	  && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3461 	*total = 1;
3462       else if (gcn_constant_p (x))
3463 	*total = 2;
3464       else
3465 	*total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3466       return true;
3467 
3468     case DIV:
3469       *total = 100;
3470       return false;
3471 
3472     default:
3473       *total = 3;
3474       return false;
3475     }
3476 }
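
/* Cost summary for the constants handled above (illustration only):
   inline constants cost 0, other CONST_INTs in the signed 16-bit range
   cost 1, remaining constants accepted by gcn_constant_p cost 2, and
   anything else costs 4, or 64 when it lives in a VGPR vector mode.  */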
3477 
3478 /* Implement TARGET_MEMORY_MOVE_COST.
3479 
3480    Return the cost of moving data of mode M between a
3481    register and memory.  A value of 2 is the default; this cost is
3482    relative to those in `REGISTER_MOVE_COST'.
3483 
3484    This function is used extensively by register_move_cost, which is used
3485    to build tables at startup.  Make it inline in this case.
3486    IN is true for loads from memory and false for stores to memory.
3487 
3488    If moving between registers and memory is more expensive than
3489    between two registers, you should define this macro to express the
3490    relative cost.
3491 
3492    Model also increased moving costs of QImode registers in non
3493    Q_REGS classes.  */
3494 
3495 #define LOAD_COST  32
3496 #define STORE_COST 32
3497 static int
3498 gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3499 {
3500   int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3501   switch (regclass)
3502     {
3503     case SCC_CONDITIONAL_REG:
3504     case VCCZ_CONDITIONAL_REG:
3505     case VCC_CONDITIONAL_REG:
3506     case EXECZ_CONDITIONAL_REG:
3507     case ALL_CONDITIONAL_REGS:
3508     case SGPR_REGS:
3509     case SGPR_EXEC_REGS:
3510     case EXEC_MASK_REG:
3511     case SGPR_VOP_SRC_REGS:
3512     case SGPR_MEM_SRC_REGS:
3513     case SGPR_SRC_REGS:
3514     case SGPR_DST_REGS:
3515     case GENERAL_REGS:
3516     case AFP_REGS:
3517       if (!in)
3518 	return (STORE_COST + 2) * nregs;
3519       return LOAD_COST * nregs;
3520     case VGPR_REGS:
3521       if (in)
3522 	return (LOAD_COST + 2) * nregs;
3523       return STORE_COST * nregs;
3524     case ALL_REGS:
3525     case ALL_GPR_REGS:
3526     case SRCDST_REGS:
3527       if (in)
3528 	return (LOAD_COST + 2) * nregs;
3529       return (STORE_COST + 2) * nregs;
3530     default:
3531       gcc_unreachable ();
3532     }
3533 }
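
/* Worked example (illustration only): a V64SImode value is 256 bytes, so
   nregs is 64; loading it into VGPR_REGS costs (LOAD_COST + 2) * 64 = 2176,
   while loading a single SImode value into an SGPR costs LOAD_COST = 32.  */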
3534 
3535 /* Implement TARGET_REGISTER_MOVE_COST.
3536 
3537    Return the cost of moving data from a register in class CLASS1 to
3538    one in class CLASS2.  Base value is 2.  */
3539 
3540 static int
3541 gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3542 {
3543   /* Increase cost of moving from and to vector registers.  While this is
3544      fast in hardware (I think), it has a hidden cost of setting up the exec
3545      flags.  */
3546   if ((src < VGPR_REGS) != (dst < VGPR_REGS))
3547     return 4;
3548   return 2;
3549 }
3550 
3551 /* }}}  */
3552 /* {{{ Builtins.  */
3553 
3554 /* Type codes used by GCN built-in definitions.  */
3555 
3556 enum gcn_builtin_type_index
3557 {
3558   GCN_BTI_END_OF_PARAMS,
3559 
3560   GCN_BTI_VOID,
3561   GCN_BTI_BOOL,
3562   GCN_BTI_INT,
3563   GCN_BTI_UINT,
3564   GCN_BTI_SIZE_T,
3565   GCN_BTI_LLINT,
3566   GCN_BTI_LLUINT,
3567   GCN_BTI_EXEC,
3568 
3569   GCN_BTI_SF,
3570   GCN_BTI_V64SI,
3571   GCN_BTI_V64SF,
3572   GCN_BTI_V64PTR,
3573   GCN_BTI_SIPTR,
3574   GCN_BTI_SFPTR,
3575   GCN_BTI_VOIDPTR,
3576 
3577   GCN_BTI_LDS_VOIDPTR,
3578 
3579   GCN_BTI_MAX
3580 };
3581 
3582 static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
3583 
3584 #define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
3585 #define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
3586 #define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
3587 #define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
3588 #define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
3589 #define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
3590 #define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
3591 #define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
3592 #define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
3593 
3594 static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
3595 				 struct gcn_builtin_description *);
3596 static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
3597 				     struct gcn_builtin_description *);
3598 
3599 struct gcn_builtin_description;
3600 typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
3601 				     struct gcn_builtin_description *);
3602 
3603 enum gcn_builtin_type
3604 {
3605   B_UNIMPLEMENTED,		/* Not implemented; emit a "sorry" diagnostic */
3606   B_INSN,			/* Emit a pattern */
3607   B_OVERLOAD			/* Placeholder for an overloaded function */
3608 };
3609 
3610 struct gcn_builtin_description
3611 {
3612   int fcode;
3613   int icode;
3614   const char *name;
3615   enum gcn_builtin_type type;
3616   /* The first element of parm is always the return type.  The rest
3617      are a zero terminated list of parameters.  */
3618   int parm[6];
3619   gcn_builtin_expander expander;
3620 };
3621 
3622 /* Read in the GCN builtins from gcn-builtins.def.  */
3623 
3624 extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
3625 
3626 struct gcn_builtin_description gcn_builtins[] = {
3627 #define DEF_BUILTIN(fcode, icode, name, type, params, expander)	\
3628   {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
3629 
3630 #define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name)			\
3631   {GCN_BUILTIN_ ## fcode ## _V64SI,					\
3632    CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN,		\
3633    {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI,		\
3634     GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},	\
3635   {GCN_BUILTIN_ ## fcode ## _V64SI_unspec,				\
3636    CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, 	\
3637    {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI,		\
3638     GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
3639 
3640 #include "gcn-builtins.def"
3641 #undef DEF_BUILTIN_BINOP_INT_FP
3642 #undef DEF_BUILTIN
3643 };
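
/* Illustration only: each DEF_BUILTIN line in gcn-builtins.def becomes one
   gcn_builtins[] initializer via the macro above, schematically

     {GCN_BUILTIN_<fcode>, <icode>, "<name>", <type>,
      {<return type>, <param types>..., GCN_BTI_END_OF_PARAMS}, <expander>}

   and DEF_BUILTIN_BINOP_INT_FP additionally emits a second "_unspec"
   variant that omits the trailing vector-merge operand.  */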
3644 
3645 static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
3646 
3647 /* Implement TARGET_BUILTIN_DECL.
3648 
3649    Return the GCN builtin for CODE.  */
3650 
3651 tree
3652 gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3653 {
3654   if (code >= GCN_BUILTIN_MAX)
3655     return error_mark_node;
3656 
3657   return gcn_builtin_decls[code];
3658 }
3659 
3660 /* Helper function for gcn_init_builtins.  */
3661 
3662 static void
3663 gcn_init_builtin_types (void)
3664 {
3665   gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
3666   gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
3667   gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
3668   gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
3669   gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
3670   gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
3671   gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
3672 
3673   exec_type_node = unsigned_intDI_type_node;
3674   sf_type_node = float32_type_node;
3675   v64si_type_node = build_vector_type (intSI_type_node, 64);
3676   v64sf_type_node = build_vector_type (float_type_node, 64);
3677   v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
3678 					/*build_pointer_type
3679 					  (integer_type_node) */
3680 					, 64);
3681   tree tmp = build_distinct_type_copy (intSI_type_node);
3682   TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3683   siptr_type_node = build_pointer_type (tmp);
3684 
3685   tmp = build_distinct_type_copy (float_type_node);
3686   TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3687   sfptr_type_node = build_pointer_type (tmp);
3688 
3689   tmp = build_distinct_type_copy (void_type_node);
3690   TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3691   voidptr_type_node = build_pointer_type (tmp);
3692 
3693   tmp = build_distinct_type_copy (void_type_node);
3694   TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
3695   gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
3696 }
3697 
3698 /* Implement TARGET_INIT_BUILTINS.
3699 
3700    Set up all builtin functions for this target.  */
3701 
3702 static void
3703 gcn_init_builtins (void)
3704 {
3705   gcn_init_builtin_types ();
3706 
3707   struct gcn_builtin_description *d;
3708   unsigned int i;
3709   for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
3710     {
3711       tree p;
3712       char name[64];		/* build_function will make a copy.  */
3713       int parm;
3714 
3715       /* FIXME: Is this necessary/useful? */
3716       if (d->name == 0)
3717 	continue;
3718 
3719       /* Find last parm.  */
3720       for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
3721 	;
3722 
3723       p = void_list_node;
3724       while (parm > 1)
3725 	p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
3726 
3727       p = build_function_type (gcn_builtin_types[d->parm[0]], p);
3728 
3729       sprintf (name, "__builtin_gcn_%s", d->name);
3730       gcn_builtin_decls[i]
3731 	= add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
3732 
3733       /* These builtins don't throw.  */
3734       TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
3735     }
3736 
3737   /* These builtins need to take/return an LDS pointer: override the generic
3738      versions here.  */
3739 
3740   set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
3741 		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
3742 
3743   set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
3744 		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
3745 		    false);
3746 
3747   set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
3748 		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
3749 		    false);
3750 
3751   set_builtin_decl (BUILT_IN_GOACC_BARRIER,
3752 		    gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
3753 }
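
/* Illustration only: every table entry is registered under the name
   "__builtin_gcn_" followed by its d->name field, so an entry named
   (hypothetically) "sqrtf" would be callable from source code as
   __builtin_gcn_sqrtf.  The OpenACC entries additionally replace the
   generic BUILT_IN_GOACC_* decls so that they take LDS pointers.  */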
3754 
3755 /* Implement TARGET_INIT_LIBFUNCS.  */
3756 
3757 static void
3758 gcn_init_libfuncs (void)
3759 {
3760   /* BITS_PER_WORD * 2 is only 64 bits, which causes
3761      optabs-libfuncs.cc:gen_int_libfunc to omit TImode (i.e. 128-bit)
3762      libcalls that we need to support operations for that type.  Initialise
3763      them here instead.  */
3764   set_optab_libfunc (udiv_optab, TImode, "__udivti3");
3765   set_optab_libfunc (umod_optab, TImode, "__umodti3");
3766   set_optab_libfunc (sdiv_optab, TImode, "__divti3");
3767   set_optab_libfunc (smod_optab, TImode, "__modti3");
3768   set_optab_libfunc (smul_optab, TImode, "__multi3");
3769   set_optab_libfunc (addv_optab, TImode, "__addvti3");
3770   set_optab_libfunc (subv_optab, TImode, "__subvti3");
3771   set_optab_libfunc (negv_optab, TImode, "__negvti2");
3772   set_optab_libfunc (absv_optab, TImode, "__absvti2");
3773   set_optab_libfunc (smulv_optab, TImode, "__mulvti3");
3774   set_optab_libfunc (ffs_optab, TImode, "__ffsti2");
3775   set_optab_libfunc (clz_optab, TImode, "__clzti2");
3776   set_optab_libfunc (ctz_optab, TImode, "__ctzti2");
3777   set_optab_libfunc (clrsb_optab, TImode, "__clrsbti2");
3778   set_optab_libfunc (popcount_optab, TImode, "__popcountti2");
3779   set_optab_libfunc (parity_optab, TImode, "__parityti2");
3780   set_optab_libfunc (bswap_optab, TImode, "__bswapti2");
3781 }
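
/* Illustration only: with the libfuncs registered above, 128-bit integer
   code such as

     __int128 quot (__int128 a, __int128 b) { return a / b; }

   compiles to a call to __divti3 rather than failing for lack of a TImode
   division pattern (assuming libgcc provides the routine).  */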
3782 
3783 /* Expand the CMP_SWAP GCN builtins.  We have our own versions that do
3784    not require taking the address of any object, other than the memory
3785    cell being operated on.
3786 
3787    Helper function for gcn_expand_builtin_1.  */
3788 
3789 static rtx
3790 gcn_expand_cmp_swap (tree exp, rtx target)
3791 {
3792   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3793   addr_space_t as
3794     = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
3795   machine_mode as_mode = gcn_addr_space_address_mode (as);
3796 
3797   if (!target)
3798     target = gen_reg_rtx (mode);
3799 
3800   rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
3801 			  NULL_RTX, as_mode, EXPAND_NORMAL);
3802   rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3803 			 NULL_RTX, mode, EXPAND_NORMAL);
3804   rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3805 			 NULL_RTX, mode, EXPAND_NORMAL);
3806   rtx pat;
3807 
3808   rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
3809   set_mem_addr_space (mem, as);
3810 
3811   if (!REG_P (cmp))
3812     cmp = copy_to_mode_reg (mode, cmp);
3813   if (!REG_P (src))
3814     src = copy_to_mode_reg (mode, src);
3815 
3816   if (mode == SImode)
3817     pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
3818   else
3819     pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
3820 
3821   emit_insn (pat);
3822 
3823   return target;
3824 }
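
/* Illustration only: the builtin expanded above has the usual
   compare-and-swap semantics, roughly

     old = *ptr;
     if (old == cmp)
       *ptr = src;
     return old;

   performed atomically on the memory cell, with the address space taken
   from the pointer argument's type.  */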
3825 
3826 /* Expand many different builtins.
3827 
3828    Intended for use in gcn-builtins.def.  */
3829 
3830 static rtx
3831 gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
3832 		      machine_mode /*mode */ , int ignore,
3833 		      struct gcn_builtin_description *)
3834 {
3835   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3836   switch (DECL_MD_FUNCTION_CODE (fndecl))
3837     {
3838     case GCN_BUILTIN_FLAT_LOAD_INT32:
3839       {
3840 	if (ignore)
3841 	  return target;
3842 	/*rtx exec = */
3843 	force_reg (DImode,
3844 		   expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3845 				EXPAND_NORMAL));
3846 	/*rtx ptr = */
3847 	force_reg (V64DImode,
3848 		   expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
3849 				EXPAND_NORMAL));
3850 	/*emit_insn (gen_vector_flat_loadv64si
3851 		     (target, gcn_gen_undef (V64SImode), ptr, exec)); */
3852 	return target;
3853       }
3854     case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
3855     case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
3856       {
3857 	if (ignore)
3858 	  return target;
3859 	rtx exec = force_reg (DImode,
3860 			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3861 					   DImode,
3862 					   EXPAND_NORMAL));
3863 	rtx ptr = force_reg (DImode,
3864 			     expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3865 					  V64DImode,
3866 					  EXPAND_NORMAL));
3867 	rtx offsets = force_reg (V64SImode,
3868 				 expand_expr (CALL_EXPR_ARG (exp, 2),
3869 					      NULL_RTX, V64DImode,
3870 					      EXPAND_NORMAL));
3871 	rtx addrs = gen_reg_rtx (V64DImode);
3872 	rtx tmp = gen_reg_rtx (V64SImode);
3873 	emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3874 					  GEN_INT (2),
3875 					  gcn_gen_undef (V64SImode), exec));
3876 	emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3877 						 gcn_gen_undef (V64DImode),
3878 						 exec));
3879 	rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
3880 	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3881 	/* FIXME: set attributes.  */
3882 	emit_insn (gen_mov_with_exec (target, mem, exec));
3883 	return target;
3884       }
3885     case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
3886     case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
3887       {
3888 	rtx exec = force_reg (DImode,
3889 			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3890 					   DImode,
3891 					   EXPAND_NORMAL));
3892 	rtx ptr = force_reg (DImode,
3893 			     expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3894 					  V64DImode,
3895 					  EXPAND_NORMAL));
3896 	rtx offsets = force_reg (V64SImode,
3897 				 expand_expr (CALL_EXPR_ARG (exp, 2),
3898 					      NULL_RTX, V64DImode,
3899 					      EXPAND_NORMAL));
3900 	machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3901 								       3)));
3902 	rtx val = force_reg (vmode,
3903 			     expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3904 					  vmode,
3905 					  EXPAND_NORMAL));
3906 	rtx addrs = gen_reg_rtx (V64DImode);
3907 	rtx tmp = gen_reg_rtx (V64SImode);
3908 	emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3909 					  GEN_INT (2),
3910 					  gcn_gen_undef (V64SImode), exec));
3911 	emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3912 						 gcn_gen_undef (V64DImode),
3913 						 exec));
3914 	rtx mem = gen_rtx_MEM (vmode, addrs);
3915 	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3916 	/* FIXME: set attributes.  */
3917 	emit_insn (gen_mov_with_exec (mem, val, exec));
3918 	return target;
3919       }
3920     case GCN_BUILTIN_SQRTVF:
3921       {
3922 	if (ignore)
3923 	  return target;
3924 	rtx exec = gcn_full_exec_reg ();
3925 	rtx arg = force_reg (V64SFmode,
3926 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3927 					  V64SFmode,
3928 					  EXPAND_NORMAL));
3929 	emit_insn (gen_sqrtv64sf2_exec
3930 		   (target, arg, gcn_gen_undef (V64SFmode), exec));
3931 	return target;
3932       }
3933     case GCN_BUILTIN_SQRTF:
3934       {
3935 	if (ignore)
3936 	  return target;
3937 	rtx arg = force_reg (SFmode,
3938 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3939 					  SFmode,
3940 					  EXPAND_NORMAL));
3941 	emit_insn (gen_sqrtsf2 (target, arg));
3942 	return target;
3943       }
3944     case GCN_BUILTIN_OMP_DIM_SIZE:
3945       {
3946 	if (ignore)
3947 	  return target;
3948 	emit_insn (gen_oacc_dim_size (target,
3949 				      expand_expr (CALL_EXPR_ARG (exp, 0),
3950 						   NULL_RTX, SImode,
3951 						   EXPAND_NORMAL)));
3952 	return target;
3953       }
3954     case GCN_BUILTIN_OMP_DIM_POS:
3955       {
3956 	if (ignore)
3957 	  return target;
3958 	emit_insn (gen_oacc_dim_pos (target,
3959 				     expand_expr (CALL_EXPR_ARG (exp, 0),
3960 						  NULL_RTX, SImode,
3961 						  EXPAND_NORMAL)));
3962 	return target;
3963       }
3964     case GCN_BUILTIN_CMP_SWAP:
3965     case GCN_BUILTIN_CMP_SWAPLL:
3966       return gcn_expand_cmp_swap (exp, target);
3967 
3968     case GCN_BUILTIN_ACC_SINGLE_START:
3969       {
3970 	if (ignore)
3971 	  return target;
3972 
3973 	rtx wavefront = gcn_oacc_dim_pos (1);
3974 	rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
3975 	rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
3976 	emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
3977 	return cc;
3978       }
3979 
3980     case GCN_BUILTIN_ACC_SINGLE_COPY_START:
3981       {
3982 	rtx blk = force_reg (SImode,
3983 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3984 					  SImode, EXPAND_NORMAL));
3985 	rtx wavefront = gcn_oacc_dim_pos (1);
3986 	rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
3987 	rtx not_zero = gen_label_rtx ();
3988 	emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
3989 	emit_move_insn (blk, const0_rtx);
3990 	emit_label (not_zero);
3991 	return blk;
3992       }
3993 
3994     case GCN_BUILTIN_ACC_SINGLE_COPY_END:
3995       return target;
3996 
3997     case GCN_BUILTIN_ACC_BARRIER:
3998       emit_insn (gen_gcn_wavefront_barrier ());
3999       return target;
4000 
4001     default:
4002       gcc_unreachable ();
4003     }
4004 }
4005 
4006 /* Expansion of simple arithmetic and bit binary operation builtins.
4007 
4008    Intended for use with gcn_builtins table.  */
4009 
4010 static rtx
4011 gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
4012 			  machine_mode /*mode */ , int ignore,
4013 			  struct gcn_builtin_description *d)
4014 {
4015   int icode = d->icode;
4016   if (ignore)
4017     return target;
4018 
4019   rtx exec = force_reg (DImode,
4020 			expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
4021 				     EXPAND_NORMAL));
4022 
4023   machine_mode m1 = insn_data[icode].operand[1].mode;
4024   rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
4025 			  EXPAND_NORMAL);
4026   if (!insn_data[icode].operand[1].predicate (arg1, m1))
4027     arg1 = force_reg (m1, arg1);
4028 
4029   machine_mode m2 = insn_data[icode].operand[2].mode;
4030   rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
4031 			  EXPAND_NORMAL);
4032   if (!insn_data[icode].operand[2].predicate (arg2, m2))
4033     arg2 = force_reg (m2, arg2);
4034 
4035   rtx arg_prev;
4036   if (call_expr_nargs (exp) == 4)
4037     {
4038       machine_mode m_prev = insn_data[icode].operand[4].mode;
4039       arg_prev = force_reg (m_prev,
4040 			    expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
4041 					 m_prev, EXPAND_NORMAL));
4042     }
4043   else
4044     arg_prev = gcn_gen_undef (GET_MODE (target));
4045 
4046   rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
4047   emit_insn (pat);
4048   return target;
4049 }
4050 
4051 /* Implement TARGET_EXPAND_BUILTIN.
4052 
4053    Expand an expression EXP that calls a built-in function, with result going
4054    to TARGET if that's convenient (and in mode MODE if that's convenient).
4055    SUBTARGET may be used as the target for computing one of EXP's operands.
4056    IGNORE is nonzero if the value is to be ignored.  */
4057 
4058 rtx
4059 gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
4060 		    int ignore)
4061 {
4062   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4063   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
4064   struct gcn_builtin_description *d;
4065 
4066   gcc_assert (fcode < GCN_BUILTIN_MAX);
4067   d = &gcn_builtins[fcode];
4068 
4069   if (d->type == B_UNIMPLEMENTED)
4070     sorry ("Builtin not implemented");
4071 
4072   return d->expander (exp, target, subtarget, mode, ignore, d);
4073 }
4074 
4075 /* }}}  */
4076 /* {{{ Vectorization.  */
4077 
4078 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.
4079 
4080    A vector mask is a value that holds one boolean result for every element in
4081    a vector.  */
4082 
4083 opt_machine_mode
4084 gcn_vectorize_get_mask_mode (machine_mode)
4085 {
4086   /* GCN uses a DImode bit-mask.  */
4087   return DImode;
4088 }
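
/* Illustration only: a V64SImode comparison therefore produces a DImode
   value in which bit N is the boolean result for lane N, matching the
   hardware EXEC/VCC mask layout.  */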
4089 
4090 /* Return an RTX that references a vector with the i-th lane containing
4091    PERM[i]*4.
4092 
4093    Helper function for gcn_vectorize_vec_perm_const.  */
4094 
4095 static rtx
4096 gcn_make_vec_perm_address (unsigned int *perm)
4097 {
4098   rtx x = gen_reg_rtx (V64SImode);
4099   emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
4100 
4101   /* Permutation addresses use byte addressing.  With each vector lane being
4102      4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
4103      so only set those.
4104 
4105      The permutation indices given to the vec_perm* patterns range from 0 to 2N-1 to
4106      select between lanes in two vectors, but as the DS_BPERMUTE* instructions
4107      only take one source vector, the most-significant bit can be ignored
4108      here.  Instead, we can use EXEC masking to select the relevant part of
4109      each source vector after they are permuted separately.  */
4110   uint64_t bit_mask = 1 << 2;
4111   for (int i = 2; i < 8; i++, bit_mask <<= 1)
4112     {
4113       uint64_t exec_mask = 0;
4114       uint64_t lane_mask = 1;
4115       for (int j = 0; j < 64; j++, lane_mask <<= 1)
4116 	if ((perm[j] * 4) & bit_mask)
4117 	  exec_mask |= lane_mask;
4118 
4119       if (exec_mask)
4120 	emit_insn (gen_addv64si3_exec (x, x,
4121 				       gcn_vec_constant (V64SImode,
4122 							 bit_mask),
4123 				       x, get_exec (exec_mask)));
4124     }
4125 
4126   return x;
4127 }
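
/* Worked example (illustration only): if perm[10] is 5, the required byte
   address for lane 10 is 5 * 4 = 20 = 0b10100, so lane 10 is included in
   the exec masks for bit 2 and bit 4 and ends up with the value 4 + 16 = 20
   in the address vector built above.  */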
4128 
4129 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
4130 
4131    Return true if permutation with SEL is possible.
4132 
4133    If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
4134    permutations.  */
4135 
4136 static bool
4137 gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
4138 			      rtx src0, rtx src1,
4139 			      const vec_perm_indices & sel)
4140 {
4141   unsigned int nelt = GET_MODE_NUNITS (vmode);
4142 
4143   gcc_assert (VECTOR_MODE_P (vmode));
4144   gcc_assert (nelt <= 64);
4145   gcc_assert (sel.length () == nelt);
4146 
4147   if (!dst)
4148     {
4149       /* All vector permutations are possible on this architecture,
4150          with varying degrees of efficiency depending on the permutation. */
4151       return true;
4152     }
4153 
4154   unsigned int perm[64];
4155   for (unsigned int i = 0; i < nelt; ++i)
4156     perm[i] = sel[i] & (2 * nelt - 1);
4157   for (unsigned int i = nelt; i < 64; ++i)
4158     perm[i] = 0;
4159 
4160   src0 = force_reg (vmode, src0);
4161   src1 = force_reg (vmode, src1);
4162 
4163   /* Make life a bit easier by swapping operands if necessary so that
4164      the first element always comes from src0.  */
4165   if (perm[0] >= nelt)
4166     {
4167       std::swap (src0, src1);
4168 
4169       for (unsigned int i = 0; i < nelt; ++i)
4170 	if (perm[i] < nelt)
4171 	  perm[i] += nelt;
4172 	else
4173 	  perm[i] -= nelt;
4174     }
4175 
4176   /* TODO: There are more efficient ways to implement certain permutations
4177      using ds_swizzle_b32 and/or DPP.  Test for and expand them here, before
4178      this more inefficient generic approach is used.  */
4179 
4180   int64_t src1_lanes = 0;
4181   int64_t lane_bit = 1;
4182 
4183   for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
4184     {
4185       /* Set the bits for lanes from src1.  */
4186       if (perm[i] >= nelt)
4187 	src1_lanes |= lane_bit;
4188     }
4189 
4190   rtx addr = gcn_make_vec_perm_address (perm);
4191   rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
4192 
4193   switch (vmode)
4194     {
4195     case E_V64QImode:
4196       ds_bpermute = gen_ds_bpermutev64qi;
4197       break;
4198     case E_V64HImode:
4199       ds_bpermute = gen_ds_bpermutev64hi;
4200       break;
4201     case E_V64SImode:
4202       ds_bpermute = gen_ds_bpermutev64si;
4203       break;
4204     case E_V64HFmode:
4205       ds_bpermute = gen_ds_bpermutev64hf;
4206       break;
4207     case E_V64SFmode:
4208       ds_bpermute = gen_ds_bpermutev64sf;
4209       break;
4210     case E_V64DImode:
4211       ds_bpermute = gen_ds_bpermutev64di;
4212       break;
4213     case E_V64DFmode:
4214       ds_bpermute = gen_ds_bpermutev64df;
4215       break;
4216     default:
4217       gcc_assert (false);
4218     }
4219 
4220   /* Load elements from src0 to dst.  */
4221   gcc_assert (~src1_lanes);
4222   emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
4223 
4224   /* Load elements from src1 to dst.  */
4225   if (src1_lanes)
4226     {
4227       /* Masking a lane masks both the destination and source lanes for
4228          DS_BPERMUTE, so we need to have all lanes enabled for the permute,
4229          then add an extra masked move to merge the results of permuting
4230          the two source vectors together.
4231        */
4232       rtx tmp = gen_reg_rtx (vmode);
4233       emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
4234       emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
4235     }
4236 
4237   return true;
4238 }
4239 
4240 /* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
4241 
4242    Return nonzero if vector MODE is supported with at least move
4243    instructions.  */
4244 
4245 static bool
4246 gcn_vector_mode_supported_p (machine_mode mode)
4247 {
4248   return (mode == V64QImode || mode == V64HImode
4249 	  || mode == V64SImode || mode == V64DImode
4250 	  || mode == V64SFmode || mode == V64DFmode);
4251 }
4252 
4253 /* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
4254 
4255    Enables autovectorization for all supported modes.  */
4256 
4257 static machine_mode
4258 gcn_vectorize_preferred_simd_mode (scalar_mode mode)
4259 {
4260   switch (mode)
4261     {
4262     case E_QImode:
4263       return V64QImode;
4264     case E_HImode:
4265       return V64HImode;
4266     case E_SImode:
4267       return V64SImode;
4268     case E_DImode:
4269       return V64DImode;
4270     case E_SFmode:
4271       return V64SFmode;
4272     case E_DFmode:
4273       return V64DFmode;
4274     default:
4275       return word_mode;
4276     }
4277 }
4278 
4279 /* Implement TARGET_VECTORIZE_RELATED_MODE.
4280 
4281    All GCN vectors are 64-lane, so this is simpler than other architectures.
4282    In particular, we do *not* want to match vector bit-size.  */
4283 
4284 static opt_machine_mode
4285 gcn_related_vector_mode (machine_mode ARG_UNUSED (vector_mode),
4286 			 scalar_mode element_mode, poly_uint64 nunits)
4287 {
4288   if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
4289     return VOIDmode;
4290 
4291   machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
4292   if (!VECTOR_MODE_P (pref_mode))
4293     return VOIDmode;
4294 
4295   return pref_mode;
4296 }
4297 
4298 /* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
4299 
4300    Returns the preferred alignment in bits for accesses to vectors of type TYPE
4301    in vectorized code. This might be less than or greater than the ABI-defined
4302    value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
4303    of a single element, in which case the vectorizer will not try to optimize
4304    for alignment.  */
4305 
4306 static poly_uint64
4307 gcn_preferred_vector_alignment (const_tree type)
4308 {
4309   return TYPE_ALIGN (TREE_TYPE (type));
4310 }
4311 
4312 /* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4313 
4314    Return true if the target supports misaligned vector store/load of a
4315    specific factor denoted in the misalignment parameter.  */
4316 
4317 static bool
4318 gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4319 					   const_tree type, int misalignment,
4320 					   bool is_packed)
4321 {
4322   if (is_packed)
4323     return false;
4324 
4325   /* If the misalignment is unknown, we should be able to handle the access
4326      so long as it is not to a member of a packed data structure.  */
4327   if (misalignment == -1)
4328     return true;
4329 
4330   /* Return true if the misalignment is a multiple of the natural alignment
4331      of the vector's element type.  This is probably always going to be
4332      true in practice, since we've already established that this isn't a
4333      packed access.  */
4334   return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4335 }
4336 
4337 /* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4338 
4339    Return true if vector alignment is reachable (by peeling N iterations) for
4340    the given scalar type TYPE.  */
4341 
4342 static bool
4343 gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4344 {
4345   /* Vectors which aren't in packed structures will not be less aligned than
4346      the natural alignment of their element type, so this is safe.  */
4347   return !is_packed;
4348 }
4349 
4350 /* Generate DPP instructions used for vector reductions.
4351 
4352    The opcode is given by INSN.
4353    The first operand of the operation is shifted right by SHIFT vector lanes.
4354    SHIFT must be a power of 2.  If SHIFT is 16, the 15th lane of each row is
4355    broadcast the next row (thereby acting like a shift of 16 for the end of
4356    each row).  If SHIFT is 32, lane 31 is broadcast to all the
4357    following lanes (thereby acting like a shift of 32 for lane 63).  */
4358 
4359 char *
4360 gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4361 			 int unspec, int shift)
4362 {
4363   static char buf[128];
4364   const char *dpp;
4365   const char *vcc_in = "";
4366   const char *vcc_out = "";
4367 
4368   /* Add the vcc operand if needed.  */
4369   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4370     {
4371       if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4372 	vcc_in = ", vcc";
4373 
4374       if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4375 	  || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4376 	vcc_out = ", vcc";
4377     }
4378 
4379   /* Add the DPP modifiers.  */
4380   switch (shift)
4381     {
4382     case 1:
4383       dpp = "row_shr:1 bound_ctrl:0";
4384       break;
4385     case 2:
4386       dpp = "row_shr:2 bound_ctrl:0";
4387       break;
4388     case 4:
4389       dpp = "row_shr:4 bank_mask:0xe";
4390       break;
4391     case 8:
4392       dpp = "row_shr:8 bank_mask:0xc";
4393       break;
4394     case 16:
4395       dpp = "row_bcast:15 row_mask:0xa";
4396       break;
4397     case 32:
4398       dpp = "row_bcast:31 row_mask:0xc";
4399       break;
4400     default:
4401       gcc_unreachable ();
4402     }
4403 
4404   if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
4405     sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
4406 	     insn, dpp, insn, dpp);
4407   else if (unspec == UNSPEC_MOV_DPP_SHR)
4408     sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
4409   else
4410     sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
4411 
4412   return buf;
4413 }
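
/* Illustration only: for a plus-with-carry reduction step with SHIFT == 1
   the routine above produces an assembler template along the lines of

     v_add_u32	%0, vcc, %1, %2 row_shr:1 bound_ctrl:0

   where the exact mnemonic comes from the INSN argument supplied by the
   machine description.  */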
4414 
4415 /* Generate vector reductions in terms of DPP instructions.
4416 
4417    The vector register SRC of mode MODE is reduced using the operation given
4418    by UNSPEC, and the scalar result is returned in lane 63 of a vector
4419    register.  */
4420 
4421 rtx
4422 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4423 {
4424   machine_mode orig_mode = mode;
4425   bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
4426 		      || unspec == UNSPEC_SMAX_DPP_SHR
4427 		      || unspec == UNSPEC_UMIN_DPP_SHR
4428 		      || unspec == UNSPEC_UMAX_DPP_SHR)
4429 		     && (mode == V64DImode
4430 			 || mode == V64DFmode))
4431 		    || (unspec == UNSPEC_PLUS_DPP_SHR
4432 			&& mode == V64DFmode));
4433   rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
4434 		   : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
4435 		   : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
4436 		   : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
4437 		   : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
4438 		   : UNKNOWN);
4439   bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
4440 		       || unspec == UNSPEC_SMAX_DPP_SHR
4441 		       || unspec == UNSPEC_UMIN_DPP_SHR
4442 		       || unspec == UNSPEC_UMAX_DPP_SHR)
4443 		      && (mode == V64QImode
4444 			  || mode == V64HImode));
4445   bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
4446 		    || unspec == UNSPEC_UMAX_DPP_SHR);
4447   bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4448 			&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4449 			&& (TARGET_GCN3 || mode == V64DImode);
4450 
4451   if (use_plus_carry)
4452     unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4453 
4454   if (use_extends)
4455     {
4456       rtx tmp = gen_reg_rtx (V64SImode);
4457       convert_move (tmp, src, unsignedp);
4458       src = tmp;
4459       mode = V64SImode;
4460     }
4461 
4462   /* Perform reduction by first performing the reduction operation on every
4463      pair of lanes, then on every pair of results from the previous
4464      iteration (thereby effectively reducing every 4 lanes) and so on until
4465      all lanes are reduced.  */
4466   rtx in, out = force_reg (mode, src);
4467   for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4468     {
4469       rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4470       in = out;
4471       out = gen_reg_rtx (mode);
4472 
4473       if (use_moves)
4474 	{
4475 	  rtx tmp = gen_reg_rtx (mode);
4476 	  emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
4477 	  emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
4478 	}
4479       else
4480 	{
4481 	  rtx insn = gen_rtx_SET (out,
4482 				  gen_rtx_UNSPEC (mode,
4483 						  gen_rtvec (3, in, in,
4484 							     shift_val),
4485 						  unspec));
4486 
4487 	  /* Add clobber for instructions that set the carry flags.  */
4488 	  if (use_plus_carry)
4489 	    {
4490 	      rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4491 					     gen_rtx_REG (DImode, VCC_REG));
4492 	      insn = gen_rtx_PARALLEL (VOIDmode,
4493 				       gen_rtvec (2, insn, clobber));
4494 	    }
4495 
4496 	  emit_insn (insn);
4497 	}
4498     }
4499 
4500   if (use_extends)
4501     {
4502       rtx tmp = gen_reg_rtx (orig_mode);
4503       convert_move (tmp, out, unsignedp);
4504       out = tmp;
4505     }
4506 
4507   return out;
4508 }
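
/* Illustration only: the loop above performs log2(64) = 6 DPP steps with
   shifts of 1, 2, 4, 8, 16 and 32 lanes, so for a PLUS reduction each lane
   accumulates the lanes below it and lane 63 holds the full reduction,
   which is why callers read the scalar result from lane 63.  */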
4509 
4510 /* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST.  */
4511 
4512 int
4513 gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4514 			tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4515 {
4516   /* Always vectorize.  */
4517   return 1;
4518 }
4519 
4520 /* }}}  */
4521 /* {{{ md_reorg pass.  */
4522 
4523 /* Identify VMEM instructions from their "type" attribute.  */
4524 
4525 static bool
4526 gcn_vmem_insn_p (attr_type type)
4527 {
4528   switch (type)
4529     {
4530     case TYPE_MUBUF:
4531     case TYPE_MTBUF:
4532     case TYPE_FLAT:
4533       return true;
4534     case TYPE_UNKNOWN:
4535     case TYPE_SOP1:
4536     case TYPE_SOP2:
4537     case TYPE_SOPK:
4538     case TYPE_SOPC:
4539     case TYPE_SOPP:
4540     case TYPE_SMEM:
4541     case TYPE_DS:
4542     case TYPE_VOP2:
4543     case TYPE_VOP1:
4544     case TYPE_VOPC:
4545     case TYPE_VOP3A:
4546     case TYPE_VOP3B:
4547     case TYPE_VOP_SDWA:
4548     case TYPE_VOP_DPP:
4549     case TYPE_MULT:
4550     case TYPE_VMULT:
4551       return false;
4552     }
4553   gcc_unreachable ();
4554   return false;
4555 }
4556 
4557 /* If INSN sets the EXEC register to a constant value, return the value,
4558    otherwise return zero.  */
4559 
4560 static int64_t
4561 gcn_insn_exec_value (rtx_insn *insn)
4562 {
4563   if (!NONDEBUG_INSN_P (insn))
4564     return 0;
4565 
4566   rtx pattern = PATTERN (insn);
4567 
4568   if (GET_CODE (pattern) == SET)
4569     {
4570       rtx dest = XEXP (pattern, 0);
4571       rtx src = XEXP (pattern, 1);
4572 
4573       if (GET_MODE (dest) == DImode
4574 	  && REG_P (dest) && REGNO (dest) == EXEC_REG
4575 	  && CONST_INT_P (src))
4576 	return INTVAL (src);
4577     }
4578 
4579   return 0;
4580 }
4581 
4582 /* Sets the EXEC register before INSN to the value that it had after
4583    LAST_EXEC_DEF.  The constant value of the EXEC register is returned if
4584    known, otherwise it returns zero.  */
4585 
4586 static int64_t
4587 gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4588 		  bool curr_exec_known, bool &last_exec_def_saved)
4589 {
4590   rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4591   rtx exec;
4592 
4593   int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4594 
4595   if (exec_value)
4596     {
4597       /* If the EXEC value is a constant and it happens to be the same as the
4598          current EXEC value, the restore can be skipped.  */
4599       if (curr_exec_known && exec_value == curr_exec)
4600 	return exec_value;
4601 
4602       exec = GEN_INT (exec_value);
4603     }
4604   else
4605     {
4606       /* If the EXEC value is not a constant, save it in a register after the
4607 	 point of definition.  */
4608       rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4609 
4610       if (!last_exec_def_saved)
4611 	{
4612 	  start_sequence ();
4613 	  emit_move_insn (exec_save_reg, exec_reg);
4614 	  rtx_insn *seq = get_insns ();
4615 	  end_sequence ();
4616 
4617 	  emit_insn_after (seq, last_exec_def);
4618 	  if (dump_file && (dump_flags & TDF_DETAILS))
4619 	    fprintf (dump_file, "Saving EXEC after insn %d.\n",
4620 		     INSN_UID (last_exec_def));
4621 
4622 	  last_exec_def_saved = true;
4623 	}
4624 
4625       exec = exec_save_reg;
4626     }
4627 
4628   /* Restore EXEC register before the usage.  */
4629   start_sequence ();
4630   emit_move_insn (exec_reg, exec);
4631   rtx_insn *seq = get_insns ();
4632   end_sequence ();
4633   emit_insn_before (seq, insn);
4634 
4635   if (dump_file && (dump_flags & TDF_DETAILS))
4636     {
4637       if (exec_value)
4638 	fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
4639 		 exec_value, INSN_UID (insn));
4640       else
4641 	fprintf (dump_file,
4642 		 "Restoring EXEC from saved value before insn %d.\n",
4643 		 INSN_UID (insn));
4644     }
4645 
4646   return exec_value;
4647 }
4648 
4649 /* Implement TARGET_MACHINE_DEPENDENT_REORG.
4650 
4651    Ensure that pipeline dependencies and lane masking are set correctly.  */
4652 
4653 static void
4654 gcn_md_reorg (void)
4655 {
4656   basic_block bb;
4657   rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4658   regset_head live;
4659 
4660   INIT_REG_SET (&live);
4661 
4662   compute_bb_for_insn ();
4663 
4664   if (!optimize)
4665     {
4666       split_all_insns ();
4667       if (dump_file && (dump_flags & TDF_DETAILS))
4668 	{
4669 	  fprintf (dump_file, "After split:\n");
4670 	  print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4671 	}
4672 
4673       /* Update data-flow information for split instructions.  */
4674       df_insn_rescan_all ();
4675     }
4676 
4677   df_live_add_problem ();
4678   df_live_set_all_dirty ();
4679   df_analyze ();
4680 
4681   /* This pass ensures that the EXEC register is set correctly, according
4682      to the "exec" attribute.  However, care must be taken so that the
4683      value that reaches explicit uses of the EXEC register remains the
4684      same as before.
4685    */
4686 
4687   FOR_EACH_BB_FN (bb, cfun)
4688     {
4689       if (dump_file && (dump_flags & TDF_DETAILS))
4690 	fprintf (dump_file, "BB %d:\n", bb->index);
4691 
4692       rtx_insn *insn, *curr;
4693       rtx_insn *last_exec_def = BB_HEAD (bb);
4694       bool last_exec_def_saved = false;
4695       bool curr_exec_explicit = true;
4696       bool curr_exec_known = true;
4697       int64_t curr_exec = 0;	/* 0 here means 'the value is that of EXEC
4698 				   after last_exec_def is executed'.  */
4699 
4700       bitmap live_in = DF_LR_IN (bb);
4701       bool exec_live_on_entry = false;
4702       if (bitmap_bit_p (live_in, EXEC_LO_REG)
4703 	  || bitmap_bit_p (live_in, EXEC_HI_REG))
4704 	{
4705 	  if (dump_file)
4706 	    fprintf (dump_file, "EXEC reg is live on entry to block %d\n",
4707 		     (int) bb->index);
4708 	  exec_live_on_entry = true;
4709 	}
4710 
4711       FOR_BB_INSNS_SAFE (bb, insn, curr)
4712 	{
4713 	  if (!NONDEBUG_INSN_P (insn))
4714 	    continue;
4715 
4716 	  if (GET_CODE (PATTERN (insn)) == USE
4717 	      || GET_CODE (PATTERN (insn)) == CLOBBER)
4718 	    continue;
4719 
4720 	  HARD_REG_SET defs, uses;
4721 	  CLEAR_HARD_REG_SET (defs);
4722 	  CLEAR_HARD_REG_SET (uses);
4723 	  note_stores (insn, record_hard_reg_sets, &defs);
4724 	  note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4725 
4726 	  bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4727 	  bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4728 	  bool exec_used = (hard_reg_set_intersect_p
4729 			    (uses, reg_class_contents[(int) EXEC_MASK_REG])
4730 			    || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4731 
4732 	  /* Check the instruction for implicit setting of EXEC via an
4733 	     attribute.  */
4734 	  attr_exec exec_attr = get_attr_exec (insn);
4735 	  int64_t new_exec;
4736 
4737 	  switch (exec_attr)
4738 	    {
4739 	    case EXEC_NONE:
4740 	      new_exec = 0;
4741 	      break;
4742 
4743 	    case EXEC_SINGLE:
4744 	      /* Memory accesses must be executed by lane 0 alone (EXEC == 1);
4745 		 other instructions merely need bit 0 of EXEC to be set.  */
4746 	      if (gcn_vmem_insn_p (get_attr_type (insn))
4747 		  || get_attr_type (insn) == TYPE_DS)
4748 		new_exec = 1;
4749 	      else
4750 		new_exec = curr_exec | 1;
4751 	      break;
4752 
4753 	    case EXEC_FULL:
4754 	      new_exec = -1;
4755 	      break;
4756 
4757 	    default:  /* Auto-detect what setting is appropriate.  */
4758 	      {
4759 	        new_exec = 0;
4760 
4761 		/* If EXEC is referenced explicitly then we don't need to do
4762 		   anything to set it, so we're done.  */
4763 		if (exec_used)
4764 		  break;
4765 
4766 		/* Scan the insn for VGPRs defs or uses.  The mode determines
4767 		   what kind of exec is needed.  */
4768 		subrtx_iterator::array_type array;
4769 		FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4770 		  {
4771 		    const_rtx x = *iter;
4772 		    if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4773 		      {
4774 			if (VECTOR_MODE_P (GET_MODE (x)))
4775 			  {
4776 			    new_exec = -1;
4777 			    break;
4778 			  }
4779 			else
4780 			  new_exec = 1;
4781 		      }
4782 		  }
4783 	        }
4784 	      break;
4785 	    }
4786 
4787 	  if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4788 	    {
4789 	      start_sequence ();
4790 	      emit_move_insn (exec_reg, GEN_INT (new_exec));
4791 	      rtx_insn *seq = get_insns ();
4792 	      end_sequence ();
4793 	      emit_insn_before (seq, insn);
4794 
4795 	      if (dump_file && (dump_flags & TDF_DETAILS))
4796 		fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
4797 			 new_exec, INSN_UID (insn));
4798 
4799 	      curr_exec = new_exec;
4800 	      curr_exec_explicit = false;
4801 	      curr_exec_known = true;
4802 	    }
4803 	  else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4804 	    {
4805 	      fprintf (dump_file, "Exec already is %ld before insn %d.\n",
4806 		       new_exec, INSN_UID (insn));
4807 	    }
4808 
4809 	  /* The state of the EXEC register is unknown after a
4810 	     function call.  */
4811 	  if (CALL_P (insn))
4812 	    curr_exec_known = false;
4813 
4814 	  /* Handle explicit uses of EXEC.  If the instruction is a partial
4815 	     explicit definition of EXEC, then treat it as an explicit use of
4816 	     EXEC as well.  */
4817 	  if (exec_used || exec_lo_def_p != exec_hi_def_p)
4818 	    {
4819 	      /* An instruction that explicitly uses EXEC should not also
4820 		 implicitly define it.  */
4821 	      gcc_assert (!exec_used || !new_exec);
4822 
4823 	      if (!curr_exec_known || !curr_exec_explicit)
4824 		{
4825 		  /* Restore the previous explicitly defined value.  */
4826 		  curr_exec = gcn_restore_exec (insn, last_exec_def,
4827 						curr_exec, curr_exec_known,
4828 						last_exec_def_saved);
4829 		  curr_exec_explicit = true;
4830 		  curr_exec_known = true;
4831 		}
4832 	    }
4833 
4834 	  /* Handle explicit definitions of EXEC.  */
4835 	  if (exec_lo_def_p || exec_hi_def_p)
4836 	    {
4837 	      last_exec_def = insn;
4838 	      last_exec_def_saved = false;
4839 	      curr_exec = gcn_insn_exec_value (insn);
4840 	      curr_exec_explicit = true;
4841 	      curr_exec_known = true;
4842 
4843 	      if (dump_file && (dump_flags & TDF_DETAILS))
4844 		fprintf (dump_file,
4845 			 "Found %s definition of EXEC at insn %d.\n",
4846 			 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4847 			 INSN_UID (insn));
4848 	    }
4849 
4850 	  exec_live_on_entry = false;
4851 	}
4852 
4853       COPY_REG_SET (&live, DF_LR_OUT (bb));
4854       df_simulate_initialize_backwards (bb, &live);
4855 
4856       /* If EXEC is live after the basic block, restore the value of EXEC
4857 	 at the end of the block.  */
4858       if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4859 	   || REGNO_REG_SET_P (&live, EXEC_HI_REG))
4860 	  && (!curr_exec_known || !curr_exec_explicit || exec_live_on_entry))
4861 	{
4862 	  rtx_insn *end_insn = BB_END (bb);
4863 
4864 	  /* If the instruction is not a jump instruction, do the restore
4865 	     after the last instruction in the basic block.  */
4866 	  if (NONJUMP_INSN_P (end_insn))
4867 	    end_insn = NEXT_INSN (end_insn);
4868 
4869 	  gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4870 			    curr_exec_known, last_exec_def_saved);
4871 	}
4872     }
4873 
4874   CLEAR_REG_SET (&live);
4875 
4876   /* "Manually Inserted Wait States (NOPs)."
4877 
4878      GCN hardware detects most kinds of register dependencies, but there
4879      are some exceptions documented in the ISA manual.  This pass
4880      detects the missed cases, and inserts the documented number of NOPs
4881      required for correct execution.  */
4882 
4883   const int max_waits = 5;
4884   struct ilist
4885   {
4886     rtx_insn *insn;
4887     attr_unit unit;
4888     attr_delayeduse delayeduse;
4889     HARD_REG_SET writes;
4890     HARD_REG_SET reads;
4891     int age;
4892   } back[max_waits];
4893   int oldest = 0;
4894   for (int i = 0; i < max_waits; i++)
4895     back[i].insn = NULL;
4896 
4897   rtx_insn *insn, *last_insn = NULL;
4898   for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4899     {
4900       if (!NONDEBUG_INSN_P (insn))
4901 	continue;
4902 
4903       if (GET_CODE (PATTERN (insn)) == USE
4904 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
4905 	continue;
4906 
4907       attr_type itype = get_attr_type (insn);
4908       attr_unit iunit = get_attr_unit (insn);
4909       attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
4910       HARD_REG_SET ireads, iwrites;
4911       CLEAR_HARD_REG_SET (ireads);
4912       CLEAR_HARD_REG_SET (iwrites);
4913       note_stores (insn, record_hard_reg_sets, &iwrites);
4914       note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
4915 
4916       /* Scan recent previous instructions for dependencies not handled in
4917          hardware.  */
4918       int nops_rqd = 0;
4919       for (int i = oldest; i < oldest + max_waits; i++)
4920 	{
4921 	  struct ilist *prev_insn = &back[i % max_waits];
4922 
4923 	  if (!prev_insn->insn)
4924 	    continue;
4925 
4926 	  /* VALU writes SGPR followed by VMEM reading the same SGPR
4927 	     requires 5 wait states.  */
4928 	  if ((prev_insn->age + nops_rqd) < 5
4929 	      && prev_insn->unit == UNIT_VECTOR
4930 	      && gcn_vmem_insn_p (itype))
4931 	    {
4932 	      HARD_REG_SET regs = prev_insn->writes & ireads;
4933 	      if (hard_reg_set_intersect_p
4934 		  (regs, reg_class_contents[(int) SGPR_REGS]))
4935 		nops_rqd = 5 - prev_insn->age;
4936 	    }
4937 
4938 	  /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
4939 	     requires 5 wait states.  */
4940 	  if ((prev_insn->age + nops_rqd) < 5
4941 	      && prev_insn->unit == UNIT_VECTOR
4942 	      && iunit == UNIT_VECTOR
4943 	      && ((hard_reg_set_intersect_p
4944 		   (prev_insn->writes,
4945 		    reg_class_contents[(int) EXEC_MASK_REG])
4946 		   && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
4947 		  ||
4948 		  (hard_reg_set_intersect_p
4949 		   (prev_insn->writes,
4950 		    reg_class_contents[(int) VCC_CONDITIONAL_REG])
4951 		   && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
4952 	    nops_rqd = 5 - prev_insn->age;
4953 
4954 	  /* VALU writes SGPR/VCC followed by v_{read,write}lane using
4955 	     SGPR/VCC as lane select requires 4 wait states.  */
4956 	  if ((prev_insn->age + nops_rqd) < 4
4957 	      && prev_insn->unit == UNIT_VECTOR
4958 	      && get_attr_laneselect (insn) == LANESELECT_YES)
4959 	    {
4960 	      HARD_REG_SET regs = prev_insn->writes & ireads;
4961 	      if (hard_reg_set_intersect_p
4962 		  (regs, reg_class_contents[(int) SGPR_REGS])
4963 		  || hard_reg_set_intersect_p
4964 		     (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
4965 		nops_rqd = 4 - prev_insn->age;
4966 	    }
4967 
4968 	  /* VALU writes VGPR followed by VALU_DPP reading that VGPR
4969 	     requires 2 wait states.  */
4970 	  if ((prev_insn->age + nops_rqd) < 2
4971 	      && prev_insn->unit == UNIT_VECTOR
4972 	      && itype == TYPE_VOP_DPP)
4973 	    {
4974 	      HARD_REG_SET regs = prev_insn->writes & ireads;
4975 	      if (hard_reg_set_intersect_p
4976 		  (regs, reg_class_contents[(int) VGPR_REGS]))
4977 		nops_rqd = 2 - prev_insn->age;
4978 	    }
4979 
4980 	  /* A store that requires its input registers not be overwritten
4981 	     by the following instruction.  */
4982 	  if ((prev_insn->age + nops_rqd) < 1
4983 	      && prev_insn->delayeduse == DELAYEDUSE_YES
4984 	      && ((hard_reg_set_intersect_p
4985 		   (prev_insn->reads, iwrites))))
4986 	    nops_rqd = 1 - prev_insn->age;
4987 	}
4988 
4989       /* Insert the required number of NOPs.  */
4990       for (int i = nops_rqd; i > 0; i--)
4991 	emit_insn_after (gen_nop (), last_insn);
4992 
4993       /* Age the previous instructions.  We can also ignore writes to
4994          registers subsequently overwritten.  */
4995       HARD_REG_SET written;
4996       CLEAR_HARD_REG_SET (written);
4997       for (int i = oldest + max_waits - 1; i > oldest; i--)
4998 	{
4999 	  struct ilist *prev_insn = &back[i % max_waits];
5000 
5001 	  /* Assume all instructions are equivalent to one "wait", the same
5002 	     as s_nop.  This is probably true for SALU, but not VALU (which
5003 	     may take longer), so this is not optimal.  However, AMD do
5004 	     not publish the cycle times for instructions.  */
5005 	  prev_insn->age += 1 + nops_rqd;
5006 
5007 	  written |= iwrites;
5008 	  prev_insn->writes &= ~written;
5009 	}
5010 
5011       /* Track the current instruction as a previous instruction.  */
5012       back[oldest].insn = insn;
5013       back[oldest].unit = iunit;
5014       back[oldest].delayeduse = idelayeduse;
5015       back[oldest].writes = iwrites;
5016       back[oldest].reads = ireads;
5017       back[oldest].age = 0;
5018       oldest = (oldest + 1) % max_waits;
5019 
5020       last_insn = insn;
5021     }
5022 }
5023 
5024 /* }}}  */
5025 /* {{{ OpenACC / OpenMP.  */
5026 
5027 #define GCN_DEFAULT_GANGS 0	/* Choose at runtime.  */
5028 #define GCN_DEFAULT_WORKERS 0	/* Choose at runtime.  */
5029 #define GCN_DEFAULT_VECTORS 1	/* Use autovectorization only, for now.  */
5030 
5031 /* Implement TARGET_GOACC_VALIDATE_DIMS.
5032 
5033    Check the launch dimensions provided for an OpenACC compute
5034    region, or routine.  */
5035 
5036 static bool
5037 gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
5038 			 unsigned /*used*/)
5039 {
5040   bool changed = false;
5041   const int max_workers = 16;
5042 
5043   /* The vector size must appear to be 64, to the user, unless this is a
5044      SEQ routine.  The real, internal value is always 1, which means use
5045      autovectorization, but the user should not see that.  */
5046   if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
5047       && dims[GOMP_DIM_VECTOR] >= 0)
5048     {
5049       if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
5050 	  && dims[GOMP_DIM_VECTOR] != 64)
5051 	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
5052 		    OPT_Wopenacc_dims,
5053 		    (dims[GOMP_DIM_VECTOR]
5054 		     ? G_("using %<vector_length (64)%>, ignoring %d")
5055 		     : G_("using %<vector_length (64)%>, "
5056 			  "ignoring runtime setting")),
5057 		    dims[GOMP_DIM_VECTOR]);
5058       dims[GOMP_DIM_VECTOR] = 1;
5059       changed = true;
5060     }
5061 
5062   /* Check that the number of workers is not too large.  */
5063   if (dims[GOMP_DIM_WORKER] > max_workers)
5064     {
5065       warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
5066 		  OPT_Wopenacc_dims,
5067 		  "using %<num_workers (%d)%>, ignoring %d",
5068 		  max_workers, dims[GOMP_DIM_WORKER]);
5069       dims[GOMP_DIM_WORKER] = max_workers;
5070       changed = true;
5071     }
5072 
5073   /* Set global defaults.  */
5074   if (!decl)
5075     {
5076       dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
5077       if (dims[GOMP_DIM_WORKER] < 0)
5078 	dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
5079       if (dims[GOMP_DIM_GANG] < 0)
5080 	dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
5081       changed = true;
5082     }
5083 
5084   return changed;
5085 }
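/* For example (a sketch of the effect of the checks above, not verbatim
   diagnostics): a region using "vector_length (128)" gets the
   "using vector_length (64)" warning and the vector dimension is stored
   internally as 1, while "num_workers (32)" is clamped to 16.  */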
5086 
5087 /* Helper function for oacc_dim_size instruction.
5088    Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass.  */
5089 
5090 rtx
5091 gcn_oacc_dim_size (int dim)
5092 {
5093   if (dim < 0 || dim > 2)
5094     error ("offload dimension out of range (%d)", dim);
5095 
5096   /* Vectors are a special case.  */
5097   if (dim == 2)
5098     return const1_rtx;		/* Think of this as 1 times 64.  */
5099 
5100   static int offset[] = {
5101     /* Offsets into dispatch packet.  */
5102     12,				/* X dim = Gang / Team / Work-group.  */
5103     20,				/* Z dim = Worker / Thread / Wavefront.  */
5104     16				/* Y dim = Vector / SIMD / Work-item.  */
5105   };
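  /* These are believed to be the offsets of the grid_size_x/y/z fields
     of the HSA AQL kernel dispatch packet (the 16-bit header, setup word,
     three 16-bit work-group sizes and a reserved field precede them).  */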
5106   rtx addr = gen_rtx_PLUS (DImode,
5107 			   gen_rtx_REG (DImode,
5108 					cfun->machine->args.
5109 					reg[DISPATCH_PTR_ARG]),
5110 			   GEN_INT (offset[dim]));
5111   return gen_rtx_MEM (SImode, addr);
5112 }
5113 
5114 /* Helper function for oacc_dim_pos instruction.
5115    Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass.  */
5116 
5117 rtx
5118 gcn_oacc_dim_pos (int dim)
5119 {
5120   if (dim < 0 || dim > 2)
5121     error ("offload dimension out of range (%d)", dim);
5122 
5123   static const int reg[] = {
5124     WORKGROUP_ID_X_ARG,		/* Gang / Team / Work-group.  */
5125     WORK_ITEM_ID_Z_ARG,		/* Worker / Thread / Wavefront.  */
5126     WORK_ITEM_ID_Y_ARG		/* Vector / SIMD / Work-item.  */
5127   };
5128 
5129   int reg_num = cfun->machine->args.reg[reg[dim]];
5130 
5131   /* The information must have been requested by the kernel.  */
5132   gcc_assert (reg_num >= 0);
5133 
5134   return gen_rtx_REG (SImode, reg_num);
5135 }
5136 
5137 /* Implement TARGET_GOACC_FORK_JOIN.  */
5138 
5139 static bool
5140 gcn_fork_join (gcall *call, const int dims[], bool is_fork)
5141 {
5142   tree arg = gimple_call_arg (call, 2);
5143   unsigned axis = TREE_INT_CST_LOW (arg);
5144 
5145   if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
5146     return true;
5147 
5148   return false;
5149 }
5150 
5151 /* Implement ???????
5152    FIXME make this a real hook.
5153 
5154    Adjust FNDECL such that options inherited from the host compiler
5155    are made appropriate for the accelerator compiler.  */
5156 
5157 void
5158 gcn_fixup_accel_lto_options (tree fndecl)
5159 {
5160   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5161   if (!func_optimize)
5162     return;
5163 
5164   tree old_optimize
5165     = build_optimization_node (&global_options, &global_options_set);
5166   tree new_optimize;
5167 
5168   /* If the function changed the optimization levels as well as
5169      setting target options, start with the optimizations
5170      specified.  */
5171   if (func_optimize != old_optimize)
5172     cl_optimization_restore (&global_options, &global_options_set,
5173 			     TREE_OPTIMIZATION (func_optimize));
5174 
5175   gcn_option_override ();
5176 
5177   /* The target attributes may also change some optimization flags,
5178      so update the optimization options if necessary.  */
5179   new_optimize = build_optimization_node (&global_options,
5180 					  &global_options_set);
5181 
5182   if (old_optimize != new_optimize)
5183     {
5184       DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5185       cl_optimization_restore (&global_options, &global_options_set,
5186 			       TREE_OPTIMIZATION (old_optimize));
5187     }
5188 }
5189 
5190 /* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook.  */
5191 
5192 static void
5193 gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
5194 		       unsigned HOST_WIDE_INT *hi,
5195 		       int ARG_UNUSED (dims[GOMP_DIM_MAX]),
5196 		       unsigned HOST_WIDE_INT
5197 			 ARG_UNUSED (private_size[GOMP_DIM_MAX]),
5198 		       unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
5199 {
5200   *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5201   /* !!! We can maybe use dims[] to estimate the maximum number of work
5202      groups/wavefronts/etc. we will launch, and therefore tune the maximum
5203      amount of LDS we should use.  For now, use a minimal amount to try to
5204      maximise occupancy.  */
5205   *hi = acc_lds_size;
5206   machine_function *machfun = cfun->machine;
5207   machfun->reduction_base = gang_private_size_opt;
5208   machfun->reduction_limit
5209     = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5210 }
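/* A sketch of the resulting LDS layout under OpenACC, as implied by the
   assignments above (the purpose of the [*lo, *hi) range is an
   assumption about the hook's contract, not a documented guarantee):

     [0, gang_private_size_opt)         gang-private variables
     [reduction_base, reduction_limit)  per-worker reduction temporaries
     [*lo, *hi)                         left for worker-state broadcasting

   where reduction_limit == *lo and *hi == acc_lds_size.  */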
5211 
5212 /* }}}  */
5213 /* {{{ ASM Output.  */
5214 
5215 /*  Implement TARGET_ASM_FILE_START.
5216 
5217     Print assembler file header text.  */
5218 
5219 static void
5220 output_file_start (void)
5221 {
5222   const char *cpu;
5223   bool use_xnack_attr = true;
5224   bool use_sram_attr = true;
5225   switch (gcn_arch)
5226     {
5227     case PROCESSOR_FIJI:
5228       cpu = "gfx803";
5229 #ifndef HAVE_GCN_XNACK_FIJI
5230       use_xnack_attr = false;
5231 #endif
5232       use_sram_attr = false;
5233       break;
5234     case PROCESSOR_VEGA10:
5235       cpu = "gfx900";
5236 #ifndef HAVE_GCN_XNACK_GFX900
5237       use_xnack_attr = false;
5238 #endif
5239       use_sram_attr = false;
5240       break;
5241     case PROCESSOR_VEGA20:
5242       cpu = "gfx906";
5243 #ifndef HAVE_GCN_XNACK_GFX906
5244       use_xnack_attr = false;
5245 #endif
5246       use_sram_attr = false;
5247       break;
5248     case PROCESSOR_GFX908:
5249       cpu = "gfx908";
5250 #ifndef HAVE_GCN_XNACK_GFX908
5251       use_xnack_attr = false;
5252 #endif
5253 #ifndef HAVE_GCN_SRAM_ECC_GFX908
5254       use_sram_attr = false;
5255 #endif
5256       break;
5257     default: gcc_unreachable ();
5258     }
5259 
5260 #if HAVE_GCN_ASM_V3_SYNTAX
5261   const char *xnack = (flag_xnack ? "+xnack" : "");
5262   const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : "");
5263 #endif
5264 #if HAVE_GCN_ASM_V4_SYNTAX
5265   /* In HSACOv4 no attribute setting means the binary supports "any" hardware
5266      configuration.  In GCC binaries, this is true for SRAM ECC, but not
5267      XNACK.  */
5268   const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-");
5269   const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+"
5270 			  : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-"
5271 			  : "");
5272 #endif
5273   if (!use_xnack_attr)
5274     xnack = "";
5275   if (!use_sram_attr)
5276     sram_ecc = "";
5277 
5278   fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
5279 	  cpu,
5280 #if HAVE_GCN_ASM_V3_SYNTAX
5281 	  xnack, sram_ecc
5282 #endif
5283 #ifdef HAVE_GCN_ASM_V4_SYNTAX
5284 	  sram_ecc, xnack
5285 #endif
5286 	  );
5287 }
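/* For illustration (derived from the strings chosen above, not verified
   against any particular assembler): gfx908 with SRAM ECC enabled and
   XNACK disabled under the HSACOv4 syntax yields

       .amdgcn_target "amdgcn-unknown-amdhsa--gfx908:sramecc+:xnack-"

   whereas the V3 syntax places XNACK first and spells the flags
   "+xnack" and "+sram-ecc".  */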
5288 
5289 /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
5290 
5291    Print the initial definition of a function name.
5292 
5293    For GCN kernel entry points this includes all the HSA meta-data, special
5294    alignment constraints that don't apply to regular functions, and magic
5295    comments that pass information to mkoffload.  */
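/* Sketch of the output emitted below for a kernel entry point (assembled
   from the fputs/fprintf calls that follow; values are placeholders):

       .rodata
       .p2align  6
       .amdhsa_kernel <name>
         .amdhsa_*  directives (argument registers, VGPR/SGPR counts,
                                private/group segment sizes, ...)
       .end_amdhsa_kernel
               .amdgpu_metadata
               ... YAML kernel descriptor (spaces only, no tabs) ...
               .end_amdgpu_metadata
       .text
       .align  256
       .type   <name>,@function
       <name>:
       ;; OPENACC-DIMS: g, w, v : <name>   (OpenACC only; read by mkoffload)  */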
5296 
5297 void
5298 gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
5299 {
5300   int sgpr, vgpr;
5301   bool xnack_enabled = false;
5302 
5303   fputs ("\n\n", file);
5304 
5305   if (cfun && cfun->machine && cfun->machine->normal_function)
5306     {
5307       fputs ("\t.type\t", file);
5308       assemble_name (file, name);
5309       fputs (",@function\n", file);
5310       assemble_name (file, name);
5311       fputs (":\n", file);
5312       return;
5313     }
5314 
5315   /* Determine the count of SGPR/VGPR registers by looking for the
5316      last one used.  */
5317   for (sgpr = 101; sgpr >= 0; sgpr--)
5318     if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
5319       break;
5320   sgpr++;
5321   for (vgpr = 255; vgpr >= 0; vgpr--)
5322     if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
5323       break;
5324   vgpr++;
5325 
5326   if (!leaf_function_p ())
5327     {
5328       /* We can't know how many registers function calls might use.  */
5329       if (vgpr < MAX_NORMAL_VGPR_COUNT)
5330 	vgpr = MAX_NORMAL_VGPR_COUNT;
5331       if (sgpr < MAX_NORMAL_SGPR_COUNT)
5332 	sgpr = MAX_NORMAL_SGPR_COUNT;
5333     }
5334 
5335   fputs ("\t.rodata\n"
5336 	 "\t.p2align\t6\n"
5337 	 "\t.amdhsa_kernel\t", file);
5338   assemble_name (file, name);
5339   fputs ("\n", file);
5340   int reg = FIRST_SGPR_REG;
5341   for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
5342     {
5343       int reg_first = -1;
5344       int reg_last;
5345       if ((cfun->machine->args.requested & (1 << a))
5346 	  && (gcn_kernel_arg_types[a].fixed_regno < 0))
5347 	{
5348 	  reg_first = reg;
5349 	  reg_last = (reg_first
5350 		      + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
5351 			 / UNITS_PER_WORD) - 1);
5352 	  reg = reg_last + 1;
5353 	}
5354 
5355       if (gcn_kernel_arg_types[a].header_pseudo)
5356 	{
5357 	  fprintf (file, "\t  %s%s\t%i",
5358 		   (cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
5359 		   gcn_kernel_arg_types[a].header_pseudo,
5360 		   (cfun->machine->args.requested & (1 << a)) != 0);
5361 	  if (reg_first != -1)
5362 	    {
5363 	      fprintf (file, " ; (");
5364 	      for (int i = reg_first; i <= reg_last; ++i)
5365 		{
5366 		  if (i != reg_first)
5367 		    fprintf (file, ", ");
5368 		  fprintf (file, "%s", reg_names[i]);
5369 		}
5370 	      fprintf (file, ")");
5371 	    }
5372 	  fprintf (file, "\n");
5373 	}
5374       else if (gcn_kernel_arg_types[a].fixed_regno >= 0
5375 	       && cfun->machine->args.requested & (1 << a))
5376 	fprintf (file, "\t  ; %s\t%i (%s)\n",
5377 		 gcn_kernel_arg_types[a].name,
5378 		 (cfun->machine->args.requested & (1 << a)) != 0,
5379 		 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
5380     }
5381   fprintf (file, "\t  .amdhsa_system_vgpr_workitem_id\t%i\n",
5382 	   (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
5383 	   ? 2
5384 	   : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
5385 	   ? 1 : 0);
5386   fprintf (file,
5387 	   "\t  .amdhsa_next_free_vgpr\t%i\n"
5388 	   "\t  .amdhsa_next_free_sgpr\t%i\n"
5389 	   "\t  .amdhsa_reserve_vcc\t1\n"
5390 	   "\t  .amdhsa_reserve_flat_scratch\t0\n"
5391 	   "\t  .amdhsa_reserve_xnack_mask\t%i\n"
5392 	   "\t  .amdhsa_private_segment_fixed_size\t%i\n"
5393 	   "\t  .amdhsa_group_segment_fixed_size\t%u\n"
5394 	   "\t  .amdhsa_float_denorm_mode_32\t3\n"
5395 	   "\t  .amdhsa_float_denorm_mode_16_64\t3\n",
5396 	   vgpr,
5397 	   sgpr,
5398 	   xnack_enabled,
5399 	   /* workitem_private_segment_bytes_size needs to be
5400 	      one 64th the wave-front stack size.  */
5401 	   stack_size_opt / 64,
5402 	   LDS_SIZE);
5403   fputs ("\t.end_amdhsa_kernel\n", file);
5404 
5405 #if 1
5406   /* The following is YAML embedded in assembler; tabs are not allowed.  */
5407   fputs ("        .amdgpu_metadata\n"
5408 	 "        amdhsa.version:\n"
5409 	 "          - 1\n"
5410 	 "          - 0\n"
5411 	 "        amdhsa.kernels:\n"
5412 	 "          - .name: ", file);
5413   assemble_name (file, name);
5414   fputs ("\n            .symbol: ", file);
5415   assemble_name (file, name);
5416   fprintf (file,
5417 	   ".kd\n"
5418 	   "            .kernarg_segment_size: %i\n"
5419 	   "            .kernarg_segment_align: %i\n"
5420 	   "            .group_segment_fixed_size: %u\n"
5421 	   "            .private_segment_fixed_size: %i\n"
5422 	   "            .wavefront_size: 64\n"
5423 	   "            .sgpr_count: %i\n"
5424 	   "            .vgpr_count: %i\n"
5425 	   "            .max_flat_workgroup_size: 1024\n",
5426 	   cfun->machine->kernarg_segment_byte_size,
5427 	   cfun->machine->kernarg_segment_alignment,
5428 	   LDS_SIZE,
5429 	   stack_size_opt / 64,
5430 	   sgpr, vgpr);
5431   fputs ("        .end_amdgpu_metadata\n", file);
5432 #endif
5433 
5434   fputs ("\t.text\n", file);
5435   fputs ("\t.align\t256\n", file);
5436   fputs ("\t.type\t", file);
5437   assemble_name (file, name);
5438   fputs (",@function\n", file);
5439   assemble_name (file, name);
5440   fputs (":\n", file);
5441 
5442   /* This comment is read by mkoffload.  */
5443   if (flag_openacc)
5444     fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5445 	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5446 	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5447 	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5448 }
5449 
5450 /* Implement TARGET_ASM_SELECT_SECTION.
5451 
5452    Return the section into which EXP should be placed.  */
5453 
5454 static section *
5455 gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5456 {
5457   if (TREE_TYPE (exp) != error_mark_node
5458       && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5459     {
5460       if (!DECL_P (exp))
5461 	return get_section (".lds_bss",
5462 			    SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5463 			    NULL);
5464 
5465       return get_named_section (exp, ".lds_bss", reloc);
5466     }
5467 
5468   return default_elf_select_section (exp, reloc, align);
5469 }
5470 
5471 /* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5472 
5473    Emits custom text into the assembler file at the head of each function.  */
5474 
5475 static void
5476 gcn_target_asm_function_prologue (FILE *file)
5477 {
5478   machine_function *offsets = gcn_compute_frame_offsets ();
5479 
5480   asm_fprintf (file, "\t; using %s addressing in function\n",
5481 	       offsets->use_flat_addressing ? "flat" : "global");
5482 
5483   if (offsets->normal_function)
5484     {
5485       asm_fprintf (file, "\t; frame pointer needed: %s\n",
5486 		   offsets->need_frame_pointer ? "true" : "false");
5487       asm_fprintf (file, "\t; lr needs saving: %s\n",
5488 		   offsets->lr_needs_saving ? "true" : "false");
5489       asm_fprintf (file, "\t; outgoing args size: %wd\n",
5490 		   offsets->outgoing_args_size);
5491       asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5492       asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5493       asm_fprintf (file, "\t; callee save size: %wd\n",
5494 		   offsets->callee_saves);
5495     }
5496   else
5497     {
5498       asm_fprintf (file, "\t; HSA kernel entry point\n");
5499       asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5500       asm_fprintf (file, "\t; outgoing args size: %wd\n",
5501 		   offsets->outgoing_args_size);
5502     }
5503 }
5504 
5505 /* Helper function for print_operand and print_operand_address.
5506 
5507    Print a register as the assembler requires, according to mode and name.  */
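/* For instance (illustrative values, derived from the cases below): an
   SImode SGPR prints as "s3", a DImode SGPR pair as "s[4:5]", a DImode
   VGPR pair as "v[8:9]", a TImode VGPR group as "v[8:11]", and the
   special registers print as "exec", "vcc" or "flat_scratch".  */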
5508 
5509 static void
5510 print_reg (FILE *file, rtx x)
5511 {
5512   machine_mode mode = GET_MODE (x);
5513   if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5514       || mode == HFmode || mode == SFmode
5515       || mode == V64SFmode || mode == V64SImode
5516       || mode == V64QImode || mode == V64HImode)
5517     fprintf (file, "%s", reg_names[REGNO (x)]);
5518   else if (mode == DImode || mode == V64DImode
5519 	   || mode == DFmode || mode == V64DFmode)
5520     {
5521       if (SGPR_REGNO_P (REGNO (x)))
5522 	fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5523 		 REGNO (x) - FIRST_SGPR_REG + 1);
5524       else if (VGPR_REGNO_P (REGNO (x)))
5525 	fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5526 		 REGNO (x) - FIRST_VGPR_REG + 1);
5527       else if (REGNO (x) == FLAT_SCRATCH_REG)
5528 	fprintf (file, "flat_scratch");
5529       else if (REGNO (x) == EXEC_REG)
5530 	fprintf (file, "exec");
5531       else if (REGNO (x) == VCC_LO_REG)
5532 	fprintf (file, "vcc");
5533       else
5534 	fprintf (file, "[%s:%s]",
5535 		 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5536     }
5537   else if (mode == TImode)
5538     {
5539       if (SGPR_REGNO_P (REGNO (x)))
5540 	fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5541 		 REGNO (x) - FIRST_SGPR_REG + 3);
5542       else if (VGPR_REGNO_P (REGNO (x)))
5543 	fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5544 		 REGNO (x) - FIRST_VGPR_REG + 3);
5545       else
5546 	gcc_unreachable ();
5547     }
5548   else
5549     gcc_unreachable ();
5550 }
5551 
5552 /* Implement TARGET_SECTION_TYPE_FLAGS.
5553 
5554    Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION.  */
5555 
5556 static unsigned int
5557 gcn_section_type_flags (tree decl, const char *name, int reloc)
5558 {
5559   if (strcmp (name, ".lds_bss") == 0)
5560     return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5561 
5562   return default_section_type_flags (decl, name, reloc);
5563 }
5564 
5565 /* Helper function for gcn_asm_output_symbol_ref.
5566 
5567    FIXME: This function is used to lay out gang-private variables in LDS
5568    on a per-CU basis.
5569    There may be cases in which gang-private variables in different compilation
5570    units could clobber each other.  In that case we should be relying on the
5571    linker to lay out gang-private LDS space, but that doesn't appear to be
5572    possible at present.  */
5573 
5574 static void
5575 gcn_print_lds_decl (FILE *f, tree var)
5576 {
5577   int *offset;
5578   if ((offset = lds_allocs.get (var)))
5579     fprintf (f, "%u", (unsigned) *offset);
5580   else
5581     {
5582       unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5583       tree type = TREE_TYPE (var);
5584       unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5585       if (size > align && size > 4 && align < 8)
5586 	align = 8;
5587 
5588       gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
5589 
5590       lds_allocs.put (var, gang_private_hwm);
5591       fprintf (f, "%u", gang_private_hwm);
5592       gang_private_hwm += size;
5593       if (gang_private_hwm > gang_private_size_opt)
5594 	error ("%d bytes of gang-private data-share memory exhausted"
5595 	       " (increase with %<-mgang-private-size=%d%>, for example)",
5596 	       gang_private_size_opt, gang_private_hwm);
5597     }
5598 }
5599 
5600 /* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h.  */
5601 
5602 void
5603 gcn_asm_output_symbol_ref (FILE *file, rtx x)
5604 {
5605   tree decl;
5606   if (cfun
5607       && (decl = SYMBOL_REF_DECL (x)) != 0
5608       && TREE_CODE (decl) == VAR_DECL
5609       && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5610     {
5611       /* LDS symbols (emitted using this hook) are only used at present
5612          to propagate worker values from an active thread to neutered
5613          threads.  Use the same offset for each such block, but don't
5614          use zero because null pointers are used to identify the active
5615          thread in GOACC_single_copy_start calls.  */
5616       gcn_print_lds_decl (file, decl);
5617     }
5618   else
5619     {
5620       assemble_name (file, XSTR (x, 0));
5621       /* FIXME: See above -- this condition is unreachable.  */
5622       if (cfun
5623 	  && (decl = SYMBOL_REF_DECL (x)) != 0
5624 	  && TREE_CODE (decl) == VAR_DECL
5625 	  && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5626 	fputs ("@abs32", file);
5627     }
5628 }
5629 
5630 /* Implement TARGET_CONSTANT_ALIGNMENT.
5631 
5632    Returns the alignment in bits of a constant that is being placed in memory.
5633    CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5634    would ordinarily have.  */
5635 
5636 static HOST_WIDE_INT
5637 gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5638 			HOST_WIDE_INT basic_align)
5639 {
5640   return basic_align > 128 ? basic_align : 128;
5641 }
5642 
5643 /* Implement PRINT_OPERAND_ADDRESS via gcn.h.  */
5644 
5645 void
5646 print_operand_address (FILE *file, rtx mem)
5647 {
5648   gcc_assert (MEM_P (mem));
5649 
5650   rtx reg;
5651   rtx offset;
5652   addr_space_t as = MEM_ADDR_SPACE (mem);
5653   rtx addr = XEXP (mem, 0);
5654   gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5655 
5656   if (AS_SCRATCH_P (as))
5657     switch (GET_CODE (addr))
5658       {
5659       case REG:
5660 	print_reg (file, addr);
5661 	break;
5662 
5663       case PLUS:
5664 	reg = XEXP (addr, 0);
5665 	offset = XEXP (addr, 1);
5666 	print_reg (file, reg);
5667 	if (GET_CODE (offset) == CONST_INT)
5668 	  fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5669 	else
5670 	  abort ();
5671 	break;
5672 
5673       default:
5674 	debug_rtx (addr);
5675 	abort ();
5676       }
5677   else if (AS_ANY_FLAT_P (as))
5678     {
5679       if (GET_CODE (addr) == REG)
5680 	print_reg (file, addr);
5681       else
5682 	{
5683 	  gcc_assert (TARGET_GCN5_PLUS);
5684 	  print_reg (file, XEXP (addr, 0));
5685 	}
5686     }
5687   else if (AS_GLOBAL_P (as))
5688     {
5689       gcc_assert (TARGET_GCN5_PLUS);
5690 
5691       rtx base = addr;
5692       rtx vgpr_offset = NULL_RTX;
5693 
5694       if (GET_CODE (addr) == PLUS)
5695 	{
5696 	  base = XEXP (addr, 0);
5697 
5698 	  if (GET_CODE (base) == PLUS)
5699 	    {
5700 	      /* (SGPR + VGPR) + CONST  */
5701 	      vgpr_offset = XEXP (base, 1);
5702 	      base = XEXP (base, 0);
5703 	    }
5704 	  else
5705 	    {
5706 	      rtx offset = XEXP (addr, 1);
5707 
5708 	      if (REG_P (offset))
5709 		/* SGPR + VGPR  */
5710 		vgpr_offset = offset;
5711 	      else if (CONST_INT_P (offset))
5712 		/* VGPR + CONST or SGPR + CONST  */
5713 		;
5714 	      else
5715 		output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5716 	    }
5717 	}
5718 
5719       if (REG_P (base))
5720 	{
5721 	  if (VGPR_REGNO_P (REGNO (base)))
5722 	    print_reg (file, base);
5723 	  else if (SGPR_REGNO_P (REGNO (base)))
5724 	    {
5725 	      /* The assembler requires a 64-bit VGPR pair here, even though
5726 	         the offset should be only 32-bit.  */
5727 	      if (vgpr_offset == NULL_RTX)
5728 		/* In this case, the vector offset is zero, so we use the first
5729 		   lane of v1, which is initialized to zero.  */
5730 		{
5731 		  if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
5732 		    fprintf (file, "v1");
5733 		  else
5734 		    fprintf (file, "v[1:2]");
5735 		}
5736 	      else if (REG_P (vgpr_offset)
5737 		       && VGPR_REGNO_P (REGNO (vgpr_offset)))
5738 		{
5739 		  if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
5740 		    fprintf (file, "v%d",
5741 			     REGNO (vgpr_offset) - FIRST_VGPR_REG);
5742 		  else
5743 		    fprintf (file, "v[%d:%d]",
5744 			     REGNO (vgpr_offset) - FIRST_VGPR_REG,
5745 			     REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5746 		}
5747 	      else
5748 		output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5749 	    }
5750 	}
5751       else
5752 	output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5753     }
5754   else if (AS_ANY_DS_P (as))
5755     switch (GET_CODE (addr))
5756       {
5757       case REG:
5758 	print_reg (file, addr);
5759 	break;
5760 
5761       case PLUS:
5762 	reg = XEXP (addr, 0);
5763 	print_reg (file, reg);
5764 	break;
5765 
5766       default:
5767 	debug_rtx (addr);
5768 	abort ();
5769       }
5770   else
5771     switch (GET_CODE (addr))
5772       {
5773       case REG:
5774 	print_reg (file, addr);
5775 	fprintf (file, ", 0");
5776 	break;
5777 
5778       case PLUS:
5779 	reg = XEXP (addr, 0);
5780 	offset = XEXP (addr, 1);
5781 	print_reg (file, reg);
5782 	fprintf (file, ", ");
5783 	if (GET_CODE (offset) == REG)
5784 	  print_reg (file, reg);
5785 	else if (GET_CODE (offset) == CONST_INT)
5786 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5787 	else
5788 	  abort ();
5789 	break;
5790 
5791       default:
5792 	debug_rtx (addr);
5793 	abort ();
5794       }
5795 }
5796 
5797 /* Implement PRINT_OPERAND via gcn.h.
5798 
5799    b - print operand size as untyped operand (b8/b16/b32/b64)
5800    B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5801    i - print operand size as typed operand (i8/i16/i32/i64, or f16/f32/f64)
5802    I - print operand size as SI/DI typed operand (i32/i64, or f16/f32/f64)
5803    u - print operand size as typed operand (u8/u16/u32/u64, or f16/f32/f64)
5804    U - print operand size as SI/DI typed operand (u32/u64, or f16/f32/f64)
5805    o - print operand size as memory access size for loads
5806        (ubyte/ushort/dword/dwordx2/dwordx3/dwordx4)
5807    s - print operand size as memory access size for stores
5808        (byte/short/dword/dwordx2/dwordx3/dwordx4)
5809    C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5810    c - print inverse conditional code for s_cbranch
5811    D - print conditional code for s_cmp (eq_u64/lg_u64...)
5812    E - print conditional code for v_cmp (eq_u64/ne_u64...)
5813    A - print address in formatting suitable for given address space.
5814    O - print offset:n for data share operations.
5815    ^ - print "_co" suffix for GCN5 mnemonics
5816    g - print "glc", if appropriate for given MEM
5817  */
5818 
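/* Codes handled below but not listed above (summarised from the code):

   e - wrap the operand in "sext(...)"
   L - print the low part of a multi-register value
   H - print the high part of a multi-register value
   R - print a scalar register number as a plain integer (temporary hack)
   V - print a vector register number, relative to v0, as a plain integer
       (temporary hack)  */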
5819 void
5820 print_operand (FILE *file, rtx x, int code)
5821 {
5822   int xcode = x ? GET_CODE (x) : 0;
5823   bool invert = false;
5824   switch (code)
5825     {
5826       /* Instructions have the following suffixes.
5827          If there are two suffixes, the first is the destination type,
5828 	 and the second is the source type.
5829 
5830          B32 Bitfield (untyped data) 32-bit
5831          B64 Bitfield (untyped data) 64-bit
5832          F16 floating-point 16-bit
5833          F32 floating-point 32-bit (IEEE 754 single-precision float)
5834          F64 floating-point 64-bit (IEEE 754 double-precision float)
5835          I16 signed 16-bit integer
5836          I32 signed 32-bit integer
5837          I64 signed 64-bit integer
5838          U16 unsigned 16-bit integer
5839          U32 unsigned 32-bit integer
5840          U64 unsigned 64-bit integer  */
5841 
5842       /* Print operand size as untyped suffix.  */
5843     case 'b':
5844       {
5845 	const char *s = "";
5846 	machine_mode mode = GET_MODE (x);
5847 	if (VECTOR_MODE_P (mode))
5848 	  mode = GET_MODE_INNER (mode);
5849 	switch (GET_MODE_SIZE (mode))
5850 	  {
5851 	  case 1:
5852 	    s = "_b8";
5853 	    break;
5854 	  case 2:
5855 	    s = "_b16";
5856 	    break;
5857 	  case 4:
5858 	    s = "_b32";
5859 	    break;
5860 	  case 8:
5861 	    s = "_b64";
5862 	    break;
5863 	  default:
5864 	    output_operand_lossage ("invalid operand %%xn code");
5865 	    return;
5866 	  }
5867 	fputs (s, file);
5868       }
5869       return;
5870     case 'B':
5871       {
5872 	const char *s = "";
5873 	machine_mode mode = GET_MODE (x);
5874 	if (VECTOR_MODE_P (mode))
5875 	  mode = GET_MODE_INNER (mode);
5876 	switch (GET_MODE_SIZE (mode))
5877 	  {
5878 	  case 1:
5879 	  case 2:
5880 	  case 4:
5881 	    s = "_b32";
5882 	    break;
5883 	  case 8:
5884 	    s = "_b64";
5885 	    break;
5886 	  default:
5887 	    output_operand_lossage ("invalid operand %%xn code");
5888 	    return;
5889 	  }
5890 	fputs (s, file);
5891       }
5892       return;
5893     case 'e':
5894       fputs ("sext(", file);
5895       print_operand (file, x, 0);
5896       fputs (")", file);
5897       return;
5898     case 'i':
5899     case 'I':
5900     case 'u':
5901     case 'U':
5902       {
5903 	bool signed_p = code == 'i';
5904 	bool min32_p = code == 'I' || code == 'U';
5905 	const char *s = "";
5906 	machine_mode mode = GET_MODE (x);
5907 	if (VECTOR_MODE_P (mode))
5908 	  mode = GET_MODE_INNER (mode);
5909 	if (mode == VOIDmode)
5910 	  switch (GET_CODE (x))
5911 	    {
5912 	    case CONST_INT:
5913 	      s = signed_p ? "_i32" : "_u32";
5914 	      break;
5915 	    case CONST_DOUBLE:
5916 	      s = "_f64";
5917 	      break;
5918 	    default:
5919 	      output_operand_lossage ("invalid operand %%xn code");
5920 	      return;
5921 	    }
5922 	else if (FLOAT_MODE_P (mode))
5923 	  switch (GET_MODE_SIZE (mode))
5924 	    {
5925 	    case 2:
5926 	      s = "_f16";
5927 	      break;
5928 	    case 4:
5929 	      s = "_f32";
5930 	      break;
5931 	    case 8:
5932 	      s = "_f64";
5933 	      break;
5934 	    default:
5935 	      output_operand_lossage ("invalid operand %%xn code");
5936 	      return;
5937 	    }
5938 	else if (min32_p)
5939 	  switch (GET_MODE_SIZE (mode))
5940 	    {
5941 	    case 1:
5942 	    case 2:
5943 	    case 4:
5944 	      s = signed_p ? "_i32" : "_u32";
5945 	      break;
5946 	    case 8:
5947 	      s = signed_p ? "_i64" : "_u64";
5948 	      break;
5949 	    default:
5950 	      output_operand_lossage ("invalid operand %%xn code");
5951 	      return;
5952 	    }
5953 	else
5954 	  switch (GET_MODE_SIZE (mode))
5955 	    {
5956 	    case 1:
5957 	      s = signed_p ? "_i8" : "_u8";
5958 	      break;
5959 	    case 2:
5960 	      s = signed_p ? "_i16" : "_u16";
5961 	      break;
5962 	    case 4:
5963 	      s = signed_p ? "_i32" : "_u32";
5964 	      break;
5965 	    case 8:
5966 	      s = signed_p ? "_i64" : "_u64";
5967 	      break;
5968 	    default:
5969 	      output_operand_lossage ("invalid operand %%xn code");
5970 	      return;
5971 	    }
5972 	fputs (s, file);
5973       }
5974       return;
5975       /* Print operand size as untyped suffix.  */
5976     case 'o':
5977       {
5978 	const char *s = 0;
5979 	switch (GET_MODE_SIZE (GET_MODE (x)))
5980 	  {
5981 	  case 1:
5982 	    s = "_ubyte";
5983 	    break;
5984 	  case 2:
5985 	    s = "_ushort";
5986 	    break;
5987 	  /* The following are full-vector variants.  */
5988 	  case 64:
5989 	    s = "_ubyte";
5990 	    break;
5991 	  case 128:
5992 	    s = "_ushort";
5993 	    break;
5994 	  }
5995 
5996 	if (s)
5997 	  {
5998 	    fputs (s, file);
5999 	    return;
6000 	  }
6001 
6002 	/* Fall-through - the other cases for 'o' are the same as for 's'.  */
6003 	gcc_fallthrough();
6004       }
6005     case 's':
6006       {
6007 	const char *s = "";
6008 	switch (GET_MODE_SIZE (GET_MODE (x)))
6009 	  {
6010 	  case 1:
6011 	    s = "_byte";
6012 	    break;
6013 	  case 2:
6014 	    s = "_short";
6015 	    break;
6016 	  case 4:
6017 	    s = "_dword";
6018 	    break;
6019 	  case 8:
6020 	    s = "_dwordx2";
6021 	    break;
6022 	  case 12:
6023 	    s = "_dwordx3";
6024 	    break;
6025 	  case 16:
6026 	    s = "_dwordx4";
6027 	    break;
6028 	  case 32:
6029 	    s = "_dwordx8";
6030 	    break;
6031 	  case 64:
6032 	    s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
6033 	    break;
6034 	  /* The following are full-vector variants.  */
6035 	  case 128:
6036 	    s = "_short";
6037 	    break;
6038 	  case 256:
6039 	    s = "_dword";
6040 	    break;
6041 	  case 512:
6042 	    s = "_dwordx2";
6043 	    break;
6044 	  default:
6045 	    output_operand_lossage ("invalid operand %%xn code");
6046 	    return;
6047 	  }
6048 	fputs (s, file);
6049       }
6050       return;
6051     case 'A':
6052       if (xcode != MEM)
6053 	{
6054 	  output_operand_lossage ("invalid %%xn code");
6055 	  return;
6056 	}
6057       print_operand_address (file, x);
6058       return;
6059     case 'O':
6060       {
6061 	if (xcode != MEM)
6062 	  {
6063 	    output_operand_lossage ("invalid %%xn code");
6064 	    return;
6065 	  }
6066 	if (AS_GDS_P (MEM_ADDR_SPACE (x)))
6067 	  fprintf (file, " gds");
6068 
6069 	rtx x0 = XEXP (x, 0);
6070 	if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
6071 	  {
6072 	    gcc_assert (TARGET_GCN5_PLUS);
6073 
6074 	    fprintf (file, ", ");
6075 
6076 	    rtx base = x0;
6077 	    rtx const_offset = NULL_RTX;
6078 
6079 	    if (GET_CODE (base) == PLUS)
6080 	      {
6081 		rtx offset = XEXP (x0, 1);
6082 		base = XEXP (x0, 0);
6083 
6084 		if (GET_CODE (base) == PLUS)
6085 		  /* (SGPR + VGPR) + CONST  */
6086 		  /* Ignore the VGPR offset for this operand.  */
6087 		  base = XEXP (base, 0);
6088 
6089 		if (CONST_INT_P (offset))
6090 		  const_offset = XEXP (x0, 1);
6091 		else if (REG_P (offset))
6092 		  /* SGPR + VGPR  */
6093 		  /* Ignore the VGPR offset for this operand.  */
6094 		  ;
6095 		else
6096 		  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6097 	      }
6098 
6099 	    if (REG_P (base))
6100 	      {
6101 		if (VGPR_REGNO_P (REGNO (base)))
6102 		  /* The VGPR address is specified in the %A operand.  */
6103 		  fprintf (file, "off");
6104 		else if (SGPR_REGNO_P (REGNO (base)))
6105 		  print_reg (file, base);
6106 		else
6107 		  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6108 	      }
6109 	    else
6110 	      output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6111 
6112 	    if (const_offset != NULL_RTX)
6113 	      fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
6114 		       INTVAL (const_offset));
6115 
6116 	    return;
6117 	  }
6118 
6119 	if (GET_CODE (x0) == REG)
6120 	  return;
6121 	if (GET_CODE (x0) != PLUS)
6122 	  {
6123 	    output_operand_lossage ("invalid %%xn code");
6124 	    return;
6125 	  }
6126 	rtx val = XEXP (x0, 1);
6127 	if (GET_CODE (val) == CONST_VECTOR)
6128 	  val = CONST_VECTOR_ELT (val, 0);
6129 	if (GET_CODE (val) != CONST_INT)
6130 	  {
6131 	    output_operand_lossage ("invalid %%xn code");
6132 	    return;
6133 	  }
6134 	fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
6135 
6136       }
6137       return;
6138     case 'c':
6139       invert = true;
6140       /* Fall through.  */
6141     case 'C':
6142       {
6143 	const char *s;
6144 	bool num = false;
6145 	if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
6146 	  {
6147 	    output_operand_lossage ("invalid %%xn code");
6148 	    return;
6149 	  }
6150 	switch (REGNO (XEXP (x, 0)))
6151 	  {
6152 	  case VCC_REG:
6153 	  case VCCZ_REG:
6154 	    s = "_vcc";
6155 	    break;
6156 	  case SCC_REG:
6157 	    /* For some reason llvm-mc insists on scc0 instead of sccz.  */
6158 	    num = true;
6159 	    s = "_scc";
6160 	    break;
6161 	  case EXECZ_REG:
6162 	    s = "_exec";
6163 	    break;
6164 	  default:
6165 	    output_operand_lossage ("invalid %%xn code");
6166 	    return;
6167 	  }
6168 	fputs (s, file);
6169 	if (xcode == (invert ? NE : EQ))
6170 	  fputc (num ? '0' : 'z', file);
6171 	else
6172 	  fputs (num ? "1" : "nz", file);
6173 	return;
6174       }
6175     case 'D':
6176       {
6177 	const char *s;
6178 	bool cmp_signed = false;
6179 	switch (xcode)
6180 	  {
6181 	  case EQ:
6182 	    s = "_eq_";
6183 	    break;
6184 	  case NE:
6185 	    s = "_lg_";
6186 	    break;
6187 	  case LT:
6188 	    s = "_lt_";
6189 	    cmp_signed = true;
6190 	    break;
6191 	  case LE:
6192 	    s = "_le_";
6193 	    cmp_signed = true;
6194 	    break;
6195 	  case GT:
6196 	    s = "_gt_";
6197 	    cmp_signed = true;
6198 	    break;
6199 	  case GE:
6200 	    s = "_ge_";
6201 	    cmp_signed = true;
6202 	    break;
6203 	  case LTU:
6204 	    s = "_lt_";
6205 	    break;
6206 	  case LEU:
6207 	    s = "_le_";
6208 	    break;
6209 	  case GTU:
6210 	    s = "_gt_";
6211 	    break;
6212 	  case GEU:
6213 	    s = "_ge_";
6214 	    break;
6215 	  default:
6216 	    output_operand_lossage ("invalid %%xn code");
6217 	    return;
6218 	  }
6219 	fputs (s, file);
6220 	fputc (cmp_signed ? 'i' : 'u', file);
6221 
6222 	machine_mode mode = GET_MODE (XEXP (x, 0));
6223 
6224 	if (mode == VOIDmode)
6225 	  mode = GET_MODE (XEXP (x, 1));
6226 
6227 	/* If both sides are constants, then assume the instruction is in
6228 	   SImode since s_cmp can only do integer compares.  */
6229 	if (mode == VOIDmode)
6230 	  mode = SImode;
6231 
6232 	switch (GET_MODE_SIZE (mode))
6233 	  {
6234 	  case 4:
6235 	    s = "32";
6236 	    break;
6237 	  case 8:
6238 	    s = "64";
6239 	    break;
6240 	  default:
6241 	    output_operand_lossage ("invalid operand %%xn code");
6242 	    return;
6243 	  }
6244 	fputs (s, file);
6245 	return;
6246       }
6247     case 'E':
6248       {
6249 	const char *s;
6250 	bool cmp_signed = false;
6251 	machine_mode mode = GET_MODE (XEXP (x, 0));
6252 
6253 	if (mode == VOIDmode)
6254 	  mode = GET_MODE (XEXP (x, 1));
6255 
6256 	/* If both sides are constants, assume the instruction is in SFmode
6257 	   if either operand is floating point, otherwise assume SImode.  */
6258 	if (mode == VOIDmode)
6259 	  {
6260 	    if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
6261 		|| GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
6262 	      mode = SFmode;
6263 	    else
6264 	      mode = SImode;
6265 	  }
6266 
6267 	/* Use the same format code for vector comparisons.  */
6268 	if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
6269 	    || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
6270 	  mode = GET_MODE_INNER (mode);
6271 
6272 	bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
6273 
6274 	switch (xcode)
6275 	  {
6276 	  case EQ:
6277 	    s = "_eq_";
6278 	    break;
6279 	  case NE:
6280 	    s = float_p ? "_neq_" : "_ne_";
6281 	    break;
6282 	  case LT:
6283 	    s = "_lt_";
6284 	    cmp_signed = true;
6285 	    break;
6286 	  case LE:
6287 	    s = "_le_";
6288 	    cmp_signed = true;
6289 	    break;
6290 	  case GT:
6291 	    s = "_gt_";
6292 	    cmp_signed = true;
6293 	    break;
6294 	  case GE:
6295 	    s = "_ge_";
6296 	    cmp_signed = true;
6297 	    break;
6298 	  case LTU:
6299 	    s = "_lt_";
6300 	    break;
6301 	  case LEU:
6302 	    s = "_le_";
6303 	    break;
6304 	  case GTU:
6305 	    s = "_gt_";
6306 	    break;
6307 	  case GEU:
6308 	    s = "_ge_";
6309 	    break;
6310 	  case ORDERED:
6311 	    s = "_o_";
6312 	    break;
6313 	  case UNORDERED:
6314 	    s = "_u_";
6315 	    break;
6316 	  case UNEQ:
6317 	    s = "_nlg_";
6318 	    break;
6319 	  case UNGE:
6320 	    s = "_nlt_";
6321 	    break;
6322 	  case UNGT:
6323 	    s = "_nle_";
6324 	    break;
6325 	  case UNLE:
6326 	    s = "_ngt_";
6327 	    break;
6328 	  case UNLT:
6329 	    s = "_nge_";
6330 	    break;
6331 	  case LTGT:
6332 	    s = "_lg_";
6333 	    break;
6334 	  default:
6335 	    output_operand_lossage ("invalid %%xn code");
6336 	    return;
6337 	  }
6338 	fputs (s, file);
6339 	fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
6340 
6341 	switch (GET_MODE_SIZE (mode))
6342 	  {
6343 	  case 1:
6344 	    output_operand_lossage ("operand %%xn code invalid for QImode");
6345 	    return;
6346 	  case 2:
6347 	    s = "16";
6348 	    break;
6349 	  case 4:
6350 	    s = "32";
6351 	    break;
6352 	  case 8:
6353 	    s = "64";
6354 	    break;
6355 	  default:
6356 	    output_operand_lossage ("invalid operand %%xn code");
6357 	    return;
6358 	  }
6359 	fputs (s, file);
6360 	return;
6361       }
6362     case 'L':
6363       print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
6364       return;
6365     case 'H':
6366       print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
6367       return;
6368     case 'R':
6369       /* Print a scalar register number as an integer.  Temporary hack.  */
6370       gcc_assert (REG_P (x));
6371       fprintf (file, "%u", (int) REGNO (x));
6372       return;
6373     case 'V':
6374       /* Print a vector register number as an integer.  Temporary hack.  */
6375       gcc_assert (REG_P (x));
6376       fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
6377       return;
6378     case 0:
6379       if (xcode == REG)
6380 	print_reg (file, x);
6381       else if (xcode == MEM)
6382 	output_address (GET_MODE (x), x);
6383       else if (xcode == CONST_INT)
6384 	fprintf (file, "%i", (int) INTVAL (x));
6385       else if (xcode == CONST_VECTOR)
6386 	print_operand (file, CONST_VECTOR_ELT (x, 0), code);
6387       else if (xcode == CONST_DOUBLE)
6388 	{
6389 	  const char *str;
6390 	  switch (gcn_inline_fp_constant_p (x, false))
6391 	    {
6392 	    case 240:
6393 	      str = "0.5";
6394 	      break;
6395 	    case 241:
6396 	      str = "-0.5";
6397 	      break;
6398 	    case 242:
6399 	      str = "1.0";
6400 	      break;
6401 	    case 243:
6402 	      str = "-1.0";
6403 	      break;
6404 	    case 244:
6405 	      str = "2.0";
6406 	      break;
6407 	    case 245:
6408 	      str = "-2.0";
6409 	      break;
6410 	    case 246:
6411 	      str = "4.0";
6412 	      break;
6413 	    case 247:
6414 	      str = "-4.0";
6415 	      break;
6416 	    case 248:
6417 	      str = "1/pi";
6418 	      break;
6419 	    default:
6420 	      rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
6421 					    ? DImode : SImode,
6422 					    x, GET_MODE (x), 0);
6423 	      if (ix)
6424 		print_operand (file, ix, code);
6425 	      else
6426 		output_operand_lossage ("invalid fp constant");
6427 	      return;
6428 	      break;
6429 	    }
6430 	  fputs (str, file);
6431 	  return;
6432 	}
6433       else
6434 	output_addr_const (file, x);
6435       return;
6436     case '^':
6437       if (TARGET_GCN5_PLUS)
6438 	fputs ("_co", file);
6439       return;
6440     case 'g':
6441       gcc_assert (xcode == MEM);
6442       if (MEM_VOLATILE_P (x))
6443 	fputs (" glc", file);
6444       return;
6445     default:
6446       output_operand_lossage ("invalid %%xn code");
6447     }
6448   gcc_unreachable ();
6449 }
6450 
6451 /* Implement DBX_REGISTER_NUMBER macro.
6452 
6453    Return the DWARF register number that corresponds to the GCC internal
6454    REGNO.  */
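/* A sketch of the mapping implemented below: EXEC_LO -> 17, VCC_LO -> 768,
   SCC -> 128, the DWARF link register -> 16, s0..s63 -> 32..95, higher
   SGPRs start at 1088, v0..v255 -> 2560..2815; anything else gets an
   obviously out-of-range number.  */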
6455 
6456 unsigned int
6457 gcn_dwarf_register_number (unsigned int regno)
6458 {
6459   /* Registers defined in DWARF.  */
6460   if (regno == EXEC_LO_REG)
6461     return 17;
6462   /* We need to use a more complex DWARF expression for this
6463   else if (regno == EXEC_HI_REG)
6464     return 17; */
6465   else if (regno == VCC_LO_REG)
6466     return 768;
6467   /* We need to use a more complex DWARF expression for this
6468   else if (regno == VCC_HI_REG)
6469     return 768;  */
6470   else if (regno == SCC_REG)
6471     return 128;
6472   else if (regno == DWARF_LINK_REGISTER)
6473     return 16;
6474   else if (SGPR_REGNO_P (regno))
6475     {
6476       if (regno - FIRST_SGPR_REG < 64)
6477 	return (regno - FIRST_SGPR_REG + 32);
6478       else
6479 	return (regno - FIRST_SGPR_REG + 1024);
6480     }
6481   else if (VGPR_REGNO_P (regno))
6482     return (regno - FIRST_VGPR_REG + 2560);
6483 
6484   /* Otherwise, there's nothing sensible to do.  */
6485   return regno + 100000;
6486 }
6487 
6488 /* Implement TARGET_DWARF_REGISTER_SPAN.
6489 
6490    DImode and Vector DImode require additional registers.  */
6491 
6492 static rtx
6493 gcn_dwarf_register_span (rtx rtl)
6494 {
6495   machine_mode mode = GET_MODE (rtl);
6496 
6497   if (VECTOR_MODE_P (mode))
6498     mode = GET_MODE_INNER (mode);
6499 
6500   if (GET_MODE_SIZE (mode) != 8)
6501     return NULL_RTX;
6502 
6503   unsigned regno = REGNO (rtl);
6504 
6505   if (regno == DWARF_LINK_REGISTER)
6506     return NULL_RTX;
6507 
6508   rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
6509   XVECEXP (p, 0, 0) = gen_rtx_REG (SImode, regno);
6510   XVECEXP (p, 0, 1) = gen_rtx_REG (SImode, regno + 1);
6511 
6512   return p;
6513 }
6514 
6515 /* }}}  */
6516 /* {{{ TARGET hook overrides.  */
6517 
6518 #undef  TARGET_ADDR_SPACE_ADDRESS_MODE
6519 #define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
6520 #undef  TARGET_ADDR_SPACE_DEBUG
6521 #define TARGET_ADDR_SPACE_DEBUG gcn_addr_space_debug
6522 #undef  TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
6523 #define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
6524   gcn_addr_space_legitimate_address_p
6525 #undef  TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
6526 #define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
6527 #undef  TARGET_ADDR_SPACE_POINTER_MODE
6528 #define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
6529 #undef  TARGET_ADDR_SPACE_SUBSET_P
6530 #define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
6531 #undef  TARGET_ADDR_SPACE_CONVERT
6532 #define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
6533 #undef  TARGET_ARG_PARTIAL_BYTES
6534 #define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
6535 #undef  TARGET_ASM_ALIGNED_DI_OP
6536 #define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
6537 #undef  TARGET_ASM_FILE_START
6538 #define TARGET_ASM_FILE_START output_file_start
6539 #undef  TARGET_ASM_FUNCTION_PROLOGUE
6540 #define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6541 #undef  TARGET_ASM_SELECT_SECTION
6542 #define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6543 #undef  TARGET_ASM_TRAMPOLINE_TEMPLATE
6544 #define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6545 #undef  TARGET_ATTRIBUTE_TABLE
6546 #define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6547 #undef  TARGET_BUILTIN_DECL
6548 #define TARGET_BUILTIN_DECL gcn_builtin_decl
6549 #undef  TARGET_CAN_CHANGE_MODE_CLASS
6550 #define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6551 #undef  TARGET_CAN_ELIMINATE
6552 #define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6553 #undef  TARGET_CANNOT_COPY_INSN_P
6554 #define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6555 #undef  TARGET_CLASS_LIKELY_SPILLED_P
6556 #define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6557 #undef  TARGET_CLASS_MAX_NREGS
6558 #define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6559 #undef  TARGET_CONDITIONAL_REGISTER_USAGE
6560 #define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6561 #undef  TARGET_CONSTANT_ALIGNMENT
6562 #define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6563 #undef  TARGET_DEBUG_UNWIND_INFO
6564 #define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6565 #undef  TARGET_DWARF_REGISTER_SPAN
6566 #define TARGET_DWARF_REGISTER_SPAN gcn_dwarf_register_span
6567 #undef  TARGET_EMUTLS_VAR_INIT
6568 #define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6569 #undef  TARGET_EXPAND_BUILTIN
6570 #define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6571 #undef  TARGET_FRAME_POINTER_REQUIRED
6572 #define TARGET_FRAME_POINTER_REQUIRED gcn_frame_pointer_rqd
6573 #undef  TARGET_FUNCTION_ARG
6574 #define TARGET_FUNCTION_ARG gcn_function_arg
6575 #undef  TARGET_FUNCTION_ARG_ADVANCE
6576 #define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6577 #undef  TARGET_FUNCTION_VALUE
6578 #define TARGET_FUNCTION_VALUE gcn_function_value
6579 #undef  TARGET_FUNCTION_VALUE_REGNO_P
6580 #define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6581 #undef  TARGET_GIMPLIFY_VA_ARG_EXPR
6582 #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6583 #undef  TARGET_OMP_DEVICE_KIND_ARCH_ISA
6584 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
6585 #undef  TARGET_GOACC_ADJUST_PRIVATE_DECL
6586 #define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
6587 #undef  TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
6588 #define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
6589   gcn_goacc_create_worker_broadcast_record
6590 #undef  TARGET_GOACC_FORK_JOIN
6591 #define TARGET_GOACC_FORK_JOIN gcn_fork_join
6592 #undef  TARGET_GOACC_REDUCTION
6593 #define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6594 #undef  TARGET_GOACC_VALIDATE_DIMS
6595 #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6596 #undef  TARGET_GOACC_SHARED_MEM_LAYOUT
6597 #define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
6598 #undef  TARGET_HARD_REGNO_MODE_OK
6599 #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6600 #undef  TARGET_HARD_REGNO_NREGS
6601 #define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6602 #undef  TARGET_HAVE_SPECULATION_SAFE_VALUE
6603 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6604 #undef  TARGET_INIT_BUILTINS
6605 #define TARGET_INIT_BUILTINS gcn_init_builtins
6606 #undef  TARGET_INIT_LIBFUNCS
6607 #define TARGET_INIT_LIBFUNCS gcn_init_libfuncs
6608 #undef  TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6609 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6610   gcn_ira_change_pseudo_allocno_class
6611 #undef  TARGET_LEGITIMATE_CONSTANT_P
6612 #define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6613 #undef  TARGET_LRA_P
6614 #define TARGET_LRA_P hook_bool_void_true
6615 #undef  TARGET_MACHINE_DEPENDENT_REORG
6616 #define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6617 #undef  TARGET_MEMORY_MOVE_COST
6618 #define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6619 #undef  TARGET_MODES_TIEABLE_P
6620 #define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6621 #undef  TARGET_OPTION_OVERRIDE
6622 #define TARGET_OPTION_OVERRIDE gcn_option_override
6623 #undef  TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6624 #define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6625   gcn_pretend_outgoing_varargs_named
6626 #undef  TARGET_PROMOTE_FUNCTION_MODE
6627 #define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6628 #undef  TARGET_REGISTER_MOVE_COST
6629 #define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6630 #undef  TARGET_RETURN_IN_MEMORY
6631 #define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6632 #undef  TARGET_RTX_COSTS
6633 #define TARGET_RTX_COSTS gcn_rtx_costs
6634 #undef  TARGET_SECONDARY_RELOAD
6635 #define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6636 #undef  TARGET_SECTION_TYPE_FLAGS
6637 #define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6638 #undef  TARGET_SCALAR_MODE_SUPPORTED_P
6639 #define TARGET_SCALAR_MODE_SUPPORTED_P gcn_scalar_mode_supported_p
6640 #undef  TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6641 #define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6642   gcn_small_register_classes_for_mode_p
6643 #undef  TARGET_SPILL_CLASS
6644 #define TARGET_SPILL_CLASS gcn_spill_class
6645 #undef  TARGET_STRICT_ARGUMENT_NAMING
6646 #define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6647 #undef  TARGET_TRAMPOLINE_INIT
6648 #define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6649 #undef  TARGET_TRULY_NOOP_TRUNCATION
6650 #define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6651 #undef  TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6652 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6653 #undef  TARGET_VECTORIZE_GET_MASK_MODE
6654 #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6655 #undef  TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6656 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6657 #undef  TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6658 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6659   gcn_preferred_vector_alignment
6660 #undef  TARGET_VECTORIZE_RELATED_MODE
6661 #define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
6662 #undef  TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6663 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6664   gcn_vectorize_support_vector_misalignment
6665 #undef  TARGET_VECTORIZE_VEC_PERM_CONST
6666 #define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6667 #undef  TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6668 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6669   gcn_vector_alignment_reachable
6670 #undef  TARGET_VECTOR_MODE_SUPPORTED_P
6671 #define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6672 
6673 struct gcc_target targetm = TARGET_INITIALIZER;
6674 
6675 #include "gt-gcn.h"
6676 /* }}}  */
6677