1 /* Copyright (C) 2016-2020 Free Software Foundation, Inc.
2 
3    This file is free software; you can redistribute it and/or modify it under
4    the terms of the GNU General Public License as published by the Free
5    Software Foundation; either version 3 of the License, or (at your option)
6    any later version.
7 
8    This file is distributed in the hope that it will be useful, but WITHOUT
9    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11    for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with GCC; see the file COPYING3.  If not see
15    <http://www.gnu.org/licenses/>.  */
16 
17 /* {{{ Includes.  */
18 
19 /* We want GET_MODE_SIZE et al to return integers, please.  */
20 #define IN_TARGET_CODE 1
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "memmodel.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic-core.h"
38 #include "insn-attr.h"
39 #include "fold-const.h"
40 #include "calls.h"
41 #include "explow.h"
42 #include "expr.h"
43 #include "output.h"
44 #include "cfgrtl.h"
45 #include "langhooks.h"
46 #include "builtins.h"
47 #include "omp-general.h"
48 #include "print-rtl.h"
49 #include "attribs.h"
50 #include "varasm.h"
51 #include "intl.h"
52 #include "rtl-iter.h"
53 
54 /* This file should be included last.  */
55 #include "target-def.h"
56 
57 /* }}}  */
58 /* {{{ Global variables.  */
59 
60 /* Constants used by FP instructions.  */
61 
62 static REAL_VALUE_TYPE dconst4, dconst1over2pi;
63 static bool ext_gcn_constants_init = 0;
64 
65 /* Holds the ISA variant, derived from the command line parameters.  */
66 
67 int gcn_isa = 3;		/* Default to GCN3.  */
68 
69 /* Reserve this much space for LDS (for propagating variables from
70    worker-single mode to worker-partitioned mode), per workgroup.  Global
71    analysis could calculate an exact bound, but we don't do that yet.
72 
73    We want to permit full occupancy, so size accordingly.  */
74 
75 #define OMP_LDS_SIZE 0x600    /* 0x600 is 1/40 total, rounded down.  */
76 #define ACC_LDS_SIZE 32768    /* Half of the total should be fine.  */
77 #define OTHER_LDS_SIZE 65536  /* If in doubt, reserve all of it.  */
78 
79 #define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
80 		  : flag_openmp ? OMP_LDS_SIZE \
81 		  : OTHER_LDS_SIZE)
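
/* For example, assuming the usual GCN figures of 64 KiB LDS per compute
   unit, 40 resident waves (4 SIMDs x 10 waves) at full occupancy, and a
   256-byte LDS allocation granule:

     65536 bytes / 40 = 1638, rounded down to the granule = 1536 = 0x600

   so OMP_LDS_SIZE still permits full occupancy, ACC_LDS_SIZE takes half
   of the 64 KiB, and OTHER_LDS_SIZE conservatively reserves all of it.  */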
82 
83 /* The number of registers usable by normal non-kernel functions.
84    The SGPR count includes any special extra registers such as VCC.  */
85 
86 #define MAX_NORMAL_SGPR_COUNT	64
87 #define MAX_NORMAL_VGPR_COUNT	24
88 
89 /* }}}  */
90 /* {{{ Initialization and options.  */
91 
92 /* Initialize machine_function.  */
93 
94 static struct machine_function *
gcn_init_machine_status (void)
96 {
97   struct machine_function *f;
98 
99   f = ggc_cleared_alloc<machine_function> ();
100 
101   /* Set up LDS allocation for broadcasting for this function.  */
102   f->lds_allocated = 32;
103   f->lds_allocs = hash_map<tree, int>::create_ggc (64);
104 
105   /* And LDS temporary decls for worker reductions.  */
106   vec_alloc (f->reduc_decls, 0);
107 
108   if (TARGET_GCN3)
109     f->use_flat_addressing = true;
110 
111   return f;
112 }
113 
114 /* Implement TARGET_OPTION_OVERRIDE.
115 
116    Override option settings where defaults are variable, or we have specific
117    needs to consider.  */
118 
119 static void
gcn_option_override (void)
121 {
122   init_machine_status = gcn_init_machine_status;
123 
124   /* The HSA runtime does not respect ELF load addresses, so force PIE.  */
125   if (!flag_pie)
126     flag_pie = 2;
127   if (!flag_pic)
128     flag_pic = flag_pie;
129 
130   gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;
131 
132   /* The default stack size needs to be small for offload kernels because
133      there may be many, many threads.  Also, a smaller stack gives a
     measurable performance boost.  But, a small stack is insufficient
     for running the testsuite, so we use a larger default for the
     stand-alone case.  */
137   if (stack_size_opt == -1)
138     {
139       if (flag_openacc || flag_openmp)
140 	/* 512 bytes per work item = 32kB total.  */
141 	stack_size_opt = 512 * 64;
142       else
143 	/* 1MB total.  */
144 	stack_size_opt = 1048576;
145     }
146 }
147 
148 /* }}}  */
149 /* {{{ Attributes.  */
150 
151 /* This table defines the arguments that are permitted in
152    __attribute__ ((amdgpu_hsa_kernel (...))).
153 
154    The names and values correspond to the HSA metadata that is encoded
155    into the assembler file and binary.  */
156 
157 static const struct gcn_kernel_arg_type
158 {
159   const char *name;
160   const char *header_pseudo;
161   machine_mode mode;
162 
163   /* This should be set to -1 or -2 for a dynamically allocated register
164      number.  Use -1 if this argument contributes to the user_sgpr_count,
165      -2 otherwise.  */
166   int fixed_regno;
167 } gcn_kernel_arg_types[] = {
168   {"exec", NULL, DImode, EXEC_REG},
169 #define PRIVATE_SEGMENT_BUFFER_ARG 1
170   {"private_segment_buffer",
171     "enable_sgpr_private_segment_buffer", TImode, -1},
172 #define DISPATCH_PTR_ARG 2
173   {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
174 #define QUEUE_PTR_ARG 3
175   {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
176 #define KERNARG_SEGMENT_PTR_ARG 4
177   {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
178   {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
179 #define FLAT_SCRATCH_INIT_ARG 6
180   {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
181 #define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
182   {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
183   {"grid_workgroup_count_X",
184     "enable_sgpr_grid_workgroup_count_x", SImode, -1},
185   {"grid_workgroup_count_Y",
186     "enable_sgpr_grid_workgroup_count_y", SImode, -1},
187   {"grid_workgroup_count_Z",
188     "enable_sgpr_grid_workgroup_count_z", SImode, -1},
189 #define WORKGROUP_ID_X_ARG 11
190   {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
191   {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
192   {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
193   {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
194 #define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
195   {"private_segment_wave_offset",
196     "enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
197 #define WORK_ITEM_ID_X_ARG 16
198   {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
199 #define WORK_ITEM_ID_Y_ARG 17
200   {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
201 #define WORK_ITEM_ID_Z_ARG 18
202   {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
203 };
204 
205 static const long default_requested_args
206 	= (1 << PRIVATE_SEGMENT_BUFFER_ARG)
207 	  | (1 << DISPATCH_PTR_ARG)
208 	  | (1 << QUEUE_PTR_ARG)
209 	  | (1 << KERNARG_SEGMENT_PTR_ARG)
210 	  | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
211 	  | (1 << WORKGROUP_ID_X_ARG)
212 	  | (1 << WORK_ITEM_ID_X_ARG)
213 	  | (1 << WORK_ITEM_ID_Y_ARG)
214 	  | (1 << WORK_ITEM_ID_Z_ARG);
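
/* For example, a kernel declared as

     void f (void) __attribute__ ((amdgpu_hsa_kernel ("exec",
						       "workgroup_id_Y")));

   requests the EXEC and workgroup_id_Y arguments, in that order, on top
   of everything in default_requested_args; the parser below records the
   explicitly named arguments in ARGS->order and sets the corresponding
   bits in ARGS->requested.  */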
215 
216 /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
217    This function also sets the default values for some arguments.
218 
   Return true if an error is diagnosed; otherwise return false with ARGS
   populated.  */
220 
221 static bool
gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
223 				       tree list)
224 {
225   bool err = false;
226   args->requested = default_requested_args;
227   args->nargs = 0;
228 
229   for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
230     args->reg[a] = -1;
231 
232   for (; list; list = TREE_CHAIN (list))
233     {
234       const char *str;
235       if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
236 	{
237 	  error ("amdgpu_hsa_kernel attribute requires string constant "
238 		 "arguments");
239 	  break;
240 	}
241       str = TREE_STRING_POINTER (TREE_VALUE (list));
242       int a;
243       for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
244 	{
245 	  if (!strcmp (str, gcn_kernel_arg_types[a].name))
246 	    break;
247 	}
248       if (a == GCN_KERNEL_ARG_TYPES)
249 	{
250 	  error ("unknown specifier %s in amdgpu_hsa_kernel attribute", str);
251 	  err = true;
252 	  break;
253 	}
254       if (args->requested & (1 << a))
255 	{
256 	  error ("duplicated parameter specifier %s in amdgpu_hsa_kernel "
257 		 "attribute", str);
258 	  err = true;
259 	  break;
260 	}
261       args->requested |= (1 << a);
262       args->order[args->nargs++] = a;
263     }
264 
265   /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
266      WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
267      requesting WORK_ITEM_ID_X_ARG.  */
268   if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
269     args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
270   if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
271     args->requested |= (1 << WORK_ITEM_ID_X_ARG);
272 
273   int sgpr_regno = FIRST_SGPR_REG;
274   args->nsgprs = 0;
275   for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
276     {
277       if (!(args->requested & (1 << a)))
278 	continue;
279 
280       if (gcn_kernel_arg_types[a].fixed_regno >= 0)
281 	args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
282       else
283 	{
284 	  int reg_count;
285 
286 	  switch (gcn_kernel_arg_types[a].mode)
287 	    {
288 	    case E_SImode:
289 	      reg_count = 1;
290 	      break;
291 	    case E_DImode:
292 	      reg_count = 2;
293 	      break;
294 	    case E_TImode:
295 	      reg_count = 4;
296 	      break;
297 	    default:
298 	      gcc_unreachable ();
299 	    }
300 	  args->reg[a] = sgpr_regno;
301 	  sgpr_regno += reg_count;
302 	  if (gcn_kernel_arg_types[a].fixed_regno == -1)
303 	    args->nsgprs += reg_count;
304 	}
305     }
306   if (sgpr_regno > FIRST_SGPR_REG + 16)
307     {
308       error ("too many arguments passed in sgpr registers");
309     }
310   return err;
311 }
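
/* For example, with only the default arguments requested, the dynamically
   numbered SGPRs are handed out in table order starting at FIRST_SGPR_REG:

     private_segment_buffer           TImode  FIRST_SGPR_REG+0..3
     dispatch_ptr                     DImode  FIRST_SGPR_REG+4..5
     queue_ptr                        DImode  FIRST_SGPR_REG+6..7
     kernarg_segment_ptr              DImode  FIRST_SGPR_REG+8..9
     workgroup_id_X                   SImode  FIRST_SGPR_REG+10
     private_segment_wave_offset      SImode  FIRST_SGPR_REG+11

   and NSGPRS counts only the "-1" entries, giving 10 user SGPRs; the
   work_item_id_* arguments live in fixed VGPRs and do not affect the
   count.  */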
312 
313 /* Referenced by TARGET_ATTRIBUTE_TABLE.
314 
315    Validates target specific attributes.  */
316 
317 static tree
gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
319 					tree args, int, bool *no_add_attrs)
320 {
321   if (!FUNC_OR_METHOD_TYPE_P (*node))
322     {
323       warning (OPT_Wattributes, "%qE attribute only applies to functions",
324 	       name);
325       *no_add_attrs = true;
326       return NULL_TREE;
327     }
328 
  /* Check the arguments of the amdgpu_hsa_kernel attribute.  */
  if (is_attribute_p ("amdgpu_hsa_kernel", name))
331     {
332       struct gcn_kernel_args kernelarg;
333 
334       if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
335 	*no_add_attrs = true;
336 
337       return NULL_TREE;
338     }
339 
340   return NULL_TREE;
341 }
342 
343 /* Implement TARGET_ATTRIBUTE_TABLE.
344 
345    Create target-specific __attribute__ types.  */
346 
347 static const struct attribute_spec gcn_attribute_table[] = {
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
     affects_type_identity, handler, exclude } */
350   {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
351    true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
352   /* End element.  */
353   {NULL, 0, 0, false, false, false, false, NULL, NULL}
354 };
355 
356 /* }}}  */
357 /* {{{ Registers and modes.  */
358 
359 /* Implement TARGET_CLASS_MAX_NREGS.
360 
361    Return the number of hard registers needed to hold a value of MODE in
362    a register of class RCLASS.  */
363 
364 static unsigned char
gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
366 {
367   /* Scalar registers are 32bit, vector registers are in fact tuples of
368      64 lanes.  */
369   if (rclass == VGPR_REGS)
370     {
371       if (vgpr_1reg_mode_p (mode))
372 	return 1;
373       if (vgpr_2reg_mode_p (mode))
374 	return 2;
375       /* TImode is used by DImode compare_and_swap.  */
376       if (mode == TImode)
377 	return 4;
378     }
379   else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
380     return 2;
381   return CEIL (GET_MODE_SIZE (mode), 4);
382 }
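
/* For example, DImode in SGPR_REGS needs CEIL (8, 4) == 2 scalar
   registers, whereas V64SImode in VGPR_REGS needs only one VGPR, since a
   single VGPR is already a 64-lane tuple of 32-bit values.  */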
383 
384 /* Implement TARGET_HARD_REGNO_NREGS.
385 
386    Return the number of hard registers needed to hold a value of MODE in
387    REGNO.  */
388 
389 unsigned int
gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
391 {
392   return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
393 }
394 
395 /* Implement TARGET_HARD_REGNO_MODE_OK.
396 
397    Return true if REGNO can hold value in MODE.  */
398 
399 bool
gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
401 {
402   /* Treat a complex mode as if it were a scalar mode of the same overall
403      size for the purposes of allocating hard registers.  */
404   if (COMPLEX_MODE_P (mode))
405     switch (mode)
406       {
407       case E_CQImode:
408       case E_CHImode:
409 	mode = SImode;
410 	break;
411       case E_CSImode:
412 	mode = DImode;
413 	break;
414       case E_CDImode:
415 	mode = TImode;
416 	break;
417       case E_HCmode:
418 	mode = SFmode;
419 	break;
420       case E_SCmode:
421 	mode = DFmode;
422 	break;
423       default:
424 	/* Not supported.  */
425 	return false;
426       }
427 
428   switch (regno)
429     {
430     case FLAT_SCRATCH_LO_REG:
431     case XNACK_MASK_LO_REG:
432     case TBA_LO_REG:
433     case TMA_LO_REG:
434       return (mode == SImode || mode == DImode);
435     case VCC_LO_REG:
436     case EXEC_LO_REG:
437       return (mode == BImode || mode == SImode || mode == DImode);
438     case M0_REG:
439     case FLAT_SCRATCH_HI_REG:
440     case XNACK_MASK_HI_REG:
441     case TBA_HI_REG:
442     case TMA_HI_REG:
443       return mode == SImode;
444     case VCC_HI_REG:
445       return false;
446     case EXEC_HI_REG:
447       return mode == SImode /*|| mode == V32BImode */ ;
448     case SCC_REG:
449     case VCCZ_REG:
450     case EXECZ_REG:
451       return mode == BImode;
452     }
453   if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
454     return true;
455   if (SGPR_REGNO_P (regno))
456     /* We restrict double register values to aligned registers.  */
457     return (sgpr_1reg_mode_p (mode)
458 	    || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
459 	    || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
460   if (VGPR_REGNO_P (regno))
461     /* Vector instructions do not care about the alignment of register
462        pairs, but where there is no 64-bit instruction, many of the
463        define_split do not work if the input and output registers partially
464        overlap.  We tried to fix this with early clobber and match
465        constraints, but it was bug prone, added complexity, and conflicts
466        with the 'U0' constraints on vec_merge.
       Therefore, we restrict ourselves to aligned registers.  */
468     return (vgpr_1reg_mode_p (mode)
469 	    || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
470 	    /* TImode is used by DImode compare_and_swap.  */
471 	    || (mode == TImode
472 		&& !((regno - FIRST_VGPR_REG) & 3)));
473   return false;
474 }
475 
476 /* Implement REGNO_REG_CLASS via gcn.h.
477 
478    Return smallest class containing REGNO.  */
479 
480 enum reg_class
gcn_regno_reg_class (int regno)
482 {
483   switch (regno)
484     {
485     case SCC_REG:
486       return SCC_CONDITIONAL_REG;
487     case VCC_LO_REG:
488     case VCC_HI_REG:
489       return VCC_CONDITIONAL_REG;
490     case VCCZ_REG:
491       return VCCZ_CONDITIONAL_REG;
492     case EXECZ_REG:
493       return EXECZ_CONDITIONAL_REG;
494     case EXEC_LO_REG:
495     case EXEC_HI_REG:
496       return EXEC_MASK_REG;
497     }
498   if (VGPR_REGNO_P (regno))
499     return VGPR_REGS;
500   if (SGPR_REGNO_P (regno))
501     return SGPR_REGS;
502   if (regno < FIRST_VGPR_REG)
503     return GENERAL_REGS;
504   if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
505     return AFP_REGS;
506   return ALL_REGS;
507 }
508 
509 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.
510 
   GCC assumes that the lowpart contains the first part of the value as
   stored in memory.  This is not the case for vector registers.  */
513 
514 bool
gcn_can_change_mode_class (machine_mode from, machine_mode to,
516 			   reg_class_t regclass)
517 {
518   if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
519     return true;
520   return (gcn_class_max_nregs (regclass, from)
521 	  == gcn_class_max_nregs (regclass, to));
522 }
523 
524 /* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
525 
526    When this hook returns true for MODE, the compiler allows
527    registers explicitly used in the rtl to be used as spill registers
528    but prevents the compiler from extending the lifetime of these
529    registers.  */
530 
531 bool
gcn_small_register_classes_for_mode_p (machine_mode mode)
533 {
  /* We allocate into exec and vcc regs.  Those form a small register class.  */
535   return mode == DImode || mode == SImode;
536 }
537 
538 /* Implement TARGET_CLASS_LIKELY_SPILLED_P.
539 
540    Returns true if pseudos that have been assigned to registers of class RCLASS
541    would likely be spilled because registers of RCLASS are needed for spill
542    registers.  */
543 
544 static bool
gcn_class_likely_spilled_p (reg_class_t rclass)
546 {
547   return (rclass == EXEC_MASK_REG
548 	  || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
549 }
550 
551 /* Implement TARGET_MODES_TIEABLE_P.
552 
553    Returns true if a value of MODE1 is accessible in MODE2 without
554    copying.  */
555 
556 bool
gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
558 {
559   return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
560 	  && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
561 }
562 
563 /* Implement TARGET_TRULY_NOOP_TRUNCATION.
564 
565    Returns true if it is safe to “convert” a value of INPREC bits to one of
566    OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
567    it as if it had only OUTPREC bits.  */
568 
569 bool
gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
571 {
572   return ((inprec <= 32) && (outprec <= inprec));
573 }
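
/* For example, truncating SImode to QImode is a no-op (the upper bits of
   the 32-bit register are simply ignored), but truncating DImode to
   SImode is not, because the DImode value occupies two registers.  */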
574 
575 /* Return N-th part of value occupying multiple registers.  */
576 
577 rtx
gcn_operand_part (machine_mode mode, rtx op, int n)
579 {
580   if (GET_MODE_SIZE (mode) >= 256)
581     {
582       /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0);  */
583 
584       if (REG_P (op))
585 	{
586 	  gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
587 	  return gen_rtx_REG (V64SImode, REGNO (op) + n);
588 	}
589       if (GET_CODE (op) == CONST_VECTOR)
590 	{
591 	  int units = GET_MODE_NUNITS (mode);
592 	  rtvec v = rtvec_alloc (units);
593 
594 	  for (int i = 0; i < units; ++i)
595 	    RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
596 						 CONST_VECTOR_ELT (op, i), n);
597 
598 	  return gen_rtx_CONST_VECTOR (V64SImode, v);
599 	}
600       if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
601 	return gcn_gen_undef (V64SImode);
602       gcc_unreachable ();
603     }
604   else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
605     {
606       gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
607       return gen_rtx_REG (SImode, REGNO (op) + n);
608     }
609   else
610     {
611       if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
612 	return gcn_gen_undef (SImode);
613 
614       /* If it's a constant then let's assume it is of the largest mode
615 	 available, otherwise simplify_gen_subreg will fail.  */
616       if (mode == VOIDmode && CONST_INT_P (op))
617 	mode = DImode;
618       return simplify_gen_subreg (SImode, op, mode, n * 4);
619     }
620 }
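
/* For example, gcn_operand_part (DImode, op, 1) gives the SImode register
   holding the high 32 bits of OP, and for a V64DImode register pair part
   1 is the V64SImode register holding the high half of every lane.  */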
621 
622 /* Return N-th part of value occupying multiple registers.  */
623 
624 rtx
gcn_operand_doublepart (machine_mode mode, rtx op, int n)
626 {
627   return simplify_gen_subreg (DImode, op, mode, n * 8);
628 }
629 
630 /* Return true if OP can be split into subregs or high/low parts.
631    This is always true for scalars, but not normally true for vectors.
632    However, for vectors in hardregs we can use the low and high registers.  */
633 
634 bool
gcn_can_split_p (machine_mode, rtx op)
636 {
637   if (vgpr_vector_mode_p (GET_MODE (op)))
638     {
639       if (GET_CODE (op) == SUBREG)
640 	op = SUBREG_REG (op);
641       if (!REG_P (op))
642 	return true;
643       return REGNO (op) <= FIRST_PSEUDO_REGISTER;
644     }
645   return true;
646 }
647 
648 /* Implement TARGET_SPILL_CLASS.
649 
650    Return class of registers which could be used for pseudo of MODE
651    and of class RCLASS for spilling instead of memory.  Return NO_REGS
652    if it is not possible or non-profitable.  */
653 
654 static reg_class_t
gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
656 {
657   if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
658       || c == VCC_CONDITIONAL_REG)
659     return SGPR_REGS;
660   else
661     return NO_REGS;
662 }
663 
664 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
665 
666    Change allocno class for given pseudo from allocno and best class
667    calculated by IRA.  */
668 
669 static reg_class_t
gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
671 				     reg_class_t best_cl)
672 {
673   /* Avoid returning classes that contain both vgpr and sgpr registers.  */
674   if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
675     return cl;
676   if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
677       && best_cl != ALL_GPR_REGS)
678     return best_cl;
679 
680   machine_mode mode = PSEUDO_REGNO_MODE (regno);
681   if (vgpr_vector_mode_p (mode))
682     return VGPR_REGS;
683 
684   return GENERAL_REGS;
685 }
686 
687 /* Create a new DImode pseudo reg and emit an instruction to initialize
688    it to VAL.  */
689 
690 static rtx
get_exec (int64_t val)
692 {
693   rtx reg = gen_reg_rtx (DImode);
694   emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
695   return reg;
696 }
697 
698 /* Return value of scalar exec register.  */
699 
700 rtx
gcn_scalar_exec ()
702 {
703   return const1_rtx;
704 }
705 
706 /* Return pseudo holding scalar exec register.  */
707 
708 rtx
gcn_scalar_exec_reg ()
710 {
711   return get_exec (1);
712 }
713 
714 /* Return value of full exec register.  */
715 
716 rtx
gcn_full_exec ()
718 {
719   return constm1_rtx;
720 }
721 
722 /* Return pseudo holding full exec register.  */
723 
724 rtx
gcn_full_exec_reg ()
726 {
727   return get_exec (-1);
728 }
729 
730 /* }}}  */
731 /* {{{ Immediate constants.  */
732 
733 /* Initialize shared numeric constants.  */
734 
735 static void
init_ext_gcn_constants (void)
737 {
738   real_from_integer (&dconst4, DFmode, 4, SIGNED);
739 
740   /* FIXME: this constant probably does not match what hardware really loads.
741      Reality check it eventually.  */
742   real_from_string (&dconst1over2pi,
743 		    "0.1591549430918953357663423455968866839");
744   real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
745 
746   ext_gcn_constants_init = 1;
747 }
748 
749 /* Return non-zero if X is a constant that can appear as an inline operand.
   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4 or 1/(2*pi),
   or a vector of those.
752    The value returned should be the encoding of this constant.  */
753 
754 int
gcn_inline_fp_constant_p (rtx x, bool allow_vector)
756 {
757   machine_mode mode = GET_MODE (x);
758 
759   if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
760       && allow_vector)
761     {
762       int n;
763       if (GET_CODE (x) != CONST_VECTOR)
764 	return 0;
765       n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
766       if (!n)
767 	return 0;
768       for (int i = 1; i < 64; i++)
769 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
770 	  return 0;
771       return 1;
772     }
773 
774   if (mode != HFmode && mode != SFmode && mode != DFmode)
775     return 0;
776 
777   const REAL_VALUE_TYPE *r;
778 
779   if (x == CONST0_RTX (mode))
780     return 128;
781   if (x == CONST1_RTX (mode))
782     return 242;
783 
784   r = CONST_DOUBLE_REAL_VALUE (x);
785 
  if (!ext_gcn_constants_init)
    init_ext_gcn_constants ();

  if (real_identical (r, &dconsthalf))
    return 240;
  if (real_identical (r, &dconstm1))
    return 243;
  if (real_identical (r, &dconst2))
    return 244;
  if (real_identical (r, &dconst4))
    return 246;
  if (real_identical (r, &dconst1over2pi))
    return 248;

  /* The negative variants only match against the negated input.  */
  REAL_VALUE_TYPE rneg = real_value_negate (r);
  if (real_identical (&rneg, &dconsthalf))
    return 241;
  if (real_identical (&rneg, &dconst2))
    return 245;
  if (real_identical (&rneg, &dconst4))
    return 247;
808 
809   /* FIXME: add 4, -4 and 1/(2*PI).  */
810 
811   return 0;
812 }
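
/* So, for example, 1.0 encodes as 242, -1.0 as 243 and 0.5 as 240; a zero
   return value means "not an inline FP constant".  */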
813 
/* Return true if X is a floating-point constant that can appear as an
   immediate operand, either as an inline constant or as a 32-bit literal;
   a vector of identical such constants is also accepted.  */
818 
819 bool
gcn_fp_constant_p (rtx x, bool allow_vector)
821 {
822   machine_mode mode = GET_MODE (x);
823 
824   if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
825       && allow_vector)
826     {
827       int n;
828       if (GET_CODE (x) != CONST_VECTOR)
829 	return false;
830       n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
831       if (!n)
832 	return false;
833       for (int i = 1; i < 64; i++)
834 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
835 	  return false;
836       return true;
837     }
838   if (mode != HFmode && mode != SFmode && mode != DFmode)
839     return false;
840 
841   if (gcn_inline_fp_constant_p (x, false))
842     return true;
843   /* FIXME: It is not clear how 32bit immediates are interpreted here.  */
844   return (mode != DFmode);
845 }
846 
847 /* Return true if X is a constant representable as an inline immediate
848    constant in a 32-bit instruction encoding.  */
849 
850 bool
gcn_inline_constant_p (rtx x)
852 {
853   if (GET_CODE (x) == CONST_INT)
854     return INTVAL (x) >= -16 && INTVAL (x) <= 64;
855   if (GET_CODE (x) == CONST_DOUBLE)
856     return gcn_inline_fp_constant_p (x, false);
857   if (GET_CODE (x) == CONST_VECTOR)
858     {
859       int n;
860       if (!vgpr_vector_mode_p (GET_MODE (x)))
861 	return false;
862       n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
863       if (!n)
864 	return false;
865       for (int i = 1; i < 64; i++)
866 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
867 	  return false;
      return true;
869     }
870   return false;
871 }
872 
873 /* Return true if X is a constant representable as an immediate constant
874    in a 32 or 64-bit instruction encoding.  */
875 
876 bool
gcn_constant_p (rtx x)
878 {
879   switch (GET_CODE (x))
880     {
881     case CONST_INT:
882       return true;
883 
884     case CONST_DOUBLE:
885       return gcn_fp_constant_p (x, false);
886 
887     case CONST_VECTOR:
888       {
889 	int n;
890 	if (!vgpr_vector_mode_p (GET_MODE (x)))
891 	  return false;
892 	n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
893 	if (!n)
894 	  return false;
895 	for (int i = 1; i < 64; i++)
896 	  if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
897 	    return false;
898 	return true;
899       }
900 
901     case SYMBOL_REF:
902     case LABEL_REF:
903       return true;
904 
905     default:
906       ;
907     }
908 
909   return false;
910 }
911 
912 /* Return true if X is a constant representable as two inline immediate
913    constants in a 64-bit instruction that is split into two 32-bit
914    instructions.
915    When MIXED is set, the low-part is permitted to use the full 32-bits.  */
916 
917 bool
gcn_inline_constant64_p (rtx x, bool mixed)
919 {
920   if (GET_CODE (x) == CONST_VECTOR)
921     {
922       if (!vgpr_vector_mode_p (GET_MODE (x)))
923 	return false;
924       if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
925 	return false;
926       for (int i = 1; i < 64; i++)
927 	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
928 	  return false;
929 
930       return true;
931     }
932 
933   if (GET_CODE (x) != CONST_INT)
934     return false;
935 
936   rtx val_lo = gcn_operand_part (DImode, x, 0);
937   rtx val_hi = gcn_operand_part (DImode, x, 1);
938   return ((mixed || gcn_inline_constant_p (val_lo))
939 	  && gcn_inline_constant_p (val_hi));
940 }
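
/* For example, the CONST_INT 0x200000003 splits into lo = 3 and hi = 2,
   both inline-encodable, so it is accepted; 0x12345 is rejected unless
   MIXED is set, because its low part needs a full 32-bit literal.  */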
941 
942 /* Return true if X is a constant representable as an immediate constant
943    in a 32 or 64-bit instruction encoding where the hardware will
944    extend the immediate to 64-bits.  */
945 
946 bool
gcn_constant64_p (rtx x)
948 {
949   if (!gcn_constant_p (x))
950     return false;
951 
952   if (GET_CODE (x) != CONST_INT)
953     return true;
954 
955   /* Negative numbers are only allowed if they can be encoded within src0,
956      because the 32-bit immediates do not get sign-extended.
957      Unsigned numbers must not be encodable as 32-bit -1..-16, because the
958      assembler will use a src0 inline immediate and that will get
959      sign-extended.  */
960   HOST_WIDE_INT val = INTVAL (x);
961   return (((val & 0xffffffff) == val	/* Positive 32-bit.  */
962 	   && (val & 0xfffffff0) != 0xfffffff0)	/* Not -1..-16.  */
963 	  || gcn_inline_constant_p (x));	/* Src0.  */
964 }
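
/* For example, 0x7fffffff is acceptable because the upper 32 bits remain
   zero, but 0xfffffff5 is not: the assembler would emit it as the inline
   constant -11 in src0 and the hardware would sign-extend it to
   0xfffffffffffffff5.  */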
965 
966 /* Implement TARGET_LEGITIMATE_CONSTANT_P.
967 
968    Returns true if X is a legitimate constant for a MODE immediate operand.  */
969 
970 bool
gcn_legitimate_constant_p (machine_mode, rtx x)
972 {
973   return gcn_constant_p (x);
974 }
975 
/* Return true if X is a CONST_VECTOR of a single repeated constant.  */
977 
978 static bool
single_cst_vector_p (rtx x)
980 {
981   if (GET_CODE (x) != CONST_VECTOR)
982     return false;
983   for (int i = 1; i < 64; i++)
984     if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
985       return false;
986   return true;
987 }
988 
989 /* Create a CONST_VECTOR of duplicated value A.  */
990 
991 rtx
gcn_vec_constant (machine_mode mode, int a)
993 {
994   /*if (!a)
995     return CONST0_RTX (mode);
996   if (a == -1)
997     return CONSTM1_RTX (mode);
998   if (a == 1)
999     return CONST1_RTX (mode);
1000   if (a == 2)
1001     return CONST2_RTX (mode);*/
1002 
1003   int units = GET_MODE_NUNITS (mode);
1004   machine_mode innermode = GET_MODE_INNER (mode);
1005 
1006   rtx tem;
1007   if (FLOAT_MODE_P (innermode))
1008     {
1009       REAL_VALUE_TYPE rv;
1010       real_from_integer (&rv, NULL, a, SIGNED);
1011       tem = const_double_from_real_value (rv, innermode);
1012     }
1013   else
1014     tem = gen_int_mode (a, innermode);
1015 
1016   rtvec v = rtvec_alloc (units);
1017   for (int i = 0; i < units; ++i)
1018     RTVEC_ELT (v, i) = tem;
1019 
1020   return gen_rtx_CONST_VECTOR (mode, v);
1021 }
1022 
1023 /* Create a CONST_VECTOR of duplicated value A.  */
1024 
1025 rtx
gcn_vec_constant (machine_mode mode, rtx a)
1027 {
1028   int units = GET_MODE_NUNITS (mode);
1029   rtvec v = rtvec_alloc (units);
1030 
1031   for (int i = 0; i < units; ++i)
1032     RTVEC_ELT (v, i) = a;
1033 
1034   return gen_rtx_CONST_VECTOR (mode, v);
1035 }
1036 
1037 /* Create an undefined vector value, used where an insn operand is
1038    optional.  */
1039 
1040 rtx
gcn_gen_undef (machine_mode mode)
1042 {
1043   return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
1044 }
1045 
1046 /* }}}  */
1047 /* {{{ Addresses, pointers and moves.  */
1048 
/* Return true if REG is a valid place to store a pointer,
1050    for instructions that require an SGPR.
1051    FIXME rename. */
1052 
1053 static bool
gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
1055 {
1056   if (GET_CODE (reg) == SUBREG)
1057     reg = SUBREG_REG (reg);
1058 
1059   if (!REG_P (reg))
1060     return false;
1061 
1062   if (GET_MODE (reg) != mode)
1063     return false;
1064 
1065   int regno = REGNO (reg);
1066 
1067   if (regno >= FIRST_PSEUDO_REGISTER)
1068     {
1069       if (!strict)
1070 	return true;
1071 
1072       if (!reg_renumber)
1073 	return false;
1074 
1075       regno = reg_renumber[regno];
1076     }
1077 
1078   return (SGPR_REGNO_P (regno) || regno == M0_REG
1079 	  || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1080 }
1081 
/* Return true if REG is a valid place to store a pointer,
1083    for instructions that require a VGPR.  */
1084 
1085 static bool
gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
1087 {
1088   if (GET_CODE (reg) == SUBREG)
1089     reg = SUBREG_REG (reg);
1090 
1091   if (!REG_P (reg))
1092     return false;
1093 
1094   if (GET_MODE (reg) != mode)
1095     return false;
1096 
1097   int regno = REGNO (reg);
1098 
1099   if (regno >= FIRST_PSEUDO_REGISTER)
1100     {
1101       if (!strict)
1102 	return true;
1103 
1104       if (!reg_renumber)
1105 	return false;
1106 
1107       regno = reg_renumber[regno];
1108     }
1109 
1110   return VGPR_REGNO_P (regno);
1111 }
1112 
1113 /* Return true if X would be valid inside a MEM using the Flat address
1114    space.  */
1115 
1116 bool
gcn_flat_address_p (rtx x, machine_mode mode)
1118 {
1119   bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1120 		   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1121 
1122   if (vec_mode && gcn_address_register_p (x, DImode, false))
1123     return true;
1124 
1125   if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
1126     return true;
1127 
1128   if (TARGET_GCN5_PLUS
1129       && GET_CODE (x) == PLUS
1130       && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
1131       && CONST_INT_P (XEXP (x, 1)))
1132     return true;
1133 
1134   return false;
1135 }
1136 
1137 /* Return true if X would be valid inside a MEM using the Scalar Flat
1138    address space.  */
1139 
1140 bool
gcn_scalar_flat_address_p (rtx x)
1142 {
1143   if (gcn_address_register_p (x, DImode, false))
1144     return true;
1145 
1146   if (GET_CODE (x) == PLUS
1147       && gcn_address_register_p (XEXP (x, 0), DImode, false)
1148       && CONST_INT_P (XEXP (x, 1)))
1149     return true;
1150 
1151   return false;
1152 }
1153 
1154 /* Return true if MEM X would be valid for the Scalar Flat address space.  */
1155 
1156 bool
gcn_scalar_flat_mem_p (rtx x)
1158 {
1159   if (!MEM_P (x))
1160     return false;
1161 
1162   if (GET_MODE_SIZE (GET_MODE (x)) < 4)
1163     return false;
1164 
1165   return gcn_scalar_flat_address_p (XEXP (x, 0));
1166 }
1167 
1168 /* Return true if X would be valid inside a MEM using the LDS or GDS
1169    address spaces.  */
1170 
1171 bool
gcn_ds_address_p (rtx x)
1173 {
1174   if (gcn_vec_address_register_p (x, SImode, false))
1175     return true;
1176 
1177   if (GET_CODE (x) == PLUS
1178       && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
1179       && CONST_INT_P (XEXP (x, 1)))
1180     return true;
1181 
1182   return false;
1183 }
1184 
1185 /* Return true if ADDR would be valid inside a MEM using the Global
1186    address space.  */
1187 
1188 bool
gcn_global_address_p (rtx addr)
1190 {
1191   if (gcn_address_register_p (addr, DImode, false)
1192       || gcn_vec_address_register_p (addr, DImode, false))
1193     return true;
1194 
1195   if (GET_CODE (addr) == PLUS)
1196     {
1197       rtx base = XEXP (addr, 0);
1198       rtx offset = XEXP (addr, 1);
1199       bool immediate_p = (CONST_INT_P (offset)
1200 			  && INTVAL (offset) >= -(1 << 12)
1201 			  && INTVAL (offset) < (1 << 12));
1202 
1203       if ((gcn_address_register_p (base, DImode, false)
1204 	   || gcn_vec_address_register_p (base, DImode, false))
1205 	  && immediate_p)
1206 	/* SGPR + CONST or VGPR + CONST  */
1207 	return true;
1208 
1209       if (gcn_address_register_p (base, DImode, false)
1210 	  && gcn_vgpr_register_operand (offset, SImode))
	/* SGPR + VGPR  */
1212 	return true;
1213 
1214       if (GET_CODE (base) == PLUS
1215 	  && gcn_address_register_p (XEXP (base, 0), DImode, false)
1216 	  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1217 	  && immediate_p)
1218 	/* (SGPR + VGPR) + CONST  */
1219 	return true;
1220     }
1221 
1222   return false;
1223 }
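
/* In other words, an SGPR or VGPR base on its own, a base plus a signed
   13-bit constant, an SGPR base plus a VGPR index, or an (SGPR + VGPR)
   base plus a signed 13-bit constant are all accepted; anything else must
   be legitimized first.  */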
1224 
1225 /* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
1226 
1227    Recognizes RTL expressions that are valid memory addresses for an
1228    instruction.  The MODE argument is the machine mode for the MEM
1229    expression that wants to use this address.
1230 
   It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
1232    convert common non-canonical forms to canonical form so that they will
1233    be recognized.  */
1234 
1235 static bool
gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
1237 				     addr_space_t as)
1238 {
1239   /* All vector instructions need to work on addresses in registers.  */
1240   if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
1241     return false;
1242 
1243   if (AS_SCALAR_FLAT_P (as))
1244     {
1245       if (mode == QImode || mode == HImode)
1246 	return 0;
1247 
1248       switch (GET_CODE (x))
1249 	{
1250 	case REG:
1251 	  return gcn_address_register_p (x, DImode, strict);
1252 	/* Addresses are in the form BASE+OFFSET
1253 	   OFFSET is either 20bit unsigned immediate, SGPR or M0.
1254 	   Writes and atomics do not accept SGPR.  */
1255 	case PLUS:
1256 	  {
1257 	    rtx x0 = XEXP (x, 0);
1258 	    rtx x1 = XEXP (x, 1);
1259 	    if (!gcn_address_register_p (x0, DImode, strict))
1260 	      return false;
1261 	    /* FIXME: This is disabled because of the mode mismatch between
1262 	       SImode (for the address or m0 register) and the DImode PLUS.
1263 	       We'll need a zero_extend or similar.
1264 
1265 	    if (gcn_m0_register_p (x1, SImode, strict)
1266 		|| gcn_address_register_p (x1, SImode, strict))
1267 	      return true;
1268 	    else*/
1269 	    if (GET_CODE (x1) == CONST_INT)
1270 	      {
1271 		if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
1272 		    /* The low bits of the offset are ignored, even when
1273 		       they're meant to realign the pointer.  */
1274 		    && !(INTVAL (x1) & 0x3))
1275 		  return true;
1276 	      }
1277 	    return false;
1278 	  }
1279 
1280 	default:
1281 	  break;
1282 	}
1283     }
1284   else if (AS_SCRATCH_P (as))
1285     return gcn_address_register_p (x, SImode, strict);
1286   else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
1287     {
1288       if (TARGET_GCN3 || GET_CODE (x) == REG)
1289        return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1290 		|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1291 	       ? gcn_address_register_p (x, DImode, strict)
1292 	       : gcn_vec_address_register_p (x, DImode, strict));
1293       else
1294 	{
1295 	  gcc_assert (TARGET_GCN5_PLUS);
1296 
1297 	  if (GET_CODE (x) == PLUS)
1298 	    {
1299 	      rtx x1 = XEXP (x, 1);
1300 
	      if (VECTOR_MODE_P (mode)
		  ? !gcn_address_register_p (XEXP (x, 0), DImode, strict)
		  : !gcn_vec_address_register_p (XEXP (x, 0), DImode, strict))
1304 		return false;
1305 
1306 	      if (GET_CODE (x1) == CONST_INT)
1307 		{
1308 		  if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
1309 		      /* The low bits of the offset are ignored, even when
1310 		         they're meant to realign the pointer.  */
1311 		      && !(INTVAL (x1) & 0x3))
1312 		    return true;
1313 		}
1314 	    }
1315 	  return false;
1316 	}
1317     }
1318   else if (AS_GLOBAL_P (as))
1319     {
1320       gcc_assert (TARGET_GCN5_PLUS);
1321 
1322       if (GET_CODE (x) == REG)
1323        return (gcn_address_register_p (x, DImode, strict)
1324 	       || (!VECTOR_MODE_P (mode)
1325 		   && gcn_vec_address_register_p (x, DImode, strict)));
1326       else if (GET_CODE (x) == PLUS)
1327 	{
1328 	  rtx base = XEXP (x, 0);
1329 	  rtx offset = XEXP (x, 1);
1330 
1331 	  bool immediate_p = (GET_CODE (offset) == CONST_INT
1332 			      /* Signed 13-bit immediate.  */
1333 			      && INTVAL (offset) >= -(1 << 12)
1334 			      && INTVAL (offset) < (1 << 12)
1335 			      /* The low bits of the offset are ignored, even
1336 			         when they're meant to realign the pointer.  */
1337 			      && !(INTVAL (offset) & 0x3));
1338 
1339 	  if (!VECTOR_MODE_P (mode))
1340 	    {
1341 	      if ((gcn_address_register_p (base, DImode, strict)
1342 		   || gcn_vec_address_register_p (base, DImode, strict))
1343 		  && immediate_p)
1344 		/* SGPR + CONST or VGPR + CONST  */
1345 		return true;
1346 
1347 	      if (gcn_address_register_p (base, DImode, strict)
1348 		  && gcn_vgpr_register_operand (offset, SImode))
1349 		/* SGPR + VGPR  */
1350 		return true;
1351 
1352 	      if (GET_CODE (base) == PLUS
1353 		  && gcn_address_register_p (XEXP (base, 0), DImode, strict)
1354 		  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1355 		  && immediate_p)
1356 		/* (SGPR + VGPR) + CONST  */
1357 		return true;
1358 	    }
1359 	  else
1360 	    {
1361 	      if (gcn_address_register_p (base, DImode, strict)
1362 		  && immediate_p)
1363 		/* SGPR + CONST  */
1364 		return true;
1365 	    }
1366 	}
1367       else
1368 	return false;
1369     }
1370   else if (AS_ANY_DS_P (as))
1371     switch (GET_CODE (x))
1372       {
1373       case REG:
1374 	return (VECTOR_MODE_P (mode)
1375 		? gcn_address_register_p (x, SImode, strict)
1376 		: gcn_vec_address_register_p (x, SImode, strict));
1377       /* Addresses are in the form BASE+OFFSET
1378 	 OFFSET is either 20bit unsigned immediate, SGPR or M0.
1379 	 Writes and atomics do not accept SGPR.  */
1380       case PLUS:
1381 	{
1382 	  rtx x0 = XEXP (x, 0);
1383 	  rtx x1 = XEXP (x, 1);
1384 	  if (!gcn_vec_address_register_p (x0, DImode, strict))
1385 	    return false;
1386 	  if (GET_CODE (x1) == REG)
1387 	    {
1388 	      if (GET_CODE (x1) != REG
1389 		  || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
1390 		      && !gcn_ssrc_register_operand (x1, DImode)))
1391 		return false;
1392 	    }
1393 	  else if (GET_CODE (x1) == CONST_VECTOR
1394 		   && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
1395 		   && single_cst_vector_p (x1))
1396 	    {
1397 	      x1 = CONST_VECTOR_ELT (x1, 0);
1398 	      if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
1399 		return true;
1400 	    }
1401 	  return false;
1402 	}
1403 
1404       default:
1405 	break;
1406       }
1407   else
1408     gcc_unreachable ();
1409   return false;
1410 }
1411 
1412 /* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1413 
1414    Return the appropriate mode for a named address pointer.  */
1415 
1416 static scalar_int_mode
gcn_addr_space_pointer_mode (addr_space_t addrspace)
1418 {
1419   switch (addrspace)
1420     {
1421     case ADDR_SPACE_SCRATCH:
1422     case ADDR_SPACE_LDS:
1423     case ADDR_SPACE_GDS:
1424       return SImode;
1425     case ADDR_SPACE_DEFAULT:
1426     case ADDR_SPACE_FLAT:
1427     case ADDR_SPACE_FLAT_SCRATCH:
1428     case ADDR_SPACE_SCALAR_FLAT:
1429       return DImode;
1430     default:
1431       gcc_unreachable ();
1432     }
1433 }
1434 
1435 /* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1436 
1437    Return the appropriate mode for a named address space address.  */
1438 
1439 static scalar_int_mode
gcn_addr_space_address_mode (addr_space_t addrspace)
1441 {
1442   return gcn_addr_space_pointer_mode (addrspace);
1443 }
1444 
1445 /* Implement TARGET_ADDR_SPACE_SUBSET_P.
1446 
1447    Determine if one named address space is a subset of another.  */
1448 
1449 static bool
gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1451 {
1452   if (subset == superset)
1453     return true;
1454   /* FIXME is this true?  */
1455   if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1456     return true;
1457   return false;
1458 }
1459 
1460 /* Convert from one address space to another.  */
1461 
1462 static rtx
gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1464 {
1465   gcc_assert (POINTER_TYPE_P (from_type));
1466   gcc_assert (POINTER_TYPE_P (to_type));
1467 
1468   addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1469   addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1470 
1471   if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1472     {
1473       rtx queue = gen_rtx_REG (DImode,
1474 			       cfun->machine->args.reg[QUEUE_PTR_ARG]);
1475       rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1476 				     gen_rtx_PLUS (DImode, queue,
1477 						   gen_int_mode (64, SImode)));
1478       rtx tmp = gen_reg_rtx (DImode);
1479 
1480       emit_move_insn (gen_lowpart (SImode, tmp), op);
1481       emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1482 		      group_seg_aperture_hi);
1483 
1484       return tmp;
1485     }
1486   else if (as_from == as_to)
1487     return op;
1488   else
1489     gcc_unreachable ();
1490 }
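
/* For example, converting an LDS pointer P to a flat pointer above builds
   a 64-bit value whose low half is P and whose high half is the group
   segment aperture, loaded from the HSA queue structure at QUEUE_PTR+64;
   the resulting address falls inside the LDS window of the flat address
   space.  */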
1491 
1492 
1493 /* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
1494 
   Return true if REGNO is OK for memory addressing.  */
1496 
1497 bool
gcn_regno_mode_code_ok_for_base_p (int regno,
1499 				   machine_mode, addr_space_t as, int, int)
1500 {
1501   if (regno >= FIRST_PSEUDO_REGISTER)
1502     {
1503       if (reg_renumber)
1504 	regno = reg_renumber[regno];
1505       else
1506 	return true;
1507     }
1508   if (AS_FLAT_P (as))
1509     return (VGPR_REGNO_P (regno)
1510 	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1511   else if (AS_SCALAR_FLAT_P (as))
1512     return (SGPR_REGNO_P (regno)
1513 	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1514   else if (AS_GLOBAL_P (as))
1515     {
1516       return (SGPR_REGNO_P (regno)
1517 	      || VGPR_REGNO_P (regno)
1518 	      || regno == ARG_POINTER_REGNUM
1519 	      || regno == FRAME_POINTER_REGNUM);
1520     }
1521   else
1522     /* For now.  */
1523     return false;
1524 }
1525 
1526 /* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1527 
1528    Return a suitable register class for memory addressing.  */
1529 
1530 reg_class
gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1532 			      int ic)
1533 {
1534   switch (as)
1535     {
1536     case ADDR_SPACE_DEFAULT:
1537       return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1538     case ADDR_SPACE_SCALAR_FLAT:
1539     case ADDR_SPACE_SCRATCH:
1540       return SGPR_REGS;
1541       break;
1542     case ADDR_SPACE_FLAT:
1543     case ADDR_SPACE_FLAT_SCRATCH:
1544     case ADDR_SPACE_LDS:
1545     case ADDR_SPACE_GDS:
1546       return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1547 	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1548 	      ? SGPR_REGS : VGPR_REGS);
1549     case ADDR_SPACE_GLOBAL:
1550       return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1551 	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1552 	      ? SGPR_REGS : ALL_GPR_REGS);
1553     }
1554   gcc_unreachable ();
1555 }
1556 
1557 /* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
1558 
1559    Return true if REGNO is OK for index of memory addressing.  */
1560 
1561 bool
regno_ok_for_index_p (int regno)
1563 {
1564   if (regno >= FIRST_PSEUDO_REGISTER)
1565     {
1566       if (reg_renumber)
1567 	regno = reg_renumber[regno];
1568       else
1569 	return true;
1570     }
1571   return regno == M0_REG || VGPR_REGNO_P (regno);
1572 }
1573 
1574 /* Generate move which uses the exec flags.  If EXEC is NULL, then it is
1575    assumed that all lanes normally relevant to the mode of the move are
1576    affected.  If PREV is NULL, then a sensible default is supplied for
1577    the inactive lanes.  */
1578 
1579 static rtx
1580 gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
1581 {
1582   machine_mode mode = GET_MODE (op0);
1583 
1584   if (vgpr_vector_mode_p (mode))
1585     {
1586       if (exec && exec != CONSTM1_RTX (DImode))
1587 	{
1588 	  if (!prev)
1589 	    prev = op0;
1590 	}
1591       else
1592 	{
1593 	  if (!prev)
1594 	    prev = gcn_gen_undef (mode);
1595 	  exec = gcn_full_exec_reg ();
1596 	}
1597 
1598       rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
1599 
1600       return gen_rtx_PARALLEL (VOIDmode,
1601 	       gen_rtvec (2, set,
1602 			 gen_rtx_CLOBBER (VOIDmode,
1603 					  gen_rtx_SCRATCH (V64DImode))));
1604     }
1605 
1606   return (gen_rtx_PARALLEL
1607 	  (VOIDmode,
1608 	   gen_rtvec (2, gen_rtx_SET (op0, op1),
1609 		      gen_rtx_USE (VOIDmode,
1610 				   exec ? exec : gcn_scalar_exec ()))));
1611 }
1612 
1613 /* Generate masked move.  */
1614 
1615 static rtx
1616 gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
1617 {
1618   if (exec)
1619     return (gen_rtx_SET (op0,
1620 			 gen_rtx_VEC_MERGE (GET_MODE (op0),
1621 					    gen_rtx_VEC_DUPLICATE (GET_MODE
1622 								   (op0), op1),
1623 					    op2, exec)));
1624   else
1625     return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
1626 }
1627 
1628 /* Expand vector init of OP0 by VEC.
1629    Implements vec_init instruction pattern.  */
1630 
1631 void
gcn_expand_vector_init (rtx op0, rtx vec)
1633 {
1634   int64_t initialized_mask = 0;
1635   int64_t curr_mask = 1;
1636   machine_mode mode = GET_MODE (op0);
1637 
1638   rtx val = XVECEXP (vec, 0, 0);
1639 
1640   for (int i = 1; i < 64; i++)
1641     if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
1642       curr_mask |= (int64_t) 1 << i;
1643 
1644   if (gcn_constant_p (val))
1645     emit_move_insn (op0, gcn_vec_constant (mode, val));
1646   else
1647     {
1648       val = force_reg (GET_MODE_INNER (mode), val);
1649       emit_insn (gen_duplicate_load (op0, val));
1650     }
1651   initialized_mask |= curr_mask;
1652   for (int i = 1; i < 64; i++)
1653     if (!(initialized_mask & ((int64_t) 1 << i)))
1654       {
1655 	curr_mask = (int64_t) 1 << i;
1656 	rtx val = XVECEXP (vec, 0, i);
1657 
1658 	for (int j = i + 1; j < 64; j++)
1659 	  if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
1660 	    curr_mask |= (int64_t) 1 << j;
1661 	if (gcn_constant_p (val))
1662 	  emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
1663 					get_exec (curr_mask)));
1664 	else
1665 	  {
1666 	    val = force_reg (GET_MODE_INNER (mode), val);
1667 	    emit_insn (gen_duplicate_load (op0, val, op0,
1668 					   get_exec (curr_mask)));
1669 	  }
1670 	initialized_mask |= curr_mask;
1671       }
1672 }
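
/* For example, initializing a V64SImode vector whose lanes are
   {7, 7, x, 7, ..., 7} first broadcasts 7 to all 64 lanes and then
   rewrites lane 2 alone using a move with EXEC == (1 << 2); lanes holding
   equal values are always grouped into a single masked write.  */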
1673 
1674 /* Load vector constant where n-th lane contains BASE+n*VAL.  */
1675 
1676 static rtx
strided_constant (machine_mode mode, int base, int val)
1678 {
1679   rtx x = gen_reg_rtx (mode);
1680   emit_move_insn (x, gcn_vec_constant (mode, base));
1681   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1682 				 x, get_exec (0xffffffff00000000)));
1683   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1684 				 x, get_exec (0xffff0000ffff0000)));
1685   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1686 				 x, get_exec (0xff00ff00ff00ff00)));
1687   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1688 				 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1689   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1690 				 x, get_exec (0xcccccccccccccccc)));
1691   emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1692 				 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1693   return x;
1694 }
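
/* For example, strided_constant (V64SImode, 0, 1) produces
   {0, 1, 2, ..., 63} using one broadcast followed by six masked adds
   (+32 to the top half of the lanes, +16, +8, +4, +2, and +1 to the odd
   lanes), i.e. a log2(64)-step construction instead of 64 separate
   writes.  */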
1695 
1696 /* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS.  */
1697 
1698 static rtx
gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1700 				   addr_space_t as)
1701 {
1702   switch (as)
1703     {
1704     case ADDR_SPACE_DEFAULT:
1705       return gcn_addr_space_legitimize_address (x, old, mode,
1706 						DEFAULT_ADDR_SPACE);
1707     case ADDR_SPACE_SCALAR_FLAT:
1708     case ADDR_SPACE_SCRATCH:
1709       /* Instructions working on vectors need the address to be in
1710          a register.  */
1711       if (vgpr_vector_mode_p (mode))
1712 	return force_reg (GET_MODE (x), x);
1713 
1714       return x;
1715     case ADDR_SPACE_FLAT:
1716     case ADDR_SPACE_FLAT_SCRATCH:
1717     case ADDR_SPACE_GLOBAL:
1718       return TARGET_GCN3 ? force_reg (DImode, x) : x;
1719     case ADDR_SPACE_LDS:
1720     case ADDR_SPACE_GDS:
      /* FIXME: LDS supports offsets; handle them!  */
1722       if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1723 	{
1724 	  rtx addrs = gen_reg_rtx (V64SImode);
1725 	  rtx base = force_reg (SImode, x);
1726 	  rtx offsets = strided_constant (V64SImode, 0,
1727 					  GET_MODE_UNIT_SIZE (mode));
1728 
1729 	  emit_insn (gen_vec_duplicatev64si (addrs, base));
1730 	  emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1731 	  return addrs;
1732 	}
1733       return x;
1734     }
1735   gcc_unreachable ();
1736 }
1737 
1738 /* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1739    proper vector of stepped addresses.
1740 
1741    MEM will be a DImode address of a vector in an SGPR.
1742    TMP will be a V64DImode VGPR pair or (scratch:V64DI).  */
1743 
1744 rtx
gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1746 				     rtx tmp)
1747 {
1748   gcc_assert (MEM_P (mem));
1749   rtx mem_base = XEXP (mem, 0);
1750   rtx mem_index = NULL_RTX;
1751 
1752   if (!TARGET_GCN5_PLUS)
1753     {
1754       /* gcn_addr_space_legitimize_address should have put the address in a
1755          register.  If not, it is too late to do anything about it.  */
1756       gcc_assert (REG_P (mem_base));
1757     }
1758 
1759   if (GET_CODE (mem_base) == PLUS)
1760     {
1761       mem_index = XEXP (mem_base, 1);
1762       mem_base = XEXP (mem_base, 0);
1763     }
1764 
1765   /* RF and RM base registers for vector modes should always be an SGPR.  */
1766   gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1767 	      || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1768 
1769   machine_mode inner = GET_MODE_INNER (mode);
1770   int shift = exact_log2 (GET_MODE_SIZE (inner));
1771   rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1772   rtx undef_v64si = gcn_gen_undef (V64SImode);
1773   rtx new_base = NULL_RTX;
1774   addr_space_t as = MEM_ADDR_SPACE (mem);
1775 
1776   rtx tmplo = (REG_P (tmp)
1777 	       ? gcn_operand_part (V64DImode, tmp, 0)
1778 	       : gen_reg_rtx (V64SImode));
1779 
1780   /* tmplo[:] = ramp[:] << shift  */
1781   if (exec)
1782     emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1783 				    gen_int_mode (shift, SImode),
1784 				    undef_v64si, exec));
1785   else
1786     emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1787 
1788   if (AS_FLAT_P (as))
1789     {
1790       rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1791 
1792       if (REG_P (tmp))
1793 	{
1794 	  rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1795 	  rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1796 	  rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1797 
1798 	  /* tmphi[:] = mem_base_hi  */
1799 	  if (exec)
1800 	    emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1801 						    undef_v64si, exec));
1802 	  else
1803 	    emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1804 
1805 	  /* tmp[:] += zext (mem_base)  */
1806 	  if (exec)
1807 	    {
1808 	      emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1809 						     vcc, undef_v64si, exec));
1810 	      emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1811 					      vcc, vcc, undef_v64si, exec));
1812 	    }
1813 	  else
1814 	    emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
1815 	}
1816       else
1817 	{
1818 	  tmp = gen_reg_rtx (V64DImode);
1819 	  if (exec)
1820 	    emit_insn (gen_addv64di3_vcc_zext_dup2_exec
1821 		       (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
1822 			exec));
1823 	  else
1824 	    emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
1825 	}
1826 
1827       new_base = tmp;
1828     }
1829   else if (AS_ANY_DS_P (as))
1830     {
1831       if (!exec)
1832 	emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1833       else
1834         emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1835 					   gcn_gen_undef (V64SImode), exec));
1836       new_base = tmplo;
1837     }
1838   else
1839     {
1840       mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1841       new_base = gen_rtx_PLUS (V64DImode, mem_base,
1842 			       gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1843     }
1844 
1845   return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1846 		       gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1847 					      (mem_index ? mem_index
1848 					       : const0_rtx)));
1849 }
1850 
1851 /* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
1852    suitable for the given address space.  This is intended for use in
1853    gather/scatter patterns.
1854 
1855    The offsets may be signed or unsigned, according to UNSIGNED_P.
1856    If EXEC is set then _exec patterns will be used, otherwise plain.
1857 
1858    Return values.
1859      ADDR_SPACE_FLAT   - return V64DImode vector of absolute addresses.
1860      ADDR_SPACE_GLOBAL - return V64SImode vector of offsets.  */
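/* Roughly, the computation is

     tmpsi = OFFSETS * SCALE;        (reduced to a shift when SCALE is a
				      positive power of two)
     tmpdi = BASE + extend (tmpsi);  (zero- or sign-extended per UNSIGNED_P)

   FLAT addressing always needs the full 64-bit sum, whereas GLOBAL can
   use the 32-bit offsets directly provided they are unsigned.  */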
1861 
1862 rtx
1863 gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1864 			   bool unsigned_p, rtx exec)
1865 {
1866   rtx tmpsi = gen_reg_rtx (V64SImode);
1867   rtx tmpdi = gen_reg_rtx (V64DImode);
1868   rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1869   rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1870 
1871   if (CONST_INT_P (scale)
1872       && INTVAL (scale) > 0
1873       && exact_log2 (INTVAL (scale)) >= 0)
1874     emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1875 			       GEN_INT (exact_log2 (INTVAL (scale)))));
1876   else
1877     (exec
1878      ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1879 					  exec))
1880      : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1881 
1882   /* "Global" instructions do not support negative register offsets.  */
1883   if (as == ADDR_SPACE_FLAT || !unsigned_p)
1884     {
1885       if (unsigned_p)
1886 	(exec
1887 	 ?  emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1888 						    undefdi, exec))
1889 	 :  emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1890       else
1891 	(exec
1892 	 ?  emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1893 						     undefdi, exec))
1894 	 :  emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1895       return tmpdi;
1896     }
1897   else if (as == ADDR_SPACE_GLOBAL)
1898     return tmpsi;
1899 
1900   gcc_unreachable ();
1901 }
1902 
1903 /* Return true if a move from OP0 to OP1 is known to be executed in the
1904    vector unit.  */
1905 
1906 bool
1907 gcn_vgpr_move_p (rtx op0, rtx op1)
1908 {
1909   if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1910     return true;
1911   if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1912     return true;
1913   return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1914 	  || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1915 	  || vgpr_vector_mode_p (GET_MODE (op0)));
1916 }
1917 
1918 /* Return true if a move from OP0 to OP1 is known to be executed in the
1919    scalar unit.  Used in the machine description.  */
1920 
1921 bool
1922 gcn_sgpr_move_p (rtx op0, rtx op1)
1923 {
1924   if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1925     return true;
1926   if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1927     return true;
1928   if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1929       || VGPR_REGNO_P (REGNO (op0)))
1930     return false;
1931   if (REG_P (op1)
1932       && REGNO (op1) < FIRST_PSEUDO_REGISTER
1933       && !VGPR_REGNO_P (REGNO (op1)))
1934     return true;
1935   return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
1936 }
1937 
1938 /* Implement TARGET_SECONDARY_RELOAD.
1939 
1940    The address space determines which registers can be used for loads and
1941    stores.  */
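/* For instance, spilling a V64SImode pseudo to FLAT or GLOBAL memory goes
   through the reload_outv64si pattern (and reload_inv64si on the way
   back), while SCALAR_FLAT accesses demand an SGPR intermediate and the
   LDS/GDS/scratch spaces demand a VGPR.  */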
1942 
1943 static reg_class_t
1944 gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
1945 		      machine_mode reload_mode, secondary_reload_info *sri)
1946 {
1947   reg_class_t result = NO_REGS;
1948   bool spilled_pseudo =
1949     (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
1950 
1951   if (dump_file && (dump_flags & TDF_DETAILS))
1952     {
1953       fprintf (dump_file, "gcn_secondary_reload: ");
1954       dump_value_slim (dump_file, x, 1);
1955       fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
1956 	       reg_class_names[rclass], GET_MODE_NAME (reload_mode));
1957       if (REG_P (x) || GET_CODE (x) == SUBREG)
1958 	fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
1959 		 (true_regnum (x) >= 0
1960 		  && true_regnum (x) < FIRST_PSEUDO_REGISTER
1961 		  ? reg_names[true_regnum (x)]
1962 		  : (spilled_pseudo ? "stack spill" : "??")));
1963       fprintf (dump_file, "\n");
1964     }
1965 
1966   /* Some callers don't use or initialize icode.  */
1967   sri->icode = CODE_FOR_nothing;
1968 
1969   if (MEM_P (x) || spilled_pseudo)
1970     {
1971       addr_space_t as = DEFAULT_ADDR_SPACE;
1972 
1973       /* If we have a spilled pseudo, we can't find the address space
1974 	 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
1975 	 ADDR_SPACE_GLOBAL for GCN5.  */
1976       if (MEM_P (x))
1977 	as = MEM_ADDR_SPACE (x);
1978 
1979       if (as == ADDR_SPACE_DEFAULT)
1980 	as = DEFAULT_ADDR_SPACE;
1981 
1982       switch (as)
1983 	{
1984 	case ADDR_SPACE_SCALAR_FLAT:
1985 	  result =
1986 	    ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
1987 	  break;
1988 	case ADDR_SPACE_FLAT:
1989 	case ADDR_SPACE_FLAT_SCRATCH:
1990 	case ADDR_SPACE_GLOBAL:
1991 	  if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
1992 	      || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
1993 	    {
1994 	      if (in_p)
1995 		switch (reload_mode)
1996 		  {
1997 		  case E_V64SImode:
1998 		    sri->icode = CODE_FOR_reload_inv64si;
1999 		    break;
2000 		  case E_V64SFmode:
2001 		    sri->icode = CODE_FOR_reload_inv64sf;
2002 		    break;
2003 		  case E_V64HImode:
2004 		    sri->icode = CODE_FOR_reload_inv64hi;
2005 		    break;
2006 		  case E_V64HFmode:
2007 		    sri->icode = CODE_FOR_reload_inv64hf;
2008 		    break;
2009 		  case E_V64QImode:
2010 		    sri->icode = CODE_FOR_reload_inv64qi;
2011 		    break;
2012 		  case E_V64DImode:
2013 		    sri->icode = CODE_FOR_reload_inv64di;
2014 		    break;
2015 		  case E_V64DFmode:
2016 		    sri->icode = CODE_FOR_reload_inv64df;
2017 		    break;
2018 		  default:
2019 		    gcc_unreachable ();
2020 		  }
2021 	      else
2022 		switch (reload_mode)
2023 		  {
2024 		  case E_V64SImode:
2025 		    sri->icode = CODE_FOR_reload_outv64si;
2026 		    break;
2027 		  case E_V64SFmode:
2028 		    sri->icode = CODE_FOR_reload_outv64sf;
2029 		    break;
2030 		  case E_V64HImode:
2031 		    sri->icode = CODE_FOR_reload_outv64hi;
2032 		    break;
2033 		  case E_V64HFmode:
2034 		    sri->icode = CODE_FOR_reload_outv64hf;
2035 		    break;
2036 		  case E_V64QImode:
2037 		    sri->icode = CODE_FOR_reload_outv64qi;
2038 		    break;
2039 		  case E_V64DImode:
2040 		    sri->icode = CODE_FOR_reload_outv64di;
2041 		    break;
2042 		  case E_V64DFmode:
2043 		    sri->icode = CODE_FOR_reload_outv64df;
2044 		    break;
2045 		  default:
2046 		    gcc_unreachable ();
2047 		  }
2048 	      break;
2049 	    }
2050 	  /* Fallthrough.  */
2051 	case ADDR_SPACE_LDS:
2052 	case ADDR_SPACE_GDS:
2053 	case ADDR_SPACE_SCRATCH:
2054 	  result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2055 	  break;
2056 	}
2057     }
2058 
2059   if (dump_file && (dump_flags & TDF_DETAILS))
2060     fprintf (dump_file, "   <= %s (icode: %s)\n", reg_class_names[result],
2061 	     get_insn_name (sri->icode));
2062 
2063   return result;
2064 }
2065 
2066 /* Update register usage after having seen the compiler flags and kernel
2067    attributes.  We typically want to fix registers that contain values
2068    set by the HSA runtime.  */
2069 
2070 static void
2071 gcn_conditional_register_usage (void)
2072 {
2073   if (!cfun || !cfun->machine)
2074     return;
2075 
2076   if (cfun->machine->normal_function)
2077     {
2078       /* Restrict the set of SGPRs and VGPRs used by non-kernel functions.  */
2079       for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT - 2);
2080 	   i <= LAST_SGPR_REG; i++)
2081 	fixed_regs[i] = 1, call_used_regs[i] = 1;
2082 
2083       for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
2084 	   i <= LAST_VGPR_REG; i++)
2085 	fixed_regs[i] = 1, call_used_regs[i] = 1;
2086 
2087       return;
2088     }
2089 
2090   /* If the set of requested args is the default set, nothing more needs to
2091      be done.  */
2092   if (cfun->machine->args.requested == default_requested_args)
2093     return;
2094 
2095   /* Requesting a set of args different from the default violates the ABI.  */
2096   if (!leaf_function_p ())
2097     warning (0, "A non-default set of initial values has been requested, "
2098 		"which violates the ABI!");
2099 
2100   for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
2101     fixed_regs[i] = 0;
2102 
2103   /* Fix the runtime argument registers containing values that may be
2104      needed later.  DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2105      needed after the prologue so there's no need to fix them.  */
2106   if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2107     fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2108   if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2109     {
2110       /* The upper 32-bits of the 64-bit descriptor are not used, so allow
2111 	the containing registers to be used for other purposes.  */
2112       fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2113       fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2114     }
2115   if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2116     {
2117       fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2118       fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2119     }
2120   if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2121     {
2122       fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2123       fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2124     }
2125   if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2126     fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2127   if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2128     fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2129   if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2130     fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2131   if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2132     fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2133 
2134   if (TARGET_GCN5_PLUS)
2135     /* v0 is always zero, for global null offsets.  */
2136     fixed_regs[VGPR_REGNO (0)] = 1;
2137 }
2138 
2139 /* Determine if a load or store is valid, according to the register classes
2140    and address space.  Used primarily by the machine description to decide
2141    when to split a move into two steps.  */
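/* Broadly: FLAT, GLOBAL, LDS and GDS accesses must pair a legitimate
   address for that space with a VGPR operand, while SCALAR_FLAT accesses
   must pair with an SGPR operand; anything else is rejected so that the
   move gets split.  */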
2142 
2143 bool
2144 gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2145 {
2146   if (!MEM_P (dest) && !MEM_P (src))
2147     return true;
2148 
2149   if (MEM_P (dest)
2150       && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2151       && (gcn_flat_address_p (XEXP (dest, 0), mode)
2152 	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2153 	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2154       && gcn_vgpr_register_operand (src, mode))
2155     return true;
2156   else if (MEM_P (src)
2157 	   && AS_FLAT_P (MEM_ADDR_SPACE (src))
2158 	   && (gcn_flat_address_p (XEXP (src, 0), mode)
2159 	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2160 	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2161 	   && gcn_vgpr_register_operand (dest, mode))
2162     return true;
2163 
2164   if (MEM_P (dest)
2165       && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2166       && (gcn_global_address_p (XEXP (dest, 0))
2167 	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2168 	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2169       && gcn_vgpr_register_operand (src, mode))
2170     return true;
2171   else if (MEM_P (src)
2172 	   && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2173 	   && (gcn_global_address_p (XEXP (src, 0))
2174 	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2175 	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2176 	   && gcn_vgpr_register_operand (dest, mode))
2177     return true;
2178 
2179   if (MEM_P (dest)
2180       && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2181       && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2182 	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2183 	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2184       && gcn_ssrc_register_operand (src, mode))
2185     return true;
2186   else if (MEM_P (src)
2187 	   && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2188 	   && (gcn_scalar_flat_address_p (XEXP (src, 0))
2189 	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2190 	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2191 	   && gcn_sdst_register_operand (dest, mode))
2192     return true;
2193 
2194   if (MEM_P (dest)
2195       && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2196       && gcn_ds_address_p (XEXP (dest, 0))
2197       && gcn_vgpr_register_operand (src, mode))
2198     return true;
2199   else if (MEM_P (src)
2200 	   && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2201 	   && gcn_ds_address_p (XEXP (src, 0))
2202 	   && gcn_vgpr_register_operand (dest, mode))
2203     return true;
2204 
2205   return false;
2206 }
2207 
2208 /* }}}  */
2209 /* {{{ Functions and ABI.  */
2210 
2211 /* Implement TARGET_FUNCTION_VALUE.
2212 
2213    Define how to find the value returned by a function.
2214    The register location is always the same, but the mode depends on
2215    VALTYPE.  */
2216 
2217 static rtx
2218 gcn_function_value (const_tree valtype, const_tree, bool)
2219 {
2220   machine_mode mode = TYPE_MODE (valtype);
2221 
2222   if (INTEGRAL_TYPE_P (valtype)
2223       && GET_MODE_CLASS (mode) == MODE_INT
2224       && GET_MODE_SIZE (mode) < 4)
2225     mode = SImode;
2226 
2227   return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
2228 }
2229 
2230 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2231 
2232    Return true if N is a possible register number for the function return
2233    value.  */
2234 
2235 static bool
2236 gcn_function_value_regno_p (const unsigned int n)
2237 {
2238   return n == RETURN_VALUE_REG;
2239 }
2240 
2241 /* Calculate the number of registers required to hold function argument
2242    ARG.  */
2243 
2244 static int
2245 num_arg_regs (const function_arg_info &arg)
2246 {
2247   if (targetm.calls.must_pass_in_stack (arg))
2248     return 0;
2249 
2250   int size = arg.promoted_size_in_bytes ();
2251   return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
2252 }
2253 
2254 /* Implement TARGET_STRICT_ARGUMENT_NAMING.
2255 
2256    Return true if the location where a function argument is passed
2257    depends on whether or not it is a named argument.
2258 
2259    For gcn, we know how to handle functions declared as stdarg: by
2260    passing an extra pointer to the unnamed arguments.  However, the
2261    Fortran frontend can produce a different situation, where a
2262    function pointer is declared with no arguments, but the actual
2263    function and calls to it take more arguments.  In that case, we
2264    want to ensure the call matches the definition of the function.  */
2265 
2266 static bool
2267 gcn_strict_argument_naming (cumulative_args_t cum_v)
2268 {
2269   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2270 
2271   return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2272 }
2273 
2274 /* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2275 
2276    See comment on gcn_strict_argument_naming.  */
2277 
2278 static bool
2279 gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2280 {
2281   return !gcn_strict_argument_naming (cum_v);
2282 }
2283 
2284 /* Implement TARGET_FUNCTION_ARG.
2285 
2286    Return an RTX indicating whether a function argument is passed in a register
2287    and if so, which register.  */
2288 
2289 static rtx
2290 gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
2291 {
2292   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2293   if (cum->normal_function)
2294     {
2295       if (!arg.named || arg.end_marker_p ())
2296 	return 0;
2297 
2298       if (targetm.calls.must_pass_in_stack (arg))
2299 	return 0;
2300 
2301       int reg_num = FIRST_PARM_REG + cum->num;
2302       int num_regs = num_arg_regs (arg);
2303       if (num_regs > 0)
2304 	while (reg_num % num_regs != 0)
2305 	  reg_num++;
2306       if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
2307 	return gen_rtx_REG (arg.mode, reg_num);
2308     }
2309   else
2310     {
2311       if (cum->num >= cum->args.nargs)
2312 	{
2313 	  cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2314 	    & -(TYPE_ALIGN (arg.type) / 8);
2315 	  cfun->machine->kernarg_segment_alignment
2316 	    = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
2317 		   TYPE_ALIGN (arg.type) / 8);
2318 	  rtx addr = gen_rtx_REG (DImode,
2319 				  cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2320 	  if (cum->offset)
2321 	    addr = gen_rtx_PLUS (DImode, addr,
2322 				 gen_int_mode (cum->offset, DImode));
2323 	  rtx mem = gen_rtx_MEM (arg.mode, addr);
2324 	  set_mem_attributes (mem, arg.type, 1);
2325 	  set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2326 	  MEM_READONLY_P (mem) = 1;
2327 	  return mem;
2328 	}
2329 
2330       int a = cum->args.order[cum->num];
2331       if (arg.mode != gcn_kernel_arg_types[a].mode)
2332 	{
2333 	  error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2334 	  return 0;
2335 	}
2336       return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2337 			  cum->args.reg[a]);
2338     }
2339   return 0;
2340 }
2341 
2342 /* Implement TARGET_FUNCTION_ARG_ADVANCE.
2343 
2344    Updates the summarizer variable pointed to by CUM_V to advance past an
2345    argument in the argument list.  */
2346 
2347 static void
2348 gcn_function_arg_advance (cumulative_args_t cum_v,
2349 			  const function_arg_info &arg)
2350 {
2351   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2352 
2353   if (cum->normal_function)
2354     {
2355       if (!arg.named)
2356 	return;
2357 
2358       int num_regs = num_arg_regs (arg);
2359       if (num_regs > 0)
2360 	while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
2361 	  cum->num++;
2362       cum->num += num_regs;
2363     }
2364   else
2365     {
2366       if (cum->num < cum->args.nargs)
2367 	cum->num++;
2368       else
2369 	{
2370 	  cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
2371 	  cfun->machine->kernarg_segment_byte_size = cum->offset;
2372 	}
2373     }
2374 }
2375 
2376 /* Implement TARGET_ARG_PARTIAL_BYTES.
2377 
2378    Returns the number of bytes at the beginning of an argument that must be put
2379    in registers.  The value must be zero for arguments that are passed entirely
2380    in registers or that are entirely pushed on the stack.  */
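/* For example, if only two parameter registers remain free and ARG needs
   three, the first 2 * UNITS_PER_WORD bytes are passed in registers and
   the remainder goes on the stack.  */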
2381 
2382 static int
2383 gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
2384 {
2385   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2386 
2387   if (!arg.named)
2388     return 0;
2389 
2390   if (targetm.calls.must_pass_in_stack (arg))
2391     return 0;
2392 
2393   if (cum->num >= NUM_PARM_REGS)
2394     return 0;
2395 
2396   /* If the argument fits entirely in registers, return 0.  */
2397   if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
2398     return 0;
2399 
2400   return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
2401 }
2402 
2403 /* A normal function which takes a pointer argument (to a scalar) may be
2404    passed a pointer to LDS space (via a high-bits-set aperture), and that only
2405    works with FLAT addressing, not GLOBAL.  Force FLAT addressing if the
2406    function has an incoming pointer-to-scalar parameter.  */
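/* For example, a kernel may pass the address of a variable it has placed
   in LDS; the callee then receives an aperture pointer that only FLAT
   loads and stores can dereference, so GLOBAL addressing cannot be
   assumed for its pointer parameters.  */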
2407 
2408 static void
2409 gcn_detect_incoming_pointer_arg (tree fndecl)
2410 {
2411   gcc_assert (cfun && cfun->machine);
2412 
2413   for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2414        arg;
2415        arg = TREE_CHAIN (arg))
2416     if (POINTER_TYPE_P (TREE_VALUE (arg))
2417 	&& !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg))))
2418       cfun->machine->use_flat_addressing = true;
2419 }
2420 
2421 /* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2422 
2423    Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2424    whose data type is FNTYPE.  For a library call, FNTYPE is 0.  */
2425 
2426 void
2427 gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2428 			  tree fntype /* tree ptr for function decl */ ,
2429 			  rtx libname /* SYMBOL_REF of library name or 0 */ ,
2430 			  tree fndecl, int caller)
2431 {
2432   memset (cum, 0, sizeof (*cum));
2433   cum->fntype = fntype;
2434   if (libname)
2435     {
2436       gcc_assert (cfun && cfun->machine);
2437       cum->normal_function = true;
2438       if (!caller)
2439 	{
2440 	  cfun->machine->normal_function = true;
2441 	  gcn_detect_incoming_pointer_arg (fndecl);
2442 	}
2443       return;
2444     }
2445   tree attr = NULL;
2446   if (fndecl)
2447     attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2448   if (fndecl && !attr)
2449     attr = lookup_attribute ("amdgpu_hsa_kernel",
2450 			     TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2451   if (!attr && fntype)
2452     attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2453   /* Handle main () as a kernel, so we can run the testsuite.
2454      Handle OpenACC kernels similarly to main.  */
2455   if (!attr && !caller && fndecl
2456       && (MAIN_NAME_P (DECL_NAME (fndecl))
2457 	  || lookup_attribute ("omp target entrypoint",
2458 			       DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2459     gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2460   else
2461     {
2462       if (!attr || caller)
2463 	{
2464 	  gcc_assert (cfun && cfun->machine);
2465 	  cum->normal_function = true;
2466 	  if (!caller)
2467 	    cfun->machine->normal_function = true;
2468 	}
2469       gcn_parse_amdgpu_hsa_kernel_attribute
2470 	(&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2471     }
2472   cfun->machine->args = cum->args;
2473   if (!caller && cfun->machine->normal_function)
2474     gcn_detect_incoming_pointer_arg (fndecl);
2475 
2476   reinit_regs ();
2477 }
2478 
2479 static bool
2480 gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2481 {
2482   machine_mode mode = TYPE_MODE (type);
2483   HOST_WIDE_INT size = int_size_in_bytes (type);
2484 
2485   if (AGGREGATE_TYPE_P (type))
2486     return true;
2487 
2488   if (mode == BLKmode)
2489     return true;
2490 
2491   if (size > 2 * UNITS_PER_WORD)
2492     return true;
2493 
2494   return false;
2495 }
2496 
2497 /* Implement TARGET_PROMOTE_FUNCTION_MODE.
2498 
2499    Return the mode to use for outgoing function arguments.  */
2500 
2501 machine_mode
2502 gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2503 			   int *ARG_UNUSED (punsignedp),
2504 			   const_tree ARG_UNUSED (funtype),
2505 			   int ARG_UNUSED (for_return))
2506 {
2507   if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2508     return SImode;
2509 
2510   return mode;
2511 }
2512 
2513 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2514 
2515    Derived from hppa_gimplify_va_arg_expr.  The generic routine doesn't handle
2516    ARGS_GROW_DOWNWARDS.  */
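/* In effect each va_arg fetch performs

     valist = (valist - sizeof (TYPE)) & -8;
     result = *(TYPE *) valist;

   that is, the pointer is pre-decremented and aligned down to an 8-byte
   boundary (with an extra indirection for by-reference arguments).  */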
2517 
2518 static tree
2519 gcn_gimplify_va_arg_expr (tree valist, tree type,
2520 			  gimple_seq *ARG_UNUSED (pre_p),
2521 			  gimple_seq *ARG_UNUSED (post_p))
2522 {
2523   tree ptr = build_pointer_type (type);
2524   tree valist_type;
2525   tree t, u;
2526   bool indirect;
2527 
2528   indirect = pass_va_arg_by_reference (type);
2529   if (indirect)
2530     {
2531       type = ptr;
2532       ptr = build_pointer_type (type);
2533     }
2534   valist_type = TREE_TYPE (valist);
2535 
2536   /* Args grow down.  Not handled by generic routines.  */
2537 
2538   u = fold_convert (sizetype, size_in_bytes (type));
2539   u = fold_build1 (NEGATE_EXPR, sizetype, u);
2540   t = fold_build_pointer_plus (valist, u);
2541 
2542   /* Align to 8 byte boundary.  */
2543 
2544   u = build_int_cst (TREE_TYPE (t), -8);
2545   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
2546   t = fold_convert (valist_type, t);
2547 
2548   t = build2 (MODIFY_EXPR, valist_type, valist, t);
2549 
2550   t = fold_convert (ptr, t);
2551   t = build_va_arg_indirect_ref (t);
2552 
2553   if (indirect)
2554     t = build_va_arg_indirect_ref (t);
2555 
2556   return t;
2557 }
2558 
2559 /* Return 1 if TRAIT NAME is present in the OpenMP context's
2560    device trait set, return 0 if not present in any OpenMP context in the
2561    whole translation unit, or -1 if not present in the current OpenMP context
2562    but might be present in another OpenMP context in the same TU.  */
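/* This answers context-selector queries such as device={kind(gpu)},
   device={arch(gcn)} or device={isa(gfx900)} arising from OpenMP
   declare variant and metadirective resolution.  */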
2563 
2564 int
2565 gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
2566 			      const char *name)
2567 {
2568   switch (trait)
2569     {
2570     case omp_device_kind:
2571       return strcmp (name, "gpu") == 0;
2572     case omp_device_arch:
2573       return strcmp (name, "gcn") == 0;
2574     case omp_device_isa:
2575       if (strcmp (name, "fiji") == 0)
2576 	return gcn_arch == PROCESSOR_FIJI;
2577       if (strcmp (name, "gfx900") == 0)
2578 	return gcn_arch == PROCESSOR_VEGA;
2579       if (strcmp (name, "gfx906") == 0)
2580 	return gcn_arch == PROCESSOR_VEGA;
2581       return 0;
2582     default:
2583       gcc_unreachable ();
2584     }
2585 }
2586 
2587 /* Calculate stack offsets needed to create prologues and epilogues.  */
2588 
2589 static struct machine_function *
2590 gcn_compute_frame_offsets (void)
2591 {
2592   machine_function *offsets = cfun->machine;
2593 
2594   if (reload_completed)
2595     return offsets;
2596 
2597   offsets->need_frame_pointer = frame_pointer_needed;
2598 
2599   offsets->outgoing_args_size = crtl->outgoing_args_size;
2600   offsets->pretend_size = crtl->args.pretend_args_size;
2601 
2602   offsets->local_vars = get_frame_size ();
2603 
2604   offsets->lr_needs_saving = (!leaf_function_p ()
2605 			      || df_regs_ever_live_p (LR_REGNUM)
2606 			      || df_regs_ever_live_p (LR_REGNUM + 1));
2607 
2608   offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2609 
2610   for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2611     if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2612 	|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2613 	    && frame_pointer_needed))
2614       offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2615 
2616   /* Round up to 64-bit boundary to maintain stack alignment.  */
2617   offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
2618 
2619   return offsets;
2620 }
2621 
2622 /* Insert code into the prologue or epilogue to store or load any
2623    callee-save register to/from the stack.
2624 
2625    Helper function for gcn_expand_prologue and gcn_expand_epilogue.  */
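/* The live scalar registers are first packed, one per lane, into v6 (and
   v7 once there are more than 64 of them); the vector registers are then
   moved with scatter/gather instructions through a vector stack pointer
   whose lane N holds SP + N * 4.  */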
2626 
2627 static void
2628 move_callee_saved_registers (rtx sp, machine_function *offsets,
2629 			     bool prologue)
2630 {
2631   int regno, offset, saved_scalars;
2632   rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2633   rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2634   rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2635   rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2636   HOST_WIDE_INT exec_set = 0;
2637   int offreg_set = 0;
2638 
2639   start_sequence ();
2640 
2641   /* Move scalars into two vector registers.  */
2642   for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
2643     if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2644 	|| ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2645 	|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2646 	    && offsets->need_frame_pointer))
2647       {
2648 	rtx reg = gen_rtx_REG (SImode, regno);
2649 	rtx vreg = gen_rtx_REG (V64SImode,
2650 				VGPR_REGNO (6 + (saved_scalars / 64)));
2651 	int lane = saved_scalars % 64;
2652 
2653 	if (prologue)
2654 	  emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2655 	else
2656 	  emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2657 
2658 	saved_scalars++;
2659       }
2660 
2661   rtx move_scalars = get_insns ();
2662   end_sequence ();
2663   start_sequence ();
2664 
2665   /* Ensure that all vector lanes are moved.  */
2666   exec_set = -1;
2667   emit_move_insn (exec, GEN_INT (exec_set));
2668 
2669   /* Set up a vector stack pointer.  */
2670   rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2671   rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2672   emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2673 				  gcn_gen_undef (V64SImode), exec));
2674   rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2675   emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2676 					  exec));
2677   emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2678 				     gcn_operand_part (V64SImode, vsp, 0),
2679 				     _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2680 				     exec));
2681   emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2682 				  gcn_operand_part (V64SImode, vsp, 1),
2683 				  const0_rtx, vcc, vcc,
2684 				  gcn_gen_undef (V64SImode), exec));
2685 
2686   /* Move vectors.  */
2687   for (regno = FIRST_VGPR_REG, offset = offsets->pretend_size;
2688        regno < FIRST_PSEUDO_REGISTER; regno++)
2689     if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2690 	|| (regno == VGPR_REGNO (6) && saved_scalars > 0)
2691 	|| (regno == VGPR_REGNO (7) && saved_scalars > 63))
2692       {
2693 	rtx reg = gen_rtx_REG (V64SImode, regno);
2694 	int size = 256;
2695 
2696 	if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2697 	  size = saved_scalars * 4;
2698 	else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2699 	  size = (saved_scalars - 64) * 4;
2700 
2701 	if (size != 256 || exec_set != -1)
2702 	  {
2703 	    exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2704 	    emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2705 	  }
2706 
2707 	if (prologue)
2708 	  emit_insn (gen_scatterv64si_insn_1offset_exec (vsp, const0_rtx, reg,
2709 							 as, const0_rtx, exec));
2710 	else
2711 	  emit_insn (gen_gatherv64si_insn_1offset_exec
2712 		     (reg, vsp, const0_rtx, as, const0_rtx,
2713 		      gcn_gen_undef (V64SImode), exec));
2714 
2715 	/* Move our VSP to the next stack entry.  */
2716 	if (offreg_set != size)
2717 	  {
2718 	    offreg_set = size;
2719 	    emit_move_insn (offreg, GEN_INT (size));
2720 	  }
2721 	if (exec_set != -1)
2722 	  {
2723 	    exec_set = -1;
2724 	    emit_move_insn (exec, GEN_INT (exec_set));
2725 	  }
2726 	emit_insn (gen_addv64si3_vcc_dup_exec
2727 		   (gcn_operand_part (V64SImode, vsp, 0),
2728 		    offreg, gcn_operand_part (V64SImode, vsp, 0),
2729 		    vcc, gcn_gen_undef (V64SImode), exec));
2730 	emit_insn (gen_addcv64si3_exec
2731 		   (gcn_operand_part (V64SImode, vsp, 1),
2732 		    gcn_operand_part (V64SImode, vsp, 1),
2733 		    const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2734 
2735 	offset += size;
2736       }
2737 
2738   rtx move_vectors = get_insns ();
2739   end_sequence ();
2740 
2741   if (prologue)
2742     {
2743       emit_insn (move_scalars);
2744       emit_insn (move_vectors);
2745     }
2746   else
2747     {
2748       emit_insn (move_vectors);
2749       emit_insn (move_scalars);
2750     }
2751 }
2752 
2753 /* Generate prologue.  Called from gen_prologue during pro_and_epilogue pass.
2754 
2755    For a non-kernel function, the stack layout looks like this (interim),
2756    growing *upwards*:
2757 
2758  hi | + ...
2759     |__________________| <-- current SP
2760     | outgoing args    |
2761     |__________________|
2762     | (alloca space)   |
2763     |__________________|
2764     | local vars       |
2765     |__________________| <-- FP/hard FP
2766     | callee-save regs |
2767     |__________________| <-- soft arg pointer
2768     | pretend args     |
2769     |__________________| <-- incoming SP
2770     | incoming args    |
2771  lo |..................|
2772 
2773    This implies arguments (beyond the first N in registers) must grow
2774    downwards (as, apparently, PA has them do).
2775 
2776    For a kernel function we have the simpler:
2777 
2778  hi | + ...
2779     |__________________| <-- current SP
2780     | outgoing args    |
2781     |__________________|
2782     | (alloca space)   |
2783     |__________________|
2784     | local vars       |
2785  lo |__________________| <-- FP/hard FP
2786 
2787 */
2788 
2789 void
2790 gcn_expand_prologue ()
2791 {
2792   machine_function *offsets = gcn_compute_frame_offsets ();
2793 
2794   if (!cfun || !cfun->machine || cfun->machine->normal_function)
2795     {
2796       rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2797       rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2798 
2799       start_sequence ();
2800 
2801       if (offsets->pretend_size > 0)
2802 	{
2803 	  /* FIXME: Do the actual saving of register pretend args to the stack.
2804 	     Register order needs consideration.  */
2805 	}
2806 
2807       /* Save callee-save regs.  */
2808       move_callee_saved_registers (sp, offsets, true);
2809 
2810       HOST_WIDE_INT sp_adjust = offsets->pretend_size
2811 	+ offsets->callee_saves
2812 	+ offsets->local_vars + offsets->outgoing_args_size;
2813       if (sp_adjust > 0)
2814 	emit_insn (gen_adddi3_scc (sp, sp, gen_int_mode (sp_adjust, DImode)));
2815 
2816       if (offsets->need_frame_pointer)
2817 	emit_insn (gen_adddi3_scc (fp, sp,
2818 				   gen_int_mode
2819 				   (-(offsets->local_vars +
2820 				      offsets->outgoing_args_size),
2821 				    DImode)));
2822 
2823       rtx_insn *seq = get_insns ();
2824       end_sequence ();
2825 
2826       /* FIXME: Prologue insns should have this flag set for debug output, etc.
2827 	 but it causes issues for now.
2828       for (insn = seq; insn; insn = NEXT_INSN (insn))
2829         if (INSN_P (insn))
2830 	  RTX_FRAME_RELATED_P (insn) = 1;*/
2831 
2832       emit_insn (seq);
2833     }
2834   else
2835     {
2836       rtx wave_offset = gen_rtx_REG (SImode,
2837 				     cfun->machine->args.
2838 				     reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
2839 
2840       if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
2841 	{
2842 	  rtx fs_init_lo =
2843 	    gen_rtx_REG (SImode,
2844 			 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
2845 	  rtx fs_init_hi =
2846 	    gen_rtx_REG (SImode,
2847 			 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
2848 	  rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
2849 	  rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
2850 
2851 	  /*rtx queue = gen_rtx_REG(DImode,
2852 				  cfun->machine->args.reg[QUEUE_PTR_ARG]);
2853 	  rtx aperture = gen_rtx_MEM (SImode,
2854 				      gen_rtx_PLUS (DImode, queue,
2855 						    gen_int_mode (68, SImode)));
2856 	  set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
2857 
2858 	  /* Set up flat_scratch.  */
2859 	  emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
2860 	  emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
2861 				      gen_int_mode (8, SImode)));
2862 	  emit_move_insn (fs_reg_lo, fs_init_hi);
2863 	}
2864 
2865       /* Set up frame pointer and stack pointer.  */
2866       rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
2867       rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
2868       rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
2869       rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
2870 
2871       HOST_WIDE_INT sp_adjust = (offsets->local_vars
2872 				 + offsets->outgoing_args_size);
2873 
2874       /* Initialise FP and SP from the buffer descriptor in s[0:3].  */
2875       emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
2876       emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
2877 				 gen_int_mode (0xffff, SImode)));
2878       rtx scc = gen_rtx_REG (BImode, SCC_REG);
2879       emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
2880       emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
2881 
2882       if (sp_adjust > 0)
2883 	emit_insn (gen_adddi3_scc (sp, fp, gen_int_mode (sp_adjust, DImode)));
2884       else
2885 	emit_move_insn (sp, fp);
2886 
2887       /* Make sure the flat scratch reg doesn't get optimised away.  */
2888       emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
2889     }
2890 
2891   /* Ensure that the scheduler doesn't do anything unexpected.  */
2892   emit_insn (gen_blockage ());
2893 
2894   /* m0 is initialized for the usual LDS DS and FLAT memory case.
2895      The low-part is the address of the topmost addressable byte, which is
2896      size-1.  The high-part is an offset and should be zero.  */
2897   emit_move_insn (gen_rtx_REG (SImode, M0_REG),
2898 		  gen_int_mode (LDS_SIZE-1, SImode));
2899 
2900   emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
2901 
2902   if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
2903     {
2904       /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel.  */
2905       rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
2906       emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
2907 						  "gomp_gcn_enter_kernel"));
2908       emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
2909     }
2910 }
2911 
2912 /* Generate epilogue.  Called from gen_epilogue during pro_and_epilogue pass.
2913 
2914    See gcn_expand_prologue for stack details.  */
2915 
2916 void
2917 gcn_expand_epilogue (void)
2918 {
2919   /* Ensure that the scheduler doesn't do anything unexpected.  */
2920   emit_insn (gen_blockage ());
2921 
2922   if (!cfun || !cfun->machine || cfun->machine->normal_function)
2923     {
2924       machine_function *offsets = gcn_compute_frame_offsets ();
2925       rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2926       rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2927 
2928       HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
2929 
2930       if (offsets->need_frame_pointer)
2931 	{
2932 	  /* Restore old SP from the frame pointer.  */
2933 	  if (sp_adjust > 0)
2934 	    emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
2935 	  else
2936 	    emit_move_insn (sp, fp);
2937 	}
2938       else
2939 	{
2940 	  /* Restore old SP from current SP.  */
2941 	  sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
2942 
2943 	  if (sp_adjust > 0)
2944 	    emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
2945 	}
2946 
2947       move_callee_saved_registers (sp, offsets, false);
2948 
2949       /* There's no explicit use of the link register on the return insn.  Emit
2950          one here instead.  */
2951       if (offsets->lr_needs_saving)
2952 	emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
2953 
2954       /* Similar for frame pointer.  */
2955       if (offsets->need_frame_pointer)
2956 	emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
2957     }
2958   else if (flag_openmp)
2959     {
2960       /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel.  */
2961       rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
2962       emit_move_insn (fn_reg,
2963 		      gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
2964       emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
2965     }
2966   else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
2967     {
2968       /* Assume that an exit value compatible with gcn-run is expected.
2969          That is, the third input parameter is an int*.
2970 
2971          We can't allocate any new registers, but the kernarg_reg is
2972          dead after this, so we'll use that.  */
2973       rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
2974 				     [KERNARG_SEGMENT_PTR_ARG]);
2975       rtx retptr_mem = gen_rtx_MEM (DImode,
2976 				    gen_rtx_PLUS (DImode, kernarg_reg,
2977 						  GEN_INT (16)));
2978       set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
2979       emit_move_insn (kernarg_reg, retptr_mem);
2980 
2981       rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
2982       set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
2983       emit_move_insn (retval_mem,
2984 		      gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
2985     }
2986 
2987   emit_jump_insn (gen_gcn_return ());
2988 }
2989 
2990 /* Implement TARGET_CAN_ELIMINATE.
2991 
2992    Return true if the compiler is allowed to try to replace register number
2993    FROM_REG with register number TO_REG.
2994 
2995    FIXME: is the default "true" not enough? Should this be a negative set?  */
2996 
2997 bool
2998 gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
2999 {
3000   return (to_reg == HARD_FRAME_POINTER_REGNUM
3001 	  || to_reg == STACK_POINTER_REGNUM);
3002 }
3003 
3004 /* Implement INITIAL_ELIMINATION_OFFSET.
3005 
3006    Returns the initial difference between the specified pair of registers, in
3007    terms of stack position.  */
3008 
3009 HOST_WIDE_INT
3010 gcn_initial_elimination_offset (int from, int to)
3011 {
3012   machine_function *offsets = gcn_compute_frame_offsets ();
3013 
3014   switch (from)
3015     {
3016     case ARG_POINTER_REGNUM:
3017       if (to == STACK_POINTER_REGNUM)
3018 	return -(offsets->callee_saves + offsets->local_vars
3019 		 + offsets->outgoing_args_size);
3020       else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
3021 	return -offsets->callee_saves;
3022       else
3023 	gcc_unreachable ();
3024       break;
3025 
3026     case FRAME_POINTER_REGNUM:
3027       if (to == STACK_POINTER_REGNUM)
3028 	return -(offsets->local_vars + offsets->outgoing_args_size);
3029       else if (to == HARD_FRAME_POINTER_REGNUM)
3030 	return 0;
3031       else
3032 	gcc_unreachable ();
3033       break;
3034 
3035     default:
3036       gcc_unreachable ();
3037     }
3038 }
3039 
3040 /* Implement HARD_REGNO_RENAME_OK.
3041 
3042    Return true if it is permissible to rename a hard register from
3043    FROM_REG to TO_REG.  */
3044 
3045 bool
3046 gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3047 {
3048   if (from_reg == SCC_REG
3049       || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3050       || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3051       || to_reg == SCC_REG
3052       || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3053       || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3054     return false;
3055 
3056   /* Allow the link register to be used if it was saved.  */
3057   if ((to_reg & ~1) == LINK_REGNUM)
3058     return !cfun || cfun->machine->lr_needs_saving;
3059 
3060   /* Allow the registers used for the static chain to be used if the chain is
3061      not in active use.  */
3062   if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3063     return !cfun
3064 	|| !(cfun->static_chain_decl
3065 	     && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3066 	     && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3067 
3068   return true;
3069 }
3070 
3071 /* Implement HARD_REGNO_CALLER_SAVE_MODE.
3072 
3073    Which mode is required for saving NREGS of a pseudo-register in
3074    call-clobbered hard register REGNO.  */
3075 
3076 machine_mode
3077 gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3078 				 machine_mode regmode)
3079 {
3080   machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
3081 
3082   if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3083     result = (nregs == 1 ? SImode : DImode);
3084 
3085   return result;
3086 }
3087 
3088 /* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3089 
3090    Output assembler code for a block containing the constant parts
3091    of a trampoline, leaving space for the variable parts.  */
3092 
3093 static void
3094 gcn_asm_trampoline_template (FILE *f)
3095 {
3096   /* The source operand of the move instructions must be a 32-bit
3097      constant following the opcode.  */
3098   asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3099   asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3100   asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3101   asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3102   asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3103 }
3104 
3105 /* Implement TARGET_TRAMPOLINE_INIT.
3106 
3107    Emit RTL insns to initialize the variable parts of a trampoline.
3108    FNDECL is the decl of the target address, M_TRAMP is a MEM for
3109    the trampoline, and CHAIN_VALUE is an RTX for the static chain
3110    to be passed to the target function.  */
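/* The template above leaves a 32-bit literal slot after each of the four
   s_mov_b32 opcodes; this routine patches those slots (at byte offsets 4,
   12, 20 and 28) with the static chain value and the target address, then
   flushes the icache over the trampoline.  */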
3111 
3112 static void
3113 gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3114 {
3115   if (TARGET_GCN5_PLUS)
3116     sorry ("nested function trampolines not supported on GCN5 due to"
3117            " non-executable stacks");
3118 
3119   emit_block_move (m_tramp, assemble_trampoline_template (),
3120 		   GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3121 
3122   rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3123   rtx chain_value_reg = copy_to_reg (chain_value);
3124   rtx fnaddr_reg = copy_to_reg (fnaddr);
3125 
3126   for (int i = 0; i < 4; i++)
3127     {
3128       rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3129       rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3130       emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3131     }
3132 
3133   rtx tramp_addr = XEXP (m_tramp, 0);
3134   emit_insn (gen_clear_icache (tramp_addr,
3135 			       plus_constant (ptr_mode, tramp_addr,
3136 					      TRAMPOLINE_SIZE)));
3137 }
3138 
3139 /* }}}  */
3140 /* {{{ Miscellaneous.  */
3141 
3142 /* Implement TARGET_CANNOT_COPY_INSN_P.
3143 
3144    Return true if INSN must not be duplicated.  */
3145 
3146 static bool
3147 gcn_cannot_copy_insn_p (rtx_insn *insn)
3148 {
3149   if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3150     return true;
3151 
3152   return false;
3153 }
3154 
3155 /* Implement TARGET_DEBUG_UNWIND_INFO.
3156 
3157    Defines the mechanism that will be used for describing frame unwind
3158    information to the debugger.  */
3159 
3160 static enum unwind_info_type
3161 gcn_debug_unwind_info ()
3162 {
3163   /* No support for debug info, yet.  */
3164   return UI_NONE;
3165 }
3166 
3167 /* Determine if there is a suitable hardware conversion instruction.
3168    Used primarily by the machine description.  */
3169 
3170 bool
3171 gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3172 {
3173   if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3174     return false;
3175 
3176   if (VECTOR_MODE_P (from))
3177     {
3178       from = GET_MODE_INNER (from);
3179       to = GET_MODE_INNER (to);
3180     }
3181 
3182   switch (op)
3183     {
3184     case fix_trunc_cvt:
3185     case fixuns_trunc_cvt:
3186       if (GET_MODE_CLASS (from) != MODE_FLOAT
3187 	  || GET_MODE_CLASS (to) != MODE_INT)
3188 	return false;
3189       break;
3190     case float_cvt:
3191     case floatuns_cvt:
3192       if (GET_MODE_CLASS (from) != MODE_INT
3193 	  || GET_MODE_CLASS (to) != MODE_FLOAT)
3194 	return false;
3195       break;
3196     case extend_cvt:
3197       if (GET_MODE_CLASS (from) != MODE_FLOAT
3198 	  || GET_MODE_CLASS (to) != MODE_FLOAT
3199 	  || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3200 	return false;
3201       break;
3202     case trunc_cvt:
3203       if (GET_MODE_CLASS (from) != MODE_FLOAT
3204 	  || GET_MODE_CLASS (to) != MODE_FLOAT
3205 	  || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3206 	return false;
3207       break;
3208     }
3209 
3210   return ((to == HImode && from == HFmode)
3211 	  || (to == SImode && (from == SFmode || from == DFmode))
3212 	  || (to == HFmode && (from == HImode || from == SFmode))
3213 	  || (to == SFmode && (from == SImode || from == HFmode
3214 			       || from == DFmode))
3215 	  || (to == DFmode && (from == SImode || from == SFmode)));
3216 }
3217 
3218 /* Implement TARGET_EMUTLS_VAR_INIT.
3219 
3220    Disable emutls (gthr-gcn.h does not support it, yet).  */
3221 
3222 tree
3223 gcn_emutls_var_init (tree, tree decl, tree)
3224 {
3225   sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3226   return NULL_TREE;
3227 }
3228 
3229 /* }}}  */
3230 /* {{{ Costs.  */
3231 
3232 /* Implement TARGET_RTX_COSTS.
3233 
3234    Compute a (partial) cost for rtx X.  Return true if the complete
3235    cost has been computed, and false if subexpressions should be
3236    scanned.  In either case, *TOTAL contains the cost result.  */
3237 
3238 static bool
3239 gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3240 {
3241   enum rtx_code code = GET_CODE (x);
3242   switch (code)
3243     {
3244     case CONST:
3245     case CONST_DOUBLE:
3246     case CONST_VECTOR:
3247     case CONST_INT:
3248       if (gcn_inline_constant_p (x))
3249 	*total = 0;
3250       else if (code == CONST_INT
3251 	  && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3252 	*total = 1;
3253       else if (gcn_constant_p (x))
3254 	*total = 2;
3255       else
3256 	*total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3257       return true;
3258 
3259     case DIV:
3260       *total = 100;
3261       return false;
3262 
3263     default:
3264       *total = 3;
3265       return false;
3266     }
3267 }
3268 
3269 /* Implement TARGET_MEMORY_MOVE_COST.
3270 
3271    Return the cost of moving data of mode M between a
3272    register and memory.  A value of 2 is the default; this cost is
3273    relative to those in `REGISTER_MOVE_COST'.
3274 
3275    This function is used extensively by register_move_cost, which is used
3276    to build tables at startup.  IN selects between loads (true) and
3277    stores (false).
3278 
3279    If moving between registers and memory is more expensive than
3280    between two registers, this hook should express the
3281    relative cost.  */
3285 
3286 #define LOAD_COST  32
3287 #define STORE_COST 32
3288 static int
3289 gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3290 {
3291   int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3292   switch (regclass)
3293     {
3294     case SCC_CONDITIONAL_REG:
3295     case VCCZ_CONDITIONAL_REG:
3296     case VCC_CONDITIONAL_REG:
3297     case EXECZ_CONDITIONAL_REG:
3298     case ALL_CONDITIONAL_REGS:
3299     case SGPR_REGS:
3300     case SGPR_EXEC_REGS:
3301     case EXEC_MASK_REG:
3302     case SGPR_VOP_SRC_REGS:
3303     case SGPR_MEM_SRC_REGS:
3304     case SGPR_SRC_REGS:
3305     case SGPR_DST_REGS:
3306     case GENERAL_REGS:
3307     case AFP_REGS:
3308       if (!in)
3309 	return (STORE_COST + 2) * nregs;
3310       return LOAD_COST * nregs;
3311     case VGPR_REGS:
3312       if (in)
3313 	return (LOAD_COST + 2) * nregs;
3314       return STORE_COST * nregs;
3315     case ALL_REGS:
3316     case ALL_GPR_REGS:
3317     case SRCDST_REGS:
3318       if (in)
3319 	return (LOAD_COST + 2) * nregs;
3320       return (STORE_COST + 2) * nregs;
3321     default:
3322       gcc_unreachable ();
3323     }
3324 }
3325 
3326 /* Implement TARGET_REGISTER_MOVE_COST.
3327 
3328    Return the cost of moving data from a register in class CLASS1 to
3329    one in class CLASS2.  Base value is 2.  */
3330 
3331 static int
3332 gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3333 {
3334   /* Increase the cost of moving to and from vector registers.  While this
3335      is believed to be fast in hardware, it has the hidden cost of setting
3336      up the EXEC flags.  */
3337   if ((src < VGPR_REGS) != (dst < VGPR_REGS))
3338     return 4;
3339   return 2;
3340 }
3341 
3342 /* }}}  */
3343 /* {{{ Builtins.  */
3344 
3345 /* Type codes used by GCN built-in definitions.  */
3346 
3347 enum gcn_builtin_type_index
3348 {
3349   GCN_BTI_END_OF_PARAMS,
3350 
3351   GCN_BTI_VOID,
3352   GCN_BTI_BOOL,
3353   GCN_BTI_INT,
3354   GCN_BTI_UINT,
3355   GCN_BTI_SIZE_T,
3356   GCN_BTI_LLINT,
3357   GCN_BTI_LLUINT,
3358   GCN_BTI_EXEC,
3359 
3360   GCN_BTI_SF,
3361   GCN_BTI_V64SI,
3362   GCN_BTI_V64SF,
3363   GCN_BTI_V64PTR,
3364   GCN_BTI_SIPTR,
3365   GCN_BTI_SFPTR,
3366   GCN_BTI_VOIDPTR,
3367 
3368   GCN_BTI_LDS_VOIDPTR,
3369 
3370   GCN_BTI_MAX
3371 };
3372 
3373 static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
3374 
3375 #define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
3376 #define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
3377 #define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
3378 #define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
3379 #define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
3380 #define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
3381 #define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
3382 #define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
3383 #define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
3384 
3385 static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
3386 				 struct gcn_builtin_description *);
3387 static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
3388 				     struct gcn_builtin_description *);
3389 
3390 struct gcn_builtin_description;
3391 typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
3392 				     struct gcn_builtin_description *);
3393 
3394 enum gcn_builtin_type
3395 {
3396   B_UNIMPLEMENTED,		/* Emit a "sorry, not implemented" diagnostic */
3397   B_INSN,			/* Emit a pattern */
3398   B_OVERLOAD			/* Placeholder for an overloaded function */
3399 };
3400 
3401 struct gcn_builtin_description
3402 {
3403   int fcode;
3404   int icode;
3405   const char *name;
3406   enum gcn_builtin_type type;
3407   /* The first element of parm is always the return type.  The rest
3408      are a zero-terminated list of parameters.  */
3409   int parm[6];
3410   gcn_builtin_expander expander;
3411 };
3412 
3413 /* Read in the GCN builtins from gcn-builtins.def.  */
3414 
3415 extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
3416 
3417 struct gcn_builtin_description gcn_builtins[] = {
3418 #define DEF_BUILTIN(fcode, icode, name, type, params, expander)	\
3419   {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
3420 
3421 #define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name)			\
3422   {GCN_BUILTIN_ ## fcode ## _V64SI,					\
3423    CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN,		\
3424    {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI,		\
3425     GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},	\
3426   {GCN_BUILTIN_ ## fcode ## _V64SI_unspec,				\
3427    CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, 	\
3428    {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI,		\
3429     GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
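/* As an illustration (the real entries live in gcn-builtins.def), a
   hypothetical entry
     DEF_BUILTIN_BINOP_INT_FP (FOO, foo, "foo")
   expands to two table entries: "foo_v64int", taking an exec mask, two
   v64si operands and a v64si previous-value operand, and
   "foo_v64int_unspec", which omits the previous-value operand.  Both use
   the CODE_FOR_foov64si3_exec pattern via gcn_expand_builtin_binop.  */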
3430 
3431 #include "gcn-builtins.def"
3432 #undef DEF_BUILTIN_BINOP_INT_FP
3433 #undef DEF_BUILTIN
3434 };
3435 
3436 static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
3437 
3438 /* Implement TARGET_BUILTIN_DECL.
3439 
3440    Return the GCN builtin for CODE.  */
3441 
3442 tree
3443 gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3444 {
3445   if (code >= GCN_BUILTIN_MAX)
3446     return error_mark_node;
3447 
3448   return gcn_builtin_decls[code];
3449 }
3450 
3451 /* Helper function for gcn_init_builtins.  */
3452 
3453 static void
3454 gcn_init_builtin_types (void)
3455 {
3456   gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
3457   gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
3458   gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
3459   gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
3460   gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
3461   gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
3462   gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
3463 
3464   exec_type_node = unsigned_intDI_type_node;
3465   sf_type_node = float32_type_node;
3466   v64si_type_node = build_vector_type (intSI_type_node, 64);
3467   v64sf_type_node = build_vector_type (float_type_node, 64);
3468   v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
3469 					/*build_pointer_type
3470 					  (integer_type_node) */
3471 					, 64);
3472   tree tmp = build_distinct_type_copy (intSI_type_node);
3473   TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3474   siptr_type_node = build_pointer_type (tmp);
3475 
3476   tmp = build_distinct_type_copy (float_type_node);
3477   TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3478   sfptr_type_node = build_pointer_type (tmp);
3479 
3480   tmp = build_distinct_type_copy (void_type_node);
3481   TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3482   voidptr_type_node = build_pointer_type (tmp);
3483 
3484   tmp = build_distinct_type_copy (void_type_node);
3485   TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
3486   gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
3487 }
3488 
3489 /* Implement TARGET_INIT_BUILTINS.
3490 
3491    Set up all builtin functions for this target.  */
3492 
3493 static void
3494 gcn_init_builtins (void)
3495 {
3496   gcn_init_builtin_types ();
3497 
3498   struct gcn_builtin_description *d;
3499   unsigned int i;
3500   for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
3501     {
3502       tree p;
3503       char name[64];		/* build_function will make a copy.  */
3504       int parm;
3505 
3506       /* FIXME: Is this necessary/useful? */
3507       if (d->name == 0)
3508 	continue;
3509 
3510       /* Find last parm.  */
3511       for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
3512 	;
3513 
3514       p = void_list_node;
3515       while (parm > 1)
3516 	p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
3517 
3518       p = build_function_type (gcn_builtin_types[d->parm[0]], p);
3519 
3520       sprintf (name, "__builtin_gcn_%s", d->name);
3521       gcn_builtin_decls[i]
3522 	= add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
3523 
3524       /* These builtins don't throw.  */
3525       TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
3526     }
3527 
3528 /* FIXME: remove the ifdef once OpenACC support is merged upstream.  */
3529 #ifdef BUILT_IN_GOACC_SINGLE_START
3530   /* These builtins need to take/return an LDS pointer: override the generic
3531      versions here.  */
3532 
3533   set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
3534 		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
3535 
3536   set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
3537 		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
3538 		    false);
3539 
3540   set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
3541 		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
3542 		    false);
3543 
3544   set_builtin_decl (BUILT_IN_GOACC_BARRIER,
3545 		    gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
3546 #endif
3547 }
3548 
3549 /* Expand the CMP_SWAP GCN builtins.  We have our own versions that do
3550    not require taking the address of any object, other than the memory
3551    cell being operated on.
3552 
3553    Helper function for gcn_expand_builtin_1.  */
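/* The builtin takes a pointer to the memory cell, the expected value and
   the replacement value, and returns the value previously held in memory;
   the pointer's address space is propagated to the generated MEM so the
   access targets the correct memory region.  */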
3554 
3555 static rtx
3556 gcn_expand_cmp_swap (tree exp, rtx target)
3557 {
3558   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3559   addr_space_t as
3560     = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
3561   machine_mode as_mode = gcn_addr_space_address_mode (as);
3562 
3563   if (!target)
3564     target = gen_reg_rtx (mode);
3565 
3566   rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
3567 			  NULL_RTX, as_mode, EXPAND_NORMAL);
3568   rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3569 			 NULL_RTX, mode, EXPAND_NORMAL);
3570   rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3571 			 NULL_RTX, mode, EXPAND_NORMAL);
3572   rtx pat;
3573 
3574   rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
3575   set_mem_addr_space (mem, as);
3576 
3577   if (!REG_P (cmp))
3578     cmp = copy_to_mode_reg (mode, cmp);
3579   if (!REG_P (src))
3580     src = copy_to_mode_reg (mode, src);
3581 
3582   if (mode == SImode)
3583     pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
3584   else
3585     pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
3586 
3587   emit_insn (pat);
3588 
3589   return target;
3590 }
3591 
3592 /* Expand many different builtins.
3593 
3594    Intended for use in gcn-builtins.def.  */
3595 
3596 static rtx
3597 gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
3598 		      machine_mode /*mode */ , int ignore,
3599 		      struct gcn_builtin_description *)
3600 {
3601   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3602   switch (DECL_MD_FUNCTION_CODE (fndecl))
3603     {
3604     case GCN_BUILTIN_FLAT_LOAD_INT32:
3605       {
3606 	if (ignore)
3607 	  return target;
3608 	/*rtx exec = */
3609 	force_reg (DImode,
3610 		   expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3611 				EXPAND_NORMAL));
3612 	/*rtx ptr = */
3613 	force_reg (V64DImode,
3614 		   expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
3615 				EXPAND_NORMAL));
3616 	/*emit_insn (gen_vector_flat_loadv64si
3617 		     (target, gcn_gen_undef (V64SImode), ptr, exec)); */
3618 	return target;
3619       }
3620     case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
3621     case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
3622       {
3623 	if (ignore)
3624 	  return target;
3625 	rtx exec = force_reg (DImode,
3626 			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3627 					   DImode,
3628 					   EXPAND_NORMAL));
3629 	rtx ptr = force_reg (DImode,
3630 			     expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3631 					  V64DImode,
3632 					  EXPAND_NORMAL));
3633 	rtx offsets = force_reg (V64SImode,
3634 				 expand_expr (CALL_EXPR_ARG (exp, 2),
3635 					      NULL_RTX, V64DImode,
3636 					      EXPAND_NORMAL));
3637 	rtx addrs = gen_reg_rtx (V64DImode);
3638 	rtx tmp = gen_reg_rtx (V64SImode);
3639 	emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3640 					  GEN_INT (2),
3641 					  gcn_gen_undef (V64SImode), exec));
3642 	emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3643 						 gcn_gen_undef (V64DImode),
3644 						 exec));
3645 	rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
3646 	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3647 	/* FIXME: set attributes.  */
3648 	emit_insn (gen_mov_with_exec (target, mem, exec));
3649 	return target;
3650       }
3651     case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
3652     case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
3653       {
3654 	rtx exec = force_reg (DImode,
3655 			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3656 					   DImode,
3657 					   EXPAND_NORMAL));
3658 	rtx ptr = force_reg (DImode,
3659 			     expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3660 					  V64DImode,
3661 					  EXPAND_NORMAL));
3662 	rtx offsets = force_reg (V64SImode,
3663 				 expand_expr (CALL_EXPR_ARG (exp, 2),
3664 					      NULL_RTX, V64DImode,
3665 					      EXPAND_NORMAL));
3666 	machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3667 								       3)));
3668 	rtx val = force_reg (vmode,
3669 			     expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3670 					  vmode,
3671 					  EXPAND_NORMAL));
3672 	rtx addrs = gen_reg_rtx (V64DImode);
3673 	rtx tmp = gen_reg_rtx (V64SImode);
3674 	emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3675 					  GEN_INT (2),
3676 					  gcn_gen_undef (V64SImode), exec));
3677 	emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3678 						 gcn_gen_undef (V64DImode),
3679 						 exec));
3680 	rtx mem = gen_rtx_MEM (vmode, addrs);
3681 	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3682 	/* FIXME: set attributes.  */
3683 	emit_insn (gen_mov_with_exec (mem, val, exec));
3684 	return target;
3685       }
3686     case GCN_BUILTIN_SQRTVF:
3687       {
3688 	if (ignore)
3689 	  return target;
3690 	rtx exec = gcn_full_exec_reg ();
3691 	rtx arg = force_reg (V64SFmode,
3692 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3693 					  V64SFmode,
3694 					  EXPAND_NORMAL));
3695 	emit_insn (gen_sqrtv64sf2_exec
3696 		   (target, arg, gcn_gen_undef (V64SFmode), exec));
3697 	return target;
3698       }
3699     case GCN_BUILTIN_SQRTF:
3700       {
3701 	if (ignore)
3702 	  return target;
3703 	rtx arg = force_reg (SFmode,
3704 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3705 					  SFmode,
3706 					  EXPAND_NORMAL));
3707 	emit_insn (gen_sqrtsf2 (target, arg));
3708 	return target;
3709       }
3710     case GCN_BUILTIN_OMP_DIM_SIZE:
3711       {
3712 	if (ignore)
3713 	  return target;
3714 	emit_insn (gen_oacc_dim_size (target,
3715 				      expand_expr (CALL_EXPR_ARG (exp, 0),
3716 						   NULL_RTX, SImode,
3717 						   EXPAND_NORMAL)));
3718 	return target;
3719       }
3720     case GCN_BUILTIN_OMP_DIM_POS:
3721       {
3722 	if (ignore)
3723 	  return target;
3724 	emit_insn (gen_oacc_dim_pos (target,
3725 				     expand_expr (CALL_EXPR_ARG (exp, 0),
3726 						  NULL_RTX, SImode,
3727 						  EXPAND_NORMAL)));
3728 	return target;
3729       }
3730     case GCN_BUILTIN_CMP_SWAP:
3731     case GCN_BUILTIN_CMP_SWAPLL:
3732       return gcn_expand_cmp_swap (exp, target);
3733 
3734     case GCN_BUILTIN_ACC_SINGLE_START:
3735       {
3736 	if (ignore)
3737 	  return target;
3738 
3739 	rtx wavefront = gcn_oacc_dim_pos (1);
3740 	rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
3741 	rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
3742 	emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
3743 	return cc;
3744       }
3745 
3746     case GCN_BUILTIN_ACC_SINGLE_COPY_START:
3747       {
3748 	rtx blk = force_reg (SImode,
3749 			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3750 					  SImode, EXPAND_NORMAL));
3751 	rtx wavefront = gcn_oacc_dim_pos (1);
3752 	rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
3753 	rtx not_zero = gen_label_rtx ();
3754 	emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
3755 	emit_move_insn (blk, const0_rtx);
3756 	emit_label (not_zero);
3757 	return blk;
3758       }
3759 
3760     case GCN_BUILTIN_ACC_SINGLE_COPY_END:
3761       return target;
3762 
3763     case GCN_BUILTIN_ACC_BARRIER:
3764       emit_insn (gen_gcn_wavefront_barrier ());
3765       return target;
3766 
3767     default:
3768       gcc_unreachable ();
3769     }
3770 }
3771 
3772 /* Expansion of simple arithmetic and bit binary operation builtins.
3773 
3774    Intended for use with gcn_builtins table.  */
3775 
3776 static rtx
3777 gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
3778 			  machine_mode /*mode */ , int ignore,
3779 			  struct gcn_builtin_description *d)
3780 {
3781   int icode = d->icode;
3782   if (ignore)
3783     return target;
3784 
3785   rtx exec = force_reg (DImode,
3786 			expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3787 				     EXPAND_NORMAL));
3788 
3789   machine_mode m1 = insn_data[icode].operand[1].mode;
3790   rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
3791 			  EXPAND_NORMAL);
3792   if (!insn_data[icode].operand[1].predicate (arg1, m1))
3793     arg1 = force_reg (m1, arg1);
3794 
3795   machine_mode m2 = insn_data[icode].operand[2].mode;
3796   rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
3797 			  EXPAND_NORMAL);
3798   if (!insn_data[icode].operand[2].predicate (arg2, m2))
3799     arg2 = force_reg (m2, arg2);
3800 
3801   rtx arg_prev;
3802   if (call_expr_nargs (exp) == 4)
3803     {
3804       machine_mode m_prev = insn_data[icode].operand[4].mode;
3805       arg_prev = force_reg (m_prev,
3806 			    expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3807 					 m_prev, EXPAND_NORMAL));
3808     }
3809   else
3810     arg_prev = gcn_gen_undef (GET_MODE (target));
3811 
3812   rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
3813   emit_insn (pat);
3814   return target;
3815 }
3816 
3817 /* Implement TARGET_EXPAND_BUILTIN.
3818 
3819    Expand an expression EXP that calls a built-in function, with result going
3820    to TARGET if that's convenient (and in mode MODE if that's convenient).
3821    SUBTARGET may be used as the target for computing one of EXP's operands.
3822    IGNORE is nonzero if the value is to be ignored.  */
3823 
3824 rtx
3825 gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
3826 		    int ignore)
3827 {
3828   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3829   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
3830   struct gcn_builtin_description *d;
3831 
3832   gcc_assert (fcode < GCN_BUILTIN_MAX);
3833   d = &gcn_builtins[fcode];
3834 
3835   if (d->type == B_UNIMPLEMENTED)
3836     sorry ("Builtin not implemented");
3837 
3838   return d->expander (exp, target, subtarget, mode, ignore, d);
3839 }
3840 
3841 /* }}}  */
3842 /* {{{ Vectorization.  */
3843 
3844 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.
3845 
3846    A vector mask is a value that holds one boolean result for every element in
3847    a vector.  */
3848 
3849 opt_machine_mode
3850 gcn_vectorize_get_mask_mode (machine_mode)
3851 {
3852   /* GCN uses a DImode bit-mask.  */
3853   return DImode;
3854 }
3855 
3856 /* Return an RTX that references a vector with the i-th lane containing
3857    PERM[i]*4.
3858 
3859    Helper function for gcn_vectorize_vec_perm_const.  */
3860 
3861 static rtx
3862 gcn_make_vec_perm_address (unsigned int *perm)
3863 {
3864   rtx x = gen_reg_rtx (V64SImode);
3865   emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
3866 
3867   /* Permutation addresses use byte addressing.  With each vector lane being
3868      4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
3869      so only set those.
3870 
3871      The permutation indices given to the vec_perm* patterns range from 0 to
3872      select between lanes in two vectors, but as the DS_BPERMUTE* instructions
3873      only take one source vector, the most-significant bit can be ignored
3874      here.  Instead, we can use EXEC masking to select the relevant part of
3875      each source vector after they are permuted separately.  */
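  /* For instance, a lane that should read element 5 needs byte address
     5 * 4 == 20 == 0b010100, so the loop below adds 4 and 16 to that
     lane's address under the corresponding EXEC masks.  */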
3876   uint64_t bit_mask = 1 << 2;
3877   for (int i = 2; i < 8; i++, bit_mask <<= 1)
3878     {
3879       uint64_t exec_mask = 0;
3880       uint64_t lane_mask = 1;
3881       for (int j = 0; j < 64; j++, lane_mask <<= 1)
3882 	if ((perm[j] * 4) & bit_mask)
3883 	  exec_mask |= lane_mask;
3884 
3885       if (exec_mask)
3886 	emit_insn (gen_addv64si3_exec (x, x,
3887 				       gcn_vec_constant (V64SImode,
3888 							 bit_mask),
3889 				       x, get_exec (exec_mask)));
3890     }
3891 
3892   return x;
3893 }
3894 
3895 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
3896 
3897    Return true if permutation with SEL is possible.
3898 
3899    If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
3900    permutations.  */
3901 
3902 static bool
3903 gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
3904 			      rtx src0, rtx src1,
3905 			      const vec_perm_indices & sel)
3906 {
3907   unsigned int nelt = GET_MODE_NUNITS (vmode);
3908 
3909   gcc_assert (VECTOR_MODE_P (vmode));
3910   gcc_assert (nelt <= 64);
3911   gcc_assert (sel.length () == nelt);
3912 
3913   if (!dst)
3914     {
3915       /* All vector permutations are possible on this architecture,
3916          with varying degrees of efficiency depending on the permutation. */
3917       return true;
3918     }
3919 
3920   unsigned int perm[64];
3921   for (unsigned int i = 0; i < nelt; ++i)
3922     perm[i] = sel[i] & (2 * nelt - 1);
3923 
3924   /* Make life a bit easier by swapping operands if necessary so that
3925      the first element always comes from src0.  */
3926   if (perm[0] >= nelt)
3927     {
3928       rtx temp = src0;
3929       src0 = src1;
3930       src1 = temp;
3931 
3932       for (unsigned int i = 0; i < nelt; ++i)
3933 	if (perm[i] < nelt)
3934 	  perm[i] += nelt;
3935 	else
3936 	  perm[i] -= nelt;
3937     }
3938 
3939   /* TODO: There are more efficient ways to implement certain permutations
3940      using ds_swizzle_b32 and/or DPP.  Test for and expand them here, before
3941      this more inefficient generic approach is used.  */
3942 
3943   int64_t src1_lanes = 0;
3944   int64_t lane_bit = 1;
3945 
3946   for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
3947     {
3948       /* Set the bits for lanes from src1.  */
3949       if (perm[i] >= nelt)
3950 	src1_lanes |= lane_bit;
3951     }
3952 
3953   rtx addr = gcn_make_vec_perm_address (perm);
3954   rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
3955 
3956   switch (vmode)
3957     {
3958     case E_V64QImode:
3959       ds_bpermute = gen_ds_bpermutev64qi;
3960       break;
3961     case E_V64HImode:
3962       ds_bpermute = gen_ds_bpermutev64hi;
3963       break;
3964     case E_V64SImode:
3965       ds_bpermute = gen_ds_bpermutev64si;
3966       break;
3967     case E_V64HFmode:
3968       ds_bpermute = gen_ds_bpermutev64hf;
3969       break;
3970     case E_V64SFmode:
3971       ds_bpermute = gen_ds_bpermutev64sf;
3972       break;
3973     case E_V64DImode:
3974       ds_bpermute = gen_ds_bpermutev64di;
3975       break;
3976     case E_V64DFmode:
3977       ds_bpermute = gen_ds_bpermutev64df;
3978       break;
3979     default:
3980       gcc_assert (false);
3981     }
3982 
3983   /* Load elements from src0 to dst.  */
3984   gcc_assert (~src1_lanes);
3985   emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
3986 
3987   /* Load elements from src1 to dst.  */
3988   if (src1_lanes)
3989     {
3990       /* Masking a lane masks both the destination and source lanes for
3991          DS_BPERMUTE, so we need to have all lanes enabled for the permute,
3992          then add an extra masked move to merge the results of permuting
3993          the two source vectors together.
3994        */
3995       rtx tmp = gen_reg_rtx (vmode);
3996       emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
3997       emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
3998     }
3999 
4000   return true;
4001 }
4002 
4003 /* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
4004 
4005    Return nonzero if vector MODE is supported with at least move
4006    instructions.  */
4007 
4008 static bool
4009 gcn_vector_mode_supported_p (machine_mode mode)
4010 {
4011   return (mode == V64QImode || mode == V64HImode
4012 	  || mode == V64SImode || mode == V64DImode
4013 	  || mode == V64SFmode || mode == V64DFmode);
4014 }
4015 
4016 /* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
4017 
4018    Enables autovectorization for all supported modes.  */
4019 
4020 static machine_mode
4021 gcn_vectorize_preferred_simd_mode (scalar_mode mode)
4022 {
4023   switch (mode)
4024     {
4025     case E_QImode:
4026       return V64QImode;
4027     case E_HImode:
4028       return V64HImode;
4029     case E_SImode:
4030       return V64SImode;
4031     case E_DImode:
4032       return V64DImode;
4033     case E_SFmode:
4034       return V64SFmode;
4035     case E_DFmode:
4036       return V64DFmode;
4037     default:
4038       return word_mode;
4039     }
4040 }
4041 
4042 /* Implement TARGET_VECTORIZE_RELATED_MODE.
4043 
4044    All GCN vectors are 64-lane, so this is simpler than other architectures.
4045    In particular, we do *not* want to match vector bit-size.  */
4046 
4047 static opt_machine_mode
4048 gcn_related_vector_mode (machine_mode vector_mode, scalar_mode element_mode,
4049 			 poly_uint64 nunits)
4050 {
4051   if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
4052     return VOIDmode;
4053 
4054   machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
4055   if (!VECTOR_MODE_P (pref_mode))
4056     return VOIDmode;
4057 
4058   return pref_mode;
4059 }
4060 
4061 /* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
4062 
4063    Returns the preferred alignment in bits for accesses to vectors of type TYPE
4064    in vectorized code. This might be less than or greater than the ABI-defined
4065    value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
4066    of a single element, in which case the vectorizer will not try to optimize
4067    for alignment.  */
4068 
4069 static poly_uint64
4070 gcn_preferred_vector_alignment (const_tree type)
4071 {
4072   return TYPE_ALIGN (TREE_TYPE (type));
4073 }
4074 
4075 /* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4076 
4077    Return true if the target supports misaligned vector store/load of a
4078    specific factor denoted in the misalignment parameter.  */
4079 
4080 static bool
4081 gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4082 					   const_tree type, int misalignment,
4083 					   bool is_packed)
4084 {
4085   if (is_packed)
4086     return false;
4087 
4088   /* If the misalignment is unknown, we should be able to handle the access
4089      so long as it is not to a member of a packed data structure.  */
4090   if (misalignment == -1)
4091     return true;
4092 
4093   /* Return true if the misalignment is a multiple of the natural alignment
4094      of the vector's element type.  This is probably always going to be
4095      true in practice, since we've already established that this isn't a
4096      packed access.  */
4097   return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4098 }
4099 
4100 /* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4101 
4102    Return true if vector alignment is reachable (by peeling N iterations) for
4103    the given scalar type TYPE.  */
4104 
4105 static bool
4106 gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4107 {
4108   /* Vectors which aren't in packed structures will not be less aligned than
4109      the natural alignment of their element type, so this is safe.  */
4110   return !is_packed;
4111 }
4112 
4113 /* Generate DPP instructions used for vector reductions.
4114 
4115    The opcode is given by INSN.
4116    The first operand of the operation is shifted right by SHIFT vector lanes.
4117    SHIFT must be a power of 2.  If SHIFT is 16, the 15th lane of each row is
4118    SHIFT must be a power of 2.  If SHIFT is 16, the 15th lane of each row is
4119    broadcast to the next row (thereby acting like a shift of 16 for the end of
4120    following lanes (thereby acting like a shift of 32 for lane 63).  */
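/* For example, given INSN "v_max_i32", a non-carry UNSPEC and SHIFT == 1,
   the returned template is "v_max_i32\t%0, %1, %2 row_shr:1 bound_ctrl:0";
   with SHIFT == 16 the DPP modifier becomes "row_bcast:15 row_mask:0xa".  */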
4121 
4122 char *
4123 gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4124 			 int unspec, int shift)
4125 {
4126   static char buf[128];
4127   const char *dpp;
4128   const char *vcc_in = "";
4129   const char *vcc_out = "";
4130 
4131   /* Add the vcc operand if needed.  */
4132   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4133     {
4134       if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4135 	vcc_in = ", vcc";
4136 
4137       if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4138 	  || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4139 	vcc_out = ", vcc";
4140     }
4141 
4142   /* Add the DPP modifiers.  */
4143   switch (shift)
4144     {
4145     case 1:
4146       dpp = "row_shr:1 bound_ctrl:0";
4147       break;
4148     case 2:
4149       dpp = "row_shr:2 bound_ctrl:0";
4150       break;
4151     case 4:
4152       dpp = "row_shr:4 bank_mask:0xe";
4153       break;
4154     case 8:
4155       dpp = "row_shr:8 bank_mask:0xc";
4156       break;
4157     case 16:
4158       dpp = "row_bcast:15 row_mask:0xa";
4159       break;
4160     case 32:
4161       dpp = "row_bcast:31 row_mask:0xc";
4162       break;
4163     default:
4164       gcc_unreachable ();
4165     }
4166 
4167   if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
4168     sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
4169 	     insn, dpp, insn, dpp);
4170   else if (unspec == UNSPEC_MOV_DPP_SHR)
4171     sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
4172   else
4173     sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
4174 
4175   return buf;
4176 }
4177 
4178 /* Generate vector reductions in terms of DPP instructions.
4179 
4180    The vector register SRC of mode MODE is reduced using the operation given
4181    by UNSPEC, and the scalar result is returned in lane 63 of a vector
4182    register.  */
4183 
4184 rtx
4185 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4186 {
4187   machine_mode orig_mode = mode;
4188   bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
4189 		      || unspec == UNSPEC_SMAX_DPP_SHR
4190 		      || unspec == UNSPEC_UMIN_DPP_SHR
4191 		      || unspec == UNSPEC_UMAX_DPP_SHR)
4192 		     && mode == V64DImode)
4193 		    || (unspec == UNSPEC_PLUS_DPP_SHR
4194 			&& mode == V64DFmode));
4195   rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
4196 		   : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
4197 		   : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
4198 		   : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
4199 		   : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
4200 		   : UNKNOWN);
4201   bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
4202 		       || unspec == UNSPEC_SMAX_DPP_SHR
4203 		       || unspec == UNSPEC_UMIN_DPP_SHR
4204 		       || unspec == UNSPEC_UMAX_DPP_SHR)
4205 		      && (mode == V64QImode
4206 			  || mode == V64HImode));
4207   bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
4208 		    || unspec == UNSPEC_UMAX_DPP_SHR);
4209   bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4210 			&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4211 			&& (TARGET_GCN3 || mode == V64DImode);
4212 
4213   if (use_plus_carry)
4214     unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4215 
4216   if (use_extends)
4217     {
4218       rtx tmp = gen_reg_rtx (V64SImode);
4219       convert_move (tmp, src, unsignedp);
4220       src = tmp;
4221       mode = V64SImode;
4222     }
4223 
4224   /* Perform reduction by first performing the reduction operation on every
4225      pair of lanes, then on every pair of results from the previous
4226      iteration (thereby effectively reducing every 4 lanes) and so on until
4227      all lanes are reduced.  */
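  /* Concretely, the loop applies DPP shifts of 1, 2, 4, 8, 16 and 32
     lanes; after the sixth step lane 63 holds the reduction of all 64
     lanes.  */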
4228   rtx in, out = src;
4229   for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4230     {
4231       rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4232       in = out;
4233       out = gen_reg_rtx (mode);
4234 
4235       if (use_moves)
4236 	{
4237 	  rtx tmp = gen_reg_rtx (mode);
4238 	  emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
4239 	  emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
4240 	}
4241       else
4242 	{
4243 	  rtx insn = gen_rtx_SET (out,
4244 				  gen_rtx_UNSPEC (mode,
4245 						  gen_rtvec (3, in, in,
4246 							     shift_val),
4247 						  unspec));
4248 
4249 	  /* Add clobber for instructions that set the carry flags.  */
4250 	  if (use_plus_carry)
4251 	    {
4252 	      rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4253 					     gen_rtx_REG (DImode, VCC_REG));
4254 	      insn = gen_rtx_PARALLEL (VOIDmode,
4255 				       gen_rtvec (2, insn, clobber));
4256 	    }
4257 
4258 	  emit_insn (insn);
4259 	}
4260     }
4261 
4262   if (use_extends)
4263     {
4264       rtx tmp = gen_reg_rtx (orig_mode);
4265       convert_move (tmp, out, unsignedp);
4266       out = tmp;
4267     }
4268 
4269   return out;
4270 }
4271 
4272 /* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST.  */
4273 
4274 int
4275 gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4276 			tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4277 {
4278   /* Always vectorize.  */
4279   return 1;
4280 }
4281 
4282 /* }}}  */
4283 /* {{{ md_reorg pass.  */
4284 
4285 /* Identify VMEM instructions from their "type" attribute.  */
4286 
4287 static bool
4288 gcn_vmem_insn_p (attr_type type)
4289 {
4290   switch (type)
4291     {
4292     case TYPE_MUBUF:
4293     case TYPE_MTBUF:
4294     case TYPE_FLAT:
4295       return true;
4296     case TYPE_UNKNOWN:
4297     case TYPE_SOP1:
4298     case TYPE_SOP2:
4299     case TYPE_SOPK:
4300     case TYPE_SOPC:
4301     case TYPE_SOPP:
4302     case TYPE_SMEM:
4303     case TYPE_DS:
4304     case TYPE_VOP2:
4305     case TYPE_VOP1:
4306     case TYPE_VOPC:
4307     case TYPE_VOP3A:
4308     case TYPE_VOP3B:
4309     case TYPE_VOP_SDWA:
4310     case TYPE_VOP_DPP:
4311     case TYPE_MULT:
4312     case TYPE_VMULT:
4313       return false;
4314     }
4315   gcc_unreachable ();
4316   return false;
4317 }
4318 
4319 /* If INSN sets the EXEC register to a constant value, return the value,
4320    otherwise return zero.  */
4321 
4322 static int64_t
4323 gcn_insn_exec_value (rtx_insn *insn)
4324 {
4325   if (!NONDEBUG_INSN_P (insn))
4326     return 0;
4327 
4328   rtx pattern = PATTERN (insn);
4329 
4330   if (GET_CODE (pattern) == SET)
4331     {
4332       rtx dest = XEXP (pattern, 0);
4333       rtx src = XEXP (pattern, 1);
4334 
4335       if (GET_MODE (dest) == DImode
4336 	  && REG_P (dest) && REGNO (dest) == EXEC_REG
4337 	  && CONST_INT_P (src))
4338 	return INTVAL (src);
4339     }
4340 
4341   return 0;
4342 }
4343 
4344 /* Sets the EXEC register before INSN to the value that it had after
4345    LAST_EXEC_DEF.  The constant value of the EXEC register is returned if
4346    known, otherwise it returns zero.  */
4347 
4348 static int64_t
4349 gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4350 		  bool curr_exec_known, bool &last_exec_def_saved)
4351 {
4352   rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4353   rtx exec;
4354 
4355   int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4356 
4357   if (exec_value)
4358     {
4359       /* If the EXEC value is a constant and it happens to be the same as the
4360          current EXEC value, the restore can be skipped.  */
4361       if (curr_exec_known && exec_value == curr_exec)
4362 	return exec_value;
4363 
4364       exec = GEN_INT (exec_value);
4365     }
4366   else
4367     {
4368       /* If the EXEC value is not a constant, save it in a register after the
4369 	 point of definition.  */
4370       rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4371 
4372       if (!last_exec_def_saved)
4373 	{
4374 	  start_sequence ();
4375 	  emit_move_insn (exec_save_reg, exec_reg);
4376 	  rtx_insn *seq = get_insns ();
4377 	  end_sequence ();
4378 
4379 	  emit_insn_after (seq, last_exec_def);
4380 	  if (dump_file && (dump_flags & TDF_DETAILS))
4381 	    fprintf (dump_file, "Saving EXEC after insn %d.\n",
4382 		     INSN_UID (last_exec_def));
4383 
4384 	  last_exec_def_saved = true;
4385 	}
4386 
4387       exec = exec_save_reg;
4388     }
4389 
4390   /* Restore EXEC register before the usage.  */
4391   start_sequence ();
4392   emit_move_insn (exec_reg, exec);
4393   rtx_insn *seq = get_insns ();
4394   end_sequence ();
4395   emit_insn_before (seq, insn);
4396 
4397   if (dump_file && (dump_flags & TDF_DETAILS))
4398     {
4399       if (exec_value)
4400 	fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
4401 		 exec_value, INSN_UID (insn));
4402       else
4403 	fprintf (dump_file,
4404 		 "Restoring EXEC from saved value before insn %d.\n",
4405 		 INSN_UID (insn));
4406     }
4407 
4408   return exec_value;
4409 }
4410 
4411 /* Implement TARGET_MACHINE_DEPENDENT_REORG.
4412 
4413    Ensure that pipeline dependencies and lane masking are set correctly.  */
4414 
4415 static void
4416 gcn_md_reorg (void)
4417 {
4418   basic_block bb;
4419   rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4420   regset_head live;
4421 
4422   INIT_REG_SET (&live);
4423 
4424   compute_bb_for_insn ();
4425 
4426   if (!optimize)
4427     {
4428       split_all_insns ();
4429       if (dump_file && (dump_flags & TDF_DETAILS))
4430 	{
4431 	  fprintf (dump_file, "After split:\n");
4432 	  print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4433 	}
4434 
4435       /* Update data-flow information for split instructions.  */
4436       df_insn_rescan_all ();
4437     }
4438 
4439   df_analyze ();
4440 
4441   /* This pass ensures that the EXEC register is set correctly, according
4442      to the "exec" attribute.  However, care must be taken so that the
4443      value that reaches explicit uses of the EXEC register remains the
4444      same as before.
4445    */
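  /* For example, an instruction whose "exec" attribute is EXEC_FULL gets
     "EXEC = -1" (all 64 lanes) emitted before it, unless the current EXEC
     value is already known to be -1, while an EXEC_SINGLE memory access
     only needs "EXEC = 1" (lane 0 alone).  */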
4446 
4447   FOR_EACH_BB_FN (bb, cfun)
4448     {
4449       if (dump_file && (dump_flags & TDF_DETAILS))
4450 	fprintf (dump_file, "BB %d:\n", bb->index);
4451 
4452       rtx_insn *insn, *curr;
4453       rtx_insn *last_exec_def = BB_HEAD (bb);
4454       bool last_exec_def_saved = false;
4455       bool curr_exec_explicit = true;
4456       bool curr_exec_known = true;
4457       int64_t curr_exec = 0;	/* 0 here means 'the value is that of EXEC
4458 				   after last_exec_def is executed'.  */
4459 
4460       FOR_BB_INSNS_SAFE (bb, insn, curr)
4461 	{
4462 	  if (!NONDEBUG_INSN_P (insn))
4463 	    continue;
4464 
4465 	  if (GET_CODE (PATTERN (insn)) == USE
4466 	      || GET_CODE (PATTERN (insn)) == CLOBBER)
4467 	    continue;
4468 
4469 	  HARD_REG_SET defs, uses;
4470 	  CLEAR_HARD_REG_SET (defs);
4471 	  CLEAR_HARD_REG_SET (uses);
4472 	  note_stores (insn, record_hard_reg_sets, &defs);
4473 	  note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4474 
4475 	  bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4476 	  bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4477 	  bool exec_used = (hard_reg_set_intersect_p
4478 			    (uses, reg_class_contents[(int) EXEC_MASK_REG])
4479 			    || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4480 
4481 	  /* Check the instruction for implicit setting of EXEC via an
4482 	     attribute.  */
4483 	  attr_exec exec_attr = get_attr_exec (insn);
4484 	  int64_t new_exec;
4485 
4486 	  switch (exec_attr)
4487 	    {
4488 	    case EXEC_NONE:
4489 	      new_exec = 0;
4490 	      break;
4491 
4492 	    case EXEC_SINGLE:
4493 	      /* Instructions that do not involve memory accesses only require
4494 		 bit 0 of EXEC to be set.  */
4495 	      if (gcn_vmem_insn_p (get_attr_type (insn))
4496 		  || get_attr_type (insn) == TYPE_DS)
4497 		new_exec = 1;
4498 	      else
4499 		new_exec = curr_exec | 1;
4500 	      break;
4501 
4502 	    case EXEC_FULL:
4503 	      new_exec = -1;
4504 	      break;
4505 
4506 	    default:  /* Auto-detect what setting is appropriate.  */
4507 	      {
4508 	        new_exec = 0;
4509 
4510 		/* If EXEC is referenced explicitly then we don't need to do
4511 		   anything to set it, so we're done.  */
4512 		if (exec_used)
4513 		  break;
4514 
4515 		/* Scan the insn for VGPRs defs or uses.  The mode determines
4516 		   what kind of exec is needed.  */
4517 		subrtx_iterator::array_type array;
4518 		FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4519 		  {
4520 		    const_rtx x = *iter;
4521 		    if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4522 		      {
4523 			if (VECTOR_MODE_P (GET_MODE (x)))
4524 			  {
4525 			    new_exec = -1;
4526 			    break;
4527 			  }
4528 			else
4529 			  new_exec = 1;
4530 		      }
4531 		  }
4532 	        }
4533 	      break;
4534 	    }
4535 
4536 	  if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4537 	    {
4538 	      start_sequence ();
4539 	      emit_move_insn (exec_reg, GEN_INT (new_exec));
4540 	      rtx_insn *seq = get_insns ();
4541 	      end_sequence ();
4542 	      emit_insn_before (seq, insn);
4543 
4544 	      if (dump_file && (dump_flags & TDF_DETAILS))
4545 		fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
4546 			 new_exec, INSN_UID (insn));
4547 
4548 	      curr_exec = new_exec;
4549 	      curr_exec_explicit = false;
4550 	      curr_exec_known = true;
4551 	    }
4552 	  else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4553 	    {
4554 	      fprintf (dump_file, "Exec already is %ld before insn %d.\n",
4555 		       new_exec, INSN_UID (insn));
4556 	    }
4557 
4558 	  /* The state of the EXEC register is unknown after a
4559 	     function call.  */
4560 	  if (CALL_P (insn))
4561 	    curr_exec_known = false;
4562 
4563 	  /* Handle explicit uses of EXEC.  If the instruction is a partial
4564 	     explicit definition of EXEC, then treat it as an explicit use of
4565 	     EXEC as well.  */
4566 	  if (exec_used || exec_lo_def_p != exec_hi_def_p)
4567 	    {
4568 	      /* An instruction that explicitly uses EXEC should not also
4569 		 implicitly define it.  */
4570 	      gcc_assert (!exec_used || !new_exec);
4571 
4572 	      if (!curr_exec_known || !curr_exec_explicit)
4573 		{
4574 		  /* Restore the previous explicitly defined value.  */
4575 		  curr_exec = gcn_restore_exec (insn, last_exec_def,
4576 						curr_exec, curr_exec_known,
4577 						last_exec_def_saved);
4578 		  curr_exec_explicit = true;
4579 		  curr_exec_known = true;
4580 		}
4581 	    }
4582 
4583 	  /* Handle explicit definitions of EXEC.  */
4584 	  if (exec_lo_def_p || exec_hi_def_p)
4585 	    {
4586 	      last_exec_def = insn;
4587 	      last_exec_def_saved = false;
4588 	      curr_exec = gcn_insn_exec_value (insn);
4589 	      curr_exec_explicit = true;
4590 	      curr_exec_known = true;
4591 
4592 	      if (dump_file && (dump_flags & TDF_DETAILS))
4593 		fprintf (dump_file,
4594 			 "Found %s definition of EXEC at insn %d.\n",
4595 			 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4596 			 INSN_UID (insn));
4597 	    }
4598 	}
4599 
4600       COPY_REG_SET (&live, DF_LR_OUT (bb));
4601       df_simulate_initialize_backwards (bb, &live);
4602 
4603       /* If EXEC is live after the basic block, restore the value of EXEC
4604 	 at the end of the block.  */
4605       if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4606 	   || REGNO_REG_SET_P (&live, EXEC_HI_REG))
4607 	  && (!curr_exec_known || !curr_exec_explicit))
4608 	{
4609 	  rtx_insn *end_insn = BB_END (bb);
4610 
4611 	  /* If the instruction is not a jump instruction, do the restore
4612 	     after the last instruction in the basic block.  */
4613 	  if (NONJUMP_INSN_P (end_insn))
4614 	    end_insn = NEXT_INSN (end_insn);
4615 
4616 	  gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4617 			    curr_exec_known, last_exec_def_saved);
4618 	}
4619     }
4620 
4621   CLEAR_REG_SET (&live);
4622 
4623   /* "Manually Inserted Wait States (NOPs)."
4624 
4625      GCN hardware detects most kinds of register dependencies, but there
4626      are some exceptions documented in the ISA manual.  This pass
4627      detects the missed cases, and inserts the documented number of NOPs
4628      required for correct execution.  */
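  /* For instance, a VALU instruction writing an SGPR followed immediately
     by a VMEM instruction reading that SGPR needs 5 wait states; if two
     unrelated instructions already separate them, only 3 NOPs are inserted
     (each intervening instruction is assumed to count as one wait).  */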
4629 
4630   const int max_waits = 5;
4631   struct ilist
4632   {
4633     rtx_insn *insn;
4634     attr_unit unit;
4635     attr_delayeduse delayeduse;
4636     HARD_REG_SET writes;
4637     HARD_REG_SET reads;
4638     int age;
4639   } back[max_waits];
4640   int oldest = 0;
4641   for (int i = 0; i < max_waits; i++)
4642     back[i].insn = NULL;
4643 
4644   rtx_insn *insn, *last_insn = NULL;
4645   for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4646     {
4647       if (!NONDEBUG_INSN_P (insn))
4648 	continue;
4649 
4650       if (GET_CODE (PATTERN (insn)) == USE
4651 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
4652 	continue;
4653 
4654       attr_type itype = get_attr_type (insn);
4655       attr_unit iunit = get_attr_unit (insn);
4656       attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
4657       HARD_REG_SET ireads, iwrites;
4658       CLEAR_HARD_REG_SET (ireads);
4659       CLEAR_HARD_REG_SET (iwrites);
4660       note_stores (insn, record_hard_reg_sets, &iwrites);
4661       note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
4662 
4663       /* Scan recent previous instructions for dependencies not handled in
4664          hardware.  */
4665       int nops_rqd = 0;
4666       for (int i = oldest; i < oldest + max_waits; i++)
4667 	{
4668 	  struct ilist *prev_insn = &back[i % max_waits];
4669 
4670 	  if (!prev_insn->insn)
4671 	    continue;
4672 
4673 	  /* VALU writes SGPR followed by VMEM reading the same SGPR
4674 	     requires 5 wait states.  */
4675 	  if ((prev_insn->age + nops_rqd) < 5
4676 	      && prev_insn->unit == UNIT_VECTOR
4677 	      && gcn_vmem_insn_p (itype))
4678 	    {
4679 	      HARD_REG_SET regs = prev_insn->writes & ireads;
4680 	      if (hard_reg_set_intersect_p
4681 		  (regs, reg_class_contents[(int) SGPR_REGS]))
4682 		nops_rqd = 5 - prev_insn->age;
4683 	    }
4684 
4685 	  /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
4686 	     requires 5 wait states.  */
4687 	  if ((prev_insn->age + nops_rqd) < 5
4688 	      && prev_insn->unit == UNIT_VECTOR
4689 	      && iunit == UNIT_VECTOR
4690 	      && ((hard_reg_set_intersect_p
4691 		   (prev_insn->writes,
4692 		    reg_class_contents[(int) EXEC_MASK_REG])
4693 		   && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
4694 		  ||
4695 		  (hard_reg_set_intersect_p
4696 		   (prev_insn->writes,
4697 		    reg_class_contents[(int) VCC_CONDITIONAL_REG])
4698 		   && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
4699 	    nops_rqd = 5 - prev_insn->age;
4700 
4701 	  /* VALU writes SGPR/VCC followed by v_{read,write}lane using
4702 	     SGPR/VCC as lane select requires 4 wait states.  */
4703 	  if ((prev_insn->age + nops_rqd) < 4
4704 	      && prev_insn->unit == UNIT_VECTOR
4705 	      && get_attr_laneselect (insn) == LANESELECT_YES)
4706 	    {
4707 	      HARD_REG_SET regs = prev_insn->writes & ireads;
4708 	      if (hard_reg_set_intersect_p
4709 		  (regs, reg_class_contents[(int) SGPR_REGS])
4710 		  || hard_reg_set_intersect_p
4711 		     (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
4712 		nops_rqd = 4 - prev_insn->age;
4713 	    }
4714 
4715 	  /* VALU writes VGPR followed by VALU_DPP reading that VGPR
4716 	     requires 2 wait states.  */
4717 	  if ((prev_insn->age + nops_rqd) < 2
4718 	      && prev_insn->unit == UNIT_VECTOR
4719 	      && itype == TYPE_VOP_DPP)
4720 	    {
4721 	      HARD_REG_SET regs = prev_insn->writes & ireads;
4722 	      if (hard_reg_set_intersect_p
4723 		  (regs, reg_class_contents[(int) VGPR_REGS]))
4724 		nops_rqd = 2 - prev_insn->age;
4725 	    }
4726 
4727 	  /* A store that requires its input registers not be overwritten by
4728 	     the following instruction needs 1 wait state.  */
4729 	  if ((prev_insn->age + nops_rqd) < 1
4730 	      && prev_insn->delayeduse == DELAYEDUSE_YES
4731 	      && ((hard_reg_set_intersect_p
4732 		   (prev_insn->reads, iwrites))))
4733 	    nops_rqd = 1 - prev_insn->age;
4734 	}
4735 
4736       /* Insert the required number of NOPs.  */
4737       for (int i = nops_rqd; i > 0; i--)
4738 	emit_insn_after (gen_nop (), last_insn);
4739 
4740       /* Age the previous instructions.  We can also ignore writes to
4741          registers subsequently overwritten.  */
4742       HARD_REG_SET written;
4743       CLEAR_HARD_REG_SET (written);
4744       for (int i = oldest + max_waits - 1; i > oldest; i--)
4745 	{
4746 	  struct ilist *prev_insn = &back[i % max_waits];
4747 
4748 	  /* Assume all instructions are equivalent to one "wait", the same
4749 	     as s_nop.  This is probably true for SALU, but not VALU (which
4750 	     may take longer), so this is not optimal.  However, AMD do
4751 	     not publish the cycle times for instructions.  */
4752 	  prev_insn->age += 1 + nops_rqd;
4753 
4754 	  written |= iwrites;
4755 	  prev_insn->writes &= ~written;
4756 	}
4757 
4758       /* Track the current instruction as a previous instruction.  */
4759       back[oldest].insn = insn;
4760       back[oldest].unit = iunit;
4761       back[oldest].delayeduse = idelayeduse;
4762       back[oldest].writes = iwrites;
4763       back[oldest].reads = ireads;
4764       back[oldest].age = 0;
4765       oldest = (oldest + 1) % max_waits;
4766 
4767       last_insn = insn;
4768     }
4769 }
4770 
4771 /* }}}  */
4772 /* {{{ OpenACC / OpenMP.  */
4773 
4774 #define GCN_DEFAULT_GANGS 0	/* Choose at runtime.  */
4775 #define GCN_DEFAULT_WORKERS 0	/* Choose at runtime.  */
4776 #define GCN_DEFAULT_VECTORS 1	/* Use autovectorization only, for now.  */
4777 
4778 /* Implement TARGET_GOACC_VALIDATE_DIMS.
4779 
4780    Check the launch dimensions provided for an OpenACC compute
4781    region, or routine.  */
4782 
4783 static bool
4784 gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
4785 			 unsigned /*used*/)
4786 {
4787   bool changed = false;
4788 
4789   /* FIXME: remove -facc-experimental-workers when they're ready.  */
4790   int max_workers = flag_worker_partitioning ? 16 : 1;
4791 
4792   gcc_assert (!flag_worker_partitioning);
4793 
4794   /* The vector size must appear to be 64, to the user, unless this is a
4795      SEQ routine.  The real, internal value is always 1, which means use
4796      autovectorization, but the user should not see that.  */
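  /* For example, an explicit vector_length (32) clause on a compute
     construct is diagnosed by the warning below and then behaves as
     vector_length (64), which is stored internally as 1.  */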
4797   if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4798       && dims[GOMP_DIM_VECTOR] >= 0)
4799     {
4800       if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
4801 	  && dims[GOMP_DIM_VECTOR] != 64)
4802 	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4803 		    OPT_Wopenacc_dims,
4804 		    (dims[GOMP_DIM_VECTOR]
4805 		     ? G_("using vector_length (64), ignoring %d")
4806 		     : G_("using vector_length (64), "
4807 			  "ignoring runtime setting")),
4808 		    dims[GOMP_DIM_VECTOR]);
4809       dims[GOMP_DIM_VECTOR] = 1;
4810       changed = true;
4811     }
4812 
4813   /* Check the num workers is not too large.  */
4814   if (dims[GOMP_DIM_WORKER] > max_workers)
4815     {
4816       warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4817 		  OPT_Wopenacc_dims,
4818 		  "using num_workers (%d), ignoring %d",
4819 		  max_workers, dims[GOMP_DIM_WORKER]);
4820       dims[GOMP_DIM_WORKER] = max_workers;
4821       changed = true;
4822     }
4823 
4824   /* Set global defaults.  */
4825   if (!decl)
4826     {
4827       dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
4828       if (dims[GOMP_DIM_WORKER] < 0)
4829 	dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
4830 				 ? GCN_DEFAULT_WORKERS : 1);
4831       if (dims[GOMP_DIM_GANG] < 0)
4832 	dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
4833       changed = true;
4834     }
4835 
4836   return changed;
4837 }
4838 
4839 /* Helper function for oacc_dim_size instruction.
4840    Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass.  */
4841 
4842 rtx
4843 gcn_oacc_dim_size (int dim)
4844 {
4845   if (dim < 0 || dim > 2)
4846     error ("offload dimension out of range (%d)", dim);
4847 
4848   /* Vectors are a special case.  */
4849   if (dim == 2)
4850     return const1_rtx;		/* Think of this as 1 times 64.  */
4851 
4852   static int offset[] = {
4853     /* Offsets into dispatch packet.  */
4854     12,				/* X dim = Gang / Team / Work-group.  */
4855     20,				/* Z dim = Worker / Thread / Wavefront.  */
4856     16				/* Y dim = Vector / SIMD / Work-item.  */
4857   };
4858   rtx addr = gen_rtx_PLUS (DImode,
4859 			   gen_rtx_REG (DImode,
4860 					cfun->machine->args.
4861 					reg[DISPATCH_PTR_ARG]),
4862 			   GEN_INT (offset[dim]));
4863   return gen_rtx_MEM (SImode, addr);
4864 }
4865 
4866 /* Helper function for oacc_dim_pos instruction.
4867    Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass.  */
4868 
4869 rtx
4870 gcn_oacc_dim_pos (int dim)
4871 {
4872   if (dim < 0 || dim > 2)
4873     error ("offload dimension out of range (%d)", dim);
4874 
4875   static const int reg[] = {
4876     WORKGROUP_ID_X_ARG,		/* Gang / Team / Work-group.  */
4877     WORK_ITEM_ID_Z_ARG,		/* Worker / Thread / Wavefront.  */
4878     WORK_ITEM_ID_Y_ARG		/* Vector / SIMD / Work-item.  */
4879   };
4880 
4881   int reg_num = cfun->machine->args.reg[reg[dim]];
4882 
4883   /* The information must have been requested by the kernel.  */
4884   gcc_assert (reg_num >= 0);
4885 
4886   return gen_rtx_REG (SImode, reg_num);
4887 }
4888 
4889 /* Implement TARGET_GOACC_FORK_JOIN.  */
4890 
4891 static bool
4892 gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
4893 	       bool ARG_UNUSED (is_fork))
4894 {
4895   /* GCN does not use the fork/join concept invented for NVPTX.
4896      Instead we use standard autovectorization.  */
4897   return false;
4898 }
4899 
4900 /* Implement ???????
4901    FIXME make this a real hook.
4902 
4903    Adjust FNDECL such that options inherited from the host compiler
4904    are made appropriate for the accelerator compiler.  */
4905 
4906 void
4907 gcn_fixup_accel_lto_options (tree fndecl)
4908 {
4909   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4910   if (!func_optimize)
4911     return;
4912 
4913   tree old_optimize = build_optimization_node (&global_options);
4914   tree new_optimize;
4915 
4916   /* If the function changed the optimization levels as well as
4917      setting target options, start with the optimizations
4918      specified.  */
4919   if (func_optimize != old_optimize)
4920     cl_optimization_restore (&global_options,
4921 			     TREE_OPTIMIZATION (func_optimize));
4922 
4923   gcn_option_override ();
4924 
4925   /* The target attributes may also change some optimization flags,
4926      so update the optimization options if necessary.  */
4927   new_optimize = build_optimization_node (&global_options);
4928 
4929   if (old_optimize != new_optimize)
4930     {
4931       DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4932       cl_optimization_restore (&global_options,
4933 			       TREE_OPTIMIZATION (old_optimize));
4934     }
4935 }
4936 
4937 /* }}}  */
4938 /* {{{ ASM Output.  */
4939 
4940 /*  Implement TARGET_ASM_FILE_START.
4941 
4942     Print assembler file header text.  */
4943 
4944 static void
4945 output_file_start (void)
4946 {
4947   fprintf (asm_out_file, "\t.text\n");
4948   fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n");
4949   fprintf (asm_out_file, "\t.hsa_code_object_isa\n");	/* Autodetect.  */
4950   fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n");
4951   fprintf (asm_out_file, "\t.text\n");
4952 }
4953 
4954 /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
4955 
4956    Print the initial definition of a function name.
4957 
4958    For GCN kernel entry points this includes all the HSA meta-data, special
4959    alignment constraints that don't apply to regular functions, and magic
4960    comments that pass information to mkoffload.  */
4961 
4962 void
4963 gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
4964 {
4965   int sgpr, vgpr;
4966   bool xnack_enabled = false;
4967   int extra_regs = 0;
4968 
4969   if (cfun && cfun->machine && cfun->machine->normal_function)
4970     {
4971       fputs ("\t.type\t", file);
4972       assemble_name (file, name);
4973       fputs (",@function\n", file);
4974       assemble_name (file, name);
4975       fputs (":\n", file);
4976       return;
4977     }
4978 
4979   /* Determine count of sgpr/vgpr registers by looking for last
4980      one used.  */
4981   for (sgpr = 101; sgpr >= 0; sgpr--)
4982     if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
4983       break;
4984   sgpr++;
4985   for (vgpr = 255; vgpr >= 0; vgpr--)
4986     if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
4987       break;
4988   vgpr++;
4989 
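  /* Account for the special SGPRs that sit above the allocated ones:
     VCC needs 2, flat_scratch implies 4 (including VCC), and XNACK
     would need 6 if it were ever enabled.  */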
4990   if (xnack_enabled)
4991     extra_regs = 6;
4992   if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
4993       || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
4994     extra_regs = 4;
4995   else if (df_regs_ever_live_p (VCC_LO_REG)
4996 	   || df_regs_ever_live_p (VCC_HI_REG))
4997     extra_regs = 2;
4998 
4999   if (!leaf_function_p ())
5000     {
5001       /* We can't know how many registers function calls might use.  */
5002       if (vgpr < MAX_NORMAL_VGPR_COUNT)
5003 	vgpr = MAX_NORMAL_VGPR_COUNT;
5004       if (sgpr + extra_regs < MAX_NORMAL_SGPR_COUNT)
5005 	sgpr = MAX_NORMAL_SGPR_COUNT - extra_regs;
5006     }
5007 
5008   /* GFX8 allocates SGPRs in blocks of 8.
5009      GFX9 uses blocks of 16.  */
5010   int granulated_sgprs;
5011   if (TARGET_GCN3)
5012     granulated_sgprs = (sgpr + extra_regs + 7) / 8 - 1;
5013   else if (TARGET_GCN5)
5014     granulated_sgprs = 2 * ((sgpr + extra_regs + 15) / 16 - 1);
5015   else
5016     gcc_unreachable ();
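  /* For example, with 30 SGPRs in use plus 2 extra for VCC, GCN5 rounds
     up to the next block of 16 and encodes 2 * (32 / 16 - 1) = 2 in
     compute_pgm_rsrc1_sgprs below.  */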
5017 
5018   fputs ("\t.align\t256\n", file);
5019   fputs ("\t.type\t", file);
5020   assemble_name (file, name);
5021   fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
5022   assemble_name (file, name);
5023   fputs ("\n", file);
5024   assemble_name (file, name);
5025   fputs (":\n", file);
5026   fprintf (file, "\t.amd_kernel_code_t\n"
5027 	   "\t\tkernel_code_version_major = 1\n"
5028 	   "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
5029 	   /* "\t\tmachine_version_major = 8\n"
5030 	      "\t\tmachine_version_minor = 0\n"
5031 	      "\t\tmachine_version_stepping = 1\n" */
5032 	   "\t\tkernel_code_entry_byte_offset = 256\n"
5033 	   "\t\tkernel_code_prefetch_byte_size = 0\n"
5034 	   "\t\tmax_scratch_backing_memory_byte_size = 0\n"
5035 	   "\t\tcompute_pgm_rsrc1_vgprs = %i\n"
5036 	   "\t\tcompute_pgm_rsrc1_sgprs = %i\n"
5037 	   "\t\tcompute_pgm_rsrc1_priority = 0\n"
5038 	   "\t\tcompute_pgm_rsrc1_float_mode = 192\n"
5039 	   "\t\tcompute_pgm_rsrc1_priv = 0\n"
5040 	   "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
5041 	   "\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
5042 	   "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
5043 	   /* We enable scratch memory.  */
5044 	   "\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
5045 	   "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
5046 	   "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
5047 	   "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
5048 	   "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
5049 	   "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
5050 	   "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
5051 	   "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
5052 	   "\t\tcompute_pgm_rsrc2_lds_size = 0\n"	/* Set at runtime.  */
5053 	   "\t\tcompute_pgm_rsrc2_excp_en = 0\n",
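	   /* VGPRs are allocated in blocks of four; the field holds the
	      number of blocks minus one.  */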
5054 	   (vgpr - 1) / 4,
5055 	   /* Must match wavefront_sgpr_count.  */
5056 	   granulated_sgprs,
5057 	   /* The total number of SGPR user data registers requested.  This
5058 	      number must match the number of user data registers enabled.  */
5059 	   cfun->machine->args.nsgprs);
5060   int reg = FIRST_SGPR_REG;
5061   for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
5062     {
5063       int reg_first = -1;
5064       int reg_last;
5065       if ((cfun->machine->args.requested & (1 << a))
5066 	  && (gcn_kernel_arg_types[a].fixed_regno < 0))
5067 	{
5068 	  reg_first = reg;
5069 	  reg_last = (reg_first
5070 		      + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
5071 			 / UNITS_PER_WORD) - 1);
5072 	  reg = reg_last + 1;
5073 	}
5074 
5075       if (gcn_kernel_arg_types[a].header_pseudo)
5076 	{
5077 	  fprintf (file, "\t\t%s = %i",
5078 		   gcn_kernel_arg_types[a].header_pseudo,
5079 		   (cfun->machine->args.requested & (1 << a)) != 0);
5080 	  if (reg_first != -1)
5081 	    {
5082 	      fprintf (file, " ; (");
5083 	      for (int i = reg_first; i <= reg_last; ++i)
5084 		{
5085 		  if (i != reg_first)
5086 		    fprintf (file, ", ");
5087 		  fprintf (file, "%s", reg_names[i]);
5088 		}
5089 	      fprintf (file, ")");
5090 	    }
5091 	  fprintf (file, "\n");
5092 	}
5093       else if (gcn_kernel_arg_types[a].fixed_regno >= 0
5094 	       && cfun->machine->args.requested & (1 << a))
5095 	fprintf (file, "\t\t; %s = %i (%s)\n",
5096 		 gcn_kernel_arg_types[a].name,
5097 		 (cfun->machine->args.requested & (1 << a)) != 0,
5098 		 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
5099     }
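  /* Select how many work-item ID VGPRs the hardware initializes:
     0 enables only the X ID, 1 adds Y, and 2 adds Z, according to
     which IDs the kernel requested.  */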
5100   fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
5101 	   (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
5102 	   ? 2
5103 	   : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
5104 	   ? 1 : 0);
5105   fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
5106 	   "\t\tprivate_element_size = 1\n"
5107 	   "\t\tis_ptr64 = 1\n"
5108 	   "\t\tis_dynamic_callstack = 0\n"
5109 	   "\t\tis_debug_enabled = 0\n"
5110 	   "\t\tis_xnack_enabled = %i\n"
5111 	   "\t\tworkitem_private_segment_byte_size = %i\n"
5112 	   "\t\tworkgroup_group_segment_byte_size = %u\n"
5113 	   "\t\tgds_segment_byte_size = 0\n"
5114 	   "\t\tkernarg_segment_byte_size = %i\n"
5115 	   "\t\tworkgroup_fbarrier_count = 0\n"
5116 	   "\t\twavefront_sgpr_count = %i\n"
5117 	   "\t\tworkitem_vgpr_count = %i\n"
5118 	   "\t\treserved_vgpr_first = 0\n"
5119 	   "\t\treserved_vgpr_count = 0\n"
5120 	   "\t\treserved_sgpr_first = 0\n"
5121 	   "\t\treserved_sgpr_count = 0\n"
5122 	   "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
5123 	   "\t\tdebug_private_segment_buffer_sgpr = 0\n"
5124 	   "\t\tkernarg_segment_alignment = %i\n"
5125 	   "\t\tgroup_segment_alignment = 4\n"
5126 	   "\t\tprivate_segment_alignment = %i\n"
5127 	   "\t\twavefront_size = 6\n"
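	   /* wavefront_size is expressed as log2: 2^6 == 64 lanes.  */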
5128 	   "\t\tcall_convention = 0\n"
5129 	   "\t\truntime_loader_kernel_symbol = 0\n"
5130 	   "\t.end_amd_kernel_code_t\n", xnack_enabled,
5131 	   /* workitem_private_segment_bytes_size needs to be
5132 	      one 64th the wave-front stack size.  */
5133 	   stack_size_opt / 64,
5134 	   LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
5135 	   /* Number of scalar registers used by a wavefront.  This
5136 	      includes the special SGPRs for VCC, Flat Scratch (Base,
5137 	      Size) and XNACK (for GFX8 (VI)+).  It does not include the
5138 	      16 SGPR added if a trap handler is enabled.  Must match
5139 	      compute_pgm_rsrc1.sgprs.  */
5140 	   sgpr + extra_regs, vgpr,
5141 	   cfun->machine->kernarg_segment_alignment,
5142 	   crtl->stack_alignment_needed / 8);
5143 
5144   /* This comment is read by mkoffload.  */
5145   if (flag_openacc)
5146     fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5147 	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5148 	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5149 	     oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5150 }
5151 
5152 /* Implement TARGET_ASM_SELECT_SECTION.
5153 
5154    Return the section into which EXP should be placed.  */
5155 
5156 static section *
5157 gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5158 {
5159   if (TREE_TYPE (exp) != error_mark_node
5160       && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5161     {
5162       if (!DECL_P (exp))
5163 	return get_section (".lds_bss",
5164 			    SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5165 			    NULL);
5166 
5167       return get_named_section (exp, ".lds_bss", reloc);
5168     }
5169 
5170   return default_elf_select_section (exp, reloc, align);
5171 }
5172 
5173 /* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5174 
5175    Emits custom text into the assembler file at the head of each function.  */
5176 
5177 static void
5178 gcn_target_asm_function_prologue (FILE *file)
5179 {
5180   machine_function *offsets = gcn_compute_frame_offsets ();
5181 
5182   asm_fprintf (file, "\t; using %s addressing in function\n",
5183 	       offsets->use_flat_addressing ? "flat" : "global");
5184 
5185   if (offsets->normal_function)
5186     {
5187       asm_fprintf (file, "\t; frame pointer needed: %s\n",
5188 		   offsets->need_frame_pointer ? "true" : "false");
5189       asm_fprintf (file, "\t; lr needs saving: %s\n",
5190 		   offsets->lr_needs_saving ? "true" : "false");
5191       asm_fprintf (file, "\t; outgoing args size: %wd\n",
5192 		   offsets->outgoing_args_size);
5193       asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5194       asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5195       asm_fprintf (file, "\t; callee save size: %wd\n",
5196 		   offsets->callee_saves);
5197     }
5198   else
5199     {
5200       asm_fprintf (file, "\t; HSA kernel entry point\n");
5201       asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5202       asm_fprintf (file, "\t; outgoing args size: %wd\n",
5203 		   offsets->outgoing_args_size);
5204 
5205       /* Enable denorms.  */
5206       asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
5207 		   " input and output denorms\n");
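      /* hwreg(1, 4, 4) names the 4-bit FP_DENORM field starting at bit 4
	 of hardware register 1 (MODE); writing 0xf enables all four
	 denorm modes.  */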
5208       asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
5209     }
5210 }
5211 
5212 /* Helper function for print_operand and print_operand_address.
5213 
5214    Print a register as the assembler requires, according to mode and name.  */
5215 
5216 static void
5217 print_reg (FILE *file, rtx x)
5218 {
5219   machine_mode mode = GET_MODE (x);
5220   if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5221       || mode == HFmode || mode == SFmode
5222       || mode == V64SFmode || mode == V64SImode
5223       || mode == V64QImode || mode == V64HImode)
5224     fprintf (file, "%s", reg_names[REGNO (x)]);
5225   else if (mode == DImode || mode == V64DImode
5226 	   || mode == DFmode || mode == V64DFmode)
5227     {
5228       if (SGPR_REGNO_P (REGNO (x)))
5229 	fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5230 		 REGNO (x) - FIRST_SGPR_REG + 1);
5231       else if (VGPR_REGNO_P (REGNO (x)))
5232 	fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5233 		 REGNO (x) - FIRST_VGPR_REG + 1);
5234       else if (REGNO (x) == FLAT_SCRATCH_REG)
5235 	fprintf (file, "flat_scratch");
5236       else if (REGNO (x) == EXEC_REG)
5237 	fprintf (file, "exec");
5238       else if (REGNO (x) == VCC_LO_REG)
5239 	fprintf (file, "vcc");
5240       else
5241 	fprintf (file, "[%s:%s]",
5242 		 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5243     }
5244   else if (mode == TImode)
5245     {
5246       if (SGPR_REGNO_P (REGNO (x)))
5247 	fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5248 		 REGNO (x) - FIRST_SGPR_REG + 3);
5249       else if (VGPR_REGNO_P (REGNO (x)))
5250 	fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5251 		 REGNO (x) - FIRST_VGPR_REG + 3);
5252       else
5253 	gcc_unreachable ();
5254     }
5255   else
5256     gcc_unreachable ();
5257 }
5258 
5259 /* Implement TARGET_SECTION_TYPE_FLAGS.
5260 
5261    Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION.  */
5262 
5263 static unsigned int
5264 gcn_section_type_flags (tree decl, const char *name, int reloc)
5265 {
5266   if (strcmp (name, ".lds_bss") == 0)
5267     return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5268 
5269   return default_section_type_flags (decl, name, reloc);
5270 }
5271 
5272 /* Helper function for gcn_asm_output_symbol_ref.
5273 
5274    FIXME: If we want to have propagation blocks allocated separately and
5275    statically like this, it would be better done via symbol refs and the
5276    assembler/linker.  This is a temporary hack.  */
5277 
5278 static void
5279 gcn_print_lds_decl (FILE *f, tree var)
5280 {
5281   int *offset;
5282   machine_function *machfun = cfun->machine;
5283 
5284   if ((offset = machfun->lds_allocs->get (var)))
5285     fprintf (f, "%u", (unsigned) *offset);
5286   else
5287     {
5288       unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5289       tree type = TREE_TYPE (var);
5290       unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5291       if (size > align && size > 4 && align < 8)
5292 	align = 8;
5293 
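      /* Round the running LDS allocation up to the chosen alignment
	 before recording this variable's offset.  */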
5294       machfun->lds_allocated = ((machfun->lds_allocated + align - 1)
5295 				& ~(align - 1));
5296 
5297       machfun->lds_allocs->put (var, machfun->lds_allocated);
5298       fprintf (f, "%u", machfun->lds_allocated);
5299       machfun->lds_allocated += size;
5300       if (machfun->lds_allocated > LDS_SIZE)
5301 	error ("local data-share memory exhausted");
5302     }
5303 }
5304 
5305 /* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h.  */
5306 
5307 void
5308 gcn_asm_output_symbol_ref (FILE *file, rtx x)
5309 {
5310   tree decl;
5311   if (cfun
5312       && (decl = SYMBOL_REF_DECL (x)) != 0
5313       && TREE_CODE (decl) == VAR_DECL
5314       && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5315     {
5316       /* LDS symbols (emitted using this hook) are only used at present
5317          to propagate worker values from an active thread to neutered
5318          threads.  Use the same offset for each such block, but don't
5319          use zero because null pointers are used to identify the active
5320          thread in GOACC_single_copy_start calls.  */
5321       gcn_print_lds_decl (file, decl);
5322     }
5323   else
5324     {
5325       assemble_name (file, XSTR (x, 0));
5326       /* FIXME: See above -- this condition is unreachable.  */
5327       if (cfun
5328 	  && (decl = SYMBOL_REF_DECL (x)) != 0
5329 	  && TREE_CODE (decl) == VAR_DECL
5330 	  && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5331 	fputs ("@abs32", file);
5332     }
5333 }
5334 
5335 /* Implement TARGET_CONSTANT_ALIGNMENT.
5336 
5337    Returns the alignment in bits of a constant that is being placed in memory.
5338    CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5339    would ordinarily have.  */
5340 
5341 static HOST_WIDE_INT
5342 gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5343 			HOST_WIDE_INT basic_align)
5344 {
5345   return basic_align > 128 ? basic_align : 128;
5346 }
5347 
5348 /* Implement PRINT_OPERAND_ADDRESS via gcn.h.  */
5349 
5350 void
5351 print_operand_address (FILE *file, rtx mem)
5352 {
5353   gcc_assert (MEM_P (mem));
5354 
5355   rtx reg;
5356   rtx offset;
5357   addr_space_t as = MEM_ADDR_SPACE (mem);
5358   rtx addr = XEXP (mem, 0);
5359   gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5360 
5361   if (AS_SCRATCH_P (as))
5362     switch (GET_CODE (addr))
5363       {
5364       case REG:
5365 	print_reg (file, addr);
5366 	break;
5367 
5368       case PLUS:
5369 	reg = XEXP (addr, 0);
5370 	offset = XEXP (addr, 1);
5371 	print_reg (file, reg);
5372 	if (GET_CODE (offset) == CONST_INT)
5373 	  fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5374 	else
5375 	  abort ();
5376 	break;
5377 
5378       default:
5379 	debug_rtx (addr);
5380 	abort ();
5381       }
5382   else if (AS_ANY_FLAT_P (as))
5383     {
5384       if (GET_CODE (addr) == REG)
5385 	print_reg (file, addr);
5386       else
5387 	{
5388 	  gcc_assert (TARGET_GCN5_PLUS);
5389 	  print_reg (file, XEXP (addr, 0));
5390 	}
5391     }
5392   else if (AS_GLOBAL_P (as))
5393     {
5394       gcc_assert (TARGET_GCN5_PLUS);
5395 
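      /* Only the VGPR component of a global address is printed here;
	 the SGPR base and any constant offset are emitted separately by
	 the 'O' operand code.  */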
5396       rtx base = addr;
5397       rtx vgpr_offset = NULL_RTX;
5398 
5399       if (GET_CODE (addr) == PLUS)
5400 	{
5401 	  base = XEXP (addr, 0);
5402 
5403 	  if (GET_CODE (base) == PLUS)
5404 	    {
5405 	      /* (SGPR + VGPR) + CONST  */
5406 	      vgpr_offset = XEXP (base, 1);
5407 	      base = XEXP (base, 0);
5408 	    }
5409 	  else
5410 	    {
5411 	      rtx offset = XEXP (addr, 1);
5412 
5413 	      if (REG_P (offset))
5414 		/* SGPR + VGPR  */
5415 		vgpr_offset = offset;
5416 	      else if (CONST_INT_P (offset))
5417 		/* VGPR + CONST or SGPR + CONST  */
5418 		;
5419 	      else
5420 		output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5421 	    }
5422 	}
5423 
5424       if (REG_P (base))
5425 	{
5426 	  if (VGPR_REGNO_P (REGNO (base)))
5427 	    print_reg (file, base);
5428 	  else if (SGPR_REGNO_P (REGNO (base)))
5429 	    {
5430 	      /* The assembler requires a 64-bit VGPR pair here, even though
5431 	         the offset should be only 32-bit.  */
5432 	      if (vgpr_offset == NULL_RTX)
5433 		/* In this case, the vector offset is zero, so we use the first
5434 		   lane of v1, which is initialized to zero.  */
5435 		fprintf (file, "v[1:2]");
5436 	      else if (REG_P (vgpr_offset)
5437 		       && VGPR_REGNO_P (REGNO (vgpr_offset)))
5438 		{
5439 		  fprintf (file, "v[%d:%d]",
5440 			   REGNO (vgpr_offset) - FIRST_VGPR_REG,
5441 			   REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5442 		}
5443 	      else
5444 		output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5445 	    }
5446 	}
5447       else
5448 	output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5449     }
5450   else if (AS_ANY_DS_P (as))
5451     switch (GET_CODE (addr))
5452       {
5453       case REG:
5454 	print_reg (file, addr);
5455 	break;
5456 
5457       case PLUS:
5458 	reg = XEXP (addr, 0);
5459 	print_reg (file, reg);
5460 	break;
5461 
5462       default:
5463 	debug_rtx (addr);
5464 	abort ();
5465       }
5466   else
5467     switch (GET_CODE (addr))
5468       {
5469       case REG:
5470 	print_reg (file, addr);
5471 	fprintf (file, ", 0");
5472 	break;
5473 
5474       case PLUS:
5475 	reg = XEXP (addr, 0);
5476 	offset = XEXP (addr, 1);
5477 	print_reg (file, reg);
5478 	fprintf (file, ", ");
5479 	if (GET_CODE (offset) == REG)
5480 	  print_reg (file, reg);
5481 	else if (GET_CODE (offset) == CONST_INT)
5482 	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5483 	else
5484 	  abort ();
5485 	break;
5486 
5487       default:
5488 	debug_rtx (addr);
5489 	abort ();
5490       }
5491 }
5492 
5493 /* Implement PRINT_OPERAND via gcn.h.
5494 
5495    b - print operand size as untyped operand (b8/b16/b32/b64)
5496    B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5497    i - print operand size as untyped operand (i8/i16/i32/i64)
5498    I - print operand size as SI/DI untyped operand (i32/i64)
5499    u - print operand size as untyped operand (u8/u16/u32/u64)
5500    U - print operand size as SI/DI untyped operand (u32/u64)
5501    o - print operand size as memory access size for loads
5502        (ubyte/ushort/dword/dwordx2/wordx3/dwordx4)
5503    s - print operand size as memory access size for stores
5504        (byte/short/dword/dwordx2/wordx3/dwordx4)
5505    C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5506    c - print inverse conditional code for s_cbranch
5507    D - print conditional code for s_cmp (eq_u64/lg_u64...)
5508    E - print conditional code for v_cmp (eq_u64/ne_u64...)
5509    A - print address in formatting suitable for given address space.
5510    O - print offset:n for data share operations.
5511    ^ - print "_co" suffix for GCN5 mnemonics
5512    g - print "glc", if appropriate for given MEM
   e - print "sext(...)" around the operand
   L - print the low-order part of a multi-word operand
   H - print the high-order part of a multi-word operand
   R - print a scalar register number as an integer (temporary hack)
   V - print a vector register number as an integer (temporary hack)
5513  */
5514 
5515 void
5516 print_operand (FILE *file, rtx x, int code)
5517 {
5518   int xcode = x ? GET_CODE (x) : 0;
5519   bool invert = false;
5520   switch (code)
5521     {
5522       /* Instructions have the following suffixes.
5523          If there are two suffixes, the first is the destination type,
5524 	 and the second is the source type.
5525 
5526          B32 Bitfield (untyped data) 32-bit
5527          B64 Bitfield (untyped data) 64-bit
5528          F16 floating-point 16-bit
5529          F32 floating-point 32-bit (IEEE 754 single-precision float)
5530          F64 floating-point 64-bit (IEEE 754 double-precision float)
5531          I16 signed 16-bit integer
5532          I32 signed 32-bit integer
5533          I64 signed 64-bit integer
5534          U16 unsigned 16-bit integer
5535          U32 unsigned 32-bit integer
5536          U64 unsigned 64-bit integer  */
5537 
5538       /* Print operand size as untyped suffix.  */
5539     case 'b':
5540       {
5541 	const char *s = "";
5542 	machine_mode mode = GET_MODE (x);
5543 	if (VECTOR_MODE_P (mode))
5544 	  mode = GET_MODE_INNER (mode);
5545 	switch (GET_MODE_SIZE (mode))
5546 	  {
5547 	  case 1:
5548 	    s = "_b8";
5549 	    break;
5550 	  case 2:
5551 	    s = "_b16";
5552 	    break;
5553 	  case 4:
5554 	    s = "_b32";
5555 	    break;
5556 	  case 8:
5557 	    s = "_b64";
5558 	    break;
5559 	  default:
5560 	    output_operand_lossage ("invalid operand %%xn code");
5561 	    return;
5562 	  }
5563 	fputs (s, file);
5564       }
5565       return;
5566     case 'B':
5567       {
5568 	const char *s = "";
5569 	machine_mode mode = GET_MODE (x);
5570 	if (VECTOR_MODE_P (mode))
5571 	  mode = GET_MODE_INNER (mode);
5572 	switch (GET_MODE_SIZE (mode))
5573 	  {
5574 	  case 1:
5575 	  case 2:
5576 	  case 4:
5577 	    s = "_b32";
5578 	    break;
5579 	  case 8:
5580 	    s = "_b64";
5581 	    break;
5582 	  default:
5583 	    output_operand_lossage ("invalid operand %%xn code");
5584 	    return;
5585 	  }
5586 	fputs (s, file);
5587       }
5588       return;
5589     case 'e':
5590       fputs ("sext(", file);
5591       print_operand (file, x, 0);
5592       fputs (")", file);
5593       return;
5594     case 'i':
5595     case 'I':
5596     case 'u':
5597     case 'U':
5598       {
5599 	bool signed_p = code == 'i';
5600 	bool min32_p = code == 'I' || code == 'U';
5601 	const char *s = "";
5602 	machine_mode mode = GET_MODE (x);
5603 	if (VECTOR_MODE_P (mode))
5604 	  mode = GET_MODE_INNER (mode);
5605 	if (mode == VOIDmode)
5606 	  switch (GET_CODE (x))
5607 	    {
5608 	    case CONST_INT:
5609 	      s = signed_p ? "_i32" : "_u32";
5610 	      break;
5611 	    case CONST_DOUBLE:
5612 	      s = "_f64";
5613 	      break;
5614 	    default:
5615 	      output_operand_lossage ("invalid operand %%xn code");
5616 	      return;
5617 	    }
5618 	else if (FLOAT_MODE_P (mode))
5619 	  switch (GET_MODE_SIZE (mode))
5620 	    {
5621 	    case 2:
5622 	      s = "_f16";
5623 	      break;
5624 	    case 4:
5625 	      s = "_f32";
5626 	      break;
5627 	    case 8:
5628 	      s = "_f64";
5629 	      break;
5630 	    default:
5631 	      output_operand_lossage ("invalid operand %%xn code");
5632 	      return;
5633 	    }
5634 	else if (min32_p)
5635 	  switch (GET_MODE_SIZE (mode))
5636 	    {
5637 	    case 1:
5638 	    case 2:
5639 	    case 4:
5640 	      s = signed_p ? "_i32" : "_u32";
5641 	      break;
5642 	    case 8:
5643 	      s = signed_p ? "_i64" : "_u64";
5644 	      break;
5645 	    default:
5646 	      output_operand_lossage ("invalid operand %%xn code");
5647 	      return;
5648 	    }
5649 	else
5650 	  switch (GET_MODE_SIZE (mode))
5651 	    {
5652 	    case 1:
5653 	      s = signed_p ? "_i8" : "_u8";
5654 	      break;
5655 	    case 2:
5656 	      s = signed_p ? "_i16" : "_u16";
5657 	      break;
5658 	    case 4:
5659 	      s = signed_p ? "_i32" : "_u32";
5660 	      break;
5661 	    case 8:
5662 	      s = signed_p ? "_i64" : "_u64";
5663 	      break;
5664 	    default:
5665 	      output_operand_lossage ("invalid operand %%xn code");
5666 	      return;
5667 	    }
5668 	fputs (s, file);
5669       }
5670       return;
5671       /* Print operand size as untyped suffix.  */
5672     case 'o':
5673       {
5674 	const char *s = 0;
5675 	switch (GET_MODE_SIZE (GET_MODE (x)))
5676 	  {
5677 	  case 1:
5678 	    s = "_ubyte";
5679 	    break;
5680 	  case 2:
5681 	    s = "_ushort";
5682 	    break;
5683 	  /* The following are full-vector variants.  */
5684 	  case 64:
5685 	    s = "_ubyte";
5686 	    break;
5687 	  case 128:
5688 	    s = "_ushort";
5689 	    break;
5690 	  }
5691 
5692 	if (s)
5693 	  {
5694 	    fputs (s, file);
5695 	    return;
5696 	  }
5697 
5698 	/* Fall-through - the other cases for 'o' are the same as for 's'.  */
5699 	gcc_fallthrough();
5700       }
5701     case 's':
5702       {
5703 	const char *s = "";
5704 	switch (GET_MODE_SIZE (GET_MODE (x)))
5705 	  {
5706 	  case 1:
5707 	    s = "_byte";
5708 	    break;
5709 	  case 2:
5710 	    s = "_short";
5711 	    break;
5712 	  case 4:
5713 	    s = "_dword";
5714 	    break;
5715 	  case 8:
5716 	    s = "_dwordx2";
5717 	    break;
5718 	  case 12:
5719 	    s = "_dwordx3";
5720 	    break;
5721 	  case 16:
5722 	    s = "_dwordx4";
5723 	    break;
5724 	  case 32:
5725 	    s = "_dwordx8";
5726 	    break;
5727 	  case 64:
5728 	    s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
5729 	    break;
5730 	  /* The following are full-vector variants.  */
5731 	  case 128:
5732 	    s = "_short";
5733 	    break;
5734 	  case 256:
5735 	    s = "_dword";
5736 	    break;
5737 	  case 512:
5738 	    s = "_dwordx2";
5739 	    break;
5740 	  default:
5741 	    output_operand_lossage ("invalid operand %%xn code");
5742 	    return;
5743 	  }
5744 	fputs (s, file);
5745       }
5746       return;
5747     case 'A':
5748       if (xcode != MEM)
5749 	{
5750 	  output_operand_lossage ("invalid %%xn code");
5751 	  return;
5752 	}
5753       print_operand_address (file, x);
5754       return;
5755     case 'O':
5756       {
5757 	if (xcode != MEM)
5758 	  {
5759 	    output_operand_lossage ("invalid %%xn code");
5760 	    return;
5761 	  }
5762 	if (AS_GDS_P (MEM_ADDR_SPACE (x)))
5763 	  fprintf (file, " gds");
5764 
5765 	rtx x0 = XEXP (x, 0);
5766 	if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
5767 	  {
5768 	    gcc_assert (TARGET_GCN5_PLUS);
5769 
5770 	    fprintf (file, ", ");
5771 
5772 	    rtx base = x0;
5773 	    rtx const_offset = NULL_RTX;
5774 
5775 	    if (GET_CODE (base) == PLUS)
5776 	      {
5777 		rtx offset = XEXP (x0, 1);
5778 		base = XEXP (x0, 0);
5779 
5780 		if (GET_CODE (base) == PLUS)
5781 		  /* (SGPR + VGPR) + CONST  */
5782 		  /* Ignore the VGPR offset for this operand.  */
5783 		  base = XEXP (base, 0);
5784 
5785 		if (CONST_INT_P (offset))
5786 		  const_offset = XEXP (x0, 1);
5787 		else if (REG_P (offset))
5788 		  /* SGPR + VGPR  */
5789 		  /* Ignore the VGPR offset for this operand.  */
5790 		  ;
5791 		else
5792 		  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5793 	      }
5794 
5795 	    if (REG_P (base))
5796 	      {
5797 		if (VGPR_REGNO_P (REGNO (base)))
5798 		  /* The VGPR address is specified in the %A operand.  */
5799 		  fprintf (file, "off");
5800 		else if (SGPR_REGNO_P (REGNO (base)))
5801 		  print_reg (file, base);
5802 		else
5803 		  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5804 	      }
5805 	    else
5806 	      output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5807 
5808 	    if (const_offset != NULL_RTX)
5809 	      fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
5810 		       INTVAL (const_offset));
5811 
5812 	    return;
5813 	  }
5814 
5815 	if (GET_CODE (x0) == REG)
5816 	  return;
5817 	if (GET_CODE (x0) != PLUS)
5818 	  {
5819 	    output_operand_lossage ("invalid %%xn code");
5820 	    return;
5821 	  }
5822 	rtx val = XEXP (x0, 1);
5823 	if (GET_CODE (val) == CONST_VECTOR)
5824 	  val = CONST_VECTOR_ELT (val, 0);
5825 	if (GET_CODE (val) != CONST_INT)
5826 	  {
5827 	    output_operand_lossage ("invalid %%xn code");
5828 	    return;
5829 	  }
5830 	fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
5831 
5832       }
5833       return;
5834     case 'c':
5835       invert = true;
5836       /* Fall through.  */
5837     case 'C':
5838       {
5839 	const char *s;
5840 	bool num = false;
5841 	if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
5842 	  {
5843 	    output_operand_lossage ("invalid %%xn code");
5844 	    return;
5845 	  }
5846 	switch (REGNO (XEXP (x, 0)))
5847 	  {
5848 	  case VCC_REG:
5849 	  case VCCZ_REG:
5850 	    s = "_vcc";
5851 	    break;
5852 	  case SCC_REG:
5853 	    /* For some reason llvm-mc insists on scc0 instead of sccz.  */
5854 	    num = true;
5855 	    s = "_scc";
5856 	    break;
5857 	  case EXECZ_REG:
5858 	    s = "_exec";
5859 	    break;
5860 	  default:
5861 	    output_operand_lossage ("invalid %%xn code");
5862 	    return;
5863 	  }
5864 	fputs (s, file);
5865 	if (xcode == (invert ? NE : EQ))
5866 	  fputc (num ? '0' : 'z', file);
5867 	else
5868 	  fputs (num ? "1" : "nz", file);
5869 	return;
5870       }
5871     case 'D':
5872       {
5873 	const char *s;
5874 	bool cmp_signed = false;
5875 	switch (xcode)
5876 	  {
5877 	  case EQ:
5878 	    s = "_eq_";
5879 	    break;
5880 	  case NE:
5881 	    s = "_lg_";
5882 	    break;
5883 	  case LT:
5884 	    s = "_lt_";
5885 	    cmp_signed = true;
5886 	    break;
5887 	  case LE:
5888 	    s = "_le_";
5889 	    cmp_signed = true;
5890 	    break;
5891 	  case GT:
5892 	    s = "_gt_";
5893 	    cmp_signed = true;
5894 	    break;
5895 	  case GE:
5896 	    s = "_ge_";
5897 	    cmp_signed = true;
5898 	    break;
5899 	  case LTU:
5900 	    s = "_lt_";
5901 	    break;
5902 	  case LEU:
5903 	    s = "_le_";
5904 	    break;
5905 	  case GTU:
5906 	    s = "_gt_";
5907 	    break;
5908 	  case GEU:
5909 	    s = "_ge_";
5910 	    break;
5911 	  default:
5912 	    output_operand_lossage ("invalid %%xn code");
5913 	    return;
5914 	  }
5915 	fputs (s, file);
5916 	fputc (cmp_signed ? 'i' : 'u', file);
5917 
5918 	machine_mode mode = GET_MODE (XEXP (x, 0));
5919 
5920 	if (mode == VOIDmode)
5921 	  mode = GET_MODE (XEXP (x, 1));
5922 
5923 	/* If both sides are constants, then assume the instruction is in
5924 	   SImode since s_cmp can only do integer compares.  */
5925 	if (mode == VOIDmode)
5926 	  mode = SImode;
5927 
5928 	switch (GET_MODE_SIZE (mode))
5929 	  {
5930 	  case 4:
5931 	    s = "32";
5932 	    break;
5933 	  case 8:
5934 	    s = "64";
5935 	    break;
5936 	  default:
5937 	    output_operand_lossage ("invalid operand %%xn code");
5938 	    return;
5939 	  }
5940 	fputs (s, file);
5941 	return;
5942       }
5943     case 'E':
5944       {
5945 	const char *s;
5946 	bool cmp_signed = false;
5947 	machine_mode mode = GET_MODE (XEXP (x, 0));
5948 
5949 	if (mode == VOIDmode)
5950 	  mode = GET_MODE (XEXP (x, 1));
5951 
5952 	/* If both sides are constants, assume the instruction is in SFmode
5953 	   if either operand is floating point, otherwise assume SImode.  */
5954 	if (mode == VOIDmode)
5955 	  {
5956 	    if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
5957 		|| GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
5958 	      mode = SFmode;
5959 	    else
5960 	      mode = SImode;
5961 	  }
5962 
5963 	/* Use the same format code for vector comparisons.  */
5964 	if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
5965 	    || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
5966 	  mode = GET_MODE_INNER (mode);
5967 
5968 	bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
5969 
5970 	switch (xcode)
5971 	  {
5972 	  case EQ:
5973 	    s = "_eq_";
5974 	    break;
5975 	  case NE:
5976 	    s = float_p ? "_neq_" : "_ne_";
5977 	    break;
5978 	  case LT:
5979 	    s = "_lt_";
5980 	    cmp_signed = true;
5981 	    break;
5982 	  case LE:
5983 	    s = "_le_";
5984 	    cmp_signed = true;
5985 	    break;
5986 	  case GT:
5987 	    s = "_gt_";
5988 	    cmp_signed = true;
5989 	    break;
5990 	  case GE:
5991 	    s = "_ge_";
5992 	    cmp_signed = true;
5993 	    break;
5994 	  case LTU:
5995 	    s = "_lt_";
5996 	    break;
5997 	  case LEU:
5998 	    s = "_le_";
5999 	    break;
6000 	  case GTU:
6001 	    s = "_gt_";
6002 	    break;
6003 	  case GEU:
6004 	    s = "_ge_";
6005 	    break;
6006 	  case ORDERED:
6007 	    s = "_o_";
6008 	    break;
6009 	  case UNORDERED:
6010 	    s = "_u_";
6011 	    break;
6012 	  case UNEQ:
6013 	    s = "_nlg_";
6014 	    break;
6015 	  case UNGE:
6016 	    s = "_nlt_";
6017 	    break;
6018 	  case UNGT:
6019 	    s = "_nle_";
6020 	    break;
6021 	  case UNLE:
6022 	    s = "_ngt_";
6023 	    break;
6024 	  case UNLT:
6025 	    s = "_nge_";
6026 	    break;
6027 	  case LTGT:
6028 	    s = "_lg_";
6029 	    break;
6030 	  default:
6031 	    output_operand_lossage ("invalid %%xn code");
6032 	    return;
6033 	  }
6034 	fputs (s, file);
6035 	fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
6036 
6037 	switch (GET_MODE_SIZE (mode))
6038 	  {
6039 	  case 1:
6040 	    output_operand_lossage ("operand %%xn code invalid for QImode");
6041 	    return;
6042 	  case 2:
6043 	    s = "16";
6044 	    break;
6045 	  case 4:
6046 	    s = "32";
6047 	    break;
6048 	  case 8:
6049 	    s = "64";
6050 	    break;
6051 	  default:
6052 	    output_operand_lossage ("invalid operand %%xn code");
6053 	    return;
6054 	  }
6055 	fputs (s, file);
6056 	return;
6057       }
6058     case 'L':
6059       print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
6060       return;
6061     case 'H':
6062       print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
6063       return;
6064     case 'R':
6065       /* Print a scalar register number as an integer.  Temporary hack.  */
6066       gcc_assert (REG_P (x));
6067       fprintf (file, "%u", (int) REGNO (x));
6068       return;
6069     case 'V':
6070       /* Print a vector register number as an integer.  Temporary hack.  */
6071       gcc_assert (REG_P (x));
6072       fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
6073       return;
6074     case 0:
6075       if (xcode == REG)
6076 	print_reg (file, x);
6077       else if (xcode == MEM)
6078 	output_address (GET_MODE (x), x);
6079       else if (xcode == CONST_INT)
6080 	fprintf (file, "%i", (int) INTVAL (x));
6081       else if (xcode == CONST_VECTOR)
6082 	print_operand (file, CONST_VECTOR_ELT (x, 0), code);
6083       else if (xcode == CONST_DOUBLE)
6084 	{
6085 	  const char *str;
6086 	  switch (gcn_inline_fp_constant_p (x, false))
6087 	    {
6088 	    case 240:
6089 	      str = "0.5";
6090 	      break;
6091 	    case 241:
6092 	      str = "-0.5";
6093 	      break;
6094 	    case 242:
6095 	      str = "1.0";
6096 	      break;
6097 	    case 243:
6098 	      str = "-1.0";
6099 	      break;
6100 	    case 244:
6101 	      str = "2.0";
6102 	      break;
6103 	    case 245:
6104 	      str = "-2.0";
6105 	      break;
6106 	    case 246:
6107 	      str = "4.0";
6108 	      break;
6109 	    case 247:
6110 	      str = "-4.0";
6111 	      break;
6112 	    case 248:
6113 	      str = "1/pi";
6114 	      break;
6115 	    default:
6116 	      rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
6117 					    ? DImode : SImode,
6118 					    x, GET_MODE (x), 0);
6119 	      if (ix)
6120 		print_operand (file, ix, code);
6121 	      else
6122 		output_operand_lossage ("invalid fp constant");
6123 	      return;
6124 	      break;
6125 	    }
6126 	  fputs (str, file);
6127 	  return;
6128 	}
6129       else
6130 	output_addr_const (file, x);
6131       return;
6132     case '^':
6133       if (TARGET_GCN5_PLUS)
6134 	fputs ("_co", file);
6135       return;
6136     case 'g':
6137       gcc_assert (xcode == MEM);
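      /* The "glc" (globally coherent) bit makes the access bypass the
	 L1 cache, which is what we rely on for volatile MEMs.  */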
6138       if (MEM_VOLATILE_P (x))
6139 	fputs (" glc", file);
6140       return;
6141     default:
6142       output_operand_lossage ("invalid %%xn code");
6143     }
6144   gcc_unreachable ();
6145 }
6146 
6147 /* }}}  */
6148 /* {{{ TARGET hook overrides.  */
6149 
6150 #undef  TARGET_ADDR_SPACE_ADDRESS_MODE
6151 #define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
6152 #undef  TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
6153 #define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
6154   gcn_addr_space_legitimate_address_p
6155 #undef  TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
6156 #define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
6157 #undef  TARGET_ADDR_SPACE_POINTER_MODE
6158 #define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
6159 #undef  TARGET_ADDR_SPACE_SUBSET_P
6160 #define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
6161 #undef  TARGET_ADDR_SPACE_CONVERT
6162 #define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
6163 #undef  TARGET_ARG_PARTIAL_BYTES
6164 #define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
6165 #undef  TARGET_ASM_ALIGNED_DI_OP
6166 #define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
6167 #undef  TARGET_ASM_FILE_START
6168 #define TARGET_ASM_FILE_START output_file_start
6169 #undef  TARGET_ASM_FUNCTION_PROLOGUE
6170 #define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6171 #undef  TARGET_ASM_SELECT_SECTION
6172 #define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6173 #undef  TARGET_ASM_TRAMPOLINE_TEMPLATE
6174 #define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6175 #undef  TARGET_ATTRIBUTE_TABLE
6176 #define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6177 #undef  TARGET_BUILTIN_DECL
6178 #define TARGET_BUILTIN_DECL gcn_builtin_decl
6179 #undef  TARGET_CAN_CHANGE_MODE_CLASS
6180 #define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6181 #undef  TARGET_CAN_ELIMINATE
6182 #define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6183 #undef  TARGET_CANNOT_COPY_INSN_P
6184 #define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6185 #undef  TARGET_CLASS_LIKELY_SPILLED_P
6186 #define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6187 #undef  TARGET_CLASS_MAX_NREGS
6188 #define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6189 #undef  TARGET_CONDITIONAL_REGISTER_USAGE
6190 #define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6191 #undef  TARGET_CONSTANT_ALIGNMENT
6192 #define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6193 #undef  TARGET_DEBUG_UNWIND_INFO
6194 #define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6195 #undef  TARGET_EMUTLS_VAR_INIT
6196 #define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6197 #undef  TARGET_EXPAND_BUILTIN
6198 #define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6199 #undef  TARGET_FUNCTION_ARG
6200 #undef  TARGET_FUNCTION_ARG_ADVANCE
6201 #define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6202 #define TARGET_FUNCTION_ARG gcn_function_arg
6203 #undef  TARGET_FUNCTION_VALUE
6204 #define TARGET_FUNCTION_VALUE gcn_function_value
6205 #undef  TARGET_FUNCTION_VALUE_REGNO_P
6206 #define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6207 #undef  TARGET_GIMPLIFY_VA_ARG_EXPR
6208 #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6209 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6210 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
6211 #undef  TARGET_GOACC_ADJUST_PROPAGATION_RECORD
6212 #define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
6213   gcn_goacc_adjust_propagation_record
6214 #undef  TARGET_GOACC_ADJUST_GANGPRIVATE_DECL
6215 #define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl
6216 #undef  TARGET_GOACC_FORK_JOIN
6217 #define TARGET_GOACC_FORK_JOIN gcn_fork_join
6218 #undef  TARGET_GOACC_REDUCTION
6219 #define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6220 #undef  TARGET_GOACC_VALIDATE_DIMS
6221 #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6222 #undef  TARGET_HARD_REGNO_MODE_OK
6223 #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6224 #undef  TARGET_HARD_REGNO_NREGS
6225 #define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6226 #undef  TARGET_HAVE_SPECULATION_SAFE_VALUE
6227 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6228 #undef  TARGET_INIT_BUILTINS
6229 #define TARGET_INIT_BUILTINS gcn_init_builtins
6230 #undef  TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6231 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6232   gcn_ira_change_pseudo_allocno_class
6233 #undef  TARGET_LEGITIMATE_CONSTANT_P
6234 #define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6235 #undef  TARGET_LRA_P
6236 #define TARGET_LRA_P hook_bool_void_true
6237 #undef  TARGET_MACHINE_DEPENDENT_REORG
6238 #define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6239 #undef  TARGET_MEMORY_MOVE_COST
6240 #define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6241 #undef  TARGET_MODES_TIEABLE_P
6242 #define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6243 #undef  TARGET_OPTION_OVERRIDE
6244 #define TARGET_OPTION_OVERRIDE gcn_option_override
6245 #undef  TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6246 #define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6247   gcn_pretend_outgoing_varargs_named
6248 #undef  TARGET_PROMOTE_FUNCTION_MODE
6249 #define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6250 #undef  TARGET_REGISTER_MOVE_COST
6251 #define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6252 #undef  TARGET_RETURN_IN_MEMORY
6253 #define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6254 #undef  TARGET_RTX_COSTS
6255 #define TARGET_RTX_COSTS gcn_rtx_costs
6256 #undef  TARGET_SECONDARY_RELOAD
6257 #define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6258 #undef  TARGET_SECTION_TYPE_FLAGS
6259 #define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6260 #undef  TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6261 #define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6262   gcn_small_register_classes_for_mode_p
6263 #undef  TARGET_SPILL_CLASS
6264 #define TARGET_SPILL_CLASS gcn_spill_class
6265 #undef  TARGET_STRICT_ARGUMENT_NAMING
6266 #define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6267 #undef  TARGET_TRAMPOLINE_INIT
6268 #define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6269 #undef  TARGET_TRULY_NOOP_TRUNCATION
6270 #define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6271 #undef  TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6272 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6273 #undef  TARGET_VECTORIZE_GET_MASK_MODE
6274 #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6275 #undef  TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6276 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6277 #undef  TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6278 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6279   gcn_preferred_vector_alignment
6280 #undef  TARGET_VECTORIZE_RELATED_MODE
6281 #define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
6282 #undef  TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6283 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6284   gcn_vectorize_support_vector_misalignment
6285 #undef  TARGET_VECTORIZE_VEC_PERM_CONST
6286 #define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6287 #undef  TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6288 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6289   gcn_vector_alignment_reachable
6290 #undef  TARGET_VECTOR_MODE_SUPPORTED_P
6291 #define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6292 
6293 struct gcc_target targetm = TARGET_INITIALIZER;
6294 
6295 #include "gt-gcn.h"
6296 /* }}}  */
6297