1 /* Copyright (C) 2016-2020 Free Software Foundation, Inc.
2
3 This file is free software; you can redistribute it and/or modify it under
4 the terms of the GNU General Public License as published by the Free
5 Software Foundation; either version 3 of the License, or (at your option)
6 any later version.
7
8 This file is distributed in the hope that it will be useful, but WITHOUT
9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11 for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with GCC; see the file COPYING3. If not see
15 <http://www.gnu.org/licenses/>. */
16
17 /* {{{ Includes. */
18
19 /* We want GET_MODE_SIZE et al to return integers, please. */
20 #define IN_TARGET_CODE 1
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "memmodel.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic-core.h"
38 #include "insn-attr.h"
39 #include "fold-const.h"
40 #include "calls.h"
41 #include "explow.h"
42 #include "expr.h"
43 #include "output.h"
44 #include "cfgrtl.h"
45 #include "langhooks.h"
46 #include "builtins.h"
47 #include "omp-general.h"
48 #include "print-rtl.h"
49 #include "attribs.h"
50 #include "varasm.h"
51 #include "intl.h"
52 #include "rtl-iter.h"
53
54 /* This file should be included last. */
55 #include "target-def.h"
56
57 /* }}} */
58 /* {{{ Global variables. */
59
60 /* Constants used by FP instructions. */
61
62 static REAL_VALUE_TYPE dconst4, dconst1over2pi;
63 static bool ext_gcn_constants_init = 0;
64
65 /* Holds the ISA variant, derived from the command line parameters. */
66
67 int gcn_isa = 3; /* Default to GCN3. */
68
69 /* Reserve this much space for LDS (for propagating variables from
70 worker-single mode to worker-partitioned mode), per workgroup. Global
71 analysis could calculate an exact bound, but we don't do that yet.
72
73 We want to permit full occupancy, so size accordingly. */
74
75 #define OMP_LDS_SIZE 0x600 /* 0x600 is 1/40 total, rounded down. */
76 #define ACC_LDS_SIZE 32768 /* Half of the total should be fine. */
77 #define OTHER_LDS_SIZE 65536 /* If in doubt, reserve all of it. */
78
79 #define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
80 : flag_openmp ? OMP_LDS_SIZE \
81 : OTHER_LDS_SIZE)
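/* Worked example of the selection above (illustrative, assuming the
   64KB LDS implied by OTHER_LDS_SIZE):

     flag_openacc  ->  LDS_SIZE == 32768   (half)
     flag_openmp   ->  LDS_SIZE == 0x600   (65536/40 = 1638.4, presumably
					    rounded down to 0x600 == 1536
					    for alignment)
     otherwise     ->  LDS_SIZE == 65536   (all of it)  */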
82
83 /* The number of registers usable by normal non-kernel functions.
84 The SGPR count includes any special extra registers such as VCC. */
85
86 #define MAX_NORMAL_SGPR_COUNT 64
87 #define MAX_NORMAL_VGPR_COUNT 24
88
89 /* }}} */
90 /* {{{ Initialization and options. */
91
92 /* Initialize machine_function. */
93
94 static struct machine_function *
95 gcn_init_machine_status (void)
96 {
97 struct machine_function *f;
98
99 f = ggc_cleared_alloc<machine_function> ();
100
101 /* Set up LDS allocation for broadcasting for this function. */
102 f->lds_allocated = 32;
103 f->lds_allocs = hash_map<tree, int>::create_ggc (64);
104
105 /* And LDS temporary decls for worker reductions. */
106 vec_alloc (f->reduc_decls, 0);
107
108 if (TARGET_GCN3)
109 f->use_flat_addressing = true;
110
111 return f;
112 }
113
114 /* Implement TARGET_OPTION_OVERRIDE.
115
116 Override option settings where defaults are variable, or we have specific
117 needs to consider. */
118
119 static void
120 gcn_option_override (void)
121 {
122 init_machine_status = gcn_init_machine_status;
123
124 /* The HSA runtime does not respect ELF load addresses, so force PIE. */
125 if (!flag_pie)
126 flag_pie = 2;
127 if (!flag_pic)
128 flag_pic = flag_pie;
129
130 gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;
131
132 /* The default stack size needs to be small for offload kernels because
133 there may be many, many threads. Also, a smaller stack gives a
134 measurable performance boost. But, a small stack is insufficient
135 for running the testsuite, so we use a larger default for the
136 stand-alone case. */
137 if (stack_size_opt == -1)
138 {
139 if (flag_openacc || flag_openmp)
140 /* 512 bytes per work item = 32kB total. */
141 stack_size_opt = 512 * 64;
142 else
143 /* 1MB total. */
144 stack_size_opt = 1048576;
145 }
146 }
147
148 /* }}} */
149 /* {{{ Attributes. */
150
151 /* This table defines the arguments that are permitted in
152 __attribute__ ((amdgpu_hsa_kernel (...))).
153
154 The names and values correspond to the HSA metadata that is encoded
155 into the assembler file and binary. */
156
157 static const struct gcn_kernel_arg_type
158 {
159 const char *name;
160 const char *header_pseudo;
161 machine_mode mode;
162
163 /* This should be set to -1 or -2 for a dynamically allocated register
164 number. Use -1 if this argument contributes to the user_sgpr_count,
165 -2 otherwise. */
166 int fixed_regno;
167 } gcn_kernel_arg_types[] = {
168 {"exec", NULL, DImode, EXEC_REG},
169 #define PRIVATE_SEGMENT_BUFFER_ARG 1
170 {"private_segment_buffer",
171 "enable_sgpr_private_segment_buffer", TImode, -1},
172 #define DISPATCH_PTR_ARG 2
173 {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
174 #define QUEUE_PTR_ARG 3
175 {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
176 #define KERNARG_SEGMENT_PTR_ARG 4
177 {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
178 {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
179 #define FLAT_SCRATCH_INIT_ARG 6
180 {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
181 #define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
182 {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
183 {"grid_workgroup_count_X",
184 "enable_sgpr_grid_workgroup_count_x", SImode, -1},
185 {"grid_workgroup_count_Y",
186 "enable_sgpr_grid_workgroup_count_y", SImode, -1},
187 {"grid_workgroup_count_Z",
188 "enable_sgpr_grid_workgroup_count_z", SImode, -1},
189 #define WORKGROUP_ID_X_ARG 11
190 {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
191 {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
192 {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
193 {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
194 #define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
195 {"private_segment_wave_offset",
196 "enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
197 #define WORK_ITEM_ID_X_ARG 16
198 {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
199 #define WORK_ITEM_ID_Y_ARG 17
200 {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
201 #define WORK_ITEM_ID_Z_ARG 18
202 {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
203 };
204
205 static const long default_requested_args
206 = (1 << PRIVATE_SEGMENT_BUFFER_ARG)
207 | (1 << DISPATCH_PTR_ARG)
208 | (1 << QUEUE_PTR_ARG)
209 | (1 << KERNARG_SEGMENT_PTR_ARG)
210 | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
211 | (1 << WORKGROUP_ID_X_ARG)
212 | (1 << WORK_ITEM_ID_X_ARG)
213 | (1 << WORK_ITEM_ID_Y_ARG)
214 | (1 << WORK_ITEM_ID_Z_ARG);
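/* A quick check of the value of the mask above (our arithmetic, using
   the #define values from the table; nothing in the compiler depends on
   the literal total):

     bits 1-4     -> 0x0001e
     + bit 11     -> 0x0081e
     + bit 15     -> 0x0881e
     + bits 16-18 -> 0x7881e  */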
215
216 /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
217 This function also sets the default values for some arguments.
218
219 Return true if a parse error was diagnosed, otherwise false with ARGS
populated. */
220
221 static bool
222 gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
223 tree list)
224 {
225 bool err = false;
226 args->requested = default_requested_args;
227 args->nargs = 0;
228
229 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
230 args->reg[a] = -1;
231
232 for (; list; list = TREE_CHAIN (list))
233 {
234 const char *str;
235 if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
236 {
237 error ("amdgpu_hsa_kernel attribute requires string constant "
238 "arguments");
239 break;
240 }
241 str = TREE_STRING_POINTER (TREE_VALUE (list));
242 int a;
243 for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
244 {
245 if (!strcmp (str, gcn_kernel_arg_types[a].name))
246 break;
247 }
248 if (a == GCN_KERNEL_ARG_TYPES)
249 {
250 error ("unknown specifier %s in amdgpu_hsa_kernel attribute", str);
251 err = true;
252 break;
253 }
254 if (args->requested & (1 << a))
255 {
256 error ("duplicated parameter specifier %s in amdgpu_hsa_kernel "
257 "attribute", str);
258 err = true;
259 break;
260 }
261 args->requested |= (1 << a);
262 args->order[args->nargs++] = a;
263 }
264
265 /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
266 WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies
267 requesting WORK_ITEM_ID_X_ARG. */
268 if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
269 args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
270 if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
271 args->requested |= (1 << WORK_ITEM_ID_X_ARG);
272
273 int sgpr_regno = FIRST_SGPR_REG;
274 args->nsgprs = 0;
275 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
276 {
277 if (!(args->requested & (1 << a)))
278 continue;
279
280 if (gcn_kernel_arg_types[a].fixed_regno >= 0)
281 args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
282 else
283 {
284 int reg_count;
285
286 switch (gcn_kernel_arg_types[a].mode)
287 {
288 case E_SImode:
289 reg_count = 1;
290 break;
291 case E_DImode:
292 reg_count = 2;
293 break;
294 case E_TImode:
295 reg_count = 4;
296 break;
297 default:
298 gcc_unreachable ();
299 }
300 args->reg[a] = sgpr_regno;
301 sgpr_regno += reg_count;
302 if (gcn_kernel_arg_types[a].fixed_regno == -1)
303 args->nsgprs += reg_count;
304 }
305 }
306 if (sgpr_regno > FIRST_SGPR_REG + 16)
307 {
308 error ("too many arguments passed in sgpr registers");
309 }
310 return err;
311 }
312
313 /* Referenced by TARGET_ATTRIBUTE_TABLE.
314
315 Validates target-specific attributes. */
316
317 static tree
318 gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
319 tree args, int, bool *no_add_attrs)
320 {
321 if (!FUNC_OR_METHOD_TYPE_P (*node))
322 {
323 warning (OPT_Wattributes, "%qE attribute only applies to functions",
324 name);
325 *no_add_attrs = true;
326 return NULL_TREE;
327 }
328
329 /* Validate the arguments of the "amdgpu_hsa_kernel" attribute, the only name this handler is registered for. */
330 if (is_attribute_p ("amdgpu_hsa_kernel", name))
331 {
332 struct gcn_kernel_args kernelarg;
333
334 if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
335 *no_add_attrs = true;
336
337 return NULL_TREE;
338 }
339
340 return NULL_TREE;
341 }
342
343 /* Implement TARGET_ATTRIBUTE_TABLE.
344
345 Create target-specific __attribute__ types. */
346
347 static const struct attribute_spec gcn_attribute_table[] = {
348 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
349 affects_type_identity, handler, exclude } */
350 {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
351 true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
352 /* End element. */
353 {NULL, 0, 0, false, false, false, false, NULL, NULL}
354 };
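/* Illustrative usage (the kernel name and body are hypothetical; the
   string arguments must name entries in gcn_kernel_arg_types above, and
   the defaults in default_requested_args are added automatically):

     void __attribute__((amdgpu_hsa_kernel ("exec", "dispatch_ptr")))
     my_kernel (void);
*/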
355
356 /* }}} */
357 /* {{{ Registers and modes. */
358
359 /* Implement TARGET_CLASS_MAX_NREGS.
360
361 Return the number of hard registers needed to hold a value of MODE in
362 a register of class RCLASS. */
363
364 static unsigned char
365 gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
366 {
367 /* Scalar registers are 32-bit; vector registers are in fact tuples of
368 64 lanes. */
369 if (rclass == VGPR_REGS)
370 {
371 if (vgpr_1reg_mode_p (mode))
372 return 1;
373 if (vgpr_2reg_mode_p (mode))
374 return 2;
375 /* TImode is used by DImode compare_and_swap. */
376 if (mode == TImode)
377 return 4;
378 }
379 else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
380 return 2;
381 return CEIL (GET_MODE_SIZE (mode), 4);
382 }
383
384 /* Implement TARGET_HARD_REGNO_NREGS.
385
386 Return the number of hard registers needed to hold a value of MODE in
387 REGNO. */
388
389 unsigned int
390 gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
391 {
392 return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
393 }
394
395 /* Implement TARGET_HARD_REGNO_MODE_OK.
396
397 Return true if REGNO can hold value in MODE. */
398
399 bool
400 gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
401 {
402 /* Treat a complex mode as if it were a scalar mode of the same overall
403 size for the purposes of allocating hard registers. */
404 if (COMPLEX_MODE_P (mode))
405 switch (mode)
406 {
407 case E_CQImode:
408 case E_CHImode:
409 mode = SImode;
410 break;
411 case E_CSImode:
412 mode = DImode;
413 break;
414 case E_CDImode:
415 mode = TImode;
416 break;
417 case E_HCmode:
418 mode = SFmode;
419 break;
420 case E_SCmode:
421 mode = DFmode;
422 break;
423 default:
424 /* Not supported. */
425 return false;
426 }
427
428 switch (regno)
429 {
430 case FLAT_SCRATCH_LO_REG:
431 case XNACK_MASK_LO_REG:
432 case TBA_LO_REG:
433 case TMA_LO_REG:
434 return (mode == SImode || mode == DImode);
435 case VCC_LO_REG:
436 case EXEC_LO_REG:
437 return (mode == BImode || mode == SImode || mode == DImode);
438 case M0_REG:
439 case FLAT_SCRATCH_HI_REG:
440 case XNACK_MASK_HI_REG:
441 case TBA_HI_REG:
442 case TMA_HI_REG:
443 return mode == SImode;
444 case VCC_HI_REG:
445 return false;
446 case EXEC_HI_REG:
447 return mode == SImode /*|| mode == V32BImode */ ;
448 case SCC_REG:
449 case VCCZ_REG:
450 case EXECZ_REG:
451 return mode == BImode;
452 }
453 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
454 return true;
455 if (SGPR_REGNO_P (regno))
456 /* We restrict double register values to aligned registers. */
457 return (sgpr_1reg_mode_p (mode)
458 || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
459 || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
460 if (VGPR_REGNO_P (regno))
461 /* Vector instructions do not care about the alignment of register
462 pairs, but where there is no 64-bit instruction, many of the
463 define_splits do not work if the input and output registers partially
464 overlap. We tried to fix this with early clobber and match
465 constraints, but it was bug prone, added complexity, and conflicts
466 with the 'U0' constraints on vec_merge.
467 Therefore, we restrict ourselves to aligned registers. */
468 return (vgpr_1reg_mode_p (mode)
469 || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
470 /* TImode is used by DImode compare_and_swap. */
471 || (mode == TImode
472 && !((regno - FIRST_VGPR_REG) & 3)));
473 return false;
474 }
475
476 /* Implement REGNO_REG_CLASS via gcn.h.
477
478 Return smallest class containing REGNO. */
479
480 enum reg_class
481 gcn_regno_reg_class (int regno)
482 {
483 switch (regno)
484 {
485 case SCC_REG:
486 return SCC_CONDITIONAL_REG;
487 case VCC_LO_REG:
488 case VCC_HI_REG:
489 return VCC_CONDITIONAL_REG;
490 case VCCZ_REG:
491 return VCCZ_CONDITIONAL_REG;
492 case EXECZ_REG:
493 return EXECZ_CONDITIONAL_REG;
494 case EXEC_LO_REG:
495 case EXEC_HI_REG:
496 return EXEC_MASK_REG;
497 }
498 if (VGPR_REGNO_P (regno))
499 return VGPR_REGS;
500 if (SGPR_REGNO_P (regno))
501 return SGPR_REGS;
502 if (regno < FIRST_VGPR_REG)
503 return GENERAL_REGS;
504 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
505 return AFP_REGS;
506 return ALL_REGS;
507 }
508
509 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.
510
511 GCC assumes that the lowpart contains the first part of the value as
512 stored in memory. This is not the case for vector registers. */
513
514 bool
515 gcn_can_change_mode_class (machine_mode from, machine_mode to,
516 reg_class_t regclass)
517 {
518 if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
519 return true;
520 return (gcn_class_max_nregs (regclass, from)
521 == gcn_class_max_nregs (regclass, to));
522 }
523
524 /* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
525
526 When this hook returns true for MODE, the compiler allows
527 registers explicitly used in the rtl to be used as spill registers
528 but prevents the compiler from extending the lifetime of these
529 registers. */
530
531 bool
532 gcn_small_register_classes_for_mode_p (machine_mode mode)
533 {
534 /* We allocate into exec and vcc regs, which form small register classes. */
535 return mode == DImode || mode == SImode;
536 }
537
538 /* Implement TARGET_CLASS_LIKELY_SPILLED_P.
539
540 Returns true if pseudos that have been assigned to registers of class RCLASS
541 would likely be spilled because registers of RCLASS are needed for spill
542 registers. */
543
544 static bool
545 gcn_class_likely_spilled_p (reg_class_t rclass)
546 {
547 return (rclass == EXEC_MASK_REG
548 || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
549 }
550
551 /* Implement TARGET_MODES_TIEABLE_P.
552
553 Returns true if a value of MODE1 is accessible in MODE2 without
554 copying. */
555
556 bool
557 gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
558 {
559 return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
560 && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
561 }
562
563 /* Implement TARGET_TRULY_NOOP_TRUNCATION.
564
565 Returns true if it is safe to “convert” a value of INPREC bits to one of
566 OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
567 it as if it had only OUTPREC bits. */
568
569 bool
570 gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
571 {
572 return ((inprec <= 32) && (outprec <= inprec));
573 }
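/* For example, SImode -> QImode truncation is a no-op here (INPREC == 32,
   OUTPREC == 8), but DImode -> SImode is not, since INPREC == 64 fails
   the first test and the truncation must be performed explicitly.  */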
574
575 /* Return N-th part of value occupying multiple registers. */
576
577 rtx
578 gcn_operand_part (machine_mode mode, rtx op, int n)
579 {
580 if (GET_MODE_SIZE (mode) >= 256)
581 {
582 /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */
583
584 if (REG_P (op))
585 {
586 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
587 return gen_rtx_REG (V64SImode, REGNO (op) + n);
588 }
589 if (GET_CODE (op) == CONST_VECTOR)
590 {
591 int units = GET_MODE_NUNITS (mode);
592 rtvec v = rtvec_alloc (units);
593
594 for (int i = 0; i < units; ++i)
595 RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
596 CONST_VECTOR_ELT (op, i), n);
597
598 return gen_rtx_CONST_VECTOR (V64SImode, v);
599 }
600 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
601 return gcn_gen_undef (V64SImode);
602 gcc_unreachable ();
603 }
604 else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
605 {
606 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
607 return gen_rtx_REG (SImode, REGNO (op) + n);
608 }
609 else
610 {
611 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
612 return gcn_gen_undef (SImode);
613
614 /* If it's a constant then let's assume it is of the largest mode
615 available, otherwise simplify_gen_subreg will fail. */
616 if (mode == VOIDmode && CONST_INT_P (op))
617 mode = DImode;
618 return simplify_gen_subreg (SImode, op, mode, n * 4);
619 }
620 }
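/* A sketch of the common scalar case (the register numbers here are
   hypothetical): for a DImode value held in the pair s[4:5],

     gcn_operand_part (DImode, op, 0)  =>  (reg:SI s4)   low part
     gcn_operand_part (DImode, op, 1)  =>  (reg:SI s5)   high part  */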
621
622 /* Return the N-th DImode part of a value occupying multiple registers. */
623
624 rtx
625 gcn_operand_doublepart (machine_mode mode, rtx op, int n)
626 {
627 return simplify_gen_subreg (DImode, op, mode, n * 8);
628 }
629
630 /* Return true if OP can be split into subregs or high/low parts.
631 This is always true for scalars, but not normally true for vectors.
632 However, for vectors in hardregs we can use the low and high registers. */
633
634 bool
635 gcn_can_split_p (machine_mode, rtx op)
636 {
637 if (vgpr_vector_mode_p (GET_MODE (op)))
638 {
639 if (GET_CODE (op) == SUBREG)
640 op = SUBREG_REG (op);
641 if (!REG_P (op))
642 return true;
643 return REGNO (op) <= FIRST_PSEUDO_REGISTER;
644 }
645 return true;
646 }
647
648 /* Implement TARGET_SPILL_CLASS.
649
650 Return class of registers which could be used for pseudo of MODE
651 and of class RCLASS for spilling instead of memory. Return NO_REGS
652 if it is not possible or not profitable. */
653
654 static reg_class_t
655 gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
656 {
657 if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
658 || c == VCC_CONDITIONAL_REG)
659 return SGPR_REGS;
660 else
661 return NO_REGS;
662 }
663
664 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
665
666 Change allocno class for given pseudo from allocno and best class
667 calculated by IRA. */
668
669 static reg_class_t
670 gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
671 reg_class_t best_cl)
672 {
673 /* Avoid returning classes that contain both vgpr and sgpr registers. */
674 if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
675 return cl;
676 if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
677 && best_cl != ALL_GPR_REGS)
678 return best_cl;
679
680 machine_mode mode = PSEUDO_REGNO_MODE (regno);
681 if (vgpr_vector_mode_p (mode))
682 return VGPR_REGS;
683
684 return GENERAL_REGS;
685 }
686
687 /* Create a new DImode pseudo reg and emit an instruction to initialize
688 it to VAL. */
689
690 static rtx
691 get_exec (int64_t val)
692 {
693 rtx reg = gen_reg_rtx (DImode);
694 emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
695 return reg;
696 }
697
698 /* Return value of scalar exec register. */
699
700 rtx
701 gcn_scalar_exec ()
702 {
703 return const1_rtx;
704 }
705
706 /* Return pseudo holding scalar exec register. */
707
708 rtx
709 gcn_scalar_exec_reg ()
710 {
711 return get_exec (1);
712 }
713
714 /* Return value of full exec register. */
715
716 rtx
717 gcn_full_exec ()
718 {
719 return constm1_rtx;
720 }
721
722 /* Return pseudo holding full exec register. */
723
724 rtx
725 gcn_full_exec_reg ()
726 {
727 return get_exec (-1);
728 }
729
730 /* }}} */
731 /* {{{ Immediate constants. */
732
733 /* Initialize shared numeric constants. */
734
735 static void
736 init_ext_gcn_constants (void)
737 {
738 real_from_integer (&dconst4, DFmode, 4, SIGNED);
739
740 /* FIXME: this constant probably does not match what hardware really loads.
741 Reality check it eventually. */
742 real_from_string (&dconst1over2pi,
743 "0.1591549430918953357663423455968866839");
744 real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
745
746 ext_gcn_constants_init = 1;
747 }
748
749 /* Return non-zero if X is a constant that can appear as an inline operand.
750 This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, or 1/(2*pi),
751 or a vector of those.
752 The value returned is the encoding of this constant. */
753
754 int
755 gcn_inline_fp_constant_p (rtx x, bool allow_vector)
756 {
757 machine_mode mode = GET_MODE (x);
758
759 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
760 && allow_vector)
761 {
762 int n;
763 if (GET_CODE (x) != CONST_VECTOR)
764 return 0;
765 n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
766 if (!n)
767 return 0;
768 for (int i = 1; i < 64; i++)
769 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
770 return 0;
771 return 1;
772 }
773
774 if (mode != HFmode && mode != SFmode && mode != DFmode)
775 return 0;
776
777 const REAL_VALUE_TYPE *r;
778
779 if (x == CONST0_RTX (mode))
780 return 128;
781 if (x == CONST1_RTX (mode))
782 return 242;
783
784 r = CONST_DOUBLE_REAL_VALUE (x);
785
786 /* Make sure the lazily-initialized constants dconst4 and
787 dconst1over2pi exist before they are compared against. */
788 if (!ext_gcn_constants_init)
789 init_ext_gcn_constants ();
790
791 if (real_identical (r, &dconsthalf))
792 return 240;
793 if (real_identical (r, &dconstm1))
794 return 243;
795 if (real_identical (r, &dconst2))
796 return 244;
797 if (real_identical (r, &dconst4))
798 return 246;
799 if (real_identical (r, &dconst1over2pi))
800 return 248;
801 REAL_VALUE_TYPE rneg = real_value_negate (r);
802 if (real_identical (&rneg, &dconsthalf))
803 return 241;
804 if (real_identical (&rneg, &dconst2))
805 return 245;
806 if (real_identical (&rneg, &dconst4))
807 return 247;
808
810
811 return 0;
812 }
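/* For reference, the encodings returned above, as read from this code
   (the hardware ISA manual is the authoritative source):

     128 -> 0.0    240 -> 0.5    241 -> -0.5    242 -> 1.0    243 -> -1.0
     244 -> 2.0    245 -> -2.0   246 -> 4.0     247 -> -4.0
     248 -> 1/(2*pi)  */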
813
814 /* Return true if X is a constant that can appear as an immediate operand,
815 i.e. 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, or 1/(2*pi),
816 or a vector of those.
817 Unlike gcn_inline_fp_constant_p, this returns a simple bool. */
818
819 bool
820 gcn_fp_constant_p (rtx x, bool allow_vector)
821 {
822 machine_mode mode = GET_MODE (x);
823
824 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
825 && allow_vector)
826 {
827 int n;
828 if (GET_CODE (x) != CONST_VECTOR)
829 return false;
830 n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
831 if (!n)
832 return false;
833 for (int i = 1; i < 64; i++)
834 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
835 return false;
836 return true;
837 }
838 if (mode != HFmode && mode != SFmode && mode != DFmode)
839 return false;
840
841 if (gcn_inline_fp_constant_p (x, false))
842 return true;
843 /* FIXME: It is not clear how 32bit immediates are interpreted here. */
844 return (mode != DFmode);
845 }
846
847 /* Return true if X is a constant representable as an inline immediate
848 constant in a 32-bit instruction encoding. */
849
850 bool
851 gcn_inline_constant_p (rtx x)
852 {
853 if (GET_CODE (x) == CONST_INT)
854 return INTVAL (x) >= -16 && INTVAL (x) <= 64;
855 if (GET_CODE (x) == CONST_DOUBLE)
856 return gcn_inline_fp_constant_p (x, false);
857 if (GET_CODE (x) == CONST_VECTOR)
858 {
859 int n;
860 if (!vgpr_vector_mode_p (GET_MODE (x)))
861 return false;
862 n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
863 if (!n)
864 return false;
865 for (int i = 1; i < 64; i++)
866 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
867 return false;
868 return true;
869 }
870 return false;
871 }
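/* For instance, GEN_INT (-16) and GEN_INT (64) both qualify as inline
   constants here, while GEN_INT (-17) or GEN_INT (65) do not and would
   have to be emitted as literal 32-bit immediates instead.  */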
872
873 /* Return true if X is a constant representable as an immediate constant
874 in a 32 or 64-bit instruction encoding. */
875
876 bool
877 gcn_constant_p (rtx x)
878 {
879 switch (GET_CODE (x))
880 {
881 case CONST_INT:
882 return true;
883
884 case CONST_DOUBLE:
885 return gcn_fp_constant_p (x, false);
886
887 case CONST_VECTOR:
888 {
889 int n;
890 if (!vgpr_vector_mode_p (GET_MODE (x)))
891 return false;
892 n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
893 if (!n)
894 return false;
895 for (int i = 1; i < 64; i++)
896 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
897 return false;
898 return true;
899 }
900
901 case SYMBOL_REF:
902 case LABEL_REF:
903 return true;
904
905 default:
906 ;
907 }
908
909 return false;
910 }
911
912 /* Return true if X is a constant representable as two inline immediate
913 constants in a 64-bit instruction that is split into two 32-bit
914 instructions.
915 When MIXED is set, the low-part is permitted to use the full 32-bits. */
916
917 bool
918 gcn_inline_constant64_p (rtx x, bool mixed)
919 {
920 if (GET_CODE (x) == CONST_VECTOR)
921 {
922 if (!vgpr_vector_mode_p (GET_MODE (x)))
923 return false;
924 if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
925 return false;
926 for (int i = 1; i < 64; i++)
927 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
928 return false;
929
930 return true;
931 }
932
933 if (GET_CODE (x) != CONST_INT)
934 return false;
935
936 rtx val_lo = gcn_operand_part (DImode, x, 0);
937 rtx val_hi = gcn_operand_part (DImode, x, 1);
938 return ((mixed || gcn_inline_constant_p (val_lo))
939 && gcn_inline_constant_p (val_hi));
940 }
941
942 /* Return true if X is a constant representable as an immediate constant
943 in a 32 or 64-bit instruction encoding where the hardware will
944 extend the immediate to 64-bits. */
945
946 bool
947 gcn_constant64_p (rtx x)
948 {
949 if (!gcn_constant_p (x))
950 return false;
951
952 if (GET_CODE (x) != CONST_INT)
953 return true;
954
955 /* Negative numbers are only allowed if they can be encoded within src0,
956 because the 32-bit immediates do not get sign-extended.
957 Unsigned numbers must not be encodable as 32-bit -1..-16, because the
958 assembler will use a src0 inline immediate and that will get
959 sign-extended. */
960 HOST_WIDE_INT val = INTVAL (x);
961 return (((val & 0xffffffff) == val /* Positive 32-bit. */
962 && (val & 0xfffffff0) != 0xfffffff0) /* Not -1..-16. */
963 || gcn_inline_constant_p (x)); /* Src0. */
964 }
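/* Two worked examples of the test above (illustrative):

     0x000000007fffffff: equal to its low 32 bits and not matching the
	-1..-16 pattern, so the zero-extending 32-bit form is safe -> true.
     0x00000000ffffffff: the low 32 bits match the inline -1..-16 pattern
	and the assembler would sign-extend them -> false.  */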
965
966 /* Implement TARGET_LEGITIMATE_CONSTANT_P.
967
968 Returns true if X is a legitimate constant for a MODE immediate operand. */
969
970 bool
971 gcn_legitimate_constant_p (machine_mode, rtx x)
972 {
973 return gcn_constant_p (x);
974 }
975
976 /* Return true if X is a CONST_VECTOR of a single, duplicated constant. */
977
978 static bool
979 single_cst_vector_p (rtx x)
980 {
981 if (GET_CODE (x) != CONST_VECTOR)
982 return false;
983 for (int i = 1; i < 64; i++)
984 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
985 return false;
986 return true;
987 }
988
989 /* Create a CONST_VECTOR of duplicated value A. */
990
991 rtx
992 gcn_vec_constant (machine_mode mode, int a)
993 {
994 /*if (!a)
995 return CONST0_RTX (mode);
996 if (a == -1)
997 return CONSTM1_RTX (mode);
998 if (a == 1)
999 return CONST1_RTX (mode);
1000 if (a == 2)
1001 return CONST2_RTX (mode);*/
1002
1003 int units = GET_MODE_NUNITS (mode);
1004 machine_mode innermode = GET_MODE_INNER (mode);
1005
1006 rtx tem;
1007 if (FLOAT_MODE_P (innermode))
1008 {
1009 REAL_VALUE_TYPE rv;
1010 real_from_integer (&rv, NULL, a, SIGNED);
1011 tem = const_double_from_real_value (rv, innermode);
1012 }
1013 else
1014 tem = gen_int_mode (a, innermode);
1015
1016 rtvec v = rtvec_alloc (units);
1017 for (int i = 0; i < units; ++i)
1018 RTVEC_ELT (v, i) = tem;
1019
1020 return gen_rtx_CONST_VECTOR (mode, v);
1021 }
1022
1023 /* Create a CONST_VECTOR of duplicated value A. */
1024
1025 rtx
1026 gcn_vec_constant (machine_mode mode, rtx a)
1027 {
1028 int units = GET_MODE_NUNITS (mode);
1029 rtvec v = rtvec_alloc (units);
1030
1031 for (int i = 0; i < units; ++i)
1032 RTVEC_ELT (v, i) = a;
1033
1034 return gen_rtx_CONST_VECTOR (mode, v);
1035 }
1036
1037 /* Create an undefined vector value, used where an insn operand is
1038 optional. */
1039
1040 rtx
1041 gcn_gen_undef (machine_mode mode)
1042 {
1043 return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
1044 }
1045
1046 /* }}} */
1047 /* {{{ Addresses, pointers and moves. */
1048
1049 /* Return true if REG is a valid place to store a pointer,
1050 for instructions that require an SGPR.
1051 FIXME rename. */
1052
1053 static bool
1054 gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
1055 {
1056 if (GET_CODE (reg) == SUBREG)
1057 reg = SUBREG_REG (reg);
1058
1059 if (!REG_P (reg))
1060 return false;
1061
1062 if (GET_MODE (reg) != mode)
1063 return false;
1064
1065 int regno = REGNO (reg);
1066
1067 if (regno >= FIRST_PSEUDO_REGISTER)
1068 {
1069 if (!strict)
1070 return true;
1071
1072 if (!reg_renumber)
1073 return false;
1074
1075 regno = reg_renumber[regno];
1076 }
1077
1078 return (SGPR_REGNO_P (regno) || regno == M0_REG
1079 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1080 }
1081
1082 /* Return true if REG is a valid place to store a pointer,
1083 for instructions that require a VGPR. */
1084
1085 static bool
1086 gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
1087 {
1088 if (GET_CODE (reg) == SUBREG)
1089 reg = SUBREG_REG (reg);
1090
1091 if (!REG_P (reg))
1092 return false;
1093
1094 if (GET_MODE (reg) != mode)
1095 return false;
1096
1097 int regno = REGNO (reg);
1098
1099 if (regno >= FIRST_PSEUDO_REGISTER)
1100 {
1101 if (!strict)
1102 return true;
1103
1104 if (!reg_renumber)
1105 return false;
1106
1107 regno = reg_renumber[regno];
1108 }
1109
1110 return VGPR_REGNO_P (regno);
1111 }
1112
1113 /* Return true if X would be valid inside a MEM using the Flat address
1114 space. */
1115
1116 bool
1117 gcn_flat_address_p (rtx x, machine_mode mode)
1118 {
1119 bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1120 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1121
1122 if (vec_mode && gcn_address_register_p (x, DImode, false))
1123 return true;
1124
1125 if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
1126 return true;
1127
1128 if (TARGET_GCN5_PLUS
1129 && GET_CODE (x) == PLUS
1130 && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
1131 && CONST_INT_P (XEXP (x, 1)))
1132 return true;
1133
1134 return false;
1135 }
1136
1137 /* Return true if X would be valid inside a MEM using the Scalar Flat
1138 address space. */
1139
1140 bool
1141 gcn_scalar_flat_address_p (rtx x)
1142 {
1143 if (gcn_address_register_p (x, DImode, false))
1144 return true;
1145
1146 if (GET_CODE (x) == PLUS
1147 && gcn_address_register_p (XEXP (x, 0), DImode, false)
1148 && CONST_INT_P (XEXP (x, 1)))
1149 return true;
1150
1151 return false;
1152 }
1153
1154 /* Return true if MEM X would be valid for the Scalar Flat address space. */
1155
1156 bool
1157 gcn_scalar_flat_mem_p (rtx x)
1158 {
1159 if (!MEM_P (x))
1160 return false;
1161
1162 if (GET_MODE_SIZE (GET_MODE (x)) < 4)
1163 return false;
1164
1165 return gcn_scalar_flat_address_p (XEXP (x, 0));
1166 }
1167
1168 /* Return true if X would be valid inside a MEM using the LDS or GDS
1169 address spaces. */
1170
1171 bool
1172 gcn_ds_address_p (rtx x)
1173 {
1174 if (gcn_vec_address_register_p (x, SImode, false))
1175 return true;
1176
1177 if (GET_CODE (x) == PLUS
1178 && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
1179 && CONST_INT_P (XEXP (x, 1)))
1180 return true;
1181
1182 return false;
1183 }
1184
1185 /* Return true if ADDR would be valid inside a MEM using the Global
1186 address space. */
1187
1188 bool
1189 gcn_global_address_p (rtx addr)
1190 {
1191 if (gcn_address_register_p (addr, DImode, false)
1192 || gcn_vec_address_register_p (addr, DImode, false))
1193 return true;
1194
1195 if (GET_CODE (addr) == PLUS)
1196 {
1197 rtx base = XEXP (addr, 0);
1198 rtx offset = XEXP (addr, 1);
1199 bool immediate_p = (CONST_INT_P (offset)
1200 && INTVAL (offset) >= -(1 << 12)
1201 && INTVAL (offset) < (1 << 12));
1202
1203 if ((gcn_address_register_p (base, DImode, false)
1204 || gcn_vec_address_register_p (base, DImode, false))
1205 && immediate_p)
1206 /* SGPR + CONST or VGPR + CONST */
1207 return true;
1208
1209 if (gcn_address_register_p (base, DImode, false)
1210 && gcn_vgpr_register_operand (offset, SImode))
1211 /* SPGR + VGPR */
1212 return true;
1213
1214 if (GET_CODE (base) == PLUS
1215 && gcn_address_register_p (XEXP (base, 0), DImode, false)
1216 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1217 && immediate_p)
1218 /* (SGPR + VGPR) + CONST */
1219 return true;
1220 }
1221
1222 return false;
1223 }
1224
1225 /* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
1226
1227 Recognizes RTL expressions that are valid memory addresses for an
1228 instruction. The MODE argument is the machine mode for the MEM
1229 expression that wants to use this address.
1230
1231 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
1232 convert common non-canonical forms to canonical form so that they will
1233 be recognized. */
1234
1235 static bool
1236 gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
1237 addr_space_t as)
1238 {
1239 /* All vector instructions need to work on addresses in registers. */
1240 if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
1241 return false;
1242
1243 if (AS_SCALAR_FLAT_P (as))
1244 {
1245 if (mode == QImode || mode == HImode)
1246 return 0;
1247
1248 switch (GET_CODE (x))
1249 {
1250 case REG:
1251 return gcn_address_register_p (x, DImode, strict);
1252 /* Addresses are in the form BASE+OFFSET
1253 OFFSET is either a 20-bit unsigned immediate, SGPR or M0.
1254 Writes and atomics do not accept SGPR. */
1255 case PLUS:
1256 {
1257 rtx x0 = XEXP (x, 0);
1258 rtx x1 = XEXP (x, 1);
1259 if (!gcn_address_register_p (x0, DImode, strict))
1260 return false;
1261 /* FIXME: This is disabled because of the mode mismatch between
1262 SImode (for the address or m0 register) and the DImode PLUS.
1263 We'll need a zero_extend or similar.
1264
1265 if (gcn_m0_register_p (x1, SImode, strict)
1266 || gcn_address_register_p (x1, SImode, strict))
1267 return true;
1268 else*/
1269 if (GET_CODE (x1) == CONST_INT)
1270 {
1271 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
1272 /* The low bits of the offset are ignored, even when
1273 they're meant to realign the pointer. */
1274 && !(INTVAL (x1) & 0x3))
1275 return true;
1276 }
1277 return false;
1278 }
1279
1280 default:
1281 break;
1282 }
1283 }
1284 else if (AS_SCRATCH_P (as))
1285 return gcn_address_register_p (x, SImode, strict);
1286 else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
1287 {
1288 if (TARGET_GCN3 || GET_CODE (x) == REG)
1289 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1290 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1291 ? gcn_address_register_p (x, DImode, strict)
1292 : gcn_vec_address_register_p (x, DImode, strict));
1293 else
1294 {
1295 gcc_assert (TARGET_GCN5_PLUS);
1296
1297 if (GET_CODE (x) == PLUS)
1298 {
1299 rtx x1 = XEXP (x, 1);
1300
1301 if (VECTOR_MODE_P (mode)
1302 ? !gcn_address_register_p (x, DImode, strict)
1303 : !gcn_vec_address_register_p (x, DImode, strict))
1304 return false;
1305
1306 if (GET_CODE (x1) == CONST_INT)
1307 {
1308 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
1309 /* The low bits of the offset are ignored, even when
1310 they're meant to realign the pointer. */
1311 && !(INTVAL (x1) & 0x3))
1312 return true;
1313 }
1314 }
1315 return false;
1316 }
1317 }
1318 else if (AS_GLOBAL_P (as))
1319 {
1320 gcc_assert (TARGET_GCN5_PLUS);
1321
1322 if (GET_CODE (x) == REG)
1323 return (gcn_address_register_p (x, DImode, strict)
1324 || (!VECTOR_MODE_P (mode)
1325 && gcn_vec_address_register_p (x, DImode, strict)));
1326 else if (GET_CODE (x) == PLUS)
1327 {
1328 rtx base = XEXP (x, 0);
1329 rtx offset = XEXP (x, 1);
1330
1331 bool immediate_p = (GET_CODE (offset) == CONST_INT
1332 /* Signed 13-bit immediate. */
1333 && INTVAL (offset) >= -(1 << 12)
1334 && INTVAL (offset) < (1 << 12)
1335 /* The low bits of the offset are ignored, even
1336 when they're meant to realign the pointer. */
1337 && !(INTVAL (offset) & 0x3));
1338
1339 if (!VECTOR_MODE_P (mode))
1340 {
1341 if ((gcn_address_register_p (base, DImode, strict)
1342 || gcn_vec_address_register_p (base, DImode, strict))
1343 && immediate_p)
1344 /* SGPR + CONST or VGPR + CONST */
1345 return true;
1346
1347 if (gcn_address_register_p (base, DImode, strict)
1348 && gcn_vgpr_register_operand (offset, SImode))
1349 /* SGPR + VGPR */
1350 return true;
1351
1352 if (GET_CODE (base) == PLUS
1353 && gcn_address_register_p (XEXP (base, 0), DImode, strict)
1354 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1355 && immediate_p)
1356 /* (SGPR + VGPR) + CONST */
1357 return true;
1358 }
1359 else
1360 {
1361 if (gcn_address_register_p (base, DImode, strict)
1362 && immediate_p)
1363 /* SGPR + CONST */
1364 return true;
1365 }
1366 }
1367 else
1368 return false;
1369 }
1370 else if (AS_ANY_DS_P (as))
1371 switch (GET_CODE (x))
1372 {
1373 case REG:
1374 return (VECTOR_MODE_P (mode)
1375 ? gcn_address_register_p (x, SImode, strict)
1376 : gcn_vec_address_register_p (x, SImode, strict));
1377 /* Addresses are in the form BASE+OFFSET
1378 OFFSET is either a 20-bit unsigned immediate, SGPR or M0.
1379 Writes and atomics do not accept SGPR. */
1380 case PLUS:
1381 {
1382 rtx x0 = XEXP (x, 0);
1383 rtx x1 = XEXP (x, 1);
1384 if (!gcn_vec_address_register_p (x0, DImode, strict))
1385 return false;
1386 if (GET_CODE (x1) == REG)
1387 {
1388 if (GET_CODE (x1) != REG
1389 || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
1390 && !gcn_ssrc_register_operand (x1, DImode)))
1391 return false;
1392 }
1393 else if (GET_CODE (x1) == CONST_VECTOR
1394 && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
1395 && single_cst_vector_p (x1))
1396 {
1397 x1 = CONST_VECTOR_ELT (x1, 0);
1398 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
1399 return true;
1400 }
1401 return false;
1402 }
1403
1404 default:
1405 break;
1406 }
1407 else
1408 gcc_unreachable ();
1409 return false;
1410 }
1411
1412 /* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1413
1414 Return the appropriate mode for a named address pointer. */
1415
1416 static scalar_int_mode
1417 gcn_addr_space_pointer_mode (addr_space_t addrspace)
1418 {
1419 switch (addrspace)
1420 {
1421 case ADDR_SPACE_SCRATCH:
1422 case ADDR_SPACE_LDS:
1423 case ADDR_SPACE_GDS:
1424 return SImode;
1425 case ADDR_SPACE_DEFAULT:
1426 case ADDR_SPACE_FLAT:
1427 case ADDR_SPACE_FLAT_SCRATCH:
1428 case ADDR_SPACE_SCALAR_FLAT:
1429 return DImode;
1430 default:
1431 gcc_unreachable ();
1432 }
1433 }
1434
1435 /* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1436
1437 Return the appropriate mode for a named address space address. */
1438
1439 static scalar_int_mode
1440 gcn_addr_space_address_mode (addr_space_t addrspace)
1441 {
1442 return gcn_addr_space_pointer_mode (addrspace);
1443 }
1444
1445 /* Implement TARGET_ADDR_SPACE_SUBSET_P.
1446
1447 Determine if one named address space is a subset of another. */
1448
1449 static bool
1450 gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1451 {
1452 if (subset == superset)
1453 return true;
1454 /* FIXME is this true? */
1455 if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1456 return true;
1457 return false;
1458 }
1459
1460 /* Convert from one address space to another. */
1461
1462 static rtx
1463 gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1464 {
1465 gcc_assert (POINTER_TYPE_P (from_type));
1466 gcc_assert (POINTER_TYPE_P (to_type));
1467
1468 addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1469 addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1470
1471 if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1472 {
1473 rtx queue = gen_rtx_REG (DImode,
1474 cfun->machine->args.reg[QUEUE_PTR_ARG]);
1475 rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1476 gen_rtx_PLUS (DImode, queue,
1477 gen_int_mode (64, SImode)));
1478 rtx tmp = gen_reg_rtx (DImode);
1479
1480 emit_move_insn (gen_lowpart (SImode, tmp), op);
1481 emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1482 group_seg_aperture_hi);
1483
1484 return tmp;
1485 }
1486 else if (as_from == as_to)
1487 return op;
1488 else
1489 gcc_unreachable ();
1490 }
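/* In effect, the LDS-to-flat case above computes (host-style sketch;
   the 64-byte offset into the queue object is where this code expects
   the group-segment aperture word):

     flat = ((uint64_t) *(uint32_t *) (queue_ptr + 64) << 32)
	    | (uint32_t) lds_offset;
*/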
1491
1492
1493 /* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
1494
1495 Return true if REGNO is OK for memory addressing. */
1496
1497 bool
1498 gcn_regno_mode_code_ok_for_base_p (int regno,
1499 machine_mode, addr_space_t as, int, int)
1500 {
1501 if (regno >= FIRST_PSEUDO_REGISTER)
1502 {
1503 if (reg_renumber)
1504 regno = reg_renumber[regno];
1505 else
1506 return true;
1507 }
1508 if (AS_FLAT_P (as))
1509 return (VGPR_REGNO_P (regno)
1510 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1511 else if (AS_SCALAR_FLAT_P (as))
1512 return (SGPR_REGNO_P (regno)
1513 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1514 else if (AS_GLOBAL_P (as))
1515 {
1516 return (SGPR_REGNO_P (regno)
1517 || VGPR_REGNO_P (regno)
1518 || regno == ARG_POINTER_REGNUM
1519 || regno == FRAME_POINTER_REGNUM);
1520 }
1521 else
1522 /* For now. */
1523 return false;
1524 }
1525
1526 /* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1527
1528 Return a suitable register class for memory addressing. */
1529
1530 reg_class
1531 gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1532 int ic)
1533 {
1534 switch (as)
1535 {
1536 case ADDR_SPACE_DEFAULT:
1537 return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1538 case ADDR_SPACE_SCALAR_FLAT:
1539 case ADDR_SPACE_SCRATCH:
1540 return SGPR_REGS;
1541 break;
1542 case ADDR_SPACE_FLAT:
1543 case ADDR_SPACE_FLAT_SCRATCH:
1544 case ADDR_SPACE_LDS:
1545 case ADDR_SPACE_GDS:
1546 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1547 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1548 ? SGPR_REGS : VGPR_REGS);
1549 case ADDR_SPACE_GLOBAL:
1550 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1551 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1552 ? SGPR_REGS : ALL_GPR_REGS);
1553 }
1554 gcc_unreachable ();
1555 }
1556
1557 /* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
1558
1559 Return true if REGNO is OK for index of memory addressing. */
1560
1561 bool
1562 regno_ok_for_index_p (int regno)
1563 {
1564 if (regno >= FIRST_PSEUDO_REGISTER)
1565 {
1566 if (reg_renumber)
1567 regno = reg_renumber[regno];
1568 else
1569 return true;
1570 }
1571 return regno == M0_REG || VGPR_REGNO_P (regno);
1572 }
1573
1574 /* Generate move which uses the exec flags. If EXEC is NULL, then it is
1575 assumed that all lanes normally relevant to the mode of the move are
1576 affected. If PREV is NULL, then a sensible default is supplied for
1577 the inactive lanes. */
1578
1579 static rtx
1580 gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
1581 {
1582 machine_mode mode = GET_MODE (op0);
1583
1584 if (vgpr_vector_mode_p (mode))
1585 {
1586 if (exec && exec != CONSTM1_RTX (DImode))
1587 {
1588 if (!prev)
1589 prev = op0;
1590 }
1591 else
1592 {
1593 if (!prev)
1594 prev = gcn_gen_undef (mode);
1595 exec = gcn_full_exec_reg ();
1596 }
1597
1598 rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
1599
1600 return gen_rtx_PARALLEL (VOIDmode,
1601 gen_rtvec (2, set,
1602 gen_rtx_CLOBBER (VOIDmode,
1603 gen_rtx_SCRATCH (V64DImode))));
1604 }
1605
1606 return (gen_rtx_PARALLEL
1607 (VOIDmode,
1608 gen_rtvec (2, gen_rtx_SET (op0, op1),
1609 gen_rtx_USE (VOIDmode,
1610 exec ? exec : gcn_scalar_exec ()))));
1611 }
1612
1613 /* Generate a vec_duplicate of OP1 into OP0; if EXEC is given, merge with OP2 in the inactive lanes. */
1614
1615 static rtx
1616 gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
1617 {
1618 if (exec)
1619 return (gen_rtx_SET (op0,
1620 gen_rtx_VEC_MERGE (GET_MODE (op0),
1621 gen_rtx_VEC_DUPLICATE (GET_MODE
1622 (op0), op1),
1623 op2, exec)));
1624 else
1625 return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
1626 }
1627
1628 /* Expand vector init of OP0 by VEC.
1629 Implements vec_init instruction pattern. */
1630
1631 void
1632 gcn_expand_vector_init (rtx op0, rtx vec)
1633 {
1634 int64_t initialized_mask = 0;
1635 int64_t curr_mask = 1;
1636 machine_mode mode = GET_MODE (op0);
1637
1638 rtx val = XVECEXP (vec, 0, 0);
1639
1640 for (int i = 1; i < 64; i++)
1641 if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
1642 curr_mask |= (int64_t) 1 << i;
1643
1644 if (gcn_constant_p (val))
1645 emit_move_insn (op0, gcn_vec_constant (mode, val));
1646 else
1647 {
1648 val = force_reg (GET_MODE_INNER (mode), val);
1649 emit_insn (gen_duplicate_load (op0, val));
1650 }
1651 initialized_mask |= curr_mask;
1652 for (int i = 1; i < 64; i++)
1653 if (!(initialized_mask & ((int64_t) 1 << i)))
1654 {
1655 curr_mask = (int64_t) 1 << i;
1656 rtx val = XVECEXP (vec, 0, i);
1657
1658 for (int j = i + 1; j < 64; j++)
1659 if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
1660 curr_mask |= (int64_t) 1 << j;
1661 if (gcn_constant_p (val))
1662 emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
1663 get_exec (curr_mask)));
1664 else
1665 {
1666 val = force_reg (GET_MODE_INNER (mode), val);
1667 emit_insn (gen_duplicate_load (op0, val, op0,
1668 get_exec (curr_mask)));
1669 }
1670 initialized_mask |= curr_mask;
1671 }
1672 }
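/* For example, initializing a vector from {A, B, A, B, ...} emits one
   unconditional broadcast of A to all 64 lanes, then a single move of B
   masked by exec = 0xaaaaaaaaaaaaaaaa; every further distinct value
   costs one more masked move.  */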
1673
1674 /* Load vector constant where n-th lane contains BASE+n*VAL. */
1675
1676 static rtx
1677 strided_constant (machine_mode mode, int base, int val)
1678 {
1679 rtx x = gen_reg_rtx (mode);
1680 emit_move_insn (x, gcn_vec_constant (mode, base));
1681 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1682 x, get_exec (0xffffffff00000000)));
1683 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1684 x, get_exec (0xffff0000ffff0000)));
1685 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1686 x, get_exec (0xff00ff00ff00ff00)));
1687 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1688 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1689 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1690 x, get_exec (0xcccccccccccccccc)));
1691 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1692 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1693 return x;
1694 }
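/* A host-side sketch of the result (illustrative only):

     int lane[64];
     for (int n = 0; n < 64; n++)
       lane[n] = base + n * val;

   The six masked adds above build this ramp in log2(64) steps: lanes
   whose index has bit K set receive an extra val << K.  */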
1695
1696 /* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
1697
1698 static rtx
1699 gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1700 addr_space_t as)
1701 {
1702 switch (as)
1703 {
1704 case ADDR_SPACE_DEFAULT:
1705 return gcn_addr_space_legitimize_address (x, old, mode,
1706 DEFAULT_ADDR_SPACE);
1707 case ADDR_SPACE_SCALAR_FLAT:
1708 case ADDR_SPACE_SCRATCH:
1709 /* Instructions working on vectors need the address to be in
1710 a register. */
1711 if (vgpr_vector_mode_p (mode))
1712 return force_reg (GET_MODE (x), x);
1713
1714 return x;
1715 case ADDR_SPACE_FLAT:
1716 case ADDR_SPACE_FLAT_SCRATCH:
1717 case ADDR_SPACE_GLOBAL:
1718 return TARGET_GCN3 ? force_reg (DImode, x) : x;
1719 case ADDR_SPACE_LDS:
1720 case ADDR_SPACE_GDS:
1721 /* FIXME: LDS supports offsets; handle them! */
1722 if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1723 {
1724 rtx addrs = gen_reg_rtx (V64SImode);
1725 rtx base = force_reg (SImode, x);
1726 rtx offsets = strided_constant (V64SImode, 0,
1727 GET_MODE_UNIT_SIZE (mode));
1728
1729 emit_insn (gen_vec_duplicatev64si (addrs, base));
1730 emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1731 return addrs;
1732 }
1733 return x;
1734 }
1735 gcc_unreachable ();
1736 }
1737
1738 /* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1739 proper vector of stepped addresses.
1740
1741 MEM will be a DImode address of a vector in an SGPR.
1742 TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
1743
1744 rtx
1745 gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1746 rtx tmp)
1747 {
1748 gcc_assert (MEM_P (mem));
1749 rtx mem_base = XEXP (mem, 0);
1750 rtx mem_index = NULL_RTX;
1751
1752 if (!TARGET_GCN5_PLUS)
1753 {
1754 /* gcn_addr_space_legitimize_address should have put the address in a
1755 register. If not, it is too late to do anything about it. */
1756 gcc_assert (REG_P (mem_base));
1757 }
1758
1759 if (GET_CODE (mem_base) == PLUS)
1760 {
1761 mem_index = XEXP (mem_base, 1);
1762 mem_base = XEXP (mem_base, 0);
1763 }
1764
1765 /* RF and RM base registers for vector modes should always be an SGPR. */
1766 gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1767 || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1768
1769 machine_mode inner = GET_MODE_INNER (mode);
1770 int shift = exact_log2 (GET_MODE_SIZE (inner));
1771 rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1772 rtx undef_v64si = gcn_gen_undef (V64SImode);
1773 rtx new_base = NULL_RTX;
1774 addr_space_t as = MEM_ADDR_SPACE (mem);
1775
1776 rtx tmplo = (REG_P (tmp)
1777 ? gcn_operand_part (V64DImode, tmp, 0)
1778 : gen_reg_rtx (V64SImode));
1779
1780 /* tmplo[:] = ramp[:] << shift */
1781 if (exec)
1782 emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1783 gen_int_mode (shift, SImode),
1784 undef_v64si, exec));
1785 else
1786 emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1787
1788 if (AS_FLAT_P (as))
1789 {
1790 rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1791
1792 if (REG_P (tmp))
1793 {
1794 rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1795 rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1796 rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1797
1798 /* tmphi[:] = mem_base_hi */
1799 if (exec)
1800 emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1801 undef_v64si, exec));
1802 else
1803 emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1804
1805 /* tmp[:] += zext (mem_base) */
1806 if (exec)
1807 {
1808 emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1809 vcc, undef_v64si, exec));
1810 emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1811 vcc, vcc, undef_v64si, exec));
1812 }
1813 else
1814 emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
1815 }
1816 else
1817 {
1818 tmp = gen_reg_rtx (V64DImode);
1819 if (exec)
1820 emit_insn (gen_addv64di3_vcc_zext_dup2_exec
1821 (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
1822 exec));
1823 else
1824 emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
1825 }
1826
1827 new_base = tmp;
1828 }
1829 else if (AS_ANY_DS_P (as))
1830 {
1831 if (!exec)
1832 emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1833 else
1834 emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1835 gcn_gen_undef (V64SImode), exec));
1836 new_base = tmplo;
1837 }
1838 else
1839 {
1840 mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1841 new_base = gen_rtx_PLUS (V64DImode, mem_base,
1842 gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1843 }
1844
1845 return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1846 gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1847 (mem_index ? mem_index
1848 : const0_rtx)));
1849 }
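/* Net effect (sketch): with S == GET_MODE_SIZE of the inner mode, and
   assuming VGPR v1 holds the lane numbers 0..63 as the RAMP above, the
   returned address evaluates per lane N to

     addr[N] = mem_base + (N << log2 (S)) + mem_index

   so consecutive lanes address consecutive vector elements.  */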
1850
1851 /* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
1852 suitable for the given address space. This is intended for use in
1853 gather/scatter patterns.
1854
1855 The offsets may be signed or unsigned, according to UNSIGNED_P.
1856 If EXEC is set then _exec patterns will be used, otherwise plain.
1857
1858 Return values.
1859 ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
1860 ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
1861
1862 rtx
1863 gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1864 bool unsigned_p, rtx exec)
1865 {
1866 rtx tmpsi = gen_reg_rtx (V64SImode);
1867 rtx tmpdi = gen_reg_rtx (V64DImode);
1868 rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1869 rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1870
1871 if (CONST_INT_P (scale)
1872 && INTVAL (scale) > 0
1873 && exact_log2 (INTVAL (scale)) >= 0)
1874 emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1875 GEN_INT (exact_log2 (INTVAL (scale)))));
1876 else
1877 (exec
1878 ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1879 exec))
1880 : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1881
1882 /* "Global" instructions do not support negative register offsets. */
1883 if (as == ADDR_SPACE_FLAT || !unsigned_p)
1884 {
1885 if (unsigned_p)
1886 (exec
1887 ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1888 undefdi, exec))
1889 : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1890 else
1891 (exec
1892 ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1893 undefdi, exec))
1894 : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1895 return tmpdi;
1896 }
1897 else if (as == ADDR_SPACE_GLOBAL)
1898 return tmpsi;
1899
1900 gcc_unreachable ();
1901 }
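
/* Illustrative sketch (not part of the port): with a power-of-two SCALE
   the sequence above forms each lane's address as

     addr[lane] = base + ((uint64_t) offsets[lane] << exact_log2 (scale))

   so a gather of 32-bit elements (SCALE == 4) from BASE with
   OFFSETS = {0, 1, 2, ...} touches BASE+0, BASE+4, BASE+8, ...  */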

/* Return true if move from OP0 to OP1 is known to be executed in vector
   unit.  */

bool
gcn_vgpr_move_p (rtx op0, rtx op1)
{
  if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
    return true;
  if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
    return true;
  return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
	  || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
	  || vgpr_vector_mode_p (GET_MODE (op0)));
}

/* Return true if move from OP0 to OP1 is known to be executed in scalar
   unit.  Used in the machine description.  */

bool
gcn_sgpr_move_p (rtx op0, rtx op1)
{
  if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
    return true;
  if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
    return true;
  if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
      || VGPR_REGNO_P (REGNO (op0)))
    return false;
  if (REG_P (op1)
      && REGNO (op1) < FIRST_PSEUDO_REGISTER
      && !VGPR_REGNO_P (REGNO (op1)))
    return true;
  return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
}

/* Implement TARGET_SECONDARY_RELOAD.

   The address space determines which registers can be used for loads and
   stores.  */

static reg_class_t
gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
		      machine_mode reload_mode, secondary_reload_info *sri)
{
  reg_class_t result = NO_REGS;
  bool spilled_pseudo =
    (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "gcn_secondary_reload: ");
      dump_value_slim (dump_file, x, 1);
      fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
	       reg_class_names[rclass], GET_MODE_NAME (reload_mode));
      if (REG_P (x) || GET_CODE (x) == SUBREG)
	fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
		 (true_regnum (x) >= 0
		  && true_regnum (x) < FIRST_PSEUDO_REGISTER
		  ? reg_names[true_regnum (x)]
		  : (spilled_pseudo ? "stack spill" : "??")));
      fprintf (dump_file, "\n");
    }

  /* Some callers don't use or initialize icode.  */
  sri->icode = CODE_FOR_nothing;

  if (MEM_P (x) || spilled_pseudo)
    {
      addr_space_t as = DEFAULT_ADDR_SPACE;

      /* If we have a spilled pseudo, we can't find the address space
	 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
	 ADDR_SPACE_GLOBAL for GCN5.  */
      if (MEM_P (x))
	as = MEM_ADDR_SPACE (x);

      if (as == ADDR_SPACE_DEFAULT)
	as = DEFAULT_ADDR_SPACE;

      switch (as)
	{
	case ADDR_SPACE_SCALAR_FLAT:
	  result =
	    ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
	  break;
	case ADDR_SPACE_FLAT:
	case ADDR_SPACE_FLAT_SCRATCH:
	case ADDR_SPACE_GLOBAL:
	  if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
	      || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
	    {
	      if (in_p)
		switch (reload_mode)
		  {
		  case E_V64SImode:
		    sri->icode = CODE_FOR_reload_inv64si;
		    break;
		  case E_V64SFmode:
		    sri->icode = CODE_FOR_reload_inv64sf;
		    break;
		  case E_V64HImode:
		    sri->icode = CODE_FOR_reload_inv64hi;
		    break;
		  case E_V64HFmode:
		    sri->icode = CODE_FOR_reload_inv64hf;
		    break;
		  case E_V64QImode:
		    sri->icode = CODE_FOR_reload_inv64qi;
		    break;
		  case E_V64DImode:
		    sri->icode = CODE_FOR_reload_inv64di;
		    break;
		  case E_V64DFmode:
		    sri->icode = CODE_FOR_reload_inv64df;
		    break;
		  default:
		    gcc_unreachable ();
		  }
	      else
		switch (reload_mode)
		  {
		  case E_V64SImode:
		    sri->icode = CODE_FOR_reload_outv64si;
		    break;
		  case E_V64SFmode:
		    sri->icode = CODE_FOR_reload_outv64sf;
		    break;
		  case E_V64HImode:
		    sri->icode = CODE_FOR_reload_outv64hi;
		    break;
		  case E_V64HFmode:
		    sri->icode = CODE_FOR_reload_outv64hf;
		    break;
		  case E_V64QImode:
		    sri->icode = CODE_FOR_reload_outv64qi;
		    break;
		  case E_V64DImode:
		    sri->icode = CODE_FOR_reload_outv64di;
		    break;
		  case E_V64DFmode:
		    sri->icode = CODE_FOR_reload_outv64df;
		    break;
		  default:
		    gcc_unreachable ();
		  }
	      break;
	    }
	  /* Fallthrough.  */
	case ADDR_SPACE_LDS:
	case ADDR_SPACE_GDS:
	case ADDR_SPACE_SCRATCH:
	  result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
	  break;
	}
    }

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
	     get_insn_name (sri->icode));

  return result;
}

/* Update register usage after having seen the compiler flags and kernel
   attributes.  We typically want to fix registers that contain values
   set by the HSA runtime.  */

static void
gcn_conditional_register_usage (void)
{
  if (!cfun || !cfun->machine)
    return;

  if (cfun->machine->normal_function)
    {
      /* Restrict the set of SGPRs and VGPRs used by non-kernel functions.  */
      for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT - 2);
	   i <= LAST_SGPR_REG; i++)
	fixed_regs[i] = 1, call_used_regs[i] = 1;

      for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
	   i <= LAST_VGPR_REG; i++)
	fixed_regs[i] = 1, call_used_regs[i] = 1;

      return;
    }

  /* If the set of requested args is the default set, nothing more needs to
     be done.  */
  if (cfun->machine->args.requested == default_requested_args)
    return;
  /* Requesting a set of args different from the default violates the ABI.  */
  if (!leaf_function_p ())
    warning (0, "a non-default set of initial values has been requested, "
	     "which violates the ABI");

  for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
    fixed_regs[i] = 0;

  /* Fix the runtime argument registers containing values that may be
     needed later.  DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
     needed after the prologue so there's no need to fix them.  */
  if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
    fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
  if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
    {
      /* Only the lower 64 bits of the descriptor are needed here, so the
	 registers holding the upper part remain available for other
	 purposes.  */
      fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
      fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
    }
  if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
    {
      fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
      fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
    }
  if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
    {
      fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
      fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
    }
  if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
    fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
  if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
    fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
  if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
    fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
  if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
    fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;

  if (TARGET_GCN5_PLUS)
    /* v0 is always zero, for global null offsets.  */
    fixed_regs[VGPR_REGNO (0)] = 1;
}

/* Determine if a load or store is valid, according to the register classes
   and address space.  Used primarily by the machine description to decide
   when to split a move into two steps.  */

bool
gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
{
  if (!MEM_P (dest) && !MEM_P (src))
    return true;

  if (MEM_P (dest)
      && AS_FLAT_P (MEM_ADDR_SPACE (dest))
      && (gcn_flat_address_p (XEXP (dest, 0), mode)
	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
      && gcn_vgpr_register_operand (src, mode))
    return true;
  else if (MEM_P (src)
	   && AS_FLAT_P (MEM_ADDR_SPACE (src))
	   && (gcn_flat_address_p (XEXP (src, 0), mode)
	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
	   && gcn_vgpr_register_operand (dest, mode))
    return true;

  if (MEM_P (dest)
      && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
      && (gcn_global_address_p (XEXP (dest, 0))
	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
      && gcn_vgpr_register_operand (src, mode))
    return true;
  else if (MEM_P (src)
	   && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
	   && (gcn_global_address_p (XEXP (src, 0))
	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
	   && gcn_vgpr_register_operand (dest, mode))
    return true;

  if (MEM_P (dest)
      && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
      && (gcn_scalar_flat_address_p (XEXP (dest, 0))
	  || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
	  || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
      && gcn_ssrc_register_operand (src, mode))
    return true;
  else if (MEM_P (src)
	   && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
	   && (gcn_scalar_flat_address_p (XEXP (src, 0))
	       || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
	       || GET_CODE (XEXP (src, 0)) == LABEL_REF)
	   && gcn_sdst_register_operand (dest, mode))
    return true;

  if (MEM_P (dest)
      && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
      && gcn_ds_address_p (XEXP (dest, 0))
      && gcn_vgpr_register_operand (src, mode))
    return true;
  else if (MEM_P (src)
	   && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
	   && gcn_ds_address_p (XEXP (src, 0))
	   && gcn_vgpr_register_operand (dest, mode))
    return true;

  return false;
}

/* }}} */
/* {{{ Functions and ABI. */

/* Implement TARGET_FUNCTION_VALUE.

   Define how to find the value returned by a function.
   The register location is always the same, but the mode depends on
   VALTYPE.  */

static rtx
gcn_function_value (const_tree valtype, const_tree, bool)
{
  machine_mode mode = TYPE_MODE (valtype);

  if (INTEGRAL_TYPE_P (valtype)
      && GET_MODE_CLASS (mode) == MODE_INT
      && GET_MODE_SIZE (mode) < 4)
    mode = SImode;

  return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.

   Return true if N is a possible register number for the function return
   value.  */

static bool
gcn_function_value_regno_p (const unsigned int n)
{
  return n == RETURN_VALUE_REG;
}

/* Calculate the number of registers required to hold function argument
   ARG.  */

static int
num_arg_regs (const function_arg_info &arg)
{
  if (targetm.calls.must_pass_in_stack (arg))
    return 0;

  int size = arg.promoted_size_in_bytes ();
  return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
}
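
/* Illustrative example (not part of the port), assuming 32-bit registers
   (UNITS_PER_WORD == 4): a 12-byte argument needs

     (12 + 4 - 1) / 4 == 3

   consecutive registers, while an 8-byte argument needs 2.  */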

/* Implement TARGET_STRICT_ARGUMENT_NAMING.

   Return true if the location where a function argument is passed
   depends on whether or not it is a named argument.

   For gcn, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
gcn_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.

   See comment on gcn_strict_argument_naming.  */

static bool
gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
{
  return !gcn_strict_argument_naming (cum_v);
}

/* Implement TARGET_FUNCTION_ARG.

   Return an RTX indicating whether a function argument is passed in a register
   and if so, which register.  */

static rtx
gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (cum->normal_function)
    {
      if (!arg.named || arg.end_marker_p ())
	return 0;

      if (targetm.calls.must_pass_in_stack (arg))
	return 0;

      int reg_num = FIRST_PARM_REG + cum->num;
      int num_regs = num_arg_regs (arg);
      if (num_regs > 0)
	while (reg_num % num_regs != 0)
	  reg_num++;
      if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
	return gen_rtx_REG (arg.mode, reg_num);
    }
  else
    {
      if (cum->num >= cum->args.nargs)
	{
	  cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
	    & -(TYPE_ALIGN (arg.type) / 8);
	  cfun->machine->kernarg_segment_alignment
	    = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
		   TYPE_ALIGN (arg.type) / 8);
	  rtx addr = gen_rtx_REG (DImode,
				  cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
	  if (cum->offset)
	    addr = gen_rtx_PLUS (DImode, addr,
				 gen_int_mode (cum->offset, DImode));
	  rtx mem = gen_rtx_MEM (arg.mode, addr);
	  set_mem_attributes (mem, arg.type, 1);
	  set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
	  MEM_READONLY_P (mem) = 1;
	  return mem;
	}

      int a = cum->args.order[cum->num];
      if (arg.mode != gcn_kernel_arg_types[a].mode)
	{
	  error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
	  return 0;
	}
      return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
			  cum->args.reg[a]);
    }
  return 0;
}
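
/* Illustrative example (not part of the port): the alignment loop above
   rounds the candidate register number up to a multiple of the register
   count, so (assuming FIRST_PARM_REG itself is suitably aligned) a DImode
   argument arriving when cum->num is odd skips one register:

     reg_num = FIRST_PARM_REG + 3;  num_regs = 2;
     while (reg_num % num_regs != 0)
       reg_num++;			   ==> FIRST_PARM_REG + 4  */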

/* Implement TARGET_FUNCTION_ARG_ADVANCE.

   Updates the summarizer variable pointed to by CUM_V to advance past an
   argument in the argument list.  */

static void
gcn_function_arg_advance (cumulative_args_t cum_v,
			  const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (cum->normal_function)
    {
      if (!arg.named)
	return;

      int num_regs = num_arg_regs (arg);
      if (num_regs > 0)
	while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
	  cum->num++;
      cum->num += num_regs;
    }
  else
    {
      if (cum->num < cum->args.nargs)
	cum->num++;
      else
	{
	  cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
	  cfun->machine->kernarg_segment_byte_size = cum->offset;
	}
    }
}

/* Implement TARGET_ARG_PARTIAL_BYTES.

   Returns the number of bytes at the beginning of an argument that must be put
   in registers.  The value must be zero for arguments that are passed entirely
   in registers or that are entirely pushed on the stack.  */

static int
gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (!arg.named)
    return 0;

  if (targetm.calls.must_pass_in_stack (arg))
    return 0;

  if (cum->num >= NUM_PARM_REGS)
    return 0;

  /* If the argument fits entirely in registers, return 0.  */
  if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
    return 0;

  return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
}
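
/* Illustrative example (not part of the port): if only two of the
   NUM_PARM_REGS argument registers remain and the next argument needs
   four, the function returns 2 * UNITS_PER_WORD, i.e. the first 8 bytes
   (with 4-byte words) travel in registers and the remainder goes on the
   stack.  */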

/* A normal function which takes a pointer argument (to a scalar) may be
   passed a pointer to LDS space (via a high-bits-set aperture), and that only
   works with FLAT addressing, not GLOBAL.  Force FLAT addressing if the
   function has an incoming pointer-to-scalar parameter.  */

static void
gcn_detect_incoming_pointer_arg (tree fndecl)
{
  gcc_assert (cfun && cfun->machine);

  for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
       arg;
       arg = TREE_CHAIN (arg))
    if (POINTER_TYPE_P (TREE_VALUE (arg))
	&& !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg))))
      cfun->machine->use_flat_addressing = true;
}

/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.

   Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
   whose data type is FNTYPE.  For a library call, FNTYPE is 0.  */

void
gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
			  tree fntype /* tree ptr for function decl */ ,
			  rtx libname /* SYMBOL_REF of library name or 0 */ ,
			  tree fndecl, int caller)
{
  memset (cum, 0, sizeof (*cum));
  cum->fntype = fntype;
  if (libname)
    {
      gcc_assert (cfun && cfun->machine);
      cum->normal_function = true;
      if (!caller)
	{
	  cfun->machine->normal_function = true;
	  gcn_detect_incoming_pointer_arg (fndecl);
	}
      return;
    }
  tree attr = NULL;
  if (fndecl)
    attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
  if (fndecl && !attr)
    attr = lookup_attribute ("amdgpu_hsa_kernel",
			     TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
  if (!attr && fntype)
    attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
  /* Handle main () as kernel, so we can run testsuite.
     Handle OpenACC kernels similarly to main.  */
  if (!attr && !caller && fndecl
      && (MAIN_NAME_P (DECL_NAME (fndecl))
	  || lookup_attribute ("omp target entrypoint",
			       DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
    gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
  else
    {
      if (!attr || caller)
	{
	  gcc_assert (cfun && cfun->machine);
	  cum->normal_function = true;
	  if (!caller)
	    cfun->machine->normal_function = true;
	}
      gcn_parse_amdgpu_hsa_kernel_attribute
	(&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
    }
  cfun->machine->args = cum->args;
  if (!caller && cfun->machine->normal_function)
    gcn_detect_incoming_pointer_arg (fndecl);

  reinit_regs ();
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
{
  machine_mode mode = TYPE_MODE (type);
  HOST_WIDE_INT size = int_size_in_bytes (type);

  if (AGGREGATE_TYPE_P (type))
    return true;

  if (mode == BLKmode)
    return true;

  if (size > 2 * UNITS_PER_WORD)
    return true;

  return false;
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.

   Return the mode to use for outgoing function arguments.  */

machine_mode
gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
			   int *ARG_UNUSED (punsignedp),
			   const_tree ARG_UNUSED (funtype),
			   int ARG_UNUSED (for_return))
{
  if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
    return SImode;

  return mode;
}
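
/* Illustrative example (not part of the port): under this rule a QImode
   or HImode scalar argument or return value is widened to SImode, so a
   call passing (char) 'x' actually materializes the value in a full
   32-bit register.  */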

/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.

   Derived from hppa_gimplify_va_arg_expr.  The generic routine doesn't handle
   ARGS_GROW_DOWNWARDS.  */

static tree
gcn_gimplify_va_arg_expr (tree valist, tree type,
			  gimple_seq *ARG_UNUSED (pre_p),
			  gimple_seq *ARG_UNUSED (post_p))
{
  tree ptr = build_pointer_type (type);
  tree valist_type;
  tree t, u;
  bool indirect;

  indirect = pass_va_arg_by_reference (type);
  if (indirect)
    {
      type = ptr;
      ptr = build_pointer_type (type);
    }
  valist_type = TREE_TYPE (valist);

  /* Args grow down.  Not handled by generic routines.  */

  u = fold_convert (sizetype, size_in_bytes (type));
  u = fold_build1 (NEGATE_EXPR, sizetype, u);
  t = fold_build_pointer_plus (valist, u);

  /* Align to 8 byte boundary.  */

  u = build_int_cst (TREE_TYPE (t), -8);
  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
  t = fold_convert (valist_type, t);

  t = build2 (MODIFY_EXPR, valist_type, valist, t);

  t = fold_convert (ptr, t);
  t = build_va_arg_indirect_ref (t);

  if (indirect)
    t = build_va_arg_indirect_ref (t);

  return t;
}
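
/* Illustrative sketch (not part of the port) of the pointer arithmetic
   built above, written as plain C:

     valist = (char *) valist - sizeof (type);	   // args grow down
     valist = (char *) ((uintptr_t) valist & -8);  // align down to 8 bytes
     result = *(type *) valist;

   with one extra dereference when the argument is passed by reference.  */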

/* Return 1 if TRAIT NAME is present in the OpenMP context's
   device trait set, return 0 if not present in any OpenMP context in the
   whole translation unit, or -1 if not present in the current OpenMP context
   but might be present in another OpenMP context in the same TU.  */

int
gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
			      const char *name)
{
  switch (trait)
    {
    case omp_device_kind:
      return strcmp (name, "gpu") == 0;
    case omp_device_arch:
      return strcmp (name, "gcn") == 0;
    case omp_device_isa:
      if (strcmp (name, "fiji") == 0)
	return gcn_arch == PROCESSOR_FIJI;
      if (strcmp (name, "gfx900") == 0)
	return gcn_arch == PROCESSOR_VEGA;
      if (strcmp (name, "gfx906") == 0)
	return gcn_arch == PROCESSOR_VEGA;
      return 0;
    default:
      gcc_unreachable ();
    }
}

/* Calculate stack offsets needed to create prologues and epilogues.  */

static struct machine_function *
gcn_compute_frame_offsets (void)
{
  machine_function *offsets = cfun->machine;

  if (reload_completed)
    return offsets;

  offsets->need_frame_pointer = frame_pointer_needed;

  offsets->outgoing_args_size = crtl->outgoing_args_size;
  offsets->pretend_size = crtl->args.pretend_args_size;

  offsets->local_vars = get_frame_size ();

  offsets->lr_needs_saving = (!leaf_function_p ()
			      || df_regs_ever_live_p (LR_REGNUM)
			      || df_regs_ever_live_p (LR_REGNUM + 1));

  offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;

  for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
    if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
	|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
	    && frame_pointer_needed))
      offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);

  /* Round up to 64-bit boundary to maintain stack alignment.  */
  offsets->callee_saves = (offsets->callee_saves + 7) & ~7;

  return offsets;
}
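
/* Illustrative example (not part of the port): each live callee-saved
   scalar register adds 4 bytes to callee_saves and each vector register
   adds 256 bytes (64 lanes x 4 bytes), so saving LR plus one VGPR gives
   8 + 256 == 264 bytes, which is already 64-bit aligned.  */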

/* Insert code into the prologue or epilogue to store or load any
   callee-save register to/from the stack.

   Helper function for gcn_expand_prologue and gcn_expand_epilogue.  */

static void
move_callee_saved_registers (rtx sp, machine_function *offsets,
			     bool prologue)
{
  int regno, offset, saved_scalars;
  rtx exec = gen_rtx_REG (DImode, EXEC_REG);
  rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
  rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
  rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
  HOST_WIDE_INT exec_set = 0;
  int offreg_set = 0;

  start_sequence ();

  /* Move scalars into two vector registers.  */
  for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
    if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
	|| ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
	|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
	    && offsets->need_frame_pointer))
      {
	rtx reg = gen_rtx_REG (SImode, regno);
	rtx vreg = gen_rtx_REG (V64SImode,
				VGPR_REGNO (6 + (saved_scalars / 64)));
	int lane = saved_scalars % 64;

	if (prologue)
	  emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
	else
	  emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));

	saved_scalars++;
      }

  rtx move_scalars = get_insns ();
  end_sequence ();
  start_sequence ();

  /* Ensure that all vector lanes are moved.  */
  exec_set = -1;
  emit_move_insn (exec, GEN_INT (exec_set));

  /* Set up a vector stack pointer.  */
  rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
  rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
  emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
				  gcn_gen_undef (V64SImode), exec));
  rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
  emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
					  exec));
  emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
				     gcn_operand_part (V64SImode, vsp, 0),
				     _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
				     exec));
  emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
				  gcn_operand_part (V64SImode, vsp, 1),
				  const0_rtx, vcc, vcc,
				  gcn_gen_undef (V64SImode), exec));

  /* Move vectors.  */
  for (regno = FIRST_VGPR_REG, offset = offsets->pretend_size;
       regno < FIRST_PSEUDO_REGISTER; regno++)
    if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
	|| (regno == VGPR_REGNO (6) && saved_scalars > 0)
	|| (regno == VGPR_REGNO (7) && saved_scalars > 63))
      {
	rtx reg = gen_rtx_REG (V64SImode, regno);
	int size = 256;

	if (regno == VGPR_REGNO (6) && saved_scalars < 64)
	  size = saved_scalars * 4;
	else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
	  size = (saved_scalars - 64) * 4;

	if (size != 256 || exec_set != -1)
	  {
	    exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
	    emit_move_insn (exec, gen_int_mode (exec_set, DImode));
	  }

	if (prologue)
	  emit_insn (gen_scatterv64si_insn_1offset_exec (vsp, const0_rtx, reg,
							 as, const0_rtx,
							 exec));
	else
	  emit_insn (gen_gatherv64si_insn_1offset_exec
		     (reg, vsp, const0_rtx, as, const0_rtx,
		      gcn_gen_undef (V64SImode), exec));

	/* Move our VSP to the next stack entry.  */
	if (offreg_set != size)
	  {
	    offreg_set = size;
	    emit_move_insn (offreg, GEN_INT (size));
	  }
	if (exec_set != -1)
	  {
	    exec_set = -1;
	    emit_move_insn (exec, GEN_INT (exec_set));
	  }
	emit_insn (gen_addv64si3_vcc_dup_exec
		   (gcn_operand_part (V64SImode, vsp, 0),
		    offreg, gcn_operand_part (V64SImode, vsp, 0),
		    vcc, gcn_gen_undef (V64SImode), exec));
	emit_insn (gen_addcv64si3_exec
		   (gcn_operand_part (V64SImode, vsp, 1),
		    gcn_operand_part (V64SImode, vsp, 1),
		    const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));

	offset += size;
      }

  rtx move_vectors = get_insns ();
  end_sequence ();

  if (prologue)
    {
      emit_insn (move_scalars);
      emit_insn (move_vectors);
    }
  else
    {
      emit_insn (move_vectors);
      emit_insn (move_scalars);
    }
}

/* Generate prologue.  Called from gen_prologue during pro_and_epilogue pass.

   For a non-kernel function, the stack layout looks like this (interim),
   growing *upwards*:

   hi | + ...
      |__________________| <-- current SP
      | outgoing args    |
      |__________________|
      | (alloca space)   |
      |__________________|
      | local vars       |
      |__________________| <-- FP/hard FP
      | callee-save regs |
      |__________________| <-- soft arg pointer
      | pretend args     |
      |__________________| <-- incoming SP
      | incoming args    |
   lo |..................|

   This implies arguments (beyond the first N in registers) must grow
   downwards (as, apparently, PA has them do).

   For a kernel function we have the simpler:

   hi | + ...
      |__________________| <-- current SP
      | outgoing args    |
      |__________________|
      | (alloca space)   |
      |__________________|
      | local vars       |
   lo |__________________| <-- FP/hard FP

*/
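
/* Illustrative example (not part of the port): for a normal function the
   prologue below advances SP past the whole frame at once,

     sp_adjust = pretend_size + callee_saves
		 + local_vars + outgoing_args_size;

   and then places the frame pointer local_vars + outgoing_args_size bytes
   back from the new SP, matching the diagram above.  */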

void
gcn_expand_prologue ()
{
  machine_function *offsets = gcn_compute_frame_offsets ();

  if (!cfun || !cfun->machine || cfun->machine->normal_function)
    {
      rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
      rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);

      start_sequence ();

      if (offsets->pretend_size > 0)
	{
	  /* FIXME: Do the actual saving of register pretend args to the
	     stack.  Register order needs consideration.  */
	}

      /* Save callee-save regs.  */
      move_callee_saved_registers (sp, offsets, true);

      HOST_WIDE_INT sp_adjust = offsets->pretend_size
	+ offsets->callee_saves
	+ offsets->local_vars + offsets->outgoing_args_size;
      if (sp_adjust > 0)
	emit_insn (gen_adddi3_scc (sp, sp, gen_int_mode (sp_adjust, DImode)));

      if (offsets->need_frame_pointer)
	emit_insn (gen_adddi3_scc (fp, sp,
				   gen_int_mode
				   (-(offsets->local_vars +
				      offsets->outgoing_args_size),
				    DImode)));

      rtx_insn *seq = get_insns ();
      end_sequence ();

      /* FIXME: Prologue insns should have this flag set for debug output, etc.
	 but it causes issues for now.
      for (insn = seq; insn; insn = NEXT_INSN (insn))
	if (INSN_P (insn))
	  RTX_FRAME_RELATED_P (insn) = 1;*/

      emit_insn (seq);
    }
  else
    {
      rtx wave_offset = gen_rtx_REG (SImode,
				     cfun->machine->args.
				     reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);

      if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
	{
	  rtx fs_init_lo =
	    gen_rtx_REG (SImode,
			 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
	  rtx fs_init_hi =
	    gen_rtx_REG (SImode,
			 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
	  rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
	  rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);

	  /*rtx queue = gen_rtx_REG (DImode,
				     cfun->machine->args.reg[QUEUE_PTR_ARG]);
	  rtx aperture = gen_rtx_MEM (SImode,
				      gen_rtx_PLUS (DImode, queue,
						    gen_int_mode (68,
								  SImode)));
	  set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/

	  /* Set up flat_scratch.  */
	  emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
	  emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
				      gen_int_mode (8, SImode)));
	  emit_move_insn (fs_reg_lo, fs_init_hi);
	}

      /* Set up frame pointer and stack pointer.  */
      rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
      rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
      rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
      rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);

      HOST_WIDE_INT sp_adjust = (offsets->local_vars
				 + offsets->outgoing_args_size);

      /* Initialise FP and SP from the buffer descriptor in s[0:3].  */
      emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
      emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
				 gen_int_mode (0xffff, SImode)));
      rtx scc = gen_rtx_REG (BImode, SCC_REG);
      emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
      emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));

      if (sp_adjust > 0)
	emit_insn (gen_adddi3_scc (sp, fp, gen_int_mode (sp_adjust, DImode)));
      else
	emit_move_insn (sp, fp);

      /* Make sure the flat scratch reg doesn't get optimised away.  */
      emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
    }

  /* Ensure that the scheduler doesn't do anything unexpected.  */
  emit_insn (gen_blockage ());

  /* m0 is initialized for the usual LDS, DS and FLAT memory case.
     The low-part is the address of the topmost addressable byte, which is
     size-1.  The high-part is an offset and should be zero.  */
  emit_move_insn (gen_rtx_REG (SImode, M0_REG),
		  gen_int_mode (LDS_SIZE - 1, SImode));

  emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));

  if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
    {
      /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel.  */
      rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
      emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
						  "gomp_gcn_enter_kernel"));
      emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
    }
}

/* Generate epilogue.  Called from gen_epilogue during pro_and_epilogue pass.

   See gcn_expand_prologue for stack details.  */

void
gcn_expand_epilogue (void)
{
  /* Ensure that the scheduler doesn't do anything unexpected.  */
  emit_insn (gen_blockage ());

  if (!cfun || !cfun->machine || cfun->machine->normal_function)
    {
      machine_function *offsets = gcn_compute_frame_offsets ();
      rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
      rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);

      HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;

      if (offsets->need_frame_pointer)
	{
	  /* Restore old SP from the frame pointer.  */
	  if (sp_adjust > 0)
	    emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
	  else
	    emit_move_insn (sp, fp);
	}
      else
	{
	  /* Restore old SP from current SP.  */
	  sp_adjust += offsets->outgoing_args_size + offsets->local_vars;

	  if (sp_adjust > 0)
	    emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
	}

      move_callee_saved_registers (sp, offsets, false);

      /* There's no explicit use of the link register on the return insn.
	 Emit one here instead.  */
      if (offsets->lr_needs_saving)
	emit_use (gen_rtx_REG (DImode, LINK_REGNUM));

      /* Similar for frame pointer.  */
      if (offsets->need_frame_pointer)
	emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
    }
  else if (flag_openmp)
    {
      /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel.  */
      rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
      emit_move_insn (fn_reg,
		      gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
      emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
    }
  else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
    {
      /* Assume that an exit value compatible with gcn-run is expected.
	 That is, the third input parameter is an int*.

	 We can't allocate any new registers, but the kernarg_reg is
	 dead after this, so we'll use that.  */
      rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
				     [KERNARG_SEGMENT_PTR_ARG]);
      rtx retptr_mem = gen_rtx_MEM (DImode,
				    gen_rtx_PLUS (DImode, kernarg_reg,
						  GEN_INT (16)));
      set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
      emit_move_insn (kernarg_reg, retptr_mem);

      rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
      set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
      emit_move_insn (retval_mem,
		      gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
    }

  emit_jump_insn (gen_gcn_return ());
}

/* Implement TARGET_CAN_ELIMINATE.

   Return true if the compiler is allowed to try to replace register number
   FROM_REG with register number TO_REG.

   FIXME: is the default "true" not enough? Should this be a negative set?  */

bool
gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
{
  return (to_reg == HARD_FRAME_POINTER_REGNUM
	  || to_reg == STACK_POINTER_REGNUM);
}

/* Implement INITIAL_ELIMINATION_OFFSET.

   Returns the initial difference between the specified pair of registers, in
   terms of stack position.  */

HOST_WIDE_INT
gcn_initial_elimination_offset (int from, int to)
{
  machine_function *offsets = gcn_compute_frame_offsets ();

  switch (from)
    {
    case ARG_POINTER_REGNUM:
      if (to == STACK_POINTER_REGNUM)
	return -(offsets->callee_saves + offsets->local_vars
		 + offsets->outgoing_args_size);
      else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
	return -offsets->callee_saves;
      else
	gcc_unreachable ();
      break;

    case FRAME_POINTER_REGNUM:
      if (to == STACK_POINTER_REGNUM)
	return -(offsets->local_vars + offsets->outgoing_args_size);
      else if (to == HARD_FRAME_POINTER_REGNUM)
	return 0;
      else
	gcc_unreachable ();
      break;

    default:
      gcc_unreachable ();
    }
}
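
/* Illustrative example (not part of the port): eliminating the argument
   pointer into the stack pointer crosses the whole frame, so with 8 bytes
   of callee saves, 16 bytes of locals and 32 bytes of outgoing arguments
   the offset is -(8 + 16 + 32) == -56; eliminating into the frame pointer
   only crosses the callee-save area, giving -8.  */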

/* Implement HARD_REGNO_RENAME_OK.

   Return true if it is permissible to rename a hard register from
   FROM_REG to TO_REG.  */

bool
gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
{
  if (from_reg == SCC_REG
      || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
      || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
      || to_reg == SCC_REG
      || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
      || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
    return false;

  /* Allow the link register to be used if it was saved.  */
  if ((to_reg & ~1) == LINK_REGNUM)
    return !cfun || cfun->machine->lr_needs_saving;

  /* Allow the registers used for the static chain to be used if the chain is
     not in active use.  */
  if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
    return !cfun
      || !(cfun->static_chain_decl
	   && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
	   && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));

  return true;
}

/* Implement HARD_REGNO_CALLER_SAVE_MODE.

   Which mode is required for saving NREGS of a pseudo-register in
   call-clobbered hard register REGNO.  */

machine_mode
gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
				 machine_mode regmode)
{
  machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);

  if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
    result = (nregs == 1 ? SImode : DImode);

  return result;
}

/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.

   Output assembler code for a block containing the constant parts
   of a trampoline, leaving space for the variable parts.  */

static void
gcn_asm_trampoline_template (FILE *f)
{
  /* The source operand of the move instructions must be a 32-bit
     constant following the opcode.  */
  asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
  asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
  asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
  asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
  asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
}

/* Implement TARGET_TRAMPOLINE_INIT.

   Emit RTL insns to initialize the variable parts of a trampoline.
   FNDECL is the decl of the target address, M_TRAMP is a MEM for
   the trampoline, and CHAIN_VALUE is an RTX for the static chain
   to be passed to the target function.  */

static void
gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  if (TARGET_GCN5_PLUS)
    sorry ("nested function trampolines not supported on GCN5 due to"
	   " non-executable stacks");

  emit_block_move (m_tramp, assemble_trampoline_template (),
		   GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);

  rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
  rtx chain_value_reg = copy_to_reg (chain_value);
  rtx fnaddr_reg = copy_to_reg (fnaddr);

  for (int i = 0; i < 4; i++)
    {
      rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
      rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
      emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
    }

  rtx tramp_addr = XEXP (m_tramp, 0);
  emit_insn (gen_clear_icache (tramp_addr,
			       plus_constant (ptr_mode, tramp_addr,
					      TRAMPOLINE_SIZE)));
}
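
/* Illustrative sketch (not part of the port): the template above emits
   four s_mov_b32 instructions, each an opcode word followed by a 32-bit
   literal, plus a final s_setpc_b64.  The init loop patches the four
   literals at byte offsets 4, 12, 20 and 28 (i * 8 + 4): the first two
   receive the halves of the static chain value, the last two the halves
   of the target function address.  */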

/* }}} */
/* {{{ Miscellaneous. */

/* Implement TARGET_CANNOT_COPY_INSN_P.

   Return true if INSN must not be duplicated.  */

static bool
gcn_cannot_copy_insn_p (rtx_insn *insn)
{
  if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
    return true;

  return false;
}

/* Implement TARGET_DEBUG_UNWIND_INFO.

   Defines the mechanism that will be used for describing frame unwind
   information to the debugger.  */

static enum unwind_info_type
gcn_debug_unwind_info ()
{
  /* No support for debug info, yet.  */
  return UI_NONE;
}

/* Determine if there is a suitable hardware conversion instruction.
   Used primarily by the machine description.  */

bool
gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
{
  if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
    return false;

  if (VECTOR_MODE_P (from))
    {
      from = GET_MODE_INNER (from);
      to = GET_MODE_INNER (to);
    }

  switch (op)
    {
    case fix_trunc_cvt:
    case fixuns_trunc_cvt:
      if (GET_MODE_CLASS (from) != MODE_FLOAT
	  || GET_MODE_CLASS (to) != MODE_INT)
	return false;
      break;
    case float_cvt:
    case floatuns_cvt:
      if (GET_MODE_CLASS (from) != MODE_INT
	  || GET_MODE_CLASS (to) != MODE_FLOAT)
	return false;
      break;
    case extend_cvt:
      if (GET_MODE_CLASS (from) != MODE_FLOAT
	  || GET_MODE_CLASS (to) != MODE_FLOAT
	  || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
	return false;
      break;
    case trunc_cvt:
      if (GET_MODE_CLASS (from) != MODE_FLOAT
	  || GET_MODE_CLASS (to) != MODE_FLOAT
	  || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
	return false;
      break;
    }

  return ((to == HImode && from == HFmode)
	  || (to == SImode && (from == SFmode || from == DFmode))
	  || (to == HFmode && (from == HImode || from == SFmode))
	  || (to == SFmode && (from == SImode || from == HFmode
			       || from == DFmode))
	  || (to == DFmode && (from == SImode || from == SFmode)));
}
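
/* Illustrative examples (not part of the port): gcn_valid_cvt_p accepts
   V64SFmode -> V64SImode for fix_trunc_cvt (float class to int class, and
   SF->SI is in the hardware list above), but rejects DFmode -> HFmode for
   trunc_cvt because no direct conversion instruction exists for that
   pair.  */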

/* Implement TARGET_EMUTLS_VAR_INIT.

   Disable emutls (gthr-gcn.h does not support it, yet).  */

tree
gcn_emutls_var_init (tree, tree decl, tree)
{
  sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
  return NULL_TREE;
}

/* }}} */
/* {{{ Costs. */

/* Implement TARGET_RTX_COSTS.

   Compute a (partial) cost for rtx X.  Return true if the complete
   cost has been computed, and false if subexpressions should be
   scanned.  In either case, *TOTAL contains the cost result.  */

static bool
gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
{
  enum rtx_code code = GET_CODE (x);
  switch (code)
    {
    case CONST:
    case CONST_DOUBLE:
    case CONST_VECTOR:
    case CONST_INT:
      if (gcn_inline_constant_p (x))
	*total = 0;
      else if (code == CONST_INT
	       && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
	*total = 1;
      else if (gcn_constant_p (x))
	*total = 2;
      else
	*total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
      return true;

    case DIV:
      *total = 100;
      return false;

    default:
      *total = 3;
      return false;
    }
}

/* Implement TARGET_MEMORY_MOVE_COST.

   Return the cost of moving data of mode MODE between a register and
   memory.  A value of 2 is the default; this cost is relative to those
   in `REGISTER_MOVE_COST'.  */

#define LOAD_COST 32
#define STORE_COST 32
static int
gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
{
  int nregs = CEIL (GET_MODE_SIZE (mode), 4);
  switch (regclass)
    {
    case SCC_CONDITIONAL_REG:
    case VCCZ_CONDITIONAL_REG:
    case VCC_CONDITIONAL_REG:
    case EXECZ_CONDITIONAL_REG:
    case ALL_CONDITIONAL_REGS:
    case SGPR_REGS:
    case SGPR_EXEC_REGS:
    case EXEC_MASK_REG:
    case SGPR_VOP_SRC_REGS:
    case SGPR_MEM_SRC_REGS:
    case SGPR_SRC_REGS:
    case SGPR_DST_REGS:
    case GENERAL_REGS:
    case AFP_REGS:
      if (!in)
	return (STORE_COST + 2) * nregs;
      return LOAD_COST * nregs;
    case VGPR_REGS:
      if (in)
	return (LOAD_COST + 2) * nregs;
      return STORE_COST * nregs;
    case ALL_REGS:
    case ALL_GPR_REGS:
    case SRCDST_REGS:
      if (in)
	return (LOAD_COST + 2) * nregs;
      return (STORE_COST + 2) * nregs;
    default:
      gcc_unreachable ();
    }
}

/* Implement TARGET_REGISTER_MOVE_COST.

   Return the cost of moving data from a register in class CLASS1 to
   one in class CLASS2.  Base value is 2.  */

static int
gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
{
  /* Increase cost of moving from and to vector registers.  While this is
     fast in hardware (I think), it has hidden cost of setting up the exec
     flags.  */
  if ((src < VGPR_REGS) != (dst < VGPR_REGS))
    return 4;
  return 2;
}

/* }}} */
/* {{{ Builtins. */

/* Type codes used by GCN built-in definitions.  */

enum gcn_builtin_type_index
{
  GCN_BTI_END_OF_PARAMS,

  GCN_BTI_VOID,
  GCN_BTI_BOOL,
  GCN_BTI_INT,
  GCN_BTI_UINT,
  GCN_BTI_SIZE_T,
  GCN_BTI_LLINT,
  GCN_BTI_LLUINT,
  GCN_BTI_EXEC,

  GCN_BTI_SF,
  GCN_BTI_V64SI,
  GCN_BTI_V64SF,
  GCN_BTI_V64PTR,
  GCN_BTI_SIPTR,
  GCN_BTI_SFPTR,
  GCN_BTI_VOIDPTR,

  GCN_BTI_LDS_VOIDPTR,

  GCN_BTI_MAX
};

static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];

#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])

static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
				 struct gcn_builtin_description *);
static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
				     struct gcn_builtin_description *);

struct gcn_builtin_description;
typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
				     struct gcn_builtin_description *);

enum gcn_builtin_type
{
  B_UNIMPLEMENTED,		/* Sorry out */
  B_INSN,			/* Emit a pattern */
  B_OVERLOAD			/* Placeholder for an overloaded function */
};

struct gcn_builtin_description
{
  int fcode;
  int icode;
  const char *name;
  enum gcn_builtin_type type;
  /* The first element of parm is always the return type.  The rest
     are a zero terminated list of parameters.  */
  int parm[6];
  gcn_builtin_expander expander;
};

/* Read in the GCN builtins from gcn-builtins.def.  */

extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];

struct gcn_builtin_description gcn_builtins[] = {
#define DEF_BUILTIN(fcode, icode, name, type, params, expander)	\
  {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},

#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name)			\
  {GCN_BUILTIN_ ## fcode ## _V64SI,					\
   CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN,		\
   {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI,		\
    GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},	\
  {GCN_BUILTIN_ ## fcode ## _V64SI_unspec,				\
   CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN,	\
   {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI,		\
    GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},

#include "gcn-builtins.def"
#undef DEF_BUILTIN_BINOP_INT_FP
#undef DEF_BUILTIN
};

static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];

/* Implement TARGET_BUILTIN_DECL.

   Return the GCN builtin for CODE.  */

tree
gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= GCN_BUILTIN_MAX)
    return error_mark_node;

  return gcn_builtin_decls[code];
}

/* Helper function for gcn_init_builtins.  */

static void
gcn_init_builtin_types (void)
{
  gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
  gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
  gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
  gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
  gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
  gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
  gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);

  exec_type_node = unsigned_intDI_type_node;
  sf_type_node = float32_type_node;
  v64si_type_node = build_vector_type (intSI_type_node, 64);
  v64sf_type_node = build_vector_type (float_type_node, 64);
  v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
					/*build_pointer_type
					  (integer_type_node) */
					, 64);
  tree tmp = build_distinct_type_copy (intSI_type_node);
  TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
  siptr_type_node = build_pointer_type (tmp);

  tmp = build_distinct_type_copy (float_type_node);
  TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
  sfptr_type_node = build_pointer_type (tmp);

  tmp = build_distinct_type_copy (void_type_node);
  TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
  voidptr_type_node = build_pointer_type (tmp);

  tmp = build_distinct_type_copy (void_type_node);
  TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
  gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
}

/* Implement TARGET_INIT_BUILTINS.

   Set up all builtin functions for this target.  */

static void
gcn_init_builtins (void)
{
  gcn_init_builtin_types ();

  struct gcn_builtin_description *d;
  unsigned int i;
  for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
    {
      tree p;
      char name[64];		/* build_function will make a copy.  */
      int parm;

      /* FIXME: Is this necessary/useful?  */
      if (d->name == 0)
	continue;

      /* Find last parm.  */
      for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
	;

      p = void_list_node;
      while (parm > 1)
	p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);

      p = build_function_type (gcn_builtin_types[d->parm[0]], p);

      sprintf (name, "__builtin_gcn_%s", d->name);
      gcn_builtin_decls[i]
	= add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);

      /* These builtins don't throw.  */
      TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
    }

  /* FIXME: remove the ifdef once OpenACC support is merged upstream.  */
#ifdef BUILT_IN_GOACC_SINGLE_START
  /* These builtins need to take/return an LDS pointer: override the generic
     versions here.  */

  set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);

  set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
		    false);

  set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
		    gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
		    false);

  set_builtin_decl (BUILT_IN_GOACC_BARRIER,
		    gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
#endif
}

/* Expand the CMP_SWAP GCN builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.

   Helper function for gcn_expand_builtin_1.  */

static rtx
gcn_expand_cmp_swap (tree exp, rtx target)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
  addr_space_t as
    = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
  machine_mode as_mode = gcn_addr_space_address_mode (as);

  if (!target)
    target = gen_reg_rtx (mode);

  rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
			  NULL_RTX, as_mode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
  set_mem_addr_space (mem, as);

  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
  else
    pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);

  emit_insn (pat);

  return target;
}
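
/* Illustrative usage sketch (hypothetical; the exact builtin name and
   signature come from gcn-builtins.def, which is not shown here):

     int expected = 0, desired = 1;
     int old = __builtin_gcn_cmp_swap (&flag, expected, desired);

   Such a call expands via gcn_expand_cmp_swap into a sync_compare_and_swap
   pattern operating directly on the memory cell, with no other object's
   address taken.  */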

/* Expand many different builtins.

   Intended for use in gcn-builtins.def.  */

static rtx
gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
		      machine_mode /*mode */ , int ignore,
		      struct gcn_builtin_description *)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_MD_FUNCTION_CODE (fndecl))
    {
    case GCN_BUILTIN_FLAT_LOAD_INT32:
      {
	if (ignore)
	  return target;
	/*rtx exec = */
	force_reg (DImode,
		   expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
				EXPAND_NORMAL));
	/*rtx ptr = */
	force_reg (V64DImode,
		   expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
				EXPAND_NORMAL));
	/*emit_insn (gen_vector_flat_loadv64si
		     (target, gcn_gen_undef (V64SImode), ptr, exec)); */
	return target;
      }
    case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
    case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
      {
	if (ignore)
	  return target;
	rtx exec = force_reg (DImode,
			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					   DImode,
					   EXPAND_NORMAL));
	rtx ptr = force_reg (DImode,
			     expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
					  V64DImode,
					  EXPAND_NORMAL));
	rtx offsets = force_reg (V64SImode,
				 expand_expr (CALL_EXPR_ARG (exp, 2),
					      NULL_RTX, V64DImode,
					      EXPAND_NORMAL));
	rtx addrs = gen_reg_rtx (V64DImode);
	rtx tmp = gen_reg_rtx (V64SImode);
	emit_insn (gen_ashlv64si3_exec (tmp, offsets,
					GEN_INT (2),
					gcn_gen_undef (V64SImode), exec));
	emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
						 gcn_gen_undef (V64DImode),
						 exec));
	rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
	/* FIXME: set attributes.  */
	emit_insn (gen_mov_with_exec (target, mem, exec));
	return target;
      }
    case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
    case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
      {
	rtx exec = force_reg (DImode,
			      expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					   DImode,
					   EXPAND_NORMAL));
	rtx ptr = force_reg (DImode,
			     expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
					  V64DImode,
					  EXPAND_NORMAL));
	rtx offsets = force_reg (V64SImode,
				 expand_expr (CALL_EXPR_ARG (exp, 2),
					      NULL_RTX, V64DImode,
					      EXPAND_NORMAL));
	machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
								  3)));
	rtx val = force_reg (vmode,
			     expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
					  vmode,
					  EXPAND_NORMAL));
	rtx addrs = gen_reg_rtx (V64DImode);
	rtx tmp = gen_reg_rtx (V64SImode);
	emit_insn (gen_ashlv64si3_exec (tmp, offsets,
					GEN_INT (2),
					gcn_gen_undef (V64SImode), exec));
	emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
						 gcn_gen_undef (V64DImode),
						 exec));
	rtx mem = gen_rtx_MEM (vmode, addrs);
	/*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
	/* FIXME: set attributes.  */
	emit_insn (gen_mov_with_exec (mem, val, exec));
	return target;
      }
    case GCN_BUILTIN_SQRTVF:
      {
	if (ignore)
	  return target;
	rtx exec = gcn_full_exec_reg ();
	rtx arg = force_reg (V64SFmode,
			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					  V64SFmode,
					  EXPAND_NORMAL));
	emit_insn (gen_sqrtv64sf2_exec
		   (target, arg, gcn_gen_undef (V64SFmode), exec));
	return target;
      }
    case GCN_BUILTIN_SQRTF:
      {
	if (ignore)
	  return target;
	rtx arg = force_reg (SFmode,
			     expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
					  SFmode,
					  EXPAND_NORMAL));
	emit_insn (gen_sqrtsf2 (target, arg));
	return target;
      }
    case GCN_BUILTIN_OMP_DIM_SIZE:
      {
	if (ignore)
	  return target;
	emit_insn (gen_oacc_dim_size (target,
				      expand_expr (CALL_EXPR_ARG (exp, 0),
						   NULL_RTX, SImode,
						   EXPAND_NORMAL)));
	return target;
      }
    case GCN_BUILTIN_OMP_DIM_POS:
      {
	if (ignore)
3723 return target;
3724 emit_insn (gen_oacc_dim_pos (target,
3725 expand_expr (CALL_EXPR_ARG (exp, 0),
3726 NULL_RTX, SImode,
3727 EXPAND_NORMAL)));
3728 return target;
3729 }
3730 case GCN_BUILTIN_CMP_SWAP:
3731 case GCN_BUILTIN_CMP_SWAPLL:
3732 return gcn_expand_cmp_swap (exp, target);
3733
3734 case GCN_BUILTIN_ACC_SINGLE_START:
3735 {
3736 if (ignore)
3737 return target;
3738
3739 rtx wavefront = gcn_oacc_dim_pos (1);
3740 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
3741 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
3742 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
3743 return cc;
3744 }
3745
3746 case GCN_BUILTIN_ACC_SINGLE_COPY_START:
3747 {
3748 rtx blk = force_reg (SImode,
3749 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3750 SImode, EXPAND_NORMAL));
3751 rtx wavefront = gcn_oacc_dim_pos (1);
3752 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
3753 rtx not_zero = gen_label_rtx ();
3754 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
3755 emit_move_insn (blk, const0_rtx);
3756 emit_label (not_zero);
3757 return blk;
3758 }
3759
3760 case GCN_BUILTIN_ACC_SINGLE_COPY_END:
3761 return target;
3762
3763 case GCN_BUILTIN_ACC_BARRIER:
3764 emit_insn (gen_gcn_wavefront_barrier ());
3765 return target;
3766
3767 default:
3768 gcc_unreachable ();
3769 }
3770 }
3771
3772 /* Expansion of simple arithmetic and bit binary operation builtins.
3773
3774 Intended for use with gcn_builtins table. */
3775
3776 static rtx
3777 gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
3778 machine_mode /*mode */ , int ignore,
3779 struct gcn_builtin_description *d)
3780 {
3781 int icode = d->icode;
3782 if (ignore)
3783 return target;
3784
3785 rtx exec = force_reg (DImode,
3786 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3787 EXPAND_NORMAL));
3788
3789 machine_mode m1 = insn_data[icode].operand[1].mode;
3790 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
3791 EXPAND_NORMAL);
3792 if (!insn_data[icode].operand[1].predicate (arg1, m1))
3793 arg1 = force_reg (m1, arg1);
3794
3795 machine_mode m2 = insn_data[icode].operand[2].mode;
3796 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
3797 EXPAND_NORMAL);
3798 if (!insn_data[icode].operand[2].predicate (arg2, m2))
3799 arg2 = force_reg (m2, arg2);
3800
3801 rtx arg_prev;
3802 if (call_expr_nargs (exp) == 4)
3803 {
3804 machine_mode m_prev = insn_data[icode].operand[4].mode;
3805 arg_prev = force_reg (m_prev,
3806 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3807 m_prev, EXPAND_NORMAL));
3808 }
3809 else
3810 arg_prev = gcn_gen_undef (GET_MODE (target));
3811
3812 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
3813 emit_insn (pat);
3814 return target;
3815 }
3816
3817 /* Implement TARGET_EXPAND_BUILTIN.
3818
3819 Expand an expression EXP that calls a built-in function, with result going
3820 to TARGET if that's convenient (and in mode MODE if that's convenient).
3821 SUBTARGET may be used as the target for computing one of EXP's operands.
3822 IGNORE is nonzero if the value is to be ignored. */
3823
3824 rtx
3825 gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
3826 int ignore)
3827 {
3828 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3829 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
3830 struct gcn_builtin_description *d;
3831
3832 gcc_assert (fcode < GCN_BUILTIN_MAX);
3833 d = &gcn_builtins[fcode];
3834
3835 if (d->type == B_UNIMPLEMENTED)
3836 sorry ("builtin not implemented");
3837
3838 return d->expander (exp, target, subtarget, mode, ignore, d);
3839 }
3840
3841 /* }}} */
3842 /* {{{ Vectorization. */
3843
3844 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.
3845
3846 A vector mask is a value that holds one boolean result for every element in
3847 a vector. */
3848
3849 opt_machine_mode
3850 gcn_vectorize_get_mask_mode (machine_mode)
3851 {
3852 /* GCN uses a DImode bit-mask. */
3853 return DImode;
3854 }
3855
3856 /* Return an RTX that references a vector with the i-th lane containing
3857 PERM[i]*4.
3858
3859 Helper function for gcn_vectorize_vec_perm_const. */
3860
3861 static rtx
3862 gcn_make_vec_perm_address (unsigned int *perm)
3863 {
3864 rtx x = gen_reg_rtx (V64SImode);
3865 emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
3866
3867 /* Permutation addresses use byte addressing. With each vector lane being
3868 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
3869 so only set those.
3870
3871 The permutation given to the vec_perm* patterns ranges from 0 to 2N-1 to
3872 select between lanes in two vectors, but as the DS_BPERMUTE* instructions
3873 only take one source vector, the most-significant bit can be ignored
3874 here. Instead, we can use EXEC masking to select the relevant part of
3875 each source vector after they are permuted separately. */
3876 uint64_t bit_mask = 1 << 2;
3877 for (int i = 2; i < 8; i++, bit_mask <<= 1)
3878 {
3879 uint64_t exec_mask = 0;
3880 uint64_t lane_mask = 1;
3881 for (int j = 0; j < 64; j++, lane_mask <<= 1)
3882 if ((perm[j] * 4) & bit_mask)
3883 exec_mask |= lane_mask;
3884
3885 if (exec_mask)
3886 emit_insn (gen_addv64si3_exec (x, x,
3887 gcn_vec_constant (V64SImode,
3888 bit_mask),
3889 x, get_exec (exec_mask)));
3890 }
3891
3892 return x;
3893 }
3894
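/* For example, the identity permutation perm[i] = i produces byte
   addresses 0, 4, 8, ..., 252.  The loop above assembles those values
   one bit at a time: for each bit 2..7 it uses the EXEC mask to enable
   exactly the lanes whose address has that bit set, then adds the
   corresponding power of two to those lanes only.  */
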
3895 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
3896
3897 Return true if permutation with SEL is possible.
3898
3899 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
3900 permutations. */
3901
3902 static bool
3903 gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
3904 rtx src0, rtx src1,
3905 const vec_perm_indices & sel)
3906 {
3907 unsigned int nelt = GET_MODE_NUNITS (vmode);
3908
3909 gcc_assert (VECTOR_MODE_P (vmode));
3910 gcc_assert (nelt <= 64);
3911 gcc_assert (sel.length () == nelt);
3912
3913 if (!dst)
3914 {
3915 /* All vector permutations are possible on this architecture,
3916 with varying degrees of efficiency depending on the permutation. */
3917 return true;
3918 }
3919
3920 unsigned int perm[64];
3921 for (unsigned int i = 0; i < nelt; ++i)
3922 perm[i] = sel[i] & (2 * nelt - 1);
3923
3924 /* Make life a bit easier by swapping operands if necessary so that
3925 the first element always comes from src0. */
3926 if (perm[0] >= nelt)
3927 {
3928 rtx temp = src0;
3929 src0 = src1;
3930 src1 = temp;
3931
3932 for (unsigned int i = 0; i < nelt; ++i)
3933 if (perm[i] < nelt)
3934 perm[i] += nelt;
3935 else
3936 perm[i] -= nelt;
3937 }
3938
3939 /* TODO: There are more efficient ways to implement certain permutations
3940 using ds_swizzle_b32 and/or DPP. Test for and expand them here, before
3941 this more inefficient generic approach is used. */
3942
3943 int64_t src1_lanes = 0;
3944 int64_t lane_bit = 1;
3945
3946 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
3947 {
3948 /* Set the bits for lanes from src1. */
3949 if (perm[i] >= nelt)
3950 src1_lanes |= lane_bit;
3951 }
3952
3953 rtx addr = gcn_make_vec_perm_address (perm);
3954 rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
3955
3956 switch (vmode)
3957 {
3958 case E_V64QImode:
3959 ds_bpermute = gen_ds_bpermutev64qi;
3960 break;
3961 case E_V64HImode:
3962 ds_bpermute = gen_ds_bpermutev64hi;
3963 break;
3964 case E_V64SImode:
3965 ds_bpermute = gen_ds_bpermutev64si;
3966 break;
3967 case E_V64HFmode:
3968 ds_bpermute = gen_ds_bpermutev64hf;
3969 break;
3970 case E_V64SFmode:
3971 ds_bpermute = gen_ds_bpermutev64sf;
3972 break;
3973 case E_V64DImode:
3974 ds_bpermute = gen_ds_bpermutev64di;
3975 break;
3976 case E_V64DFmode:
3977 ds_bpermute = gen_ds_bpermutev64df;
3978 break;
3979 default:
3980 gcc_assert (false);
3981 }
3982
3983 /* Load elements from src0 to dst. */
3984 gcc_assert (~src1_lanes);
3985 emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
3986
3987 /* Load elements from src1 to dst. */
3988 if (src1_lanes)
3989 {
3990 /* Masking a lane masks both the destination and source lanes for
3991 DS_BPERMUTE, so we need to have all lanes enabled for the permute,
3992 then add an extra masked move to merge the results of permuting
3993 the two source vectors together.
3994 */
3995 rtx tmp = gen_reg_rtx (vmode);
3996 emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
3997 emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
3998 }
3999
4000 return true;
4001 }
4002
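/* For instance, a V64SImode blend taking even-numbered lanes from SRC0
   and odd-numbered lanes from SRC1 permutes each source with a full
   EXEC mask, then merges the two results with a masked move whose EXEC
   mask is src1_lanes (0xaaaaaaaaaaaaaaaa, one bit per odd lane).  */
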
4003 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P.
4004
4005 Return nonzero if vector MODE is supported with at least move
4006 instructions. */
4007
4008 static bool
4009 gcn_vector_mode_supported_p (machine_mode mode)
4010 {
4011 return (mode == V64QImode || mode == V64HImode
4012 || mode == V64SImode || mode == V64DImode
4013 || mode == V64SFmode || mode == V64DFmode);
4014 }
4015
4016 /* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
4017
4018 Enables autovectorization for all supported modes. */
4019
4020 static machine_mode
4021 gcn_vectorize_preferred_simd_mode (scalar_mode mode)
4022 {
4023 switch (mode)
4024 {
4025 case E_QImode:
4026 return V64QImode;
4027 case E_HImode:
4028 return V64HImode;
4029 case E_SImode:
4030 return V64SImode;
4031 case E_DImode:
4032 return V64DImode;
4033 case E_SFmode:
4034 return V64SFmode;
4035 case E_DFmode:
4036 return V64DFmode;
4037 default:
4038 return word_mode;
4039 }
4040 }
4041
4042 /* Implement TARGET_VECTORIZE_RELATED_MODE.
4043
4044 All GCN vectors are 64-lane, so this is simpler than other architectures.
4045 In particular, we do *not* want to match vector bit-size. */
4046
4047 static opt_machine_mode
4048 gcn_related_vector_mode (machine_mode vector_mode, scalar_mode element_mode,
4049 poly_uint64 nunits)
4050 {
4051 if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
4052 return VOIDmode;
4053
4054 machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
4055 if (!VECTOR_MODE_P (pref_mode))
4056 return VOIDmode;
4057
4058 return pref_mode;
4059 }
4060
4061 /* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
4062
4063 Returns the preferred alignment in bits for accesses to vectors of type type
4064 in vectorized code. This might be less than or greater than the ABI-defined
4065 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
4066 of a single element, in which case the vectorizer will not try to optimize
4067 for alignment. */
4068
4069 static poly_uint64
4070 gcn_preferred_vector_alignment (const_tree type)
4071 {
4072 return TYPE_ALIGN (TREE_TYPE (type));
4073 }
4074
4075 /* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4076
4077 Return true if the target supports misaligned vector store/load of a
4078 specific factor denoted in the misalignment parameter. */
4079
4080 static bool
4081 gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4082 const_tree type, int misalignment,
4083 bool is_packed)
4084 {
4085 if (is_packed)
4086 return false;
4087
4088 /* If the misalignment is unknown, we should be able to handle the access
4089 so long as it is not to a member of a packed data structure. */
4090 if (misalignment == -1)
4091 return true;
4092
4093 /* Return true if the misalignment is a multiple of the natural alignment
4094 of the vector's element type. This is probably always going to be
4095 true in practice, since we've already established that this isn't a
4096 packed access. */
4097 return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4098 }
4099
4100 /* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4101
4102 Return true if vector alignment is reachable (by peeling N iterations) for
4103 the given scalar type TYPE. */
4104
4105 static bool
4106 gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4107 {
4108 /* Vectors which aren't in packed structures will not be less aligned than
4109 the natural alignment of their element type, so this is safe. */
4110 return !is_packed;
4111 }
4112
4113 /* Generate DPP instructions used for vector reductions.
4114
4115 The opcode is given by INSN.
4116 The first operand of the operation is shifted right by SHIFT vector lanes.
4117 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
4118 broadcast the next row (thereby acting like a shift of 16 for the end of
4119 each row). If SHIFT is 32, lane 31 is broadcast to all the
4120 following lanes (thereby acting like a shift of 32 for lane 63). */
4121
4122 char *
4123 gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4124 int unspec, int shift)
4125 {
4126 static char buf[128];
4127 const char *dpp;
4128 const char *vcc_in = "";
4129 const char *vcc_out = "";
4130
4131 /* Add the vcc operand if needed. */
4132 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4133 {
4134 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4135 vcc_in = ", vcc";
4136
4137 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4138 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4139 vcc_out = ", vcc";
4140 }
4141
4142 /* Add the DPP modifiers. */
4143 switch (shift)
4144 {
4145 case 1:
4146 dpp = "row_shr:1 bound_ctrl:0";
4147 break;
4148 case 2:
4149 dpp = "row_shr:2 bound_ctrl:0";
4150 break;
4151 case 4:
4152 dpp = "row_shr:4 bank_mask:0xe";
4153 break;
4154 case 8:
4155 dpp = "row_shr:8 bank_mask:0xc";
4156 break;
4157 case 16:
4158 dpp = "row_bcast:15 row_mask:0xa";
4159 break;
4160 case 32:
4161 dpp = "row_bcast:31 row_mask:0xc";
4162 break;
4163 default:
4164 gcc_unreachable ();
4165 }
4166
4167 if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
4168 sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
4169 insn, dpp, insn, dpp);
4170 else if (unspec == UNSPEC_MOV_DPP_SHR)
4171 sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
4172 else
4173 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
4174
4175 return buf;
4176 }
4177
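/* For example, calling this helper with a mnemonic such as
   "v_add_co_u32", UNSPEC_PLUS_CARRY_DPP_SHR and SHIFT 1 on a
   vector-integer mode produces the template (a sketch; the operands are
   substituted by the caller's insn):

     v_add_co_u32	%0, vcc, %1, %2 row_shr:1 bound_ctrl:0
*/
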
4178 /* Generate vector reductions in terms of DPP instructions.
4179
4180 The vector register SRC of mode MODE is reduced using the operation given
4181 by UNSPEC, and the scalar result is returned in lane 63 of a vector
4182 register. */
4183
4184 rtx
4185 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4186 {
4187 machine_mode orig_mode = mode;
4188 bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
4189 || unspec == UNSPEC_SMAX_DPP_SHR
4190 || unspec == UNSPEC_UMIN_DPP_SHR
4191 || unspec == UNSPEC_UMAX_DPP_SHR)
4192 && mode == V64DImode)
4193 || (unspec == UNSPEC_PLUS_DPP_SHR
4194 && mode == V64DFmode));
4195 rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
4196 : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
4197 : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
4198 : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
4199 : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
4200 : UNKNOWN);
4201 bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
4202 || unspec == UNSPEC_SMAX_DPP_SHR
4203 || unspec == UNSPEC_UMIN_DPP_SHR
4204 || unspec == UNSPEC_UMAX_DPP_SHR)
4205 && (mode == V64QImode
4206 || mode == V64HImode));
4207 bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
4208 || unspec == UNSPEC_UMAX_DPP_SHR);
4209 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4210 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4211 && (TARGET_GCN3 || mode == V64DImode);
4212
4213 if (use_plus_carry)
4214 unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4215
4216 if (use_extends)
4217 {
4218 rtx tmp = gen_reg_rtx (V64SImode);
4219 convert_move (tmp, src, unsignedp);
4220 src = tmp;
4221 mode = V64SImode;
4222 }
4223
4224 /* Perform reduction by first performing the reduction operation on every
4225 pair of lanes, then on every pair of results from the previous
4226 iteration (thereby effectively reducing every 4 lanes) and so on until
4227 all lanes are reduced. */
4228 rtx in, out = src;
4229 for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4230 {
4231 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4232 in = out;
4233 out = gen_reg_rtx (mode);
4234
4235 if (use_moves)
4236 {
4237 rtx tmp = gen_reg_rtx (mode);
4238 emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
4239 emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
4240 }
4241 else
4242 {
4243 rtx insn = gen_rtx_SET (out,
4244 gen_rtx_UNSPEC (mode,
4245 gen_rtvec (3, in, in,
4246 shift_val),
4247 unspec));
4248
4249 /* Add clobber for instructions that set the carry flags. */
4250 if (use_plus_carry)
4251 {
4252 rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4253 gen_rtx_REG (DImode, VCC_REG));
4254 insn = gen_rtx_PARALLEL (VOIDmode,
4255 gen_rtvec (2, insn, clobber));
4256 }
4257
4258 emit_insn (insn);
4259 }
4260 }
4261
4262 if (use_extends)
4263 {
4264 rtx tmp = gen_reg_rtx (orig_mode);
4265 convert_move (tmp, out, unsignedp);
4266 out = tmp;
4267 }
4268
4269 return out;
4270 }
4271
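/* As a worked example, a 64-lane PLUS reduction performs six steps with
   shifts 1, 2, 4, 8, 16 and 32.  After step k each lane holds the sum
   of up to 2^k consecutive input lanes, so after the final step lane 63
   contains the sum of all 64 lanes.  */
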
4272 /* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
4273
4274 int
4275 gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4276 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4277 {
4278 /* Always vectorize. */
4279 return 1;
4280 }
4281
4282 /* }}} */
4283 /* {{{ md_reorg pass. */
4284
4285 /* Identify VMEM instructions from their "type" attribute. */
4286
4287 static bool
4288 gcn_vmem_insn_p (attr_type type)
4289 {
4290 switch (type)
4291 {
4292 case TYPE_MUBUF:
4293 case TYPE_MTBUF:
4294 case TYPE_FLAT:
4295 return true;
4296 case TYPE_UNKNOWN:
4297 case TYPE_SOP1:
4298 case TYPE_SOP2:
4299 case TYPE_SOPK:
4300 case TYPE_SOPC:
4301 case TYPE_SOPP:
4302 case TYPE_SMEM:
4303 case TYPE_DS:
4304 case TYPE_VOP2:
4305 case TYPE_VOP1:
4306 case TYPE_VOPC:
4307 case TYPE_VOP3A:
4308 case TYPE_VOP3B:
4309 case TYPE_VOP_SDWA:
4310 case TYPE_VOP_DPP:
4311 case TYPE_MULT:
4312 case TYPE_VMULT:
4313 return false;
4314 }
4315 gcc_unreachable ();
4316 return false;
4317 }
4318
4319 /* If INSN sets the EXEC register to a constant value, return the value,
4320 otherwise return zero. */
4321
4322 static int64_t
4323 gcn_insn_exec_value (rtx_insn *insn)
4324 {
4325 if (!NONDEBUG_INSN_P (insn))
4326 return 0;
4327
4328 rtx pattern = PATTERN (insn);
4329
4330 if (GET_CODE (pattern) == SET)
4331 {
4332 rtx dest = XEXP (pattern, 0);
4333 rtx src = XEXP (pattern, 1);
4334
4335 if (GET_MODE (dest) == DImode
4336 && REG_P (dest) && REGNO (dest) == EXEC_REG
4337 && CONST_INT_P (src))
4338 return INTVAL (src);
4339 }
4340
4341 return 0;
4342 }
4343
4344 /* Sets the EXEC register before INSN to the value that it had after
4345 LAST_EXEC_DEF. The constant value of the EXEC register is returned if
4346 known, otherwise it returns zero. */
4347
4348 static int64_t
4349 gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4350 bool curr_exec_known, bool &last_exec_def_saved)
4351 {
4352 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4353 rtx exec;
4354
4355 int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4356
4357 if (exec_value)
4358 {
4359 /* If the EXEC value is a constant and it happens to be the same as the
4360 current EXEC value, the restore can be skipped. */
4361 if (curr_exec_known && exec_value == curr_exec)
4362 return exec_value;
4363
4364 exec = GEN_INT (exec_value);
4365 }
4366 else
4367 {
4368 /* If the EXEC value is not a constant, save it in a register after the
4369 point of definition. */
4370 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4371
4372 if (!last_exec_def_saved)
4373 {
4374 start_sequence ();
4375 emit_move_insn (exec_save_reg, exec_reg);
4376 rtx_insn *seq = get_insns ();
4377 end_sequence ();
4378
4379 emit_insn_after (seq, last_exec_def);
4380 if (dump_file && (dump_flags & TDF_DETAILS))
4381 fprintf (dump_file, "Saving EXEC after insn %d.\n",
4382 INSN_UID (last_exec_def));
4383
4384 last_exec_def_saved = true;
4385 }
4386
4387 exec = exec_save_reg;
4388 }
4389
4390 /* Restore EXEC register before the usage. */
4391 start_sequence ();
4392 emit_move_insn (exec_reg, exec);
4393 rtx_insn *seq = get_insns ();
4394 end_sequence ();
4395 emit_insn_before (seq, insn);
4396
4397 if (dump_file && (dump_flags & TDF_DETAILS))
4398 {
4399 if (exec_value)
4400 fprintf (dump_file, "Restoring EXEC to %" PRId64 " before insn %d.\n",
4401 exec_value, INSN_UID (insn));
4402 else
4403 fprintf (dump_file,
4404 "Restoring EXEC from saved value before insn %d.\n",
4405 INSN_UID (insn));
4406 }
4407
4408 return exec_value;
4409 }
4410
4411 /* Implement TARGET_MACHINE_DEPENDENT_REORG.
4412
4413 Ensure that pipeline dependencies and lane masking are set correctly. */
4414
4415 static void
4416 gcn_md_reorg (void)
4417 {
4418 basic_block bb;
4419 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4420 regset_head live;
4421
4422 INIT_REG_SET (&live);
4423
4424 compute_bb_for_insn ();
4425
4426 if (!optimize)
4427 {
4428 split_all_insns ();
4429 if (dump_file && (dump_flags & TDF_DETAILS))
4430 {
4431 fprintf (dump_file, "After split:\n");
4432 print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4433 }
4434
4435 /* Update data-flow information for split instructions. */
4436 df_insn_rescan_all ();
4437 }
4438
4439 df_analyze ();
4440
4441 /* This pass ensures that the EXEC register is set correctly, according
4442 to the "exec" attribute. However, care must be taken so that the
4443 value that reaches explicit uses of the EXEC register remains the
4444 same as before.
4445 */
4446
4447 FOR_EACH_BB_FN (bb, cfun)
4448 {
4449 if (dump_file && (dump_flags & TDF_DETAILS))
4450 fprintf (dump_file, "BB %d:\n", bb->index);
4451
4452 rtx_insn *insn, *curr;
4453 rtx_insn *last_exec_def = BB_HEAD (bb);
4454 bool last_exec_def_saved = false;
4455 bool curr_exec_explicit = true;
4456 bool curr_exec_known = true;
4457 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
4458 after last_exec_def is executed'. */
4459
4460 FOR_BB_INSNS_SAFE (bb, insn, curr)
4461 {
4462 if (!NONDEBUG_INSN_P (insn))
4463 continue;
4464
4465 if (GET_CODE (PATTERN (insn)) == USE
4466 || GET_CODE (PATTERN (insn)) == CLOBBER)
4467 continue;
4468
4469 HARD_REG_SET defs, uses;
4470 CLEAR_HARD_REG_SET (defs);
4471 CLEAR_HARD_REG_SET (uses);
4472 note_stores (insn, record_hard_reg_sets, &defs);
4473 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4474
4475 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4476 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4477 bool exec_used = (hard_reg_set_intersect_p
4478 (uses, reg_class_contents[(int) EXEC_MASK_REG])
4479 || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4480
4481 /* Check the instruction for implicit setting of EXEC via an
4482 attribute. */
4483 attr_exec exec_attr = get_attr_exec (insn);
4484 int64_t new_exec;
4485
4486 switch (exec_attr)
4487 {
4488 case EXEC_NONE:
4489 new_exec = 0;
4490 break;
4491
4492 case EXEC_SINGLE:
4493 /* Instructions that do not involve memory accesses only require
4494 bit 0 of EXEC to be set. */
4495 if (gcn_vmem_insn_p (get_attr_type (insn))
4496 || get_attr_type (insn) == TYPE_DS)
4497 new_exec = 1;
4498 else
4499 new_exec = curr_exec | 1;
4500 break;
4501
4502 case EXEC_FULL:
4503 new_exec = -1;
4504 break;
4505
4506 default: /* Auto-detect what setting is appropriate. */
4507 {
4508 new_exec = 0;
4509
4510 /* If EXEC is referenced explicitly then we don't need to do
4511 anything to set it, so we're done. */
4512 if (exec_used)
4513 break;
4514
4515 /* Scan the insn for VGPRs defs or uses. The mode determines
4516 what kind of exec is needed. */
4517 subrtx_iterator::array_type array;
4518 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4519 {
4520 const_rtx x = *iter;
4521 if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4522 {
4523 if (VECTOR_MODE_P (GET_MODE (x)))
4524 {
4525 new_exec = -1;
4526 break;
4527 }
4528 else
4529 new_exec = 1;
4530 }
4531 }
4532 }
4533 break;
4534 }
4535
4536 if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4537 {
4538 start_sequence ();
4539 emit_move_insn (exec_reg, GEN_INT (new_exec));
4540 rtx_insn *seq = get_insns ();
4541 end_sequence ();
4542 emit_insn_before (seq, insn);
4543
4544 if (dump_file && (dump_flags & TDF_DETAILS))
4545 fprintf (dump_file, "Setting EXEC to %" PRId64 " before insn %d.\n",
4546 new_exec, INSN_UID (insn));
4547
4548 curr_exec = new_exec;
4549 curr_exec_explicit = false;
4550 curr_exec_known = true;
4551 }
4552 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4553 {
4554 fprintf (dump_file, "EXEC already is %" PRId64 " before insn %d.\n",
4555 new_exec, INSN_UID (insn));
4556 }
4557
4558 /* The state of the EXEC register is unknown after a
4559 function call. */
4560 if (CALL_P (insn))
4561 curr_exec_known = false;
4562
4563 /* Handle explicit uses of EXEC. If the instruction is a partial
4564 explicit definition of EXEC, then treat it as an explicit use of
4565 EXEC as well. */
4566 if (exec_used || exec_lo_def_p != exec_hi_def_p)
4567 {
4568 /* An instruction that explicitly uses EXEC should not also
4569 implicitly define it. */
4570 gcc_assert (!exec_used || !new_exec);
4571
4572 if (!curr_exec_known || !curr_exec_explicit)
4573 {
4574 /* Restore the previous explicitly defined value. */
4575 curr_exec = gcn_restore_exec (insn, last_exec_def,
4576 curr_exec, curr_exec_known,
4577 last_exec_def_saved);
4578 curr_exec_explicit = true;
4579 curr_exec_known = true;
4580 }
4581 }
4582
4583 /* Handle explicit definitions of EXEC. */
4584 if (exec_lo_def_p || exec_hi_def_p)
4585 {
4586 last_exec_def = insn;
4587 last_exec_def_saved = false;
4588 curr_exec = gcn_insn_exec_value (insn);
4589 curr_exec_explicit = true;
4590 curr_exec_known = true;
4591
4592 if (dump_file && (dump_flags & TDF_DETAILS))
4593 fprintf (dump_file,
4594 "Found %s definition of EXEC at insn %d.\n",
4595 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4596 INSN_UID (insn));
4597 }
4598 }
4599
4600 COPY_REG_SET (&live, DF_LR_OUT (bb));
4601 df_simulate_initialize_backwards (bb, &live);
4602
4603 /* If EXEC is live after the basic block, restore the value of EXEC
4604 at the end of the block. */
4605 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4606 || REGNO_REG_SET_P (&live, EXEC_HI_REG))
4607 && (!curr_exec_known || !curr_exec_explicit))
4608 {
4609 rtx_insn *end_insn = BB_END (bb);
4610
4611 /* If the instruction is not a jump instruction, do the restore
4612 after the last instruction in the basic block. */
4613 if (NONJUMP_INSN_P (end_insn))
4614 end_insn = NEXT_INSN (end_insn);
4615
4616 gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4617 curr_exec_known, last_exec_def_saved);
4618 }
4619 }
4620
4621 CLEAR_REG_SET (&live);
4622
4623 /* "Manually Inserted Wait States (NOPs)."
4624
4625 GCN hardware detects most kinds of register dependencies, but there
4626 are some exceptions documented in the ISA manual. This pass
4627 detects the missed cases, and inserts the documented number of NOPs
4628 required for correct execution. */
4629
4630 const int max_waits = 5;
4631 struct ilist
4632 {
4633 rtx_insn *insn;
4634 attr_unit unit;
4635 attr_delayeduse delayeduse;
4636 HARD_REG_SET writes;
4637 HARD_REG_SET reads;
4638 int age;
4639 } back[max_waits];
4640 int oldest = 0;
4641 for (int i = 0; i < max_waits; i++)
4642 back[i].insn = NULL;
4643
4644 rtx_insn *insn, *last_insn = NULL;
4645 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4646 {
4647 if (!NONDEBUG_INSN_P (insn))
4648 continue;
4649
4650 if (GET_CODE (PATTERN (insn)) == USE
4651 || GET_CODE (PATTERN (insn)) == CLOBBER)
4652 continue;
4653
4654 attr_type itype = get_attr_type (insn);
4655 attr_unit iunit = get_attr_unit (insn);
4656 attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
4657 HARD_REG_SET ireads, iwrites;
4658 CLEAR_HARD_REG_SET (ireads);
4659 CLEAR_HARD_REG_SET (iwrites);
4660 note_stores (insn, record_hard_reg_sets, &iwrites);
4661 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
4662
4663 /* Scan recent previous instructions for dependencies not handled in
4664 hardware. */
4665 int nops_rqd = 0;
4666 for (int i = oldest; i < oldest + max_waits; i++)
4667 {
4668 struct ilist *prev_insn = &back[i % max_waits];
4669
4670 if (!prev_insn->insn)
4671 continue;
4672
4673 /* VALU writes SGPR followed by VMEM reading the same SGPR
4674 requires 5 wait states. */
4675 if ((prev_insn->age + nops_rqd) < 5
4676 && prev_insn->unit == UNIT_VECTOR
4677 && gcn_vmem_insn_p (itype))
4678 {
4679 HARD_REG_SET regs = prev_insn->writes & ireads;
4680 if (hard_reg_set_intersect_p
4681 (regs, reg_class_contents[(int) SGPR_REGS]))
4682 nops_rqd = 5 - prev_insn->age;
4683 }
4684
4685 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
4686 requires 5 wait states. */
4687 if ((prev_insn->age + nops_rqd) < 5
4688 && prev_insn->unit == UNIT_VECTOR
4689 && iunit == UNIT_VECTOR
4690 && ((hard_reg_set_intersect_p
4691 (prev_insn->writes,
4692 reg_class_contents[(int) EXEC_MASK_REG])
4693 && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
4694 ||
4695 (hard_reg_set_intersect_p
4696 (prev_insn->writes,
4697 reg_class_contents[(int) VCC_CONDITIONAL_REG])
4698 && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
4699 nops_rqd = 5 - prev_insn->age;
4700
4701 /* VALU writes SGPR/VCC followed by v_{read,write}lane using
4702 SGPR/VCC as lane select requires 4 wait states. */
4703 if ((prev_insn->age + nops_rqd) < 4
4704 && prev_insn->unit == UNIT_VECTOR
4705 && get_attr_laneselect (insn) == LANESELECT_YES)
4706 {
4707 HARD_REG_SET regs = prev_insn->writes & ireads;
4708 if (hard_reg_set_intersect_p
4709 (regs, reg_class_contents[(int) SGPR_REGS])
4710 || hard_reg_set_intersect_p
4711 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
4712 nops_rqd = 4 - prev_insn->age;
4713 }
4714
4715 /* VALU writes VGPR followed by VALU_DPP reading that VGPR
4716 requires 2 wait states. */
4717 if ((prev_insn->age + nops_rqd) < 2
4718 && prev_insn->unit == UNIT_VECTOR
4719 && itype == TYPE_VOP_DPP)
4720 {
4721 HARD_REG_SET regs = prev_insn->writes & ireads;
4722 if (hard_reg_set_intersect_p
4723 (regs, reg_class_contents[(int) VGPR_REGS]))
4724 nops_rqd = 2 - prev_insn->age;
4725 }
4726
4727 /* Stores that require their input registers not be overwritten by
4728 the following instruction. */
4729 if ((prev_insn->age + nops_rqd) < 1
4730 && prev_insn->delayeduse == DELAYEDUSE_YES
4731 && ((hard_reg_set_intersect_p
4732 (prev_insn->reads, iwrites))))
4733 nops_rqd = 1 - prev_insn->age;
4734 }
4735
4736 /* Insert the required number of NOPs. */
4737 for (int i = nops_rqd; i > 0; i--)
4738 emit_insn_after (gen_nop (), last_insn);
4739
4740 /* Age the previous instructions. We can also ignore writes to
4741 registers subsequently overwritten. */
4742 HARD_REG_SET written;
4743 CLEAR_HARD_REG_SET (written);
4744 for (int i = oldest + max_waits - 1; i > oldest; i--)
4745 {
4746 struct ilist *prev_insn = &back[i % max_waits];
4747
4748 /* Assume all instructions are equivalent to one "wait", the same
4749 as s_nop. This is probably true for SALU, but not VALU (which
4750 may take longer), so this is not optimal. However, AMD do
4751 not publish the cycle times for instructions. */
4752 prev_insn->age += 1 + nops_rqd;
4753
4754 written |= iwrites;
4755 prev_insn->writes &= ~written;
4756 }
4757
4758 /* Track the current instruction as a previous instruction. */
4759 back[oldest].insn = insn;
4760 back[oldest].unit = iunit;
4761 back[oldest].delayeduse = idelayeduse;
4762 back[oldest].writes = iwrites;
4763 back[oldest].reads = ireads;
4764 back[oldest].age = 0;
4765 oldest = (oldest + 1) % max_waits;
4766
4767 last_insn = insn;
4768 }
4769 }
4770
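/* For example, if a VALU instruction writes an SGPR and the very next
   instruction is a VMEM access reading the same SGPR, the scan above
   finds the previous instruction with age 0 and computes nops_rqd = 5,
   so five NOPs are emitted between the two; the counts mirror the
   wait-state rules in the ISA documentation.  */
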
4771 /* }}} */
4772 /* {{{ OpenACC / OpenMP. */
4773
4774 #define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
4775 #define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
4776 #define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
4777
4778 /* Implement TARGET_GOACC_VALIDATE_DIMS.
4779
4780 Check the launch dimensions provided for an OpenACC compute
4781 region, or routine. */
4782
4783 static bool
4784 gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
4785 unsigned /*used*/)
4786 {
4787 bool changed = false;
4788
4789 /* FIXME: remove -facc-experimental-workers when they're ready. */
4790 int max_workers = flag_worker_partitioning ? 16 : 1;
4791
4792 gcc_assert (!flag_worker_partitioning);
4793
4794 /* The vector size must appear to be 64, to the user, unless this is a
4795 SEQ routine. The real, internal value is always 1, which means use
4796 autovectorization, but the user should not see that. */
4797 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4798 && dims[GOMP_DIM_VECTOR] >= 0)
4799 {
4800 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
4801 && dims[GOMP_DIM_VECTOR] != 64)
4802 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4803 OPT_Wopenacc_dims,
4804 (dims[GOMP_DIM_VECTOR]
4805 ? G_("using vector_length (64), ignoring %d")
4806 : G_("using vector_length (64), "
4807 "ignoring runtime setting")),
4808 dims[GOMP_DIM_VECTOR]);
4809 dims[GOMP_DIM_VECTOR] = 1;
4810 changed = true;
4811 }
4812
4813 /* Check the num workers is not too large. */
4814 if (dims[GOMP_DIM_WORKER] > max_workers)
4815 {
4816 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4817 OPT_Wopenacc_dims,
4818 "using num_workers (%d), ignoring %d",
4819 max_workers, dims[GOMP_DIM_WORKER]);
4820 dims[GOMP_DIM_WORKER] = max_workers;
4821 changed = true;
4822 }
4823
4824 /* Set global defaults. */
4825 if (!decl)
4826 {
4827 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
4828 if (dims[GOMP_DIM_WORKER] < 0)
4829 dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
4830 ? GCN_DEFAULT_WORKERS : 1);
4831 if (dims[GOMP_DIM_GANG] < 0)
4832 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
4833 changed = true;
4834 }
4835
4836 return changed;
4837 }
4838
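/* For example, a user writing "#pragma acc parallel vector_length(128)"
   receives the warning "using vector_length (64), ignoring 128", after
   which the internal vector dimension is set to 1 so that ordinary
   autovectorization provides the 64 lanes.  */
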
4839 /* Helper function for oacc_dim_size instruction.
4840 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
4841
4842 rtx
4843 gcn_oacc_dim_size (int dim)
4844 {
4845 if (dim < 0 || dim > 2)
4846 error ("offload dimension out of range (%d)", dim);
4847
4848 /* Vectors are a special case. */
4849 if (dim == 2)
4850 return const1_rtx; /* Think of this as 1 times 64. */
4851
4852 static const int offset[] = {
4853 /* Offsets into dispatch packet. */
4854 12, /* X dim = Gang / Team / Work-group. */
4855 20, /* Z dim = Worker / Thread / Wavefront. */
4856 16 /* Y dim = Vector / SIMD / Work-item. */
4857 };
4858 rtx addr = gen_rtx_PLUS (DImode,
4859 gen_rtx_REG (DImode,
4860 cfun->machine->args.
4861 reg[DISPATCH_PTR_ARG]),
4862 GEN_INT (offset[dim]));
4863 return gen_rtx_MEM (SImode, addr);
4864 }
4865
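/* E.g. gcn_oacc_dim_size (0) yields an SImode MEM at dispatch_ptr + 12,
   which (assuming the standard HSA dispatch packet layout) is the
   grid_size_x field.  */
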
4866 /* Helper function for oacc_dim_pos instruction.
4867 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
4868
4869 rtx
4870 gcn_oacc_dim_pos (int dim)
4871 {
4872 if (dim < 0 || dim > 2)
4873 error ("offload dimension out of range (%d)", dim);
4874
4875 static const int reg[] = {
4876 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
4877 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
4878 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
4879 };
4880
4881 int reg_num = cfun->machine->args.reg[reg[dim]];
4882
4883 /* The information must have been requested by the kernel. */
4884 gcc_assert (reg_num >= 0);
4885
4886 return gen_rtx_REG (SImode, reg_num);
4887 }
4888
4889 /* Implement TARGET_GOACC_FORK_JOIN. */
4890
4891 static bool
4892 gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
4893 bool ARG_UNUSED (is_fork))
4894 {
4895 /* GCN does not use the fork/join concept invented for NVPTX.
4896 Instead we use standard autovectorization. */
4897 return false;
4898 }
4899
4900 /* Implement ???????
4901 FIXME make this a real hook.
4902
4903 Adjust FNDECL such that options inherited from the host compiler
4904 are made appropriate for the accelerator compiler. */
4905
4906 void
4907 gcn_fixup_accel_lto_options (tree fndecl)
4908 {
4909 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4910 if (!func_optimize)
4911 return;
4912
4913 tree old_optimize = build_optimization_node (&global_options);
4914 tree new_optimize;
4915
4916 /* If the function changed the optimization levels as well as
4917 setting target options, start with the optimizations
4918 specified. */
4919 if (func_optimize != old_optimize)
4920 cl_optimization_restore (&global_options,
4921 TREE_OPTIMIZATION (func_optimize));
4922
4923 gcn_option_override ();
4924
4925 /* The target attributes may also change some optimization flags,
4926 so update the optimization options if necessary. */
4927 new_optimize = build_optimization_node (&global_options);
4928
4929 if (old_optimize != new_optimize)
4930 {
4931 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4932 cl_optimization_restore (&global_options,
4933 TREE_OPTIMIZATION (old_optimize));
4934 }
4935 }
4936
4937 /* }}} */
4938 /* {{{ ASM Output. */
4939
4940 /* Implement TARGET_ASM_FILE_START.
4941
4942 Print assembler file header text. */
4943
4944 static void
4945 output_file_start (void)
4946 {
4947 fprintf (asm_out_file, "\t.text\n");
4948 fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n");
4949 fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */
4950 fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n");
4951 fprintf (asm_out_file, "\t.text\n");
4952 }
4953
4954 /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
4955
4956 Print the initial definition of a function name.
4957
4958 For GCN kernel entry points this includes all the HSA meta-data, special
4959 alignment constraints that don't apply to regular functions, and magic
4960 comments that pass information to mkoffload. */
4961
4962 void
4963 gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
4964 {
4965 int sgpr, vgpr;
4966 bool xnack_enabled = false;
4967 int extra_regs = 0;
4968
4969 if (cfun && cfun->machine && cfun->machine->normal_function)
4970 {
4971 fputs ("\t.type\t", file);
4972 assemble_name (file, name);
4973 fputs (",@function\n", file);
4974 assemble_name (file, name);
4975 fputs (":\n", file);
4976 return;
4977 }
4978
4979 /* Determine count of sgpr/vgpr registers by looking for last
4980 one used. */
4981 for (sgpr = 101; sgpr >= 0; sgpr--)
4982 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
4983 break;
4984 sgpr++;
4985 for (vgpr = 255; vgpr >= 0; vgpr--)
4986 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
4987 break;
4988 vgpr++;
4989
4990 if (xnack_enabled)
4991 extra_regs = 6;
4992 if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
4993 || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
4994 extra_regs = 4;
4995 else if (df_regs_ever_live_p (VCC_LO_REG)
4996 || df_regs_ever_live_p (VCC_HI_REG))
4997 extra_regs = 2;
4998
4999 if (!leaf_function_p ())
5000 {
5001 /* We can't know how many registers function calls might use. */
5002 if (vgpr < MAX_NORMAL_VGPR_COUNT)
5003 vgpr = MAX_NORMAL_VGPR_COUNT;
5004 if (sgpr + extra_regs < MAX_NORMAL_SGPR_COUNT)
5005 sgpr = MAX_NORMAL_SGPR_COUNT - extra_regs;
5006 }
5007
5008 /* GFX8 allocates SGPRs in blocks of 8.
5009 GFX9 uses blocks of 16. */
5010 int granulated_sgprs;
5011 if (TARGET_GCN3)
5012 granulated_sgprs = (sgpr + extra_regs + 7) / 8 - 1;
5013 else if (TARGET_GCN5)
5014 granulated_sgprs = 2 * ((sgpr + extra_regs + 15) / 16 - 1);
5015 else
5016 gcc_unreachable ();
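/* For example, with 38 SGPRs in use (sgpr + extra_regs), GCN3 gives
   (38 + 7) / 8 - 1 = 4 and GCN5 gives 2 * ((38 + 15) / 16 - 1) = 4. */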
5017
5018 fputs ("\t.align\t256\n", file);
5019 fputs ("\t.type\t", file);
5020 assemble_name (file, name);
5021 fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
5022 assemble_name (file, name);
5023 fputs ("\n", file);
5024 assemble_name (file, name);
5025 fputs (":\n", file);
5026 fprintf (file, "\t.amd_kernel_code_t\n"
5027 "\t\tkernel_code_version_major = 1\n"
5028 "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
5029 /* "\t\tmachine_version_major = 8\n"
5030 "\t\tmachine_version_minor = 0\n"
5031 "\t\tmachine_version_stepping = 1\n" */
5032 "\t\tkernel_code_entry_byte_offset = 256\n"
5033 "\t\tkernel_code_prefetch_byte_size = 0\n"
5034 "\t\tmax_scratch_backing_memory_byte_size = 0\n"
5035 "\t\tcompute_pgm_rsrc1_vgprs = %i\n"
5036 "\t\tcompute_pgm_rsrc1_sgprs = %i\n"
5037 "\t\tcompute_pgm_rsrc1_priority = 0\n"
5038 "\t\tcompute_pgm_rsrc1_float_mode = 192\n"
5039 "\t\tcompute_pgm_rsrc1_priv = 0\n"
5040 "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
5041 "\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
5042 "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
5043 /* We enable scratch memory. */
5044 "\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
5045 "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
5046 "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
5047 "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
5048 "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
5049 "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
5050 "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
5051 "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
5052 "\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */
5053 "\t\tcompute_pgm_rsrc2_excp_en = 0\n",
5054 (vgpr - 1) / 4,
5055 /* Must match wavefront_sgpr_count */
5056 granulated_sgprs,
5057 /* The total number of SGPR user data registers requested. This
5058 number must match the number of user data registers enabled. */
5059 cfun->machine->args.nsgprs);
5060 int reg = FIRST_SGPR_REG;
5061 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
5062 {
5063 int reg_first = -1;
5064 int reg_last;
5065 if ((cfun->machine->args.requested & (1 << a))
5066 && (gcn_kernel_arg_types[a].fixed_regno < 0))
5067 {
5068 reg_first = reg;
5069 reg_last = (reg_first
5070 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
5071 / UNITS_PER_WORD) - 1);
5072 reg = reg_last + 1;
5073 }
5074
5075 if (gcn_kernel_arg_types[a].header_pseudo)
5076 {
5077 fprintf (file, "\t\t%s = %i",
5078 gcn_kernel_arg_types[a].header_pseudo,
5079 (cfun->machine->args.requested & (1 << a)) != 0);
5080 if (reg_first != -1)
5081 {
5082 fprintf (file, " ; (");
5083 for (int i = reg_first; i <= reg_last; ++i)
5084 {
5085 if (i != reg_first)
5086 fprintf (file, ", ");
5087 fprintf (file, "%s", reg_names[i]);
5088 }
5089 fprintf (file, ")");
5090 }
5091 fprintf (file, "\n");
5092 }
5093 else if (gcn_kernel_arg_types[a].fixed_regno >= 0
5094 && cfun->machine->args.requested & (1 << a))
5095 fprintf (file, "\t\t; %s = %i (%s)\n",
5096 gcn_kernel_arg_types[a].name,
5097 (cfun->machine->args.requested & (1 << a)) != 0,
5098 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
5099 }
5100 fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
5101 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
5102 ? 2
5103 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
5104 ? 1 : 0);
5105 fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
5106 "\t\tprivate_element_size = 1\n"
5107 "\t\tis_ptr64 = 1\n"
5108 "\t\tis_dynamic_callstack = 0\n"
5109 "\t\tis_debug_enabled = 0\n"
5110 "\t\tis_xnack_enabled = %i\n"
5111 "\t\tworkitem_private_segment_byte_size = %i\n"
5112 "\t\tworkgroup_group_segment_byte_size = %u\n"
5113 "\t\tgds_segment_byte_size = 0\n"
5114 "\t\tkernarg_segment_byte_size = %i\n"
5115 "\t\tworkgroup_fbarrier_count = 0\n"
5116 "\t\twavefront_sgpr_count = %i\n"
5117 "\t\tworkitem_vgpr_count = %i\n"
5118 "\t\treserved_vgpr_first = 0\n"
5119 "\t\treserved_vgpr_count = 0\n"
5120 "\t\treserved_sgpr_first = 0\n"
5121 "\t\treserved_sgpr_count = 0\n"
5122 "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
5123 "\t\tdebug_private_segment_buffer_sgpr = 0\n"
5124 "\t\tkernarg_segment_alignment = %i\n"
5125 "\t\tgroup_segment_alignment = 4\n"
5126 "\t\tprivate_segment_alignment = %i\n"
5127 "\t\twavefront_size = 6\n"
5128 "\t\tcall_convention = 0\n"
5129 "\t\truntime_loader_kernel_symbol = 0\n"
5130 "\t.end_amd_kernel_code_t\n", xnack_enabled,
5131 /* workitem_private_segment_byte_size needs to be
5132 one 64th the wave-front stack size. */
5133 stack_size_opt / 64,
5134 LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
5135 /* Number of scalar registers used by a wavefront. This
5136 includes the special SGPRs for VCC, Flat Scratch (Base,
5137 Size) and XNACK (for GFX8 (VI)+). It does not include the
5138 16 SGPRs added if a trap handler is enabled. Must match
5139 compute_pgm_rsrc1.sgprs. */
5140 sgpr + extra_regs, vgpr,
5141 cfun->machine->kernarg_segment_alignment,
5142 crtl->stack_alignment_needed / 8);
5143
5144 /* This comment is read by mkoffload. */
5145 if (flag_openacc)
5146 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5147 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5148 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5149 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5150 }
5151
5152 /* Implement TARGET_ASM_SELECT_SECTION.
5153
5154 Return the section into which EXP should be placed. */
5155
5156 static section *
5157 gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5158 {
5159 if (TREE_TYPE (exp) != error_mark_node
5160 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5161 {
5162 if (!DECL_P (exp))
5163 return get_section (".lds_bss",
5164 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5165 NULL);
5166
5167 return get_named_section (exp, ".lds_bss", reloc);
5168 }
5169
5170 return default_elf_select_section (exp, reloc, align);
5171 }
5172
5173 /* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5174
5175 Emits custom text into the assembler file at the head of each function. */
5176
5177 static void
5178 gcn_target_asm_function_prologue (FILE *file)
5179 {
5180 machine_function *offsets = gcn_compute_frame_offsets ();
5181
5182 asm_fprintf (file, "\t; using %s addressing in function\n",
5183 offsets->use_flat_addressing ? "flat" : "global");
5184
5185 if (offsets->normal_function)
5186 {
5187 asm_fprintf (file, "\t; frame pointer needed: %s\n",
5188 offsets->need_frame_pointer ? "true" : "false");
5189 asm_fprintf (file, "\t; lr needs saving: %s\n",
5190 offsets->lr_needs_saving ? "true" : "false");
5191 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5192 offsets->outgoing_args_size);
5193 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5194 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5195 asm_fprintf (file, "\t; callee save size: %wd\n",
5196 offsets->callee_saves);
5197 }
5198 else
5199 {
5200 asm_fprintf (file, "\t; HSA kernel entry point\n");
5201 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5202 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5203 offsets->outgoing_args_size);
5204
5205 /* Enable denorms. */
5206 asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
5207 " input and output denorms\n");
5208 asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
5209 }
5210 }
5211
5212 /* Helper function for print_operand and print_operand_address.
5213
5214 Print a register as the assembler requires, according to mode and name. */
5215
5216 static void
5217 print_reg (FILE *file, rtx x)
5218 {
5219 machine_mode mode = GET_MODE (x);
5220 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5221 || mode == HFmode || mode == SFmode
5222 || mode == V64SFmode || mode == V64SImode
5223 || mode == V64QImode || mode == V64HImode)
5224 fprintf (file, "%s", reg_names[REGNO (x)]);
5225 else if (mode == DImode || mode == V64DImode
5226 || mode == DFmode || mode == V64DFmode)
5227 {
5228 if (SGPR_REGNO_P (REGNO (x)))
5229 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5230 REGNO (x) - FIRST_SGPR_REG + 1);
5231 else if (VGPR_REGNO_P (REGNO (x)))
5232 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5233 REGNO (x) - FIRST_VGPR_REG + 1);
5234 else if (REGNO (x) == FLAT_SCRATCH_REG)
5235 fprintf (file, "flat_scratch");
5236 else if (REGNO (x) == EXEC_REG)
5237 fprintf (file, "exec");
5238 else if (REGNO (x) == VCC_LO_REG)
5239 fprintf (file, "vcc");
5240 else
5241 fprintf (file, "[%s:%s]",
5242 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5243 }
5244 else if (mode == TImode)
5245 {
5246 if (SGPR_REGNO_P (REGNO (x)))
5247 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5248 REGNO (x) - FIRST_SGPR_REG + 3);
5249 else if (VGPR_REGNO_P (REGNO (x)))
5250 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5251 REGNO (x) - FIRST_VGPR_REG + 3);
5252 else
5253 gcc_unreachable ();
5254 }
5255 else
5256 gcc_unreachable ();
5257 }
5258
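/* E.g. a DImode value in the SGPR pair starting at s4 prints as
   "s[4:5]", while a TImode value in VGPRs starting at v8 prints as
   "v[8:11]".  */
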
5259 /* Implement TARGET_SECTION_TYPE_FLAGS.
5260
5261 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
5262
5263 static unsigned int
5264 gcn_section_type_flags (tree decl, const char *name, int reloc)
5265 {
5266 if (strcmp (name, ".lds_bss") == 0)
5267 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5268
5269 return default_section_type_flags (decl, name, reloc);
5270 }
5271
5272 /* Helper function for gcn_asm_output_symbol_ref.
5273
5274 FIXME: If we want to have propagation blocks allocated separately and
5275 statically like this, it would be better done via symbol refs and the
5276 assembler/linker. This is a temporary hack. */
5277
5278 static void
5279 gcn_print_lds_decl (FILE *f, tree var)
5280 {
5281 int *offset;
5282 machine_function *machfun = cfun->machine;
5283
5284 if ((offset = machfun->lds_allocs->get (var)))
5285 fprintf (f, "%u", (unsigned) *offset);
5286 else
5287 {
5288 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5289 tree type = TREE_TYPE (var);
5290 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5291 if (size > align && size > 4 && align < 8)
5292 align = 8;
5293
5294 machfun->lds_allocated = ((machfun->lds_allocated + align - 1)
5295 & ~(align - 1));
5296
5297 machfun->lds_allocs->put (var, machfun->lds_allocated);
5298 fprintf (f, "%u", machfun->lds_allocated);
5299 machfun->lds_allocated += size;
5300 if (machfun->lds_allocated > LDS_SIZE)
5301 error ("local data-share memory exhausted");
5302 }
5303 }
5304
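/* Allocation example: with lds_allocated currently 32, a 12-byte
   variable declared with 4-byte alignment is promoted to 8-byte
   alignment by the heuristic above, receives offset 32, and advances
   lds_allocated to 44.  */
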
5305 /* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
5306
5307 void
5308 gcn_asm_output_symbol_ref (FILE *file, rtx x)
5309 {
5310 tree decl;
5311 if (cfun
5312 && (decl = SYMBOL_REF_DECL (x)) != 0
5313 && TREE_CODE (decl) == VAR_DECL
5314 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5315 {
5316 /* LDS symbols (emitted using this hook) are only used at present
5317 to propagate worker values from an active thread to neutered
5318 threads. Use the same offset for each such block, but don't
5319 use zero because null pointers are used to identify the active
5320 thread in GOACC_single_copy_start calls. */
5321 gcn_print_lds_decl (file, decl);
5322 }
5323 else
5324 {
5325 assemble_name (file, XSTR (x, 0));
5326 /* FIXME: See above -- this condition is unreachable. */
5327 if (cfun
5328 && (decl = SYMBOL_REF_DECL (x)) != 0
5329 && TREE_CODE (decl) == VAR_DECL
5330 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5331 fputs ("@abs32", file);
5332 }
5333 }
5334
5335 /* Implement TARGET_CONSTANT_ALIGNMENT.
5336
5337 Returns the alignment in bits of a constant that is being placed in memory.
5338 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5339 would ordinarily have. */
5340
5341 static HOST_WIDE_INT
5342 gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5343 HOST_WIDE_INT basic_align)
5344 {
5345 return basic_align > 128 ? basic_align : 128;
5346 }
5347
5348 /* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
5349
5350 void
5351 print_operand_address (FILE *file, rtx mem)
5352 {
5353 gcc_assert (MEM_P (mem));
5354
5355 rtx reg;
5356 rtx offset;
5357 addr_space_t as = MEM_ADDR_SPACE (mem);
5358 rtx addr = XEXP (mem, 0);
5359 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5360
5361 if (AS_SCRATCH_P (as))
5362 switch (GET_CODE (addr))
5363 {
5364 case REG:
5365 print_reg (file, addr);
5366 break;
5367
5368 case PLUS:
5369 reg = XEXP (addr, 0);
5370 offset = XEXP (addr, 1);
5371 print_reg (file, reg);
5372 if (GET_CODE (offset) == CONST_INT)
5373 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5374 else
5375 abort ();
5376 break;
5377
5378 default:
5379 debug_rtx (addr);
5380 abort ();
5381 }
5382 else if (AS_ANY_FLAT_P (as))
5383 {
5384 if (GET_CODE (addr) == REG)
5385 print_reg (file, addr);
5386 else
5387 {
5388 gcc_assert (TARGET_GCN5_PLUS);
5389 print_reg (file, XEXP (addr, 0));
5390 }
5391 }
5392 else if (AS_GLOBAL_P (as))
5393 {
5394 gcc_assert (TARGET_GCN5_PLUS);
5395
5396 rtx base = addr;
5397 rtx vgpr_offset = NULL_RTX;
5398
5399 if (GET_CODE (addr) == PLUS)
5400 {
5401 base = XEXP (addr, 0);
5402
5403 if (GET_CODE (base) == PLUS)
5404 {
5405 /* (SGPR + VGPR) + CONST */
5406 vgpr_offset = XEXP (base, 1);
5407 base = XEXP (base, 0);
5408 }
5409 else
5410 {
5411 rtx offset = XEXP (addr, 1);
5412
5413 if (REG_P (offset))
5414 /* SGPR + VGPR */
5415 vgpr_offset = offset;
5416 else if (CONST_INT_P (offset))
5417 /* VGPR + CONST or SGPR + CONST */
5418 ;
5419 else
5420 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5421 }
5422 }
5423
5424 if (REG_P (base))
5425 {
5426 if (VGPR_REGNO_P (REGNO (base)))
5427 print_reg (file, base);
5428 else if (SGPR_REGNO_P (REGNO (base)))
5429 {
5430 /* The assembler requires a 64-bit VGPR pair here, even though
5431 the offset should be only 32-bit. */
5432 if (vgpr_offset == NULL_RTX)
5433 /* In this case, the vector offset is zero, so we use the first
5434 lane of v1, which is initialized to zero. */
5435 fprintf (file, "v[1:2]");
5436 else if (REG_P (vgpr_offset)
5437 && VGPR_REGNO_P (REGNO (vgpr_offset)))
5438 {
5439 fprintf (file, "v[%d:%d]",
5440 REGNO (vgpr_offset) - FIRST_VGPR_REG,
5441 REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5442 }
5443 else
5444 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5445 }
5446 }
5447 else
5448 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5449 }
5450 else if (AS_ANY_DS_P (as))
5451 switch (GET_CODE (addr))
5452 {
5453 case REG:
5454 print_reg (file, addr);
5455 break;
5456
5457 case PLUS:
5458 reg = XEXP (addr, 0);
5459 print_reg (file, reg);
5460 break;
5461
5462 default:
5463 debug_rtx (addr);
5464 abort ();
5465 }
5466 else
5467 switch (GET_CODE (addr))
5468 {
5469 case REG:
5470 print_reg (file, addr);
5471 fprintf (file, ", 0");
5472 break;
5473
5474 case PLUS:
5475 reg = XEXP (addr, 0);
5476 offset = XEXP (addr, 1);
5477 print_reg (file, reg);
5478 fprintf (file, ", ");
        if (GET_CODE (offset) == REG)
          print_reg (file, offset);
        else if (GET_CODE (offset) == CONST_INT)
          fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
        else
          abort ();
        break;

      default:
        debug_rtx (addr);
        abort ();
      }
}
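
/* Illustrative outputs from print_operand_address (register numbers are
   hypothetical): a scratch access may print "s2 offset:16", a GCN5 global
   access with a VGPR base prints the register pair "v[4:5]", one with an
   SGPR base prints "v[1:2]" (the zero-initialized lane), and the fallback
   buffer form prints "s2, 0".  */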

/* Implement PRINT_OPERAND via gcn.h.

   b - print operand size as untyped operand (b8/b16/b32/b64)
   B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
   e - print operand wrapped in "sext(...)"
   i - print operand size as typed operand (i8/i16/i32/i64, or f16/f32/f64
       for floating-point modes)
   I - print operand size as SI/DI typed operand (i32/i32/i32/i64)
   u - print operand size as unsigned typed operand (u8/u16/u32/u64)
   U - print operand size as SI/DI unsigned typed operand (u32/u32/u32/u64)
   o - print operand size as memory access size for loads
       (ubyte/ushort/dword/dwordx2/dwordx3/dwordx4)
   s - print operand size as memory access size for stores
       (byte/short/dword/dwordx2/dwordx3/dwordx4)
   C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
   c - print inverse conditional code for s_cbranch
   D - print conditional code for s_cmp (eq_u64/lg_u64...)
   E - print conditional code for v_cmp (eq_u64/ne_u64...)
   A - print address in formatting suitable for given address space.
   O - print offset:n for data share operations.
   L - print low-part of a multi-register value.
   H - print high-part of a multi-register value.
   R - print a scalar register number as an integer (temporary hack).
   V - print a vector register number as an integer (temporary hack).
   ^ - print "_co" suffix for GCN5 mnemonics
   g - print "glc", if appropriate for given MEM  */
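
/* For example (expansions illustrative, not taken from the machine
   description): a template "s_cmp%D1\t%2, %3" might emit
   "s_cmp_lt_i32  s4, s5" for a signed SImode less-than test, and
   "v_add%^_u32" emits "v_add_co_u32" when compiling for GCN5.  */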

void
print_operand (FILE *file, rtx x, int code)
{
  int xcode = x ? GET_CODE (x) : 0;
  bool invert = false;
  switch (code)
    {
      /* Instructions have the following suffixes.
         If there are two suffixes, the first is the destination type,
         and the second is the source type.

         B32 Bitfield (untyped data) 32-bit
         B64 Bitfield (untyped data) 64-bit
         F16 floating-point 16-bit
         F32 floating-point 32-bit (IEEE 754 single-precision float)
         F64 floating-point 64-bit (IEEE 754 double-precision float)
         I16 signed 16-bit integer
         I32 signed 32-bit integer
         I64 signed 64-bit integer
         U16 unsigned 16-bit integer
         U32 unsigned 32-bit integer
         U64 unsigned 64-bit integer  */

      /* Print operand size as untyped suffix.  */
    case 'b':
      {
        const char *s = "";
        machine_mode mode = GET_MODE (x);
        if (VECTOR_MODE_P (mode))
          mode = GET_MODE_INNER (mode);
        switch (GET_MODE_SIZE (mode))
          {
          case 1:
            s = "_b8";
            break;
          case 2:
            s = "_b16";
            break;
          case 4:
            s = "_b32";
            break;
          case 8:
            s = "_b64";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
      }
      return;
    case 'B':
      {
        const char *s = "";
        machine_mode mode = GET_MODE (x);
        if (VECTOR_MODE_P (mode))
          mode = GET_MODE_INNER (mode);
        switch (GET_MODE_SIZE (mode))
          {
          case 1:
          case 2:
          case 4:
            s = "_b32";
            break;
          case 8:
            s = "_b64";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
      }
      return;
    case 'e':
      fputs ("sext(", file);
      print_operand (file, x, 0);
      fputs (")", file);
      return;
    case 'i':
    case 'I':
    case 'u':
    case 'U':
      {
        bool signed_p = code == 'i' || code == 'I';
        bool min32_p = code == 'I' || code == 'U';
        const char *s = "";
        machine_mode mode = GET_MODE (x);
        if (VECTOR_MODE_P (mode))
          mode = GET_MODE_INNER (mode);
        if (mode == VOIDmode)
          switch (GET_CODE (x))
            {
            case CONST_INT:
              s = signed_p ? "_i32" : "_u32";
              break;
            case CONST_DOUBLE:
              s = "_f64";
              break;
            default:
              output_operand_lossage ("invalid operand %%xn code");
              return;
            }
        else if (FLOAT_MODE_P (mode))
          switch (GET_MODE_SIZE (mode))
            {
            case 2:
              s = "_f16";
              break;
            case 4:
              s = "_f32";
              break;
            case 8:
              s = "_f64";
              break;
            default:
              output_operand_lossage ("invalid operand %%xn code");
              return;
            }
        else if (min32_p)
          switch (GET_MODE_SIZE (mode))
            {
            case 1:
            case 2:
            case 4:
              s = signed_p ? "_i32" : "_u32";
              break;
            case 8:
              s = signed_p ? "_i64" : "_u64";
              break;
            default:
              output_operand_lossage ("invalid operand %%xn code");
              return;
            }
        else
          switch (GET_MODE_SIZE (mode))
            {
            case 1:
              s = signed_p ? "_i8" : "_u8";
              break;
            case 2:
              s = signed_p ? "_i16" : "_u16";
              break;
            case 4:
              s = signed_p ? "_i32" : "_u32";
              break;
            case 8:
              s = signed_p ? "_i64" : "_u64";
              break;
            default:
              output_operand_lossage ("invalid operand %%xn code");
              return;
            }
        fputs (s, file);
      }
      return;
      /* Print operand size as memory access size for loads.  */
    case 'o':
      {
        const char *s = 0;
        switch (GET_MODE_SIZE (GET_MODE (x)))
          {
          case 1:
            s = "_ubyte";
            break;
          case 2:
            s = "_ushort";
            break;
          /* The following are full-vector variants.  */
          case 64:
            s = "_ubyte";
            break;
          case 128:
            s = "_ushort";
            break;
          }

        if (s)
          {
            fputs (s, file);
            return;
          }

        /* Fall-through - the other cases for 'o' are the same as for 's'.  */
        gcc_fallthrough ();
      }
    case 's':
      {
        const char *s = "";
        switch (GET_MODE_SIZE (GET_MODE (x)))
          {
          case 1:
            s = "_byte";
            break;
          case 2:
            s = "_short";
            break;
          case 4:
            s = "_dword";
            break;
          case 8:
            s = "_dwordx2";
            break;
          case 12:
            s = "_dwordx3";
            break;
          case 16:
            s = "_dwordx4";
            break;
          case 32:
            s = "_dwordx8";
            break;
          case 64:
            s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
            break;
          /* The following are full-vector variants.  */
          case 128:
            s = "_short";
            break;
          case 256:
            s = "_dword";
            break;
          case 512:
            s = "_dwordx2";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
      }
      return;
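
      /* Examples for the 'o' and 's' codes above (illustrative): a DImode
         store prints "_dwordx2", while a V64SImode vector access
         (64 lanes x 4 bytes = 256 bytes) prints "_dword", the per-lane
         access size.  */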
    case 'A':
      if (xcode != MEM)
        {
          output_operand_lossage ("invalid %%xn code");
          return;
        }
      print_operand_address (file, x);
      return;
    case 'O':
      {
        if (xcode != MEM)
          {
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        if (AS_GDS_P (MEM_ADDR_SPACE (x)))
          fprintf (file, " gds");

        rtx x0 = XEXP (x, 0);
        if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
          {
            gcc_assert (TARGET_GCN5_PLUS);

            fprintf (file, ", ");

            rtx base = x0;
            rtx const_offset = NULL_RTX;

            if (GET_CODE (base) == PLUS)
              {
                rtx offset = XEXP (x0, 1);
                base = XEXP (x0, 0);

                if (GET_CODE (base) == PLUS)
                  /* (SGPR + VGPR) + CONST  */
                  /* Ignore the VGPR offset for this operand.  */
                  base = XEXP (base, 0);

                if (CONST_INT_P (offset))
                  const_offset = XEXP (x0, 1);
                else if (REG_P (offset))
                  /* SGPR + VGPR  */
                  /* Ignore the VGPR offset for this operand.  */
                  ;
                else
                  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
              }

            if (REG_P (base))
              {
                if (VGPR_REGNO_P (REGNO (base)))
                  /* The VGPR address is specified in the %A operand.  */
                  fprintf (file, "off");
                else if (SGPR_REGNO_P (REGNO (base)))
                  print_reg (file, base);
                else
                  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
              }
            else
              output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");

            if (const_offset != NULL_RTX)
              fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
                       INTVAL (const_offset));

            return;
          }

        if (GET_CODE (x0) == REG)
          return;
        if (GET_CODE (x0) != PLUS)
          {
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        rtx val = XEXP (x0, 1);
        if (GET_CODE (val) == CONST_VECTOR)
          val = CONST_VECTOR_ELT (val, 0);
        if (GET_CODE (val) != CONST_INT)
          {
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
      }
      return;
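
      /* Examples for the 'O' code above (illustrative): an LDS access at
         constant offset 8 prints " offset:8"; a GDS access prints " gds"
         first; a GCN5 global access with SGPR base pair s[4:5] and constant
         offset 16 prints ", s[4:5] offset:16" (register numbers
         hypothetical).  */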
    case 'c':
      invert = true;
      /* Fall through.  */
    case 'C':
      {
        const char *s;
        bool num = false;
        if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
          {
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        switch (REGNO (XEXP (x, 0)))
          {
          case VCC_REG:
          case VCCZ_REG:
            s = "_vcc";
            break;
          case SCC_REG:
            /* For some reason llvm-mc insists on scc0 instead of sccz.  */
            num = true;
            s = "_scc";
            break;
          case EXECZ_REG:
            s = "_exec";
            break;
          default:
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        fputs (s, file);
        if (xcode == (invert ? NE : EQ))
          fputc (num ? '0' : 'z', file);
        else
          fputs (num ? "1" : "nz", file);
        return;
      }
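
      /* Examples for the 'C' and 'c' codes above (illustrative): an EQ test
         of VCC prints "_vccz" via %C and "_vccnz" via %c; an NE test of SCC
         prints "_scc1", giving e.g. "s_cbranch_scc1".  */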
    case 'D':
      {
        const char *s;
        bool cmp_signed = false;
        switch (xcode)
          {
          case EQ:
            s = "_eq_";
            break;
          case NE:
            s = "_lg_";
            break;
          case LT:
            s = "_lt_";
            cmp_signed = true;
            break;
          case LE:
            s = "_le_";
            cmp_signed = true;
            break;
          case GT:
            s = "_gt_";
            cmp_signed = true;
            break;
          case GE:
            s = "_ge_";
            cmp_signed = true;
            break;
          case LTU:
            s = "_lt_";
            break;
          case LEU:
            s = "_le_";
            break;
          case GTU:
            s = "_gt_";
            break;
          case GEU:
            s = "_ge_";
            break;
          default:
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        fputs (s, file);
        fputc (cmp_signed ? 'i' : 'u', file);

        machine_mode mode = GET_MODE (XEXP (x, 0));

        if (mode == VOIDmode)
          mode = GET_MODE (XEXP (x, 1));

        /* If both sides are constants, then assume the instruction is in
           SImode since s_cmp can only do integer compares.  */
        if (mode == VOIDmode)
          mode = SImode;

        switch (GET_MODE_SIZE (mode))
          {
          case 4:
            s = "32";
            break;
          case 8:
            s = "64";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
        return;
      }
    case 'E':
      {
        const char *s;
        bool cmp_signed = false;
        machine_mode mode = GET_MODE (XEXP (x, 0));

        if (mode == VOIDmode)
          mode = GET_MODE (XEXP (x, 1));

        /* If both sides are constants, assume the instruction is in SFmode
           if either operand is floating point, otherwise assume SImode.  */
        if (mode == VOIDmode)
          {
            if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
                || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
              mode = SFmode;
            else
              mode = SImode;
          }

        /* Use the same format code for vector comparisons.  */
        if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
            || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
          mode = GET_MODE_INNER (mode);

        bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;

        switch (xcode)
          {
          case EQ:
            s = "_eq_";
            break;
          case NE:
            s = float_p ? "_neq_" : "_ne_";
            break;
          case LT:
            s = "_lt_";
            cmp_signed = true;
            break;
          case LE:
            s = "_le_";
            cmp_signed = true;
            break;
          case GT:
            s = "_gt_";
            cmp_signed = true;
            break;
          case GE:
            s = "_ge_";
            cmp_signed = true;
            break;
          case LTU:
            s = "_lt_";
            break;
          case LEU:
            s = "_le_";
            break;
          case GTU:
            s = "_gt_";
            break;
          case GEU:
            s = "_ge_";
            break;
          case ORDERED:
            s = "_o_";
            break;
          case UNORDERED:
            s = "_u_";
            break;
          case UNEQ:
            s = "_nlg_";
            break;
          case UNGE:
            s = "_nlt_";
            break;
          case UNGT:
            s = "_nle_";
            break;
          case UNLE:
            s = "_ngt_";
            break;
          case UNLT:
            s = "_nge_";
            break;
          case LTGT:
            s = "_lg_";
            break;
          default:
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        fputs (s, file);
        fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);

        switch (GET_MODE_SIZE (mode))
          {
          case 1:
            output_operand_lossage ("operand %%xn code invalid for QImode");
            return;
          case 2:
            s = "16";
            break;
          case 4:
            s = "32";
            break;
          case 8:
            s = "64";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
        return;
      }
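
      /* Examples for the 'E' code above (illustrative): a signed V64SImode
         less-than prints "_lt_i32", an unordered DFmode test prints
         "_u_f64", and an SFmode not-equal prints "_neq_f32", matching the
         v_cmp mnemonic suffixes.  */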
    case 'L':
      print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
      return;
    case 'H':
      print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
      return;
    case 'R':
      /* Print a scalar register number as an integer.  Temporary hack.  */
      gcc_assert (REG_P (x));
      fprintf (file, "%u", (unsigned) REGNO (x));
      return;
    case 'V':
      /* Print a vector register number as an integer.  Temporary hack.  */
      gcc_assert (REG_P (x));
      fprintf (file, "%u", (unsigned) (REGNO (x) - FIRST_VGPR_REG));
      return;
    case 0:
      if (xcode == REG)
        print_reg (file, x);
      else if (xcode == MEM)
        output_address (GET_MODE (x), x);
      else if (xcode == CONST_INT)
        fprintf (file, "%i", (int) INTVAL (x));
      else if (xcode == CONST_VECTOR)
        print_operand (file, CONST_VECTOR_ELT (x, 0), code);
      else if (xcode == CONST_DOUBLE)
        {
          const char *str;
          switch (gcn_inline_fp_constant_p (x, false))
            {
            case 240:
              str = "0.5";
              break;
            case 241:
              str = "-0.5";
              break;
            case 242:
              str = "1.0";
              break;
            case 243:
              str = "-1.0";
              break;
            case 244:
              str = "2.0";
              break;
            case 245:
              str = "-2.0";
              break;
            case 246:
              str = "4.0";
              break;
            case 247:
              str = "-4.0";
              break;
            case 248:
              str = "1/pi";
              break;
            default:
              rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
                                            ? DImode : SImode,
                                            x, GET_MODE (x), 0);
              if (ix)
                print_operand (file, ix, code);
              else
                output_operand_lossage ("invalid fp constant");
              return;
            }
          fputs (str, file);
          return;
        }
      else
        output_addr_const (file, x);
      return;
    case '^':
      if (TARGET_GCN5_PLUS)
        fputs ("_co", file);
      return;
    case 'g':
      gcc_assert (xcode == MEM);
      if (MEM_VOLATILE_P (x))
        fputs (" glc", file);
      return;
    default:
      output_operand_lossage ("invalid %%xn code");
    }
  gcc_unreachable ();
}

/* }}} */
/* {{{ TARGET hook overrides.  */

#undef TARGET_ADDR_SPACE_ADDRESS_MODE
#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
  gcn_addr_space_legitimate_address_p
#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
#undef TARGET_ADDR_SPACE_POINTER_MODE
#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
#undef TARGET_ADDR_SPACE_SUBSET_P
#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
#undef TARGET_ADDR_SPACE_CONVERT
#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
#undef TARGET_ARG_PARTIAL_BYTES
#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START output_file_start
#undef TARGET_ASM_FUNCTION_PROLOGUE
#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
#undef TARGET_ASM_SELECT_SECTION
#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL gcn_builtin_decl
#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
#undef TARGET_DEBUG_UNWIND_INFO
#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
#undef TARGET_EMUTLS_VAR_INIT
#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG gcn_function_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE gcn_function_value
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD
#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
  gcn_goacc_adjust_propagation_record
#undef TARGET_GOACC_ADJUST_GANGPRIVATE_DECL
#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl
#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN gcn_fork_join
#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS gcn_init_builtins
#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  gcn_ira_change_pseudo_allocno_class
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true
#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE gcn_option_override
#undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
  gcn_pretend_outgoing_varargs_named
#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS gcn_rtx_costs
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
#undef TARGET_SECTION_TYPE_FLAGS
#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
#undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
  gcn_small_register_classes_for_mode_p
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS gcn_spill_class
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
#undef TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  gcn_preferred_vector_alignment
#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  gcn_vectorize_support_vector_misalignment
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  gcn_vector_alignment_reachable
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-gcn.h"
/* }}} */