/* Copyright (C) 2016-2022 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes.  */

/* We want GET_MODE_SIZE et al to return integers, please.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "memmodel.h"
#include "rtl.h"
#include "tree.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic-core.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "calls.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "cfgrtl.h"
#include "langhooks.h"
#include "builtins.h"
#include "omp-general.h"
#include "print-rtl.h"
#include "attribs.h"
#include "varasm.h"
#include "intl.h"
#include "rtl-iter.h"
#include "dwarf2.h"
#include "gimple.h"

/* This file should be included last.  */
#include "target-def.h"

/* }}} */
/* {{{ Global variables.  */

/* Constants used by FP instructions.  */

static REAL_VALUE_TYPE dconst4, dconst1over2pi;
static bool ext_gcn_constants_init = 0;

/* Holds the ISA variant, derived from the command line parameters.  */

int gcn_isa = 3;		/* Default to GCN3.  */

/* Reserve this much space for LDS (for propagating variables from
   worker-single mode to worker-partitioned mode), per workgroup.  Global
   analysis could calculate an exact bound, but we don't do that yet.

   We want to permit full occupancy, so size accordingly.  */

/* Use this as a default, but allow it to grow if the user requests a large
   amount of gang-private shared-memory space.  */
static int acc_lds_size = 0x600;

#define OMP_LDS_SIZE 0x600	/* 0x600 is 1/40 total, rounded down.  */
#define ACC_LDS_SIZE acc_lds_size
#define OTHER_LDS_SIZE 65536	/* If in doubt, reserve all of it.  */

#define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
		  : flag_openmp ? OMP_LDS_SIZE \
		  : OTHER_LDS_SIZE)
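
/* Worked example (plain arithmetic, not a rule from elsewhere in GCC):
   an OpenMP offload kernel gets OMP_LDS_SIZE = 0x600 = 1536 bytes of the
   64KB of LDS (OTHER_LDS_SIZE), leaving the rest free for other uses.
   An OpenACC kernel starts from the same 1536-byte default, but
   acc_lds_size may grow, up to 32768 bytes, when a large gang-private
   size is requested (see gcn_option_override below).  */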

static int gang_private_hwm = 32;
static hash_map<tree, int> lds_allocs;

/* The number of registers usable by normal non-kernel functions.
   The SGPR count includes any special extra registers such as VCC.  */

#define MAX_NORMAL_SGPR_COUNT	62  // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT	24

/* }}} */
/* {{{ Initialization and options.  */

/* Initialize machine_function.  */

static struct machine_function *
gcn_init_machine_status (void)
{
  struct machine_function *f;

  f = ggc_cleared_alloc<machine_function> ();

  if (TARGET_GCN3)
    f->use_flat_addressing = true;

  return f;
}

/* Implement TARGET_OPTION_OVERRIDE.

   Override option settings where defaults are variable, or we have specific
   needs to consider.  */

static void
gcn_option_override (void)
{
  init_machine_status = gcn_init_machine_status;

  /* The HSA runtime does not respect ELF load addresses, so force PIE.  */
  if (!flag_pie)
    flag_pie = 2;
  if (!flag_pic)
    flag_pic = flag_pie;

  gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;

  /* The default stack size needs to be small for offload kernels because
     there may be many, many threads.  Also, a smaller stack gives a
     measurable performance boost.  But, a small stack is insufficient
     for running the testsuite, so we use a larger default for the
     stand-alone case.  */
  if (stack_size_opt == -1)
    {
      if (flag_openacc || flag_openmp)
	/* 512 bytes per work item = 32kB total.  */
	stack_size_opt = 512 * 64;
      else
	/* 1MB total.  */
	stack_size_opt = 1048576;
    }

  /* Reserve 1KB (somewhat arbitrarily) of LDS space for reduction results
     and worker broadcasts.  */
  if (gang_private_size_opt == -1)
    gang_private_size_opt = 512;
  else if (gang_private_size_opt < gang_private_hwm)
    gang_private_size_opt = gang_private_hwm;
  else if (gang_private_size_opt >= acc_lds_size - 1024)
    {
      /* We need some space for reductions and worker broadcasting.  If the
	 user requests a large amount of gang-private LDS space, we might not
	 have enough left for the former.  Increase the LDS allocation in that
	 case, although this may reduce the maximum occupancy on the
	 hardware.  */
      acc_lds_size = gang_private_size_opt + 1024;
      if (acc_lds_size > 32768)
	acc_lds_size = 32768;
    }

  /* The xnack option is a placeholder, for now.  */
  if (flag_xnack)
    sorry ("XNACK support");
}

/* }}} */
/* {{{ Attributes.  */

/* This table defines the arguments that are permitted in
   __attribute__ ((amdgpu_hsa_kernel (...))).

   The names and values correspond to the HSA metadata that is encoded
   into the assembler file and binary.  */

static const struct gcn_kernel_arg_type
{
  const char *name;
  const char *header_pseudo;
  machine_mode mode;

  /* This should be set to -1 or -2 for a dynamically allocated register
     number.  Use -1 if this argument contributes to the user_sgpr_count,
     -2 otherwise.  */
  int fixed_regno;
} gcn_kernel_arg_types[] = {
  {"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
  {"private_segment_buffer",
   ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
  {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
  {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
  {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
  {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
  {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
  {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
#define WORKGROUP_ID_X_ARG 8
  {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
  {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
  {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
  {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
  {"private_segment_wave_offset",
   ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 13
  {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
#define WORK_ITEM_ID_Y_ARG 14
  {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
#define WORK_ITEM_ID_Z_ARG 15
  {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};

static const long default_requested_args
  = (1 << PRIVATE_SEGMENT_BUFFER_ARG)
    | (1 << DISPATCH_PTR_ARG)
    | (1 << QUEUE_PTR_ARG)
    | (1 << KERNARG_SEGMENT_PTR_ARG)
    | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
    | (1 << WORKGROUP_ID_X_ARG)
    | (1 << WORK_ITEM_ID_X_ARG)
    | (1 << WORK_ITEM_ID_Y_ARG)
    | (1 << WORK_ITEM_ID_Z_ARG);
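
/* Illustrative use only (hypothetical user code, not part of GCC):

     void f (void)
       __attribute__ ((amdgpu_hsa_kernel ("flat_scratch_init",
					  "private_segment_size")));

   Each string must match a .name field in gcn_kernel_arg_types; the
   arguments listed in default_requested_args above are requested
   implicitly, and repeating one of them is an error.  */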

/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
   This function also sets the default values for some arguments.

   Return true on error; otherwise ARGS has been populated.  */

static bool
gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
				       tree list)
{
  bool err = false;
  args->requested = default_requested_args;
  args->nargs = 0;

  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
    args->reg[a] = -1;

  for (; list; list = TREE_CHAIN (list))
    {
      const char *str;
      if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
	{
	  error ("%<amdgpu_hsa_kernel%> attribute requires string constant "
		 "arguments");
	  break;
	}
      str = TREE_STRING_POINTER (TREE_VALUE (list));
      int a;
      for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
	{
	  if (!strcmp (str, gcn_kernel_arg_types[a].name))
	    break;
	}
      if (a == GCN_KERNEL_ARG_TYPES)
	{
	  error ("unknown specifier %qs in %<amdgpu_hsa_kernel%> attribute",
		 str);
	  err = true;
	  break;
	}
      if (args->requested & (1 << a))
	{
	  error ("duplicated parameter specifier %qs in %<amdgpu_hsa_kernel%> "
		 "attribute", str);
	  err = true;
	  break;
	}
      args->requested |= (1 << a);
      args->order[args->nargs++] = a;
    }

  /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
     WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
     requesting WORK_ITEM_ID_X_ARG.  */
  if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
    args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
  if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
    args->requested |= (1 << WORK_ITEM_ID_X_ARG);

  int sgpr_regno = FIRST_SGPR_REG;
  args->nsgprs = 0;
  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
    {
      if (!(args->requested & (1 << a)))
	continue;

      if (gcn_kernel_arg_types[a].fixed_regno >= 0)
	args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
      else
	{
	  int reg_count;

	  switch (gcn_kernel_arg_types[a].mode)
	    {
	    case E_SImode:
	      reg_count = 1;
	      break;
	    case E_DImode:
	      reg_count = 2;
	      break;
	    case E_TImode:
	      reg_count = 4;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  args->reg[a] = sgpr_regno;
	  sgpr_regno += reg_count;
	  if (gcn_kernel_arg_types[a].fixed_regno == -1)
	    args->nsgprs += reg_count;
	}
    }
  if (sgpr_regno > FIRST_SGPR_REG + 16)
    error ("too many arguments passed in sgpr registers");

  return err;
}

/* Referenced by TARGET_ATTRIBUTE_TABLE.

   Validates target specific attributes.  */

static tree
gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
					tree args, int, bool *no_add_attrs)
{
  if (!FUNC_OR_METHOD_TYPE_P (*node))
    {
      warning (OPT_Wattributes, "%qE attribute only applies to functions",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* Validate the argument list against the known argument names.  */
  if (is_attribute_p ("amdgpu_hsa_kernel", name))
    {
      struct gcn_kernel_args kernelarg;

      if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
	*no_add_attrs = true;

      return NULL_TREE;
    }

  return NULL_TREE;
}

/* Implement TARGET_ATTRIBUTE_TABLE.

   Create target-specific __attribute__ types.  */

static const struct attribute_spec gcn_attribute_table[] = {
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
   true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
  /* End element.  */
  {NULL, 0, 0, false, false, false, false, NULL, NULL}
};

/* }}} */
/* {{{ Registers and modes.  */

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P.  */

bool
gcn_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == BImode
	  || mode == QImode
	  || mode == HImode /* || mode == HFmode  */
	  || mode == SImode || mode == SFmode
	  || mode == DImode || mode == DFmode
	  || mode == TImode);
}

/* Implement TARGET_CLASS_MAX_NREGS.

   Return the number of hard registers needed to hold a value of MODE in
   a register of class RCLASS.  */

static unsigned char
gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
{
  /* Scalar registers are 32-bit; vector registers are in fact tuples of
     64 lanes.  */
  if (rclass == VGPR_REGS)
    {
      if (vgpr_1reg_mode_p (mode))
	return 1;
      if (vgpr_2reg_mode_p (mode))
	return 2;
      /* TImode is used by DImode compare_and_swap.  */
      if (mode == TImode)
	return 4;
    }
  else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
    return 2;
  return CEIL (GET_MODE_SIZE (mode), 4);
}
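
/* Worked examples of the rules above (derived, not additional data):
   DImode in SGPR_REGS needs CEIL (8, 4) = 2 registers; V64SImode is a
   single 64-lane VGPR, so 1; V64DImode needs 2; and TImode in VGPR_REGS
   needs 4, matching the DImode compare_and_swap case noted above.  */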

/* Implement TARGET_HARD_REGNO_NREGS.

   Return the number of hard registers needed to hold a value of MODE in
   REGNO.  */

unsigned int
gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
{
  return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
}

/* Implement TARGET_HARD_REGNO_MODE_OK.

   Return true if REGNO can hold value in MODE.  */

bool
gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
{
  /* Treat a complex mode as if it were a scalar mode of the same overall
     size for the purposes of allocating hard registers.  */
  if (COMPLEX_MODE_P (mode))
    switch (mode)
      {
      case E_CQImode:
      case E_CHImode:
	mode = SImode;
	break;
      case E_CSImode:
	mode = DImode;
	break;
      case E_CDImode:
	mode = TImode;
	break;
      case E_HCmode:
	mode = SFmode;
	break;
      case E_SCmode:
	mode = DFmode;
	break;
      default:
	/* Not supported.  */
	return false;
      }

  switch (regno)
    {
    case FLAT_SCRATCH_LO_REG:
    case XNACK_MASK_LO_REG:
    case TBA_LO_REG:
    case TMA_LO_REG:
      return (mode == SImode || mode == DImode);
    case VCC_LO_REG:
    case EXEC_LO_REG:
      return (mode == BImode || mode == SImode || mode == DImode);
    case M0_REG:
    case FLAT_SCRATCH_HI_REG:
    case XNACK_MASK_HI_REG:
    case TBA_HI_REG:
    case TMA_HI_REG:
      return mode == SImode;
    case VCC_HI_REG:
      return false;
    case EXEC_HI_REG:
      return mode == SImode /*|| mode == V32BImode */ ;
    case SCC_REG:
    case VCCZ_REG:
    case EXECZ_REG:
      return mode == BImode;
    }
  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
    return true;
  if (SGPR_REGNO_P (regno))
    /* We restrict double register values to aligned registers.  */
    return (sgpr_1reg_mode_p (mode)
	    || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
	    || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
  if (VGPR_REGNO_P (regno))
    /* Vector instructions do not care about the alignment of register
       pairs, but where there is no 64-bit instruction, many of the
       define_splits do not work if the input and output registers partially
       overlap.  We tried to fix this with early clobber and match
       constraints, but it was bug prone, added complexity, and conflicted
       with the 'U0' constraints on vec_merge.
       Therefore, we restrict ourselves to aligned registers.  */
    return (vgpr_1reg_mode_p (mode)
	    || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
	    /* TImode is used by DImode compare_and_swap.  */
	    || (mode == TImode
		&& !((regno - FIRST_VGPR_REG) & 3)));
  return false;
}
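
/* For instance, under these rules a DImode value may live in s[0:1] or
   v[2:3] but not in s[1:2] or v[3:4], because double-register values
   must begin on an even register; a TImode value must begin on a
   multiple of four (s[4:7], v[8:11], and so on).  */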

/* Implement REGNO_REG_CLASS via gcn.h.

   Return smallest class containing REGNO.  */

enum reg_class
gcn_regno_reg_class (int regno)
{
  switch (regno)
    {
    case SCC_REG:
      return SCC_CONDITIONAL_REG;
    case VCC_LO_REG:
    case VCC_HI_REG:
      return VCC_CONDITIONAL_REG;
    case VCCZ_REG:
      return VCCZ_CONDITIONAL_REG;
    case EXECZ_REG:
      return EXECZ_CONDITIONAL_REG;
    case EXEC_LO_REG:
    case EXEC_HI_REG:
      return EXEC_MASK_REG;
    }
  if (VGPR_REGNO_P (regno))
    return VGPR_REGS;
  if (SGPR_REGNO_P (regno))
    return SGPR_REGS;
  if (regno < FIRST_VGPR_REG)
    return GENERAL_REGS;
  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
    return AFP_REGS;
  return ALL_REGS;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.

   GCC assumes that the lowpart contains the first part of the value as
   stored in memory.  This is not the case for vector registers.  */

bool
gcn_can_change_mode_class (machine_mode from, machine_mode to,
			   reg_class_t regclass)
{
  if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
    return true;
  return (gcn_class_max_nregs (regclass, from)
	  == gcn_class_max_nregs (regclass, to));
}

/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.

   When this hook returns true for MODE, the compiler allows
   registers explicitly used in the rtl to be used as spill registers
   but prevents the compiler from extending the lifetime of these
   registers.  */

bool
gcn_small_register_classes_for_mode_p (machine_mode mode)
{
  /* We allocate into exec and vcc regs, which form small register
     classes.  */
  return mode == DImode || mode == SImode;
}

/* Implement TARGET_CLASS_LIKELY_SPILLED_P.

   Returns true if pseudos that have been assigned to registers of class
   RCLASS would likely be spilled because registers of RCLASS are needed
   for spill registers.  */

static bool
gcn_class_likely_spilled_p (reg_class_t rclass)
{
  return (rclass == EXEC_MASK_REG
	  || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
}

/* Implement TARGET_MODES_TIEABLE_P.

   Returns true if a value of MODE1 is accessible in MODE2 without
   copying.  */

bool
gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
	  && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
}

/* Implement TARGET_TRULY_NOOP_TRUNCATION.

   Returns true if it is safe to "convert" a value of INPREC bits to one of
   OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
   it as if it had only OUTPREC bits.  */

bool
gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
{
  return ((inprec <= 32) && (outprec <= inprec));
}

/* Return the N-th part of a value occupying multiple registers.  */

rtx
gcn_operand_part (machine_mode mode, rtx op, int n)
{
  if (GET_MODE_SIZE (mode) >= 256)
    {
      /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */

      if (REG_P (op))
	{
	  gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
	  return gen_rtx_REG (V64SImode, REGNO (op) + n);
	}
      if (GET_CODE (op) == CONST_VECTOR)
	{
	  int units = GET_MODE_NUNITS (mode);
	  rtvec v = rtvec_alloc (units);

	  for (int i = 0; i < units; ++i)
	    RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
						 CONST_VECTOR_ELT (op, i), n);

	  return gen_rtx_CONST_VECTOR (V64SImode, v);
	}
      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
	return gcn_gen_undef (V64SImode);
      gcc_unreachable ();
    }
  else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
    {
      gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
      return gen_rtx_REG (SImode, REGNO (op) + n);
    }
  else
    {
      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
	return gcn_gen_undef (SImode);

      /* If it's a constant then let's assume it is of the largest mode
	 available, otherwise simplify_gen_subreg will fail.  */
      if (mode == VOIDmode && CONST_INT_P (op))
	mode = DImode;
      return simplify_gen_subreg (SImode, op, mode, n * 4);
    }
}
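
/* A typical use (sketch; REG is assumed to be a DImode hard register):

     rtx lo = gcn_operand_part (DImode, reg, 0);   // SImode low part
     rtx hi = gcn_operand_part (DImode, reg, 1);   // SImode high part

   gcn_expand_scalar_to_vector_address below splits 64-bit base addresses
   into their 32-bit halves this way.  */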

/* Return the N-th doubleword (DImode) part of a value occupying multiple
   registers.  */

rtx
gcn_operand_doublepart (machine_mode mode, rtx op, int n)
{
  return simplify_gen_subreg (DImode, op, mode, n * 8);
}

/* Return true if OP can be split into subregs or high/low parts.
   This is always true for scalars, but not normally true for vectors.
   However, for vectors in hardregs we can use the low and high registers.  */

bool
gcn_can_split_p (machine_mode, rtx op)
{
  if (vgpr_vector_mode_p (GET_MODE (op)))
    {
      if (GET_CODE (op) == SUBREG)
	op = SUBREG_REG (op);
      if (!REG_P (op))
	return true;
      return REGNO (op) <= FIRST_PSEUDO_REGISTER;
    }
  return true;
}

/* Implement TARGET_SPILL_CLASS.

   Return class of registers which could be used for pseudo of MODE
   and of class RCLASS for spilling instead of memory.  Return NO_REGS
   if it is not possible or non-profitable.  */

static reg_class_t
gcn_spill_class (reg_class_t c, machine_mode /*mode*/)
{
  if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
      || c == VCC_CONDITIONAL_REG)
    return SGPR_REGS;
  else
    return NO_REGS;
}

/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.

   Change allocno class for given pseudo from allocno and best class
   calculated by IRA.  */

static reg_class_t
gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
				     reg_class_t best_cl)
{
  /* Avoid returning classes that contain both vgpr and sgpr registers.  */
  if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
    return cl;
  if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
      && best_cl != ALL_GPR_REGS)
    return best_cl;

  machine_mode mode = PSEUDO_REGNO_MODE (regno);
  if (vgpr_vector_mode_p (mode))
    return VGPR_REGS;

  return GENERAL_REGS;
}

/* Create a new DImode pseudo reg and emit an instruction to initialize
   it to VAL.  */

static rtx
get_exec (int64_t val)
{
  rtx reg = gen_reg_rtx (DImode);
  emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
  return reg;
}

/* Return value of scalar exec register.  */

rtx
gcn_scalar_exec ()
{
  return const1_rtx;
}

/* Return pseudo holding scalar exec register.  */

rtx
gcn_scalar_exec_reg ()
{
  return get_exec (1);
}

/* Return value of full exec register.  */

rtx
gcn_full_exec ()
{
  return constm1_rtx;
}

/* Return pseudo holding full exec register.  */

rtx
gcn_full_exec_reg ()
{
  return get_exec (-1);
}

/* }}} */
/* {{{ Immediate constants.  */

/* Initialize shared numeric constants.  */

static void
init_ext_gcn_constants (void)
{
  real_from_integer (&dconst4, DFmode, 4, SIGNED);

  /* FIXME: this constant probably does not match what hardware really loads.
     Reality check it eventually.  */
  real_from_string (&dconst1over2pi,
		    "0.1591549430918953357663423455968866839");
  real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);

  ext_gcn_constants_init = 1;
}

/* Return non-zero if X is a constant that can appear as an inline operand.
   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4 or 1/(2*pi),
   or a vector of those.
   The value returned is the hardware encoding of this constant.  */

int
gcn_inline_fp_constant_p (rtx x, bool allow_vector)
{
  machine_mode mode = GET_MODE (x);

  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
      && allow_vector)
    {
      int n;
      if (GET_CODE (x) != CONST_VECTOR)
	return 0;
      n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
      if (!n)
	return 0;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return 0;
      return 1;
    }

  if (mode != HFmode && mode != SFmode && mode != DFmode)
    return 0;

  const REAL_VALUE_TYPE *r;

  if (x == CONST0_RTX (mode))
    return 128;
  if (x == CONST1_RTX (mode))
    return 242;

  r = CONST_DOUBLE_REAL_VALUE (x);

  /* Make sure dconst4 and dconst1over2pi are initialized before they are
     compared below.  */
  if (!ext_gcn_constants_init)
    init_ext_gcn_constants ();

  if (real_identical (r, &dconstm1))
    return 243;
  if (real_identical (r, &dconsthalf))
    return 240;
  if (real_identical (r, &dconst2))
    return 244;
  if (real_identical (r, &dconst4))
    return 246;
  if (real_identical (r, &dconst1over2pi))
    return 248;

  /* Check the negated values: -0.5, -2 and -4.  */
  REAL_VALUE_TYPE rneg = real_value_negate (r);
  if (real_identical (&rneg, &dconsthalf))
    return 241;
  if (real_identical (&rneg, &dconst2))
    return 245;
  if (real_identical (&rneg, &dconst4))
    return 247;

  return 0;
}

/* Return true if X is a constant that can appear as an immediate operand,
   either inline or as a separate 32-bit literal, or a vector of those.  */

bool
gcn_fp_constant_p (rtx x, bool allow_vector)
{
  machine_mode mode = GET_MODE (x);

  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
      && allow_vector)
    {
      int n;
      if (GET_CODE (x) != CONST_VECTOR)
	return false;
      n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
      if (!n)
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;
      return true;
    }
  if (mode != HFmode && mode != SFmode && mode != DFmode)
    return false;

  if (gcn_inline_fp_constant_p (x, false))
    return true;
  /* FIXME: It is not clear how 32-bit immediates are interpreted here.  */
  return (mode != DFmode);
}

/* Return true if X is a constant representable as an inline immediate
   constant in a 32-bit instruction encoding.  */

bool
gcn_inline_constant_p (rtx x)
{
  if (GET_CODE (x) == CONST_INT)
    return INTVAL (x) >= -16 && INTVAL (x) <= 64;
  if (GET_CODE (x) == CONST_DOUBLE)
    return gcn_inline_fp_constant_p (x, false);
  if (GET_CODE (x) == CONST_VECTOR)
    {
      int n;
      if (!vgpr_vector_mode_p (GET_MODE (x)))
	return false;
      n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
      if (!n)
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;
      return true;
    }
  return false;
}
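
/* For example, (const_int 64) is accepted as an inline immediate, but
   (const_int 65) must be emitted as a separate 32-bit literal word;
   likewise (const_int -16) is inline while (const_int -17) is not.  */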

/* Return true if X is a constant representable as an immediate constant
   in a 32 or 64-bit instruction encoding.  */

bool
gcn_constant_p (rtx x)
{
  switch (GET_CODE (x))
    {
    case CONST_INT:
      return true;

    case CONST_DOUBLE:
      return gcn_fp_constant_p (x, false);

    case CONST_VECTOR:
      {
	int n;
	if (!vgpr_vector_mode_p (GET_MODE (x)))
	  return false;
	n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
	if (!n)
	  return false;
	for (int i = 1; i < 64; i++)
	  if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	    return false;
	return true;
      }

    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      ;
    }

  return false;
}

/* Return true if X is a constant representable as two inline immediate
   constants in a 64-bit instruction that is split into two 32-bit
   instructions.
   When MIXED is set, the low part is permitted to use the full 32 bits.  */

bool
gcn_inline_constant64_p (rtx x, bool mixed)
{
  if (GET_CODE (x) == CONST_VECTOR)
    {
      if (!vgpr_vector_mode_p (GET_MODE (x)))
	return false;
      if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;

      return true;
    }

  if (GET_CODE (x) != CONST_INT)
    return false;

  rtx val_lo = gcn_operand_part (DImode, x, 0);
  rtx val_hi = gcn_operand_part (DImode, x, 1);
  return ((mixed || gcn_inline_constant_p (val_lo))
	  && gcn_inline_constant_p (val_hi));
}

/* Return true if X is a constant representable as an immediate constant
   in a 32 or 64-bit instruction encoding where the hardware will
   extend the immediate to 64 bits.  */

bool
gcn_constant64_p (rtx x)
{
  if (!gcn_constant_p (x))
    return false;

  if (GET_CODE (x) != CONST_INT)
    return true;

  /* Negative numbers are only allowed if they can be encoded within src0,
     because the 32-bit immediates do not get sign-extended.
     Unsigned numbers must not be encodable as 32-bit -1..-16, because the
     assembler will use a src0 inline immediate and that will get
     sign-extended.  */
  HOST_WIDE_INT val = INTVAL (x);
  return (((val & 0xffffffff) == val	/* Positive 32-bit.  */
	   && (val & 0xfffffff0) != 0xfffffff0)	/* Not -1..-16.  */
	  || gcn_inline_constant_p (x));	/* Src0.  */
}
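
/* Worked example: 0x7ffffffc is accepted (positive 32-bit, not in the
   0xfffffff0..0xffffffff range); 0xfffffff5 is rejected because the
   assembler would encode it as inline immediate -11 and sign-extend it
   to 64 bits; and -5 is accepted only via the inline (src0) rule.  */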

/* Implement TARGET_LEGITIMATE_CONSTANT_P.

   Returns true if X is a legitimate constant for a MODE immediate
   operand.  */

bool
gcn_legitimate_constant_p (machine_mode, rtx x)
{
  return gcn_constant_p (x);
}

/* Return true if X is a CONST_VECTOR of a single constant.  */

static bool
single_cst_vector_p (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;
  for (int i = 1; i < 64; i++)
    if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
      return false;
  return true;
}

/* Create a CONST_VECTOR of duplicated value A.  */

rtx
gcn_vec_constant (machine_mode mode, int a)
{
  /*if (!a)
    return CONST0_RTX (mode);
  if (a == -1)
    return CONSTM1_RTX (mode);
  if (a == 1)
    return CONST1_RTX (mode);
  if (a == 2)
    return CONST2_RTX (mode);*/

  int units = GET_MODE_NUNITS (mode);
  machine_mode innermode = GET_MODE_INNER (mode);

  rtx tem;
  if (FLOAT_MODE_P (innermode))
    {
      REAL_VALUE_TYPE rv;
      real_from_integer (&rv, NULL, a, SIGNED);
      tem = const_double_from_real_value (rv, innermode);
    }
  else
    tem = gen_int_mode (a, innermode);

  rtvec v = rtvec_alloc (units);
  for (int i = 0; i < units; ++i)
    RTVEC_ELT (v, i) = tem;

  return gen_rtx_CONST_VECTOR (mode, v);
}

/* Create a CONST_VECTOR of duplicated value A.  */

rtx
gcn_vec_constant (machine_mode mode, rtx a)
{
  int units = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (units);

  for (int i = 0; i < units; ++i)
    RTVEC_ELT (v, i) = a;

  return gen_rtx_CONST_VECTOR (mode, v);
}

/* Create an undefined vector value, used where an insn operand is
   optional.  */

rtx
gcn_gen_undef (machine_mode mode)
{
  return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
}

/* }}} */
/* {{{ Addresses, pointers and moves.  */

/* Return true if REG is a valid place to store a pointer,
   for instructions that require an SGPR.
   FIXME rename.  */

static bool
gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
{
  if (GET_CODE (reg) == SUBREG)
    reg = SUBREG_REG (reg);

  if (!REG_P (reg))
    return false;

  if (GET_MODE (reg) != mode)
    return false;

  int regno = REGNO (reg);

  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (!strict)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  return (SGPR_REGNO_P (regno) || regno == M0_REG
	  || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
}

/* Return true if REG is a valid place to store a pointer,
   for instructions that require a VGPR.  */

static bool
gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
{
  if (GET_CODE (reg) == SUBREG)
    reg = SUBREG_REG (reg);

  if (!REG_P (reg))
    return false;

  if (GET_MODE (reg) != mode)
    return false;

  int regno = REGNO (reg);

  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (!strict)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  return VGPR_REGNO_P (regno);
}

/* Return true if X would be valid inside a MEM using the Flat address
   space.  */

bool
gcn_flat_address_p (rtx x, machine_mode mode)
{
  bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
		   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);

  if (vec_mode && gcn_address_register_p (x, DImode, false))
    return true;

  if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
    return true;

  if (TARGET_GCN5_PLUS
      && GET_CODE (x) == PLUS
      && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if X would be valid inside a MEM using the Scalar Flat
   address space.  */

bool
gcn_scalar_flat_address_p (rtx x)
{
  if (gcn_address_register_p (x, DImode, false))
    return true;

  if (GET_CODE (x) == PLUS
      && gcn_address_register_p (XEXP (x, 0), DImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if MEM X would be valid for the Scalar Flat address space.  */

bool
gcn_scalar_flat_mem_p (rtx x)
{
  if (!MEM_P (x))
    return false;

  if (GET_MODE_SIZE (GET_MODE (x)) < 4)
    return false;

  return gcn_scalar_flat_address_p (XEXP (x, 0));
}

/* Return true if X would be valid inside a MEM using the LDS or GDS
   address spaces.  */

bool
gcn_ds_address_p (rtx x)
{
  if (gcn_vec_address_register_p (x, SImode, false))
    return true;

  if (GET_CODE (x) == PLUS
      && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if ADDR would be valid inside a MEM using the Global
   address space.  */

bool
gcn_global_address_p (rtx addr)
{
  if (gcn_address_register_p (addr, DImode, false)
      || gcn_vec_address_register_p (addr, DImode, false))
    return true;

  if (GET_CODE (addr) == PLUS)
    {
      rtx base = XEXP (addr, 0);
      rtx offset = XEXP (addr, 1);
      bool immediate_p = (CONST_INT_P (offset)
			  && INTVAL (offset) >= -(1 << 12)
			  && INTVAL (offset) < (1 << 12));

      if ((gcn_address_register_p (base, DImode, false)
	   || gcn_vec_address_register_p (base, DImode, false))
	  && immediate_p)
	/* SGPR + CONST or VGPR + CONST  */
	return true;

      if (gcn_address_register_p (base, DImode, false)
	  && gcn_vgpr_register_operand (offset, SImode))
	/* SGPR + VGPR  */
	return true;

      if (GET_CODE (base) == PLUS
	  && gcn_address_register_p (XEXP (base, 0), DImode, false)
	  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
	  && immediate_p)
	/* (SGPR + VGPR) + CONST  */
	return true;
    }

  return false;
}

/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.

   Recognizes RTL expressions that are valid memory addresses for an
   instruction.  The MODE argument is the machine mode for the MEM
   expression that wants to use this address.

   It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
   convert common non-canonical forms to canonical form so that they will
   be recognized.  */

static bool
gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
				     addr_space_t as)
{
  /* All vector instructions need to work on addresses in registers.  */
  if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
    return false;

  if (AS_SCALAR_FLAT_P (as))
    {
      if (mode == QImode || mode == HImode)
	return false;

      switch (GET_CODE (x))
	{
	case REG:
	  return gcn_address_register_p (x, DImode, strict);
	/* Addresses are in the form BASE+OFFSET.
	   OFFSET is either a 20-bit unsigned immediate, SGPR or M0.
	   Writes and atomics do not accept SGPR.  */
	case PLUS:
	  {
	    rtx x0 = XEXP (x, 0);
	    rtx x1 = XEXP (x, 1);
	    if (!gcn_address_register_p (x0, DImode, strict))
	      return false;
	    /* FIXME: This is disabled because of the mode mismatch between
	       SImode (for the address or m0 register) and the DImode PLUS.
	       We'll need a zero_extend or similar.

	    if (gcn_m0_register_p (x1, SImode, strict)
		|| gcn_address_register_p (x1, SImode, strict))
	      return true;
	    else*/
	    if (GET_CODE (x1) == CONST_INT)
	      {
		if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
		    /* The low bits of the offset are ignored, even when
		       they're meant to realign the pointer.  */
		    && !(INTVAL (x1) & 0x3))
		  return true;
	      }
	    return false;
	  }

	default:
	  break;
	}
    }
  else if (AS_SCRATCH_P (as))
    return gcn_address_register_p (x, SImode, strict);
  else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
    {
      if (TARGET_GCN3 || GET_CODE (x) == REG)
	return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
		 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
		? gcn_address_register_p (x, DImode, strict)
		: gcn_vec_address_register_p (x, DImode, strict));
      else
	{
	  gcc_assert (TARGET_GCN5_PLUS);

	  if (GET_CODE (x) == PLUS)
	    {
	      rtx x1 = XEXP (x, 1);

	      if (VECTOR_MODE_P (mode)
		  ? !gcn_address_register_p (x, DImode, strict)
		  : !gcn_vec_address_register_p (x, DImode, strict))
		return false;

	      if (GET_CODE (x1) == CONST_INT)
		{
		  if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
		      /* The low bits of the offset are ignored, even when
			 they're meant to realign the pointer.  */
		      && !(INTVAL (x1) & 0x3))
		    return true;
		}
	    }
	  return false;
	}
    }
  else if (AS_GLOBAL_P (as))
    {
      gcc_assert (TARGET_GCN5_PLUS);

      if (GET_CODE (x) == REG)
	return (gcn_address_register_p (x, DImode, strict)
		|| (!VECTOR_MODE_P (mode)
		    && gcn_vec_address_register_p (x, DImode, strict)));
      else if (GET_CODE (x) == PLUS)
	{
	  rtx base = XEXP (x, 0);
	  rtx offset = XEXP (x, 1);

	  bool immediate_p = (GET_CODE (offset) == CONST_INT
			      /* Signed 13-bit immediate.  */
			      && INTVAL (offset) >= -(1 << 12)
			      && INTVAL (offset) < (1 << 12)
			      /* The low bits of the offset are ignored, even
				 when they're meant to realign the
				 pointer.  */
			      && !(INTVAL (offset) & 0x3));

	  if (!VECTOR_MODE_P (mode))
	    {
	      if ((gcn_address_register_p (base, DImode, strict)
		   || gcn_vec_address_register_p (base, DImode, strict))
		  && immediate_p)
		/* SGPR + CONST or VGPR + CONST  */
		return true;

	      if (gcn_address_register_p (base, DImode, strict)
		  && gcn_vgpr_register_operand (offset, SImode))
		/* SGPR + VGPR  */
		return true;

	      if (GET_CODE (base) == PLUS
		  && gcn_address_register_p (XEXP (base, 0), DImode, strict)
		  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
		  && immediate_p)
		/* (SGPR + VGPR) + CONST  */
		return true;
	    }
	  else
	    {
	      if (gcn_address_register_p (base, DImode, strict)
		  && immediate_p)
		/* SGPR + CONST  */
		return true;
	    }
	}
      else
	return false;
    }
  else if (AS_ANY_DS_P (as))
    switch (GET_CODE (x))
      {
      case REG:
	return (VECTOR_MODE_P (mode)
		? gcn_address_register_p (x, SImode, strict)
		: gcn_vec_address_register_p (x, SImode, strict));
      /* Addresses are in the form BASE+OFFSET.
	 OFFSET is either a 20-bit unsigned immediate, SGPR or M0.
	 Writes and atomics do not accept SGPR.  */
      case PLUS:
	{
	  rtx x0 = XEXP (x, 0);
	  rtx x1 = XEXP (x, 1);
	  if (!gcn_vec_address_register_p (x0, DImode, strict))
	    return false;
	  if (GET_CODE (x1) == REG)
	    {
	      if (GET_CODE (x1) != REG
		  || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
		      && !gcn_ssrc_register_operand (x1, DImode)))
		return false;
	    }
	  else if (GET_CODE (x1) == CONST_VECTOR
		   && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
		   && single_cst_vector_p (x1))
	    {
	      x1 = CONST_VECTOR_ELT (x1, 0);
	      if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
		return true;
	    }
	  return false;
	}

      default:
	break;
      }
  else
    gcc_unreachable ();
  return false;
}

/* Implement TARGET_ADDR_SPACE_POINTER_MODE.

   Return the appropriate mode for a named address pointer.  */

static scalar_int_mode
gcn_addr_space_pointer_mode (addr_space_t addrspace)
{
  switch (addrspace)
    {
    case ADDR_SPACE_SCRATCH:
    case ADDR_SPACE_LDS:
    case ADDR_SPACE_GDS:
      return SImode;
    case ADDR_SPACE_DEFAULT:
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
    case ADDR_SPACE_SCALAR_FLAT:
      return DImode;
    default:
      gcc_unreachable ();
    }
}

/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.

   Return the appropriate mode for a named address space address.  */

static scalar_int_mode
gcn_addr_space_address_mode (addr_space_t addrspace)
{
  return gcn_addr_space_pointer_mode (addrspace);
}

/* Implement TARGET_ADDR_SPACE_SUBSET_P.

   Determine if one named address space is a subset of another.  */

static bool
gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
{
  if (subset == superset)
    return true;
  /* FIXME is this true?  */
  if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
    return true;
  return false;
}

/* Convert from one address space to another.  */

static rtx
gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
{
  gcc_assert (POINTER_TYPE_P (from_type));
  gcc_assert (POINTER_TYPE_P (to_type));

  addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
  addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));

  if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
    {
      rtx queue = gen_rtx_REG (DImode,
			       cfun->machine->args.reg[QUEUE_PTR_ARG]);
      rtx group_seg_aperture_hi
	= gen_rtx_MEM (SImode,
		       gen_rtx_PLUS (DImode, queue,
				     gen_int_mode (64, SImode)));
      rtx tmp = gen_reg_rtx (DImode);

      emit_move_insn (gen_lowpart (SImode, tmp), op);
      emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
		      group_seg_aperture_hi);

      return tmp;
    }
  else if (as_from == as_to)
    return op;
  else
    gcc_unreachable ();
}

/* Implement TARGET_ADDR_SPACE_DEBUG.

   Return the dwarf address space class for each hardware address space.  */

static int
gcn_addr_space_debug (addr_space_t as)
{
  switch (as)
    {
    case ADDR_SPACE_DEFAULT:
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_SCALAR_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
      return DW_ADDR_none;
    case ADDR_SPACE_GLOBAL:
      return 1;		// DW_ADDR_LLVM_global
    case ADDR_SPACE_LDS:
      return 3;		// DW_ADDR_LLVM_group
    case ADDR_SPACE_SCRATCH:
      return 4;		// DW_ADDR_LLVM_private
    case ADDR_SPACE_GDS:
      return 0x8000;	// DW_ADDR_AMDGPU_region
    }
  gcc_unreachable ();
}


/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h.

   Return true if REGNO is OK for memory addressing.  */

bool
gcn_regno_mode_code_ok_for_base_p (int regno,
				   machine_mode, addr_space_t as, int, int)
{
  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (reg_renumber)
	regno = reg_renumber[regno];
      else
	return true;
    }
  if (AS_FLAT_P (as))
    return (VGPR_REGNO_P (regno)
	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
  else if (AS_SCALAR_FLAT_P (as))
    return (SGPR_REGNO_P (regno)
	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
  else if (AS_GLOBAL_P (as))
    {
      return (SGPR_REGNO_P (regno)
	      || VGPR_REGNO_P (regno)
	      || regno == ARG_POINTER_REGNUM
	      || regno == FRAME_POINTER_REGNUM);
    }
  else
    /* For now.  */
    return false;
}

/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.

   Return a suitable register class for memory addressing.  */

reg_class
gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
			      int ic)
{
  switch (as)
    {
    case ADDR_SPACE_DEFAULT:
      return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
    case ADDR_SPACE_SCALAR_FLAT:
    case ADDR_SPACE_SCRATCH:
      return SGPR_REGS;
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
    case ADDR_SPACE_LDS:
    case ADDR_SPACE_GDS:
      return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
	      ? SGPR_REGS : VGPR_REGS);
    case ADDR_SPACE_GLOBAL:
      return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
	      ? SGPR_REGS : ALL_GPR_REGS);
    }
  gcc_unreachable ();
}

/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.

   Return true if REGNO is OK for index of memory addressing.  */

bool
regno_ok_for_index_p (int regno)
{
  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (reg_renumber)
	regno = reg_renumber[regno];
      else
	return true;
    }
  return regno == M0_REG || VGPR_REGNO_P (regno);
}

/* Generate a move which uses the exec flags.  If EXEC is NULL, then it is
   assumed that all lanes normally relevant to the mode of the move are
   affected.  If PREV is NULL, then a sensible default is supplied for
   the inactive lanes.  */

static rtx
gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
{
  machine_mode mode = GET_MODE (op0);

  if (vgpr_vector_mode_p (mode))
    {
      if (exec && exec != CONSTM1_RTX (DImode))
	{
	  if (!prev)
	    prev = op0;
	}
      else
	{
	  if (!prev)
	    prev = gcn_gen_undef (mode);
	  exec = gcn_full_exec_reg ();
	}

      rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));

      return gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (2, set,
					  gen_rtx_CLOBBER
					  (VOIDmode,
					   gen_rtx_SCRATCH (V64DImode))));
    }

  return (gen_rtx_PARALLEL
	  (VOIDmode,
	   gen_rtvec (2, gen_rtx_SET (op0, op1),
		      gen_rtx_USE (VOIDmode,
				   exec ? exec : gcn_scalar_exec ()))));
}

/* Generate a masked move.  */

static rtx
gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
{
  if (exec)
    return (gen_rtx_SET (op0,
			 gen_rtx_VEC_MERGE (GET_MODE (op0),
					    gen_rtx_VEC_DUPLICATE
					    (GET_MODE (op0), op1),
					    op2, exec)));
  else
    return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
}

/* Expand vector init of OP0 by VEC.
   Implements vec_init instruction pattern.  */

void
gcn_expand_vector_init (rtx op0, rtx vec)
{
  int64_t initialized_mask = 0;
  int64_t curr_mask = 1;
  machine_mode mode = GET_MODE (op0);

  rtx val = XVECEXP (vec, 0, 0);

  for (int i = 1; i < 64; i++)
    if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
      curr_mask |= (int64_t) 1 << i;

  if (gcn_constant_p (val))
    emit_move_insn (op0, gcn_vec_constant (mode, val));
  else
    {
      val = force_reg (GET_MODE_INNER (mode), val);
      emit_insn (gen_duplicate_load (op0, val));
    }
  initialized_mask |= curr_mask;
  for (int i = 1; i < 64; i++)
    if (!(initialized_mask & ((int64_t) 1 << i)))
      {
	curr_mask = (int64_t) 1 << i;
	rtx val = XVECEXP (vec, 0, i);

	for (int j = i + 1; j < 64; j++)
	  if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
	    curr_mask |= (int64_t) 1 << j;
	if (gcn_constant_p (val))
	  emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
					get_exec (curr_mask)));
	else
	  {
	    val = force_reg (GET_MODE_INNER (mode), val);
	    emit_insn (gen_duplicate_load (op0, val, op0,
					   get_exec (curr_mask)));
	  }
	initialized_mask |= curr_mask;
      }
}
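
/* Worked example: for { A, B, A, B, ... } the code above emits just two
   instructions rather than 64 single-lane writes: a full-width move of A,
   then a move of B under EXEC = 0xaaaaaaaaaaaaaaaa covering the odd
   lanes.  */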

/* Load vector constant where n-th lane contains BASE+n*VAL.  */

static rtx
strided_constant (machine_mode mode, int base, int val)
{
  rtx x = gen_reg_rtx (mode);
  emit_move_insn (x, gcn_vec_constant (mode, base));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
				 x, get_exec (0xffffffff00000000)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
				 x, get_exec (0xffff0000ffff0000)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
				 x, get_exec (0xff00ff00ff00ff00)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
				 x, get_exec (0xf0f0f0f0f0f0f0f0)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
				 x, get_exec (0xcccccccccccccccc)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
				 x, get_exec (0xaaaaaaaaaaaaaaaa)));
  return x;
}
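
/* Worked example: strided_constant (V64SImode, 0, 1) builds the
   lane-index vector { 0, 1, 2, ..., 63 } in six vector adds: val*32 is
   added to the upper 32 lanes, val*16 to alternate groups of 16 lanes,
   and so on down to val*1 on the odd lanes.  */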
1753
1754 /* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
1755
1756 static rtx
gcn_addr_space_legitimize_address(rtx x,rtx old,machine_mode mode,addr_space_t as)1757 gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1758 addr_space_t as)
1759 {
1760 switch (as)
1761 {
1762 case ADDR_SPACE_DEFAULT:
1763 return gcn_addr_space_legitimize_address (x, old, mode,
1764 DEFAULT_ADDR_SPACE);
1765 case ADDR_SPACE_SCALAR_FLAT:
1766 case ADDR_SPACE_SCRATCH:
1767 /* Instructions working on vectors need the address to be in
1768 a register. */
1769 if (vgpr_vector_mode_p (mode))
1770 return force_reg (GET_MODE (x), x);
1771
1772 return x;
1773 case ADDR_SPACE_FLAT:
1774 case ADDR_SPACE_FLAT_SCRATCH:
1775 case ADDR_SPACE_GLOBAL:
1776 return TARGET_GCN3 ? force_reg (DImode, x) : x;
1777 case ADDR_SPACE_LDS:
1778 case ADDR_SPACE_GDS:
1779 /* FIXME: LDS support offsets, handle them!. */
1780 if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1781 {
1782 rtx addrs = gen_reg_rtx (V64SImode);
1783 rtx base = force_reg (SImode, x);
1784 rtx offsets = strided_constant (V64SImode, 0,
1785 GET_MODE_UNIT_SIZE (mode));
1786
1787 emit_insn (gen_vec_duplicatev64si (addrs, base));
1788 emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1789 return addrs;
1790 }
1791 return x;
1792 }
1793 gcc_unreachable ();
1794 }
1795
1796 /* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1797 proper vector of stepped addresses.
1798
1799 MEM will be a DImode address of a vector in an SGPR.
1800 TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
1801
1802 rtx
gcn_expand_scalar_to_vector_address(machine_mode mode,rtx exec,rtx mem,rtx tmp)1803 gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1804 rtx tmp)
1805 {
1806 gcc_assert (MEM_P (mem));
1807 rtx mem_base = XEXP (mem, 0);
1808 rtx mem_index = NULL_RTX;
1809
1810 if (!TARGET_GCN5_PLUS)
1811 {
1812 /* gcn_addr_space_legitimize_address should have put the address in a
1813 register. If not, it is too late to do anything about it. */
1814 gcc_assert (REG_P (mem_base));
1815 }
1816
1817 if (GET_CODE (mem_base) == PLUS)
1818 {
1819 mem_index = XEXP (mem_base, 1);
1820 mem_base = XEXP (mem_base, 0);
1821 }
1822
1823 /* RF and RM base registers for vector modes should be always an SGPR. */
1824 gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1825 || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1826
1827 machine_mode inner = GET_MODE_INNER (mode);
1828 int shift = exact_log2 (GET_MODE_SIZE (inner));
1829 rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1830 rtx undef_v64si = gcn_gen_undef (V64SImode);
1831 rtx new_base = NULL_RTX;
1832 addr_space_t as = MEM_ADDR_SPACE (mem);
1833
1834 rtx tmplo = (REG_P (tmp)
1835 ? gcn_operand_part (V64DImode, tmp, 0)
1836 : gen_reg_rtx (V64SImode));
1837
1838 /* tmplo[:] = ramp[:] << shift */
1839 if (exec)
1840 emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1841 gen_int_mode (shift, SImode),
1842 undef_v64si, exec));
1843 else
1844 emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1845
1846 if (AS_FLAT_P (as))
1847 {
1848 rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1849
1850 if (REG_P (tmp))
1851 {
1852 rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1853 rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1854 rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1855
1856 /* tmphi[:] = mem_base_hi */
1857 if (exec)
1858 emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1859 undef_v64si, exec));
1860 else
1861 emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1862
1863 /* tmp[:] += zext (mem_base) */
1864 if (exec)
1865 {
1866 emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1867 vcc, undef_v64si, exec));
1868 emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1869 vcc, vcc, undef_v64si, exec));
1870 }
1871 else
1872 emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
1873 }
1874 else
1875 {
1876 tmp = gen_reg_rtx (V64DImode);
1877 if (exec)
1878 emit_insn (gen_addv64di3_vcc_zext_dup2_exec
1879 (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
1880 exec));
1881 else
1882 emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
1883 }
1884
1885 new_base = tmp;
1886 }
1887 else if (AS_ANY_DS_P (as))
1888 {
1889 if (!exec)
1890 emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1891 else
1892 emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1893 gcn_gen_undef (V64SImode), exec));
1894 new_base = tmplo;
1895 }
1896 else
1897 {
1898 mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1899 new_base = gen_rtx_PLUS (V64DImode, mem_base,
1900 gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1901 }
1902
1903 return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1904 gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1905 (mem_index ? mem_index
1906 : const0_rtx)));
1907 }
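
/* For example, expanding a V64SImode access: the inner mode is 4 bytes
   wide, so SHIFT is 2 and the lane-index ramp in v1 is scaled into a
   byte-offset vector; lane N then addresses MEM_BASE + N*4, plus
   MEM_INDEX if one was present.  */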
1908
1909 /* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
1910    suitable for the given address space.  This is intended for use in
1911 gather/scatter patterns.
1912
1913 The offsets may be signed or unsigned, according to UNSIGNED_P.
1914 If EXEC is set then _exec patterns will be used, otherwise plain.
1915
1916 Return values.
1917 ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
1918 ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
1919
1920 rtx
1921 gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1922 bool unsigned_p, rtx exec)
1923 {
1924 rtx tmpsi = gen_reg_rtx (V64SImode);
1925 rtx tmpdi = gen_reg_rtx (V64DImode);
1926 rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1927 rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1928
1929 if (CONST_INT_P (scale)
1930 && INTVAL (scale) > 0
1931 && exact_log2 (INTVAL (scale)) >= 0)
1932 emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1933 GEN_INT (exact_log2 (INTVAL (scale)))));
1934 else
1935 (exec
1936 ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1937 exec))
1938 : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1939
1940 /* "Global" instructions do not support negative register offsets. */
1941 if (as == ADDR_SPACE_FLAT || !unsigned_p)
1942 {
1943 if (unsigned_p)
1944 (exec
1945 ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1946 undefdi, exec))
1947 : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1948 else
1949 (exec
1950 ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1951 undefdi, exec))
1952 : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1953 return tmpdi;
1954 }
1955 else if (as == ADDR_SPACE_GLOBAL)
1956 return tmpsi;
1957
1958 gcc_unreachable ();
1959 }
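
/* A minimal usage sketch for the routine above (operand names are
   illustrative only):

     rtx offs = gcn_expand_scaled_offsets (ADDR_SPACE_GLOBAL, base_reg,
					   offset_vec, GEN_INT (4),
					   true, NULL_RTX);

   Here SCALE is a power of two, so the offsets are merely shifted left
   by 2, and because the address space is GLOBAL and the offsets are
   unsigned, the V64SImode offset vector is returned directly rather
   than a V64DImode vector of absolute addresses.  */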
1960
1961 /* Return true if move from OP0 to OP1 is known to be executed in vector
1962 unit. */
1963
1964 bool
1965 gcn_vgpr_move_p (rtx op0, rtx op1)
1966 {
1967 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1968 return true;
1969 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1970 return true;
1971 return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1972 || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1973 || vgpr_vector_mode_p (GET_MODE (op0)));
1974 }
1975
1976 /* Return true if move from OP0 to OP1 is known to be executed in scalar
1977 unit. Used in the machine description. */
1978
1979 bool
1980 gcn_sgpr_move_p (rtx op0, rtx op1)
1981 {
1982 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1983 return true;
1984 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1985 return true;
1986 if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1987 || VGPR_REGNO_P (REGNO (op0)))
1988 return false;
1989 if (REG_P (op1)
1990 && REGNO (op1) < FIRST_PSEUDO_REGISTER
1991 && !VGPR_REGNO_P (REGNO (op1)))
1992 return true;
1993 return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
1994 }
1995
1996 /* Implement TARGET_SECONDARY_RELOAD.
1997
1998 The address space determines which registers can be used for loads and
1999 stores. */
2000
2001 static reg_class_t
2002 gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
2003 machine_mode reload_mode, secondary_reload_info *sri)
2004 {
2005 reg_class_t result = NO_REGS;
2006 bool spilled_pseudo =
2007 (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
2008
2009 if (dump_file && (dump_flags & TDF_DETAILS))
2010 {
2011 fprintf (dump_file, "gcn_secondary_reload: ");
2012 dump_value_slim (dump_file, x, 1);
2013 fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
2014 reg_class_names[rclass], GET_MODE_NAME (reload_mode));
2015 if (REG_P (x) || GET_CODE (x) == SUBREG)
2016 fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
2017 (true_regnum (x) >= 0
2018 && true_regnum (x) < FIRST_PSEUDO_REGISTER
2019 ? reg_names[true_regnum (x)]
2020 : (spilled_pseudo ? "stack spill" : "??")));
2021 fprintf (dump_file, "\n");
2022 }
2023
2024 /* Some callers don't use or initialize icode. */
2025 sri->icode = CODE_FOR_nothing;
2026
2027 if (MEM_P (x) || spilled_pseudo)
2028 {
2029 addr_space_t as = DEFAULT_ADDR_SPACE;
2030
2031 /* If we have a spilled pseudo, we can't find the address space
2032 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
2033 ADDR_SPACE_GLOBAL for GCN5. */
2034 if (MEM_P (x))
2035 as = MEM_ADDR_SPACE (x);
2036
2037 if (as == ADDR_SPACE_DEFAULT)
2038 as = DEFAULT_ADDR_SPACE;
2039
2040 switch (as)
2041 {
2042 case ADDR_SPACE_SCALAR_FLAT:
2043 result =
2044 ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
2045 break;
2046 case ADDR_SPACE_FLAT:
2047 case ADDR_SPACE_FLAT_SCRATCH:
2048 case ADDR_SPACE_GLOBAL:
2049 if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
2050 || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
2051 {
2052 if (in_p)
2053 switch (reload_mode)
2054 {
2055 case E_V64SImode:
2056 sri->icode = CODE_FOR_reload_inv64si;
2057 break;
2058 case E_V64SFmode:
2059 sri->icode = CODE_FOR_reload_inv64sf;
2060 break;
2061 case E_V64HImode:
2062 sri->icode = CODE_FOR_reload_inv64hi;
2063 break;
2064 case E_V64HFmode:
2065 sri->icode = CODE_FOR_reload_inv64hf;
2066 break;
2067 case E_V64QImode:
2068 sri->icode = CODE_FOR_reload_inv64qi;
2069 break;
2070 case E_V64DImode:
2071 sri->icode = CODE_FOR_reload_inv64di;
2072 break;
2073 case E_V64DFmode:
2074 sri->icode = CODE_FOR_reload_inv64df;
2075 break;
2076 default:
2077 gcc_unreachable ();
2078 }
2079 else
2080 switch (reload_mode)
2081 {
2082 case E_V64SImode:
2083 sri->icode = CODE_FOR_reload_outv64si;
2084 break;
2085 case E_V64SFmode:
2086 sri->icode = CODE_FOR_reload_outv64sf;
2087 break;
2088 case E_V64HImode:
2089 sri->icode = CODE_FOR_reload_outv64hi;
2090 break;
2091 case E_V64HFmode:
2092 sri->icode = CODE_FOR_reload_outv64hf;
2093 break;
2094 case E_V64QImode:
2095 sri->icode = CODE_FOR_reload_outv64qi;
2096 break;
2097 case E_V64DImode:
2098 sri->icode = CODE_FOR_reload_outv64di;
2099 break;
2100 case E_V64DFmode:
2101 sri->icode = CODE_FOR_reload_outv64df;
2102 break;
2103 default:
2104 gcc_unreachable ();
2105 }
2106 break;
2107 }
2108 /* Fallthrough. */
2109 case ADDR_SPACE_LDS:
2110 case ADDR_SPACE_GDS:
2111 case ADDR_SPACE_SCRATCH:
2112 result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2113 break;
2114 }
2115 }
2116
2117 if (dump_file && (dump_flags & TDF_DETAILS))
2118 fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
2119 get_insn_name (sri->icode));
2120
2121 return result;
2122 }
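
/* For example, an inbound reload of a V64SImode value from global
   memory sets sri->icode to CODE_FOR_reload_inv64si and leaves the
   class as NO_REGS (the reload pattern does the work itself), whereas
   a scalar-mode access to the same address space falls through and
   requests a VGPR_REGS intermediate instead.  */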
2123
2124 /* Update register usage after having seen the compiler flags and kernel
2125 attributes. We typically want to fix registers that contain values
2126 set by the HSA runtime. */
2127
2128 static void
2129 gcn_conditional_register_usage (void)
2130 {
2131 if (!cfun || !cfun->machine)
2132 return;
2133
2134 if (cfun->machine->normal_function)
2135 {
2136 /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
2137 for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
2138 i <= LAST_SGPR_REG; i++)
2139 fixed_regs[i] = 1, call_used_regs[i] = 1;
2140
2141 for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
2142 i <= LAST_VGPR_REG; i++)
2143 fixed_regs[i] = 1, call_used_regs[i] = 1;
2144
2145 return;
2146 }
2147
2148 /* If the set of requested args is the default set, nothing more needs to
2149 be done. */
2150 if (cfun->machine->args.requested == default_requested_args)
2151 return;
2152
2153 /* Requesting a set of args different from the default violates the ABI. */
2154 if (!leaf_function_p ())
2155 warning (0, "A non-default set of initial values has been requested, "
2156 "which violates the ABI");
2157
2158 for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
2159 fixed_regs[i] = 0;
2160
2161   /* Fix the runtime argument registers containing values that may be
2162 needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2163 needed after the prologue so there's no need to fix them. */
2164 if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2165 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2166 if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2167 {
2168       /* The upper 32 bits of the 64-bit descriptor are not used, so allow
2169 the containing registers to be used for other purposes. */
2170 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2171 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2172 }
2173 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2174 {
2175 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2176 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2177 }
2178 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2179 {
2180 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2181 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2182 }
2183 if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2184 fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2185 if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2186 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2187 if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2188 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2189 if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2190 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2191 }
2192
2193 /* Determine if a load or store is valid, according to the register classes
2194 and address space. Used primarily by the machine description to decide
2195 when to split a move into two steps. */
2196
2197 bool
2198 gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2199 {
2200 if (!MEM_P (dest) && !MEM_P (src))
2201 return true;
2202
2203 if (MEM_P (dest)
2204 && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2205 && (gcn_flat_address_p (XEXP (dest, 0), mode)
2206 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2207 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2208 && gcn_vgpr_register_operand (src, mode))
2209 return true;
2210 else if (MEM_P (src)
2211 && AS_FLAT_P (MEM_ADDR_SPACE (src))
2212 && (gcn_flat_address_p (XEXP (src, 0), mode)
2213 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2214 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2215 && gcn_vgpr_register_operand (dest, mode))
2216 return true;
2217
2218 if (MEM_P (dest)
2219 && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2220 && (gcn_global_address_p (XEXP (dest, 0))
2221 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2222 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2223 && gcn_vgpr_register_operand (src, mode))
2224 return true;
2225 else if (MEM_P (src)
2226 && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2227 && (gcn_global_address_p (XEXP (src, 0))
2228 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2229 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2230 && gcn_vgpr_register_operand (dest, mode))
2231 return true;
2232
2233 if (MEM_P (dest)
2234 && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2235 && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2236 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2237 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2238 && gcn_ssrc_register_operand (src, mode))
2239 return true;
2240 else if (MEM_P (src)
2241 && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2242 && (gcn_scalar_flat_address_p (XEXP (src, 0))
2243 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2244 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2245 && gcn_sdst_register_operand (dest, mode))
2246 return true;
2247
2248 if (MEM_P (dest)
2249 && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2250 && gcn_ds_address_p (XEXP (dest, 0))
2251 && gcn_vgpr_register_operand (src, mode))
2252 return true;
2253 else if (MEM_P (src)
2254 && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2255 && gcn_ds_address_p (XEXP (src, 0))
2256 && gcn_vgpr_register_operand (dest, mode))
2257 return true;
2258
2259 return false;
2260 }
2261
2262 /* }}} */
2263 /* {{{ Functions and ABI. */
2264
2265 /* Implement TARGET_FUNCTION_VALUE.
2266
2267 Define how to find the value returned by a function.
2268 The register location is always the same, but the mode depends on
2269 VALTYPE. */
2270
2271 static rtx
2272 gcn_function_value (const_tree valtype, const_tree, bool)
2273 {
2274 machine_mode mode = TYPE_MODE (valtype);
2275
2276 if (INTEGRAL_TYPE_P (valtype)
2277 && GET_MODE_CLASS (mode) == MODE_INT
2278 && GET_MODE_SIZE (mode) < 4)
2279 mode = SImode;
2280
2281 return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
2282 }
2283
2284 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2285
2286 Return true if N is a possible register number for the function return
2287 value. */
2288
2289 static bool
2290 gcn_function_value_regno_p (const unsigned int n)
2291 {
2292 return n == RETURN_VALUE_REG;
2293 }
2294
2295 /* Calculate the number of registers required to hold function argument
2296 ARG. */
2297
2298 static int
2299 num_arg_regs (const function_arg_info &arg)
2300 {
2301 if (targetm.calls.must_pass_in_stack (arg))
2302 return 0;
2303
2304 int size = arg.promoted_size_in_bytes ();
2305 return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
2306 }
2307
2308 /* Implement TARGET_STRICT_ARGUMENT_NAMING.
2309
2310 Return true if the location where a function argument is passed
2311    depends on whether or not it is a named argument.
2312
2313 For gcn, we know how to handle functions declared as stdarg: by
2314 passing an extra pointer to the unnamed arguments. However, the
2315 Fortran frontend can produce a different situation, where a
2316 function pointer is declared with no arguments, but the actual
2317 function and calls to it take more arguments. In that case, we
2318 want to ensure the call matches the definition of the function. */
2319
2320 static bool
2321 gcn_strict_argument_naming (cumulative_args_t cum_v)
2322 {
2323 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2324
2325 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2326 }
2327
2328 /* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2329
2330 See comment on gcn_strict_argument_naming. */
2331
2332 static bool
2333 gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2334 {
2335 return !gcn_strict_argument_naming (cum_v);
2336 }
2337
2338 /* Implement TARGET_FUNCTION_ARG.
2339
2340 Return an RTX indicating whether a function argument is passed in a register
2341 and if so, which register. */
2342
2343 static rtx
2344 gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
2345 {
2346 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2347 if (cum->normal_function)
2348 {
2349 if (!arg.named || arg.end_marker_p ())
2350 return 0;
2351
2352 if (targetm.calls.must_pass_in_stack (arg))
2353 return 0;
2354
2355 /* Vector parameters are not supported yet. */
2356 if (VECTOR_MODE_P (arg.mode))
2357 return 0;
2358
2359 int reg_num = FIRST_PARM_REG + cum->num;
2360 int num_regs = num_arg_regs (arg);
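      /* Multi-register arguments must be aligned to a multiple of their
	 register count; a DImode argument, for example, begins in an
	 even-numbered register.  */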
2361 if (num_regs > 0)
2362 while (reg_num % num_regs != 0)
2363 reg_num++;
2364 if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
2365 return gen_rtx_REG (arg.mode, reg_num);
2366 }
2367 else
2368 {
2369 if (cum->num >= cum->args.nargs)
2370 {
2371 cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2372 & -(TYPE_ALIGN (arg.type) / 8);
2373 cfun->machine->kernarg_segment_alignment
2374 = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
2375 TYPE_ALIGN (arg.type) / 8);
2376 rtx addr = gen_rtx_REG (DImode,
2377 cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2378 if (cum->offset)
2379 addr = gen_rtx_PLUS (DImode, addr,
2380 gen_int_mode (cum->offset, DImode));
2381 rtx mem = gen_rtx_MEM (arg.mode, addr);
2382 set_mem_attributes (mem, arg.type, 1);
2383 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2384 MEM_READONLY_P (mem) = 1;
2385 return mem;
2386 }
2387
2388 int a = cum->args.order[cum->num];
2389 if (arg.mode != gcn_kernel_arg_types[a].mode)
2390 {
2391 error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2392 return 0;
2393 }
2394 return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2395 cum->args.reg[a]);
2396 }
2397 return 0;
2398 }
2399
2400 /* Implement TARGET_FUNCTION_ARG_ADVANCE.
2401
2402 Updates the summarizer variable pointed to by CUM_V to advance past an
2403 argument in the argument list. */
2404
2405 static void
2406 gcn_function_arg_advance (cumulative_args_t cum_v,
2407 const function_arg_info &arg)
2408 {
2409 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2410
2411 if (cum->normal_function)
2412 {
2413 if (!arg.named)
2414 return;
2415
2416 int num_regs = num_arg_regs (arg);
2417 if (num_regs > 0)
2418 while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
2419 cum->num++;
2420 cum->num += num_regs;
2421 }
2422 else
2423 {
2424 if (cum->num < cum->args.nargs)
2425 cum->num++;
2426 else
2427 {
2428 cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
2429 cfun->machine->kernarg_segment_byte_size = cum->offset;
2430 }
2431 }
2432 }
2433
2434 /* Implement TARGET_ARG_PARTIAL_BYTES.
2435
2436 Returns the number of bytes at the beginning of an argument that must be put
2437 in registers. The value must be zero for arguments that are passed entirely
2438 in registers or that are entirely pushed on the stack. */
2439
2440 static int
2441 gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
2442 {
2443 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2444
2445 if (!arg.named)
2446 return 0;
2447
2448 if (targetm.calls.must_pass_in_stack (arg))
2449 return 0;
2450
2451 if (cum->num >= NUM_PARM_REGS)
2452 return 0;
2453
2454 /* If the argument fits entirely in registers, return 0. */
2455 if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
2456 return 0;
2457
2458 return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
2459 }
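
/* For instance, were NUM_PARM_REGS 12 (an illustrative value only) and
   a four-register argument arrived with ten registers already used, the
   first two registers' worth of bytes would be passed in registers and
   the remainder on the stack.  */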
2460
2461 /* A normal function which takes a pointer argument may be passed a pointer to
2462 LDS space (via a high-bits-set aperture), and that only works with FLAT
2463 addressing, not GLOBAL. Force FLAT addressing if the function has an
2464 incoming pointer parameter. NOTE: This is a heuristic that works in the
2465 offloading case, but in general, a function might read global pointer
2466 variables, etc. that may refer to LDS space or other special memory areas
2467 not supported by GLOBAL instructions, and then this argument check would not
2468 suffice. */
2469
2470 static void
2471 gcn_detect_incoming_pointer_arg (tree fndecl)
2472 {
2473 gcc_assert (cfun && cfun->machine);
2474
2475 for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2476 arg;
2477 arg = TREE_CHAIN (arg))
2478 if (POINTER_TYPE_P (TREE_VALUE (arg)))
2479 cfun->machine->use_flat_addressing = true;
2480 }
2481
2482 /* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2483
2484 Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2485 whose data type is FNTYPE. For a library call, FNTYPE is 0. */
2486
2487 void
2488 gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2489 tree fntype /* tree ptr for function decl */ ,
2490 rtx libname /* SYMBOL_REF of library name or 0 */ ,
2491 tree fndecl, int caller)
2492 {
2493 memset (cum, 0, sizeof (*cum));
2494 cum->fntype = fntype;
2495 if (libname)
2496 {
2497 gcc_assert (cfun && cfun->machine);
2498 cum->normal_function = true;
2499 if (!caller)
2500 {
2501 cfun->machine->normal_function = true;
2502 gcn_detect_incoming_pointer_arg (fndecl);
2503 }
2504 return;
2505 }
2506 tree attr = NULL;
2507 if (fndecl)
2508 attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2509 if (fndecl && !attr)
2510 attr = lookup_attribute ("amdgpu_hsa_kernel",
2511 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2512 if (!attr && fntype)
2513 attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2514   /* Handle main () as a kernel, so we can run the testsuite.
2515 Handle OpenACC kernels similarly to main. */
2516 if (!attr && !caller && fndecl
2517 && (MAIN_NAME_P (DECL_NAME (fndecl))
2518 || lookup_attribute ("omp target entrypoint",
2519 DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2520 gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2521 else
2522 {
2523 if (!attr || caller)
2524 {
2525 gcc_assert (cfun && cfun->machine);
2526 cum->normal_function = true;
2527 if (!caller)
2528 cfun->machine->normal_function = true;
2529 }
2530 gcn_parse_amdgpu_hsa_kernel_attribute
2531 (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2532 }
2533 cfun->machine->args = cum->args;
2534 if (!caller && cfun->machine->normal_function)
2535 gcn_detect_incoming_pointer_arg (fndecl);
2536
2537 reinit_regs ();
2538 }
2539
2540 static bool
2541 gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2542 {
2543 machine_mode mode = TYPE_MODE (type);
2544 HOST_WIDE_INT size = int_size_in_bytes (type);
2545
2546 if (AGGREGATE_TYPE_P (type))
2547 return true;
2548
2549 /* Vector return values are not supported yet. */
2550 if (VECTOR_TYPE_P (type))
2551 return true;
2552
2553 if (mode == BLKmode)
2554 return true;
2555
2556 if (size > 2 * UNITS_PER_WORD)
2557 return true;
2558
2559 return false;
2560 }
2561
2562 /* Implement TARGET_PROMOTE_FUNCTION_MODE.
2563
2564 Return the mode to use for outgoing function arguments. */
2565
2566 machine_mode
2567 gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2568 int *ARG_UNUSED (punsignedp),
2569 const_tree ARG_UNUSED (funtype),
2570 int ARG_UNUSED (for_return))
2571 {
2572 if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2573 return SImode;
2574
2575 return mode;
2576 }
2577
2578 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2579
2580 Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
2581 ARGS_GROW_DOWNWARDS. */
2582
2583 static tree
2584 gcn_gimplify_va_arg_expr (tree valist, tree type,
2585 gimple_seq *ARG_UNUSED (pre_p),
2586 gimple_seq *ARG_UNUSED (post_p))
2587 {
2588 tree ptr = build_pointer_type (type);
2589 tree valist_type;
2590 tree t, u;
2591 bool indirect;
2592
2593 indirect = pass_va_arg_by_reference (type);
2594 if (indirect)
2595 {
2596 type = ptr;
2597 ptr = build_pointer_type (type);
2598 }
2599 valist_type = TREE_TYPE (valist);
2600
2601 /* Args grow down. Not handled by generic routines. */
2602
2603 u = fold_convert (sizetype, size_in_bytes (type));
2604 u = fold_build1 (NEGATE_EXPR, sizetype, u);
2605 t = fold_build_pointer_plus (valist, u);
2606
2607 /* Align to 8 byte boundary. */
2608
2609 u = build_int_cst (TREE_TYPE (t), -8);
2610 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
2611 t = fold_convert (valist_type, t);
2612
2613 t = build2 (MODIFY_EXPR, valist_type, valist, t);
2614
2615 t = fold_convert (ptr, t);
2616 t = build_va_arg_indirect_ref (t);
2617
2618 if (indirect)
2619 t = build_va_arg_indirect_ref (t);
2620
2621 return t;
2622 }
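
/* For instance, with ARGS_GROW_DOWNWARDS, fetching an 8-byte object
   whose current VALIST value is V computes (V - 8) & -8, stores that
   back into VALIST, and dereferences it, so successive va_arg calls
   consume arguments at decreasing addresses.  */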
2623
2624 /* Return 1 if TRAIT NAME is present in the OpenMP context's
2625 device trait set, return 0 if not present in any OpenMP context in the
2626 whole translation unit, or -1 if not present in the current OpenMP context
2627 but might be present in another OpenMP context in the same TU. */
2628
2629 int
2630 gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
2631 const char *name)
2632 {
2633 switch (trait)
2634 {
2635 case omp_device_kind:
2636 return strcmp (name, "gpu") == 0;
2637 case omp_device_arch:
2638 return strcmp (name, "gcn") == 0;
2639 case omp_device_isa:
2640 if (strcmp (name, "fiji") == 0)
2641 return gcn_arch == PROCESSOR_FIJI;
2642 if (strcmp (name, "gfx900") == 0)
2643 return gcn_arch == PROCESSOR_VEGA10;
2644 if (strcmp (name, "gfx906") == 0)
2645 return gcn_arch == PROCESSOR_VEGA20;
2646 if (strcmp (name, "gfx908") == 0)
2647 return gcn_arch == PROCESSOR_GFX908;
2648 return 0;
2649 default:
2650 gcc_unreachable ();
2651 }
2652 }
2653
2654 /* Calculate stack offsets needed to create prologues and epilogues. */
2655
2656 static struct machine_function *
2657 gcn_compute_frame_offsets (void)
2658 {
2659 machine_function *offsets = cfun->machine;
2660
2661 if (reload_completed)
2662 return offsets;
2663
2664 offsets->need_frame_pointer = frame_pointer_needed;
2665
2666 offsets->outgoing_args_size = crtl->outgoing_args_size;
2667 offsets->pretend_size = crtl->args.pretend_args_size;
2668
2669 offsets->local_vars = get_frame_size ();
2670
2671 offsets->lr_needs_saving = (!leaf_function_p ()
2672 || df_regs_ever_live_p (LR_REGNUM)
2673 || df_regs_ever_live_p (LR_REGNUM + 1));
2674
2675 offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2676
2677 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2678 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2679 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2680 && frame_pointer_needed))
2681 offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2682
2683 /* Round up to 64-bit boundary to maintain stack alignment. */
2684 offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
2685
2686 return offsets;
2687 }
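
/* For example, a function that must save LR (8 bytes) plus three other
   scalar registers (12 bytes) gets callee_saves = 20, rounded up to 24
   to keep the stack 64-bit aligned.  */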
2688
2689 /* Insert code into the prologue or epilogue to store or load any
2690 callee-save register to/from the stack.
2691
2692 Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
2693
2694 static void
2695 move_callee_saved_registers (rtx sp, machine_function *offsets,
2696 bool prologue)
2697 {
2698 int regno, offset, saved_scalars;
2699 rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2700 rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2701 rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2702 rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2703 HOST_WIDE_INT exec_set = 0;
2704 int offreg_set = 0;
2705 auto_vec<int> saved_sgprs;
2706
2707 start_sequence ();
2708
2709 /* Move scalars into two vector registers. */
2710 for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
2711 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2712 || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2713 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2714 && offsets->need_frame_pointer))
2715 {
2716 rtx reg = gen_rtx_REG (SImode, regno);
2717 rtx vreg = gen_rtx_REG (V64SImode,
2718 VGPR_REGNO (6 + (saved_scalars / 64)));
2719 int lane = saved_scalars % 64;
2720
2721 if (prologue)
2722 {
2723 emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2724 saved_sgprs.safe_push (regno);
2725 }
2726 else
2727 emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2728
2729 saved_scalars++;
2730 }
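
  /* The first 64 saved scalars thus occupy lanes 0..63 of v6, with any
     further ones overflowing into v7; SAVED_SCALARS feeds the vector
     save/restore loop below.  */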
2731
2732 rtx move_scalars = get_insns ();
2733 end_sequence ();
2734 start_sequence ();
2735
2736 /* Ensure that all vector lanes are moved. */
2737 exec_set = -1;
2738 emit_move_insn (exec, GEN_INT (exec_set));
2739
2740 /* Set up a vector stack pointer. */
2741 rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2742 rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2743 emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2744 gcn_gen_undef (V64SImode), exec));
2745 rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2746 emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2747 exec));
2748 emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2749 gcn_operand_part (V64SImode, vsp, 0),
2750 _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2751 exec));
2752 emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2753 gcn_operand_part (V64SImode, vsp, 1),
2754 const0_rtx, vcc, vcc,
2755 gcn_gen_undef (V64SImode), exec));
2756
2757 /* Move vectors. */
2758 for (regno = FIRST_VGPR_REG, offset = 0;
2759 regno < FIRST_PSEUDO_REGISTER; regno++)
2760 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2761 || (regno == VGPR_REGNO (6) && saved_scalars > 0)
2762 || (regno == VGPR_REGNO (7) && saved_scalars > 63))
2763 {
2764 rtx reg = gen_rtx_REG (V64SImode, regno);
2765 int size = 256;
2766
2767 if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2768 size = saved_scalars * 4;
2769 else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2770 size = (saved_scalars - 64) * 4;
2771
2772 if (size != 256 || exec_set != -1)
2773 {
2774 exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2775 emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2776 }
2777
2778 if (prologue)
2779 {
2780 rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec
2781 (vsp, const0_rtx, reg, as, const0_rtx,
2782 exec));
2783
2784 /* Add CFI metadata. */
2785 rtx note;
2786 if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7))
2787 {
2788 int start = (regno == VGPR_REGNO (7) ? 64 : 0);
2789 int count = MIN (saved_scalars - start, 64);
2790 int add_lr = (regno == VGPR_REGNO (6)
2791 && offsets->lr_needs_saving);
2792 int lrdest = -1;
2793 rtvec seq = rtvec_alloc (count + add_lr);
2794
2795              /* Add a REG_FRAME_RELATED_EXPR entry for each scalar
2796 register that was saved in this batch. */
2797 for (int idx = 0; idx < count; idx++)
2798 {
2799 int stackaddr = offset + idx * 4;
2800 rtx dest = gen_rtx_MEM (SImode,
2801 gen_rtx_PLUS
2802 (DImode, sp,
2803 GEN_INT (stackaddr)));
2804 rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]);
2805 rtx set = gen_rtx_SET (dest, src);
2806 RTX_FRAME_RELATED_P (set) = 1;
2807 RTVEC_ELT (seq, idx) = set;
2808
2809 if (saved_sgprs[start + idx] == LINK_REGNUM)
2810 lrdest = stackaddr;
2811 }
2812
2813 /* Add an additional expression for DWARF_LINK_REGISTER if
2814 LINK_REGNUM was saved. */
2815 if (lrdest != -1)
2816 {
2817 rtx dest = gen_rtx_MEM (DImode,
2818 gen_rtx_PLUS
2819 (DImode, sp,
2820 GEN_INT (lrdest)));
2821 rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER);
2822 rtx set = gen_rtx_SET (dest, src);
2823 RTX_FRAME_RELATED_P (set) = 1;
2824 RTVEC_ELT (seq, count) = set;
2825 }
2826
2827 note = gen_rtx_SEQUENCE (VOIDmode, seq);
2828 }
2829 else
2830 {
2831 rtx dest = gen_rtx_MEM (V64SImode,
2832 gen_rtx_PLUS (DImode, sp,
2833 GEN_INT (offset)));
2834 rtx src = gen_rtx_REG (V64SImode, regno);
2835 note = gen_rtx_SET (dest, src);
2836 }
2837 RTX_FRAME_RELATED_P (insn) = 1;
2838 add_reg_note (insn, REG_FRAME_RELATED_EXPR, note);
2839 }
2840 else
2841 emit_insn (gen_gatherv64si_insn_1offset_exec
2842 (reg, vsp, const0_rtx, as, const0_rtx,
2843 gcn_gen_undef (V64SImode), exec));
2844
2845 /* Move our VSP to the next stack entry. */
2846 if (offreg_set != size)
2847 {
2848 offreg_set = size;
2849 emit_move_insn (offreg, GEN_INT (size));
2850 }
2851 if (exec_set != -1)
2852 {
2853 exec_set = -1;
2854 emit_move_insn (exec, GEN_INT (exec_set));
2855 }
2856 emit_insn (gen_addv64si3_vcc_dup_exec
2857 (gcn_operand_part (V64SImode, vsp, 0),
2858 offreg, gcn_operand_part (V64SImode, vsp, 0),
2859 vcc, gcn_gen_undef (V64SImode), exec));
2860 emit_insn (gen_addcv64si3_exec
2861 (gcn_operand_part (V64SImode, vsp, 1),
2862 gcn_operand_part (V64SImode, vsp, 1),
2863 const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2864
2865 offset += size;
2866 }
2867
2868 rtx move_vectors = get_insns ();
2869 end_sequence ();
2870
2871 if (prologue)
2872 {
2873 emit_insn (move_scalars);
2874 emit_insn (move_vectors);
2875 }
2876 else
2877 {
2878 emit_insn (move_vectors);
2879 emit_insn (move_scalars);
2880 }
2881 }
2882
2883 /* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
2884
2885 For a non-kernel function, the stack layout looks like this (interim),
2886 growing *upwards*:
2887
2888 hi | + ...
2889 |__________________| <-- current SP
2890 | outgoing args |
2891 |__________________|
2892 | (alloca space) |
2893 |__________________|
2894 | local vars |
2895 |__________________| <-- FP/hard FP
2896 | callee-save regs |
2897 |__________________| <-- soft arg pointer
2898 | pretend args |
2899 |__________________| <-- incoming SP
2900 | incoming args |
2901 lo |..................|
2902
2903 This implies arguments (beyond the first N in registers) must grow
2904 downwards (as, apparently, PA has them do).
2905
2906 For a kernel function we have the simpler:
2907
2908 hi | + ...
2909 |__________________| <-- current SP
2910 | outgoing args |
2911 |__________________|
2912 | (alloca space) |
2913 |__________________|
2914 | local vars |
2915 lo |__________________| <-- FP/hard FP
2916
2917 */
2918
2919 void
2920 gcn_expand_prologue ()
2921 {
2922 machine_function *offsets = gcn_compute_frame_offsets ();
2923
2924 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2925 {
2926 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2927 rtx sp_hi = gcn_operand_part (Pmode, sp, 1);
2928 rtx sp_lo = gcn_operand_part (Pmode, sp, 0);
2929 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2930 rtx fp_hi = gcn_operand_part (Pmode, fp, 1);
2931 rtx fp_lo = gcn_operand_part (Pmode, fp, 0);
2932
2933 start_sequence ();
2934
2935 if (offsets->pretend_size > 0)
2936 {
2937 /* FIXME: Do the actual saving of register pretend args to the stack.
2938 Register order needs consideration. */
2939 }
2940
2941 /* Save callee-save regs. */
2942 move_callee_saved_registers (sp, offsets, true);
2943
2944 HOST_WIDE_INT sp_adjust = offsets->pretend_size
2945 + offsets->callee_saves
2946 + offsets->local_vars + offsets->outgoing_args_size;
2947 if (sp_adjust > 0)
2948 {
2949      /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2950         we use a split add explicitly, and specify the DImode add in
2951 the note. */
2952 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2953 rtx adjustment = gen_int_mode (sp_adjust, SImode);
2954 rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo,
2955 adjustment, scc));
2956 if (!offsets->need_frame_pointer)
2957 {
2958 RTX_FRAME_RELATED_P (insn) = 1;
2959 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
2960 gen_rtx_SET (sp,
2961 gen_rtx_PLUS (DImode, sp,
2962 adjustment)));
2963 }
2964 emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc));
2965 }
2966
2967 if (offsets->need_frame_pointer)
2968 {
2969      /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2970         we use a split add explicitly, and specify the DImode add in
2971 the note. */
2972 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2973 int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size);
2974 rtx adjustment = gen_int_mode (fp_adjust, SImode);
2975 rtx insn = emit_insn (gen_addsi3_scalar_carry(fp_lo, sp_lo,
2976 adjustment, scc));
2977 emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi,
2978 (fp_adjust < 0 ? GEN_INT (-1)
2979 : const0_rtx),
2980 scc, scc));
2981
2982 /* Set the CFA to the entry stack address, as an offset from the
2983 frame pointer. This is preferred because the frame pointer is
2984 saved in each frame, whereas the stack pointer is not. */
2985 RTX_FRAME_RELATED_P (insn) = 1;
2986 add_reg_note (insn, REG_CFA_DEF_CFA,
2987 gen_rtx_PLUS (DImode, fp,
2988 GEN_INT (-(offsets->pretend_size
2989 + offsets->callee_saves))));
2990 }
2991
2992 rtx_insn *seq = get_insns ();
2993 end_sequence ();
2994
2995 emit_insn (seq);
2996 }
2997 else
2998 {
2999 rtx wave_offset = gen_rtx_REG (SImode,
3000 cfun->machine->args.
3001 reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
3002
3003 if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
3004 {
3005 rtx fs_init_lo =
3006 gen_rtx_REG (SImode,
3007 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
3008 rtx fs_init_hi =
3009 gen_rtx_REG (SImode,
3010 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
3011 rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
3012 rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
3013
3014 /*rtx queue = gen_rtx_REG(DImode,
3015 cfun->machine->args.reg[QUEUE_PTR_ARG]);
3016 rtx aperture = gen_rtx_MEM (SImode,
3017 gen_rtx_PLUS (DImode, queue,
3018 gen_int_mode (68, SImode)));
3019 set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
3020
3021 /* Set up flat_scratch. */
3022 emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
3023 emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
3024 gen_int_mode (8, SImode)));
3025 emit_move_insn (fs_reg_lo, fs_init_hi);
3026 }
3027
3028 /* Set up frame pointer and stack pointer. */
3029 rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
3030 rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4);
3031 rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0);
3032 rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
3033 rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
3034 rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
3035
3036 HOST_WIDE_INT sp_adjust = (offsets->local_vars
3037 + offsets->outgoing_args_size);
3038
3039 /* Initialise FP and SP from the buffer descriptor in s[0:3]. */
3040 emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
3041 emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
3042 gen_int_mode (0xffff, SImode)));
3043 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3044 emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
3045 emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
3046
3047      /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we use
3048         a split add explicitly, and specify the DImode add in the note.
3049 The DWARF info expects that the callee-save data is in the frame,
3050 even though it isn't (because this is the entry point), so we
3051 make a notional adjustment to the DWARF frame offset here. */
3052 rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves,
3053 DImode);
3054 rtx insn;
3055 if (sp_adjust > 0)
3056 {
3057 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3058 rtx adjustment = gen_int_mode (sp_adjust, DImode);
3059 insn = emit_insn (gen_addsi3_scalar_carry(sp_lo, fp_lo, adjustment,
3060 scc));
3061 emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc));
3062 }
3063 else
3064 insn = emit_move_insn (sp, fp);
3065 RTX_FRAME_RELATED_P (insn) = 1;
3066 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
3067 gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp,
3068 dbg_adjustment)));
3069
3070 if (offsets->need_frame_pointer)
3071 {
3072 /* Set the CFA to the entry stack address, as an offset from the
3073 frame pointer. This is necessary when alloca is used, and
3074 harmless otherwise. */
3075 rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode);
3076 add_reg_note (insn, REG_CFA_DEF_CFA,
3077 gen_rtx_PLUS (DImode, fp, neg_adjust));
3078 }
3079
3080 /* Make sure the flat scratch reg doesn't get optimised away. */
3081 emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
3082 }
3083
3084 /* Ensure that the scheduler doesn't do anything unexpected. */
3085 emit_insn (gen_blockage ());
3086
3087 /* m0 is initialized for the usual LDS DS and FLAT memory case.
3088 The low-part is the address of the topmost addressable byte, which is
3089 size-1. The high-part is an offset and should be zero. */
3090 emit_move_insn (gen_rtx_REG (SImode, M0_REG),
3091 gen_int_mode (LDS_SIZE, SImode));
3092
3093 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
3094
3095 if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
3096 {
3097 /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
3098 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3099 emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
3100 "gomp_gcn_enter_kernel"));
3101 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3102 }
3103 }
3104
3105 /* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
3106
3107 See gcn_expand_prologue for stack details. */
3108
3109 void
3110 gcn_expand_epilogue (void)
3111 {
3112 /* Ensure that the scheduler doesn't do anything unexpected. */
3113 emit_insn (gen_blockage ());
3114
3115 if (!cfun || !cfun->machine || cfun->machine->normal_function)
3116 {
3117 machine_function *offsets = gcn_compute_frame_offsets ();
3118 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
3119 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
3120
3121 HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
3122
3123 if (offsets->need_frame_pointer)
3124 {
3125 /* Restore old SP from the frame pointer. */
3126 if (sp_adjust > 0)
3127 emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
3128 else
3129 emit_move_insn (sp, fp);
3130 }
3131 else
3132 {
3133 /* Restore old SP from current SP. */
3134 sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
3135
3136 if (sp_adjust > 0)
3137 emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
3138 }
3139
3140 move_callee_saved_registers (sp, offsets, false);
3141
3142 /* There's no explicit use of the link register on the return insn. Emit
3143 one here instead. */
3144 if (offsets->lr_needs_saving)
3145 emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
3146
3147 /* Similar for frame pointer. */
3148 if (offsets->need_frame_pointer)
3149 emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
3150 }
3151 else if (flag_openmp)
3152 {
3153 /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */
3154 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3155 emit_move_insn (fn_reg,
3156 gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
3157 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3158 }
3159 else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
3160 {
3161 /* Assume that an exit value compatible with gcn-run is expected.
3162 That is, the third input parameter is an int*.
3163
3164 We can't allocate any new registers, but the kernarg_reg is
3165 dead after this, so we'll use that. */
3166 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
3167 [KERNARG_SEGMENT_PTR_ARG]);
3168 rtx retptr_mem = gen_rtx_MEM (DImode,
3169 gen_rtx_PLUS (DImode, kernarg_reg,
3170 GEN_INT (16)));
3171 set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
3172 emit_move_insn (kernarg_reg, retptr_mem);
3173
3174 rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
3175 set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
3176 emit_move_insn (retval_mem,
3177 gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
3178 }
3179
3180 emit_jump_insn (gen_gcn_return ());
3181 }
3182
3183 /* Implement TARGET_FRAME_POINTER_REQUIRED.
3184
3185 Return true if the frame pointer should not be eliminated. */
3186
3187 bool
3188 gcn_frame_pointer_rqd (void)
3189 {
3190 /* GDB needs the frame pointer in order to unwind properly,
3191 but that's not important for the entry point, unless alloca is used.
3192      It's not important for code execution, so we should respect the
3193 -fomit-frame-pointer flag. */
3194 return (!flag_omit_frame_pointer
3195 && cfun
3196 && (cfun->calls_alloca
3197 || (cfun->machine && cfun->machine->normal_function)));
3198 }
3199
3200 /* Implement TARGET_CAN_ELIMINATE.
3201
3202 Return true if the compiler is allowed to try to replace register number
3203 FROM_REG with register number TO_REG.
3204
3205 FIXME: is the default "true" not enough? Should this be a negative set? */
3206
3207 bool
3208 gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
3209 {
3210 return (to_reg == HARD_FRAME_POINTER_REGNUM
3211 || to_reg == STACK_POINTER_REGNUM);
3212 }
3213
3214 /* Implement INITIAL_ELIMINATION_OFFSET.
3215
3216 Returns the initial difference between the specified pair of registers, in
3217 terms of stack position. */
3218
3219 HOST_WIDE_INT
3220 gcn_initial_elimination_offset (int from, int to)
3221 {
3222 machine_function *offsets = gcn_compute_frame_offsets ();
3223
3224 switch (from)
3225 {
3226 case ARG_POINTER_REGNUM:
3227 if (to == STACK_POINTER_REGNUM)
3228 return -(offsets->callee_saves + offsets->local_vars
3229 + offsets->outgoing_args_size);
3230 else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
3231 return -offsets->callee_saves;
3232 else
3233 gcc_unreachable ();
3234 break;
3235
3236 case FRAME_POINTER_REGNUM:
3237 if (to == STACK_POINTER_REGNUM)
3238 return -(offsets->local_vars + offsets->outgoing_args_size);
3239 else if (to == HARD_FRAME_POINTER_REGNUM)
3240 return 0;
3241 else
3242 gcc_unreachable ();
3243 break;
3244
3245 default:
3246 gcc_unreachable ();
3247 }
3248 }
3249
3250 /* Implement HARD_REGNO_RENAME_OK.
3251
3252 Return true if it is permissible to rename a hard register from
3253 FROM_REG to TO_REG. */
3254
3255 bool
3256 gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3257 {
3258 if (from_reg == SCC_REG
3259 || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3260 || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3261 || to_reg == SCC_REG
3262 || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3263 || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3264 return false;
3265
3266 /* Allow the link register to be used if it was saved. */
3267 if ((to_reg & ~1) == LINK_REGNUM)
3268 return !cfun || cfun->machine->lr_needs_saving;
3269
3270 /* Allow the registers used for the static chain to be used if the chain is
3271 not in active use. */
3272 if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3273 return !cfun
3274 || !(cfun->static_chain_decl
3275 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3276 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3277
3278 return true;
3279 }
3280
3281 /* Implement HARD_REGNO_CALLER_SAVE_MODE.
3282
3283 Which mode is required for saving NREGS of a pseudo-register in
3284 call-clobbered hard register REGNO. */
3285
3286 machine_mode
3287 gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3288 machine_mode regmode)
3289 {
3290 machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
3291
3292 if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3293 result = (nregs == 1 ? SImode : DImode);
3294
3295 return result;
3296 }
3297
3298 /* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3299
3300 Output assembler code for a block containing the constant parts
3301 of a trampoline, leaving space for the variable parts. */
3302
3303 static void
3304 gcn_asm_trampoline_template (FILE *f)
3305 {
3306 /* The source operand of the move instructions must be a 32-bit
3307 constant following the opcode. */
3308 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3309 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3310 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3311 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3312 asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3313 }
3314
3315 /* Implement TARGET_TRAMPOLINE_INIT.
3316
3317 Emit RTL insns to initialize the variable parts of a trampoline.
3318 FNDECL is the decl of the target address, M_TRAMP is a MEM for
3319 the trampoline, and CHAIN_VALUE is an RTX for the static chain
3320 to be passed to the target function. */
3321
3322 static void
3323 gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3324 {
3325 if (TARGET_GCN5_PLUS)
3326 sorry ("nested function trampolines not supported on GCN5 due to"
3327 " non-executable stacks");
3328
3329 emit_block_move (m_tramp, assemble_trampoline_template (),
3330 GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3331
3332 rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3333 rtx chain_value_reg = copy_to_reg (chain_value);
3334 rtx fnaddr_reg = copy_to_reg (fnaddr);
3335
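  /* Each s_mov_b32 in the trampoline template is a 4-byte opcode
     followed by a 4-byte literal, so the stores below patch the literal
     slots at byte offsets 4, 12, 20 and 28: two words of static chain
     value, then two words of function address.  */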
3336 for (int i = 0; i < 4; i++)
3337 {
3338 rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3339 rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3340 emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3341 }
3342
3343 rtx tramp_addr = XEXP (m_tramp, 0);
3344 emit_insn (gen_clear_icache (tramp_addr,
3345 plus_constant (ptr_mode, tramp_addr,
3346 TRAMPOLINE_SIZE)));
3347 }
3348
3349 /* }}} */
3350 /* {{{ Miscellaneous. */
3351
3352 /* Implement TARGET_CANNOT_COPY_INSN_P.
3353
3354 Return true if INSN must not be duplicated. */
3355
3356 static bool
3357 gcn_cannot_copy_insn_p (rtx_insn *insn)
3358 {
3359 if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3360 return true;
3361
3362 return false;
3363 }
3364
3365 /* Implement TARGET_DEBUG_UNWIND_INFO.
3366
3367 Defines the mechanism that will be used for describing frame unwind
3368 information to the debugger. */
3369
3370 static enum unwind_info_type
3371 gcn_debug_unwind_info ()
3372 {
3373 return UI_DWARF2;
3374 }
3375
3376 /* Determine if there is a suitable hardware conversion instruction.
3377 Used primarily by the machine description. */
3378
3379 bool
3380 gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3381 {
3382 if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3383 return false;
3384
3385 if (VECTOR_MODE_P (from))
3386 {
3387 from = GET_MODE_INNER (from);
3388 to = GET_MODE_INNER (to);
3389 }
3390
3391 switch (op)
3392 {
3393 case fix_trunc_cvt:
3394 case fixuns_trunc_cvt:
3395 if (GET_MODE_CLASS (from) != MODE_FLOAT
3396 || GET_MODE_CLASS (to) != MODE_INT)
3397 return false;
3398 break;
3399 case float_cvt:
3400 case floatuns_cvt:
3401 if (GET_MODE_CLASS (from) != MODE_INT
3402 || GET_MODE_CLASS (to) != MODE_FLOAT)
3403 return false;
3404 break;
3405 case extend_cvt:
3406 if (GET_MODE_CLASS (from) != MODE_FLOAT
3407 || GET_MODE_CLASS (to) != MODE_FLOAT
3408 || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3409 return false;
3410 break;
3411 case trunc_cvt:
3412 if (GET_MODE_CLASS (from) != MODE_FLOAT
3413 || GET_MODE_CLASS (to) != MODE_FLOAT
3414 || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3415 return false;
3416 break;
3417 }
3418
3419 return ((to == HImode && from == HFmode)
3420 || (to == SImode && (from == SFmode || from == DFmode))
3421 || (to == HFmode && (from == HImode || from == SFmode))
3422 || (to == SFmode && (from == SImode || from == HFmode
3423 || from == DFmode))
3424 || (to == DFmode && (from == SImode || from == SFmode)));
3425 }
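
/* For example, extend_cvt permits HFmode to SFmode (a widening
   float-to-float conversion), whereas SFmode to HFmode must use
   trunc_cvt, and SImode/SFmode conversions go through float_cvt,
   floatuns_cvt, fix_trunc_cvt or fixuns_trunc_cvt.  */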
3426
3427 /* Implement TARGET_EMUTLS_VAR_INIT.
3428
3429 Disable emutls (gthr-gcn.h does not support it, yet). */
3430
3431 tree
3432 gcn_emutls_var_init (tree, tree decl, tree)
3433 {
3434 sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3435 return NULL_TREE;
3436 }
3437
3438 /* }}} */
3439 /* {{{ Costs. */
3440
3441 /* Implement TARGET_RTX_COSTS.
3442
3443 Compute a (partial) cost for rtx X. Return true if the complete
3444 cost has been computed, and false if subexpressions should be
3445 scanned. In either case, *TOTAL contains the cost result. */
3446
3447 static bool
3448 gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3449 {
3450 enum rtx_code code = GET_CODE (x);
3451 switch (code)
3452 {
3453 case CONST:
3454 case CONST_DOUBLE:
3455 case CONST_VECTOR:
3456 case CONST_INT:
3457 if (gcn_inline_constant_p (x))
3458 *total = 0;
3459 else if (code == CONST_INT
3460 && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3461 *total = 1;
3462 else if (gcn_constant_p (x))
3463 *total = 2;
3464 else
3465 *total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3466 return true;
3467
3468 case DIV:
3469 *total = 100;
3470 return false;
3471
3472 default:
3473 *total = 3;
3474 return false;
3475 }
3476 }
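
/* So, for example, an inline constant is free, a CONST_INT that fits a
   signed 16-bit immediate costs 1, other encodable constants cost 2,
   and anything else costs 4, or 64 when a full 64-lane vector constant
   must be materialized.  */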
3477
3478 /* Implement TARGET_MEMORY_MOVE_COST.
3479
3480 Return the cost of moving data of mode M between a
3481 register and memory. A value of 2 is the default; this cost is
3482 relative to those in `REGISTER_MOVE_COST'.
3483
3484    This function is used extensively by register_move_cost, which is used
3485    to build tables at startup, so it must be fast.
3486
3487    If moving between registers and memory is more expensive than
3488    between two registers, this cost expresses the relative difference.  */
3494
3495 #define LOAD_COST 32
3496 #define STORE_COST 32
3497 static int
3498 gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3499 {
3500 int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3501 switch (regclass)
3502 {
3503 case SCC_CONDITIONAL_REG:
3504 case VCCZ_CONDITIONAL_REG:
3505 case VCC_CONDITIONAL_REG:
3506 case EXECZ_CONDITIONAL_REG:
3507 case ALL_CONDITIONAL_REGS:
3508 case SGPR_REGS:
3509 case SGPR_EXEC_REGS:
3510 case EXEC_MASK_REG:
3511 case SGPR_VOP_SRC_REGS:
3512 case SGPR_MEM_SRC_REGS:
3513 case SGPR_SRC_REGS:
3514 case SGPR_DST_REGS:
3515 case GENERAL_REGS:
3516 case AFP_REGS:
3517 if (!in)
3518 return (STORE_COST + 2) * nregs;
3519 return LOAD_COST * nregs;
3520 case VGPR_REGS:
3521 if (in)
3522 return (LOAD_COST + 2) * nregs;
3523 return STORE_COST * nregs;
3524 case ALL_REGS:
3525 case ALL_GPR_REGS:
3526 case SRCDST_REGS:
3527 if (in)
3528 return (LOAD_COST + 2) * nregs;
3529 return (STORE_COST + 2) * nregs;
3530 default:
3531 gcc_unreachable ();
3532 }
3533 }
3534
3535 /* Implement TARGET_REGISTER_MOVE_COST.
3536
3537 Return the cost of moving data from a register in class CLASS1 to
3538 one in class CLASS2. Base value is 2. */
3539
3540 static int
3541 gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3542 {
3543 /* Increase cost of moving from and to vector registers. While this is
3544 fast in hardware (I think), it has hidden cost of setting up the exec
3545 flags. */
3546 if ((src < VGPR_REGS) != (dst < VGPR_REGS))
3547 return 4;
3548 return 2;
3549 }
3550
3551 /* }}} */
3552 /* {{{ Builtins. */
3553
3554 /* Type codes used by GCN built-in definitions. */
3555
3556 enum gcn_builtin_type_index
3557 {
3558 GCN_BTI_END_OF_PARAMS,
3559
3560 GCN_BTI_VOID,
3561 GCN_BTI_BOOL,
3562 GCN_BTI_INT,
3563 GCN_BTI_UINT,
3564 GCN_BTI_SIZE_T,
3565 GCN_BTI_LLINT,
3566 GCN_BTI_LLUINT,
3567 GCN_BTI_EXEC,
3568
3569 GCN_BTI_SF,
3570 GCN_BTI_V64SI,
3571 GCN_BTI_V64SF,
3572 GCN_BTI_V64PTR,
3573 GCN_BTI_SIPTR,
3574 GCN_BTI_SFPTR,
3575 GCN_BTI_VOIDPTR,
3576
3577 GCN_BTI_LDS_VOIDPTR,
3578
3579 GCN_BTI_MAX
3580 };
3581
3582 static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
3583
3584 #define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
3585 #define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
3586 #define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
3587 #define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
3588 #define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
3589 #define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
3590 #define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
3591 #define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
3592 #define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
3593
3594 static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
3595 struct gcn_builtin_description *);
3596 static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
3597 struct gcn_builtin_description *);
3598
3599 struct gcn_builtin_description;
3600 typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
3601 struct gcn_builtin_description *);
3602
3603 enum gcn_builtin_type
3604 {
3605 B_UNIMPLEMENTED, /* Not implemented; expanding it emits "sorry". */
3606 B_INSN, /* Emit a pattern */
3607 B_OVERLOAD /* Placeholder for an overloaded function */
3608 };
3609
3610 struct gcn_builtin_description
3611 {
3612 int fcode;
3613 int icode;
3614 const char *name;
3615 enum gcn_builtin_type type;
3616 /* The first element of parm is always the return type.  The rest
3617 is a zero-terminated list of parameters.  */
3618 int parm[6];
3619 gcn_builtin_expander expander;
3620 };
3621
3622 /* Read in the GCN builtins from gcn-builtins.def. */
3623
3624 extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
3625
3626 struct gcn_builtin_description gcn_builtins[] = {
3627 #define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
3628 {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
3629
3630 #define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
3631 {GCN_BUILTIN_ ## fcode ## _V64SI, \
3632 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \
3633 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3634 GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
3635 {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
3636 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \
3637 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3638 GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
3639
3640 #include "gcn-builtins.def"
3641 #undef DEF_BUILTIN_BINOP_INT_FP
3642 #undef DEF_BUILTIN
3643 };
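
/* To illustrate the macro above (hypothetical entry, for exposition
   only): a line such as

     DEF_BUILTIN_BINOP_INT_FP (ADD, add, "add")

   in gcn-builtins.def would produce two table entries, "add_v64int"
   and "add_v64int_unspec", both using CODE_FOR_addv64si3_exec and
   expanded by gcn_expand_builtin_binop.  The first form takes an extra
   V64SI operand supplying values for the inactive lanes; the second
   leaves those lanes undefined.  */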
3644
3645 static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
3646
3647 /* Implement TARGET_BUILTIN_DECL.
3648
3649 Return the GCN builtin for CODE. */
3650
3651 tree
3652 gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3653 {
3654 if (code >= GCN_BUILTIN_MAX)
3655 return error_mark_node;
3656
3657 return gcn_builtin_decls[code];
3658 }
3659
3660 /* Helper function for gcn_init_builtins. */
3661
3662 static void
3663 gcn_init_builtin_types (void)
3664 {
3665 gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
3666 gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
3667 gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
3668 gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
3669 gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
3670 gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
3671 gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
3672
3673 exec_type_node = unsigned_intDI_type_node;
3674 sf_type_node = float32_type_node;
3675 v64si_type_node = build_vector_type (intSI_type_node, 64);
3676 v64sf_type_node = build_vector_type (float_type_node, 64);
3677 v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
3678 /*build_pointer_type
3679 (integer_type_node) */
3680 , 64);
3681 tree tmp = build_distinct_type_copy (intSI_type_node);
3682 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3683 siptr_type_node = build_pointer_type (tmp);
3684
3685 tmp = build_distinct_type_copy (float_type_node);
3686 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3687 sfptr_type_node = build_pointer_type (tmp);
3688
3689 tmp = build_distinct_type_copy (void_type_node);
3690 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3691 voidptr_type_node = build_pointer_type (tmp);
3692
3693 tmp = build_distinct_type_copy (void_type_node);
3694 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
3695 gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
3696 }
3697
3698 /* Implement TARGET_INIT_BUILTINS.
3699
3700 Set up all builtin functions for this target. */
3701
3702 static void
3703 gcn_init_builtins (void)
3704 {
3705 gcn_init_builtin_types ();
3706
3707 struct gcn_builtin_description *d;
3708 unsigned int i;
3709 for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
3710 {
3711 tree p;
3712 char name[64]; /* add_builtin_function will make a copy. */
3713 int parm;
3714
3715 /* FIXME: Is this necessary/useful? */
3716 if (d->name == 0)
3717 continue;
3718
3719 /* Find last parm. */
3720 for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
3721 ;
3722
3723 p = void_list_node;
3724 while (parm > 1)
3725 p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
3726
3727 p = build_function_type (gcn_builtin_types[d->parm[0]], p);
3728
3729 sprintf (name, "__builtin_gcn_%s", d->name);
3730 gcn_builtin_decls[i]
3731 = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
3732
3733 /* These builtins don't throw. */
3734 TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
3735 }
3736
3737 /* These builtins need to take/return an LDS pointer: override the generic
3738 versions here. */
3739
3740 set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
3741 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
3742
3743 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
3744 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
3745 false);
3746
3747 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
3748 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
3749 false);
3750
3751 set_builtin_decl (BUILT_IN_GOACC_BARRIER,
3752 gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
3753 }
3754
3755 /* Implement TARGET_INIT_LIBFUNCS. */
3756
3757 static void
3758 gcn_init_libfuncs (void)
3759 {
3760 /* BITS_PER_WORD * 2 is 64 bits, which causes
3761 optabs-libfuncs.cc:gen_int_libfunc to omit the TImode (i.e. 128-bit)
3762 libcalls that we need to support operations on that type.  Initialise
3763 them here instead. */
3764 set_optab_libfunc (udiv_optab, TImode, "__udivti3");
3765 set_optab_libfunc (umod_optab, TImode, "__umodti3");
3766 set_optab_libfunc (sdiv_optab, TImode, "__divti3");
3767 set_optab_libfunc (smod_optab, TImode, "__modti3");
3768 set_optab_libfunc (smul_optab, TImode, "__multi3");
3769 set_optab_libfunc (addv_optab, TImode, "__addvti3");
3770 set_optab_libfunc (subv_optab, TImode, "__subvti3");
3771 set_optab_libfunc (negv_optab, TImode, "__negvti2");
3772 set_optab_libfunc (absv_optab, TImode, "__absvti2");
3773 set_optab_libfunc (smulv_optab, TImode, "__mulvti3");
3774 set_optab_libfunc (ffs_optab, TImode, "__ffsti2");
3775 set_optab_libfunc (clz_optab, TImode, "__clzti2");
3776 set_optab_libfunc (ctz_optab, TImode, "__ctzti2");
3777 set_optab_libfunc (clrsb_optab, TImode, "__clrsbti2");
3778 set_optab_libfunc (popcount_optab, TImode, "__popcountti2");
3779 set_optab_libfunc (parity_optab, TImode, "__parityti2");
3780 set_optab_libfunc (bswap_optab, TImode, "__bswapti2");
3781 }
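
/* A user-level sketch of why these registrations matter: with the
   entries above in place, 128-bit arithmetic in offloaded code, e.g.

     __int128 quot (__int128 a, __int128 b) { return a / b; }

   expands to a call to __divti3 instead of failing for want of a
   TImode libcall.  */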
3782
3783 /* Expand the CMP_SWAP GCN builtins. We have our own versions that do
3784 not require taking the address of any object, other than the memory
3785 cell being operated on.
3786
3787 Helper function for gcn_expand_builtin_1. */
3788
3789 static rtx
3790 gcn_expand_cmp_swap (tree exp, rtx target)
3791 {
3792 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3793 addr_space_t as
3794 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
3795 machine_mode as_mode = gcn_addr_space_address_mode (as);
3796
3797 if (!target)
3798 target = gen_reg_rtx (mode);
3799
3800 rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
3801 NULL_RTX, as_mode, EXPAND_NORMAL);
3802 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3803 NULL_RTX, mode, EXPAND_NORMAL);
3804 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3805 NULL_RTX, mode, EXPAND_NORMAL);
3806 rtx pat;
3807
3808 rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
3809 set_mem_addr_space (mem, as);
3810
3811 if (!REG_P (cmp))
3812 cmp = copy_to_mode_reg (mode, cmp);
3813 if (!REG_P (src))
3814 src = copy_to_mode_reg (mode, src);
3815
3816 if (mode == SImode)
3817 pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
3818 else
3819 pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
3820
3821 emit_insn (pat);
3822
3823 return target;
3824 }
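
/* Usage sketch (illustrative; the exact builtin names come from
   gcn-builtins.def via the "__builtin_gcn_" prefix used below):

     int old = __builtin_gcn_cmp_swap (ptr, expected, desired);

   This performs an atomic compare-and-swap on *PTR without forcing
   EXPECTED into an addressable temporary, unlike the generic
   __atomic_compare_exchange builtin.  */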
3825
3826 /* Expand many different builtins.
3827
3828 Intended for use in gcn-builtins.def. */
3829
3830 static rtx
3831 gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
3832 machine_mode /*mode */ , int ignore,
3833 struct gcn_builtin_description *)
3834 {
3835 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3836 switch (DECL_MD_FUNCTION_CODE (fndecl))
3837 {
3838 case GCN_BUILTIN_FLAT_LOAD_INT32:
3839 {
3840 if (ignore)
3841 return target;
3842 /*rtx exec = */
3843 force_reg (DImode,
3844 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3845 EXPAND_NORMAL));
3846 /*rtx ptr = */
3847 force_reg (V64DImode,
3848 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
3849 EXPAND_NORMAL));
3850 /*emit_insn (gen_vector_flat_loadv64si
3851 (target, gcn_gen_undef (V64SImode), ptr, exec)); */
3852 return target;
3853 }
3854 case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
3855 case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
3856 {
3857 if (ignore)
3858 return target;
3859 rtx exec = force_reg (DImode,
3860 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3861 DImode,
3862 EXPAND_NORMAL));
3863 rtx ptr = force_reg (DImode,
3864 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3865 V64DImode,
3866 EXPAND_NORMAL));
3867 rtx offsets = force_reg (V64SImode,
3868 expand_expr (CALL_EXPR_ARG (exp, 2),
3869 NULL_RTX, V64DImode,
3870 EXPAND_NORMAL));
3871 rtx addrs = gen_reg_rtx (V64DImode);
3872 rtx tmp = gen_reg_rtx (V64SImode);
3873 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3874 GEN_INT (2),
3875 gcn_gen_undef (V64SImode), exec));
3876 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3877 gcn_gen_undef (V64DImode),
3878 exec));
3879 rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
3880 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3881 /* FIXME: set attributes. */
3882 emit_insn (gen_mov_with_exec (target, mem, exec));
3883 return target;
3884 }
3885 case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
3886 case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
3887 {
3888 rtx exec = force_reg (DImode,
3889 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3890 DImode,
3891 EXPAND_NORMAL));
3892 rtx ptr = force_reg (DImode,
3893 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3894 V64DImode,
3895 EXPAND_NORMAL));
3896 rtx offsets = force_reg (V64SImode,
3897 expand_expr (CALL_EXPR_ARG (exp, 2),
3898 NULL_RTX, V64DImode,
3899 EXPAND_NORMAL));
3900 machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3901 3)));
3902 rtx val = force_reg (vmode,
3903 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3904 vmode,
3905 EXPAND_NORMAL));
3906 rtx addrs = gen_reg_rtx (V64DImode);
3907 rtx tmp = gen_reg_rtx (V64SImode);
3908 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3909 GEN_INT (2),
3910 gcn_gen_undef (V64SImode), exec));
3911 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3912 gcn_gen_undef (V64DImode),
3913 exec));
3914 rtx mem = gen_rtx_MEM (vmode, addrs);
3915 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3916 /* FIXME: set attributes. */
3917 emit_insn (gen_mov_with_exec (mem, val, exec));
3918 return target;
3919 }
3920 case GCN_BUILTIN_SQRTVF:
3921 {
3922 if (ignore)
3923 return target;
3924 rtx exec = gcn_full_exec_reg ();
3925 rtx arg = force_reg (V64SFmode,
3926 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3927 V64SFmode,
3928 EXPAND_NORMAL));
3929 emit_insn (gen_sqrtv64sf2_exec
3930 (target, arg, gcn_gen_undef (V64SFmode), exec));
3931 return target;
3932 }
3933 case GCN_BUILTIN_SQRTF:
3934 {
3935 if (ignore)
3936 return target;
3937 rtx arg = force_reg (SFmode,
3938 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3939 SFmode,
3940 EXPAND_NORMAL));
3941 emit_insn (gen_sqrtsf2 (target, arg));
3942 return target;
3943 }
3944 case GCN_BUILTIN_OMP_DIM_SIZE:
3945 {
3946 if (ignore)
3947 return target;
3948 emit_insn (gen_oacc_dim_size (target,
3949 expand_expr (CALL_EXPR_ARG (exp, 0),
3950 NULL_RTX, SImode,
3951 EXPAND_NORMAL)));
3952 return target;
3953 }
3954 case GCN_BUILTIN_OMP_DIM_POS:
3955 {
3956 if (ignore)
3957 return target;
3958 emit_insn (gen_oacc_dim_pos (target,
3959 expand_expr (CALL_EXPR_ARG (exp, 0),
3960 NULL_RTX, SImode,
3961 EXPAND_NORMAL)));
3962 return target;
3963 }
3964 case GCN_BUILTIN_CMP_SWAP:
3965 case GCN_BUILTIN_CMP_SWAPLL:
3966 return gcn_expand_cmp_swap (exp, target);
3967
3968 case GCN_BUILTIN_ACC_SINGLE_START:
3969 {
3970 if (ignore)
3971 return target;
3972
3973 rtx wavefront = gcn_oacc_dim_pos (1);
3974 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
3975 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
3976 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
3977 return cc;
3978 }
3979
3980 case GCN_BUILTIN_ACC_SINGLE_COPY_START:
3981 {
3982 rtx blk = force_reg (SImode,
3983 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3984 SImode, EXPAND_NORMAL));
3985 rtx wavefront = gcn_oacc_dim_pos (1);
3986 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
3987 rtx not_zero = gen_label_rtx ();
3988 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
3989 emit_move_insn (blk, const0_rtx);
3990 emit_label (not_zero);
3991 return blk;
3992 }
3993
3994 case GCN_BUILTIN_ACC_SINGLE_COPY_END:
3995 return target;
3996
3997 case GCN_BUILTIN_ACC_BARRIER:
3998 emit_insn (gen_gcn_wavefront_barrier ());
3999 return target;
4000
4001 default:
4002 gcc_unreachable ();
4003 }
4004 }
4005
4006 /* Expansion of simple arithmetic and bit binary operation builtins.
4007
4008 Intended for use with gcn_builtins table. */
4009
4010 static rtx
4011 gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
4012 machine_mode /*mode */ , int ignore,
4013 struct gcn_builtin_description *d)
4014 {
4015 int icode = d->icode;
4016 if (ignore)
4017 return target;
4018
4019 rtx exec = force_reg (DImode,
4020 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
4021 EXPAND_NORMAL));
4022
4023 machine_mode m1 = insn_data[icode].operand[1].mode;
4024 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
4025 EXPAND_NORMAL);
4026 if (!insn_data[icode].operand[1].predicate (arg1, m1))
4027 arg1 = force_reg (m1, arg1);
4028
4029 machine_mode m2 = insn_data[icode].operand[2].mode;
4030 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
4031 EXPAND_NORMAL);
4032 if (!insn_data[icode].operand[2].predicate (arg2, m2))
4033 arg2 = force_reg (m2, arg2);
4034
4035 rtx arg_prev;
4036 if (call_expr_nargs (exp) == 4)
4037 {
4038 machine_mode m_prev = insn_data[icode].operand[4].mode;
4039 arg_prev = force_reg (m_prev,
4040 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
4041 m_prev, EXPAND_NORMAL));
4042 }
4043 else
4044 arg_prev = gcn_gen_undef (GET_MODE (target));
4045
4046 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
4047 emit_insn (pat);
4048 return target;
4049 }
4050
4051 /* Implement TARGET_EXPAND_BUILTIN.
4052
4053 Expand an expression EXP that calls a built-in function, with result going
4054 to TARGET if that's convenient (and in mode MODE if that's convenient).
4055 SUBTARGET may be used as the target for computing one of EXP's operands.
4056 IGNORE is nonzero if the value is to be ignored. */
4057
4058 rtx
4059 gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
4060 int ignore)
4061 {
4062 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4063 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
4064 struct gcn_builtin_description *d;
4065
4066 gcc_assert (fcode < GCN_BUILTIN_MAX);
4067 d = &gcn_builtins[fcode];
4068
4069 if (d->type == B_UNIMPLEMENTED)
4070 sorry ("builtin not implemented");
4071
4072 return d->expander (exp, target, subtarget, mode, ignore, d);
4073 }
4074
4075 /* }}} */
4076 /* {{{ Vectorization. */
4077
4078 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.
4079
4080 A vector mask is a value that holds one boolean result for every element in
4081 a vector. */
4082
4083 opt_machine_mode
4084 gcn_vectorize_get_mask_mode (machine_mode)
4085 {
4086 /* GCN uses a DImode bit-mask. */
4087 return DImode;
4088 }
4089
4090 /* Return an RTX that references a vector with the i-th lane containing
4091 PERM[i]*4.
4092
4093 Helper function for gcn_vectorize_vec_perm_const. */
4094
4095 static rtx
4096 gcn_make_vec_perm_address (unsigned int *perm)
4097 {
4098 rtx x = gen_reg_rtx (V64SImode);
4099 emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
4100
4101 /* Permutation addresses use byte addressing. With each vector lane being
4102 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
4103 so only set those.
4104
4105 The permutation given to the vec_perm* patterns ranges from 0 to 2N-1 to
4106 select between lanes in two vectors, but as the DS_BPERMUTE* instructions
4107 only take one source vector, the most-significant bit can be ignored
4108 here. Instead, we can use EXEC masking to select the relevant part of
4109 each source vector after they are permuted separately. */
4110 uint64_t bit_mask = 1 << 2;
4111 for (int i = 2; i < 8; i++, bit_mask <<= 1)
4112 {
4113 uint64_t exec_mask = 0;
4114 uint64_t lane_mask = 1;
4115 for (int j = 0; j < 64; j++, lane_mask <<= 1)
4116 if ((perm[j] * 4) & bit_mask)
4117 exec_mask |= lane_mask;
4118
4119 if (exec_mask)
4120 emit_insn (gen_addv64si3_exec (x, x,
4121 gcn_vec_constant (V64SImode,
4122 bit_mask),
4123 x, get_exec (exec_mask)));
4124 }
4125
4126 return x;
4127 }
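
/* Worked example (illustrative): for a rotate-by-one permutation,
   perm[j] = (j + 1) & 63, the byte address for lane j is
   ((j + 1) & 63) * 4.  Bit 2 (value 4) of that address is set exactly
   when the target index is odd, i.e. for the even-numbered lanes, so
   the i == 2 iteration adds 4 under EXEC mask 0x5555555555555555; the
   remaining iterations fill in bits 3..7 the same way.  */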
4128
4129 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
4130
4131 Return true if permutation with SEL is possible.
4132
4133 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
4134 permutations. */
4135
4136 static bool
4137 gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
4138 rtx src0, rtx src1,
4139 const vec_perm_indices & sel)
4140 {
4141 unsigned int nelt = GET_MODE_NUNITS (vmode);
4142
4143 gcc_assert (VECTOR_MODE_P (vmode));
4144 gcc_assert (nelt <= 64);
4145 gcc_assert (sel.length () == nelt);
4146
4147 if (!dst)
4148 {
4149 /* All vector permutations are possible on this architecture,
4150 with varying degrees of efficiency depending on the permutation. */
4151 return true;
4152 }
4153
4154 unsigned int perm[64];
4155 for (unsigned int i = 0; i < nelt; ++i)
4156 perm[i] = sel[i] & (2 * nelt - 1);
4157 for (unsigned int i = nelt; i < 64; ++i)
4158 perm[i] = 0;
4159
4160 src0 = force_reg (vmode, src0);
4161 src1 = force_reg (vmode, src1);
4162
4163 /* Make life a bit easier by swapping operands if necessary so that
4164 the first element always comes from src0. */
4165 if (perm[0] >= nelt)
4166 {
4167 std::swap (src0, src1);
4168
4169 for (unsigned int i = 0; i < nelt; ++i)
4170 if (perm[i] < nelt)
4171 perm[i] += nelt;
4172 else
4173 perm[i] -= nelt;
4174 }
4175
4176 /* TODO: There are more efficient ways to implement certain permutations
4177 using ds_swizzle_b32 and/or DPP.  Test for and expand them here, before
4178 falling back to this slower generic approach. */
4179
4180 int64_t src1_lanes = 0;
4181 int64_t lane_bit = 1;
4182
4183 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
4184 {
4185 /* Set the bits for lanes from src1. */
4186 if (perm[i] >= nelt)
4187 src1_lanes |= lane_bit;
4188 }
4189
4190 rtx addr = gcn_make_vec_perm_address (perm);
4191 rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
4192
4193 switch (vmode)
4194 {
4195 case E_V64QImode:
4196 ds_bpermute = gen_ds_bpermutev64qi;
4197 break;
4198 case E_V64HImode:
4199 ds_bpermute = gen_ds_bpermutev64hi;
4200 break;
4201 case E_V64SImode:
4202 ds_bpermute = gen_ds_bpermutev64si;
4203 break;
4204 case E_V64HFmode:
4205 ds_bpermute = gen_ds_bpermutev64hf;
4206 break;
4207 case E_V64SFmode:
4208 ds_bpermute = gen_ds_bpermutev64sf;
4209 break;
4210 case E_V64DImode:
4211 ds_bpermute = gen_ds_bpermutev64di;
4212 break;
4213 case E_V64DFmode:
4214 ds_bpermute = gen_ds_bpermutev64df;
4215 break;
4216 default:
4217 gcc_assert (false);
4218 }
4219
4220 /* Load elements from src0 to dst. */
4221 gcc_assert (~src1_lanes);
4222 emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
4223
4224 /* Load elements from src1 to dst. */
4225 if (src1_lanes)
4226 {
4227 /* Masking a lane masks both the destination and source lanes for
4228 DS_BPERMUTE, so we need to have all lanes enabled for the permute,
4229 then add an extra masked move to merge the results of permuting
4230 the two source vectors together.
4231 */
4232 rtx tmp = gen_reg_rtx (vmode);
4233 emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
4234 emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
4235 }
4236
4237 return true;
4238 }
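
/* For example, a full reversal of a V64SImode vector (sel[i] = 63 - i)
   draws every element from src0, so src1_lanes is zero and only the
   first ds_bpermute is emitted; the masked merging move is skipped.  */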
4239
4240 /* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
4241
4242 Return nonzero if vector MODE is supported with at least move
4243 instructions. */
4244
4245 static bool
4246 gcn_vector_mode_supported_p (machine_mode mode)
4247 {
4248 return (mode == V64QImode || mode == V64HImode
4249 || mode == V64SImode || mode == V64DImode
4250 || mode == V64SFmode || mode == V64DFmode);
4251 }
4252
4253 /* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
4254
4255 Enables autovectorization for all supported modes. */
4256
4257 static machine_mode
4258 gcn_vectorize_preferred_simd_mode (scalar_mode mode)
4259 {
4260 switch (mode)
4261 {
4262 case E_QImode:
4263 return V64QImode;
4264 case E_HImode:
4265 return V64HImode;
4266 case E_SImode:
4267 return V64SImode;
4268 case E_DImode:
4269 return V64DImode;
4270 case E_SFmode:
4271 return V64SFmode;
4272 case E_DFmode:
4273 return V64DFmode;
4274 default:
4275 return word_mode;
4276 }
4277 }
4278
4279 /* Implement TARGET_VECTORIZE_RELATED_MODE.
4280
4281 All GCN vectors are 64-lane, so this is simpler than other architectures.
4282 In particular, we do *not* want to match vector bit-size. */
4283
4284 static opt_machine_mode
4285 gcn_related_vector_mode (machine_mode ARG_UNUSED (vector_mode),
4286 scalar_mode element_mode, poly_uint64 nunits)
4287 {
4288 if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
4289 return VOIDmode;
4290
4291 machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
4292 if (!VECTOR_MODE_P (pref_mode))
4293 return VOIDmode;
4294
4295 return pref_mode;
4296 }
4297
4298 /* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
4299
4300 Returns the preferred alignment in bits for accesses to vectors of type TYPE
4301 in vectorized code. This might be less than or greater than the ABI-defined
4302 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
4303 of a single element, in which case the vectorizer will not try to optimize
4304 for alignment. */
4305
4306 static poly_uint64
4307 gcn_preferred_vector_alignment (const_tree type)
4308 {
4309 return TYPE_ALIGN (TREE_TYPE (type));
4310 }
4311
4312 /* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4313
4314 Return true if the target supports misaligned vector store/load of a
4315 specific factor denoted in the misalignment parameter. */
4316
4317 static bool
4318 gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4319 const_tree type, int misalignment,
4320 bool is_packed)
4321 {
4322 if (is_packed)
4323 return false;
4324
4325 /* If the misalignment is unknown, we should be able to handle the access
4326 so long as it is not to a member of a packed data structure. */
4327 if (misalignment == -1)
4328 return true;
4329
4330 /* Return true if the misalignment is a multiple of the natural alignment
4331 of the vector's element type. This is probably always going to be
4332 true in practice, since we've already established that this isn't a
4333 packed access. */
4334 return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4335 }
4336
4337 /* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4338
4339 Return true if vector alignment is reachable (by peeling N iterations) for
4340 the given scalar type TYPE. */
4341
4342 static bool
4343 gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4344 {
4345 /* Vectors which aren't in packed structures will not be less aligned than
4346 the natural alignment of their element type, so this is safe. */
4347 return !is_packed;
4348 }
4349
4350 /* Generate DPP instructions used for vector reductions.
4351
4352 The opcode is given by INSN.
4353 The first operand of the operation is shifted right by SHIFT vector lanes.
4354 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
4355 broadcast to the next row (thereby acting like a shift of 16 for the end of
4356 each row). If SHIFT is 32, lane 31 is broadcast to all the
4357 following lanes (thereby acting like a shift of 32 for lane 63). */
4358
4359 char *
4360 gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4361 int unspec, int shift)
4362 {
4363 static char buf[128];
4364 const char *dpp;
4365 const char *vcc_in = "";
4366 const char *vcc_out = "";
4367
4368 /* Add the vcc operand if needed. */
4369 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4370 {
4371 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4372 vcc_in = ", vcc";
4373
4374 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4375 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4376 vcc_out = ", vcc";
4377 }
4378
4379 /* Add the DPP modifiers. */
4380 switch (shift)
4381 {
4382 case 1:
4383 dpp = "row_shr:1 bound_ctrl:0";
4384 break;
4385 case 2:
4386 dpp = "row_shr:2 bound_ctrl:0";
4387 break;
4388 case 4:
4389 dpp = "row_shr:4 bank_mask:0xe";
4390 break;
4391 case 8:
4392 dpp = "row_shr:8 bank_mask:0xc";
4393 break;
4394 case 16:
4395 dpp = "row_bcast:15 row_mask:0xa";
4396 break;
4397 case 32:
4398 dpp = "row_bcast:31 row_mask:0xc";
4399 break;
4400 default:
4401 gcc_unreachable ();
4402 }
4403
4404 if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
4405 sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
4406 insn, dpp, insn, dpp);
4407 else if (unspec == UNSPEC_MOV_DPP_SHR)
4408 sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
4409 else
4410 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
4411
4412 return buf;
4413 }
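
/* For instance (illustrative; the mnemonic is supplied by the caller
   via INSN), a carry-propagating V64SI add with SHIFT == 32 produces a
   template along the lines of:

     v_add_co_u32  %0, vcc, %1, %2 row_bcast:31 row_mask:0xc

   with "vcc" appearing as vcc_out per the UNSPEC_PLUS_CARRY_DPP_SHR
   handling above.  */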
4414
4415 /* Generate vector reductions in terms of DPP instructions.
4416
4417 The vector register SRC of mode MODE is reduced using the operation given
4418 by UNSPEC, and the scalar result is returned in lane 63 of a vector
4419 register. */
4420
4421 rtx
4422 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4423 {
4424 machine_mode orig_mode = mode;
4425 bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
4426 || unspec == UNSPEC_SMAX_DPP_SHR
4427 || unspec == UNSPEC_UMIN_DPP_SHR
4428 || unspec == UNSPEC_UMAX_DPP_SHR)
4429 && (mode == V64DImode
4430 || mode == V64DFmode))
4431 || (unspec == UNSPEC_PLUS_DPP_SHR
4432 && mode == V64DFmode));
4433 rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
4434 : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
4435 : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
4436 : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
4437 : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
4438 : UNKNOWN);
4439 bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
4440 || unspec == UNSPEC_SMAX_DPP_SHR
4441 || unspec == UNSPEC_UMIN_DPP_SHR
4442 || unspec == UNSPEC_UMAX_DPP_SHR)
4443 && (mode == V64QImode
4444 || mode == V64HImode));
4445 bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
4446 || unspec == UNSPEC_UMAX_DPP_SHR);
4447 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4448 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4449 && (TARGET_GCN3 || mode == V64DImode);
4450
4451 if (use_plus_carry)
4452 unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4453
4454 if (use_extends)
4455 {
4456 rtx tmp = gen_reg_rtx (V64SImode);
4457 convert_move (tmp, src, unsignedp);
4458 src = tmp;
4459 mode = V64SImode;
4460 }
4461
4462 /* Perform reduction by first performing the reduction operation on every
4463 pair of lanes, then on every pair of results from the previous
4464 iteration (thereby effectively reducing every 4 lanes) and so on until
4465 all lanes are reduced. */
4466 rtx in, out = force_reg (mode, src);
4467 for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4468 {
4469 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4470 in = out;
4471 out = gen_reg_rtx (mode);
4472
4473 if (use_moves)
4474 {
4475 rtx tmp = gen_reg_rtx (mode);
4476 emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
4477 emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
4478 }
4479 else
4480 {
4481 rtx insn = gen_rtx_SET (out,
4482 gen_rtx_UNSPEC (mode,
4483 gen_rtvec (3, in, in,
4484 shift_val),
4485 unspec));
4486
4487 /* Add clobber for instructions that set the carry flags. */
4488 if (use_plus_carry)
4489 {
4490 rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4491 gen_rtx_REG (DImode, VCC_REG));
4492 insn = gen_rtx_PARALLEL (VOIDmode,
4493 gen_rtvec (2, insn, clobber));
4494 }
4495
4496 emit_insn (insn);
4497 }
4498 }
4499
4500 if (use_extends)
4501 {
4502 rtx tmp = gen_reg_rtx (orig_mode);
4503 convert_move (tmp, out, unsignedp);
4504 out = tmp;
4505 }
4506
4507 return out;
4508 }
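
/* Illustration of the reduction tree above: for a plus-reduction over
   V64SImode, after the shift-1 step each lane L holds x[L] + x[L-1];
   after the shift-2 step it holds a sum of four consecutive inputs,
   and so on, until the final shift-32 (row_bcast:31) step leaves the
   full 64-lane result in lane 63, as promised above.  */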
4509
4510 /* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
4511
4512 int
4513 gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4514 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4515 {
4516 /* Always vectorize. */
4517 return 1;
4518 }
4519
4520 /* }}} */
4521 /* {{{ md_reorg pass. */
4522
4523 /* Identify VMEM instructions from their "type" attribute. */
4524
4525 static bool
4526 gcn_vmem_insn_p (attr_type type)
4527 {
4528 switch (type)
4529 {
4530 case TYPE_MUBUF:
4531 case TYPE_MTBUF:
4532 case TYPE_FLAT:
4533 return true;
4534 case TYPE_UNKNOWN:
4535 case TYPE_SOP1:
4536 case TYPE_SOP2:
4537 case TYPE_SOPK:
4538 case TYPE_SOPC:
4539 case TYPE_SOPP:
4540 case TYPE_SMEM:
4541 case TYPE_DS:
4542 case TYPE_VOP2:
4543 case TYPE_VOP1:
4544 case TYPE_VOPC:
4545 case TYPE_VOP3A:
4546 case TYPE_VOP3B:
4547 case TYPE_VOP_SDWA:
4548 case TYPE_VOP_DPP:
4549 case TYPE_MULT:
4550 case TYPE_VMULT:
4551 return false;
4552 }
4553 gcc_unreachable ();
4554 return false;
4555 }
4556
4557 /* If INSN sets the EXEC register to a constant value, return the value,
4558 otherwise return zero. */
4559
4560 static int64_t
4561 gcn_insn_exec_value (rtx_insn *insn)
4562 {
4563 if (!NONDEBUG_INSN_P (insn))
4564 return 0;
4565
4566 rtx pattern = PATTERN (insn);
4567
4568 if (GET_CODE (pattern) == SET)
4569 {
4570 rtx dest = XEXP (pattern, 0);
4571 rtx src = XEXP (pattern, 1);
4572
4573 if (GET_MODE (dest) == DImode
4574 && REG_P (dest) && REGNO (dest) == EXEC_REG
4575 && CONST_INT_P (src))
4576 return INTVAL (src);
4577 }
4578
4579 return 0;
4580 }
4581
4582 /* Sets the EXEC register before INSN to the value that it had after
4583 LAST_EXEC_DEF. The constant value of the EXEC register is returned if
4584 known, otherwise it returns zero. */
4585
4586 static int64_t
4587 gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4588 bool curr_exec_known, bool &last_exec_def_saved)
4589 {
4590 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4591 rtx exec;
4592
4593 int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4594
4595 if (exec_value)
4596 {
4597 /* If the EXEC value is a constant and it happens to be the same as the
4598 current EXEC value, the restore can be skipped. */
4599 if (curr_exec_known && exec_value == curr_exec)
4600 return exec_value;
4601
4602 exec = GEN_INT (exec_value);
4603 }
4604 else
4605 {
4606 /* If the EXEC value is not a constant, save it in a register after the
4607 point of definition. */
4608 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4609
4610 if (!last_exec_def_saved)
4611 {
4612 start_sequence ();
4613 emit_move_insn (exec_save_reg, exec_reg);
4614 rtx_insn *seq = get_insns ();
4615 end_sequence ();
4616
4617 emit_insn_after (seq, last_exec_def);
4618 if (dump_file && (dump_flags & TDF_DETAILS))
4619 fprintf (dump_file, "Saving EXEC after insn %d.\n",
4620 INSN_UID (last_exec_def));
4621
4622 last_exec_def_saved = true;
4623 }
4624
4625 exec = exec_save_reg;
4626 }
4627
4628 /* Restore EXEC register before the usage. */
4629 start_sequence ();
4630 emit_move_insn (exec_reg, exec);
4631 rtx_insn *seq = get_insns ();
4632 end_sequence ();
4633 emit_insn_before (seq, insn);
4634
4635 if (dump_file && (dump_flags & TDF_DETAILS))
4636 {
4637 if (exec_value)
4638 fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
4639 exec_value, INSN_UID (insn));
4640 else
4641 fprintf (dump_file,
4642 "Restoring EXEC from saved value before insn %d.\n",
4643 INSN_UID (insn));
4644 }
4645
4646 return exec_value;
4647 }
4648
4649 /* Implement TARGET_MACHINE_DEPENDENT_REORG.
4650
4651 Ensure that pipeline dependencies and lane masking are set correctly. */
4652
4653 static void
4654 gcn_md_reorg (void)
4655 {
4656 basic_block bb;
4657 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4658 regset_head live;
4659
4660 INIT_REG_SET (&live);
4661
4662 compute_bb_for_insn ();
4663
4664 if (!optimize)
4665 {
4666 split_all_insns ();
4667 if (dump_file && (dump_flags & TDF_DETAILS))
4668 {
4669 fprintf (dump_file, "After split:\n");
4670 print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4671 }
4672
4673 /* Update data-flow information for split instructions. */
4674 df_insn_rescan_all ();
4675 }
4676
4677 df_live_add_problem ();
4678 df_live_set_all_dirty ();
4679 df_analyze ();
4680
4681 /* This pass ensures that the EXEC register is set correctly, according
4682 to the "exec" attribute. However, care must be taken so that the
4683 value that reaches explicit uses of the EXEC register remains the
4684 same as before.
4685 */
4686
4687 FOR_EACH_BB_FN (bb, cfun)
4688 {
4689 if (dump_file && (dump_flags & TDF_DETAILS))
4690 fprintf (dump_file, "BB %d:\n", bb->index);
4691
4692 rtx_insn *insn, *curr;
4693 rtx_insn *last_exec_def = BB_HEAD (bb);
4694 bool last_exec_def_saved = false;
4695 bool curr_exec_explicit = true;
4696 bool curr_exec_known = true;
4697 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
4698 after last_exec_def is executed'. */
4699
4700 bitmap live_in = DF_LR_IN (bb);
4701 bool exec_live_on_entry = false;
4702 if (bitmap_bit_p (live_in, EXEC_LO_REG)
4703 || bitmap_bit_p (live_in, EXEC_HI_REG))
4704 {
4705 if (dump_file)
4706 fprintf (dump_file, "EXEC reg is live on entry to block %d\n",
4707 (int) bb->index);
4708 exec_live_on_entry = true;
4709 }
4710
4711 FOR_BB_INSNS_SAFE (bb, insn, curr)
4712 {
4713 if (!NONDEBUG_INSN_P (insn))
4714 continue;
4715
4716 if (GET_CODE (PATTERN (insn)) == USE
4717 || GET_CODE (PATTERN (insn)) == CLOBBER)
4718 continue;
4719
4720 HARD_REG_SET defs, uses;
4721 CLEAR_HARD_REG_SET (defs);
4722 CLEAR_HARD_REG_SET (uses);
4723 note_stores (insn, record_hard_reg_sets, &defs);
4724 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4725
4726 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4727 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4728 bool exec_used = (hard_reg_set_intersect_p
4729 (uses, reg_class_contents[(int) EXEC_MASK_REG])
4730 || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4731
4732 /* Check the instruction for implicit setting of EXEC via an
4733 attribute. */
4734 attr_exec exec_attr = get_attr_exec (insn);
4735 int64_t new_exec;
4736
4737 switch (exec_attr)
4738 {
4739 case EXEC_NONE:
4740 new_exec = 0;
4741 break;
4742
4743 case EXEC_SINGLE:
4744 /* Instructions that do not involve memory accesses only require
4745 bit 0 of EXEC to be set. */
4746 if (gcn_vmem_insn_p (get_attr_type (insn))
4747 || get_attr_type (insn) == TYPE_DS)
4748 new_exec = 1;
4749 else
4750 new_exec = curr_exec | 1;
4751 break;
4752
4753 case EXEC_FULL:
4754 new_exec = -1;
4755 break;
4756
4757 default: /* Auto-detect what setting is appropriate. */
4758 {
4759 new_exec = 0;
4760
4761 /* If EXEC is referenced explicitly then we don't need to do
4762 anything to set it, so we're done. */
4763 if (exec_used)
4764 break;
4765
4766 /* Scan the insn for VGPRs defs or uses. The mode determines
4767 what kind of exec is needed. */
4768 subrtx_iterator::array_type array;
4769 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4770 {
4771 const_rtx x = *iter;
4772 if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4773 {
4774 if (VECTOR_MODE_P (GET_MODE (x)))
4775 {
4776 new_exec = -1;
4777 break;
4778 }
4779 else
4780 new_exec = 1;
4781 }
4782 }
4783 }
4784 break;
4785 }
4786
4787 if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4788 {
4789 start_sequence ();
4790 emit_move_insn (exec_reg, GEN_INT (new_exec));
4791 rtx_insn *seq = get_insns ();
4792 end_sequence ();
4793 emit_insn_before (seq, insn);
4794
4795 if (dump_file && (dump_flags & TDF_DETAILS))
4796 fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
4797 new_exec, INSN_UID (insn));
4798
4799 curr_exec = new_exec;
4800 curr_exec_explicit = false;
4801 curr_exec_known = true;
4802 }
4803 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4804 {
4805 fprintf (dump_file, "Exec already is %ld before insn %d.\n",
4806 new_exec, INSN_UID (insn));
4807 }
4808
4809 /* The state of the EXEC register is unknown after a
4810 function call. */
4811 if (CALL_P (insn))
4812 curr_exec_known = false;
4813
4814 /* Handle explicit uses of EXEC. If the instruction is a partial
4815 explicit definition of EXEC, then treat it as an explicit use of
4816 EXEC as well. */
4817 if (exec_used || exec_lo_def_p != exec_hi_def_p)
4818 {
4819 /* An instruction that explicitly uses EXEC should not also
4820 implicitly define it. */
4821 gcc_assert (!exec_used || !new_exec);
4822
4823 if (!curr_exec_known || !curr_exec_explicit)
4824 {
4825 /* Restore the previous explicitly defined value. */
4826 curr_exec = gcn_restore_exec (insn, last_exec_def,
4827 curr_exec, curr_exec_known,
4828 last_exec_def_saved);
4829 curr_exec_explicit = true;
4830 curr_exec_known = true;
4831 }
4832 }
4833
4834 /* Handle explicit definitions of EXEC. */
4835 if (exec_lo_def_p || exec_hi_def_p)
4836 {
4837 last_exec_def = insn;
4838 last_exec_def_saved = false;
4839 curr_exec = gcn_insn_exec_value (insn);
4840 curr_exec_explicit = true;
4841 curr_exec_known = true;
4842
4843 if (dump_file && (dump_flags & TDF_DETAILS))
4844 fprintf (dump_file,
4845 "Found %s definition of EXEC at insn %d.\n",
4846 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4847 INSN_UID (insn));
4848 }
4849
4850 exec_live_on_entry = false;
4851 }
4852
4853 COPY_REG_SET (&live, DF_LR_OUT (bb));
4854 df_simulate_initialize_backwards (bb, &live);
4855
4856 /* If EXEC is live after the basic block, restore the value of EXEC
4857 at the end of the block. */
4858 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4859 || REGNO_REG_SET_P (&live, EXEC_HI_REG))
4860 && (!curr_exec_known || !curr_exec_explicit || exec_live_on_entry))
4861 {
4862 rtx_insn *end_insn = BB_END (bb);
4863
4864 /* If the instruction is not a jump instruction, do the restore
4865 after the last instruction in the basic block. */
4866 if (NONJUMP_INSN_P (end_insn))
4867 end_insn = NEXT_INSN (end_insn);
4868
4869 gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4870 curr_exec_known, last_exec_def_saved);
4871 }
4872 }
4873
4874 CLEAR_REG_SET (&live);
4875
4876 /* "Manually Inserted Wait States (NOPs)."
4877
4878 GCN hardware detects most kinds of register dependencies, but there
4879 are some exceptions documented in the ISA manual. This pass
4880 detects the missed cases, and inserts the documented number of NOPs
4881 required for correct execution. */
4882
4883 const int max_waits = 5;
4884 struct ilist
4885 {
4886 rtx_insn *insn;
4887 attr_unit unit;
4888 attr_delayeduse delayeduse;
4889 HARD_REG_SET writes;
4890 HARD_REG_SET reads;
4891 int age;
4892 } back[max_waits];
4893 int oldest = 0;
4894 for (int i = 0; i < max_waits; i++)
4895 back[i].insn = NULL;
4896
4897 rtx_insn *insn, *last_insn = NULL;
4898 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4899 {
4900 if (!NONDEBUG_INSN_P (insn))
4901 continue;
4902
4903 if (GET_CODE (PATTERN (insn)) == USE
4904 || GET_CODE (PATTERN (insn)) == CLOBBER)
4905 continue;
4906
4907 attr_type itype = get_attr_type (insn);
4908 attr_unit iunit = get_attr_unit (insn);
4909 attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
4910 HARD_REG_SET ireads, iwrites;
4911 CLEAR_HARD_REG_SET (ireads);
4912 CLEAR_HARD_REG_SET (iwrites);
4913 note_stores (insn, record_hard_reg_sets, &iwrites);
4914 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
4915
4916 /* Scan recent previous instructions for dependencies not handled in
4917 hardware. */
4918 int nops_rqd = 0;
4919 for (int i = oldest; i < oldest + max_waits; i++)
4920 {
4921 struct ilist *prev_insn = &back[i % max_waits];
4922
4923 if (!prev_insn->insn)
4924 continue;
4925
4926 /* VALU writes SGPR followed by VMEM reading the same SGPR
4927 requires 5 wait states. */
4928 if ((prev_insn->age + nops_rqd) < 5
4929 && prev_insn->unit == UNIT_VECTOR
4930 && gcn_vmem_insn_p (itype))
4931 {
4932 HARD_REG_SET regs = prev_insn->writes & ireads;
4933 if (hard_reg_set_intersect_p
4934 (regs, reg_class_contents[(int) SGPR_REGS]))
4935 nops_rqd = 5 - prev_insn->age;
4936 }
4937
4938 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
4939 requires 5 wait states. */
4940 if ((prev_insn->age + nops_rqd) < 5
4941 && prev_insn->unit == UNIT_VECTOR
4942 && iunit == UNIT_VECTOR
4943 && ((hard_reg_set_intersect_p
4944 (prev_insn->writes,
4945 reg_class_contents[(int) EXEC_MASK_REG])
4946 && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
4947 ||
4948 (hard_reg_set_intersect_p
4949 (prev_insn->writes,
4950 reg_class_contents[(int) VCC_CONDITIONAL_REG])
4951 && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
4952 nops_rqd = 5 - prev_insn->age;
4953
4954 /* VALU writes SGPR/VCC followed by v_{read,write}lane using
4955 SGPR/VCC as lane select requires 4 wait states. */
4956 if ((prev_insn->age + nops_rqd) < 4
4957 && prev_insn->unit == UNIT_VECTOR
4958 && get_attr_laneselect (insn) == LANESELECT_YES)
4959 {
4960 HARD_REG_SET regs = prev_insn->writes & ireads;
4961 if (hard_reg_set_intersect_p
4962 (regs, reg_class_contents[(int) SGPR_REGS])
4963 || hard_reg_set_intersect_p
4964 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
4965 nops_rqd = 4 - prev_insn->age;
4966 }
4967
4968 /* VALU writes VGPR followed by VALU_DPP reading that VGPR
4969 requires 2 wait states. */
4970 if ((prev_insn->age + nops_rqd) < 2
4971 && prev_insn->unit == UNIT_VECTOR
4972 && itype == TYPE_VOP_DPP)
4973 {
4974 HARD_REG_SET regs = prev_insn->writes & ireads;
4975 if (hard_reg_set_intersect_p
4976 (regs, reg_class_contents[(int) VGPR_REGS]))
4977 nops_rqd = 2 - prev_insn->age;
4978 }
4979
4980 /* A store whose input registers must not be overwritten by the
4981 following instruction. */
4982 if ((prev_insn->age + nops_rqd) < 1
4983 && prev_insn->delayeduse == DELAYEDUSE_YES
4984 && ((hard_reg_set_intersect_p
4985 (prev_insn->reads, iwrites))))
4986 nops_rqd = 1 - prev_insn->age;
4987 }
4988
4989 /* Insert the required number of NOPs. */
4990 for (int i = nops_rqd; i > 0; i--)
4991 emit_insn_after (gen_nop (), last_insn);
4992
4993 /* Age the previous instructions. We can also ignore writes to
4994 registers subsequently overwritten. */
4995 HARD_REG_SET written;
4996 CLEAR_HARD_REG_SET (written);
4997 for (int i = oldest + max_waits - 1; i > oldest; i--)
4998 {
4999 struct ilist *prev_insn = &back[i % max_waits];
5000
5001 /* Assume all instructions are equivalent to one "wait", the same
5002 as s_nop. This is probably true for SALU, but not VALU (which
5003 may take longer), so this is not optimal. However, AMD do
5004 not publish the cycle times for instructions. */
5005 prev_insn->age += 1 + nops_rqd;
5006
5007 written |= iwrites;
5008 prev_insn->writes &= ~written;
5009 }
5010
5011 /* Track the current instruction as a previous instruction. */
5012 back[oldest].insn = insn;
5013 back[oldest].unit = iunit;
5014 back[oldest].delayeduse = idelayeduse;
5015 back[oldest].writes = iwrites;
5016 back[oldest].reads = ireads;
5017 back[oldest].age = 0;
5018 oldest = (oldest + 1) % max_waits;
5019
5020 last_insn = insn;
5021 }
5022 }
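
/* A concrete example of the hazard handling above (illustrative): if a
   VALU instruction writes s2 and the very next instruction is a VMEM
   load that reads s2, the scan finds prev_insn->age == 0 and sets
   nops_rqd to 5, so five NOPs are emitted between the two.  Each
   unrelated intervening instruction already counts as one wait state
   and reduces the number of NOPs needed by one.  */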
5023
5024 /* }}} */
5025 /* {{{ OpenACC / OpenMP. */
5026
5027 #define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
5028 #define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
5029 #define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
5030
5031 /* Implement TARGET_GOACC_VALIDATE_DIMS.
5032
5033 Check the launch dimensions provided for an OpenACC compute
5034 region, or routine. */
5035
5036 static bool
5037 gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
5038 unsigned /*used*/)
5039 {
5040 bool changed = false;
5041 const int max_workers = 16;
5042
5043 /* The vector size must appear to be 64, to the user, unless this is a
5044 SEQ routine. The real, internal value is always 1, which means use
5045 autovectorization, but the user should not see that. */
5046 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
5047 && dims[GOMP_DIM_VECTOR] >= 0)
5048 {
5049 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
5050 && dims[GOMP_DIM_VECTOR] != 64)
5051 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
5052 OPT_Wopenacc_dims,
5053 (dims[GOMP_DIM_VECTOR]
5054 ? G_("using %<vector_length (64)%>, ignoring %d")
5055 : G_("using %<vector_length (64)%>, "
5056 "ignoring runtime setting")),
5057 dims[GOMP_DIM_VECTOR]);
5058 dims[GOMP_DIM_VECTOR] = 1;
5059 changed = true;
5060 }
5061
5062 /* Check the num workers is not too large. */
5063 if (dims[GOMP_DIM_WORKER] > max_workers)
5064 {
5065 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
5066 OPT_Wopenacc_dims,
5067 "using %<num_workers (%d)%>, ignoring %d",
5068 max_workers, dims[GOMP_DIM_WORKER]);
5069 dims[GOMP_DIM_WORKER] = max_workers;
5070 changed = true;
5071 }
5072
5073 /* Set global defaults. */
5074 if (!decl)
5075 {
5076 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
5077 if (dims[GOMP_DIM_WORKER] < 0)
5078 dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
5079 if (dims[GOMP_DIM_GANG] < 0)
5080 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
5081 changed = true;
5082 }
5083
5084 return changed;
5085 }
5086
5087 /* Helper function for oacc_dim_size instruction.
5088 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
5089
5090 rtx
5091 gcn_oacc_dim_size (int dim)
5092 {
5093 if (dim < 0 || dim > 2)
5094 error ("offload dimension out of range (%d)", dim);
5095
5096 /* Vectors are a special case. */
5097 if (dim == 2)
5098 return const1_rtx; /* Think of this as 1 times 64. */
5099
5100 static int offset[] = {
5101 /* Offsets into dispatch packet. */
5102 12, /* X dim = Gang / Team / Work-group. */
5103 20, /* Z dim = Worker / Thread / Wavefront. */
5104 16 /* Y dim = Vector / SIMD / Work-item. */
5105 };
5106 rtx addr = gen_rtx_PLUS (DImode,
5107 gen_rtx_REG (DImode,
5108 cfun->machine->args.
5109 reg[DISPATCH_PTR_ARG]),
5110 GEN_INT (offset[dim]));
5111 return gen_rtx_MEM (SImode, addr);
5112 }
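
/* For example, gcn_oacc_dim_size (0) returns an SImode MEM of
   DISPATCH_PTR + 12, i.e. a load of the grid_size_x field of the HSA
   kernel dispatch packet (field offsets per the HSA AQL dispatch
   packet layout).  */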
5113
5114 /* Helper function for oacc_dim_pos instruction.
5115 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
5116
5117 rtx
5118 gcn_oacc_dim_pos (int dim)
5119 {
5120 if (dim < 0 || dim > 2)
5121 error ("offload dimension out of range (%d)", dim);
5122
5123 static const int reg[] = {
5124 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
5125 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
5126 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
5127 };
5128
5129 int reg_num = cfun->machine->args.reg[reg[dim]];
5130
5131 /* The information must have been requested by the kernel. */
5132 gcc_assert (reg_num >= 0);
5133
5134 return gen_rtx_REG (SImode, reg_num);
5135 }
5136
5137 /* Implement TARGET_GOACC_FORK_JOIN. */
5138
5139 static bool
5140 gcn_fork_join (gcall *call, const int dims[], bool is_fork)
5141 {
5142 tree arg = gimple_call_arg (call, 2);
5143 unsigned axis = TREE_INT_CST_LOW (arg);
5144
5145 if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
5146 return true;
5147
5148 return false;
5149 }
5150
5151 /* Implement ???????
5152 FIXME make this a real hook.
5153
5154 Adjust FNDECL such that options inherited from the host compiler
5155 are made appropriate for the accelerator compiler. */
5156
5157 void
5158 gcn_fixup_accel_lto_options (tree fndecl)
5159 {
5160 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5161 if (!func_optimize)
5162 return;
5163
5164 tree old_optimize
5165 = build_optimization_node (&global_options, &global_options_set);
5166 tree new_optimize;
5167
5168 /* If the function changed the optimization levels as well as
5169 setting target options, start with the optimizations
5170 specified. */
5171 if (func_optimize != old_optimize)
5172 cl_optimization_restore (&global_options, &global_options_set,
5173 TREE_OPTIMIZATION (func_optimize));
5174
5175 gcn_option_override ();
5176
5177 /* The target attributes may also change some optimization flags,
5178 so update the optimization options if necessary. */
5179 new_optimize = build_optimization_node (&global_options,
5180 &global_options_set);
5181
5182 if (old_optimize != new_optimize)
5183 {
5184 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5185 cl_optimization_restore (&global_options, &global_options_set,
5186 TREE_OPTIMIZATION (old_optimize));
5187 }
5188 }
5189
5190 /* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook. */
5191
5192 static void
5193 gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
5194 unsigned HOST_WIDE_INT *hi,
5195 int ARG_UNUSED (dims[GOMP_DIM_MAX]),
5196 unsigned HOST_WIDE_INT
5197 ARG_UNUSED (private_size[GOMP_DIM_MAX]),
5198 unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
5199 {
5200 *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5201 /* !!! We can maybe use dims[] to estimate the maximum number of work
5202 groups/wavefronts/etc. we will launch, and therefore tune the maximum
5203 amount of LDS we should use. For now, use a minimal amount to try to
5204 maximise occupancy. */
5205 *hi = acc_lds_size;
5206 machine_function *machfun = cfun->machine;
5207 machfun->reduction_base = gang_private_size_opt;
5208 machfun->reduction_limit
5209 = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5210 }
5211
5212 /* }}} */
5213 /* {{{ ASM Output. */
5214
5215 /* Implement TARGET_ASM_FILE_START.
5216
5217 Print assembler file header text. */
5218
5219 static void
5220 output_file_start (void)
5221 {
5222 const char *cpu;
5223 bool use_xnack_attr = true;
5224 bool use_sram_attr = true;
5225 switch (gcn_arch)
5226 {
5227 case PROCESSOR_FIJI:
5228 cpu = "gfx803";
5229 #ifndef HAVE_GCN_XNACK_FIJI
5230 use_xnack_attr = false;
5231 #endif
5232 use_sram_attr = false;
5233 break;
5234 case PROCESSOR_VEGA10:
5235 cpu = "gfx900";
5236 #ifndef HAVE_GCN_XNACK_GFX900
5237 use_xnack_attr = false;
5238 #endif
5239 use_sram_attr = false;
5240 break;
5241 case PROCESSOR_VEGA20:
5242 cpu = "gfx906";
5243 #ifndef HAVE_GCN_XNACK_GFX906
5244 use_xnack_attr = false;
5245 #endif
5246 use_sram_attr = false;
5247 break;
5248 case PROCESSOR_GFX908:
5249 cpu = "gfx908";
5250 #ifndef HAVE_GCN_XNACK_GFX908
5251 use_xnack_attr = false;
5252 #endif
5253 #ifndef HAVE_GCN_SRAM_ECC_GFX908
5254 use_sram_attr = false;
5255 #endif
5256 break;
5257 default: gcc_unreachable ();
5258 }
5259
5260 #if HAVE_GCN_ASM_V3_SYNTAX
5261 const char *xnack = (flag_xnack ? "+xnack" : "");
5262 const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : "");
5263 #endif
5264 #if HAVE_GCN_ASM_V4_SYNTAX
5265 /* In HSACOv4 no attribute setting means the binary supports "any" hardware
5266 configuration. In GCC binaries, this is true for SRAM ECC, but not
5267 XNACK. */
5268 const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-");
5269 const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+"
5270 : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-"
5271 : "");
5272 #endif
5273 if (!use_xnack_attr)
5274 xnack = "";
5275 if (!use_sram_attr)
5276 sram_ecc = "";
5277
5278 fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
5279 cpu,
5280 #if HAVE_GCN_ASM_V3_SYNTAX
5281 xnack, sram_ecc
5282 #endif
5283 #ifdef HAVE_GCN_ASM_V4_SYNTAX
5284 sram_ecc, xnack
5285 #endif
5286 );
5287 }
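
/* For example (assuming an assembler using the HSACOv4 target-id
   syntax), compiling for gfx908 with SRAM ECC enabled and XNACK
   disabled emits:

     .amdgcn_target "amdgcn-unknown-amdhsa--gfx908:sramecc+:xnack-"

   whereas Fiji, which supports neither attribute here, gets the bare
   string "amdgcn-unknown-amdhsa--gfx803".  */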
5288
5289 /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
5290
5291 Print the initial definition of a function name.
5292
5293 For GCN kernel entry points this includes all the HSA meta-data, special
5294 alignment constraints that don't apply to regular functions, and magic
5295 comments that pass information to mkoffload. */
5296
5297 void
5298 gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
5299 {
5300 int sgpr, vgpr;
5301 bool xnack_enabled = false;
5302
5303 fputs ("\n\n", file);
5304
5305 if (cfun && cfun->machine && cfun->machine->normal_function)
5306 {
5307 fputs ("\t.type\t", file);
5308 assemble_name (file, name);
5309 fputs (",@function\n", file);
5310 assemble_name (file, name);
5311 fputs (":\n", file);
5312 return;
5313 }
5314
5315 /* Determine count of sgpr/vgpr registers by looking for last
5316 one used. */
5317 for (sgpr = 101; sgpr >= 0; sgpr--)
5318 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
5319 break;
5320 sgpr++;
5321 for (vgpr = 255; vgpr >= 0; vgpr--)
5322 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
5323 break;
5324 vgpr++;
5325
5326 if (!leaf_function_p ())
5327 {
5328 /* We can't know how many registers function calls might use. */
5329 if (vgpr < MAX_NORMAL_VGPR_COUNT)
5330 vgpr = MAX_NORMAL_VGPR_COUNT;
5331 if (sgpr < MAX_NORMAL_SGPR_COUNT)
5332 sgpr = MAX_NORMAL_SGPR_COUNT;
5333 }
5334
5335 fputs ("\t.rodata\n"
5336 "\t.p2align\t6\n"
5337 "\t.amdhsa_kernel\t", file);
5338 assemble_name (file, name);
5339 fputs ("\n", file);
5340 int reg = FIRST_SGPR_REG;
5341 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
5342 {
5343 int reg_first = -1;
5344 int reg_last;
5345 if ((cfun->machine->args.requested & (1 << a))
5346 && (gcn_kernel_arg_types[a].fixed_regno < 0))
5347 {
5348 reg_first = reg;
5349 reg_last = (reg_first
5350 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
5351 / UNITS_PER_WORD) - 1);
5352 reg = reg_last + 1;
5353 }
5354
5355 if (gcn_kernel_arg_types[a].header_pseudo)
5356 {
5357 fprintf (file, "\t %s%s\t%i",
5358 (cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
5359 gcn_kernel_arg_types[a].header_pseudo,
5360 (cfun->machine->args.requested & (1 << a)) != 0);
5361 if (reg_first != -1)
5362 {
5363 fprintf (file, " ; (");
5364 for (int i = reg_first; i <= reg_last; ++i)
5365 {
5366 if (i != reg_first)
5367 fprintf (file, ", ");
5368 fprintf (file, "%s", reg_names[i]);
5369 }
5370 fprintf (file, ")");
5371 }
5372 fprintf (file, "\n");
5373 }
5374 else if (gcn_kernel_arg_types[a].fixed_regno >= 0
5375 && cfun->machine->args.requested & (1 << a))
5376 fprintf (file, "\t ; %s\t%i (%s)\n",
5377 gcn_kernel_arg_types[a].name,
5378 (cfun->machine->args.requested & (1 << a)) != 0,
5379 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
5380 }
5381 fprintf (file, "\t .amdhsa_system_vgpr_workitem_id\t%i\n",
5382 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
5383 ? 2
5384 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
5385 ? 1 : 0);
5386 fprintf (file,
5387 "\t .amdhsa_next_free_vgpr\t%i\n"
5388 "\t .amdhsa_next_free_sgpr\t%i\n"
5389 "\t .amdhsa_reserve_vcc\t1\n"
5390 "\t .amdhsa_reserve_flat_scratch\t0\n"
5391 "\t .amdhsa_reserve_xnack_mask\t%i\n"
5392 "\t .amdhsa_private_segment_fixed_size\t%i\n"
5393 "\t .amdhsa_group_segment_fixed_size\t%u\n"
5394 "\t .amdhsa_float_denorm_mode_32\t3\n"
5395 "\t .amdhsa_float_denorm_mode_16_64\t3\n",
5396 vgpr,
5397 sgpr,
5398 xnack_enabled,
5399 /* workitem_private_segment_bytes_size needs to be
5400 one 64th the wave-front stack size. */
5401 stack_size_opt / 64,
5402 LDS_SIZE);
5403 fputs ("\t.end_amdhsa_kernel\n", file);
5404
5405 #if 1
5406 /* The following is YAML embedded in assembler; tabs are not allowed. */
5407 fputs (" .amdgpu_metadata\n"
5408 " amdhsa.version:\n"
5409 " - 1\n"
5410 " - 0\n"
5411 " amdhsa.kernels:\n"
5412 " - .name: ", file);
5413 assemble_name (file, name);
5414 fputs ("\n .symbol: ", file);
5415 assemble_name (file, name);
5416 fprintf (file,
5417 ".kd\n"
5418 " .kernarg_segment_size: %i\n"
5419 " .kernarg_segment_align: %i\n"
5420 " .group_segment_fixed_size: %u\n"
5421 " .private_segment_fixed_size: %i\n"
5422 " .wavefront_size: 64\n"
5423 " .sgpr_count: %i\n"
5424 " .vgpr_count: %i\n"
5425 " .max_flat_workgroup_size: 1024\n",
5426 cfun->machine->kernarg_segment_byte_size,
5427 cfun->machine->kernarg_segment_alignment,
5428 LDS_SIZE,
5429 stack_size_opt / 64,
5430 sgpr, vgpr);
5431 fputs (" .end_amdgpu_metadata\n", file);
5432 #endif
5433
5434 fputs ("\t.text\n", file);
5435 fputs ("\t.align\t256\n", file);
5436 fputs ("\t.type\t", file);
5437 assemble_name (file, name);
5438 fputs (",@function\n", file);
5439 assemble_name (file, name);
5440 fputs (":\n", file);
5441
5442 /* This comment is read by mkoffload. */
5443 if (flag_openacc)
5444 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5445 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5446 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5447 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5448 }
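/* A sketch of the kernel preamble emitted above ("my_kernel" is a made-up
   name; the register counts and segment sizes are illustrative, and the
   ";" comments describing the kernarg layout are elided):

     .rodata
     .p2align 6
     .amdhsa_kernel my_kernel
       .amdhsa_next_free_vgpr 24
       .amdhsa_next_free_sgpr 62
       ...
     .end_amdhsa_kernel
      .amdgpu_metadata
      ...
      .end_amdgpu_metadata
     .text
     .align 256
     .type my_kernel,@function
   my_kernel:  */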
5449
5450 /* Implement TARGET_ASM_SELECT_SECTION.
5451
5452 Return the section into which EXP should be placed. */
5453
5454 static section *
5455 gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5456 {
5457 if (TREE_TYPE (exp) != error_mark_node
5458 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5459 {
5460 if (!DECL_P (exp))
5461 return get_section (".lds_bss",
5462 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5463 NULL);
5464
5465 return get_named_section (exp, ".lds_bss", reloc);
5466 }
5467
5468 return default_elf_select_section (exp, reloc, align);
5469 }
5470
5471 /* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5472
5473 Emits custom text into the assembler file at the head of each function. */
5474
5475 static void
5476 gcn_target_asm_function_prologue (FILE *file)
5477 {
5478 machine_function *offsets = gcn_compute_frame_offsets ();
5479
5480 asm_fprintf (file, "\t; using %s addressing in function\n",
5481 offsets->use_flat_addressing ? "flat" : "global");
5482
5483 if (offsets->normal_function)
5484 {
5485 asm_fprintf (file, "\t; frame pointer needed: %s\n",
5486 offsets->need_frame_pointer ? "true" : "false");
5487 asm_fprintf (file, "\t; lr needs saving: %s\n",
5488 offsets->lr_needs_saving ? "true" : "false");
5489 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5490 offsets->outgoing_args_size);
5491 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5492 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5493 asm_fprintf (file, "\t; callee save size: %wd\n",
5494 offsets->callee_saves);
5495 }
5496 else
5497 {
5498 asm_fprintf (file, "\t; HSA kernel entry point\n");
5499 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5500 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5501 offsets->outgoing_args_size);
5502 }
5503 }
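/* For a normal function the prologue comments emitted above look like this
   (values illustrative):

     ; using global addressing in function
     ; frame pointer needed: false
     ; lr needs saving: true
     ; outgoing args size: 0
     ; pretend size: 0
     ; local vars size: 16
     ; callee save size: 12  */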
5504
5505 /* Helper function for print_operand and print_operand_address.
5506
5507 Print a register as the assembler requires, according to mode and name. */
5508
5509 static void
5510 print_reg (FILE *file, rtx x)
5511 {
5512 machine_mode mode = GET_MODE (x);
5513 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5514 || mode == HFmode || mode == SFmode
5515 || mode == V64SFmode || mode == V64SImode
5516 || mode == V64QImode || mode == V64HImode)
5517 fprintf (file, "%s", reg_names[REGNO (x)]);
5518 else if (mode == DImode || mode == V64DImode
5519 || mode == DFmode || mode == V64DFmode)
5520 {
5521 if (SGPR_REGNO_P (REGNO (x)))
5522 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5523 REGNO (x) - FIRST_SGPR_REG + 1);
5524 else if (VGPR_REGNO_P (REGNO (x)))
5525 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5526 REGNO (x) - FIRST_VGPR_REG + 1);
5527 else if (REGNO (x) == FLAT_SCRATCH_REG)
5528 fprintf (file, "flat_scratch");
5529 else if (REGNO (x) == EXEC_REG)
5530 fprintf (file, "exec");
5531 else if (REGNO (x) == VCC_LO_REG)
5532 fprintf (file, "vcc");
5533 else
5534 fprintf (file, "[%s:%s]",
5535 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5536 }
5537 else if (mode == TImode)
5538 {
5539 if (SGPR_REGNO_P (REGNO (x)))
5540 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5541 REGNO (x) - FIRST_SGPR_REG + 3);
5542 else if (VGPR_REGNO_P (REGNO (x)))
5543 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5544 REGNO (x) - FIRST_VGPR_REG + 3);
5545 else
5546 gcc_unreachable ();
5547 }
5548 else
5549 gcc_unreachable ();
5550 }
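/* Examples of the mapping implemented above (register numbers made up):
   an SImode value in SGPR 4 prints as "s4"; the same register holding a
   DImode value prints as the pair "s[4:5]", and a TImode value as
   "s[4:7]".  VGPRs print analogously as "v4", "v[4:5]" and "v[4:7]".  */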
5551
5552 /* Implement TARGET_SECTION_TYPE_FLAGS.
5553
5554 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
5555
5556 static unsigned int
5557 gcn_section_type_flags (tree decl, const char *name, int reloc)
5558 {
5559 if (strcmp (name, ".lds_bss") == 0)
5560 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5561
5562 return default_section_type_flags (decl, name, reloc);
5563 }
5564
5565 /* Helper function for gcn_asm_output_symbol_ref.
5566
5567 FIXME: This function is used to lay out gang-private variables in LDS
5568 on a per-CU basis.
5569 There may be cases in which gang-private variables in different compilation
5570 units could clobber each other. In that case we should be relying on the
5571 linker to lay out gang-private LDS space, but that doesn't appear to be
5572 possible at present. */
5573
5574 static void
5575 gcn_print_lds_decl (FILE *f, tree var)
5576 {
5577 int *offset;
5578 if ((offset = lds_allocs.get (var)))
5579 fprintf (f, "%u", (unsigned) *offset);
5580 else
5581 {
5582 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5583 tree type = TREE_TYPE (var);
5584 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5585 if (size > align && size > 4 && align < 8)
5586 align = 8;
5587
5588 gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
5589
5590 lds_allocs.put (var, gang_private_hwm);
5591 fprintf (f, "%u", gang_private_hwm);
5592 gang_private_hwm += size;
5593 if (gang_private_hwm > gang_private_size_opt)
5594 error ("%d bytes of gang-private data-share memory exhausted"
5595 " (increase with %<-mgang-private-size=%d%>, for example)",
5596 gang_private_size_opt, gang_private_hwm);
5597 }
5598 }
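/* A worked example of the allocation above (numbers made up): with
   gang_private_hwm == 36 and a variable of size 16 needing 8-byte
   alignment, the high-water mark rounds up to (36 + 8 - 1) & ~7 == 40,
   "40" is printed as the variable's LDS offset, and the mark advances to
   56 ready for the next variable.  */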
5599
5600 /* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
5601
5602 void
5603 gcn_asm_output_symbol_ref (FILE *file, rtx x)
5604 {
5605 tree decl;
5606 if (cfun
5607 && (decl = SYMBOL_REF_DECL (x)) != 0
5608 && TREE_CODE (decl) == VAR_DECL
5609 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5610 {
5611 /* LDS symbols (emitted using this hook) are only used at present
5612 to propagate worker values from an active thread to neutered
5613 threads. Use the same offset for each such block, but don't
5614 use zero because null pointers are used to identify the active
5615 thread in GOACC_single_copy_start calls. */
5616 gcn_print_lds_decl (file, decl);
5617 }
5618 else
5619 {
5620 assemble_name (file, XSTR (x, 0));
5621 /* FIXME: See above -- this condition is unreachable. */
5622 if (cfun
5623 && (decl = SYMBOL_REF_DECL (x)) != 0
5624 && TREE_CODE (decl) == VAR_DECL
5625 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5626 fputs ("@abs32", file);
5627 }
5628 }
5629
5630 /* Implement TARGET_CONSTANT_ALIGNMENT.
5631
5632 Returns the alignment in bits of a constant that is being placed in memory.
5633 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5634 would ordinarily have. */
5635
5636 static HOST_WIDE_INT
5637 gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5638 HOST_WIDE_INT basic_align)
5639 {
5640 return basic_align > 128 ? basic_align : 128;
5641 }
5642
5643 /* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
5644
5645 void
5646 print_operand_address (FILE *file, rtx mem)
5647 {
5648 gcc_assert (MEM_P (mem));
5649
5650 rtx reg;
5651 rtx offset;
5652 addr_space_t as = MEM_ADDR_SPACE (mem);
5653 rtx addr = XEXP (mem, 0);
5654 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5655
5656 if (AS_SCRATCH_P (as))
5657 switch (GET_CODE (addr))
5658 {
5659 case REG:
5660 print_reg (file, addr);
5661 break;
5662
5663 case PLUS:
5664 reg = XEXP (addr, 0);
5665 offset = XEXP (addr, 1);
5666 print_reg (file, reg);
5667 if (GET_CODE (offset) == CONST_INT)
5668 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5669 else
5670 abort ();
5671 break;
5672
5673 default:
5674 debug_rtx (addr);
5675 abort ();
5676 }
5677 else if (AS_ANY_FLAT_P (as))
5678 {
5679 if (GET_CODE (addr) == REG)
5680 print_reg (file, addr);
5681 else
5682 {
5683 gcc_assert (TARGET_GCN5_PLUS);
5684 print_reg (file, XEXP (addr, 0));
5685 }
5686 }
5687 else if (AS_GLOBAL_P (as))
5688 {
5689 gcc_assert (TARGET_GCN5_PLUS);
5690
5691 rtx base = addr;
5692 rtx vgpr_offset = NULL_RTX;
5693
5694 if (GET_CODE (addr) == PLUS)
5695 {
5696 base = XEXP (addr, 0);
5697
5698 if (GET_CODE (base) == PLUS)
5699 {
5700 /* (SGPR + VGPR) + CONST */
5701 vgpr_offset = XEXP (base, 1);
5702 base = XEXP (base, 0);
5703 }
5704 else
5705 {
5706 rtx offset = XEXP (addr, 1);
5707
5708 if (REG_P (offset))
5709 /* SGPR + VGPR */
5710 vgpr_offset = offset;
5711 else if (CONST_INT_P (offset))
5712 /* VGPR + CONST or SGPR + CONST */
5713 ;
5714 else
5715 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5716 }
5717 }
5718
5719 if (REG_P (base))
5720 {
5721 if (VGPR_REGNO_P (REGNO (base)))
5722 print_reg (file, base);
5723 else if (SGPR_REGNO_P (REGNO (base)))
5724 {
5725 /* The assembler requires a 64-bit VGPR pair here, even though
5726 the offset should be only 32-bit. */
5727 if (vgpr_offset == NULL_RTX)
5728 /* In this case, the vector offset is zero, so we use the first
5729 lane of v1, which is initialized to zero. */
5730 {
5731 if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
5732 fprintf (file, "v1");
5733 else
5734 fprintf (file, "v[1:2]");
5735 }
5736 else if (REG_P (vgpr_offset)
5737 && VGPR_REGNO_P (REGNO (vgpr_offset)))
5738 {
5739 if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
5740 fprintf (file, "v%d",
5741 REGNO (vgpr_offset) - FIRST_VGPR_REG);
5742 else
5743 fprintf (file, "v[%d:%d]",
5744 REGNO (vgpr_offset) - FIRST_VGPR_REG,
5745 REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5746 }
5747 else
5748 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5749 }
5750 }
5751 else
5752 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5753 }
5754 else if (AS_ANY_DS_P (as))
5755 switch (GET_CODE (addr))
5756 {
5757 case REG:
5758 print_reg (file, addr);
5759 break;
5760
5761 case PLUS:
5762 reg = XEXP (addr, 0);
5763 print_reg (file, reg);
5764 break;
5765
5766 default:
5767 debug_rtx (addr);
5768 abort ();
5769 }
5770 else
5771 switch (GET_CODE (addr))
5772 {
5773 case REG:
5774 print_reg (file, addr);
5775 fprintf (file, ", 0");
5776 break;
5777
5778 case PLUS:
5779 reg = XEXP (addr, 0);
5780 offset = XEXP (addr, 1);
5781 print_reg (file, reg);
5782 fprintf (file, ", ");
5783 if (GET_CODE (offset) == REG)
5784 print_reg (file, offset);
5785 else if (GET_CODE (offset) == CONST_INT)
5786 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5787 else
5788 abort ();
5789 break;
5790
5791 default:
5792 debug_rtx (addr);
5793 abort ();
5794 }
5795 }
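/* Illustrative examples (register numbers made up): for a global-space
   address (plus (reg:DI s[8:9]) (reg:SI v3)) the code above prints only
   the vector offset, "v3" (or "v[3:4]" for assemblers lacking the fixed
   global-load syntax); the scalar base and any constant offset are
   emitted separately by the %O code in print_operand.  For an LDS/DS
   address only the base register is printed, the constant part again
   being handled by %O.  */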
5796
5797 /* Implement PRINT_OPERAND via gcn.h.
5798
5799 b - print operand size as untyped operand (b8/b16/b32/b64)
5800 B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5801 i - print operand size as untyped operand (i16/b32/i64)
5802 I - print operand size as SI/DI untyped operand (i32/b32/i64)
5803 u - print operand size as untyped operand (u16/u32/u64)
5804 U - print operand size as SI/DI untyped operand (u32/u64)
5805 o - print operand size as memory access size for loads
5806 (ubyte/ushort/dword/dwordx2/wordx3/dwordx4)
5807 s - print operand size as memory access size for stores
5808 (byte/short/dword/dwordx2/wordx3/dwordx4)
5809 C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5810 c - print inverse conditional code for s_cbranch
5811 D - print conditional code for s_cmp (eq_u64/lg_u64...)
5812 E - print conditional code for v_cmp (eq_u64/ne_u64...)
5813 A - print address in formatting suitable for given address space.
5814 O - print offset:n for data share operations.
5815 ^ - print "_co" suffix for GCN5 mnemonics
5816 g - print "glc", if appropriate for given MEM
5817 */
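/* Illustrative expansions of the codes above (the operand rtxes and the
   template are made up, not taken from the machine description):

     %D on (gtu:SI (reg s1) (reg s2))          -> "_gt_u32"
     %E on an SFmode (ne ...) comparison       -> "_neq_f32"
     %C on (ne (reg:BI vcc) (const_int 0))     -> "_vccnz"
     %b on an SImode operand                   -> "_b32"

   so a hypothetical template "s_cmp%D1\t..." would assemble as
   "s_cmp_gt_u32 ...".  */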
5818
5819 void
5820 print_operand (FILE *file, rtx x, int code)
5821 {
5822 int xcode = x ? GET_CODE (x) : 0;
5823 bool invert = false;
5824 switch (code)
5825 {
5826 /* Instructions have the following suffixes.
5827 If there are two suffixes, the first is the destination type,
5828 and the second is the source type.
5829
5830 B32 Bitfield (untyped data) 32-bit
5831 B64 Bitfield (untyped data) 64-bit
5832 F16 floating-point 16-bit
5833 F32 floating-point 32-bit (IEEE 754 single-precision float)
5834 F64 floating-point 64-bit (IEEE 754 double-precision float)
5835 I16 signed 16-bit integer
5836 I32 signed 32-bit integer
5837 I64 signed 64-bit integer
5838 U16 unsigned 16-bit integer
5839 U32 unsigned 32-bit integer
5840 U64 unsigned 64-bit integer */
5841
5842 /* Print operand size as untyped suffix. */
5843 case 'b':
5844 {
5845 const char *s = "";
5846 machine_mode mode = GET_MODE (x);
5847 if (VECTOR_MODE_P (mode))
5848 mode = GET_MODE_INNER (mode);
5849 switch (GET_MODE_SIZE (mode))
5850 {
5851 case 1:
5852 s = "_b8";
5853 break;
5854 case 2:
5855 s = "_b16";
5856 break;
5857 case 4:
5858 s = "_b32";
5859 break;
5860 case 8:
5861 s = "_b64";
5862 break;
5863 default:
5864 output_operand_lossage ("invalid operand %%xn code");
5865 return;
5866 }
5867 fputs (s, file);
5868 }
5869 return;
5870 case 'B':
5871 {
5872 const char *s = "";
5873 machine_mode mode = GET_MODE (x);
5874 if (VECTOR_MODE_P (mode))
5875 mode = GET_MODE_INNER (mode);
5876 switch (GET_MODE_SIZE (mode))
5877 {
5878 case 1:
5879 case 2:
5880 case 4:
5881 s = "_b32";
5882 break;
5883 case 8:
5884 s = "_b64";
5885 break;
5886 default:
5887 output_operand_lossage ("invalid operand %%xn code");
5888 return;
5889 }
5890 fputs (s, file);
5891 }
5892 return;
5893 case 'e':
5894 fputs ("sext(", file);
5895 print_operand (file, x, 0);
5896 fputs (")", file);
5897 return;
5898 case 'i':
5899 case 'I':
5900 case 'u':
5901 case 'U':
5902 {
5903 bool signed_p = code == 'i';
5904 bool min32_p = code == 'I' || code == 'U';
5905 const char *s = "";
5906 machine_mode mode = GET_MODE (x);
5907 if (VECTOR_MODE_P (mode))
5908 mode = GET_MODE_INNER (mode);
5909 if (mode == VOIDmode)
5910 switch (GET_CODE (x))
5911 {
5912 case CONST_INT:
5913 s = signed_p ? "_i32" : "_u32";
5914 break;
5915 case CONST_DOUBLE:
5916 s = "_f64";
5917 break;
5918 default:
5919 output_operand_lossage ("invalid operand %%xn code");
5920 return;
5921 }
5922 else if (FLOAT_MODE_P (mode))
5923 switch (GET_MODE_SIZE (mode))
5924 {
5925 case 2:
5926 s = "_f16";
5927 break;
5928 case 4:
5929 s = "_f32";
5930 break;
5931 case 8:
5932 s = "_f64";
5933 break;
5934 default:
5935 output_operand_lossage ("invalid operand %%xn code");
5936 return;
5937 }
5938 else if (min32_p)
5939 switch (GET_MODE_SIZE (mode))
5940 {
5941 case 1:
5942 case 2:
5943 case 4:
5944 s = signed_p ? "_i32" : "_u32";
5945 break;
5946 case 8:
5947 s = signed_p ? "_i64" : "_u64";
5948 break;
5949 default:
5950 output_operand_lossage ("invalid operand %%xn code");
5951 return;
5952 }
5953 else
5954 switch (GET_MODE_SIZE (mode))
5955 {
5956 case 1:
5957 s = signed_p ? "_i8" : "_u8";
5958 break;
5959 case 2:
5960 s = signed_p ? "_i16" : "_u16";
5961 break;
5962 case 4:
5963 s = signed_p ? "_i32" : "_u32";
5964 break;
5965 case 8:
5966 s = signed_p ? "_i64" : "_u64";
5967 break;
5968 default:
5969 output_operand_lossage ("invalid operand %%xn code");
5970 return;
5971 }
5972 fputs (s, file);
5973 }
5974 return;
5975 /* Print operand size as untyped suffix. */
5976 case 'o':
5977 {
5978 const char *s = 0;
5979 switch (GET_MODE_SIZE (GET_MODE (x)))
5980 {
5981 case 1:
5982 s = "_ubyte";
5983 break;
5984 case 2:
5985 s = "_ushort";
5986 break;
5987 /* The following are full-vector variants. */
5988 case 64:
5989 s = "_ubyte";
5990 break;
5991 case 128:
5992 s = "_ushort";
5993 break;
5994 }
5995
5996 if (s)
5997 {
5998 fputs (s, file);
5999 return;
6000 }
6001
6002 /* Fall-through - the other cases for 'o' are the same as for 's'. */
6003 gcc_fallthrough ();
6004 }
6005 case 's':
6006 {
6007 const char *s = "";
6008 switch (GET_MODE_SIZE (GET_MODE (x)))
6009 {
6010 case 1:
6011 s = "_byte";
6012 break;
6013 case 2:
6014 s = "_short";
6015 break;
6016 case 4:
6017 s = "_dword";
6018 break;
6019 case 8:
6020 s = "_dwordx2";
6021 break;
6022 case 12:
6023 s = "_dwordx3";
6024 break;
6025 case 16:
6026 s = "_dwordx4";
6027 break;
6028 case 32:
6029 s = "_dwordx8";
6030 break;
6031 case 64:
6032 s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
6033 break;
6034 /* The following are full-vector variants. */
6035 case 128:
6036 s = "_short";
6037 break;
6038 case 256:
6039 s = "_dword";
6040 break;
6041 case 512:
6042 s = "_dwordx2";
6043 break;
6044 default:
6045 output_operand_lossage ("invalid operand %%xn code");
6046 return;
6047 }
6048 fputs (s, file);
6049 }
6050 return;
6051 case 'A':
6052 if (xcode != MEM)
6053 {
6054 output_operand_lossage ("invalid %%xn code");
6055 return;
6056 }
6057 print_operand_address (file, x);
6058 return;
6059 case 'O':
6060 {
6061 if (xcode != MEM)
6062 {
6063 output_operand_lossage ("invalid %%xn code");
6064 return;
6065 }
6066 if (AS_GDS_P (MEM_ADDR_SPACE (x)))
6067 fprintf (file, " gds");
6068
6069 rtx x0 = XEXP (x, 0);
6070 if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
6071 {
6072 gcc_assert (TARGET_GCN5_PLUS);
6073
6074 fprintf (file, ", ");
6075
6076 rtx base = x0;
6077 rtx const_offset = NULL_RTX;
6078
6079 if (GET_CODE (base) == PLUS)
6080 {
6081 rtx offset = XEXP (x0, 1);
6082 base = XEXP (x0, 0);
6083
6084 if (GET_CODE (base) == PLUS)
6085 /* (SGPR + VGPR) + CONST */
6086 /* Ignore the VGPR offset for this operand. */
6087 base = XEXP (base, 0);
6088
6089 if (CONST_INT_P (offset))
6090 const_offset = XEXP (x0, 1);
6091 else if (REG_P (offset))
6092 /* SGPR + VGPR */
6093 /* Ignore the VGPR offset for this operand. */
6094 ;
6095 else
6096 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6097 }
6098
6099 if (REG_P (base))
6100 {
6101 if (VGPR_REGNO_P (REGNO (base)))
6102 /* The VGPR address is specified in the %A operand. */
6103 fprintf (file, "off");
6104 else if (SGPR_REGNO_P (REGNO (base)))
6105 print_reg (file, base);
6106 else
6107 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6108 }
6109 else
6110 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6111
6112 if (const_offset != NULL_RTX)
6113 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
6114 INTVAL (const_offset));
6115
6116 return;
6117 }
6118
6119 if (GET_CODE (x0) == REG)
6120 return;
6121 if (GET_CODE (x0) != PLUS)
6122 {
6123 output_operand_lossage ("invalid %%xn code");
6124 return;
6125 }
6126 rtx val = XEXP (x0, 1);
6127 if (GET_CODE (val) == CONST_VECTOR)
6128 val = CONST_VECTOR_ELT (val, 0);
6129 if (GET_CODE (val) != CONST_INT)
6130 {
6131 output_operand_lossage ("invalid %%xn code");
6132 return;
6133 }
6134 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
6135
6136 }
6137 return;
6138 case 'c':
6139 invert = true;
6140 /* Fall through. */
6141 case 'C':
6142 {
6143 const char *s;
6144 bool num = false;
6145 if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
6146 {
6147 output_operand_lossage ("invalid %%xn code");
6148 return;
6149 }
6150 switch (REGNO (XEXP (x, 0)))
6151 {
6152 case VCC_REG:
6153 case VCCZ_REG:
6154 s = "_vcc";
6155 break;
6156 case SCC_REG:
6157 /* For some reason llvm-mc insists on scc0 instead of sccz. */
6158 num = true;
6159 s = "_scc";
6160 break;
6161 case EXECZ_REG:
6162 s = "_exec";
6163 break;
6164 default:
6165 output_operand_lossage ("invalid %%xn code");
6166 return;
6167 }
6168 fputs (s, file);
6169 if (xcode == (invert ? NE : EQ))
6170 fputc (num ? '0' : 'z', file);
6171 else
6172 fputs (num ? "1" : "nz", file);
6173 return;
6174 }
6175 case 'D':
6176 {
6177 const char *s;
6178 bool cmp_signed = false;
6179 switch (xcode)
6180 {
6181 case EQ:
6182 s = "_eq_";
6183 break;
6184 case NE:
6185 s = "_lg_";
6186 break;
6187 case LT:
6188 s = "_lt_";
6189 cmp_signed = true;
6190 break;
6191 case LE:
6192 s = "_le_";
6193 cmp_signed = true;
6194 break;
6195 case GT:
6196 s = "_gt_";
6197 cmp_signed = true;
6198 break;
6199 case GE:
6200 s = "_ge_";
6201 cmp_signed = true;
6202 break;
6203 case LTU:
6204 s = "_lt_";
6205 break;
6206 case LEU:
6207 s = "_le_";
6208 break;
6209 case GTU:
6210 s = "_gt_";
6211 break;
6212 case GEU:
6213 s = "_ge_";
6214 break;
6215 default:
6216 output_operand_lossage ("invalid %%xn code");
6217 return;
6218 }
6219 fputs (s, file);
6220 fputc (cmp_signed ? 'i' : 'u', file);
6221
6222 machine_mode mode = GET_MODE (XEXP (x, 0));
6223
6224 if (mode == VOIDmode)
6225 mode = GET_MODE (XEXP (x, 1));
6226
6227 /* If both sides are constants, then assume the instruction is in
6228 SImode since s_cmp can only do integer compares. */
6229 if (mode == VOIDmode)
6230 mode = SImode;
6231
6232 switch (GET_MODE_SIZE (mode))
6233 {
6234 case 4:
6235 s = "32";
6236 break;
6237 case 8:
6238 s = "64";
6239 break;
6240 default:
6241 output_operand_lossage ("invalid operand %%xn code");
6242 return;
6243 }
6244 fputs (s, file);
6245 return;
6246 }
6247 case 'E':
6248 {
6249 const char *s;
6250 bool cmp_signed = false;
6251 machine_mode mode = GET_MODE (XEXP (x, 0));
6252
6253 if (mode == VOIDmode)
6254 mode = GET_MODE (XEXP (x, 1));
6255
6256 /* If both sides are constants, assume the instruction is in SFmode
6257 if either operand is floating point, otherwise assume SImode. */
6258 if (mode == VOIDmode)
6259 {
6260 if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
6261 || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
6262 mode = SFmode;
6263 else
6264 mode = SImode;
6265 }
6266
6267 /* Use the same format code for vector comparisons. */
6268 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
6269 || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
6270 mode = GET_MODE_INNER (mode);
6271
6272 bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
6273
6274 switch (xcode)
6275 {
6276 case EQ:
6277 s = "_eq_";
6278 break;
6279 case NE:
6280 s = float_p ? "_neq_" : "_ne_";
6281 break;
6282 case LT:
6283 s = "_lt_";
6284 cmp_signed = true;
6285 break;
6286 case LE:
6287 s = "_le_";
6288 cmp_signed = true;
6289 break;
6290 case GT:
6291 s = "_gt_";
6292 cmp_signed = true;
6293 break;
6294 case GE:
6295 s = "_ge_";
6296 cmp_signed = true;
6297 break;
6298 case LTU:
6299 s = "_lt_";
6300 break;
6301 case LEU:
6302 s = "_le_";
6303 break;
6304 case GTU:
6305 s = "_gt_";
6306 break;
6307 case GEU:
6308 s = "_ge_";
6309 break;
6310 case ORDERED:
6311 s = "_o_";
6312 break;
6313 case UNORDERED:
6314 s = "_u_";
6315 break;
6316 case UNEQ:
6317 s = "_nlg_";
6318 break;
6319 case UNGE:
6320 s = "_nlt_";
6321 break;
6322 case UNGT:
6323 s = "_nle_";
6324 break;
6325 case UNLE:
6326 s = "_ngt_";
6327 break;
6328 case UNLT:
6329 s = "_nge_";
6330 break;
6331 case LTGT:
6332 s = "_lg_";
6333 break;
6334 default:
6335 output_operand_lossage ("invalid %%xn code");
6336 return;
6337 }
6338 fputs (s, file);
6339 fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
6340
6341 switch (GET_MODE_SIZE (mode))
6342 {
6343 case 1:
6344 output_operand_lossage ("operand %%xn code invalid for QImode");
6345 return;
6346 case 2:
6347 s = "16";
6348 break;
6349 case 4:
6350 s = "32";
6351 break;
6352 case 8:
6353 s = "64";
6354 break;
6355 default:
6356 output_operand_lossage ("invalid operand %%xn code");
6357 return;
6358 }
6359 fputs (s, file);
6360 return;
6361 }
6362 case 'L':
6363 print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
6364 return;
6365 case 'H':
6366 print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
6367 return;
6368 case 'R':
6369 /* Print a scalar register number as an integer. Temporary hack. */
6370 gcc_assert (REG_P (x));
6371 fprintf (file, "%u", (int) REGNO (x));
6372 return;
6373 case 'V':
6374 /* Print a vector register number as an integer. Temporary hack. */
6375 gcc_assert (REG_P (x));
6376 fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
6377 return;
6378 case 0:
6379 if (xcode == REG)
6380 print_reg (file, x);
6381 else if (xcode == MEM)
6382 output_address (GET_MODE (x), x);
6383 else if (xcode == CONST_INT)
6384 fprintf (file, "%i", (int) INTVAL (x));
6385 else if (xcode == CONST_VECTOR)
6386 print_operand (file, CONST_VECTOR_ELT (x, 0), code);
6387 else if (xcode == CONST_DOUBLE)
6388 {
6389 const char *str;
6390 switch (gcn_inline_fp_constant_p (x, false))
6391 {
6392 case 240:
6393 str = "0.5";
6394 break;
6395 case 241:
6396 str = "-0.5";
6397 break;
6398 case 242:
6399 str = "1.0";
6400 break;
6401 case 243:
6402 str = "-1.0";
6403 break;
6404 case 244:
6405 str = "2.0";
6406 break;
6407 case 245:
6408 str = "-2.0";
6409 break;
6410 case 246:
6411 str = "4.0";
6412 break;
6413 case 247:
6414 str = "-4.0";
6415 break;
6416 case 248:
6417 str = "1/pi";
6418 break;
6419 default:
6420 rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
6421 ? DImode : SImode,
6422 x, GET_MODE (x), 0);
6423 if (ix)
6424 print_operand (file, ix, code);
6425 else
6426 output_operand_lossage ("invalid fp constant");
6427 return;
6428 break;
6429 }
6430 fputs (str, file);
6431 return;
6432 }
6433 else
6434 output_addr_const (file, x);
6435 return;
6436 case '^':
6437 if (TARGET_GCN5_PLUS)
6438 fputs ("_co", file);
6439 return;
6440 case 'g':
6441 gcc_assert (xcode == MEM);
6442 if (MEM_VOLATILE_P (x))
6443 fputs (" glc", file);
6444 return;
6445 default:
6446 output_operand_lossage ("invalid %%xn code");
6447 }
6448 gcc_unreachable ();
6449 }
6450
6451 /* Implement DBX_REGISTER_NUMBER macro.
6452
6453 Return the DWARF register number that corresponds to the GCC internal
6454 REGNO. */
6455
6456 unsigned int
6457 gcn_dwarf_register_number (unsigned int regno)
6458 {
6459 /* Registers defined in DWARF. */
6460 if (regno == EXEC_LO_REG)
6461 return 17;
6462 /* We need to use a more complex DWARF expression for this
6463 else if (regno == EXEC_HI_REG)
6464 return 17; */
6465 else if (regno == VCC_LO_REG)
6466 return 768;
6467 /* We need to use a more complex DWARF expression for this
6468 else if (regno == VCC_HI_REG)
6469 return 768; */
6470 else if (regno == SCC_REG)
6471 return 128;
6472 else if (regno == DWARF_LINK_REGISTER)
6473 return 16;
6474 else if (SGPR_REGNO_P (regno))
6475 {
6476 if (regno - FIRST_SGPR_REG < 64)
6477 return (regno - FIRST_SGPR_REG + 32);
6478 else
6479 return (regno - FIRST_SGPR_REG + 1024);
6480 }
6481 else if (VGPR_REGNO_P (regno))
6482 return (regno - FIRST_VGPR_REG + 2560);
6483
6484 /* Otherwise, there's nothing sensible to do. */
6485 return regno + 100000;
6486 }
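/* Examples of the mapping above: SGPRs s0..s63 map to DWARF registers
   32..95 and s64 upwards to 1088 upwards; VGPRs v0..v255 map to
   2560..2815; EXEC_LO is 17, VCC_LO is 768, SCC is 128 and the link
   register is 16.  */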
6487
6488 /* Implement TARGET_DWARF_REGISTER_SPAN.
6489
6490 DImode and Vector DImode require additional registers. */
6491
6492 static rtx
6493 gcn_dwarf_register_span (rtx rtl)
6494 {
6495 machine_mode mode = GET_MODE (rtl);
6496
6497 if (VECTOR_MODE_P (mode))
6498 mode = GET_MODE_INNER (mode);
6499
6500 if (GET_MODE_SIZE (mode) != 8)
6501 return NULL_RTX;
6502
6503 unsigned regno = REGNO (rtl);
6504
6505 if (regno == DWARF_LINK_REGISTER)
6506 return NULL_RTX;
6507
6508 rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
6509 XVECEXP (p, 0, 0) = gen_rtx_REG (SImode, regno);
6510 XVECEXP (p, 0, 1) = gen_rtx_REG (SImode, regno + 1);
6511
6512 return p;
6513 }
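/* For example, a DImode value held in s12/s13 is described to DWARF as a
   PARALLEL of two SImode registers, (reg:SI s12) and (reg:SI s13);
   32-bit and smaller modes return NULL_RTX and need no span.  */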
6514
6515 /* }}} */
6516 /* {{{ TARGET hook overrides. */
6517
6518 #undef TARGET_ADDR_SPACE_ADDRESS_MODE
6519 #define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
6520 #undef TARGET_ADDR_SPACE_DEBUG
6521 #define TARGET_ADDR_SPACE_DEBUG gcn_addr_space_debug
6522 #undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
6523 #define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
6524 gcn_addr_space_legitimate_address_p
6525 #undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
6526 #define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
6527 #undef TARGET_ADDR_SPACE_POINTER_MODE
6528 #define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
6529 #undef TARGET_ADDR_SPACE_SUBSET_P
6530 #define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
6531 #undef TARGET_ADDR_SPACE_CONVERT
6532 #define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
6533 #undef TARGET_ARG_PARTIAL_BYTES
6534 #define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
6535 #undef TARGET_ASM_ALIGNED_DI_OP
6536 #define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
6537 #undef TARGET_ASM_FILE_START
6538 #define TARGET_ASM_FILE_START output_file_start
6539 #undef TARGET_ASM_FUNCTION_PROLOGUE
6540 #define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6541 #undef TARGET_ASM_SELECT_SECTION
6542 #define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6543 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
6544 #define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6545 #undef TARGET_ATTRIBUTE_TABLE
6546 #define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6547 #undef TARGET_BUILTIN_DECL
6548 #define TARGET_BUILTIN_DECL gcn_builtin_decl
6549 #undef TARGET_CAN_CHANGE_MODE_CLASS
6550 #define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6551 #undef TARGET_CAN_ELIMINATE
6552 #define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6553 #undef TARGET_CANNOT_COPY_INSN_P
6554 #define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6555 #undef TARGET_CLASS_LIKELY_SPILLED_P
6556 #define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6557 #undef TARGET_CLASS_MAX_NREGS
6558 #define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6559 #undef TARGET_CONDITIONAL_REGISTER_USAGE
6560 #define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6561 #undef TARGET_CONSTANT_ALIGNMENT
6562 #define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6563 #undef TARGET_DEBUG_UNWIND_INFO
6564 #define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6565 #undef TARGET_DWARF_REGISTER_SPAN
6566 #define TARGET_DWARF_REGISTER_SPAN gcn_dwarf_register_span
6567 #undef TARGET_EMUTLS_VAR_INIT
6568 #define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6569 #undef TARGET_EXPAND_BUILTIN
6570 #define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6571 #undef TARGET_FRAME_POINTER_REQUIRED
6572 #define TARGET_FRAME_POINTER_REQUIRED gcn_frame_pointer_rqd
6573 #undef TARGET_FUNCTION_ARG
6574 #define TARGET_FUNCTION_ARG gcn_function_arg
6575 #undef TARGET_FUNCTION_ARG_ADVANCE
6576 #define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6577 #undef TARGET_FUNCTION_VALUE
6578 #define TARGET_FUNCTION_VALUE gcn_function_value
6579 #undef TARGET_FUNCTION_VALUE_REGNO_P
6580 #define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6581 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
6582 #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6583 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6584 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
6585 #undef TARGET_GOACC_ADJUST_PRIVATE_DECL
6586 #define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
6587 #undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
6588 #define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
6589 gcn_goacc_create_worker_broadcast_record
6590 #undef TARGET_GOACC_FORK_JOIN
6591 #define TARGET_GOACC_FORK_JOIN gcn_fork_join
6592 #undef TARGET_GOACC_REDUCTION
6593 #define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6594 #undef TARGET_GOACC_VALIDATE_DIMS
6595 #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6596 #undef TARGET_GOACC_SHARED_MEM_LAYOUT
6597 #define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
6598 #undef TARGET_HARD_REGNO_MODE_OK
6599 #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6600 #undef TARGET_HARD_REGNO_NREGS
6601 #define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6602 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6603 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6604 #undef TARGET_INIT_BUILTINS
6605 #define TARGET_INIT_BUILTINS gcn_init_builtins
6606 #undef TARGET_INIT_LIBFUNCS
6607 #define TARGET_INIT_LIBFUNCS gcn_init_libfuncs
6608 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6609 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6610 gcn_ira_change_pseudo_allocno_class
6611 #undef TARGET_LEGITIMATE_CONSTANT_P
6612 #define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6613 #undef TARGET_LRA_P
6614 #define TARGET_LRA_P hook_bool_void_true
6615 #undef TARGET_MACHINE_DEPENDENT_REORG
6616 #define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6617 #undef TARGET_MEMORY_MOVE_COST
6618 #define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6619 #undef TARGET_MODES_TIEABLE_P
6620 #define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6621 #undef TARGET_OPTION_OVERRIDE
6622 #define TARGET_OPTION_OVERRIDE gcn_option_override
6623 #undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6624 #define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6625 gcn_pretend_outgoing_varargs_named
6626 #undef TARGET_PROMOTE_FUNCTION_MODE
6627 #define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6628 #undef TARGET_REGISTER_MOVE_COST
6629 #define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6630 #undef TARGET_RETURN_IN_MEMORY
6631 #define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6632 #undef TARGET_RTX_COSTS
6633 #define TARGET_RTX_COSTS gcn_rtx_costs
6634 #undef TARGET_SECONDARY_RELOAD
6635 #define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6636 #undef TARGET_SECTION_TYPE_FLAGS
6637 #define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6638 #undef TARGET_SCALAR_MODE_SUPPORTED_P
6639 #define TARGET_SCALAR_MODE_SUPPORTED_P gcn_scalar_mode_supported_p
6640 #undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6641 #define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6642 gcn_small_register_classes_for_mode_p
6643 #undef TARGET_SPILL_CLASS
6644 #define TARGET_SPILL_CLASS gcn_spill_class
6645 #undef TARGET_STRICT_ARGUMENT_NAMING
6646 #define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6647 #undef TARGET_TRAMPOLINE_INIT
6648 #define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6649 #undef TARGET_TRULY_NOOP_TRUNCATION
6650 #define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6651 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6652 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6653 #undef TARGET_VECTORIZE_GET_MASK_MODE
6654 #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6655 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6656 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6657 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6658 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6659 gcn_preferred_vector_alignment
6660 #undef TARGET_VECTORIZE_RELATED_MODE
6661 #define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
6662 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6663 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6664 gcn_vectorize_support_vector_misalignment
6665 #undef TARGET_VECTORIZE_VEC_PERM_CONST
6666 #define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6667 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6668 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6669 gcn_vector_alignment_reachable
6670 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6671 #define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6672
6673 struct gcc_target targetm = TARGET_INITIALIZER;
6674
6675 #include "gt-gcn.h"
6676 /* }}} */
6677