1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
81 /* Classifies an address.
82
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
85
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
88
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
91
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
94
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
97
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
100
101 ADDRESS_SYMBOLIC:
102       A constant symbolic address, in the pc-relative literal pool.  */
103
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
112 };
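/* Editorial examples, not part of the original file: assuming the usual
   AArch64 assembly syntax, an address such as [x0, #16] would classify as
   ADDRESS_REG_IMM, [x0, x1, lsl #3] as ADDRESS_REG_REG,
   [x0, w1, sxtw #2] as ADDRESS_REG_SXTW, a writeback form such as
   [x0, #16]! as ADDRESS_REG_WB, and the ADRP/ADD :lo12: pairing as
   ADDRESS_LO_SUM.  */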
113
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
121 };
122
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
125 {
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
128
129   simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
135
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
138
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
142
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
145
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
148
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
153 };
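/* Editorial sketch, not from the original source: a V4SI constant in which
   every element is 0x00ff0000 could be described as
   simd_immediate_info (SImode, 0xff, simd_immediate_info::MOV,
			simd_immediate_info::LSL, 16),
   i.e. "MOVI with the 8-bit value 0xff shifted left by 16".  The code that
   actually chooses such encodings appears later in this file.  */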
154
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
161 {}
162
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
181 {}
182
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
185
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
188
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
210
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
213
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
216
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
219
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
222
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
226 {
227 const char* name;
228 unsigned int flag;
229 };
230
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
234 {
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
239 };
240
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
244 {
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
249 };
250
251 /* Tuning parameters. */
252
253 static const struct cpu_addrcost_table generic_addrcost_table =
254 {
255 {
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
260 },
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
267 };
268
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
270 {
271 {
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
276 },
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
283 };
284
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
286 {
287 {
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
292 },
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
299 };
300
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
302 {
303 {
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
308 },
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
315 };
316
317 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
318 {
319 {
320 1, /* hi */
321 1, /* si */
322 1, /* di */
323 2, /* ti */
324 },
325 1, /* pre_modify */
326 1, /* post_modify */
327 3, /* register_offset */
328 3, /* register_sextend */
329 3, /* register_zextend */
330 2, /* imm_offset */
331 };
332
333 static const struct cpu_regmove_cost generic_regmove_cost =
334 {
335 1, /* GP2GP */
336 /* Avoid the use of slow int<->fp moves for spilling by setting
337 their cost higher than memmov_cost. */
338 5, /* GP2FP */
339 5, /* FP2GP */
340 2 /* FP2FP */
341 };
342
343 static const struct cpu_regmove_cost cortexa57_regmove_cost =
344 {
345 1, /* GP2GP */
346 /* Avoid the use of slow int<->fp moves for spilling by setting
347 their cost higher than memmov_cost. */
348 5, /* GP2FP */
349 5, /* FP2GP */
350 2 /* FP2FP */
351 };
352
353 static const struct cpu_regmove_cost cortexa53_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost exynosm1_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367      their cost higher than memmov_cost (actual costs are 4 and 9).  */
368 9, /* GP2FP */
369 9, /* FP2GP */
370 1 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost thunderx_regmove_cost =
374 {
375 2, /* GP2GP */
376 2, /* GP2FP */
377 6, /* FP2GP */
378 4 /* FP2FP */
379 };
380
381 static const struct cpu_regmove_cost xgene1_regmove_cost =
382 {
383 1, /* GP2GP */
384 /* Avoid the use of slow int<->fp moves for spilling by setting
385 their cost higher than memmov_cost. */
386 8, /* GP2FP */
387 8, /* FP2GP */
388 2 /* FP2FP */
389 };
390
391 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
392 {
393 2, /* GP2GP */
394 /* Avoid the use of int<->fp moves for spilling. */
395 6, /* GP2FP */
396 6, /* FP2GP */
397 4 /* FP2FP */
398 };
399
400 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
401 {
402 1, /* GP2GP */
403 /* Avoid the use of int<->fp moves for spilling. */
404 8, /* GP2FP */
405 8, /* FP2GP */
406 4 /* FP2FP */
407 };
408
409 /* Generic costs for vector insn classes. */
410 static const struct cpu_vector_cost generic_vector_cost =
411 {
412 1, /* scalar_int_stmt_cost */
413 1, /* scalar_fp_stmt_cost */
414 1, /* scalar_load_cost */
415 1, /* scalar_store_cost */
416 1, /* vec_int_stmt_cost */
417 1, /* vec_fp_stmt_cost */
418 2, /* vec_permute_cost */
419 1, /* vec_to_scalar_cost */
420 1, /* scalar_to_vec_cost */
421 1, /* vec_align_load_cost */
422 1, /* vec_unalign_load_cost */
423 1, /* vec_unalign_store_cost */
424 1, /* vec_store_cost */
425 3, /* cond_taken_branch_cost */
426 1 /* cond_not_taken_branch_cost */
427 };
428
429 /* QDF24XX costs for vector insn classes. */
430 static const struct cpu_vector_cost qdf24xx_vector_cost =
431 {
432 1, /* scalar_int_stmt_cost */
433 1, /* scalar_fp_stmt_cost */
434 1, /* scalar_load_cost */
435 1, /* scalar_store_cost */
436 1, /* vec_int_stmt_cost */
437 3, /* vec_fp_stmt_cost */
438 2, /* vec_permute_cost */
439 1, /* vec_to_scalar_cost */
440 1, /* scalar_to_vec_cost */
441 1, /* vec_align_load_cost */
442 1, /* vec_unalign_load_cost */
443 1, /* vec_unalign_store_cost */
444 1, /* vec_store_cost */
445 3, /* cond_taken_branch_cost */
446 1 /* cond_not_taken_branch_cost */
447 };
448
449 /* ThunderX costs for vector insn classes. */
450 static const struct cpu_vector_cost thunderx_vector_cost =
451 {
452 1, /* scalar_int_stmt_cost */
453 1, /* scalar_fp_stmt_cost */
454 3, /* scalar_load_cost */
455 1, /* scalar_store_cost */
456 4, /* vec_int_stmt_cost */
457 1, /* vec_fp_stmt_cost */
458 4, /* vec_permute_cost */
459 2, /* vec_to_scalar_cost */
460 2, /* scalar_to_vec_cost */
461 3, /* vec_align_load_cost */
462 5, /* vec_unalign_load_cost */
463 5, /* vec_unalign_store_cost */
464 1, /* vec_store_cost */
465 3, /* cond_taken_branch_cost */
466 3 /* cond_not_taken_branch_cost */
467 };
468
469 /* Cortex-A57 costs for vector insn classes.  */
470 static const struct cpu_vector_cost cortexa57_vector_cost =
471 {
472 1, /* scalar_int_stmt_cost */
473 1, /* scalar_fp_stmt_cost */
474 4, /* scalar_load_cost */
475 1, /* scalar_store_cost */
476 2, /* vec_int_stmt_cost */
477 2, /* vec_fp_stmt_cost */
478 3, /* vec_permute_cost */
479 8, /* vec_to_scalar_cost */
480 8, /* scalar_to_vec_cost */
481 4, /* vec_align_load_cost */
482 4, /* vec_unalign_load_cost */
483 1, /* vec_unalign_store_cost */
484 1, /* vec_store_cost */
485 1, /* cond_taken_branch_cost */
486 1 /* cond_not_taken_branch_cost */
487 };
488
489 static const struct cpu_vector_cost exynosm1_vector_cost =
490 {
491 1, /* scalar_int_stmt_cost */
492 1, /* scalar_fp_stmt_cost */
493 5, /* scalar_load_cost */
494 1, /* scalar_store_cost */
495 3, /* vec_int_stmt_cost */
496 3, /* vec_fp_stmt_cost */
497 3, /* vec_permute_cost */
498 3, /* vec_to_scalar_cost */
499 3, /* scalar_to_vec_cost */
500 5, /* vec_align_load_cost */
501 5, /* vec_unalign_load_cost */
502 1, /* vec_unalign_store_cost */
503 1, /* vec_store_cost */
504 1, /* cond_taken_branch_cost */
505 1 /* cond_not_taken_branch_cost */
506 };
507
508 /* X-Gene 1 costs for vector insn classes.  */
509 static const struct cpu_vector_cost xgene1_vector_cost =
510 {
511 1, /* scalar_int_stmt_cost */
512 1, /* scalar_fp_stmt_cost */
513 5, /* scalar_load_cost */
514 1, /* scalar_store_cost */
515 2, /* vec_int_stmt_cost */
516 2, /* vec_fp_stmt_cost */
517 2, /* vec_permute_cost */
518 4, /* vec_to_scalar_cost */
519 4, /* scalar_to_vec_cost */
520 10, /* vec_align_load_cost */
521 10, /* vec_unalign_load_cost */
522 2, /* vec_unalign_store_cost */
523 2, /* vec_store_cost */
524 2, /* cond_taken_branch_cost */
525 1 /* cond_not_taken_branch_cost */
526 };
527
528 /* Costs for vector insn classes for Vulcan. */
529 static const struct cpu_vector_cost thunderx2t99_vector_cost =
530 {
531 1, /* scalar_int_stmt_cost */
532 6, /* scalar_fp_stmt_cost */
533 4, /* scalar_load_cost */
534 1, /* scalar_store_cost */
535 5, /* vec_int_stmt_cost */
536 6, /* vec_fp_stmt_cost */
537 3, /* vec_permute_cost */
538 6, /* vec_to_scalar_cost */
539 5, /* scalar_to_vec_cost */
540 8, /* vec_align_load_cost */
541 8, /* vec_unalign_load_cost */
542 4, /* vec_unalign_store_cost */
543 4, /* vec_store_cost */
544 2, /* cond_taken_branch_cost */
545 1 /* cond_not_taken_branch_cost */
546 };
547
548 /* Generic costs for branch instructions. */
549 static const struct cpu_branch_cost generic_branch_cost =
550 {
551 1, /* Predictable. */
552 3 /* Unpredictable. */
553 };
554
555 /* Generic approximation modes. */
556 static const cpu_approx_modes generic_approx_modes =
557 {
558 AARCH64_APPROX_NONE, /* division */
559 AARCH64_APPROX_NONE, /* sqrt */
560 AARCH64_APPROX_NONE /* recip_sqrt */
561 };
562
563 /* Approximation modes for Exynos M1. */
564 static const cpu_approx_modes exynosm1_approx_modes =
565 {
566 AARCH64_APPROX_NONE, /* division */
567 AARCH64_APPROX_ALL, /* sqrt */
568 AARCH64_APPROX_ALL /* recip_sqrt */
569 };
570
571 /* Approximation modes for X-Gene 1. */
572 static const cpu_approx_modes xgene1_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_ALL /* recip_sqrt */
577 };
578
579 /* Generic prefetch settings (which disable prefetch). */
580 static const cpu_prefetch_tune generic_prefetch_tune =
581 {
582 0, /* num_slots */
583 -1, /* l1_cache_size */
584 -1, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
587 };
588
589 static const cpu_prefetch_tune exynosm1_prefetch_tune =
590 {
591 0, /* num_slots */
592 -1, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 -1, /* l2_cache_size */
595 -1 /* default_opt_level */
596 };
597
598 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
599 {
600 4, /* num_slots */
601 32, /* l1_cache_size */
602 64, /* l1_cache_line_size */
603 1024, /* l2_cache_size */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
608 {
609 8, /* num_slots */
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 16*1024, /* l2_cache_size */
613 3 /* default_opt_level */
614 };
615
616 static const cpu_prefetch_tune thunderx_prefetch_tune =
617 {
618 8, /* num_slots */
619 32, /* l1_cache_size */
620 128, /* l1_cache_line_size */
621 -1, /* l2_cache_size */
622 -1 /* default_opt_level */
623 };
624
625 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
626 {
627 8, /* num_slots */
628 32, /* l1_cache_size */
629 64, /* l1_cache_line_size */
630 256, /* l2_cache_size */
631 -1 /* default_opt_level */
632 };
633
634 static const struct tune_params generic_tunings =
635 {
636 &cortexa57_extra_costs,
637 &generic_addrcost_table,
638 &generic_regmove_cost,
639 &generic_vector_cost,
640 &generic_branch_cost,
641 &generic_approx_modes,
642 4, /* memmov_cost */
643 2, /* issue_rate */
644 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
645 8, /* function_align. */
646 4, /* jump_align. */
647 8, /* loop_align. */
648 2, /* int_reassoc_width. */
649 4, /* fp_reassoc_width. */
650 1, /* vec_reassoc_width. */
651 2, /* min_div_recip_mul_sf. */
652 2, /* min_div_recip_mul_df. */
653 0, /* max_case_values. */
654 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
655 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
656 &generic_prefetch_tune
657 };
658
659 static const struct tune_params cortexa35_tunings =
660 {
661 &cortexa53_extra_costs,
662 &generic_addrcost_table,
663 &cortexa53_regmove_cost,
664 &generic_vector_cost,
665 &generic_branch_cost,
666 &generic_approx_modes,
667 4, /* memmov_cost */
668 1, /* issue_rate */
669 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
670 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
671 16, /* function_align. */
672 4, /* jump_align. */
673 8, /* loop_align. */
674 2, /* int_reassoc_width. */
675 4, /* fp_reassoc_width. */
676 1, /* vec_reassoc_width. */
677 2, /* min_div_recip_mul_sf. */
678 2, /* min_div_recip_mul_df. */
679 0, /* max_case_values. */
680 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
681 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
682 &generic_prefetch_tune
683 };
684
685 static const struct tune_params cortexa53_tunings =
686 {
687 &cortexa53_extra_costs,
688 &generic_addrcost_table,
689 &cortexa53_regmove_cost,
690 &generic_vector_cost,
691 &generic_branch_cost,
692 &generic_approx_modes,
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
696 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
697 16, /* function_align. */
698 4, /* jump_align. */
699 8, /* loop_align. */
700 2, /* int_reassoc_width. */
701 4, /* fp_reassoc_width. */
702 1, /* vec_reassoc_width. */
703 2, /* min_div_recip_mul_sf. */
704 2, /* min_div_recip_mul_df. */
705 0, /* max_case_values. */
706 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
707 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
708 &generic_prefetch_tune
709 };
710
711 static const struct tune_params cortexa57_tunings =
712 {
713 &cortexa57_extra_costs,
714 &generic_addrcost_table,
715 &cortexa57_regmove_cost,
716 &cortexa57_vector_cost,
717 &generic_branch_cost,
718 &generic_approx_modes,
719 4, /* memmov_cost */
720 3, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
723 16, /* function_align. */
724 4, /* jump_align. */
725 8, /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa72_tunings =
738 {
739 &cortexa57_extra_costs,
740 &generic_addrcost_table,
741 &cortexa57_regmove_cost,
742 &cortexa57_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 4, /* memmov_cost */
746 3, /* issue_rate */
747 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
748 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
749 16, /* function_align. */
750 4, /* jump_align. */
751 8, /* loop_align. */
752 2, /* int_reassoc_width. */
753 4, /* fp_reassoc_width. */
754 1, /* vec_reassoc_width. */
755 2, /* min_div_recip_mul_sf. */
756 2, /* min_div_recip_mul_df. */
757 0, /* max_case_values. */
758 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
759 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
760 &generic_prefetch_tune
761 };
762
763 static const struct tune_params cortexa73_tunings =
764 {
765 &cortexa57_extra_costs,
766 &generic_addrcost_table,
767 &cortexa57_regmove_cost,
768 &cortexa57_vector_cost,
769 &generic_branch_cost,
770 &generic_approx_modes,
771 4, /* memmov_cost. */
772 2, /* issue_rate. */
773 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
774 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
775 16, /* function_align. */
776 4, /* jump_align. */
777 8, /* loop_align. */
778 2, /* int_reassoc_width. */
779 4, /* fp_reassoc_width. */
780 1, /* vec_reassoc_width. */
781 2, /* min_div_recip_mul_sf. */
782 2, /* min_div_recip_mul_df. */
783 0, /* max_case_values. */
784 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
785 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
786 &generic_prefetch_tune
787 };
788
789
790
791 static const struct tune_params exynosm1_tunings =
792 {
793 &exynosm1_extra_costs,
794 &exynosm1_addrcost_table,
795 &exynosm1_regmove_cost,
796 &exynosm1_vector_cost,
797 &generic_branch_cost,
798 &exynosm1_approx_modes,
799 4, /* memmov_cost */
800 3, /* issue_rate */
801 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
802 4, /* function_align. */
803 4, /* jump_align. */
804 4, /* loop_align. */
805 2, /* int_reassoc_width. */
806 4, /* fp_reassoc_width. */
807 1, /* vec_reassoc_width. */
808 2, /* min_div_recip_mul_sf. */
809 2, /* min_div_recip_mul_df. */
810 48, /* max_case_values. */
811 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
812 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
813 &exynosm1_prefetch_tune
814 };
815
816 static const struct tune_params thunderxt88_tunings =
817 {
818 &thunderx_extra_costs,
819 &generic_addrcost_table,
820 &thunderx_regmove_cost,
821 &thunderx_vector_cost,
822 &generic_branch_cost,
823 &generic_approx_modes,
824 6, /* memmov_cost */
825 2, /* issue_rate */
826 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
827 8, /* function_align. */
828 8, /* jump_align. */
829 8, /* loop_align. */
830 2, /* int_reassoc_width. */
831 4, /* fp_reassoc_width. */
832 1, /* vec_reassoc_width. */
833 2, /* min_div_recip_mul_sf. */
834 2, /* min_div_recip_mul_df. */
835 0, /* max_case_values. */
836 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
837 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
838 &thunderxt88_prefetch_tune
839 };
840
841 static const struct tune_params thunderx_tunings =
842 {
843 &thunderx_extra_costs,
844 &generic_addrcost_table,
845 &thunderx_regmove_cost,
846 &thunderx_vector_cost,
847 &generic_branch_cost,
848 &generic_approx_modes,
849 6, /* memmov_cost */
850 2, /* issue_rate */
851 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
852 8, /* function_align. */
853 8, /* jump_align. */
854 8, /* loop_align. */
855 2, /* int_reassoc_width. */
856 4, /* fp_reassoc_width. */
857 1, /* vec_reassoc_width. */
858 2, /* min_div_recip_mul_sf. */
859 2, /* min_div_recip_mul_df. */
860 0, /* max_case_values. */
861 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
862 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
863 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
864 &thunderx_prefetch_tune
865 };
866
867 static const struct tune_params xgene1_tunings =
868 {
869 &xgene1_extra_costs,
870 &xgene1_addrcost_table,
871 &xgene1_regmove_cost,
872 &xgene1_vector_cost,
873 &generic_branch_cost,
874 &xgene1_approx_modes,
875 6, /* memmov_cost */
876 4, /* issue_rate */
877 AARCH64_FUSE_NOTHING, /* fusible_ops */
878 16, /* function_align. */
879 8, /* jump_align. */
880 16, /* loop_align. */
881 2, /* int_reassoc_width. */
882 4, /* fp_reassoc_width. */
883 1, /* vec_reassoc_width. */
884 2, /* min_div_recip_mul_sf. */
885 2, /* min_div_recip_mul_df. */
886 0, /* max_case_values. */
887 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
888 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
889 &generic_prefetch_tune
890 };
891
892 static const struct tune_params qdf24xx_tunings =
893 {
894 &qdf24xx_extra_costs,
895 &qdf24xx_addrcost_table,
896 &qdf24xx_regmove_cost,
897 &qdf24xx_vector_cost,
898 &generic_branch_cost,
899 &generic_approx_modes,
900 4, /* memmov_cost */
901 4, /* issue_rate */
902 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
903    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
904 16, /* function_align. */
905 8, /* jump_align. */
906 16, /* loop_align. */
907 2, /* int_reassoc_width. */
908 4, /* fp_reassoc_width. */
909 1, /* vec_reassoc_width. */
910 2, /* min_div_recip_mul_sf. */
911 2, /* min_div_recip_mul_df. */
912 0, /* max_case_values. */
913 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
914 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
915 &qdf24xx_prefetch_tune
916 };
917
918 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
919 for now. */
920 static const struct tune_params saphira_tunings =
921 {
922 &generic_extra_costs,
923 &generic_addrcost_table,
924 &generic_regmove_cost,
925 &generic_vector_cost,
926 &generic_branch_cost,
927 &generic_approx_modes,
928 4, /* memmov_cost */
929 4, /* issue_rate */
930 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
931    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
932 16, /* function_align. */
933 8, /* jump_align. */
934 16, /* loop_align. */
935 2, /* int_reassoc_width. */
936 4, /* fp_reassoc_width. */
937 1, /* vec_reassoc_width. */
938 2, /* min_div_recip_mul_sf. */
939 2, /* min_div_recip_mul_df. */
940 0, /* max_case_values. */
941 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
942 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
943 &generic_prefetch_tune
944 };
945
946 static const struct tune_params thunderx2t99_tunings =
947 {
948 &thunderx2t99_extra_costs,
949 &thunderx2t99_addrcost_table,
950 &thunderx2t99_regmove_cost,
951 &thunderx2t99_vector_cost,
952 &generic_branch_cost,
953 &generic_approx_modes,
954 4, /* memmov_cost. */
955 4, /* issue_rate. */
956 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
957 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
958 16, /* function_align. */
959 8, /* jump_align. */
960 16, /* loop_align. */
961 3, /* int_reassoc_width. */
962 2, /* fp_reassoc_width. */
963 2, /* vec_reassoc_width. */
964 2, /* min_div_recip_mul_sf. */
965 2, /* min_div_recip_mul_df. */
966 0, /* max_case_values. */
967 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
968 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
969 &thunderx2t99_prefetch_tune
970 };
971
972 /* Support for fine-grained override of the tuning structures. */
973 struct aarch64_tuning_override_function
974 {
975 const char* name;
976 void (*parse_override)(const char*, struct tune_params*);
977 };
978
979 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
980 static void aarch64_parse_tune_string (const char*, struct tune_params*);
981
982 static const struct aarch64_tuning_override_function
983 aarch64_tuning_override_functions[] =
984 {
985 { "fuse", aarch64_parse_fuse_string },
986 { "tune", aarch64_parse_tune_string },
987 { NULL, NULL }
988 };
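/* Editorial usage note, not in the original file: these parsers back the
   -moverride= command-line option, so an option of the form
   -moverride=tune=... or -moverride=fuse=... is dispatched through
   aarch64_parse_tune_string / aarch64_parse_fuse_string, with the valid
   token names being those listed in aarch64-tuning-flags.def and
   aarch64-fusion-pairs.def included above (e.g. something like
   -moverride=tune=rename_fma_regs, exact spellings per the .def files).  */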
989
990 /* A processor implementing AArch64. */
991 struct processor
992 {
993 const char *const name;
994 enum aarch64_processor ident;
995 enum aarch64_processor sched_core;
996 enum aarch64_arch arch;
997 unsigned architecture_version;
998 const unsigned long flags;
999 const struct tune_params *const tune;
1000 };
1001
1002 /* Architectures implementing AArch64. */
1003 static const struct processor all_architectures[] =
1004 {
1005 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1006 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1007 #include "aarch64-arches.def"
1008 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1009 };
1010
1011 /* Processor cores implementing AArch64. */
1012 static const struct processor all_cores[] =
1013 {
1014 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1015 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1016 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1017 FLAGS, &COSTS##_tunings},
1018 #include "aarch64-cores.def"
1019 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1020 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1021 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1022 };
1023
1024
1025 /* Target specification. These are populated by the -march, -mtune, -mcpu
1026 handling code or by target attributes. */
1027 static const struct processor *selected_arch;
1028 static const struct processor *selected_cpu;
1029 static const struct processor *selected_tune;
1030
1031 /* The current tuning set. */
1032 struct tune_params aarch64_tune_params = generic_tunings;
1033
1034 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1035
1036 /* An ISA extension in the co-processor and main instruction set space. */
1037 struct aarch64_option_extension
1038 {
1039 const char *const name;
1040 const unsigned long flags_on;
1041 const unsigned long flags_off;
1042 };
1043
1044 typedef enum aarch64_cond_code
1045 {
1046 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1047 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1048 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1049 }
1050 aarch64_cc;
1051
1052 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1053
1054 /* The condition codes of the processor, and the inverse function. */
1055 static const char * const aarch64_condition_codes[] =
1056 {
1057 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1058 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1059 };
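/* Editorial example, not from the original source: the encoding above places
   each condition next to its inverse, so
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT, matching the
   "eq"/"ne" and "ge"/"lt" pairs in aarch64_condition_codes.  */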
1060
1061 /* Generate code to enable conditional branches in functions over 1 MiB. */
1062 const char *
1063 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1064 const char * branch_format)
1065 {
1066 rtx_code_label * tmp_label = gen_label_rtx ();
1067 char label_buf[256];
1068 char buffer[128];
1069 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1070 CODE_LABEL_NUMBER (tmp_label));
1071 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1072 rtx dest_label = operands[pos_label];
1073 operands[pos_label] = tmp_label;
1074
1075 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1076 output_asm_insn (buffer, operands);
1077
1078 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1079 operands[pos_label] = dest_label;
1080 output_asm_insn (buffer, operands);
1081 return "";
1082 }
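/* Editorial sketch of the expected output, not in the original file: the
   caller is expected to pass the inverted condition in BRANCH_FORMAT, so an
   out-of-range "cbz x0, <dest>" would be emitted roughly as:

	cbnz	x0, .Ltmp	// BRANCH_FORMAT followed by the local label
	b	<dest>		// unconditional B has a +/-128 MiB range
   .Ltmp:

   where .Ltmp stands for the internal label generated above.  */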
1083
1084 void
1085 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1086 {
1087 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1088 if (TARGET_GENERAL_REGS_ONLY)
1089 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1090 else
1091 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1092 }
1093
1094 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1095 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1096 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1097 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1098 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1099    irrespective of its cost results in bad allocations with many redundant
1100 int<->FP moves which are expensive on various cores.
1101 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1102 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1103 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1104 Otherwise set the allocno class depending on the mode.
1105 The result of this is that it is no longer inefficient to have a higher
1106 memory move cost than the register move cost.
1107 */
1108
1109 static reg_class_t
1110 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1111 reg_class_t best_class)
1112 {
1113 machine_mode mode;
1114
1115 if (allocno_class != ALL_REGS)
1116 return allocno_class;
1117
1118 if (best_class != ALL_REGS)
1119 return best_class;
1120
1121 mode = PSEUDO_REGNO_MODE (regno);
1122 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1123 }
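/* Editorial example, not from the original source: a pseudo holding a DFmode
   value whose allocno class and best class are both ALL_REGS is steered to
   FP_REGS by the code above, while a DImode pseudo in the same situation is
   steered to GENERAL_REGS; pseudos that already have a narrower class are
   returned unchanged.  */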
1124
1125 static unsigned int
1126 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1127 {
1128 if (GET_MODE_UNIT_SIZE (mode) == 4)
1129 return aarch64_tune_params.min_div_recip_mul_sf;
1130 return aarch64_tune_params.min_div_recip_mul_df;
1131 }
1132
1133 /* Return the reassociation width of treeop OPC with mode MODE. */
1134 static int
1135 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1136 {
1137 if (VECTOR_MODE_P (mode))
1138 return aarch64_tune_params.vec_reassoc_width;
1139 if (INTEGRAL_MODE_P (mode))
1140 return aarch64_tune_params.int_reassoc_width;
1141 /* Avoid reassociating floating point addition so we emit more FMAs. */
1142 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1143 return aarch64_tune_params.fp_reassoc_width;
1144 return 1;
1145 }
1146
1147 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1148 unsigned
1149 aarch64_dbx_register_number (unsigned regno)
1150 {
1151 if (GP_REGNUM_P (regno))
1152 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1153 else if (regno == SP_REGNUM)
1154 return AARCH64_DWARF_SP;
1155 else if (FP_REGNUM_P (regno))
1156 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1157 else if (PR_REGNUM_P (regno))
1158 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1159 else if (regno == VG_REGNUM)
1160 return AARCH64_DWARF_VG;
1161
1162 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1163 equivalent DWARF register. */
1164 return DWARF_FRAME_REGISTERS;
1165 }
1166
1167 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1168 static bool
1169 aarch64_advsimd_struct_mode_p (machine_mode mode)
1170 {
1171 return (TARGET_SIMD
1172 && (mode == OImode || mode == CImode || mode == XImode));
1173 }
1174
1175 /* Return true if MODE is an SVE predicate mode. */
1176 static bool
1177 aarch64_sve_pred_mode_p (machine_mode mode)
1178 {
1179 return (TARGET_SVE
1180 && (mode == VNx16BImode
1181 || mode == VNx8BImode
1182 || mode == VNx4BImode
1183 || mode == VNx2BImode));
1184 }
1185
1186 /* Three mutually-exclusive flags describing a vector or predicate type. */
1187 const unsigned int VEC_ADVSIMD = 1;
1188 const unsigned int VEC_SVE_DATA = 2;
1189 const unsigned int VEC_SVE_PRED = 4;
1190 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1191 a structure of 2, 3 or 4 vectors. */
1192 const unsigned int VEC_STRUCT = 8;
1193 /* Useful combinations of the above. */
1194 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1195 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1196
1197 /* Return a set of flags describing the vector properties of mode MODE.
1198 Ignore modes that are not supported by the current target. */
1199 static unsigned int
1200 aarch64_classify_vector_mode (machine_mode mode)
1201 {
1202 if (aarch64_advsimd_struct_mode_p (mode))
1203 return VEC_ADVSIMD | VEC_STRUCT;
1204
1205 if (aarch64_sve_pred_mode_p (mode))
1206 return VEC_SVE_PRED;
1207
1208 scalar_mode inner = GET_MODE_INNER (mode);
1209 if (VECTOR_MODE_P (mode)
1210 && (inner == QImode
1211 || inner == HImode
1212 || inner == HFmode
1213 || inner == SImode
1214 || inner == SFmode
1215 || inner == DImode
1216 || inner == DFmode))
1217 {
1218 if (TARGET_SVE)
1219 {
1220 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1221 return VEC_SVE_DATA;
1222 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1223 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1224 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1225 return VEC_SVE_DATA | VEC_STRUCT;
1226 }
1227
1228 /* This includes V1DF but not V1DI (which doesn't exist). */
1229 if (TARGET_SIMD
1230 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1231 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1232 return VEC_ADVSIMD;
1233 }
1234
1235 return 0;
1236 }
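/* Editorial examples, assuming typical targets rather than taken from the
   original file: with Advanced SIMD enabled, V4SImode (128 bits) and
   V2SImode (64 bits) classify as VEC_ADVSIMD and OImode as
   VEC_ADVSIMD | VEC_STRUCT; with SVE enabled, VNx4SImode classifies as
   VEC_SVE_DATA and VNx4BImode as VEC_SVE_PRED.  Scalar modes return 0.  */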
1237
1238 /* Return true if MODE is any of the data vector modes, including
1239 structure modes. */
1240 static bool
1241 aarch64_vector_data_mode_p (machine_mode mode)
1242 {
1243 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1244 }
1245
1246 /* Return true if MODE is an SVE data vector mode; either a single vector
1247 or a structure of vectors. */
1248 static bool
1249 aarch64_sve_data_mode_p (machine_mode mode)
1250 {
1251 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1252 }
1253
1254 /* Implement target hook TARGET_ARRAY_MODE. */
1255 static opt_machine_mode
1256 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1257 {
1258 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1259 && IN_RANGE (nelems, 2, 4))
1260 return mode_for_vector (GET_MODE_INNER (mode),
1261 GET_MODE_NUNITS (mode) * nelems);
1262
1263 return opt_machine_mode ();
1264 }
1265
1266 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1267 static bool
1268 aarch64_array_mode_supported_p (machine_mode mode,
1269 unsigned HOST_WIDE_INT nelems)
1270 {
1271 if (TARGET_SIMD
1272 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1273 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1274 && (nelems >= 2 && nelems <= 4))
1275 return true;
1276
1277 return false;
1278 }
1279
1280 /* Return the SVE predicate mode to use for elements that have
1281 ELEM_NBYTES bytes, if such a mode exists. */
1282
1283 opt_machine_mode
1284 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1285 {
1286 if (TARGET_SVE)
1287 {
1288 if (elem_nbytes == 1)
1289 return VNx16BImode;
1290 if (elem_nbytes == 2)
1291 return VNx8BImode;
1292 if (elem_nbytes == 4)
1293 return VNx4BImode;
1294 if (elem_nbytes == 8)
1295 return VNx2BImode;
1296 }
1297 return opt_machine_mode ();
1298 }
1299
1300 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1301
1302 static opt_machine_mode
1303 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1304 {
1305 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1306 {
1307 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1308 machine_mode pred_mode;
1309 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1310 return pred_mode;
1311 }
1312
1313 return default_get_mask_mode (nunits, nbytes);
1314 }
1315
1316 /* Implement TARGET_HARD_REGNO_NREGS. */
1317
1318 static unsigned int
1319 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1320 {
1321 /* ??? Logically we should only need to provide a value when
1322 HARD_REGNO_MODE_OK says that the combination is valid,
1323 but at the moment we need to handle all modes. Just ignore
1324 any runtime parts for registers that can't store them. */
1325 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1326 switch (aarch64_regno_regclass (regno))
1327 {
1328 case FP_REGS:
1329 case FP_LO_REGS:
1330 if (aarch64_sve_data_mode_p (mode))
1331 return exact_div (GET_MODE_SIZE (mode),
1332 BYTES_PER_SVE_VECTOR).to_constant ();
1333 return CEIL (lowest_size, UNITS_PER_VREG);
1334 case PR_REGS:
1335 case PR_LO_REGS:
1336 case PR_HI_REGS:
1337 return 1;
1338 default:
1339 return CEIL (lowest_size, UNITS_PER_WORD);
1340 }
1341 gcc_unreachable ();
1342 }
1343
1344 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1345
1346 static bool
1347 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1348 {
1349 if (GET_MODE_CLASS (mode) == MODE_CC)
1350 return regno == CC_REGNUM;
1351
1352 if (regno == VG_REGNUM)
1353 /* This must have the same size as _Unwind_Word. */
1354 return mode == DImode;
1355
1356 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1357 if (vec_flags & VEC_SVE_PRED)
1358 return PR_REGNUM_P (regno);
1359
1360 if (PR_REGNUM_P (regno))
1361 return 0;
1362
1363 if (regno == SP_REGNUM)
1364 /* The purpose of comparing with ptr_mode is to support the
1365 global register variable associated with the stack pointer
1366 register via the syntax of asm ("wsp") in ILP32. */
1367 return mode == Pmode || mode == ptr_mode;
1368
1369 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1370 return mode == Pmode;
1371
1372 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1373 return true;
1374
1375 if (FP_REGNUM_P (regno))
1376 {
1377 if (vec_flags & VEC_STRUCT)
1378 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1379 else
1380 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1381 }
1382
1383 return false;
1384 }
1385
1386 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1387 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1388 clobbers the top 64 bits when restoring the bottom 64 bits. */
1389
1390 static bool
1391 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1392 {
1393 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1394 }
1395
1396 /* Implement REGMODE_NATURAL_SIZE. */
1397 poly_uint64
1398 aarch64_regmode_natural_size (machine_mode mode)
1399 {
1400 /* The natural size for SVE data modes is one SVE data vector,
1401 and similarly for predicates. We can't independently modify
1402 anything smaller than that. */
1403 /* ??? For now, only do this for variable-width SVE registers.
1404 Doing it for constant-sized registers breaks lower-subreg.c. */
1405 /* ??? And once that's fixed, we should probably have similar
1406 code for Advanced SIMD. */
1407 if (!aarch64_sve_vg.is_constant ())
1408 {
1409 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1410 if (vec_flags & VEC_SVE_PRED)
1411 return BYTES_PER_SVE_PRED;
1412 if (vec_flags & VEC_SVE_DATA)
1413 return BYTES_PER_SVE_VECTOR;
1414 }
1415 return UNITS_PER_WORD;
1416 }
1417
1418 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1419 machine_mode
1420 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1421 machine_mode mode)
1422 {
1423 /* The predicate mode determines which bits are significant and
1424 which are "don't care". Decreasing the number of lanes would
1425 lose data while increasing the number of lanes would make bits
1426 unnecessarily significant. */
1427 if (PR_REGNUM_P (regno))
1428 return mode;
1429 if (known_ge (GET_MODE_SIZE (mode), 4))
1430 return mode;
1431 else
1432 return SImode;
1433 }
1434
1435 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1436 that strcpy from constants will be faster. */
1437
1438 static HOST_WIDE_INT
1439 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1440 {
1441 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1442 return MAX (align, BITS_PER_WORD);
1443 return align;
1444 }
1445
1446 /* Return true if calls to DECL should be treated as
1447    long-calls (i.e. called via a register).  */
1448 static bool
1449 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1450 {
1451 return false;
1452 }
1453
1454 /* Return true if calls to symbol-ref SYM should be treated as
1455    long-calls (i.e. called via a register).  */
1456 bool
1457 aarch64_is_long_call_p (rtx sym)
1458 {
1459 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1460 }
1461
1462 /* Return true if calls to symbol-ref SYM should not go through
1463 plt stubs. */
1464
1465 bool
1466 aarch64_is_noplt_call_p (rtx sym)
1467 {
1468 const_tree decl = SYMBOL_REF_DECL (sym);
1469
1470 if (flag_pic
1471 && decl
1472 && (!flag_plt
1473 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1474 && !targetm.binds_local_p (decl))
1475 return true;
1476
1477 return false;
1478 }
1479
1480 /* Return true if the offsets to a zero/sign-extract operation
1481 represent an expression that matches an extend operation. The
1482    operands represent the parameters from
1483
1484 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1485 bool
1486 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1487 rtx extract_imm)
1488 {
1489 HOST_WIDE_INT mult_val, extract_val;
1490
1491 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1492 return false;
1493
1494 mult_val = INTVAL (mult_imm);
1495 extract_val = INTVAL (extract_imm);
1496
1497 if (extract_val > 8
1498 && extract_val < GET_MODE_BITSIZE (mode)
1499 && exact_log2 (extract_val & ~7) > 0
1500 && (extract_val & 7) <= 4
1501 && mult_val == (1 << (extract_val & 7)))
1502 return true;
1503
1504 return false;
1505 }
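/* Editorial worked example, not part of the original source: with
   MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 the function returns
   true, since 34 > 8, 34 < 64, (34 & ~7) == 32 is a power of two,
   (34 & 7) == 2 <= 4 and 4 == 1 << 2; this corresponds to extending a
   32-bit value that has been scaled by four (shifted left by two).  */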
1506
1507 /* Emit an insn that's a simple single-set. Both the operands must be
1508 known to be valid. */
1509 inline static rtx_insn *
1510 emit_set_insn (rtx x, rtx y)
1511 {
1512 return emit_insn (gen_rtx_SET (x, y));
1513 }
1514
1515 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1516    return the rtx for the CC register in the appropriate mode.  */
1517 rtx
1518 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1519 {
1520 machine_mode mode = SELECT_CC_MODE (code, x, y);
1521 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1522
1523 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1524 return cc_reg;
1525 }
1526
1527 /* Build the SYMBOL_REF for __tls_get_addr. */
1528
1529 static GTY(()) rtx tls_get_addr_libfunc;
1530
1531 rtx
1532 aarch64_tls_get_addr (void)
1533 {
1534 if (!tls_get_addr_libfunc)
1535 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1536 return tls_get_addr_libfunc;
1537 }
1538
1539 /* Return the TLS model to use for ADDR. */
1540
1541 static enum tls_model
1542 tls_symbolic_operand_type (rtx addr)
1543 {
1544 enum tls_model tls_kind = TLS_MODEL_NONE;
1545 if (GET_CODE (addr) == CONST)
1546 {
1547 poly_int64 addend;
1548 rtx sym = strip_offset (addr, &addend);
1549 if (GET_CODE (sym) == SYMBOL_REF)
1550 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1551 }
1552 else if (GET_CODE (addr) == SYMBOL_REF)
1553 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1554
1555 return tls_kind;
1556 }
1557
1558 /* We allow LO_SUMs in our legitimate addresses so that combine can
1559    take care of combining addresses where necessary, but for generation
1560    purposes we generate the address
1561    as:
1562 RTL Absolute
1563 tmp = hi (symbol_ref); adrp x1, foo
1564 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1565 nop
1566
1567 PIC TLS
1568 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1569 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1570 bl __tls_get_addr
1571 nop
1572
1573 Load TLS symbol, depending on TLS mechanism and TLS access model.
1574
1575 Global Dynamic - Traditional TLS:
1576 adrp tmp, :tlsgd:imm
1577 add dest, tmp, #:tlsgd_lo12:imm
1578 bl __tls_get_addr
1579
1580 Global Dynamic - TLS Descriptors:
1581 adrp dest, :tlsdesc:imm
1582 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1583 add dest, dest, #:tlsdesc_lo12:imm
1584 blr tmp
1585 mrs tp, tpidr_el0
1586 add dest, dest, tp
1587
1588 Initial Exec:
1589 mrs tp, tpidr_el0
1590 adrp tmp, :gottprel:imm
1591 ldr dest, [tmp, #:gottprel_lo12:imm]
1592 add dest, dest, tp
1593
1594 Local Exec:
1595 mrs tp, tpidr_el0
1596 add t0, tp, #:tprel_hi12:imm, lsl #12
1597 add t0, t0, #:tprel_lo12_nc:imm
1598 */
1599
1600 static void
1601 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1602 enum aarch64_symbol_type type)
1603 {
1604 switch (type)
1605 {
1606 case SYMBOL_SMALL_ABSOLUTE:
1607 {
1608 /* In ILP32, the mode of dest can be either SImode or DImode. */
1609 rtx tmp_reg = dest;
1610 machine_mode mode = GET_MODE (dest);
1611
1612 gcc_assert (mode == Pmode || mode == ptr_mode);
1613
1614 if (can_create_pseudo_p ())
1615 tmp_reg = gen_reg_rtx (mode);
1616
1617 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1618 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1619 return;
1620 }
1621
1622 case SYMBOL_TINY_ABSOLUTE:
1623 emit_insn (gen_rtx_SET (dest, imm));
1624 return;
1625
1626 case SYMBOL_SMALL_GOT_28K:
1627 {
1628 machine_mode mode = GET_MODE (dest);
1629 rtx gp_rtx = pic_offset_table_rtx;
1630 rtx insn;
1631 rtx mem;
1632
1633 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1634 	   here before rtl expansion.  Tree IVOPTS will generate an rtl
1635 	   pattern to decide rtx costs, in which case pic_offset_table_rtx is
1636 	   not initialized.  In that case there is no need to generate the
1637 	   first adrp instruction, as the final cost for global variable
1638 	   access is one instruction.  */
1639 if (gp_rtx != NULL)
1640 {
1641 	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
1642 	       we use the page base as the GOT base, the first page may be
1643 	       wasted; in the worst case there is only 28K of space for the GOT).
1644
1645 	       The generated instruction sequence for accessing a global
1646 	       variable is:
1647
1648 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1649
1650 	       Only one instruction is needed, but we must initialize
1651 	       pic_offset_table_rtx properly.  We generate an initialization
1652 	       insn for every global access, and allow CSE to remove all redundant ones.
1653
1654 	       The final instruction sequence will look like the following
1655 	       for multiple global variable accesses.
1656
1657 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1658
1659 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1660 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1661 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1662 ... */
1663
1664 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1665 crtl->uses_pic_offset_table = 1;
1666 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1667
1668 if (mode != GET_MODE (gp_rtx))
1669 gp_rtx = gen_lowpart (mode, gp_rtx);
1670
1671 }
1672
1673 if (mode == ptr_mode)
1674 {
1675 if (mode == DImode)
1676 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1677 else
1678 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1679
1680 mem = XVECEXP (SET_SRC (insn), 0, 0);
1681 }
1682 else
1683 {
1684 gcc_assert (mode == Pmode);
1685
1686 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1687 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1688 }
1689
1690 	/* The operand is expected to be a MEM.  Whenever the related insn
1691 	   pattern changes, the above code which calculates mem should be
1692 	   updated.  */
1693 gcc_assert (GET_CODE (mem) == MEM);
1694 MEM_READONLY_P (mem) = 1;
1695 MEM_NOTRAP_P (mem) = 1;
1696 emit_insn (insn);
1697 return;
1698 }
1699
1700 case SYMBOL_SMALL_GOT_4G:
1701 {
1702 /* In ILP32, the mode of dest can be either SImode or DImode,
1703 while the got entry is always of SImode size. The mode of
1704 dest depends on how dest is used: if dest is assigned to a
1705 pointer (e.g. in the memory), it has SImode; it may have
1706 	   DImode if dest is dereferenced to access the memory.
1707 This is why we have to handle three different ldr_got_small
1708 patterns here (two patterns for ILP32). */
1709
1710 rtx insn;
1711 rtx mem;
1712 rtx tmp_reg = dest;
1713 machine_mode mode = GET_MODE (dest);
1714
1715 if (can_create_pseudo_p ())
1716 tmp_reg = gen_reg_rtx (mode);
1717
1718 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1719 if (mode == ptr_mode)
1720 {
1721 if (mode == DImode)
1722 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1723 else
1724 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1725
1726 mem = XVECEXP (SET_SRC (insn), 0, 0);
1727 }
1728 else
1729 {
1730 gcc_assert (mode == Pmode);
1731
1732 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1733 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1734 }
1735
1736 gcc_assert (GET_CODE (mem) == MEM);
1737 MEM_READONLY_P (mem) = 1;
1738 MEM_NOTRAP_P (mem) = 1;
1739 emit_insn (insn);
1740 return;
1741 }
1742
1743 case SYMBOL_SMALL_TLSGD:
1744 {
1745 rtx_insn *insns;
1746 machine_mode mode = GET_MODE (dest);
1747 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1748
1749 start_sequence ();
1750 if (TARGET_ILP32)
1751 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1752 else
1753 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1754 insns = get_insns ();
1755 end_sequence ();
1756
1757 RTL_CONST_CALL_P (insns) = 1;
1758 emit_libcall_block (insns, dest, result, imm);
1759 return;
1760 }
1761
1762 case SYMBOL_SMALL_TLSDESC:
1763 {
1764 machine_mode mode = GET_MODE (dest);
1765 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1766 rtx tp;
1767
1768 gcc_assert (mode == Pmode || mode == ptr_mode);
1769
1770 /* In ILP32, the got entry is always of SImode size. Unlike
1771 small GOT, the dest is fixed at reg 0. */
1772 if (TARGET_ILP32)
1773 emit_insn (gen_tlsdesc_small_si (imm));
1774 else
1775 emit_insn (gen_tlsdesc_small_di (imm));
1776 tp = aarch64_load_tp (NULL);
1777
1778 if (mode != Pmode)
1779 tp = gen_lowpart (mode, tp);
1780
1781 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1782 if (REG_P (dest))
1783 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1784 return;
1785 }
1786
1787 case SYMBOL_SMALL_TLSIE:
1788 {
1789 /* In ILP32, the mode of dest can be either SImode or DImode,
1790 while the got entry is always of SImode size. The mode of
1791 dest depends on how dest is used: if dest is assigned to a
1792	 pointer (e.g. stored in memory), it has SImode; it may have
1793	 DImode if dest is dereferenced to access the memory.
1794 This is why we have to handle three different tlsie_small
1795 patterns here (two patterns for ILP32). */
1796 machine_mode mode = GET_MODE (dest);
1797 rtx tmp_reg = gen_reg_rtx (mode);
1798 rtx tp = aarch64_load_tp (NULL);
1799
1800 if (mode == ptr_mode)
1801 {
1802 if (mode == DImode)
1803 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1804 else
1805 {
1806 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1807 tp = gen_lowpart (mode, tp);
1808 }
1809 }
1810 else
1811 {
1812 gcc_assert (mode == Pmode);
1813 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1814 }
1815
1816 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1817 if (REG_P (dest))
1818 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1819 return;
1820 }
1821
1822 case SYMBOL_TLSLE12:
1823 case SYMBOL_TLSLE24:
1824 case SYMBOL_TLSLE32:
1825 case SYMBOL_TLSLE48:
1826 {
1827 machine_mode mode = GET_MODE (dest);
1828 rtx tp = aarch64_load_tp (NULL);
1829
1830 if (mode != Pmode)
1831 tp = gen_lowpart (mode, tp);
1832
1833 switch (type)
1834 {
1835 case SYMBOL_TLSLE12:
1836 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1837 (dest, tp, imm));
1838 break;
1839 case SYMBOL_TLSLE24:
1840 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1841 (dest, tp, imm));
1842 break;
1843 case SYMBOL_TLSLE32:
1844 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1845 (dest, imm));
1846 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1847 (dest, dest, tp));
1848 break;
1849 case SYMBOL_TLSLE48:
1850 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1851 (dest, imm));
1852 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1853 (dest, dest, tp));
1854 break;
1855 default:
1856 gcc_unreachable ();
1857 }
1858
1859 if (REG_P (dest))
1860 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1861 return;
1862 }
1863
1864 case SYMBOL_TINY_GOT:
1865 emit_insn (gen_ldr_got_tiny (dest, imm));
1866 return;
1867
1868 case SYMBOL_TINY_TLSIE:
1869 {
1870 machine_mode mode = GET_MODE (dest);
1871 rtx tp = aarch64_load_tp (NULL);
1872
1873 if (mode == ptr_mode)
1874 {
1875 if (mode == DImode)
1876 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1877 else
1878 {
1879 tp = gen_lowpart (mode, tp);
1880 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1881 }
1882 }
1883 else
1884 {
1885 gcc_assert (mode == Pmode);
1886 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1887 }
1888
1889 if (REG_P (dest))
1890 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1891 return;
1892 }
1893
1894 default:
1895 gcc_unreachable ();
1896 }
1897 }
1898
1899 /* Emit a move from SRC to DEST. Assume that the move expanders can
1900 handle all moves if !can_create_pseudo_p (). The distinction is
1901 important because, unlike emit_move_insn, the move expanders know
1902 how to force Pmode objects into the constant pool even when the
1903 constant pool address is not itself legitimate. */
1904 static rtx
1905 aarch64_emit_move (rtx dest, rtx src)
1906 {
1907 return (can_create_pseudo_p ()
1908 ? emit_move_insn (dest, src)
1909 : emit_move_insn_1 (dest, src));
1910 }
1911
1912 /* Split a 128-bit move operation into two 64-bit move operations,
1913 taking care to handle partial overlap of register to register
1914 copies. Special cases are needed when moving between GP regs and
1915 FP regs. SRC can be a register, constant or memory; DST a register
1916 or memory. If either operand is memory it must not have any side
1917 effects. */
1918 void
1919 aarch64_split_128bit_move (rtx dst, rtx src)
1920 {
1921 rtx dst_lo, dst_hi;
1922 rtx src_lo, src_hi;
1923
1924 machine_mode mode = GET_MODE (dst);
1925
1926 gcc_assert (mode == TImode || mode == TFmode);
1927 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1928 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1929
1930 if (REG_P (dst) && REG_P (src))
1931 {
1932 int src_regno = REGNO (src);
1933 int dst_regno = REGNO (dst);
1934
1935 /* Handle FP <-> GP regs. */
1936 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1937 {
1938 src_lo = gen_lowpart (word_mode, src);
1939 src_hi = gen_highpart (word_mode, src);
1940
1941 if (mode == TImode)
1942 {
1943 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1944 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1945 }
1946 else
1947 {
1948 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1949 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1950 }
1951 return;
1952 }
1953 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1954 {
1955 dst_lo = gen_lowpart (word_mode, dst);
1956 dst_hi = gen_highpart (word_mode, dst);
1957
1958 if (mode == TImode)
1959 {
1960 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1961 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1962 }
1963 else
1964 {
1965 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1966 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1967 }
1968 return;
1969 }
1970 }
1971
1972 dst_lo = gen_lowpart (word_mode, dst);
1973 dst_hi = gen_highpart (word_mode, dst);
1974 src_lo = gen_lowpart (word_mode, src);
1975 src_hi = gen_highpart_mode (word_mode, mode, src);
1976
1977 /* At most one pairing may overlap. */
1978 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1979 {
1980 aarch64_emit_move (dst_hi, src_hi);
1981 aarch64_emit_move (dst_lo, src_lo);
1982 }
1983 else
1984 {
1985 aarch64_emit_move (dst_lo, src_lo);
1986 aarch64_emit_move (dst_hi, src_hi);
1987 }
1988 }
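
/* Illustrative example (assumed, not verified compiler output): for a
   little-endian TImode copy from the pair x1:x2 into x0:x1, dst_lo (x0)
   does not overlap src_hi (x2), so the low half is moved first:

	mov	x0, x1
	mov	x1, x2

   whereas a copy from x0:x1 into x1:x2 overlaps in x1, so the high half
   must be moved first to avoid clobbering x1 before it is read.  */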
1989
1990 bool
1991 aarch64_split_128bit_move_p (rtx dst, rtx src)
1992 {
1993 return (! REG_P (src)
1994 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1995 }
1996
1997 /* Split a complex SIMD combine. */
1998
1999 void
2000 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2001 {
2002 machine_mode src_mode = GET_MODE (src1);
2003 machine_mode dst_mode = GET_MODE (dst);
2004
2005 gcc_assert (VECTOR_MODE_P (dst_mode));
2006 gcc_assert (register_operand (dst, dst_mode)
2007 && register_operand (src1, src_mode)
2008 && register_operand (src2, src_mode));
2009
2010 rtx (*gen) (rtx, rtx, rtx);
2011
2012 switch (src_mode)
2013 {
2014 case E_V8QImode:
2015 gen = gen_aarch64_simd_combinev8qi;
2016 break;
2017 case E_V4HImode:
2018 gen = gen_aarch64_simd_combinev4hi;
2019 break;
2020 case E_V2SImode:
2021 gen = gen_aarch64_simd_combinev2si;
2022 break;
2023 case E_V4HFmode:
2024 gen = gen_aarch64_simd_combinev4hf;
2025 break;
2026 case E_V2SFmode:
2027 gen = gen_aarch64_simd_combinev2sf;
2028 break;
2029 case E_DImode:
2030 gen = gen_aarch64_simd_combinedi;
2031 break;
2032 case E_DFmode:
2033 gen = gen_aarch64_simd_combinedf;
2034 break;
2035 default:
2036 gcc_unreachable ();
2037 }
2038
2039 emit_insn (gen (dst, src1, src2));
2040 return;
2041 }
2042
2043 /* Split a complex SIMD move. */
2044
2045 void
2046 aarch64_split_simd_move (rtx dst, rtx src)
2047 {
2048 machine_mode src_mode = GET_MODE (src);
2049 machine_mode dst_mode = GET_MODE (dst);
2050
2051 gcc_assert (VECTOR_MODE_P (dst_mode));
2052
2053 if (REG_P (dst) && REG_P (src))
2054 {
2055 rtx (*gen) (rtx, rtx);
2056
2057 gcc_assert (VECTOR_MODE_P (src_mode));
2058
2059 switch (src_mode)
2060 {
2061 case E_V16QImode:
2062 gen = gen_aarch64_split_simd_movv16qi;
2063 break;
2064 case E_V8HImode:
2065 gen = gen_aarch64_split_simd_movv8hi;
2066 break;
2067 case E_V4SImode:
2068 gen = gen_aarch64_split_simd_movv4si;
2069 break;
2070 case E_V2DImode:
2071 gen = gen_aarch64_split_simd_movv2di;
2072 break;
2073 case E_V8HFmode:
2074 gen = gen_aarch64_split_simd_movv8hf;
2075 break;
2076 case E_V4SFmode:
2077 gen = gen_aarch64_split_simd_movv4sf;
2078 break;
2079 case E_V2DFmode:
2080 gen = gen_aarch64_split_simd_movv2df;
2081 break;
2082 default:
2083 gcc_unreachable ();
2084 }
2085
2086 emit_insn (gen (dst, src));
2087 return;
2088 }
2089 }
2090
2091 bool
2092 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2093 machine_mode ymode, rtx y)
2094 {
2095 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2096 gcc_assert (r != NULL);
2097 return rtx_equal_p (x, r);
2098 }
2099
2100
2101 static rtx
2102 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2103 {
2104 if (can_create_pseudo_p ())
2105 return force_reg (mode, value);
2106 else
2107 {
2108 gcc_assert (x);
2109 aarch64_emit_move (x, value);
2110 return x;
2111 }
2112 }
2113
2114 /* Return true if we can move VALUE into a register using a single
2115 CNT[BHWD] instruction. */
2116
2117 static bool
2118 aarch64_sve_cnt_immediate_p (poly_int64 value)
2119 {
2120 HOST_WIDE_INT factor = value.coeffs[0];
2121 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2122 return (value.coeffs[1] == factor
2123 && IN_RANGE (factor, 2, 16 * 16)
2124 && (factor & 1) == 0
2125 && factor <= 16 * (factor & -factor));
2126 }
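
/* Illustrative example: the poly_int64 (32, 32) -- twice the number of
   bytes in an SVE vector -- satisfies all of the conditions above and can
   be loaded with a single "cntb x0, all, mul #2", whereas (17, 17) cannot,
   because its coefficient is odd.  */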
2127
2128 /* Likewise for rtx X. */
2129
2130 bool
2131 aarch64_sve_cnt_immediate_p (rtx x)
2132 {
2133 poly_int64 value;
2134 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2135 }
2136
2137 /* Return the asm string for an instruction with a CNT-like vector size
2138 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2139 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2140 first part of the operands template (the part that comes before the
2141 vector size itself). FACTOR is the number of quadwords.
2142 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2143 If it is zero, we can use any element size. */
2144
2145 static char *
2146 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2147 unsigned int factor,
2148 unsigned int nelts_per_vq)
2149 {
2150 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2151
2152 if (nelts_per_vq == 0)
2153 /* There is some overlap in the ranges of the four CNT instructions.
2154 Here we always use the smallest possible element size, so that the
2155	 multiplier is 1 wherever possible. */
2156 nelts_per_vq = factor & -factor;
2157 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2158 gcc_assert (IN_RANGE (shift, 1, 4));
2159 char suffix = "dwhb"[shift - 1];
2160
2161 factor >>= shift;
2162 unsigned int written;
2163 if (factor == 1)
2164 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2165 prefix, suffix, operands);
2166 else
2167 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2168 prefix, suffix, operands, factor);
2169 gcc_assert (written < sizeof (buffer));
2170 return buffer;
2171 }
2172
2173 /* Return the asm string for an instruction with a CNT-like vector size
2174 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2175 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2176 first part of the operands template (the part that comes before the
2177 vector size itself). X is the value of the vector size operand,
2178 as a polynomial integer rtx. */
2179
2180 char *
2181 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2182 rtx x)
2183 {
2184 poly_int64 value = rtx_to_poly_int64 (x);
2185 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2186 return aarch64_output_sve_cnt_immediate (prefix, operands,
2187 value.coeffs[1], 0);
2188 }
2189
2190 /* Return true if we can add VALUE to a register using a single ADDVL
2191 or ADDPL instruction. */
2192
2193 static bool
2194 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2195 {
2196 HOST_WIDE_INT factor = value.coeffs[0];
2197 if (factor == 0 || value.coeffs[1] != factor)
2198 return false;
2199 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2200 and a value of 16 is one vector width. */
2201 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2202 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2203 }
2204
2205 /* Likewise for rtx X. */
2206
2207 bool
2208 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2209 {
2210 poly_int64 value;
2211 return (poly_int_rtx_p (x, &value)
2212 && aarch64_sve_addvl_addpl_immediate_p (value));
2213 }
2214
2215 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2216 and storing the result in operand 0. */
2217
2218 char *
2219 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2220 {
2221 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2222 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2223 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2224
2225 /* Use INC or DEC if possible. */
2226 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2227 {
2228 if (aarch64_sve_cnt_immediate_p (offset_value))
2229 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2230 offset_value.coeffs[1], 0);
2231 if (aarch64_sve_cnt_immediate_p (-offset_value))
2232 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2233 -offset_value.coeffs[1], 0);
2234 }
2235
2236 int factor = offset_value.coeffs[1];
2237 if ((factor & 15) == 0)
2238 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2239 else
2240 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2241 return buffer;
2242 }
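
/* Illustrative examples (assumed, not verified compiler output): an offset
   of (16, 16) -- one vector width -- gives "addvl x0, x1, #1", or simply
   "incb x0" when the destination and base are the same GP register, while
   an offset of (2, 2) -- one predicate width -- gives "addpl x0, x1, #1".  */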
2243
2244 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2245 instruction. If it is, store the number of elements in each vector
2246 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2247 factor in *FACTOR_OUT (if nonnull). */
2248
2249 bool
2250 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2251 unsigned int *nelts_per_vq_out)
2252 {
2253 rtx elt;
2254 poly_int64 value;
2255
2256 if (!const_vec_duplicate_p (x, &elt)
2257 || !poly_int_rtx_p (elt, &value))
2258 return false;
2259
2260 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2261 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2262 /* There's no vector INCB. */
2263 return false;
2264
2265 HOST_WIDE_INT factor = value.coeffs[0];
2266 if (value.coeffs[1] != factor)
2267 return false;
2268
2269 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2270 if ((factor % nelts_per_vq) != 0
2271 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2272 return false;
2273
2274 if (factor_out)
2275 *factor_out = factor;
2276 if (nelts_per_vq_out)
2277 *nelts_per_vq_out = nelts_per_vq;
2278 return true;
2279 }
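
/* Illustrative example: duplicating the poly_int64 (16, 16) across a
   vector of halfwords (8 elements per quadword) gives a factor of 16,
   which is 2 * NELTS_PER_VQ and therefore valid; the corresponding
   instruction would be an INCH with "mul #2".  */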
2280
2281 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2282 instruction. */
2283
2284 bool
2285 aarch64_sve_inc_dec_immediate_p (rtx x)
2286 {
2287 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2288 }
2289
2290 /* Return the asm template for an SVE vector INC or DEC instruction.
2291 OPERANDS gives the operands before the vector count and X is the
2292 value of the vector count operand itself. */
2293
2294 char *
2295 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2296 {
2297 int factor;
2298 unsigned int nelts_per_vq;
2299 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2300 gcc_unreachable ();
2301 if (factor < 0)
2302 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2303 nelts_per_vq);
2304 else
2305 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2306 nelts_per_vq);
2307 }
2308
2309 static int
2310 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2311 scalar_int_mode mode)
2312 {
2313 int i;
2314 unsigned HOST_WIDE_INT val, val2, mask;
2315 int one_match, zero_match;
2316 int num_insns;
2317
2318 val = INTVAL (imm);
2319
2320 if (aarch64_move_imm (val, mode))
2321 {
2322 if (generate)
2323 emit_insn (gen_rtx_SET (dest, imm));
2324 return 1;
2325 }
2326
2327 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2328 (with XXXX non-zero). In that case check to see if the move can be done in
2329 a smaller mode. */
2330 val2 = val & 0xffffffff;
2331 if (mode == DImode
2332 && aarch64_move_imm (val2, SImode)
2333 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2334 {
2335 if (generate)
2336 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2337
2338 /* Check if we have to emit a second instruction by checking to see
2339 if any of the upper 32 bits of the original DI mode value is set. */
2340 if (val == val2)
2341 return 1;
2342
2343 i = (val >> 48) ? 48 : 32;
2344
2345 if (generate)
2346 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2347 GEN_INT ((val >> i) & 0xffff)));
2348
2349 return 2;
2350 }
2351
2352 if ((val >> 32) == 0 || mode == SImode)
2353 {
2354 if (generate)
2355 {
2356 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2357 if (mode == SImode)
2358 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2359 GEN_INT ((val >> 16) & 0xffff)));
2360 else
2361 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2362 GEN_INT ((val >> 16) & 0xffff)));
2363 }
2364 return 2;
2365 }
2366
2367 /* Remaining cases are all for DImode. */
2368
2369 mask = 0xffff;
2370 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2371 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2372 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2373 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2374
2375 if (zero_match != 2 && one_match != 2)
2376 {
2377 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2378 For a 64-bit bitmask try whether changing 16 bits to all ones or
2379 zeroes creates a valid bitmask. To check any repeated bitmask,
2380 try using 16 bits from the other 32-bit half of val. */
2381
2382 for (i = 0; i < 64; i += 16, mask <<= 16)
2383 {
2384 val2 = val & ~mask;
2385 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2386 break;
2387 val2 = val | mask;
2388 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2389 break;
2390 val2 = val2 & ~mask;
2391 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2392 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2393 break;
2394 }
2395 if (i != 64)
2396 {
2397 if (generate)
2398 {
2399 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2400 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2401 GEN_INT ((val >> i) & 0xffff)));
2402 }
2403 return 2;
2404 }
2405 }
2406
2407 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2408 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2409 otherwise skip zero bits. */
2410
2411 num_insns = 1;
2412 mask = 0xffff;
2413 val2 = one_match > zero_match ? ~val : val;
2414 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2415
2416 if (generate)
2417 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2418 ? (val | ~(mask << i))
2419 : (val & (mask << i)))));
2420 for (i += 16; i < 64; i += 16)
2421 {
2422 if ((val2 & (mask << i)) == 0)
2423 continue;
2424 if (generate)
2425 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2426 GEN_INT ((val >> i) & 0xffff)));
2427 num_insns ++;
2428 }
2429
2430 return num_insns;
2431 }
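
/* Illustrative example (assumed, not verified compiler output): the DImode
   constant 0x1234000056780000 has two all-zero 16-bit chunks, so it can be
   synthesized in two instructions:

	mov	x0, #0x56780000
	movk	x0, #0x1234, lsl #48

   A constant with no all-zero or all-one chunk and no usable bitmask
   pattern needs the full four-instruction MOV/MOVK sequence.  */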
2432
2433 /* Return whether IMM is a 128-bit immediate that is simple enough to
2434    expand inline. */
2435 bool
2436 aarch64_mov128_immediate (rtx imm)
2437 {
2438 if (GET_CODE (imm) == CONST_INT)
2439 return true;
2440
2441 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2442
2443 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2444 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2445
2446 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2447 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2448 }
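
/* Illustrative example: a TImode constant whose low half is 0xffffffff
   (a single bitmask-immediate move) and whose high half is 0x1 (a single
   MOVZ) costs two instructions in total and is expanded inline; a constant
   whose halves together need more than four instructions is instead loaded
   from the literal pool.  */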
2449
2450
2451 /* Return the number of temporary registers that aarch64_add_offset_1
2452 would need to add OFFSET to a register. */
2453
2454 static unsigned int
2455 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2456 {
2457 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2458 }
2459
2460 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2461 a non-polynomial OFFSET. MODE is the mode of the addition.
2462 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2463 be set and CFA adjustments added to the generated instructions.
2464
2465 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2466 temporary if register allocation is already complete. This temporary
2467 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2468 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2469 the immediate again.
2470
2471 Since this function may be used to adjust the stack pointer, we must
2472 ensure that it cannot cause transient stack deallocation (for example
2473 by first incrementing SP and then decrementing when adjusting by a
2474 large immediate). */
2475
2476 static void
2477 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2478 rtx src, HOST_WIDE_INT offset, rtx temp1,
2479 bool frame_related_p, bool emit_move_imm)
2480 {
2481 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2482 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2483
2484 HOST_WIDE_INT moffset = abs_hwi (offset);
2485 rtx_insn *insn;
2486
2487 if (!moffset)
2488 {
2489 if (!rtx_equal_p (dest, src))
2490 {
2491 insn = emit_insn (gen_rtx_SET (dest, src));
2492 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2493 }
2494 return;
2495 }
2496
2497 /* Single instruction adjustment. */
2498 if (aarch64_uimm12_shift (moffset))
2499 {
2500 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2501 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2502 return;
2503 }
2504
2505 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2506 and either:
2507
2508 a) the offset cannot be loaded by a 16-bit move or
2509 b) there is no spare register into which we can move it. */
2510 if (moffset < 0x1000000
2511 && ((!temp1 && !can_create_pseudo_p ())
2512 || !aarch64_move_imm (moffset, mode)))
2513 {
2514 HOST_WIDE_INT low_off = moffset & 0xfff;
2515
2516 low_off = offset < 0 ? -low_off : low_off;
2517 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2518 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2519 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2520 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521 return;
2522 }
2523
2524 /* Emit a move immediate if required and an addition/subtraction. */
2525 if (emit_move_imm)
2526 {
2527 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2528 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2529 }
2530 insn = emit_insn (offset < 0
2531 ? gen_sub3_insn (dest, src, temp1)
2532 : gen_add3_insn (dest, src, temp1));
2533 if (frame_related_p)
2534 {
2535 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2536 rtx adj = plus_constant (mode, src, offset);
2537 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2538 }
2539 }
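
/* Illustrative example (assumed, not verified compiler output): adding
   0x123456 to the stack pointer with no spare temporary is split into two
   additions whose immediates are both encodable as a 12-bit immediate,
   optionally shifted left by 12:

	add	sp, sp, #0x456
	add	sp, sp, #0x123000

   Adjusting in the same direction twice, rather than overshooting and
   then correcting, avoids any transient deallocation of the stack.  */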
2540
2541 /* Return the number of temporary registers that aarch64_add_offset
2542 would need to move OFFSET into a register or add OFFSET to a register;
2543 ADD_P is true if we want the latter rather than the former. */
2544
2545 static unsigned int
2546 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2547 {
2548 /* This follows the same structure as aarch64_add_offset. */
2549 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2550 return 0;
2551
2552 unsigned int count = 0;
2553 HOST_WIDE_INT factor = offset.coeffs[1];
2554 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2555 poly_int64 poly_offset (factor, factor);
2556 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2557 /* Need one register for the ADDVL/ADDPL result. */
2558 count += 1;
2559 else if (factor != 0)
2560 {
2561 factor = abs (factor);
2562 if (factor > 16 * (factor & -factor))
2563 /* Need one register for the CNT result and one for the multiplication
2564 factor. If necessary, the second temporary can be reused for the
2565 constant part of the offset. */
2566 return 2;
2567 /* Need one register for the CNT result (which might then
2568 be shifted). */
2569 count += 1;
2570 }
2571 return count + aarch64_add_offset_1_temporaries (constant);
2572 }
2573
2574 /* If X can be represented as a poly_int64, return the number
2575 of temporaries that are required to add it to a register.
2576 Return -1 otherwise. */
2577
2578 int
2579 aarch64_add_offset_temporaries (rtx x)
2580 {
2581 poly_int64 offset;
2582 if (!poly_int_rtx_p (x, &offset))
2583 return -1;
2584 return aarch64_offset_temporaries (true, offset);
2585 }
2586
2587 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2588 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2589 be set and CFA adjustments added to the generated instructions.
2590
2591 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2592 temporary if register allocation is already complete. This temporary
2593 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2594 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2595 false to avoid emitting the immediate again.
2596
2597 TEMP2, if nonnull, is a second temporary register that doesn't
2598    overlap either DEST or SRC.
2599
2600 Since this function may be used to adjust the stack pointer, we must
2601 ensure that it cannot cause transient stack deallocation (for example
2602 by first incrementing SP and then decrementing when adjusting by a
2603 large immediate). */
2604
2605 static void
2606 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2607 poly_int64 offset, rtx temp1, rtx temp2,
2608 bool frame_related_p, bool emit_move_imm = true)
2609 {
2610 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2611 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2612 gcc_assert (temp1 == NULL_RTX
2613 || !frame_related_p
2614 || !reg_overlap_mentioned_p (temp1, dest));
2615 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2616
2617 /* Try using ADDVL or ADDPL to add the whole value. */
2618 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2619 {
2620 rtx offset_rtx = gen_int_mode (offset, mode);
2621 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2622 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2623 return;
2624 }
2625
2626 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2627 SVE vector register, over and above the minimum size of 128 bits.
2628 This is equivalent to half the value returned by CNTD with a
2629 vector shape of ALL. */
2630 HOST_WIDE_INT factor = offset.coeffs[1];
2631 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2632
2633 /* Try using ADDVL or ADDPL to add the VG-based part. */
2634 poly_int64 poly_offset (factor, factor);
2635 if (src != const0_rtx
2636 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2637 {
2638 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2639 if (frame_related_p)
2640 {
2641 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2642 RTX_FRAME_RELATED_P (insn) = true;
2643 src = dest;
2644 }
2645 else
2646 {
2647 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2648 src = aarch64_force_temporary (mode, temp1, addr);
2649 temp1 = temp2;
2650 temp2 = NULL_RTX;
2651 }
2652 }
2653 /* Otherwise use a CNT-based sequence. */
2654 else if (factor != 0)
2655 {
2656 /* Use a subtraction if we have a negative factor. */
2657 rtx_code code = PLUS;
2658 if (factor < 0)
2659 {
2660 factor = -factor;
2661 code = MINUS;
2662 }
2663
2664 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2665 into the multiplication. */
2666 rtx val;
2667 int shift = 0;
2668 if (factor & 1)
2669 /* Use a right shift by 1. */
2670 shift = -1;
2671 else
2672 factor /= 2;
2673 HOST_WIDE_INT low_bit = factor & -factor;
2674 if (factor <= 16 * low_bit)
2675 {
2676 if (factor > 16 * 8)
2677 {
2678 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2679 the value with the minimum multiplier and shift it into
2680 position. */
2681 int extra_shift = exact_log2 (low_bit);
2682 shift += extra_shift;
2683 factor >>= extra_shift;
2684 }
2685 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2686 }
2687 else
2688 {
2689 /* Use CNTD, then multiply it by FACTOR. */
2690 val = gen_int_mode (poly_int64 (2, 2), mode);
2691 val = aarch64_force_temporary (mode, temp1, val);
2692
2693 /* Go back to using a negative multiplication factor if we have
2694 no register from which to subtract. */
2695 if (code == MINUS && src == const0_rtx)
2696 {
2697 factor = -factor;
2698 code = PLUS;
2699 }
2700 rtx coeff1 = gen_int_mode (factor, mode);
2701 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2702 val = gen_rtx_MULT (mode, val, coeff1);
2703 }
2704
2705 if (shift > 0)
2706 {
2707 /* Multiply by 1 << SHIFT. */
2708 val = aarch64_force_temporary (mode, temp1, val);
2709 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2710 }
2711 else if (shift == -1)
2712 {
2713 /* Divide by 2. */
2714 val = aarch64_force_temporary (mode, temp1, val);
2715 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2716 }
2717
2718 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2719 if (src != const0_rtx)
2720 {
2721 val = aarch64_force_temporary (mode, temp1, val);
2722 val = gen_rtx_fmt_ee (code, mode, src, val);
2723 }
2724 else if (code == MINUS)
2725 {
2726 val = aarch64_force_temporary (mode, temp1, val);
2727 val = gen_rtx_NEG (mode, val);
2728 }
2729
2730 if (constant == 0 || frame_related_p)
2731 {
2732 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2733 if (frame_related_p)
2734 {
2735 RTX_FRAME_RELATED_P (insn) = true;
2736 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2737 gen_rtx_SET (dest, plus_constant (Pmode, src,
2738 poly_offset)));
2739 }
2740 src = dest;
2741 if (constant == 0)
2742 return;
2743 }
2744 else
2745 {
2746 src = aarch64_force_temporary (mode, temp1, val);
2747 temp1 = temp2;
2748 temp2 = NULL_RTX;
2749 }
2750
2751 emit_move_imm = true;
2752 }
2753
2754 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2755 frame_related_p, emit_move_imm);
2756 }
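
/* Illustrative example (assumed, not verified compiler output): an offset
   of (48, 32) decomposes into FACTOR 32 (two vector widths) and CONSTANT
   16, which can be added as:

	addvl	x0, x1, #2
	add	x0, x0, #16  */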
2757
2758 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2759 than a poly_int64. */
2760
2761 void
2762 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2763 rtx offset_rtx, rtx temp1, rtx temp2)
2764 {
2765 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2766 temp1, temp2, false);
2767 }
2768
2769 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2770 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2771 if TEMP1 already contains abs (DELTA). */
2772
2773 static inline void
2774 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2775 {
2776 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2777 temp1, temp2, true, emit_move_imm);
2778 }
2779
2780 /* Subtract DELTA from the stack pointer, marking the instructions
2781 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2782 if nonnull. */
2783
2784 static inline void
2785 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2786 {
2787 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2788 temp1, temp2, frame_related_p);
2789 }
2790
2791 /* Set DEST to (vec_series BASE STEP). */
2792
2793 static void
2794 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2795 {
2796 machine_mode mode = GET_MODE (dest);
2797 scalar_mode inner = GET_MODE_INNER (mode);
2798
2799 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2800 if (!aarch64_sve_index_immediate_p (base))
2801 base = force_reg (inner, base);
2802 if (!aarch64_sve_index_immediate_p (step))
2803 step = force_reg (inner, step);
2804
2805 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2806 }
2807
2808 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2809 integer of mode INT_MODE. Return true on success. */
2810
2811 static bool
2812 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2813 rtx src)
2814 {
2815 /* If the constant is smaller than 128 bits, we can do the move
2816 using a vector of SRC_MODEs. */
2817 if (src_mode != TImode)
2818 {
2819 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2820 GET_MODE_SIZE (src_mode));
2821 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2822 emit_move_insn (gen_lowpart (dup_mode, dest),
2823 gen_const_vec_duplicate (dup_mode, src));
2824 return true;
2825 }
2826
2827 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2828 src = force_const_mem (src_mode, src);
2829 if (!src)
2830 return false;
2831
2832 /* Make sure that the address is legitimate. */
2833 if (!aarch64_sve_ld1r_operand_p (src))
2834 {
2835 rtx addr = force_reg (Pmode, XEXP (src, 0));
2836 src = replace_equiv_address (src, addr);
2837 }
2838
2839 machine_mode mode = GET_MODE (dest);
2840 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2841 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2842 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2843 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2844 emit_insn (gen_rtx_SET (dest, src));
2845 return true;
2846 }
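
/* Illustrative example: a VNx16QImode constant that repeats every four
   bytes can be treated as a VNx4SImode duplicate of the packed 32-bit
   value and expanded through the normal duplicate path above, while a
   constant whose repeating block is the full 128 bits is loaded from the
   constant pool with LD1RQ.  */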
2847
2848 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2849 isn't a simple duplicate or series. */
2850
2851 static void
2852 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2853 {
2854 machine_mode mode = GET_MODE (src);
2855 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2856 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2857 gcc_assert (npatterns > 1);
2858
2859 if (nelts_per_pattern == 1)
2860 {
2861       /* The constant is a repeating sequence of at least two elements,
2862 where the repeating elements occupy no more than 128 bits.
2863 Get an integer representation of the replicated value. */
2864 scalar_int_mode int_mode;
2865 if (BYTES_BIG_ENDIAN)
2866 /* For now, always use LD1RQ to load the value on big-endian
2867 targets, since the handling of smaller integers includes a
2868 subreg that is semantically an element reverse. */
2869 int_mode = TImode;
2870 else
2871 {
2872 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2873 gcc_assert (int_bits <= 128);
2874 int_mode = int_mode_for_size (int_bits, 0).require ();
2875 }
2876 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2877 if (int_value
2878 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2879 return;
2880 }
2881
2882 /* Expand each pattern individually. */
2883 rtx_vector_builder builder;
2884 auto_vec<rtx, 16> vectors (npatterns);
2885 for (unsigned int i = 0; i < npatterns; ++i)
2886 {
2887 builder.new_vector (mode, 1, nelts_per_pattern);
2888 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2889 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2890 vectors.quick_push (force_reg (mode, builder.build ()));
2891 }
2892
2893 /* Use permutes to interleave the separate vectors. */
2894 while (npatterns > 1)
2895 {
2896 npatterns /= 2;
2897 for (unsigned int i = 0; i < npatterns; ++i)
2898 {
2899 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2900 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2901 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2902 vectors[i] = tmp;
2903 }
2904 }
2905 gcc_assert (vectors[0] == dest);
2906 }
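
/* Illustrative example: a constant such as { 0, 16, 1, 17, 2, 18, ... },
   whose even-indexed elements count up from 0 and whose odd-indexed
   elements count up from 16, has two patterns.  Each pattern is forced
   into its own register as a vector series and the two registers are then
   interleaved with a single ZIP1 to form the result.  */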
2907
2908 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2909 is a pattern that can be used to set DEST to a replicated scalar
2910 element. */
2911
2912 void
2913 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2914 rtx (*gen_vec_duplicate) (rtx, rtx))
2915 {
2916 machine_mode mode = GET_MODE (dest);
2917
2918 /* Check on what type of symbol it is. */
2919 scalar_int_mode int_mode;
2920 if ((GET_CODE (imm) == SYMBOL_REF
2921 || GET_CODE (imm) == LABEL_REF
2922 || GET_CODE (imm) == CONST
2923 || GET_CODE (imm) == CONST_POLY_INT)
2924 && is_a <scalar_int_mode> (mode, &int_mode))
2925 {
2926 rtx mem;
2927 poly_int64 offset;
2928 HOST_WIDE_INT const_offset;
2929 enum aarch64_symbol_type sty;
2930
2931 /* If we have (const (plus symbol offset)), separate out the offset
2932 before we start classifying the symbol. */
2933 rtx base = strip_offset (imm, &offset);
2934
2935 /* We must always add an offset involving VL separately, rather than
2936 folding it into the relocation. */
2937 if (!offset.is_constant (&const_offset))
2938 {
2939 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2940 emit_insn (gen_rtx_SET (dest, imm));
2941 else
2942 {
2943 /* Do arithmetic on 32-bit values if the result is smaller
2944 than that. */
2945 if (partial_subreg_p (int_mode, SImode))
2946 {
2947 /* It is invalid to do symbol calculations in modes
2948 narrower than SImode. */
2949 gcc_assert (base == const0_rtx);
2950 dest = gen_lowpart (SImode, dest);
2951 int_mode = SImode;
2952 }
2953 if (base != const0_rtx)
2954 {
2955 base = aarch64_force_temporary (int_mode, dest, base);
2956 aarch64_add_offset (int_mode, dest, base, offset,
2957 NULL_RTX, NULL_RTX, false);
2958 }
2959 else
2960 aarch64_add_offset (int_mode, dest, base, offset,
2961 dest, NULL_RTX, false);
2962 }
2963 return;
2964 }
2965
2966 sty = aarch64_classify_symbol (base, const_offset);
2967 switch (sty)
2968 {
2969 case SYMBOL_FORCE_TO_MEM:
2970 if (const_offset != 0
2971 && targetm.cannot_force_const_mem (int_mode, imm))
2972 {
2973 gcc_assert (can_create_pseudo_p ());
2974 base = aarch64_force_temporary (int_mode, dest, base);
2975 aarch64_add_offset (int_mode, dest, base, const_offset,
2976 NULL_RTX, NULL_RTX, false);
2977 return;
2978 }
2979
2980 mem = force_const_mem (ptr_mode, imm);
2981 gcc_assert (mem);
2982
2983 /* If we aren't generating PC relative literals, then
2984 we need to expand the literal pool access carefully.
2985 This is something that needs to be done in a number
2986 of places, so could well live as a separate function. */
2987 if (!aarch64_pcrelative_literal_loads)
2988 {
2989 gcc_assert (can_create_pseudo_p ());
2990 base = gen_reg_rtx (ptr_mode);
2991 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2992 if (ptr_mode != Pmode)
2993 base = convert_memory_address (Pmode, base);
2994 mem = gen_rtx_MEM (ptr_mode, base);
2995 }
2996
2997 if (int_mode != ptr_mode)
2998 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2999
3000 emit_insn (gen_rtx_SET (dest, mem));
3001
3002 return;
3003
3004 case SYMBOL_SMALL_TLSGD:
3005 case SYMBOL_SMALL_TLSDESC:
3006 case SYMBOL_SMALL_TLSIE:
3007 case SYMBOL_SMALL_GOT_28K:
3008 case SYMBOL_SMALL_GOT_4G:
3009 case SYMBOL_TINY_GOT:
3010 case SYMBOL_TINY_TLSIE:
3011 if (const_offset != 0)
3012 {
3013 gcc_assert(can_create_pseudo_p ());
3014 base = aarch64_force_temporary (int_mode, dest, base);
3015 aarch64_add_offset (int_mode, dest, base, const_offset,
3016 NULL_RTX, NULL_RTX, false);
3017 return;
3018 }
3019 /* FALLTHRU */
3020
3021 case SYMBOL_SMALL_ABSOLUTE:
3022 case SYMBOL_TINY_ABSOLUTE:
3023 case SYMBOL_TLSLE12:
3024 case SYMBOL_TLSLE24:
3025 case SYMBOL_TLSLE32:
3026 case SYMBOL_TLSLE48:
3027 aarch64_load_symref_appropriately (dest, imm, sty);
3028 return;
3029
3030 default:
3031 gcc_unreachable ();
3032 }
3033 }
3034
3035 if (!CONST_INT_P (imm))
3036 {
3037 rtx base, step, value;
3038 if (GET_CODE (imm) == HIGH
3039 || aarch64_simd_valid_immediate (imm, NULL))
3040 emit_insn (gen_rtx_SET (dest, imm));
3041 else if (const_vec_series_p (imm, &base, &step))
3042 aarch64_expand_vec_series (dest, base, step);
3043 else if (const_vec_duplicate_p (imm, &value))
3044 {
3045 /* If the constant is out of range of an SVE vector move,
3046 load it from memory if we can, otherwise move it into
3047 a register and use a DUP. */
3048 scalar_mode inner_mode = GET_MODE_INNER (mode);
3049 rtx op = force_const_mem (inner_mode, value);
3050 if (!op)
3051 op = force_reg (inner_mode, value);
3052 else if (!aarch64_sve_ld1r_operand_p (op))
3053 {
3054 rtx addr = force_reg (Pmode, XEXP (op, 0));
3055 op = replace_equiv_address (op, addr);
3056 }
3057 emit_insn (gen_vec_duplicate (dest, op));
3058 }
3059 else if (GET_CODE (imm) == CONST_VECTOR
3060 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3061 aarch64_expand_sve_const_vector (dest, imm);
3062 else
3063 {
3064 rtx mem = force_const_mem (mode, imm);
3065 gcc_assert (mem);
3066 emit_move_insn (dest, mem);
3067 }
3068
3069 return;
3070 }
3071
3072 aarch64_internal_mov_immediate (dest, imm, true,
3073 as_a <scalar_int_mode> (mode));
3074 }
3075
3076 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3077 that is known to contain PTRUE. */
3078
3079 void
3080 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3081 {
3082 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3083 gen_rtvec (2, pred, src),
3084 UNSPEC_MERGE_PTRUE)));
3085 }
3086
3087 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3088 operand is in memory. In this case we need to use the predicated LD1
3089 and ST1 instead of LDR and STR, both for correctness on big-endian
3090 targets and because LD1 and ST1 support a wider range of addressing modes.
3091 PRED_MODE is the mode of the predicate.
3092
3093 See the comment at the head of aarch64-sve.md for details about the
3094 big-endian handling. */
3095
3096 void
3097 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3098 {
3099 machine_mode mode = GET_MODE (dest);
3100 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3101 if (!register_operand (src, mode)
3102 && !register_operand (dest, mode))
3103 {
3104 rtx tmp = gen_reg_rtx (mode);
3105 if (MEM_P (src))
3106 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3107 else
3108 emit_move_insn (tmp, src);
3109 src = tmp;
3110 }
3111 aarch64_emit_sve_pred_move (dest, ptrue, src);
3112 }
3113
3114 /* Called only on big-endian targets. See whether an SVE vector move
3115 from SRC to DEST is effectively a REV[BHW] instruction, because at
3116 least one operand is a subreg of an SVE vector that has wider or
3117 narrower elements. Return true and emit the instruction if so.
3118
3119 For example:
3120
3121 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3122
3123 represents a VIEW_CONVERT between the following vectors, viewed
3124 in memory order:
3125
3126 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3127 R1: { [0], [1], [2], [3], ... }
3128
3129 The high part of lane X in R2 should therefore correspond to lane X*2
3130 of R1, but the register representations are:
3131
3132 msb lsb
3133 R2: ...... [1].high [1].low [0].high [0].low
3134 R1: ...... [3] [2] [1] [0]
3135
3136 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3137 We therefore need a reverse operation to swap the high and low values
3138 around.
3139
3140 This is purely an optimization. Without it we would spill the
3141 subreg operand to the stack in one mode and reload it in the
3142 other mode, which has the same effect as the REV. */
3143
3144 bool
3145 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3146 {
3147 gcc_assert (BYTES_BIG_ENDIAN);
3148 if (GET_CODE (dest) == SUBREG)
3149 dest = SUBREG_REG (dest);
3150 if (GET_CODE (src) == SUBREG)
3151 src = SUBREG_REG (src);
3152
3153 /* The optimization handles two single SVE REGs with different element
3154 sizes. */
3155 if (!REG_P (dest)
3156 || !REG_P (src)
3157 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3158 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3159 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3160 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3161 return false;
3162
3163 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3164 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3165 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3166 UNSPEC_REV_SUBREG);
3167 emit_insn (gen_rtx_SET (dest, unspec));
3168 return true;
3169 }
3170
3171 /* Return a copy of X with mode MODE, without changing its other
3172 attributes. Unlike gen_lowpart, this doesn't care whether the
3173 mode change is valid. */
3174
3175 static rtx
3176 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3177 {
3178 if (GET_MODE (x) == mode)
3179 return x;
3180
3181 x = shallow_copy_rtx (x);
3182 set_mode_and_regno (x, mode, REGNO (x));
3183 return x;
3184 }
3185
3186 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3187 operands. */
3188
3189 void
3190 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3191 {
3192 /* Decide which REV operation we need. The mode with narrower elements
3193 determines the mode of the operands and the mode with the wider
3194 elements determines the reverse width. */
3195 machine_mode mode_with_wider_elts = GET_MODE (dest);
3196 machine_mode mode_with_narrower_elts = GET_MODE (src);
3197 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3198 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3199 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3200
3201 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3202 unsigned int unspec;
3203 if (wider_bytes == 8)
3204 unspec = UNSPEC_REV64;
3205 else if (wider_bytes == 4)
3206 unspec = UNSPEC_REV32;
3207 else if (wider_bytes == 2)
3208 unspec = UNSPEC_REV16;
3209 else
3210 gcc_unreachable ();
3211 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3212
3213 /* Emit:
3214
3215 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3216 UNSPEC_MERGE_PTRUE))
3217
3218 with the appropriate modes. */
3219 ptrue = gen_lowpart (pred_mode, ptrue);
3220 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3221 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3222 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3223 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3224 UNSPEC_MERGE_PTRUE);
3225 emit_insn (gen_rtx_SET (dest, src));
3226 }
3227
3228 static bool
3229 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3230 tree exp ATTRIBUTE_UNUSED)
3231 {
3232 /* Currently, always true. */
3233 return true;
3234 }
3235
3236 /* Implement TARGET_PASS_BY_REFERENCE. */
3237
3238 static bool
3239 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3240 machine_mode mode,
3241 const_tree type,
3242 bool named ATTRIBUTE_UNUSED)
3243 {
3244 HOST_WIDE_INT size;
3245 machine_mode dummymode;
3246 int nregs;
3247
3248 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3249 if (mode == BLKmode && type)
3250 size = int_size_in_bytes (type);
3251 else
3252 /* No frontends can create types with variable-sized modes, so we
3253 shouldn't be asked to pass or return them. */
3254 size = GET_MODE_SIZE (mode).to_constant ();
3255
3256 /* Aggregates are passed by reference based on their size. */
3257 if (type && AGGREGATE_TYPE_P (type))
3258 {
3259 size = int_size_in_bytes (type);
3260 }
3261
3262   /* Variable-sized arguments are always passed by reference. */
3263 if (size < 0)
3264 return true;
3265
3266 /* Can this be a candidate to be passed in fp/simd register(s)? */
3267 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3268 &dummymode, &nregs,
3269 NULL))
3270 return false;
3271
3272   /* Arguments that are variable sized or larger than 2 registers are
3273      passed by reference unless they are a homogeneous floating-point
3274      aggregate. */
3275 return size > 2 * UNITS_PER_WORD;
3276 }
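
/* Illustrative examples for the rules above (assumed):

     struct hfa { double d[4]; };  32 bytes, but an HFA, so a SIMD/FP
        candidate and not passed by reference.
     struct big { long x[3]; };    24 bytes and not an HFA, so larger
        than two registers and passed by reference.  */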
3277
3278 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3279 static bool
3280 aarch64_return_in_msb (const_tree valtype)
3281 {
3282 machine_mode dummy_mode;
3283 int dummy_int;
3284
3285 /* Never happens in little-endian mode. */
3286 if (!BYTES_BIG_ENDIAN)
3287 return false;
3288
3289   /* Only composite types no larger than 16 bytes can potentially
3290      be returned in registers. */
3291 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3292 || int_size_in_bytes (valtype) <= 0
3293 || int_size_in_bytes (valtype) > 16)
3294 return false;
3295
3296 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3297 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3298 is always passed/returned in the least significant bits of fp/simd
3299 register(s). */
3300 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3301 &dummy_mode, &dummy_int, NULL))
3302 return false;
3303
3304 return true;
3305 }
3306
3307 /* Implement TARGET_FUNCTION_VALUE.
3308 Define how to find the value returned by a function. */
3309
3310 static rtx
3311 aarch64_function_value (const_tree type, const_tree func,
3312 bool outgoing ATTRIBUTE_UNUSED)
3313 {
3314 machine_mode mode;
3315 int unsignedp;
3316 int count;
3317 machine_mode ag_mode;
3318
3319 mode = TYPE_MODE (type);
3320 if (INTEGRAL_TYPE_P (type))
3321 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3322
3323 if (aarch64_return_in_msb (type))
3324 {
3325 HOST_WIDE_INT size = int_size_in_bytes (type);
3326
3327 if (size % UNITS_PER_WORD != 0)
3328 {
3329 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3330 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3331 }
3332 }
3333
3334 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3335 &ag_mode, &count, NULL))
3336 {
3337 if (!aarch64_composite_type_p (type, mode))
3338 {
3339 gcc_assert (count == 1 && mode == ag_mode);
3340 return gen_rtx_REG (mode, V0_REGNUM);
3341 }
3342 else
3343 {
3344 int i;
3345 rtx par;
3346
3347 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3348 for (i = 0; i < count; i++)
3349 {
3350 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3351 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3352 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3353 XVECEXP (par, 0, i) = tmp;
3354 }
3355 return par;
3356 }
3357 }
3358 else
3359 return gen_rtx_REG (mode, R0_REGNUM);
3360 }
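
/* Illustrative example: a struct of four floats is an HFA and is returned
   in s0-s3, represented above as a PARALLEL of four SFmode registers at
   byte offsets 0, 4, 8 and 12; a 16-byte struct of two 64-bit integers is
   instead returned in x0 and x1.  */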
3361
3362 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3363 Return true if REGNO is the number of a hard register in which the values
3364    of a called function may come back. */
3365
3366 static bool
3367 aarch64_function_value_regno_p (const unsigned int regno)
3368 {
3369   /* A maximum of 16 bytes can be returned in the general registers. Examples
3370 of 16-byte return values are: 128-bit integers and 16-byte small
3371 structures (excluding homogeneous floating-point aggregates). */
3372 if (regno == R0_REGNUM || regno == R1_REGNUM)
3373 return true;
3374
3375 /* Up to four fp/simd registers can return a function value, e.g. a
3376 homogeneous floating-point aggregate having four members. */
3377 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3378 return TARGET_FLOAT;
3379
3380 return false;
3381 }
3382
3383 /* Implement TARGET_RETURN_IN_MEMORY.
3384
3385 If the type T of the result of a function is such that
3386 void func (T arg)
3387 would require that arg be passed as a value in a register (or set of
3388 registers) according to the parameter passing rules, then the result
3389 is returned in the same registers as would be used for such an
3390 argument. */
3391
3392 static bool
3393 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3394 {
3395 HOST_WIDE_INT size;
3396 machine_mode ag_mode;
3397 int count;
3398
3399 if (!AGGREGATE_TYPE_P (type)
3400 && TREE_CODE (type) != COMPLEX_TYPE
3401 && TREE_CODE (type) != VECTOR_TYPE)
3402     /* Simple scalar types are always returned in registers. */
3403 return false;
3404
3405 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3406 type,
3407 &ag_mode,
3408 &count,
3409 NULL))
3410 return false;
3411
3412   /* Types larger than 2 registers are returned in memory. */
3413 size = int_size_in_bytes (type);
3414 return (size < 0 || size > 2 * UNITS_PER_WORD);
3415 }
3416
3417 static bool
3418 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3419 const_tree type, int *nregs)
3420 {
3421 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3422 return aarch64_vfp_is_call_or_return_candidate (mode,
3423 type,
3424 &pcum->aapcs_vfp_rmode,
3425 nregs,
3426 NULL);
3427 }
3428
3429 /* Given MODE and TYPE of a function argument, return the alignment in
3430 bits. The idea is to suppress any stronger alignment requested by
3431 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3432 This is a helper function for local use only. */
3433
3434 static unsigned int
3435 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3436 {
3437 if (!type)
3438 return GET_MODE_ALIGNMENT (mode);
3439
3440 if (integer_zerop (TYPE_SIZE (type)))
3441 return 0;
3442
3443 gcc_assert (TYPE_MODE (type) == mode);
3444
3445 if (!AGGREGATE_TYPE_P (type))
3446 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3447
3448 if (TREE_CODE (type) == ARRAY_TYPE)
3449 return TYPE_ALIGN (TREE_TYPE (type));
3450
3451 unsigned int alignment = 0;
3452 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3453 if (TREE_CODE (field) == FIELD_DECL)
3454 alignment = std::max (alignment, DECL_ALIGN (field));
3455
3456 return alignment;
3457 }
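
/* Illustrative example (assumed):

     struct s { int i; double d; };

   has an argument alignment of 64 bits here, taken from the double member,
   which has the largest field alignment; an aligned attribute on the
   struct type itself only raises the type's alignment, not the fields',
   and so does not increase the value computed above.  */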
3458
3459 /* Layout a function argument according to the AAPCS64 rules. The rule
3460    numbers refer to those given in the AAPCS64. */
3461
3462 static void
3463 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3464 const_tree type,
3465 bool named ATTRIBUTE_UNUSED)
3466 {
3467 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3468 int ncrn, nvrn, nregs;
3469 bool allocate_ncrn, allocate_nvrn;
3470 HOST_WIDE_INT size;
3471
3472 /* We need to do this once per argument. */
3473 if (pcum->aapcs_arg_processed)
3474 return;
3475
3476 pcum->aapcs_arg_processed = true;
3477
3478 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3479 if (type)
3480 size = int_size_in_bytes (type);
3481 else
3482 /* No frontends can create types with variable-sized modes, so we
3483 shouldn't be asked to pass or return them. */
3484 size = GET_MODE_SIZE (mode).to_constant ();
3485 size = ROUND_UP (size, UNITS_PER_WORD);
3486
3487 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3488 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3489 mode,
3490 type,
3491 &nregs);
3492
3493   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3494 The following code thus handles passing by SIMD/FP registers first. */
3495
3496 nvrn = pcum->aapcs_nvrn;
3497
3498   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3499      and homogeneous short-vector aggregates (HVA). */
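  /* For example, struct hfa { float a, b, c, d; } is a homogeneous
     floating-point aggregate of four floats; when enough SIMD/FP argument
     registers remain it is expected to be passed with one member per
     register, i.e. in s0-s3, giving nregs == 4 below.  */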
3500 if (allocate_nvrn)
3501 {
3502 if (!TARGET_FLOAT)
3503 aarch64_err_no_fpadvsimd (mode, "argument");
3504
3505 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3506 {
3507 pcum->aapcs_nextnvrn = nvrn + nregs;
3508 if (!aarch64_composite_type_p (type, mode))
3509 {
3510 gcc_assert (nregs == 1);
3511 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3512 }
3513 else
3514 {
3515 rtx par;
3516 int i;
3517 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3518 for (i = 0; i < nregs; i++)
3519 {
3520 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3521 V0_REGNUM + nvrn + i);
3522 rtx offset = gen_int_mode
3523 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3524 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3525 XVECEXP (par, 0, i) = tmp;
3526 }
3527 pcum->aapcs_reg = par;
3528 }
3529 return;
3530 }
3531 else
3532 {
3533 /* C.3 NSRN is set to 8. */
3534 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3535 goto on_stack;
3536 }
3537 }
3538
3539 ncrn = pcum->aapcs_ncrn;
3540 nregs = size / UNITS_PER_WORD;
3541
3542   /* C6 - C9, though the sign and zero extension semantics are
3543      handled elsewhere.  This is the case where the argument fits
3544      entirely in general registers. */
3545 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3546 {
3547
3548 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3549
3550       /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3551          rounded up to the next even number. */
3552 if (nregs == 2
3553 && ncrn % 2
3554 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3555 comparison is there because for > 16 * BITS_PER_UNIT
3556 alignment nregs should be > 2 and therefore it should be
3557 passed by reference rather than value. */
3558 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3559 {
3560 ++ncrn;
3561 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3562 }
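      /* Sketch of rule C.8 in practice (illustrative): after an int argument
	 has consumed x0 (so NGRN == 1), a following __int128, which has
	 16-byte alignment, would skip x1 and be passed in the even/odd
	 register pair x2/x3.  */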
3563
3564 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3565 A reg is still generated for it, but the caller should be smart
3566 enough not to use it. */
3567 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3568 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3569 else
3570 {
3571 rtx par;
3572 int i;
3573
3574 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3575 for (i = 0; i < nregs; i++)
3576 {
3577 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3578 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3579 GEN_INT (i * UNITS_PER_WORD));
3580 XVECEXP (par, 0, i) = tmp;
3581 }
3582 pcum->aapcs_reg = par;
3583 }
3584
3585 pcum->aapcs_nextncrn = ncrn + nregs;
3586 return;
3587 }
3588
3589 /* C.11 */
3590 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3591
3592 /* The argument is passed on stack; record the needed number of words for
3593 this argument and align the total size if necessary. */
3594 on_stack:
3595 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3596
3597 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3598 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3599 16 / UNITS_PER_WORD);
3600 return;
3601 }
3602
3603 /* Implement TARGET_FUNCTION_ARG. */
3604
3605 static rtx
3606 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3607 const_tree type, bool named)
3608 {
3609 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3610 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3611
3612 if (mode == VOIDmode)
3613 return NULL_RTX;
3614
3615 aarch64_layout_arg (pcum_v, mode, type, named);
3616 return pcum->aapcs_reg;
3617 }
3618
3619 void
3620 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3621 const_tree fntype ATTRIBUTE_UNUSED,
3622 rtx libname ATTRIBUTE_UNUSED,
3623 const_tree fndecl ATTRIBUTE_UNUSED,
3624 unsigned n_named ATTRIBUTE_UNUSED)
3625 {
3626 pcum->aapcs_ncrn = 0;
3627 pcum->aapcs_nvrn = 0;
3628 pcum->aapcs_nextncrn = 0;
3629 pcum->aapcs_nextnvrn = 0;
3630 pcum->pcs_variant = ARM_PCS_AAPCS64;
3631 pcum->aapcs_reg = NULL_RTX;
3632 pcum->aapcs_arg_processed = false;
3633 pcum->aapcs_stack_words = 0;
3634 pcum->aapcs_stack_size = 0;
3635
3636 if (!TARGET_FLOAT
3637 && fndecl && TREE_PUBLIC (fndecl)
3638 && fntype && fntype != error_mark_node)
3639 {
3640 const_tree type = TREE_TYPE (fntype);
3641 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3642 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3643 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3644 &mode, &nregs, NULL))
3645 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3646 }
3647 return;
3648 }
3649
3650 static void
3651 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3652 machine_mode mode,
3653 const_tree type,
3654 bool named)
3655 {
3656 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3657 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3658 {
3659 aarch64_layout_arg (pcum_v, mode, type, named);
3660 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3661 != (pcum->aapcs_stack_words != 0));
3662 pcum->aapcs_arg_processed = false;
3663 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3664 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3665 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3666 pcum->aapcs_stack_words = 0;
3667 pcum->aapcs_reg = NULL_RTX;
3668 }
3669 }
3670
3671 bool
3672 aarch64_function_arg_regno_p (unsigned regno)
3673 {
3674 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3675 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3676 }
3677
3678 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3679 PARM_BOUNDARY bits of alignment, but will be given anything up
3680 to STACK_BOUNDARY bits if the type requires it. This makes sure
3681 that both before and after the layout of each argument, the Next
3682 Stacked Argument Address (NSAA) will have a minimum alignment of
3683 8 bytes. */
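/* For instance, with PARM_BOUNDARY of 64 bits and STACK_BOUNDARY of 128 bits,
   an argument whose natural alignment is 128 bits (e.g. __int128) gets a
   16-byte-aligned stack slot, while smaller scalars still get at least
   8-byte alignment.  */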
3684
3685 static unsigned int
3686 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3687 {
3688 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3689 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3690 }
3691
3692 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3693
3694 static fixed_size_mode
3695 aarch64_get_reg_raw_mode (int regno)
3696 {
3697 if (TARGET_SVE && FP_REGNUM_P (regno))
3698 /* Don't use the SVE part of the register for __builtin_apply and
3699 __builtin_return. The SVE registers aren't used by the normal PCS,
3700 so using them there would be a waste of time. The PCS extensions
3701 for SVE types are fundamentally incompatible with the
3702 __builtin_return/__builtin_apply interface. */
3703 return as_a <fixed_size_mode> (V16QImode);
3704 return default_get_reg_raw_mode (regno);
3705 }
3706
3707 /* Implement TARGET_FUNCTION_ARG_PADDING.
3708
3709 Small aggregate types are placed in the lowest memory address.
3710
3711 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
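/* Illustrative consequence on a big-endian target: a short passed on the
   stack is padded downward, so its bytes end up at the high-address end of
   the 8-byte slot, whereas a 3-byte structure is padded upward and keeps its
   first byte at the slot's lowest address.  */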
3712
3713 static pad_direction
3714 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3715 {
3716 /* On little-endian targets, the least significant byte of every stack
3717 argument is passed at the lowest byte address of the stack slot. */
3718 if (!BYTES_BIG_ENDIAN)
3719 return PAD_UPWARD;
3720
3721 /* Otherwise, integral, floating-point and pointer types are padded downward:
3722 the least significant byte of a stack argument is passed at the highest
3723 byte address of the stack slot. */
3724 if (type
3725 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3726 || POINTER_TYPE_P (type))
3727 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3728 return PAD_DOWNWARD;
3729
3730 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3731 return PAD_UPWARD;
3732 }
3733
3734 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3735
3736    It specifies the padding for the last (and possibly the only)
3737    element of a block move between registers and memory.  Assuming
3738    the block is in memory, padding upward means that the last element
3739    is padded after its most significant byte, while with downward
3740    padding the last element is padded on its least significant byte
3741    side.
3742
3743 Small aggregates and small complex types are always padded
3744 upwards.
3745
3746 We don't need to worry about homogeneous floating-point or
3747 short-vector aggregates; their move is not affected by the
3748 padding direction determined here. Regardless of endianness,
3749 each element of such an aggregate is put in the least
3750 significant bits of a fp/simd register.
3751
3752 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3753 register has useful data, and return the opposite if the most
3754 significant byte does. */
3755
3756 bool
3757 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3758 bool first ATTRIBUTE_UNUSED)
3759 {
3760
3761 /* Small composite types are always padded upward. */
3762 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3763 {
3764 HOST_WIDE_INT size;
3765 if (type)
3766 size = int_size_in_bytes (type);
3767 else
3768 /* No frontends can create types with variable-sized modes, so we
3769 shouldn't be asked to pass or return them. */
3770 size = GET_MODE_SIZE (mode).to_constant ();
3771 if (size < 2 * UNITS_PER_WORD)
3772 return true;
3773 }
3774
3775 /* Otherwise, use the default padding. */
3776 return !BYTES_BIG_ENDIAN;
3777 }
3778
3779 static scalar_int_mode
3780 aarch64_libgcc_cmp_return_mode (void)
3781 {
3782 return SImode;
3783 }
3784
3785 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3786
3787 /* We use the 12-bit shifted immediate arithmetic instructions so values
3788    must be a multiple of (1 << 12), i.e. 4096. */
3789 #define ARITH_FACTOR 4096
3790
3791 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3792 #error Cannot use simple address calculation for stack probing
3793 #endif
3794
3795 /* The pair of scratch registers used for stack probing. */
3796 #define PROBE_STACK_FIRST_REG 9
3797 #define PROBE_STACK_SECOND_REG 10
3798
3799 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3800 inclusive. These are offsets from the current stack pointer. */
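/* Rough sketch of the output for a small constant frame (register name
   follows PROBE_STACK_FIRST_REG == 9; exact offsets depend on FIRST and
   SIZE):

	sub	x9, sp, #(first + rounded_size)
	str	xzr, [x9, #(rounded_size - size)]

   Larger frames instead emit one probe per PROBE_INTERVAL, or fall back to
   the probe_stack_range loop further below.  */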
3801
3802 static void
3803 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3804 {
3805 HOST_WIDE_INT size;
3806 if (!poly_size.is_constant (&size))
3807 {
3808 sorry ("stack probes for SVE frames");
3809 return;
3810 }
3811
3812 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3813
3814 /* See the same assertion on PROBE_INTERVAL above. */
3815 gcc_assert ((first % ARITH_FACTOR) == 0);
3816
3817 /* See if we have a constant small number of probes to generate. If so,
3818 that's the easy case. */
3819 if (size <= PROBE_INTERVAL)
3820 {
3821 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3822
3823 emit_set_insn (reg1,
3824 plus_constant (Pmode,
3825 stack_pointer_rtx, -(first + base)));
3826 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3827 }
3828
3829 /* The run-time loop is made up of 8 insns in the generic case while the
3830      compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3831 else if (size <= 4 * PROBE_INTERVAL)
3832 {
3833 HOST_WIDE_INT i, rem;
3834
3835 emit_set_insn (reg1,
3836 plus_constant (Pmode,
3837 stack_pointer_rtx,
3838 -(first + PROBE_INTERVAL)));
3839 emit_stack_probe (reg1);
3840
3841 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3842 it exceeds SIZE. If only two probes are needed, this will not
3843 generate any code. Then probe at FIRST + SIZE. */
3844 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3845 {
3846 emit_set_insn (reg1,
3847 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3848 emit_stack_probe (reg1);
3849 }
3850
3851 rem = size - (i - PROBE_INTERVAL);
3852 if (rem > 256)
3853 {
3854 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3855
3856 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3857 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3858 }
3859 else
3860 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3861 }
3862
3863 /* Otherwise, do the same as above, but in a loop. Note that we must be
3864 extra careful with variables wrapping around because we might be at
3865 the very top (or the very bottom) of the address space and we have
3866 to be able to handle this case properly; in particular, we use an
3867 equality test for the loop condition. */
3868 else
3869 {
3870 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3871
3872 /* Step 1: round SIZE to the previous multiple of the interval. */
3873
3874 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3875
3876
3877 /* Step 2: compute initial and final value of the loop counter. */
3878
3879 /* TEST_ADDR = SP + FIRST. */
3880 emit_set_insn (reg1,
3881 plus_constant (Pmode, stack_pointer_rtx, -first));
3882
3883 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3884 HOST_WIDE_INT adjustment = - (first + rounded_size);
3885 if (! aarch64_uimm12_shift (adjustment))
3886 {
3887 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3888 true, Pmode);
3889 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3890 }
3891 else
3892 emit_set_insn (reg2,
3893 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3894
3895 /* Step 3: the loop
3896
3897 do
3898 {
3899 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3900 probe at TEST_ADDR
3901 }
3902 while (TEST_ADDR != LAST_ADDR)
3903
3904 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3905 until it is equal to ROUNDED_SIZE. */
3906
3907 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3908
3909
3910 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3911 that SIZE is equal to ROUNDED_SIZE. */
3912
3913 if (size != rounded_size)
3914 {
3915 HOST_WIDE_INT rem = size - rounded_size;
3916
3917 if (rem > 256)
3918 {
3919 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3920
3921 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3922 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3923 }
3924 else
3925 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3926 }
3927 }
3928
3929 /* Make sure nothing is scheduled before we are done. */
3930 emit_insn (gen_blockage ());
3931 }
3932
3933 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3934 absolute addresses. */
3935
3936 const char *
3937 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3938 {
3939 static int labelno = 0;
3940 char loop_lab[32];
3941 rtx xops[2];
3942
3943 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3944
3945 /* Loop. */
3946 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3947
3948 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3949 xops[0] = reg1;
3950 xops[1] = GEN_INT (PROBE_INTERVAL);
3951 output_asm_insn ("sub\t%0, %0, %1", xops);
3952
3953 /* Probe at TEST_ADDR. */
3954 output_asm_insn ("str\txzr, [%0]", xops);
3955
3956 /* Test if TEST_ADDR == LAST_ADDR. */
3957 xops[1] = reg2;
3958 output_asm_insn ("cmp\t%0, %1", xops);
3959
3960 /* Branch. */
3961 fputs ("\tb.ne\t", asm_out_file);
3962 assemble_name_raw (asm_out_file, loop_lab);
3963 fputc ('\n', asm_out_file);
3964
3965 return "";
3966 }
3967
3968 /* Mark the registers that need to be saved by the callee and calculate
3969 the size of the callee-saved registers area and frame record (both FP
3970 and LR may be omitted). */
3971 static void
3972 aarch64_layout_frame (void)
3973 {
3974 HOST_WIDE_INT offset = 0;
3975 int regno, last_fp_reg = INVALID_REGNUM;
3976
3977 if (reload_completed && cfun->machine->frame.laid_out)
3978 return;
3979
3980 /* Force a frame chain for EH returns so the return address is at FP+8. */
3981 cfun->machine->frame.emit_frame_chain
3982 = frame_pointer_needed || crtl->calls_eh_return;
3983
3984 /* Emit a frame chain if the frame pointer is enabled.
3985 If -momit-leaf-frame-pointer is used, do not use a frame chain
3986 in leaf functions which do not use LR. */
3987 if (flag_omit_frame_pointer == 2
3988 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3989 && !df_regs_ever_live_p (LR_REGNUM)))
3990 cfun->machine->frame.emit_frame_chain = true;
3991
3992 #define SLOT_NOT_REQUIRED (-2)
3993 #define SLOT_REQUIRED (-1)
3994
3995 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3996 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3997
3998 /* First mark all the registers that really need to be saved... */
3999 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4000 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4001
4002 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4003 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4004
4005 /* ... that includes the eh data registers (if needed)... */
4006 if (crtl->calls_eh_return)
4007 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4008 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4009 = SLOT_REQUIRED;
4010
4011 /* ... and any callee saved register that dataflow says is live. */
4012 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4013 if (df_regs_ever_live_p (regno)
4014 && (regno == R30_REGNUM
4015 || !call_used_regs[regno]))
4016 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4017
4018 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4019 if (df_regs_ever_live_p (regno)
4020 && !call_used_regs[regno])
4021 {
4022 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4023 last_fp_reg = regno;
4024 }
4025
4026 if (cfun->machine->frame.emit_frame_chain)
4027 {
4028 /* FP and LR are placed in the linkage record. */
4029 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4030 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4031 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4032 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4033 offset = 2 * UNITS_PER_WORD;
4034 }
4035
4036 /* Now assign stack slots for them. */
4037 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4038 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4039 {
4040 cfun->machine->frame.reg_offset[regno] = offset;
4041 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4042 cfun->machine->frame.wb_candidate1 = regno;
4043 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4044 cfun->machine->frame.wb_candidate2 = regno;
4045 offset += UNITS_PER_WORD;
4046 }
4047
4048 HOST_WIDE_INT max_int_offset = offset;
4049 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4050 bool has_align_gap = offset != max_int_offset;
4051
4052 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4053 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4054 {
4055 /* If there is an alignment gap between integer and fp callee-saves,
4056 allocate the last fp register to it if possible. */
4057 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4058 {
4059 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4060 break;
4061 }
4062
4063 cfun->machine->frame.reg_offset[regno] = offset;
4064 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4065 cfun->machine->frame.wb_candidate1 = regno;
4066 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4067 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4068 cfun->machine->frame.wb_candidate2 = regno;
4069 offset += UNITS_PER_WORD;
4070 }
4071
4072 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4073
4074 cfun->machine->frame.saved_regs_size = offset;
4075
4076 HOST_WIDE_INT varargs_and_saved_regs_size
4077 = offset + cfun->machine->frame.saved_varargs_size;
4078
4079 cfun->machine->frame.hard_fp_offset
4080 = aligned_upper_bound (varargs_and_saved_regs_size
4081 + get_frame_size (),
4082 STACK_BOUNDARY / BITS_PER_UNIT);
4083
4084 /* Both these values are already aligned. */
4085 gcc_assert (multiple_p (crtl->outgoing_args_size,
4086 STACK_BOUNDARY / BITS_PER_UNIT));
4087 cfun->machine->frame.frame_size
4088 = (cfun->machine->frame.hard_fp_offset
4089 + crtl->outgoing_args_size);
4090
4091 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4092
4093 cfun->machine->frame.initial_adjust = 0;
4094 cfun->machine->frame.final_adjust = 0;
4095 cfun->machine->frame.callee_adjust = 0;
4096 cfun->machine->frame.callee_offset = 0;
4097
4098 HOST_WIDE_INT max_push_offset = 0;
4099 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4100 max_push_offset = 512;
4101 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4102 max_push_offset = 256;
4103
4104 HOST_WIDE_INT const_size, const_fp_offset;
4105 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4106 && const_size < max_push_offset
4107 && known_eq (crtl->outgoing_args_size, 0))
4108 {
4109 /* Simple, small frame with no outgoing arguments:
4110 stp reg1, reg2, [sp, -frame_size]!
4111 stp reg3, reg4, [sp, 16] */
4112 cfun->machine->frame.callee_adjust = const_size;
4113 }
4114 else if (known_lt (crtl->outgoing_args_size
4115 + cfun->machine->frame.saved_regs_size, 512)
4116 && !(cfun->calls_alloca
4117 && known_lt (cfun->machine->frame.hard_fp_offset,
4118 max_push_offset)))
4119 {
4120 /* Frame with small outgoing arguments:
4121 sub sp, sp, frame_size
4122 stp reg1, reg2, [sp, outgoing_args_size]
4123 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4124 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4125 cfun->machine->frame.callee_offset
4126 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4127 }
4128 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4129 && const_fp_offset < max_push_offset)
4130 {
4131 /* Frame with large outgoing arguments but a small local area:
4132 stp reg1, reg2, [sp, -hard_fp_offset]!
4133 stp reg3, reg4, [sp, 16]
4134 sub sp, sp, outgoing_args_size */
4135 cfun->machine->frame.callee_adjust = const_fp_offset;
4136 cfun->machine->frame.final_adjust
4137 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4138 }
4139 else
4140 {
4141 /* Frame with large local area and outgoing arguments using frame pointer:
4142 sub sp, sp, hard_fp_offset
4143 stp x29, x30, [sp, 0]
4144 add x29, sp, 0
4145 stp reg3, reg4, [sp, 16]
4146 sub sp, sp, outgoing_args_size */
4147 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4148 cfun->machine->frame.final_adjust
4149 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4150 }
4151
4152 cfun->machine->frame.laid_out = true;
4153 }
4154
4155 /* Return true if the register REGNO is saved on entry to
4156 the current function. */
4157
4158 static bool
4159 aarch64_register_saved_on_entry (int regno)
4160 {
4161 return cfun->machine->frame.reg_offset[regno] >= 0;
4162 }
4163
4164 /* Return the next register up from REGNO, up to and including LIMIT,
4165    that the callee needs to save. */
4166
4167 static unsigned
4168 aarch64_next_callee_save (unsigned regno, unsigned limit)
4169 {
4170 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4171 regno ++;
4172 return regno;
4173 }
4174
4175 /* Push the register number REGNO of mode MODE to the stack with write-back
4176 adjusting the stack by ADJUSTMENT. */
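/* A minimal sketch of the expected output, e.g. for x30 with an adjustment
   of 16 bytes:

	str	x30, [sp, #-16]!

   i.e. a pre-indexed store with writeback of the stack pointer.  */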
4177
4178 static void
4179 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4180 HOST_WIDE_INT adjustment)
4181 {
4182 rtx base_rtx = stack_pointer_rtx;
4183 rtx insn, reg, mem;
4184
4185 reg = gen_rtx_REG (mode, regno);
4186 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4187 plus_constant (Pmode, base_rtx, -adjustment));
4188 mem = gen_frame_mem (mode, mem);
4189
4190 insn = emit_move_insn (mem, reg);
4191 RTX_FRAME_RELATED_P (insn) = 1;
4192 }
4193
4194 /* Generate and return an instruction to store the pair of registers
4195 REG and REG2 of mode MODE to location BASE with write-back adjusting
4196 the stack location BASE by ADJUSTMENT. */
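/* For DImode this is expected to assemble to something like

	stp	reg, reg2, [base, #-adjustment]!

   with the second register landing UNITS_PER_WORD above the first.  */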
4197
4198 static rtx
4199 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4200 HOST_WIDE_INT adjustment)
4201 {
4202 switch (mode)
4203 {
4204 case E_DImode:
4205 return gen_storewb_pairdi_di (base, base, reg, reg2,
4206 GEN_INT (-adjustment),
4207 GEN_INT (UNITS_PER_WORD - adjustment));
4208 case E_DFmode:
4209 return gen_storewb_pairdf_di (base, base, reg, reg2,
4210 GEN_INT (-adjustment),
4211 GEN_INT (UNITS_PER_WORD - adjustment));
4212 default:
4213 gcc_unreachable ();
4214 }
4215 }
4216
4217 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4218 stack pointer by ADJUSTMENT. */
4219
4220 static void
4221 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4222 {
4223 rtx_insn *insn;
4224 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4225
4226 if (regno2 == INVALID_REGNUM)
4227 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4228
4229 rtx reg1 = gen_rtx_REG (mode, regno1);
4230 rtx reg2 = gen_rtx_REG (mode, regno2);
4231
4232 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4233 reg2, adjustment));
4234 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4235 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4236 RTX_FRAME_RELATED_P (insn) = 1;
4237 }
4238
4239 /* Load the pair of registers REG and REG2 of mode MODE from the stack
4240    location BASE, adjusting it by ADJUSTMENT afterwards. */
4241
4242 static rtx
4243 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4244 HOST_WIDE_INT adjustment)
4245 {
4246 switch (mode)
4247 {
4248 case E_DImode:
4249 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4250 GEN_INT (UNITS_PER_WORD));
4251 case E_DFmode:
4252 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4253 GEN_INT (UNITS_PER_WORD));
4254 default:
4255 gcc_unreachable ();
4256 }
4257 }
4258
4259 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4260 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4261 into CFI_OPS. */
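/* Sketch of the expected output: for a single register this is a
   post-indexed load such as

	ldr	x30, [sp], #adjustment

   and for a pair

	ldp	reg1, reg2, [sp], #adjustment

   The REG_CFA_RESTORE notes are only accumulated in *CFI_OPS here; the
   caller attaches them to an insn later.  */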
4262
4263 static void
4264 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4265 rtx *cfi_ops)
4266 {
4267 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4268 rtx reg1 = gen_rtx_REG (mode, regno1);
4269
4270 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4271
4272 if (regno2 == INVALID_REGNUM)
4273 {
4274 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4275 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4276 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4277 }
4278 else
4279 {
4280 rtx reg2 = gen_rtx_REG (mode, regno2);
4281 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4282 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4283 reg2, adjustment));
4284 }
4285 }
4286
4287 /* Generate and return a store pair instruction of mode MODE to store
4288 register REG1 to MEM1 and register REG2 to MEM2. */
4289
4290 static rtx
4291 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4292 rtx reg2)
4293 {
4294 switch (mode)
4295 {
4296 case E_DImode:
4297 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4298
4299 case E_DFmode:
4300 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4301
4302 default:
4303 gcc_unreachable ();
4304 }
4305 }
4306
4307 /* Generate and return a load pair instruction of mode MODE to load register
4308    REG1 from MEM1 and register REG2 from MEM2. */
4309
4310 static rtx
4311 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4312 rtx mem2)
4313 {
4314 switch (mode)
4315 {
4316 case E_DImode:
4317 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4318
4319 case E_DFmode:
4320 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4321
4322 default:
4323 gcc_unreachable ();
4324 }
4325 }
4326
4327 /* Return TRUE if return address signing should be enabled for the current
4328 function, otherwise return FALSE. */
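/* For example, under -msign-return-address=all every function is signed,
   while under -msign-return-address=non-leaf a leaf function that never
   saves LR is left unsigned.  */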
4329
4330 bool
4331 aarch64_return_address_signing_enabled (void)
4332 {
4333   /* This function should only be called after the frame is laid out. */
4334 gcc_assert (cfun->machine->frame.laid_out);
4335
4336   /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4337      function if its LR is pushed onto the stack. */
4338 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4339 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4340 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4341 }
4342
4343 /* Emit code to save the callee-saved registers from register number START
4344 to LIMIT to the stack at the location starting at offset START_OFFSET,
4345 skipping any write-back candidates if SKIP_WB is true. */
4346
4347 static void
4348 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4349 unsigned start, unsigned limit, bool skip_wb)
4350 {
4351 rtx_insn *insn;
4352 unsigned regno;
4353 unsigned regno2;
4354
4355 for (regno = aarch64_next_callee_save (start, limit);
4356 regno <= limit;
4357 regno = aarch64_next_callee_save (regno + 1, limit))
4358 {
4359 rtx reg, mem;
4360 poly_int64 offset;
4361
4362 if (skip_wb
4363 && (regno == cfun->machine->frame.wb_candidate1
4364 || regno == cfun->machine->frame.wb_candidate2))
4365 continue;
4366
4367 if (cfun->machine->reg_is_wrapped_separately[regno])
4368 continue;
4369
4370 reg = gen_rtx_REG (mode, regno);
4371 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4372 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4373 offset));
4374
4375 regno2 = aarch64_next_callee_save (regno + 1, limit);
4376
4377 if (regno2 <= limit
4378 && !cfun->machine->reg_is_wrapped_separately[regno2]
4379 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4380 == cfun->machine->frame.reg_offset[regno2]))
4381
4382 {
4383 rtx reg2 = gen_rtx_REG (mode, regno2);
4384 rtx mem2;
4385
4386 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4387 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4388 offset));
4389 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4390 reg2));
4391
4392 /* The first part of a frame-related parallel insn is
4393 always assumed to be relevant to the frame
4394      calculations; subsequent parts are only
4395 frame-related if explicitly marked. */
4396 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4397 regno = regno2;
4398 }
4399 else
4400 insn = emit_move_insn (mem, reg);
4401
4402 RTX_FRAME_RELATED_P (insn) = 1;
4403 }
4404 }
4405
4406 /* Emit code to restore the callee registers of mode MODE from register
4407 number START up to and including LIMIT. Restore from the stack offset
4408 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4409 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4410
4411 static void
4412 aarch64_restore_callee_saves (machine_mode mode,
4413 poly_int64 start_offset, unsigned start,
4414 unsigned limit, bool skip_wb, rtx *cfi_ops)
4415 {
4416 rtx base_rtx = stack_pointer_rtx;
4417 unsigned regno;
4418 unsigned regno2;
4419 poly_int64 offset;
4420
4421 for (regno = aarch64_next_callee_save (start, limit);
4422 regno <= limit;
4423 regno = aarch64_next_callee_save (regno + 1, limit))
4424 {
4425 if (cfun->machine->reg_is_wrapped_separately[regno])
4426 continue;
4427
4428 rtx reg, mem;
4429
4430 if (skip_wb
4431 && (regno == cfun->machine->frame.wb_candidate1
4432 || regno == cfun->machine->frame.wb_candidate2))
4433 continue;
4434
4435 reg = gen_rtx_REG (mode, regno);
4436 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4437 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4438
4439 regno2 = aarch64_next_callee_save (regno + 1, limit);
4440
4441 if (regno2 <= limit
4442 && !cfun->machine->reg_is_wrapped_separately[regno2]
4443 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4444 == cfun->machine->frame.reg_offset[regno2]))
4445 {
4446 rtx reg2 = gen_rtx_REG (mode, regno2);
4447 rtx mem2;
4448
4449 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4450 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4451 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4452
4453 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4454 regno = regno2;
4455 }
4456 else
4457 emit_move_insn (reg, mem);
4458 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4459 }
4460 }
4461
4462 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4463 of MODE. */
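/* For example, for DImode (8-byte) accesses this accepts offsets that are
   multiples of 8 in the range [-64, 56].  */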
4464
4465 static inline bool
4466 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4467 {
4468 HOST_WIDE_INT multiple;
4469 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4470 && IN_RANGE (multiple, -8, 7));
4471 }
4472
4473 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4474 of MODE. */
4475
4476 static inline bool
4477 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4478 {
4479 HOST_WIDE_INT multiple;
4480 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4481 && IN_RANGE (multiple, 0, 63));
4482 }
4483
4484 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4485 of MODE. */
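/* For example, for DImode this accepts multiples of 8 in [-512, 504],
   which matches the immediate range of LDP/STP.  */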
4486
4487 bool
4488 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4489 {
4490 HOST_WIDE_INT multiple;
4491 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4492 && IN_RANGE (multiple, -64, 63));
4493 }
4494
4495 /* Return true if OFFSET is a signed 9-bit value. */
4496
4497 static inline bool
4498 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4499 poly_int64 offset)
4500 {
4501 HOST_WIDE_INT const_offset;
4502 return (offset.is_constant (&const_offset)
4503 && IN_RANGE (const_offset, -256, 255));
4504 }
4505
4506 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4507 of MODE. */
4508
4509 static inline bool
4510 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4511 {
4512 HOST_WIDE_INT multiple;
4513 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4514 && IN_RANGE (multiple, -256, 255));
4515 }
4516
4517 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4518 of MODE. */
4519
4520 static inline bool
4521 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4522 {
4523 HOST_WIDE_INT multiple;
4524 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4525 && IN_RANGE (multiple, 0, 4095));
4526 }
4527
4528 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4529
4530 static sbitmap
4531 aarch64_get_separate_components (void)
4532 {
4533 aarch64_layout_frame ();
4534
4535 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4536 bitmap_clear (components);
4537
4538 /* The registers we need saved to the frame. */
4539 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4540 if (aarch64_register_saved_on_entry (regno))
4541 {
4542 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4543 if (!frame_pointer_needed)
4544 offset += cfun->machine->frame.frame_size
4545 - cfun->machine->frame.hard_fp_offset;
4546 /* Check that we can access the stack slot of the register with one
4547 direct load with no adjustments needed. */
4548 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4549 bitmap_set_bit (components, regno);
4550 }
4551
4552 /* Don't mess with the hard frame pointer. */
4553 if (frame_pointer_needed)
4554 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4555
4556 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4557 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4558 /* If aarch64_layout_frame has chosen registers to store/restore with
4559      writeback, don't interfere with them to avoid having to output explicit
4560 stack adjustment instructions. */
4561 if (reg2 != INVALID_REGNUM)
4562 bitmap_clear_bit (components, reg2);
4563 if (reg1 != INVALID_REGNUM)
4564 bitmap_clear_bit (components, reg1);
4565
4566 bitmap_clear_bit (components, LR_REGNUM);
4567 bitmap_clear_bit (components, SP_REGNUM);
4568
4569 return components;
4570 }
4571
4572 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4573
4574 static sbitmap
4575 aarch64_components_for_bb (basic_block bb)
4576 {
4577 bitmap in = DF_LIVE_IN (bb);
4578 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4579 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4580
4581 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4582 bitmap_clear (components);
4583
4584 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4585 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4586 if ((!call_used_regs[regno])
4587 && (bitmap_bit_p (in, regno)
4588 || bitmap_bit_p (gen, regno)
4589 || bitmap_bit_p (kill, regno)))
4590 {
4591 unsigned regno2, offset, offset2;
4592 bitmap_set_bit (components, regno);
4593
4594 /* If there is a callee-save at an adjacent offset, add it too
4595 to increase the use of LDP/STP. */
4596 offset = cfun->machine->frame.reg_offset[regno];
4597 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4598
4599 if (regno2 <= LAST_SAVED_REGNUM)
4600 {
4601 offset2 = cfun->machine->frame.reg_offset[regno2];
4602 if ((offset & ~8) == (offset2 & ~8))
4603 bitmap_set_bit (components, regno2);
4604 }
4605 }
4606
4607 return components;
4608 }
4609
4610 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4611 Nothing to do for aarch64. */
4612
4613 static void
4614 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4615 {
4616 }
4617
4618 /* Return the next set bit in BMP from START onwards. Return the total number
4619 of bits in BMP if no set bit is found at or after START. */
4620
4621 static unsigned int
4622 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4623 {
4624 unsigned int nbits = SBITMAP_SIZE (bmp);
4625 if (start == nbits)
4626 return start;
4627
4628 gcc_assert (start < nbits);
4629 for (unsigned int i = start; i < nbits; i++)
4630 if (bitmap_bit_p (bmp, i))
4631 return i;
4632
4633 return nbits;
4634 }
4635
4636 /* Do the work for aarch64_emit_prologue_components and
4637 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4638 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4639 for these components or the epilogue sequence. That is, it determines
4640 whether we should emit stores or loads and what kind of CFA notes to attach
4641 to the insns. Otherwise the logic for the two sequences is very
4642 similar. */
4643
4644 static void
4645 aarch64_process_components (sbitmap components, bool prologue_p)
4646 {
4647 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4648 ? HARD_FRAME_POINTER_REGNUM
4649 : STACK_POINTER_REGNUM);
4650
4651 unsigned last_regno = SBITMAP_SIZE (components);
4652 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4653 rtx_insn *insn = NULL;
4654
4655 while (regno != last_regno)
4656 {
4657 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4658 so DFmode for the vector registers is enough. */
4659 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4660 rtx reg = gen_rtx_REG (mode, regno);
4661 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4662 if (!frame_pointer_needed)
4663 offset += cfun->machine->frame.frame_size
4664 - cfun->machine->frame.hard_fp_offset;
4665 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4666 rtx mem = gen_frame_mem (mode, addr);
4667
4668 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4669 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4670 /* No more registers to handle after REGNO.
4671 Emit a single save/restore and exit. */
4672 if (regno2 == last_regno)
4673 {
4674 insn = emit_insn (set);
4675 RTX_FRAME_RELATED_P (insn) = 1;
4676 if (prologue_p)
4677 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4678 else
4679 add_reg_note (insn, REG_CFA_RESTORE, reg);
4680 break;
4681 }
4682
4683 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4684 /* The next register is not of the same class or its offset is not
4685 mergeable with the current one into a pair. */
4686 if (!satisfies_constraint_Ump (mem)
4687 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4688 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4689 GET_MODE_SIZE (mode)))
4690 {
4691 insn = emit_insn (set);
4692 RTX_FRAME_RELATED_P (insn) = 1;
4693 if (prologue_p)
4694 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4695 else
4696 add_reg_note (insn, REG_CFA_RESTORE, reg);
4697
4698 regno = regno2;
4699 continue;
4700 }
4701
4702 /* REGNO2 can be saved/restored in a pair with REGNO. */
4703 rtx reg2 = gen_rtx_REG (mode, regno2);
4704 if (!frame_pointer_needed)
4705 offset2 += cfun->machine->frame.frame_size
4706 - cfun->machine->frame.hard_fp_offset;
4707 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4708 rtx mem2 = gen_frame_mem (mode, addr2);
4709 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4710 : gen_rtx_SET (reg2, mem2);
4711
4712 if (prologue_p)
4713 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4714 else
4715 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4716
4717 RTX_FRAME_RELATED_P (insn) = 1;
4718 if (prologue_p)
4719 {
4720 add_reg_note (insn, REG_CFA_OFFSET, set);
4721 add_reg_note (insn, REG_CFA_OFFSET, set2);
4722 }
4723 else
4724 {
4725 add_reg_note (insn, REG_CFA_RESTORE, reg);
4726 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4727 }
4728
4729 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4730 }
4731 }
4732
4733 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4734
4735 static void
4736 aarch64_emit_prologue_components (sbitmap components)
4737 {
4738 aarch64_process_components (components, true);
4739 }
4740
4741 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4742
4743 static void
4744 aarch64_emit_epilogue_components (sbitmap components)
4745 {
4746 aarch64_process_components (components, false);
4747 }
4748
4749 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4750
4751 static void
4752 aarch64_set_handled_components (sbitmap components)
4753 {
4754 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4755 if (bitmap_bit_p (components, regno))
4756 cfun->machine->reg_is_wrapped_separately[regno] = true;
4757 }
4758
4759 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4760 is saved at BASE + OFFSET. */
4761
4762 static void
4763 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4764 rtx base, poly_int64 offset)
4765 {
4766 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4767 add_reg_note (insn, REG_CFA_EXPRESSION,
4768 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4769 }
4770
4771 /* AArch64 stack frames generated by this compiler look like:
4772
4773 +-------------------------------+
4774 | |
4775 | incoming stack arguments |
4776 | |
4777 +-------------------------------+
4778 | | <-- incoming stack pointer (aligned)
4779 | callee-allocated save area |
4780 | for register varargs |
4781 | |
4782 +-------------------------------+
4783 | local variables | <-- frame_pointer_rtx
4784 | |
4785 +-------------------------------+
4786 | padding0 | \
4787 +-------------------------------+ |
4788 | callee-saved registers | | frame.saved_regs_size
4789 +-------------------------------+ |
4790 | LR' | |
4791 +-------------------------------+ |
4792 | FP' | / <- hard_frame_pointer_rtx (aligned)
4793 +-------------------------------+
4794 | dynamic allocation |
4795 +-------------------------------+
4796 | padding |
4797 +-------------------------------+
4798 | outgoing stack arguments | <-- arg_pointer
4799 | |
4800 +-------------------------------+
4801 | | <-- stack_pointer_rtx (aligned)
4802
4803 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4804 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4805 unchanged. */
4806
4807 /* Generate the prologue instructions for entry into a function.
4808 Establish the stack frame by decreasing the stack pointer with a
4809 properly calculated size and, if necessary, create a frame record
4810 filled with the values of LR and previous frame pointer. The
4811 current FP is also set up if it is in use. */
4812
4813 void
4814 aarch64_expand_prologue (void)
4815 {
4816 aarch64_layout_frame ();
4817
4818 poly_int64 frame_size = cfun->machine->frame.frame_size;
4819 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4820 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4821 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4822 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4823 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4824 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4825 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4826 rtx_insn *insn;
4827
4828 /* Sign return address for functions. */
4829 if (aarch64_return_address_signing_enabled ())
4830 {
4831 insn = emit_insn (gen_pacisp ());
4832 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4833 RTX_FRAME_RELATED_P (insn) = 1;
4834 }
4835
4836 if (flag_stack_usage_info)
4837 current_function_static_stack_size = constant_lower_bound (frame_size);
4838
4839 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4840 {
4841 if (crtl->is_leaf && !cfun->calls_alloca)
4842 {
4843 if (maybe_gt (frame_size, PROBE_INTERVAL)
4844 && maybe_gt (frame_size, get_stack_check_protect ()))
4845 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4846 (frame_size
4847 - get_stack_check_protect ()));
4848 }
4849 else if (maybe_gt (frame_size, 0))
4850 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4851 }
4852
4853 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4854 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4855
4856 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4857
4858 if (callee_adjust != 0)
4859 aarch64_push_regs (reg1, reg2, callee_adjust);
4860
4861 if (emit_frame_chain)
4862 {
4863 poly_int64 reg_offset = callee_adjust;
4864 if (callee_adjust == 0)
4865 {
4866 reg1 = R29_REGNUM;
4867 reg2 = R30_REGNUM;
4868 reg_offset = callee_offset;
4869 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4870 }
4871 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4872 stack_pointer_rtx, callee_offset,
4873 ip1_rtx, ip0_rtx, frame_pointer_needed);
4874 if (frame_pointer_needed && !frame_size.is_constant ())
4875 {
4876 /* Variable-sized frames need to describe the save slot
4877 address using DW_CFA_expression rather than DW_CFA_offset.
4878 This means that, without taking further action, the
4879 locations of the registers that we've already saved would
4880 remain based on the stack pointer even after we redefine
4881 the CFA based on the frame pointer. We therefore need new
4882 DW_CFA_expressions to re-express the save slots with addresses
4883 based on the frame pointer. */
4884 rtx_insn *insn = get_last_insn ();
4885 gcc_assert (RTX_FRAME_RELATED_P (insn));
4886
4887 /* Add an explicit CFA definition if this was previously
4888 implicit. */
4889 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4890 {
4891 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4892 callee_offset);
4893 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4894 gen_rtx_SET (hard_frame_pointer_rtx, src));
4895 }
4896
4897 /* Change the save slot expressions for the registers that
4898 we've already saved. */
4899 reg_offset -= callee_offset;
4900 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4901 reg_offset + UNITS_PER_WORD);
4902 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4903 reg_offset);
4904 }
4905 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4906 }
4907
4908 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4909 callee_adjust != 0 || emit_frame_chain);
4910 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4911 callee_adjust != 0 || emit_frame_chain);
4912 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4913 }
4914
4915 /* Return TRUE if we can use a simple_return insn.
4916
4917    This function checks whether the callee-saved stack is empty, which
4918    means no restore actions are needed.  The pro_and_epilogue pass uses
4919    this to check whether the shrink-wrapping optimization is feasible. */
4920
4921 bool
4922 aarch64_use_return_insn_p (void)
4923 {
4924 if (!reload_completed)
4925 return false;
4926
4927 if (crtl->profile)
4928 return false;
4929
4930 aarch64_layout_frame ();
4931
4932 return known_eq (cfun->machine->frame.frame_size, 0);
4933 }
4934
4935 /* Generate the epilogue instructions for returning from a function.
4936    This is almost exactly the reverse of the prologue sequence, except
4937 that we need to insert barriers to avoid scheduling loads that read
4938 from a deallocated stack, and we optimize the unwind records by
4939 emitting them all together if possible. */
4940 void
4941 aarch64_expand_epilogue (bool for_sibcall)
4942 {
4943 aarch64_layout_frame ();
4944
4945 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4946 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4947 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4948 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4949 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4950 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4951 rtx cfi_ops = NULL;
4952 rtx_insn *insn;
4953 /* A stack clash protection prologue may not have left IP0_REGNUM or
4954 IP1_REGNUM in a usable state. The same is true for allocations
4955 with an SVE component, since we then need both temporary registers
4956 for each allocation. */
4957 bool can_inherit_p = (initial_adjust.is_constant ()
4958 && final_adjust.is_constant ()
4959 && !flag_stack_clash_protection);
4960
4961   /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4962 bool need_barrier_p
4963 = maybe_ne (get_frame_size ()
4964 + cfun->machine->frame.saved_varargs_size, 0);
4965
4966 /* Emit a barrier to prevent loads from a deallocated stack. */
4967 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4968 || cfun->calls_alloca
4969 || crtl->calls_eh_return)
4970 {
4971 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4972 need_barrier_p = false;
4973 }
4974
4975 /* Restore the stack pointer from the frame pointer if it may not
4976 be the same as the stack pointer. */
4977 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4978 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4979 if (frame_pointer_needed
4980 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4981 /* If writeback is used when restoring callee-saves, the CFA
4982 is restored on the instruction doing the writeback. */
4983 aarch64_add_offset (Pmode, stack_pointer_rtx,
4984 hard_frame_pointer_rtx, -callee_offset,
4985 ip1_rtx, ip0_rtx, callee_adjust == 0);
4986 else
4987 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4988 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4989
4990 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4991 callee_adjust != 0, &cfi_ops);
4992 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4993 callee_adjust != 0, &cfi_ops);
4994
4995 if (need_barrier_p)
4996 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4997
4998 if (callee_adjust != 0)
4999 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5000
5001 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5002 {
5003 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5004 insn = get_last_insn ();
5005 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5006 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5007 RTX_FRAME_RELATED_P (insn) = 1;
5008 cfi_ops = NULL;
5009 }
5010
5011 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5012 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5013
5014 if (cfi_ops)
5015 {
5016 /* Emit delayed restores and reset the CFA to be SP. */
5017 insn = get_last_insn ();
5018 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5019 REG_NOTES (insn) = cfi_ops;
5020 RTX_FRAME_RELATED_P (insn) = 1;
5021 }
5022
5023 /* We prefer to emit the combined return/authenticate instruction RETAA,
5024 however there are three cases in which we must instead emit an explicit
5025 authentication instruction.
5026
5027 1) Sibcalls don't return in a normal way, so if we're about to call one
5028 we must authenticate.
5029
5030 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5031 generating code for !TARGET_ARMV8_3 we can't use it and must
5032 explicitly authenticate.
5033
5034 3) On an eh_return path we make extra stack adjustments to update the
5035 canonical frame address to be the exception handler's CFA. We want
5036 to authenticate using the CFA of the function which calls eh_return.
5037 */
5038 if (aarch64_return_address_signing_enabled ()
5039 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5040 {
5041 insn = emit_insn (gen_autisp ());
5042 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5043 RTX_FRAME_RELATED_P (insn) = 1;
5044 }
5045
5046 /* Stack adjustment for exception handler. */
5047 if (crtl->calls_eh_return)
5048 {
5049 /* We need to unwind the stack by the offset computed by
5050 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5051 to be SP; letting the CFA move during this adjustment
5052 is just as correct as retaining the CFA from the body
5053 of the function. Therefore, do nothing special. */
5054 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5055 }
5056
5057 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5058 if (!for_sibcall)
5059 emit_jump_insn (ret_rtx);
5060 }
5061
5062 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5063 normally or return to a previous frame after unwinding.
5064
5065 An EH return uses a single shared return sequence. The epilogue is
5066 exactly like a normal epilogue except that it has an extra input
5067 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5068 that must be applied after the frame has been destroyed. An extra label
5069 is inserted before the epilogue which initializes this register to zero,
5070 and this is the entry point for a normal return.
5071
5072 An actual EH return updates the return address, initializes the stack
5073 adjustment and jumps directly into the epilogue (bypassing the zeroing
5074 of the adjustment). Since the return address is typically saved on the
5075 stack when a function makes a call, the saved LR must be updated outside
5076 the epilogue.
5077
5078 This poses problems as the store is generated well before the epilogue,
5079 so the offset of LR is not known yet. Also optimizations will remove the
5080 store as it appears dead, even after the epilogue is generated (as the
5081 base or offset for loading LR is different in many cases).
5082
5083 To avoid these problems this implementation forces the frame pointer
5084 in eh_return functions so that the location of LR is fixed and known early.
5085 It also marks the store volatile, so no optimization is permitted to
5086 remove the store. */
5087 rtx
5088 aarch64_eh_return_handler_rtx (void)
5089 {
5090 rtx tmp = gen_frame_mem (Pmode,
5091 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5092
5093 /* Mark the store volatile, so no optimization is permitted to remove it. */
5094 MEM_VOLATILE_P (tmp) = true;
5095 return tmp;
5096 }
5097
5098 /* Output code to add DELTA to the first argument, and then jump
5099 to FUNCTION. Used for C++ multiple inheritance. */
5100 static void
5101 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5102 HOST_WIDE_INT delta,
5103 HOST_WIDE_INT vcall_offset,
5104 tree function)
5105 {
5106 /* The this pointer is always in x0. Note that this differs from
5107      Arm, where the this pointer may be bumped to r1 if r0 is required
5108 to return a pointer to an aggregate. On AArch64 a result value
5109 pointer will be in x8. */
5110 int this_regno = R0_REGNUM;
5111 rtx this_rtx, temp0, temp1, addr, funexp;
5112 rtx_insn *insn;
5113
5114 reload_completed = 1;
5115 emit_note (NOTE_INSN_PROLOGUE_END);
5116
5117 this_rtx = gen_rtx_REG (Pmode, this_regno);
5118 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5119 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5120
5121 if (vcall_offset == 0)
5122 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5123 else
5124 {
5125 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5126
5127 addr = this_rtx;
5128 if (delta != 0)
5129 {
5130 if (delta >= -256 && delta < 256)
5131 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5132 plus_constant (Pmode, this_rtx, delta));
5133 else
5134 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5135 temp1, temp0, false);
5136 }
5137
5138 if (Pmode == ptr_mode)
5139 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5140 else
5141 aarch64_emit_move (temp0,
5142 gen_rtx_ZERO_EXTEND (Pmode,
5143 gen_rtx_MEM (ptr_mode, addr)));
5144
5145 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5146 addr = plus_constant (Pmode, temp0, vcall_offset);
5147 else
5148 {
5149 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5150 Pmode);
5151 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5152 }
5153
5154 if (Pmode == ptr_mode)
5155 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5156 else
5157 aarch64_emit_move (temp1,
5158 gen_rtx_SIGN_EXTEND (Pmode,
5159 gen_rtx_MEM (ptr_mode, addr)));
5160
5161 emit_insn (gen_add2_insn (this_rtx, temp1));
5162 }
5163
5164 /* Generate a tail call to the target function. */
5165 if (!TREE_USED (function))
5166 {
5167 assemble_external (function);
5168 TREE_USED (function) = 1;
5169 }
5170 funexp = XEXP (DECL_RTL (function), 0);
5171 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5172 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5173 SIBLING_CALL_P (insn) = 1;
5174
5175 insn = get_insns ();
5176 shorten_branches (insn);
5177 final_start_function (insn, file, 1);
5178 final (insn, file, 1);
5179 final_end_function ();
5180
5181 /* Stop pretending to be a post-reload pass. */
5182 reload_completed = 0;
5183 }
5184
5185 static bool
5186 aarch64_tls_referenced_p (rtx x)
5187 {
5188 if (!TARGET_HAVE_TLS)
5189 return false;
5190 subrtx_iterator::array_type array;
5191 FOR_EACH_SUBRTX (iter, array, x, ALL)
5192 {
5193 const_rtx x = *iter;
5194 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5195 return true;
5196 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5197 TLS offsets, not real symbol references. */
5198 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5199 iter.skip_subrtxes ();
5200 }
5201 return false;
5202 }
5203
5204
5205 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5206 a left shift of 0 or 12 bits. */
5207 bool
5208 aarch64_uimm12_shift (HOST_WIDE_INT val)
5209 {
5210 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5211 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5212 );
5213 }
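
/* An illustrative sketch of what the predicate above accepts:

     aarch64_uimm12_shift (0xabc)     -> true  (ADD/SUB #0xabc, LSL #0)
     aarch64_uimm12_shift (0xabc000)  -> true  (ADD/SUB #0xabc, LSL #12)
     aarch64_uimm12_shift (0x1abc)    -> false (straddles both halves)

   Values that fail this test have to be split, or moved into a temporary
   register, before they can be used as an ADD/SUB operand.  */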
5214
5215
5216 /* Return true if val is an immediate that can be loaded into a
5217 register by a MOVZ instruction. */
5218 static bool
5219 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5220 {
5221 if (GET_MODE_SIZE (mode) > 4)
5222 {
5223 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5224 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5225 	return true;
5226 }
5227 else
5228 {
5229 /* Ignore sign extension. */
5230 val &= (HOST_WIDE_INT) 0xffffffff;
5231 }
5232 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5233 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5234 }
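
/* Illustrative examples for 64-bit values: a value is accepted when it is a
   single 16-bit chunk aligned to a 16-bit boundary, i.e. something that MOVZ
   can materialise directly:

     aarch64_movw_imm (0xbeef, DImode)          -> true  (MOVZ #0xbeef)
     aarch64_movw_imm (0xbeef0000, DImode)      -> true  (MOVZ #0xbeef, LSL #16)
     aarch64_movw_imm (0xdead00000000, DImode)  -> true  (MOVZ #0xdead, LSL #32)
     aarch64_movw_imm (0x12345678, DImode)      -> false (needs MOVZ+MOVK)

   aarch64_move_imm below also tries the inverted value (MOVN) and the
   bitmask-immediate form (ORR) before giving up.  */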
5235
5236 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5237 64-bit (DImode) integer. */
5238
5239 static unsigned HOST_WIDE_INT
5240 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5241 {
5242 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5243 while (size < 64)
5244 {
5245 val &= (HOST_WIDE_INT_1U << size) - 1;
5246 val |= val << size;
5247 size *= 2;
5248 }
5249 return val;
5250 }
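
/* For example, replicating the HImode value 0x00ff gives 0x00ff00ff00ff00ff
   and replicating the QImode value 0xab gives 0xabababababababab, while
   DImode values are returned unchanged.  The result is what the 64-bit
   bitmask-immediate test below operates on.  */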
5251
5252 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5253
5254 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5255 {
5256 0x0000000100000001ull,
5257 0x0001000100010001ull,
5258 0x0101010101010101ull,
5259 0x1111111111111111ull,
5260 0x5555555555555555ull,
5261 };
5262
5263
5264 /* Return true if val is a valid bitmask immediate. */
5265
5266 bool
5267 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5268 {
5269 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5270 int bits;
5271
5272 /* Check for a single sequence of one bits and return quickly if so.
5273      The special cases of all ones and all zeroes return false. */
5274 val = aarch64_replicate_bitmask_imm (val_in, mode);
5275 tmp = val + (val & -val);
5276
5277 if (tmp == (tmp & -tmp))
5278 return (val + 1) > 1;
5279
5280 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5281 if (mode == SImode)
5282 val = (val << 32) | (val & 0xffffffff);
5283
5284 /* Invert if the immediate doesn't start with a zero bit - this means we
5285 only need to search for sequences of one bits. */
5286 if (val & 1)
5287 val = ~val;
5288
5289 /* Find the first set bit and set tmp to val with the first sequence of one
5290 bits removed. Return success if there is a single sequence of ones. */
5291 first_one = val & -val;
5292 tmp = val & (val + first_one);
5293
5294 if (tmp == 0)
5295 return true;
5296
5297 /* Find the next set bit and compute the difference in bit position. */
5298 next_one = tmp & -tmp;
5299 bits = clz_hwi (first_one) - clz_hwi (next_one);
5300 mask = val ^ tmp;
5301
5302 /* Check the bit position difference is a power of 2, and that the first
5303 sequence of one bits fits within 'bits' bits. */
5304 if ((mask >> bits) != 0 || bits != (bits & -bits))
5305 return false;
5306
5307 /* Check the sequence of one bits is repeated 64/bits times. */
5308 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5309 }
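
/* A rough illustration of what the test above accepts: a logical (bitmask)
   immediate is a repeating pattern of 2, 4, 8, 16, 32 or 64 bits in which
   each element is a rotated contiguous run of ones, excluding the all-zeros
   and all-ones values.  For example:

     0x000000000000ff00   single run of ones              -> valid
     0x0f0f0f0f0f0f0f0f   8-bit element 0x0f, repeated    -> valid
     0x0000ffff0000ffff   32-bit element, repeated        -> valid
     0x000000000000ff01   two separate runs, no repeat    -> not valid

   These are the immediates that AND/ORR/EOR (and MOV via ORR) can encode
   directly.  */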
5310
5311 /* Create a mask of ones covering the range from the lowest to the highest
5312    set bit of VAL_IN.  Assumed precondition: VAL_IN is not zero. */
5313
5314 unsigned HOST_WIDE_INT
5315 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5316 {
5317 int lowest_bit_set = ctz_hwi (val_in);
5318 int highest_bit_set = floor_log2 (val_in);
5319 gcc_assert (val_in != 0);
5320
5321 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5322 (HOST_WIDE_INT_1U << lowest_bit_set));
5323 }
5324
5325 /* Create a constant in which all bits outside the range from the lowest set
5326    bit to the highest set bit of VAL_IN are set to 1. */
5327
5328 unsigned HOST_WIDE_INT
5329 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5330 {
5331 return val_in | ~aarch64_and_split_imm1 (val_in);
5332 }
5333
5334 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5335
5336 bool
5337 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5338 {
5339 scalar_int_mode int_mode;
5340 if (!is_a <scalar_int_mode> (mode, &int_mode))
5341 return false;
5342
5343 if (aarch64_bitmask_imm (val_in, int_mode))
5344 return false;
5345
5346 if (aarch64_move_imm (val_in, int_mode))
5347 return false;
5348
5349 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5350
5351 return aarch64_bitmask_imm (imm2, int_mode);
5352 }
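
/* Worked example of the split that the test above enables: for
   val_in = 0x3ffff1 (neither a bitmask nor a MOV immediate),

     imm1 = aarch64_and_split_imm1 (0x3ffff1) = 0x3fffff
     imm2 = aarch64_and_split_imm2 (0x3ffff1) = 0xfffffffffffffff1

   Both imm1 and imm2 are valid bitmask immediates and imm1 & imm2 equals
   val_in again, so an AND with #0x3ffff1 can be emitted as two ANDs with
   encodable immediates.  */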
5353
5354 /* Return true if val is an immediate that can be loaded into a
5355 register in a single instruction. */
5356 bool
5357 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5358 {
5359 scalar_int_mode int_mode;
5360 if (!is_a <scalar_int_mode> (mode, &int_mode))
5361 return false;
5362
5363 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5364     return true;
5365 return aarch64_bitmask_imm (val, int_mode);
5366 }
5367
5368 static bool
5369 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5370 {
5371 rtx base, offset;
5372
5373 if (GET_CODE (x) == HIGH)
5374 return true;
5375
5376 /* There's no way to calculate VL-based values using relocations. */
5377 subrtx_iterator::array_type array;
5378 FOR_EACH_SUBRTX (iter, array, x, ALL)
5379 if (GET_CODE (*iter) == CONST_POLY_INT)
5380 return true;
5381
5382 split_const (x, &base, &offset);
5383 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5384 {
5385 if (aarch64_classify_symbol (base, INTVAL (offset))
5386 != SYMBOL_FORCE_TO_MEM)
5387 return true;
5388 else
5389 /* Avoid generating a 64-bit relocation in ILP32; leave
5390 to aarch64_expand_mov_immediate to handle it properly. */
5391 return mode != ptr_mode;
5392 }
5393
5394 return aarch64_tls_referenced_p (x);
5395 }
5396
5397 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5398 The expansion for a table switch is quite expensive due to the number
5399    of instructions, the table lookup and the hard-to-predict indirect jump.
5400    When optimizing for speed with -O3 enabled, use the per-core tuning if it is
5401    set; otherwise use tables for more than 16 cases as a trade-off between size and
5402 performance. When optimizing for size, use the default setting. */
5403
5404 static unsigned int
5405 aarch64_case_values_threshold (void)
5406 {
5407 /* Use the specified limit for the number of cases before using jump
5408 tables at higher optimization levels. */
5409 if (optimize > 2
5410 && selected_cpu->tune->max_case_values != 0)
5411 return selected_cpu->tune->max_case_values;
5412 else
5413 return optimize_size ? default_case_values_threshold () : 17;
5414 }
5415
5416 /* Return true if register REGNO is a valid index register.
5417 STRICT_P is true if REG_OK_STRICT is in effect. */
5418
5419 bool
5420 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5421 {
5422 if (!HARD_REGISTER_NUM_P (regno))
5423 {
5424 if (!strict_p)
5425 return true;
5426
5427 if (!reg_renumber)
5428 return false;
5429
5430 regno = reg_renumber[regno];
5431 }
5432 return GP_REGNUM_P (regno);
5433 }
5434
5435 /* Return true if register REGNO is a valid base register.
5436 STRICT_P is true if REG_OK_STRICT is in effect. */
5437
5438 bool
5439 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5440 {
5441 if (!HARD_REGISTER_NUM_P (regno))
5442 {
5443 if (!strict_p)
5444 return true;
5445
5446 if (!reg_renumber)
5447 return false;
5448
5449 regno = reg_renumber[regno];
5450 }
5451
5452 /* The fake registers will be eliminated to either the stack or
5453 hard frame pointer, both of which are usually valid base registers.
5454 Reload deals with the cases where the eliminated form isn't valid. */
5455 return (GP_REGNUM_P (regno)
5456 || regno == SP_REGNUM
5457 || regno == FRAME_POINTER_REGNUM
5458 || regno == ARG_POINTER_REGNUM);
5459 }
5460
5461 /* Return true if X is a valid base register.
5462 STRICT_P is true if REG_OK_STRICT is in effect. */
5463
5464 static bool
5465 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5466 {
5467 if (!strict_p
5468 && GET_CODE (x) == SUBREG
5469 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5470 x = SUBREG_REG (x);
5471
5472 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5473 }
5474
5475 /* Return true if address offset is a valid index. If it is, fill in INFO
5476 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5477
5478 static bool
5479 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5480 machine_mode mode, bool strict_p)
5481 {
5482 enum aarch64_address_type type;
5483 rtx index;
5484 int shift;
5485
5486 /* (reg:P) */
5487 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5488 && GET_MODE (x) == Pmode)
5489 {
5490 type = ADDRESS_REG_REG;
5491 index = x;
5492 shift = 0;
5493 }
5494 /* (sign_extend:DI (reg:SI)) */
5495 else if ((GET_CODE (x) == SIGN_EXTEND
5496 || GET_CODE (x) == ZERO_EXTEND)
5497 && GET_MODE (x) == DImode
5498 && GET_MODE (XEXP (x, 0)) == SImode)
5499 {
5500 type = (GET_CODE (x) == SIGN_EXTEND)
5501 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5502 index = XEXP (x, 0);
5503 shift = 0;
5504 }
5505 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5506 else if (GET_CODE (x) == MULT
5507 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5508 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5509 && GET_MODE (XEXP (x, 0)) == DImode
5510 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5511 && CONST_INT_P (XEXP (x, 1)))
5512 {
5513 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5514 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5515 index = XEXP (XEXP (x, 0), 0);
5516 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5517 }
5518 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5519 else if (GET_CODE (x) == ASHIFT
5520 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5521 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5522 && GET_MODE (XEXP (x, 0)) == DImode
5523 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5524 && CONST_INT_P (XEXP (x, 1)))
5525 {
5526 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5527 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5528 index = XEXP (XEXP (x, 0), 0);
5529 shift = INTVAL (XEXP (x, 1));
5530 }
5531 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5532 else if ((GET_CODE (x) == SIGN_EXTRACT
5533 || GET_CODE (x) == ZERO_EXTRACT)
5534 && GET_MODE (x) == DImode
5535 && GET_CODE (XEXP (x, 0)) == MULT
5536 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5537 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5538 {
5539 type = (GET_CODE (x) == SIGN_EXTRACT)
5540 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5541 index = XEXP (XEXP (x, 0), 0);
5542 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5543 if (INTVAL (XEXP (x, 1)) != 32 + shift
5544 || INTVAL (XEXP (x, 2)) != 0)
5545 shift = -1;
5546 }
5547 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5548 (const_int 0xffffffff<<shift)) */
5549 else if (GET_CODE (x) == AND
5550 && GET_MODE (x) == DImode
5551 && GET_CODE (XEXP (x, 0)) == MULT
5552 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5553 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5554 && CONST_INT_P (XEXP (x, 1)))
5555 {
5556 type = ADDRESS_REG_UXTW;
5557 index = XEXP (XEXP (x, 0), 0);
5558 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5559 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5560 shift = -1;
5561 }
5562 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5563 else if ((GET_CODE (x) == SIGN_EXTRACT
5564 || GET_CODE (x) == ZERO_EXTRACT)
5565 && GET_MODE (x) == DImode
5566 && GET_CODE (XEXP (x, 0)) == ASHIFT
5567 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5568 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5569 {
5570 type = (GET_CODE (x) == SIGN_EXTRACT)
5571 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5572 index = XEXP (XEXP (x, 0), 0);
5573 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5574 if (INTVAL (XEXP (x, 1)) != 32 + shift
5575 || INTVAL (XEXP (x, 2)) != 0)
5576 shift = -1;
5577 }
5578 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5579 (const_int 0xffffffff<<shift)) */
5580 else if (GET_CODE (x) == AND
5581 && GET_MODE (x) == DImode
5582 && GET_CODE (XEXP (x, 0)) == ASHIFT
5583 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5584 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5585 && CONST_INT_P (XEXP (x, 1)))
5586 {
5587 type = ADDRESS_REG_UXTW;
5588 index = XEXP (XEXP (x, 0), 0);
5589 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5590 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5591 shift = -1;
5592 }
5593 /* (mult:P (reg:P) (const_int scale)) */
5594 else if (GET_CODE (x) == MULT
5595 && GET_MODE (x) == Pmode
5596 && GET_MODE (XEXP (x, 0)) == Pmode
5597 && CONST_INT_P (XEXP (x, 1)))
5598 {
5599 type = ADDRESS_REG_REG;
5600 index = XEXP (x, 0);
5601 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5602 }
5603 /* (ashift:P (reg:P) (const_int shift)) */
5604 else if (GET_CODE (x) == ASHIFT
5605 && GET_MODE (x) == Pmode
5606 && GET_MODE (XEXP (x, 0)) == Pmode
5607 && CONST_INT_P (XEXP (x, 1)))
5608 {
5609 type = ADDRESS_REG_REG;
5610 index = XEXP (x, 0);
5611 shift = INTVAL (XEXP (x, 1));
5612 }
5613 else
5614 return false;
5615
5616 if (!strict_p
5617 && GET_CODE (index) == SUBREG
5618 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5619 index = SUBREG_REG (index);
5620
5621 if (aarch64_sve_data_mode_p (mode))
5622 {
5623 if (type != ADDRESS_REG_REG
5624 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5625 return false;
5626 }
5627 else
5628 {
5629 if (shift != 0
5630 && !(IN_RANGE (shift, 1, 3)
5631 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5632 return false;
5633 }
5634
5635 if (REG_P (index)
5636 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5637 {
5638 info->type = type;
5639 info->offset = index;
5640 info->shift = shift;
5641 return true;
5642 }
5643
5644 return false;
5645 }
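
/* A few illustrative index operands and their classifications, assuming a
   DImode (8-byte) access:

     (reg:DI x1)                                 ADDRESS_REG_REG,  shift 0
     (ashift:DI (reg:DI x1) (const_int 3))       ADDRESS_REG_REG,  shift 3
     (mult:DI (reg:DI x1) (const_int 8))         ADDRESS_REG_REG,  shift 3
     (sign_extend:DI (reg:SI w1))                ADDRESS_REG_SXTW, shift 0
     (mult:DI (sign_extend:DI (reg:SI w1))
	      (const_int 8))                     ADDRESS_REG_SXTW, shift 3

   corresponding to [base, x1], [base, x1, lsl #3] and [base, w1, sxtw #3]
   style addresses.  */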
5646
5647 /* Return true if MODE is one of the modes for which we
5648 support LDP/STP operations. */
5649
5650 static bool
5651 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5652 {
5653 return mode == SImode || mode == DImode
5654 || mode == SFmode || mode == DFmode
5655 || (aarch64_vector_mode_supported_p (mode)
5656 && known_eq (GET_MODE_SIZE (mode), 8));
5657 }
5658
5659 /* Return true if REGNO is a virtual pointer register, or an eliminable
5660 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5661 include stack_pointer or hard_frame_pointer. */
5662 static bool
5663 virt_or_elim_regno_p (unsigned regno)
5664 {
5665 return ((regno >= FIRST_VIRTUAL_REGISTER
5666 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5667 || regno == FRAME_POINTER_REGNUM
5668 || regno == ARG_POINTER_REGNUM);
5669 }
5670
5671 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5672 If it is, fill in INFO appropriately. STRICT_P is true if
5673 REG_OK_STRICT is in effect. */
5674
5675 static bool
5676 aarch64_classify_address (struct aarch64_address_info *info,
5677 rtx x, machine_mode mode, bool strict_p,
5678 aarch64_addr_query_type type = ADDR_QUERY_M)
5679 {
5680 enum rtx_code code = GET_CODE (x);
5681 rtx op0, op1;
5682 poly_int64 offset;
5683
5684 HOST_WIDE_INT const_size;
5685
5686 /* On BE, we use load/store pair for all large int mode load/stores.
5687 TI/TFmode may also use a load/store pair. */
5688 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5689 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5690 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5691 || mode == TImode
5692 || mode == TFmode
5693 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5694
5695 bool allow_reg_index_p = (!load_store_pair_p
5696 && (known_lt (GET_MODE_SIZE (mode), 16)
5697 || vec_flags == VEC_ADVSIMD
5698 || vec_flags == VEC_SVE_DATA));
5699
5700 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5701 [Rn, #offset, MUL VL]. */
5702 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5703 && (code != REG && code != PLUS))
5704 return false;
5705
5706 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5707 REG addressing. */
5708 if (advsimd_struct_p
5709 && !BYTES_BIG_ENDIAN
5710 && (code != POST_INC && code != REG))
5711 return false;
5712
5713 gcc_checking_assert (GET_MODE (x) == VOIDmode
5714 || SCALAR_INT_MODE_P (GET_MODE (x)));
5715
5716 switch (code)
5717 {
5718 case REG:
5719 case SUBREG:
5720 info->type = ADDRESS_REG_IMM;
5721 info->base = x;
5722 info->offset = const0_rtx;
5723 info->const_offset = 0;
5724 return aarch64_base_register_rtx_p (x, strict_p);
5725
5726 case PLUS:
5727 op0 = XEXP (x, 0);
5728 op1 = XEXP (x, 1);
5729
5730 if (! strict_p
5731 && REG_P (op0)
5732 && virt_or_elim_regno_p (REGNO (op0))
5733 && poly_int_rtx_p (op1, &offset))
5734 {
5735 info->type = ADDRESS_REG_IMM;
5736 info->base = op0;
5737 info->offset = op1;
5738 info->const_offset = offset;
5739
5740 return true;
5741 }
5742
5743 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5744 && aarch64_base_register_rtx_p (op0, strict_p)
5745 && poly_int_rtx_p (op1, &offset))
5746 {
5747 info->type = ADDRESS_REG_IMM;
5748 info->base = op0;
5749 info->offset = op1;
5750 info->const_offset = offset;
5751
5752 /* TImode and TFmode values are allowed in both pairs of X
5753 registers and individual Q registers. The available
5754 address modes are:
5755 X,X: 7-bit signed scaled offset
5756 Q: 9-bit signed offset
5757 	     We conservatively require an offset representable in both forms.
5758 When performing the check for pairs of X registers i.e. LDP/STP
5759 pass down DImode since that is the natural size of the LDP/STP
5760 instruction memory accesses. */
5761 if (mode == TImode || mode == TFmode)
5762 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5763 && (offset_9bit_signed_unscaled_p (mode, offset)
5764 || offset_12bit_unsigned_scaled_p (mode, offset)));
5765
5766 	  /* A 7-bit offset check because OImode will emit an ldp/stp
5767 instruction (only big endian will get here).
5768 For ldp/stp instructions, the offset is scaled for the size of a
5769 single element of the pair. */
5770 if (mode == OImode)
5771 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5772
5773 	  /* Three 9/12-bit offset checks because CImode will emit three
5774 ldr/str instructions (only big endian will get here). */
5775 if (mode == CImode)
5776 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5777 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5778 || offset_12bit_unsigned_scaled_p (V16QImode,
5779 offset + 32)));
5780
5781 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
5782 instructions (only big endian will get here). */
5783 if (mode == XImode)
5784 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5785 && aarch64_offset_7bit_signed_scaled_p (TImode,
5786 offset + 32));
5787
5788 /* Make "m" use the LD1 offset range for SVE data modes, so
5789 that pre-RTL optimizers like ivopts will work to that
5790 instead of the wider LDR/STR range. */
5791 if (vec_flags == VEC_SVE_DATA)
5792 return (type == ADDR_QUERY_M
5793 ? offset_4bit_signed_scaled_p (mode, offset)
5794 : offset_9bit_signed_scaled_p (mode, offset));
5795
5796 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5797 {
5798 poly_int64 end_offset = (offset
5799 + GET_MODE_SIZE (mode)
5800 - BYTES_PER_SVE_VECTOR);
5801 return (type == ADDR_QUERY_M
5802 ? offset_4bit_signed_scaled_p (mode, offset)
5803 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5804 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5805 end_offset)));
5806 }
5807
5808 if (vec_flags == VEC_SVE_PRED)
5809 return offset_9bit_signed_scaled_p (mode, offset);
5810
5811 if (load_store_pair_p)
5812 return ((known_eq (GET_MODE_SIZE (mode), 4)
5813 || known_eq (GET_MODE_SIZE (mode), 8))
5814 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5815 else
5816 return (offset_9bit_signed_unscaled_p (mode, offset)
5817 || offset_12bit_unsigned_scaled_p (mode, offset));
5818 }
5819
5820 if (allow_reg_index_p)
5821 {
5822 /* Look for base + (scaled/extended) index register. */
5823 if (aarch64_base_register_rtx_p (op0, strict_p)
5824 && aarch64_classify_index (info, op1, mode, strict_p))
5825 {
5826 info->base = op0;
5827 return true;
5828 }
5829 if (aarch64_base_register_rtx_p (op1, strict_p)
5830 && aarch64_classify_index (info, op0, mode, strict_p))
5831 {
5832 info->base = op1;
5833 return true;
5834 }
5835 }
5836
5837 return false;
5838
5839 case POST_INC:
5840 case POST_DEC:
5841 case PRE_INC:
5842 case PRE_DEC:
5843 info->type = ADDRESS_REG_WB;
5844 info->base = XEXP (x, 0);
5845 info->offset = NULL_RTX;
5846 return aarch64_base_register_rtx_p (info->base, strict_p);
5847
5848 case POST_MODIFY:
5849 case PRE_MODIFY:
5850 info->type = ADDRESS_REG_WB;
5851 info->base = XEXP (x, 0);
5852 if (GET_CODE (XEXP (x, 1)) == PLUS
5853 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5854 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5855 && aarch64_base_register_rtx_p (info->base, strict_p))
5856 {
5857 info->offset = XEXP (XEXP (x, 1), 1);
5858 info->const_offset = offset;
5859
5860 /* TImode and TFmode values are allowed in both pairs of X
5861 registers and individual Q registers. The available
5862 address modes are:
5863 X,X: 7-bit signed scaled offset
5864 Q: 9-bit signed offset
5865 	     We conservatively require an offset representable in both forms.
5866 */
5867 if (mode == TImode || mode == TFmode)
5868 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5869 && offset_9bit_signed_unscaled_p (mode, offset));
5870
5871 if (load_store_pair_p)
5872 return ((known_eq (GET_MODE_SIZE (mode), 4)
5873 || known_eq (GET_MODE_SIZE (mode), 8))
5874 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5875 else
5876 return offset_9bit_signed_unscaled_p (mode, offset);
5877 }
5878 return false;
5879
5880 case CONST:
5881 case SYMBOL_REF:
5882 case LABEL_REF:
5883 /* load literal: pc-relative constant pool entry. Only supported
5884 for SI mode or larger. */
5885 info->type = ADDRESS_SYMBOLIC;
5886
5887 if (!load_store_pair_p
5888 && GET_MODE_SIZE (mode).is_constant (&const_size)
5889 && const_size >= 4)
5890 {
5891 rtx sym, addend;
5892
5893 split_const (x, &sym, &addend);
5894 return ((GET_CODE (sym) == LABEL_REF
5895 || (GET_CODE (sym) == SYMBOL_REF
5896 && CONSTANT_POOL_ADDRESS_P (sym)
5897 && aarch64_pcrelative_literal_loads)));
5898 }
5899 return false;
5900
5901 case LO_SUM:
5902 info->type = ADDRESS_LO_SUM;
5903 info->base = XEXP (x, 0);
5904 info->offset = XEXP (x, 1);
5905 if (allow_reg_index_p
5906 && aarch64_base_register_rtx_p (info->base, strict_p))
5907 {
5908 rtx sym, offs;
5909 split_const (info->offset, &sym, &offs);
5910 if (GET_CODE (sym) == SYMBOL_REF
5911 && (aarch64_classify_symbol (sym, INTVAL (offs))
5912 == SYMBOL_SMALL_ABSOLUTE))
5913 {
5914 /* The symbol and offset must be aligned to the access size. */
5915 unsigned int align;
5916
5917 if (CONSTANT_POOL_ADDRESS_P (sym))
5918 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5919 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5920 {
5921 tree exp = SYMBOL_REF_DECL (sym);
5922 align = TYPE_ALIGN (TREE_TYPE (exp));
5923 align = aarch64_constant_alignment (exp, align);
5924 }
5925 else if (SYMBOL_REF_DECL (sym))
5926 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5927 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5928 && SYMBOL_REF_BLOCK (sym) != NULL)
5929 align = SYMBOL_REF_BLOCK (sym)->alignment;
5930 else
5931 align = BITS_PER_UNIT;
5932
5933 poly_int64 ref_size = GET_MODE_SIZE (mode);
5934 if (known_eq (ref_size, 0))
5935 ref_size = GET_MODE_SIZE (DImode);
5936
5937 return (multiple_p (INTVAL (offs), ref_size)
5938 && multiple_p (align / BITS_PER_UNIT, ref_size));
5939 }
5940 }
5941 return false;
5942
5943 default:
5944 return false;
5945 }
5946 }
5947
5948 /* Return true if the address X is valid for a PRFM instruction.
5949 STRICT_P is true if we should do strict checking with
5950 aarch64_classify_address. */
5951
5952 bool
5953 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5954 {
5955 struct aarch64_address_info addr;
5956
5957 /* PRFM accepts the same addresses as DImode... */
5958 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5959 if (!res)
5960 return false;
5961
5962 /* ... except writeback forms. */
5963 return addr.type != ADDRESS_REG_WB;
5964 }
5965
5966 bool
5967 aarch64_symbolic_address_p (rtx x)
5968 {
5969 rtx offset;
5970
5971 split_const (x, &x, &offset);
5972 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5973 }
5974
5975 /* Classify the base of symbolic expression X. */
5976
5977 enum aarch64_symbol_type
5978 aarch64_classify_symbolic_expression (rtx x)
5979 {
5980 rtx offset;
5981
5982 split_const (x, &x, &offset);
5983 return aarch64_classify_symbol (x, INTVAL (offset));
5984 }
5985
5986
5987 /* Return TRUE if X is a legitimate address for accessing memory in
5988 mode MODE. */
5989 static bool
5990 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5991 {
5992 struct aarch64_address_info addr;
5993
5994 return aarch64_classify_address (&addr, x, mode, strict_p);
5995 }
5996
5997 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5998 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5999 bool
6000 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6001 aarch64_addr_query_type type)
6002 {
6003 struct aarch64_address_info addr;
6004
6005 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6006 }
6007
6008 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6009
6010 static bool
6011 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6012 poly_int64 orig_offset,
6013 machine_mode mode)
6014 {
6015 HOST_WIDE_INT size;
6016 if (GET_MODE_SIZE (mode).is_constant (&size))
6017 {
6018 HOST_WIDE_INT const_offset, second_offset;
6019
6020 /* A general SVE offset is A * VQ + B. Remove the A component from
6021 coefficient 0 in order to get the constant B. */
6022 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6023
6024 /* Split an out-of-range address displacement into a base and
6025 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6026 range otherwise to increase opportunities for sharing the base
6027 address of different sizes. Unaligned accesses use the signed
6028 9-bit range, TImode/TFmode use the intersection of signed
6029 scaled 7-bit and signed 9-bit offset. */
6030 if (mode == TImode || mode == TFmode)
6031 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6032 else if ((const_offset & (size - 1)) != 0)
6033 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6034 else
6035 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6036
6037 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6038 return false;
6039
6040 /* Split the offset into second_offset and the rest. */
6041 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6042 *offset2 = gen_int_mode (second_offset, Pmode);
6043 return true;
6044 }
6045 else
6046 {
6047 /* Get the mode we should use as the basis of the range. For structure
6048 modes this is the mode of one vector. */
6049 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6050 machine_mode step_mode
6051 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6052
6053 /* Get the "mul vl" multiplier we'd like to use. */
6054 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6055 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6056 if (vec_flags & VEC_SVE_DATA)
6057 /* LDR supports a 9-bit range, but the move patterns for
6058 structure modes require all vectors to be in range of the
6059 	   same base.  The simplest way of accommodating that while still
6060 promoting reuse of anchor points between different modes is
6061 to use an 8-bit range unconditionally. */
6062 vnum = ((vnum + 128) & 255) - 128;
6063 else
6064 /* Predicates are only handled singly, so we might as well use
6065 the full range. */
6066 vnum = ((vnum + 256) & 511) - 256;
6067 if (vnum == 0)
6068 return false;
6069
6070 /* Convert the "mul vl" multiplier into a byte offset. */
6071 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6072 if (known_eq (second_offset, orig_offset))
6073 return false;
6074
6075 /* Split the offset into second_offset and the rest. */
6076 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6077 *offset2 = gen_int_mode (second_offset, Pmode);
6078 return true;
6079 }
6080 }
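
/* Worked example for the constant-size path above: for a 4-byte (SImode)
   access at constant offset 0x12340 from some base,

     second_offset = 0x12340 & 0x3ffc = 0x2340
     *offset1      = 0x10000
     *offset2      = 0x2340

   so the access is rebased as (base + 0x10000) + 0x2340, and the anchor
   base + 0x10000 can then be shared with neighbouring accesses of other
   sizes.  */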
6081
6082 /* Return the binary representation of floating point constant VALUE in INTVAL.
6083 If the value cannot be converted, return false without setting INTVAL.
6084    The conversion is done in the mode of VALUE. */
6085 bool
6086 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6087 {
6088
6089 /* We make a general exception for 0. */
6090 if (aarch64_float_const_zero_rtx_p (value))
6091 {
6092 *intval = 0;
6093 return true;
6094 }
6095
6096 scalar_float_mode mode;
6097 if (GET_CODE (value) != CONST_DOUBLE
6098 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6099 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6100 /* Only support up to DF mode. */
6101 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6102 return false;
6103
6104 unsigned HOST_WIDE_INT ival = 0;
6105
6106 long res[2];
6107 real_to_target (res,
6108 CONST_DOUBLE_REAL_VALUE (value),
6109 REAL_MODE_FORMAT (mode));
6110
6111 if (mode == DFmode)
6112 {
6113 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6114 ival = zext_hwi (res[order], 32);
6115 ival |= (zext_hwi (res[1 - order], 32) << 32);
6116 }
6117 else
6118 ival = zext_hwi (res[0], 32);
6119
6120 *intval = ival;
6121 return true;
6122 }
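
/* For example, the DFmode constant 1.0 yields the IEEE double bit pattern
   0x3ff0000000000000 and the SFmode constant -2.0f yields 0xc0000000,
   zero-extended into the 64-bit *INTVAL.  */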
6123
6124 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6125 single MOV(+MOVK) followed by an FMOV. */
6126 bool
6127 aarch64_float_const_rtx_p (rtx x)
6128 {
6129 machine_mode mode = GET_MODE (x);
6130 if (mode == VOIDmode)
6131 return false;
6132
6133 /* Determine whether it's cheaper to write float constants as
6134      mov/movk pairs rather than ldr/adrp pairs. */
6135 unsigned HOST_WIDE_INT ival;
6136
6137 if (GET_CODE (x) == CONST_DOUBLE
6138 && SCALAR_FLOAT_MODE_P (mode)
6139 && aarch64_reinterpret_float_as_int (x, &ival))
6140 {
6141 scalar_int_mode imode = (mode == HFmode
6142 ? SImode
6143 : int_mode_for_mode (mode).require ());
6144 int num_instr = aarch64_internal_mov_immediate
6145 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6146 return num_instr < 3;
6147 }
6148
6149 return false;
6150 }
6151
6152 /* Return TRUE if rtx X is the immediate constant 0.0. */
6153 bool
6154 aarch64_float_const_zero_rtx_p (rtx x)
6155 {
6156 if (GET_MODE (x) == VOIDmode)
6157 return false;
6158
6159 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6160 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6161 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6162 }
6163
6164 /* Return TRUE if rtx X is an immediate constant that fits in a single
6165 MOVI immediate operation. */
6166 bool
6167 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6168 {
6169 if (!TARGET_SIMD)
6170 return false;
6171
6172 machine_mode vmode;
6173 scalar_int_mode imode;
6174 unsigned HOST_WIDE_INT ival;
6175
6176 if (GET_CODE (x) == CONST_DOUBLE
6177 && SCALAR_FLOAT_MODE_P (mode))
6178 {
6179 if (!aarch64_reinterpret_float_as_int (x, &ival))
6180 return false;
6181
6182 /* We make a general exception for 0. */
6183 if (aarch64_float_const_zero_rtx_p (x))
6184 return true;
6185
6186 imode = int_mode_for_mode (mode).require ();
6187 }
6188 else if (GET_CODE (x) == CONST_INT
6189 && is_a <scalar_int_mode> (mode, &imode))
6190 ival = INTVAL (x);
6191 else
6192 return false;
6193
6194   /* Use a 64-bit vector mode for everything except for DI/DF mode, where we use
6195      a 128-bit vector mode. */
6196 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6197
6198 vmode = aarch64_simd_container_mode (imode, width);
6199 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6200
6201 return aarch64_simd_valid_immediate (v_op, NULL);
6202 }
6203
6204
6205 /* Return the fixed registers used for condition codes. */
6206
6207 static bool
6208 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6209 {
6210 *p1 = CC_REGNUM;
6211 *p2 = INVALID_REGNUM;
6212 return true;
6213 }
6214
6215 /* This function is used by the call expanders of the machine description.
6216 RESULT is the register in which the result is returned. It's NULL for
6217 "call" and "sibcall".
6218 MEM is the location of the function call.
6219    SIBCALL indicates whether this function call is a normal call or a sibling call;
6220    a different pattern is generated accordingly. */
6221
6222 void
6223 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6224 {
6225 rtx call, callee, tmp;
6226 rtvec vec;
6227 machine_mode mode;
6228
6229 gcc_assert (MEM_P (mem));
6230 callee = XEXP (mem, 0);
6231 mode = GET_MODE (callee);
6232 gcc_assert (mode == Pmode);
6233
6234 /* Decide if we should generate indirect calls by loading the
6235 address of the callee into a register before performing
6236 the branch-and-link. */
6237 if (SYMBOL_REF_P (callee)
6238 ? (aarch64_is_long_call_p (callee)
6239 || aarch64_is_noplt_call_p (callee))
6240 : !REG_P (callee))
6241 XEXP (mem, 0) = force_reg (mode, callee);
6242
6243 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6244
6245 if (result != NULL_RTX)
6246 call = gen_rtx_SET (result, call);
6247
6248 if (sibcall)
6249 tmp = ret_rtx;
6250 else
6251 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6252
6253 vec = gen_rtvec (2, call, tmp);
6254 call = gen_rtx_PARALLEL (VOIDmode, vec);
6255
6256 aarch64_emit_call_insn (call);
6257 }
6258
6259 /* Emit call insn with PAT and do aarch64-specific handling. */
6260
6261 void
6262 aarch64_emit_call_insn (rtx pat)
6263 {
6264 rtx insn = emit_call_insn (pat);
6265
6266 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6267 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6268 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6269 }
6270
6271 machine_mode
6272 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6273 {
6274 /* All floating point compares return CCFP if it is an equality
6275 comparison, and CCFPE otherwise. */
6276 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6277 {
6278 switch (code)
6279 {
6280 case EQ:
6281 case NE:
6282 case UNORDERED:
6283 case ORDERED:
6284 case UNLT:
6285 case UNLE:
6286 case UNGT:
6287 case UNGE:
6288 case UNEQ:
6289 return CCFPmode;
6290
6291 case LT:
6292 case LE:
6293 case GT:
6294 case GE:
6295 case LTGT:
6296 return CCFPEmode;
6297
6298 default:
6299 gcc_unreachable ();
6300 }
6301 }
6302
6303 /* Equality comparisons of short modes against zero can be performed
6304 using the TST instruction with the appropriate bitmask. */
6305 if (y == const0_rtx && REG_P (x)
6306 && (code == EQ || code == NE)
6307 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6308 return CC_NZmode;
6309
6310 /* Similarly, comparisons of zero_extends from shorter modes can
6311 be performed using an ANDS with an immediate mask. */
6312 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6313 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6314 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6315 && (code == EQ || code == NE))
6316 return CC_NZmode;
6317
6318 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6319 && y == const0_rtx
6320 && (code == EQ || code == NE || code == LT || code == GE)
6321 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6322 || GET_CODE (x) == NEG
6323 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6324 && CONST_INT_P (XEXP (x, 2)))))
6325 return CC_NZmode;
6326
6327 /* A compare with a shifted operand. Because of canonicalization,
6328 the comparison will have to be swapped when we emit the assembly
6329 code. */
6330 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6331 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6332 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6333 || GET_CODE (x) == LSHIFTRT
6334 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6335 return CC_SWPmode;
6336
6337 /* Similarly for a negated operand, but we can only do this for
6338 equalities. */
6339 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6340 && (REG_P (y) || GET_CODE (y) == SUBREG)
6341 && (code == EQ || code == NE)
6342 && GET_CODE (x) == NEG)
6343 return CC_Zmode;
6344
6345 /* A test for unsigned overflow. */
6346 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6347 && code == NE
6348 && GET_CODE (x) == PLUS
6349 && GET_CODE (y) == ZERO_EXTEND)
6350 return CC_Cmode;
6351
6352 /* For everything else, return CCmode. */
6353 return CCmode;
6354 }
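
/* Some illustrative selections made by the function above:

     (compare (reg:SI) (reg:SI)), code GT             -> CCmode
     (compare (plus:DI a b) (const_int 0)), code EQ   -> CC_NZmode
     (compare (ashift:DI a n) (reg:DI)), code LT      -> CC_SWPmode (operands
							  swapped when printed)
     (compare (reg:DF) (reg:DF)), code LT             -> CCFPEmode
     (compare (reg:DF) (reg:DF)), code UNLT           -> CCFPmode  */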
6355
6356 static int
6357 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6358
6359 int
6360 aarch64_get_condition_code (rtx x)
6361 {
6362 machine_mode mode = GET_MODE (XEXP (x, 0));
6363 enum rtx_code comp_code = GET_CODE (x);
6364
6365 if (GET_MODE_CLASS (mode) != MODE_CC)
6366 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6367 return aarch64_get_condition_code_1 (mode, comp_code);
6368 }
6369
6370 static int
6371 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6372 {
6373 switch (mode)
6374 {
6375 case E_CCFPmode:
6376 case E_CCFPEmode:
6377 switch (comp_code)
6378 {
6379 case GE: return AARCH64_GE;
6380 case GT: return AARCH64_GT;
6381 case LE: return AARCH64_LS;
6382 case LT: return AARCH64_MI;
6383 case NE: return AARCH64_NE;
6384 case EQ: return AARCH64_EQ;
6385 case ORDERED: return AARCH64_VC;
6386 case UNORDERED: return AARCH64_VS;
6387 case UNLT: return AARCH64_LT;
6388 case UNLE: return AARCH64_LE;
6389 case UNGT: return AARCH64_HI;
6390 case UNGE: return AARCH64_PL;
6391 default: return -1;
6392 }
6393 break;
6394
6395 case E_CCmode:
6396 switch (comp_code)
6397 {
6398 case NE: return AARCH64_NE;
6399 case EQ: return AARCH64_EQ;
6400 case GE: return AARCH64_GE;
6401 case GT: return AARCH64_GT;
6402 case LE: return AARCH64_LE;
6403 case LT: return AARCH64_LT;
6404 case GEU: return AARCH64_CS;
6405 case GTU: return AARCH64_HI;
6406 case LEU: return AARCH64_LS;
6407 case LTU: return AARCH64_CC;
6408 default: return -1;
6409 }
6410 break;
6411
6412 case E_CC_SWPmode:
6413 switch (comp_code)
6414 {
6415 case NE: return AARCH64_NE;
6416 case EQ: return AARCH64_EQ;
6417 case GE: return AARCH64_LE;
6418 case GT: return AARCH64_LT;
6419 case LE: return AARCH64_GE;
6420 case LT: return AARCH64_GT;
6421 case GEU: return AARCH64_LS;
6422 case GTU: return AARCH64_CC;
6423 case LEU: return AARCH64_CS;
6424 case LTU: return AARCH64_HI;
6425 default: return -1;
6426 }
6427 break;
6428
6429 case E_CC_NZmode:
6430 switch (comp_code)
6431 {
6432 case NE: return AARCH64_NE;
6433 case EQ: return AARCH64_EQ;
6434 case GE: return AARCH64_PL;
6435 case LT: return AARCH64_MI;
6436 default: return -1;
6437 }
6438 break;
6439
6440 case E_CC_Zmode:
6441 switch (comp_code)
6442 {
6443 case NE: return AARCH64_NE;
6444 case EQ: return AARCH64_EQ;
6445 default: return -1;
6446 }
6447 break;
6448
6449 case E_CC_Cmode:
6450 switch (comp_code)
6451 {
6452 case NE: return AARCH64_CS;
6453 case EQ: return AARCH64_CC;
6454 default: return -1;
6455 }
6456 break;
6457
6458 default:
6459 return -1;
6460 }
6461
6462 return -1;
6463 }
6464
6465 bool
6466 aarch64_const_vec_all_same_in_range_p (rtx x,
6467 HOST_WIDE_INT minval,
6468 HOST_WIDE_INT maxval)
6469 {
6470 rtx elt;
6471 return (const_vec_duplicate_p (x, &elt)
6472 && CONST_INT_P (elt)
6473 && IN_RANGE (INTVAL (elt), minval, maxval));
6474 }
6475
6476 bool
6477 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6478 {
6479 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6480 }
6481
6482 /* Return true if VEC is a constant in which every element is in the range
6483 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6484
6485 static bool
6486 aarch64_const_vec_all_in_range_p (rtx vec,
6487 HOST_WIDE_INT minval,
6488 HOST_WIDE_INT maxval)
6489 {
6490 if (GET_CODE (vec) != CONST_VECTOR
6491 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6492 return false;
6493
6494 int nunits;
6495 if (!CONST_VECTOR_STEPPED_P (vec))
6496 nunits = const_vector_encoded_nelts (vec);
6497 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6498 return false;
6499
6500 for (int i = 0; i < nunits; i++)
6501 {
6502 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6503 if (!CONST_INT_P (vec_elem)
6504 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6505 return false;
6506 }
6507 return true;
6508 }
6509
6510 /* N Z C V. */
6511 #define AARCH64_CC_V 1
6512 #define AARCH64_CC_C (1 << 1)
6513 #define AARCH64_CC_Z (1 << 2)
6514 #define AARCH64_CC_N (1 << 3)
6515
6516 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6517 static const int aarch64_nzcv_codes[] =
6518 {
6519 0, /* EQ, Z == 1. */
6520 AARCH64_CC_Z, /* NE, Z == 0. */
6521 0, /* CS, C == 1. */
6522 AARCH64_CC_C, /* CC, C == 0. */
6523 0, /* MI, N == 1. */
6524 AARCH64_CC_N, /* PL, N == 0. */
6525 0, /* VS, V == 1. */
6526 AARCH64_CC_V, /* VC, V == 0. */
6527   0,		/* HI, C == 1 && Z == 0.  */
6528 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6529 AARCH64_CC_V, /* GE, N == V. */
6530 0, /* LT, N != V. */
6531 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6532 0, /* LE, !(Z == 0 && N == V). */
6533 0, /* AL, Any. */
6534 0 /* NV, Any. */
6535 };
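
/* For instance, the entry for GE is AARCH64_CC_V (V set, N clear), a flag
   state in which N != V and so GE is false; likewise the EQ entry is 0
   (Z clear), in which EQ is false.  Each entry therefore appears to give an
   NZCV immediate that makes the indexed condition fail, which is what a
   CCMP needs when the earlier comparison in the chain already decides the
   result.  */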
6536
6537 /* Print floating-point vector immediate operand X to F, negating it
6538 first if NEGATE is true. Return true on success, false if it isn't
6539 a constant we can handle. */
6540
6541 static bool
6542 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6543 {
6544 rtx elt;
6545
6546 if (!const_vec_duplicate_p (x, &elt))
6547 return false;
6548
6549 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6550 if (negate)
6551 r = real_value_negate (&r);
6552
6553 /* We only handle the SVE single-bit immediates here. */
6554 if (real_equal (&r, &dconst0))
6555 asm_fprintf (f, "0.0");
6556 else if (real_equal (&r, &dconst1))
6557 asm_fprintf (f, "1.0");
6558 else if (real_equal (&r, &dconsthalf))
6559 asm_fprintf (f, "0.5");
6560 else
6561 return false;
6562
6563 return true;
6564 }
6565
6566 /* Return the register suffix letter for an element of SIZE bits. */
6567 static char
6568 sizetochar (int size)
6569 {
6570 switch (size)
6571 {
6572 case 64: return 'd';
6573 case 32: return 's';
6574 case 16: return 'h';
6575 case 8 : return 'b';
6576 default: gcc_unreachable ();
6577 }
6578 }
6579
6580 /* Print operand X to file F in a target specific manner according to CODE.
6581 The acceptable formatting commands given by CODE are:
6582 'c': An integer or symbol address without a preceding #
6583 sign.
6584 'C': Take the duplicated element in a vector constant
6585 and print it in hex.
6586 'D': Take the duplicated element in a vector constant
6587 and print it as an unsigned integer, in decimal.
6588 'e': Print the sign/zero-extend size as a character 8->b,
6589 16->h, 32->w.
6590 'p': Prints N such that 2^N == X (X must be power of 2 and
6591 const int).
6592 'P': Print the number of non-zero bits in X (a const_int).
6593 'H': Print the higher numbered register of a pair (TImode)
6594 of regs.
6595 'm': Print a condition (eq, ne, etc).
6596 'M': Same as 'm', but invert condition.
6597 'N': Take the duplicated element in a vector constant
6598 and print the negative of it in decimal.
6599 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6600 'S/T/U/V': Print a FP/SIMD register name for a register list.
6601 The register printed is the FP/SIMD register name
6602 of X + 0/1/2/3 for S/T/U/V.
6603 'R': Print a scalar FP/SIMD register name + 1.
6604 'X': Print bottom 16 bits of integer constant in hex.
6605 'w/x': Print a general register name or the zero register
6606 (32-bit or 64-bit).
6607      '0':		Print a normal operand; if it's a general register,
6608 then we assume DImode.
6609 'k': Print NZCV for conditional compare instructions.
6610 'A': Output address constant representing the first
6611 argument of X, specifying a relocation offset
6612 if appropriate.
6613 'L': Output constant address specified by X
6614 with a relocation offset if appropriate.
6615 'G': Prints address of X, specifying a PC relative
6616 relocation mode if appropriate.
6617 'y': Output address of LDP or STP - this is used for
6618 some LDP/STPs which don't use a PARALLEL in their
6619 pattern (so the mode needs to be adjusted).
6620 'z': Output address of a typical LDP or STP. */
6621
6622 static void
6623 aarch64_print_operand (FILE *f, rtx x, int code)
6624 {
6625 rtx elt;
6626 switch (code)
6627 {
6628 case 'c':
6629 switch (GET_CODE (x))
6630 {
6631 case CONST_INT:
6632 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6633 break;
6634
6635 case SYMBOL_REF:
6636 output_addr_const (f, x);
6637 break;
6638
6639 case CONST:
6640 if (GET_CODE (XEXP (x, 0)) == PLUS
6641 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6642 {
6643 output_addr_const (f, x);
6644 break;
6645 }
6646 /* Fall through. */
6647
6648 default:
6649 output_operand_lossage ("unsupported operand for code '%c'", code);
6650 }
6651 break;
6652
6653 case 'e':
6654 {
6655 int n;
6656
6657 if (!CONST_INT_P (x)
6658 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6659 {
6660 output_operand_lossage ("invalid operand for '%%%c'", code);
6661 return;
6662 }
6663
6664 switch (n)
6665 {
6666 case 3:
6667 fputc ('b', f);
6668 break;
6669 case 4:
6670 fputc ('h', f);
6671 break;
6672 case 5:
6673 fputc ('w', f);
6674 break;
6675 default:
6676 output_operand_lossage ("invalid operand for '%%%c'", code);
6677 return;
6678 }
6679 }
6680 break;
6681
6682 case 'p':
6683 {
6684 int n;
6685
6686 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6687 {
6688 output_operand_lossage ("invalid operand for '%%%c'", code);
6689 return;
6690 }
6691
6692 asm_fprintf (f, "%d", n);
6693 }
6694 break;
6695
6696 case 'P':
6697 if (!CONST_INT_P (x))
6698 {
6699 output_operand_lossage ("invalid operand for '%%%c'", code);
6700 return;
6701 }
6702
6703 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6704 break;
6705
6706 case 'H':
6707 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6708 {
6709 output_operand_lossage ("invalid operand for '%%%c'", code);
6710 return;
6711 }
6712
6713 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6714 break;
6715
6716 case 'M':
6717 case 'm':
6718 {
6719 int cond_code;
6720 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6721 if (x == const_true_rtx)
6722 {
6723 if (code == 'M')
6724 fputs ("nv", f);
6725 return;
6726 }
6727
6728 if (!COMPARISON_P (x))
6729 {
6730 output_operand_lossage ("invalid operand for '%%%c'", code);
6731 return;
6732 }
6733
6734 cond_code = aarch64_get_condition_code (x);
6735 gcc_assert (cond_code >= 0);
6736 if (code == 'M')
6737 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6738 fputs (aarch64_condition_codes[cond_code], f);
6739 }
6740 break;
6741
6742 case 'N':
6743 if (!const_vec_duplicate_p (x, &elt))
6744 {
6745 output_operand_lossage ("invalid vector constant");
6746 return;
6747 }
6748
6749 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6750 asm_fprintf (f, "%wd", -INTVAL (elt));
6751 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6752 && aarch64_print_vector_float_operand (f, x, true))
6753 ;
6754 else
6755 {
6756 output_operand_lossage ("invalid vector constant");
6757 return;
6758 }
6759 break;
6760
6761 case 'b':
6762 case 'h':
6763 case 's':
6764 case 'd':
6765 case 'q':
6766 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6767 {
6768 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6769 return;
6770 }
6771 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6772 break;
6773
6774 case 'S':
6775 case 'T':
6776 case 'U':
6777 case 'V':
6778 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6779 {
6780 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6781 return;
6782 }
6783 asm_fprintf (f, "%c%d",
6784 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6785 REGNO (x) - V0_REGNUM + (code - 'S'));
6786 break;
6787
6788 case 'R':
6789 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6790 {
6791 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6792 return;
6793 }
6794 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6795 break;
6796
6797 case 'X':
6798 if (!CONST_INT_P (x))
6799 {
6800 output_operand_lossage ("invalid operand for '%%%c'", code);
6801 return;
6802 }
6803 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6804 break;
6805
6806 case 'C':
6807 {
6808 /* Print a replicated constant in hex. */
6809 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6810 {
6811 output_operand_lossage ("invalid operand for '%%%c'", code);
6812 return;
6813 }
6814 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6815 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6816 }
6817 break;
6818
6819 case 'D':
6820 {
6821 /* Print a replicated constant in decimal, treating it as
6822 unsigned. */
6823 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6824 {
6825 output_operand_lossage ("invalid operand for '%%%c'", code);
6826 return;
6827 }
6828 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6829 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6830 }
6831 break;
6832
6833 case 'w':
6834 case 'x':
6835 if (x == const0_rtx
6836 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6837 {
6838 asm_fprintf (f, "%czr", code);
6839 break;
6840 }
6841
6842 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6843 {
6844 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6845 break;
6846 }
6847
6848 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6849 {
6850 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6851 break;
6852 }
6853
6854 /* Fall through */
6855
6856 case 0:
6857 if (x == NULL)
6858 {
6859 output_operand_lossage ("missing operand");
6860 return;
6861 }
6862
6863 switch (GET_CODE (x))
6864 {
6865 case REG:
6866 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6867 {
6868 if (REG_NREGS (x) == 1)
6869 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6870 else
6871 {
6872 char suffix
6873 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6874 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6875 REGNO (x) - V0_REGNUM, suffix,
6876 END_REGNO (x) - V0_REGNUM - 1, suffix);
6877 }
6878 }
6879 else
6880 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6881 break;
6882
6883 case MEM:
6884 output_address (GET_MODE (x), XEXP (x, 0));
6885 break;
6886
6887 case LABEL_REF:
6888 case SYMBOL_REF:
6889 output_addr_const (asm_out_file, x);
6890 break;
6891
6892 case CONST_INT:
6893 asm_fprintf (f, "%wd", INTVAL (x));
6894 break;
6895
6896 case CONST:
6897 if (!VECTOR_MODE_P (GET_MODE (x)))
6898 {
6899 output_addr_const (asm_out_file, x);
6900 break;
6901 }
6902 /* fall through */
6903
6904 case CONST_VECTOR:
6905 if (!const_vec_duplicate_p (x, &elt))
6906 {
6907 output_operand_lossage ("invalid vector constant");
6908 return;
6909 }
6910
6911 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6912 asm_fprintf (f, "%wd", INTVAL (elt));
6913 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6914 && aarch64_print_vector_float_operand (f, x, false))
6915 ;
6916 else
6917 {
6918 output_operand_lossage ("invalid vector constant");
6919 return;
6920 }
6921 break;
6922
6923 case CONST_DOUBLE:
6924 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6925 be getting CONST_DOUBLEs holding integers. */
6926 gcc_assert (GET_MODE (x) != VOIDmode);
6927 if (aarch64_float_const_zero_rtx_p (x))
6928 {
6929 fputc ('0', f);
6930 break;
6931 }
6932 else if (aarch64_float_const_representable_p (x))
6933 {
6934 #define buf_size 20
6935 char float_buf[buf_size] = {'\0'};
6936 real_to_decimal_for_mode (float_buf,
6937 CONST_DOUBLE_REAL_VALUE (x),
6938 buf_size, buf_size,
6939 1, GET_MODE (x));
6940 asm_fprintf (asm_out_file, "%s", float_buf);
6941 break;
6942 #undef buf_size
6943 }
6944 output_operand_lossage ("invalid constant");
6945 return;
6946 default:
6947 output_operand_lossage ("invalid operand");
6948 return;
6949 }
6950 break;
6951
6952 case 'A':
6953 if (GET_CODE (x) == HIGH)
6954 x = XEXP (x, 0);
6955
6956 switch (aarch64_classify_symbolic_expression (x))
6957 {
6958 case SYMBOL_SMALL_GOT_4G:
6959 asm_fprintf (asm_out_file, ":got:");
6960 break;
6961
6962 case SYMBOL_SMALL_TLSGD:
6963 asm_fprintf (asm_out_file, ":tlsgd:");
6964 break;
6965
6966 case SYMBOL_SMALL_TLSDESC:
6967 asm_fprintf (asm_out_file, ":tlsdesc:");
6968 break;
6969
6970 case SYMBOL_SMALL_TLSIE:
6971 asm_fprintf (asm_out_file, ":gottprel:");
6972 break;
6973
6974 case SYMBOL_TLSLE24:
6975 asm_fprintf (asm_out_file, ":tprel:");
6976 break;
6977
6978 case SYMBOL_TINY_GOT:
6979 gcc_unreachable ();
6980 break;
6981
6982 default:
6983 break;
6984 }
6985 output_addr_const (asm_out_file, x);
6986 break;
6987
6988 case 'L':
6989 switch (aarch64_classify_symbolic_expression (x))
6990 {
6991 case SYMBOL_SMALL_GOT_4G:
6992 asm_fprintf (asm_out_file, ":lo12:");
6993 break;
6994
6995 case SYMBOL_SMALL_TLSGD:
6996 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6997 break;
6998
6999 case SYMBOL_SMALL_TLSDESC:
7000 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7001 break;
7002
7003 case SYMBOL_SMALL_TLSIE:
7004 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7005 break;
7006
7007 case SYMBOL_TLSLE12:
7008 asm_fprintf (asm_out_file, ":tprel_lo12:");
7009 break;
7010
7011 case SYMBOL_TLSLE24:
7012 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7013 break;
7014
7015 case SYMBOL_TINY_GOT:
7016 asm_fprintf (asm_out_file, ":got:");
7017 break;
7018
7019 case SYMBOL_TINY_TLSIE:
7020 asm_fprintf (asm_out_file, ":gottprel:");
7021 break;
7022
7023 default:
7024 break;
7025 }
7026 output_addr_const (asm_out_file, x);
7027 break;
7028
7029 case 'G':
7030 switch (aarch64_classify_symbolic_expression (x))
7031 {
7032 case SYMBOL_TLSLE24:
7033 asm_fprintf (asm_out_file, ":tprel_hi12:");
7034 break;
7035 default:
7036 break;
7037 }
7038 output_addr_const (asm_out_file, x);
7039 break;
7040
7041 case 'k':
7042 {
7043 HOST_WIDE_INT cond_code;
7044
7045 if (!CONST_INT_P (x))
7046 {
7047 output_operand_lossage ("invalid operand for '%%%c'", code);
7048 return;
7049 }
7050
7051 cond_code = INTVAL (x);
7052 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7053 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7054 }
7055 break;
7056
7057 case 'y':
7058 case 'z':
7059 {
7060 machine_mode mode = GET_MODE (x);
7061
7062 if (GET_CODE (x) != MEM
7063 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7064 {
7065 output_operand_lossage ("invalid operand for '%%%c'", code);
7066 return;
7067 }
7068
7069 if (code == 'y')
7070 /* LDP/STP which uses a single double-width memory operand.
7071 Adjust the mode to appear like a typical LDP/STP.
7072 Currently this is supported for 16-byte accesses only. */
7073 mode = DFmode;
7074
7075 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7076 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7077 }
7078 break;
7079
7080 default:
7081 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7082 return;
7083 }
7084 }
7085
7086 /* Print address 'x' of a memory access with mode 'mode'.
7087 'type' is the context required by aarch64_classify_address: ADDR_QUERY_ANY
7088 for a normal memory access or ADDR_QUERY_LDP_STP for an LDP/STP operand. */
7089 static bool
7090 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7091 aarch64_addr_query_type type)
7092 {
7093 struct aarch64_address_info addr;
7094 unsigned int size;
7095
7096 /* Check all addresses are Pmode - including ILP32. */
7097 if (GET_MODE (x) != Pmode
7098 && (!CONST_INT_P (x)
7099 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7100 {
7101 output_operand_lossage ("invalid address mode");
7102 return false;
7103 }
7104
7105 if (aarch64_classify_address (&addr, x, mode, true, type))
7106 switch (addr.type)
7107 {
7108 case ADDRESS_REG_IMM:
7109 if (known_eq (addr.const_offset, 0))
7110 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7111 else if (aarch64_sve_data_mode_p (mode))
7112 {
7113 HOST_WIDE_INT vnum
7114 = exact_div (addr.const_offset,
7115 BYTES_PER_SVE_VECTOR).to_constant ();
7116 asm_fprintf (f, "[%s, #%wd, mul vl]",
7117 reg_names[REGNO (addr.base)], vnum);
7118 }
7119 else if (aarch64_sve_pred_mode_p (mode))
7120 {
7121 HOST_WIDE_INT vnum
7122 = exact_div (addr.const_offset,
7123 BYTES_PER_SVE_PRED).to_constant ();
7124 asm_fprintf (f, "[%s, #%wd, mul vl]",
7125 reg_names[REGNO (addr.base)], vnum);
7126 }
7127 else
7128 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7129 INTVAL (addr.offset));
7130 return true;
7131
7132 case ADDRESS_REG_REG:
7133 if (addr.shift == 0)
7134 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7135 reg_names [REGNO (addr.offset)]);
7136 else
7137 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7138 reg_names [REGNO (addr.offset)], addr.shift);
7139 return true;
7140
7141 case ADDRESS_REG_UXTW:
7142 if (addr.shift == 0)
7143 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7144 REGNO (addr.offset) - R0_REGNUM);
7145 else
7146 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7147 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7148 return true;
7149
7150 case ADDRESS_REG_SXTW:
7151 if (addr.shift == 0)
7152 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7153 REGNO (addr.offset) - R0_REGNUM);
7154 else
7155 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7156 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7157 return true;
7158
7159 case ADDRESS_REG_WB:
7160 /* Writeback is only supported for fixed-width modes. */
7161 size = GET_MODE_SIZE (mode).to_constant ();
7162 switch (GET_CODE (x))
7163 {
7164 case PRE_INC:
7165 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7166 return true;
7167 case POST_INC:
7168 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7169 return true;
7170 case PRE_DEC:
7171 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7172 return true;
7173 case POST_DEC:
7174 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7175 return true;
7176 case PRE_MODIFY:
7177 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7178 INTVAL (addr.offset));
7179 return true;
7180 case POST_MODIFY:
7181 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7182 INTVAL (addr.offset));
7183 return true;
7184 default:
7185 break;
7186 }
7187 break;
7188
7189 case ADDRESS_LO_SUM:
7190 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7191 output_addr_const (f, addr.offset);
7192 asm_fprintf (f, "]");
7193 return true;
7194
7195 case ADDRESS_SYMBOLIC:
7196 output_addr_const (f, x);
7197 return true;
7198 }
7199
7200 return false;
7201 }
7202
7203 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7204 static bool
7205 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7206 {
7207 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7208 }
7209
7210 /* Print address 'x' of a memory access with mode 'mode'. */
7211 static void
7212 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7213 {
7214 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7215 output_addr_const (f, x);
7216 }
7217
7218 bool
7219 aarch64_label_mentioned_p (rtx x)
7220 {
7221 const char *fmt;
7222 int i;
7223
7224 if (GET_CODE (x) == LABEL_REF)
7225 return true;
7226
7227 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7228 referencing instruction, but they are constant offsets, not
7229 symbols. */
7230 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7231 return false;
7232
7233 fmt = GET_RTX_FORMAT (GET_CODE (x));
7234 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7235 {
7236 if (fmt[i] == 'E')
7237 {
7238 int j;
7239
7240 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7241 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7242 return 1;
7243 }
7244 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7245 return 1;
7246 }
7247
7248 return 0;
7249 }
7250
7251 /* Implement REGNO_REG_CLASS. */
7252
7253 enum reg_class
7254 aarch64_regno_regclass (unsigned regno)
7255 {
7256 if (GP_REGNUM_P (regno))
7257 return GENERAL_REGS;
7258
7259 if (regno == SP_REGNUM)
7260 return STACK_REG;
7261
7262 if (regno == FRAME_POINTER_REGNUM
7263 || regno == ARG_POINTER_REGNUM)
7264 return POINTER_REGS;
7265
7266 if (FP_REGNUM_P (regno))
7267 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7268
7269 if (PR_REGNUM_P (regno))
7270 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7271
7272 return NO_REGS;
7273 }
7274
7275 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7276 If OFFSET is out of range, return an offset of an anchor point
7277 that is in range. Return 0 otherwise. */
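/* A worked example of the anchoring below: an aligned DImode offset of
   0x10008 is anchored at 0x10008 & ~0x7fff == 0x10000, leaving a residual
   offset of 8 that fits the scaled unsigned 12-bit form, while an unaligned
   SImode offset of 0x101 is anchored at (0x101 + 0x100) & ~0x1ff == 0x200,
   leaving a residual of -255 within the signed 9-bit unscaled range.  */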
7278
7279 static HOST_WIDE_INT
7280 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7281 machine_mode mode)
7282 {
7283 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7284 if (size > 16)
7285 return (offset + 0x400) & ~0x7f0;
7286
7287 /* For offsets that aren't a multiple of the access size, the limit is
7288 -256...255. */
7289 if (offset & (size - 1))
7290 {
7291 /* BLKmode typically uses LDP of X-registers. */
7292 if (mode == BLKmode)
7293 return (offset + 512) & ~0x3ff;
7294 return (offset + 0x100) & ~0x1ff;
7295 }
7296
7297 /* Small negative offsets are supported. */
7298 if (IN_RANGE (offset, -256, 0))
7299 return 0;
7300
7301 if (mode == TImode || mode == TFmode)
7302 return (offset + 0x100) & ~0x1ff;
7303
7304 /* Use 12-bit offset by access size. */
7305 return offset & (~0xfff * size);
7306 }
7307
7308 static rtx
7309 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7310 {
7311 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7312 where mask is selected by alignment and size of the offset.
7313 We try to pick as large a range for the offset as possible to
7314 maximize the chance of a CSE. However, for aligned addresses
7315 we limit the range to 4k so that structures with different sized
7316 elements are likely to use the same base. We need to be careful
7317 not to split a CONST for some forms of address expression, otherwise
7318 it will generate sub-optimal code. */
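  /* As a rough illustration: with SImode and X == (plus (reg R)
     (const_int 0x186a0)), aarch64_anchor_offset below yields a base offset
     of 0x18000, so the address is split into R' = R + 0x18000 plus a
     residual offset of 0x6a0, letting nearby accesses CSE and share R'.  */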
7319
7320 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7321 {
7322 rtx base = XEXP (x, 0);
7323 rtx offset_rtx = XEXP (x, 1);
7324 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7325
7326 if (GET_CODE (base) == PLUS)
7327 {
7328 rtx op0 = XEXP (base, 0);
7329 rtx op1 = XEXP (base, 1);
7330
7331 /* Force any scaling into a temp for CSE. */
7332 op0 = force_reg (Pmode, op0);
7333 op1 = force_reg (Pmode, op1);
7334
7335 /* Let the pointer register be in op0. */
7336 if (REG_POINTER (op1))
7337 std::swap (op0, op1);
7338
7339 /* If the pointer is virtual or frame related, then we know that
7340 virtual register instantiation or register elimination is going
7341 to apply a second constant. We want the two constants folded
7342 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7343 if (virt_or_elim_regno_p (REGNO (op0)))
7344 {
7345 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7346 NULL_RTX, true, OPTAB_DIRECT);
7347 return gen_rtx_PLUS (Pmode, base, op1);
7348 }
7349
7350 /* Otherwise, in order to encourage CSE (and thence loop strength
7351 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7352 base = expand_binop (Pmode, add_optab, op0, op1,
7353 NULL_RTX, true, OPTAB_DIRECT);
7354 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7355 }
7356
7357 HOST_WIDE_INT size;
7358 if (GET_MODE_SIZE (mode).is_constant (&size))
7359 {
7360 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7361 mode);
7362 if (base_offset != 0)
7363 {
7364 base = plus_constant (Pmode, base, base_offset);
7365 base = force_operand (base, NULL_RTX);
7366 return plus_constant (Pmode, base, offset - base_offset);
7367 }
7368 }
7369 }
7370
7371 return x;
7372 }
7373
7374 /* Return the reload icode required for a constant pool in mode MODE. */
7375 static enum insn_code
7376 aarch64_constant_pool_reload_icode (machine_mode mode)
7377 {
7378 switch (mode)
7379 {
7380 case E_SFmode:
7381 return CODE_FOR_aarch64_reload_movcpsfdi;
7382
7383 case E_DFmode:
7384 return CODE_FOR_aarch64_reload_movcpdfdi;
7385
7386 case E_TFmode:
7387 return CODE_FOR_aarch64_reload_movcptfdi;
7388
7389 case E_V8QImode:
7390 return CODE_FOR_aarch64_reload_movcpv8qidi;
7391
7392 case E_V16QImode:
7393 return CODE_FOR_aarch64_reload_movcpv16qidi;
7394
7395 case E_V4HImode:
7396 return CODE_FOR_aarch64_reload_movcpv4hidi;
7397
7398 case E_V8HImode:
7399 return CODE_FOR_aarch64_reload_movcpv8hidi;
7400
7401 case E_V2SImode:
7402 return CODE_FOR_aarch64_reload_movcpv2sidi;
7403
7404 case E_V4SImode:
7405 return CODE_FOR_aarch64_reload_movcpv4sidi;
7406
7407 case E_V2DImode:
7408 return CODE_FOR_aarch64_reload_movcpv2didi;
7409
7410 case E_V2DFmode:
7411 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7412
7413 default:
7414 gcc_unreachable ();
7415 }
7416
7417 gcc_unreachable ();
7418 }
7419 static reg_class_t
7420 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7421 reg_class_t rclass,
7422 machine_mode mode,
7423 secondary_reload_info *sri)
7424 {
7425 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7426 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7427 comment at the head of aarch64-sve.md for more details about the
7428 big-endian handling. */
7429 if (BYTES_BIG_ENDIAN
7430 && reg_class_subset_p (rclass, FP_REGS)
7431 && !((REG_P (x) && HARD_REGISTER_P (x))
7432 || aarch64_simd_valid_immediate (x, NULL))
7433 && aarch64_sve_data_mode_p (mode))
7434 {
7435 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7436 return NO_REGS;
7437 }
7438
7439 /* If we have to disable direct literal pool loads and stores because the
7440 function is too big, then we need a scratch register. */
7441 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7442 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7443 || targetm.vector_mode_supported_p (GET_MODE (x)))
7444 && !aarch64_pcrelative_literal_loads)
7445 {
7446 sri->icode = aarch64_constant_pool_reload_icode (mode);
7447 return NO_REGS;
7448 }
7449
7450 /* Without the TARGET_SIMD instructions we cannot move a Q register
7451 to a Q register directly. We need a scratch. */
7452 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7453 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7454 && reg_class_subset_p (rclass, FP_REGS))
7455 {
7456 if (mode == TFmode)
7457 sri->icode = CODE_FOR_aarch64_reload_movtf;
7458 else if (mode == TImode)
7459 sri->icode = CODE_FOR_aarch64_reload_movti;
7460 return NO_REGS;
7461 }
7462
7463 /* A TFmode or TImode memory access should be handled via FP_REGS,
7464 because AArch64 has richer addressing modes for LDR/STR instructions
7465 than for LDP/STP instructions. */
7466 if (TARGET_FLOAT && rclass == GENERAL_REGS
7467 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7468 return FP_REGS;
7469
7470 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7471 return GENERAL_REGS;
7472
7473 return NO_REGS;
7474 }
7475
7476 static bool
7477 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7478 {
7479 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7480
7481 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7482 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7483 if (frame_pointer_needed)
7484 return to == HARD_FRAME_POINTER_REGNUM;
7485 return true;
7486 }
7487
7488 poly_int64
7489 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7490 {
7491 aarch64_layout_frame ();
7492
7493 if (to == HARD_FRAME_POINTER_REGNUM)
7494 {
7495 if (from == ARG_POINTER_REGNUM)
7496 return cfun->machine->frame.hard_fp_offset;
7497
7498 if (from == FRAME_POINTER_REGNUM)
7499 return cfun->machine->frame.hard_fp_offset
7500 - cfun->machine->frame.locals_offset;
7501 }
7502
7503 if (to == STACK_POINTER_REGNUM)
7504 {
7505 if (from == FRAME_POINTER_REGNUM)
7506 return cfun->machine->frame.frame_size
7507 - cfun->machine->frame.locals_offset;
7508 }
7509
7510 return cfun->machine->frame.frame_size;
7511 }
7512
7513 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7514 previous frame. */
7515
7516 rtx
7517 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7518 {
7519 if (count != 0)
7520 return const0_rtx;
7521 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7522 }
7523
7524
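/* A rough sketch of the LP64 trampoline emitted below, assuming IP1 is x17
   and the static chain register is x18 (ILP32 uses 'w' loads and 4-byte
   pointer slots):

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	<4 bytes of padding to reach 16 bytes of code>
	<POINTER_BYTES slot for the function address>
	<POINTER_BYTES slot for the static chain>

   The two data slots are zero in the template and are filled in by
   aarch64_trampoline_init.  */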
7525 static void
7526 aarch64_asm_trampoline_template (FILE *f)
7527 {
7528 if (TARGET_ILP32)
7529 {
7530 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7531 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7532 }
7533 else
7534 {
7535 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7536 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7537 }
7538 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7539 assemble_aligned_integer (4, const0_rtx);
7540 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7541 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7542 }
7543
7544 static void
7545 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7546 {
7547 rtx fnaddr, mem, a_tramp;
7548 const int tramp_code_sz = 16;
7549
7550 /* We don't need to copy the trailing D-words; we fill those in below. */
7551 emit_block_move (m_tramp, assemble_trampoline_template (),
7552 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7553 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7554 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7555 if (GET_MODE (fnaddr) != ptr_mode)
7556 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7557 emit_move_insn (mem, fnaddr);
7558
7559 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7560 emit_move_insn (mem, chain_value);
7561
7562 /* XXX We should really define a "clear_cache" pattern and use
7563 gen_clear_cache(). */
7564 a_tramp = XEXP (m_tramp, 0);
7565 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7566 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7567 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7568 ptr_mode);
7569 }
7570
7571 static unsigned char
7572 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7573 {
7574 /* ??? Logically we should only need to provide a value when
7575 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7576 can hold MODE, but at the moment we need to handle all modes.
7577 Just ignore any runtime parts for registers that can't store them. */
7578 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7579 unsigned int nregs;
7580 switch (regclass)
7581 {
7582 case TAILCALL_ADDR_REGS:
7583 case POINTER_REGS:
7584 case GENERAL_REGS:
7585 case ALL_REGS:
7586 case POINTER_AND_FP_REGS:
7587 case FP_REGS:
7588 case FP_LO_REGS:
7589 if (aarch64_sve_data_mode_p (mode)
7590 && constant_multiple_p (GET_MODE_SIZE (mode),
7591 BYTES_PER_SVE_VECTOR, &nregs))
7592 return nregs;
7593 return (aarch64_vector_data_mode_p (mode)
7594 ? CEIL (lowest_size, UNITS_PER_VREG)
7595 : CEIL (lowest_size, UNITS_PER_WORD));
7596 case STACK_REG:
7597 case PR_REGS:
7598 case PR_LO_REGS:
7599 case PR_HI_REGS:
7600 return 1;
7601
7602 case NO_REGS:
7603 return 0;
7604
7605 default:
7606 break;
7607 }
7608 gcc_unreachable ();
7609 }
7610
7611 static reg_class_t
7612 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7613 {
7614 if (regclass == POINTER_REGS)
7615 return GENERAL_REGS;
7616
7617 if (regclass == STACK_REG)
7618 {
7619 if (REG_P(x)
7620 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7621 return regclass;
7622
7623 return NO_REGS;
7624 }
7625
7626 /* Register elimination can result in a request for
7627 SP+constant->FP_REGS. We cannot support such operations, which
7628 use SP as source and an FP_REG as destination, so reject them
7629 right away. */
7630 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7631 {
7632 rtx lhs = XEXP (x, 0);
7633
7634 /* Look through a possible SUBREG introduced by ILP32. */
7635 if (GET_CODE (lhs) == SUBREG)
7636 lhs = SUBREG_REG (lhs);
7637
7638 gcc_assert (REG_P (lhs));
7639 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7640 POINTER_REGS));
7641 return NO_REGS;
7642 }
7643
7644 return regclass;
7645 }
7646
7647 void
7648 aarch64_asm_output_labelref (FILE* f, const char *name)
7649 {
7650 asm_fprintf (f, "%U%s", name);
7651 }
7652
7653 static void
7654 aarch64_elf_asm_constructor (rtx symbol, int priority)
7655 {
7656 if (priority == DEFAULT_INIT_PRIORITY)
7657 default_ctor_section_asm_out_constructor (symbol, priority);
7658 else
7659 {
7660 section *s;
7661 /* Priority is known to be in the range [0, 65535], so 18 bytes
7662 would be enough, but the compiler might not know that. To avoid
7663 a -Wformat-truncation false positive, use a larger size. */
7664 char buf[23];
7665 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7666 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7667 switch_to_section (s);
7668 assemble_align (POINTER_SIZE);
7669 assemble_aligned_integer (POINTER_BYTES, symbol);
7670 }
7671 }
7672
7673 static void
7674 aarch64_elf_asm_destructor (rtx symbol, int priority)
7675 {
7676 if (priority == DEFAULT_INIT_PRIORITY)
7677 default_dtor_section_asm_out_destructor (symbol, priority);
7678 else
7679 {
7680 section *s;
7681 /* Priority is known to be in the range [0, 65535], so 18 bytes
7682 would be enough, but the compiler might not know that. To avoid
7683 a -Wformat-truncation false positive, use a larger size. */
7684 char buf[23];
7685 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7686 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7687 switch_to_section (s);
7688 assemble_align (POINTER_SIZE);
7689 assemble_aligned_integer (POINTER_BYTES, symbol);
7690 }
7691 }
7692
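/* A rough sketch of the dispatch sequence emitted below for a byte-sized
   table (other element sizes differ only in the load and extend used; the
   operand registers shown are illustrative):

	ldrb	w3, [x0, w1, uxtw]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxtb #2
	br	x3
   .Lrtx<N>:

   i.e. each table entry holds roughly (target label - .Lrtx<N>) / 4.  */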
7693 const char*
7694 aarch64_output_casesi (rtx *operands)
7695 {
7696 char buf[100];
7697 char label[100];
7698 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7699 int index;
7700 static const char *const patterns[4][2] =
7701 {
7702 {
7703 "ldrb\t%w3, [%0,%w1,uxtw]",
7704 "add\t%3, %4, %w3, sxtb #2"
7705 },
7706 {
7707 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7708 "add\t%3, %4, %w3, sxth #2"
7709 },
7710 {
7711 "ldr\t%w3, [%0,%w1,uxtw #2]",
7712 "add\t%3, %4, %w3, sxtw #2"
7713 },
7714 /* We assume that DImode is only generated when not optimizing and
7715 that we don't really need 64-bit address offsets. That would
7716 imply an object file with 8GB of code in a single function! */
7717 {
7718 "ldr\t%w3, [%0,%w1,uxtw #2]",
7719 "add\t%3, %4, %w3, sxtw #2"
7720 }
7721 };
7722
7723 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7724
7725 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7726 index = exact_log2 (GET_MODE_SIZE (mode));
7727
7728 gcc_assert (index >= 0 && index <= 3);
7729
7730 /* Need to implement table size reduction, by changing the code below. */
7731 output_asm_insn (patterns[index][0], operands);
7732 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7733 snprintf (buf, sizeof (buf),
7734 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7735 output_asm_insn (buf, operands);
7736 output_asm_insn (patterns[index][1], operands);
7737 output_asm_insn ("br\t%3", operands);
7738 assemble_label (asm_out_file, label);
7739 return "";
7740 }
7741
7742
7743 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7744 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7745 operator. */
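/* For example, aarch64_uxt_size (3, 0x7f8) returns 8, since
   0x7f8 == 0xff << 3; an AND with that mask of a value shifted left by 3
   behaves like a UXTB extend combined with a left shift of 3.  */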
7746
7747 int
7748 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7749 {
7750 if (shift >= 0 && shift <= 3)
7751 {
7752 int size;
7753 for (size = 8; size <= 32; size *= 2)
7754 {
7755 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7756 if (mask == bits << shift)
7757 return size;
7758 }
7759 }
7760 return 0;
7761 }
7762
7763 /* Constant pools are per-function only when PC-relative
7764 literal loads are enabled or we are using the large memory
7765 model. */
7766
7767 static inline bool
7768 aarch64_can_use_per_function_literal_pools_p (void)
7769 {
7770 return (aarch64_pcrelative_literal_loads
7771 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7772 }
7773
7774 static bool
7775 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7776 {
7777 /* We can't use blocks for constants when we're using a per-function
7778 constant pool. */
7779 return !aarch64_can_use_per_function_literal_pools_p ();
7780 }
7781
7782 /* Select appropriate section for constants depending
7783 on where we place literal pools. */
7784
7785 static section *
7786 aarch64_select_rtx_section (machine_mode mode,
7787 rtx x,
7788 unsigned HOST_WIDE_INT align)
7789 {
7790 if (aarch64_can_use_per_function_literal_pools_p ())
7791 return function_section (current_function_decl);
7792
7793 return default_elf_select_rtx_section (mode, x, align);
7794 }
7795
7796 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7797 void
7798 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7799 HOST_WIDE_INT offset)
7800 {
7801 /* When using per-function literal pools, we must ensure that any code
7802 section is aligned to the minimal instruction length, lest we get
7803 errors from the assembler about "unaligned instructions". */
7804 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7805 ASM_OUTPUT_ALIGN (f, 2);
7806 }
7807
7808 /* Costs. */
7809
7810 /* Helper function for rtx cost calculation. Strip a shift expression
7811 from X. Returns the inner operand if successful, or the original
7812 expression on failure. */
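/* For example, both (ashift (reg X) (const_int 3)) and
   (mult (reg X) (const_int 8)) are stripped to (reg X).  */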
7813 static rtx
7814 aarch64_strip_shift (rtx x)
7815 {
7816 rtx op = x;
7817
7818 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7819 we can convert both to ROR during final output. */
7820 if ((GET_CODE (op) == ASHIFT
7821 || GET_CODE (op) == ASHIFTRT
7822 || GET_CODE (op) == LSHIFTRT
7823 || GET_CODE (op) == ROTATERT
7824 || GET_CODE (op) == ROTATE)
7825 && CONST_INT_P (XEXP (op, 1)))
7826 return XEXP (op, 0);
7827
7828 if (GET_CODE (op) == MULT
7829 && CONST_INT_P (XEXP (op, 1))
7830 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7831 return XEXP (op, 0);
7832
7833 return x;
7834 }
7835
7836 /* Helper function for rtx cost calculation. Strip an extend
7837 expression from X. Returns the inner operand if successful, or the
7838 original expression on failure. We deal with a number of possible
7839 canonicalization variations here. If STRIP_SHIFT is true, then
7840 we can strip off a shift also. */
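/* For example, (zero_extend:DI (reg:SI X)) is stripped to its operand, as is
   the AND form (and:DI (mult:DI (reg:DI X) (const_int 4)) (const_int 1020)),
   which encodes a zero-extended register scaled by 4
   (aarch64_uxt_size (2, 1020) == 8).  */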
7841 static rtx
7842 aarch64_strip_extend (rtx x, bool strip_shift)
7843 {
7844 scalar_int_mode mode;
7845 rtx op = x;
7846
7847 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7848 return op;
7849
7850 /* Zero and sign extraction of a widened value. */
7851 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7852 && XEXP (op, 2) == const0_rtx
7853 && GET_CODE (XEXP (op, 0)) == MULT
7854 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7855 XEXP (op, 1)))
7856 return XEXP (XEXP (op, 0), 0);
7857
7858 /* It can also be represented (for zero-extend) as an AND with an
7859 immediate. */
7860 if (GET_CODE (op) == AND
7861 && GET_CODE (XEXP (op, 0)) == MULT
7862 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7863 && CONST_INT_P (XEXP (op, 1))
7864 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7865 INTVAL (XEXP (op, 1))) != 0)
7866 return XEXP (XEXP (op, 0), 0);
7867
7868 /* Now handle extended register, as this may also have an optional
7869 left shift by 1..4. */
7870 if (strip_shift
7871 && GET_CODE (op) == ASHIFT
7872 && CONST_INT_P (XEXP (op, 1))
7873 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7874 op = XEXP (op, 0);
7875
7876 if (GET_CODE (op) == ZERO_EXTEND
7877 || GET_CODE (op) == SIGN_EXTEND)
7878 op = XEXP (op, 0);
7879
7880 if (op != x)
7881 return op;
7882
7883 return x;
7884 }
7885
7886 /* Return true iff CODE is a shift supported in combination
7887 with arithmetic instructions. */
7888
7889 static bool
7890 aarch64_shift_p (enum rtx_code code)
7891 {
7892 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7893 }
7894
7895
7896 /* Return true iff X is a cheap shift without a sign extend. */
7897
7898 static bool
7899 aarch64_cheap_mult_shift_p (rtx x)
7900 {
7901 rtx op0, op1;
7902
7903 op0 = XEXP (x, 0);
7904 op1 = XEXP (x, 1);
7905
7906 if (!(aarch64_tune_params.extra_tuning_flags
7907 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7908 return false;
7909
7910 if (GET_CODE (op0) == SIGN_EXTEND)
7911 return false;
7912
7913 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7914 && UINTVAL (op1) <= 4)
7915 return true;
7916
7917 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7918 return false;
7919
7920 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7921
7922 if (l2 > 0 && l2 <= 4)
7923 return true;
7924
7925 return false;
7926 }
7927
7928 /* Helper function for rtx cost calculation. Calculate the cost of
7929 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7930 Return the calculated cost of the expression, recursing manually into
7931 operands where needed. */
7932
7933 static int
7934 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7935 {
7936 rtx op0, op1;
7937 const struct cpu_cost_table *extra_cost
7938 = aarch64_tune_params.insn_extra_cost;
7939 int cost = 0;
7940 bool compound_p = (outer == PLUS || outer == MINUS);
7941 machine_mode mode = GET_MODE (x);
7942
7943 gcc_checking_assert (code == MULT);
7944
7945 op0 = XEXP (x, 0);
7946 op1 = XEXP (x, 1);
7947
7948 if (VECTOR_MODE_P (mode))
7949 mode = GET_MODE_INNER (mode);
7950
7951 /* Integer multiply/fma. */
7952 if (GET_MODE_CLASS (mode) == MODE_INT)
7953 {
7954 /* The multiply will be canonicalized as a shift, cost it as such. */
7955 if (aarch64_shift_p (GET_CODE (x))
7956 || (CONST_INT_P (op1)
7957 && exact_log2 (INTVAL (op1)) > 0))
7958 {
7959 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7960 || GET_CODE (op0) == SIGN_EXTEND;
7961 if (speed)
7962 {
7963 if (compound_p)
7964 {
7965 /* If the shift is considered cheap,
7966 then don't add any cost. */
7967 if (aarch64_cheap_mult_shift_p (x))
7968 ;
7969 else if (REG_P (op1))
7970 /* ARITH + shift-by-register. */
7971 cost += extra_cost->alu.arith_shift_reg;
7972 else if (is_extend)
7973 /* ARITH + extended register. We don't have a cost field
7974 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7975 cost += extra_cost->alu.extend_arith;
7976 else
7977 /* ARITH + shift-by-immediate. */
7978 cost += extra_cost->alu.arith_shift;
7979 }
7980 else
7981 /* LSL (immediate). */
7982 cost += extra_cost->alu.shift;
7983
7984 }
7985 /* Strip extends as we will have costed them in the case above. */
7986 if (is_extend)
7987 op0 = aarch64_strip_extend (op0, true);
7988
7989 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7990
7991 return cost;
7992 }
7993
7994 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7995 compound and let the below cases handle it. After all, MNEG is a
7996 special-case alias of MSUB. */
7997 if (GET_CODE (op0) == NEG)
7998 {
7999 op0 = XEXP (op0, 0);
8000 compound_p = true;
8001 }
8002
8003 /* Integer multiplies or FMAs have zero/sign extending variants. */
8004 if ((GET_CODE (op0) == ZERO_EXTEND
8005 && GET_CODE (op1) == ZERO_EXTEND)
8006 || (GET_CODE (op0) == SIGN_EXTEND
8007 && GET_CODE (op1) == SIGN_EXTEND))
8008 {
8009 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8010 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8011
8012 if (speed)
8013 {
8014 if (compound_p)
8015 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8016 cost += extra_cost->mult[0].extend_add;
8017 else
8018 /* MUL/SMULL/UMULL. */
8019 cost += extra_cost->mult[0].extend;
8020 }
8021
8022 return cost;
8023 }
8024
8025 /* This is either an integer multiply or a MADD. In both cases
8026 we want to recurse and cost the operands. */
8027 cost += rtx_cost (op0, mode, MULT, 0, speed);
8028 cost += rtx_cost (op1, mode, MULT, 1, speed);
8029
8030 if (speed)
8031 {
8032 if (compound_p)
8033 /* MADD/MSUB. */
8034 cost += extra_cost->mult[mode == DImode].add;
8035 else
8036 /* MUL. */
8037 cost += extra_cost->mult[mode == DImode].simple;
8038 }
8039
8040 return cost;
8041 }
8042 else
8043 {
8044 if (speed)
8045 {
8046 /* Floating-point FMA/FMUL can also support negations of the
8047 operands, unless the rounding mode is upward or downward in
8048 which case FNMUL is different from FMUL with operand negation. */
8049 bool neg0 = GET_CODE (op0) == NEG;
8050 bool neg1 = GET_CODE (op1) == NEG;
8051 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8052 {
8053 if (neg0)
8054 op0 = XEXP (op0, 0);
8055 if (neg1)
8056 op1 = XEXP (op1, 0);
8057 }
8058
8059 if (compound_p)
8060 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8061 cost += extra_cost->fp[mode == DFmode].fma;
8062 else
8063 /* FMUL/FNMUL. */
8064 cost += extra_cost->fp[mode == DFmode].mult;
8065 }
8066
8067 cost += rtx_cost (op0, mode, MULT, 0, speed);
8068 cost += rtx_cost (op1, mode, MULT, 1, speed);
8069 return cost;
8070 }
8071 }
8072
8073 static int
8074 aarch64_address_cost (rtx x,
8075 machine_mode mode,
8076 addr_space_t as ATTRIBUTE_UNUSED,
8077 bool speed)
8078 {
8079 enum rtx_code c = GET_CODE (x);
8080 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8081 struct aarch64_address_info info;
8082 int cost = 0;
8083 info.shift = 0;
8084
8085 if (!aarch64_classify_address (&info, x, mode, false))
8086 {
8087 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8088 {
8089 /* This is a CONST or SYMBOL ref which will be split
8090 in a different way depending on the code model in use.
8091 Cost it through the generic infrastructure. */
8092 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8093 /* Divide through by the cost of one instruction to
8094 bring it to the same units as the address costs. */
8095 cost_symbol_ref /= COSTS_N_INSNS (1);
8096 /* The cost is then the cost of preparing the address,
8097 followed by an immediate (possibly 0) offset. */
8098 return cost_symbol_ref + addr_cost->imm_offset;
8099 }
8100 else
8101 {
8102 /* This is most likely a jump table from a case
8103 statement. */
8104 return addr_cost->register_offset;
8105 }
8106 }
8107
8108 switch (info.type)
8109 {
8110 case ADDRESS_LO_SUM:
8111 case ADDRESS_SYMBOLIC:
8112 case ADDRESS_REG_IMM:
8113 cost += addr_cost->imm_offset;
8114 break;
8115
8116 case ADDRESS_REG_WB:
8117 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8118 cost += addr_cost->pre_modify;
8119 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8120 cost += addr_cost->post_modify;
8121 else
8122 gcc_unreachable ();
8123
8124 break;
8125
8126 case ADDRESS_REG_REG:
8127 cost += addr_cost->register_offset;
8128 break;
8129
8130 case ADDRESS_REG_SXTW:
8131 cost += addr_cost->register_sextend;
8132 break;
8133
8134 case ADDRESS_REG_UXTW:
8135 cost += addr_cost->register_zextend;
8136 break;
8137
8138 default:
8139 gcc_unreachable ();
8140 }
8141
8142
8143 if (info.shift > 0)
8144 {
8145 /* For the sake of calculating the cost of the shifted register
8146 component, we can treat same sized modes in the same way. */
8147 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8148 cost += addr_cost->addr_scale_costs.hi;
8149 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8150 cost += addr_cost->addr_scale_costs.si;
8151 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8152 cost += addr_cost->addr_scale_costs.di;
8153 else
8154 /* We can't tell, or this is a 128-bit vector. */
8155 cost += addr_cost->addr_scale_costs.ti;
8156 }
8157
8158 return cost;
8159 }
8160
8161 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8162 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8163 to be taken. */
8164
8165 int
8166 aarch64_branch_cost (bool speed_p, bool predictable_p)
8167 {
8168 /* When optimizing for speed, use the cost of unpredictable branches. */
8169 const struct cpu_branch_cost *branch_costs =
8170 aarch64_tune_params.branch_costs;
8171
8172 if (!speed_p || predictable_p)
8173 return branch_costs->predictable;
8174 else
8175 return branch_costs->unpredictable;
8176 }
8177
8178 /* Return true if the RTX X in mode MODE is a zero or sign extract
8179 usable in an ADD or SUB (extended register) instruction. */
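/* For example, a (sign_extend:DI (reg:SI W)) operand of a PLUS matches the
   simple case below and corresponds to the extended-register form
   "add x0, x1, w2, sxtw" (the register numbers are illustrative).  */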
8180 static bool
8181 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8182 {
8183 /* Catch add with a sign extract.
8184 This is add_<optab><mode>_multp2. */
8185 if (GET_CODE (x) == SIGN_EXTRACT
8186 || GET_CODE (x) == ZERO_EXTRACT)
8187 {
8188 rtx op0 = XEXP (x, 0);
8189 rtx op1 = XEXP (x, 1);
8190 rtx op2 = XEXP (x, 2);
8191
8192 if (GET_CODE (op0) == MULT
8193 && CONST_INT_P (op1)
8194 && op2 == const0_rtx
8195 && CONST_INT_P (XEXP (op0, 1))
8196 && aarch64_is_extend_from_extract (mode,
8197 XEXP (op0, 1),
8198 op1))
8199 {
8200 return true;
8201 }
8202 }
8203 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8204 No shift. */
8205 else if (GET_CODE (x) == SIGN_EXTEND
8206 || GET_CODE (x) == ZERO_EXTEND)
8207 return REG_P (XEXP (x, 0));
8208
8209 return false;
8210 }
8211
8212 static bool
8213 aarch64_frint_unspec_p (unsigned int u)
8214 {
8215 switch (u)
8216 {
8217 case UNSPEC_FRINTZ:
8218 case UNSPEC_FRINTP:
8219 case UNSPEC_FRINTM:
8220 case UNSPEC_FRINTA:
8221 case UNSPEC_FRINTN:
8222 case UNSPEC_FRINTX:
8223 case UNSPEC_FRINTI:
8224 return true;
8225
8226 default:
8227 return false;
8228 }
8229 }
8230
8231 /* Return true iff X is an rtx that will match an extr instruction
8232 i.e. as described in the *extr<mode>5_insn family of patterns.
8233 On success *RES_OP0 and *RES_OP1 are set to the operands of the shifts
8234 involved; otherwise they are set to NULL_RTX. */
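/* For example, in DImode (ior (ashift (reg A) (const_int 40))
   (lshiftrt (reg B) (const_int 24))) matches, since 40 + 24 == 64; it
   corresponds roughly to "extr xd, xa, xb, #24" (a ROR when A == B).  */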
8235
8236 static bool
8237 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8238 {
8239 rtx op0, op1;
8240 scalar_int_mode mode;
8241 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8242 return false;
8243
8244 *res_op0 = NULL_RTX;
8245 *res_op1 = NULL_RTX;
8246
8247 if (GET_CODE (x) != IOR)
8248 return false;
8249
8250 op0 = XEXP (x, 0);
8251 op1 = XEXP (x, 1);
8252
8253 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8254 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8255 {
8256 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8257 if (GET_CODE (op1) == ASHIFT)
8258 std::swap (op0, op1);
8259
8260 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8261 return false;
8262
8263 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8264 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8265
8266 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8267 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8268 {
8269 *res_op0 = XEXP (op0, 0);
8270 *res_op1 = XEXP (op1, 0);
8271 return true;
8272 }
8273 }
8274
8275 return false;
8276 }
8277
8278 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8279 storing it in *COST. Result is true if the total cost of the operation
8280 has now been calculated. */
8281 static bool
8282 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8283 {
8284 rtx inner;
8285 rtx comparator;
8286 enum rtx_code cmpcode;
8287
8288 if (COMPARISON_P (op0))
8289 {
8290 inner = XEXP (op0, 0);
8291 comparator = XEXP (op0, 1);
8292 cmpcode = GET_CODE (op0);
8293 }
8294 else
8295 {
8296 inner = op0;
8297 comparator = const0_rtx;
8298 cmpcode = NE;
8299 }
8300
8301 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8302 {
8303 /* Conditional branch. */
8304 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8305 return true;
8306 else
8307 {
8308 if (cmpcode == NE || cmpcode == EQ)
8309 {
8310 if (comparator == const0_rtx)
8311 {
8312 /* TBZ/TBNZ/CBZ/CBNZ. */
8313 if (GET_CODE (inner) == ZERO_EXTRACT)
8314 /* TBZ/TBNZ. */
8315 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8316 ZERO_EXTRACT, 0, speed);
8317 else
8318 /* CBZ/CBNZ. */
8319 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8320
8321 return true;
8322 }
8323 }
8324 else if (cmpcode == LT || cmpcode == GE)
8325 {
8326 /* TBZ/TBNZ. */
8327 if (comparator == const0_rtx)
8328 return true;
8329 }
8330 }
8331 }
8332 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8333 {
8334 /* CCMP. */
8335 if (GET_CODE (op1) == COMPARE)
8336 {
8337 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8338 if (XEXP (op1, 1) == const0_rtx)
8339 *cost += 1;
8340 if (speed)
8341 {
8342 machine_mode mode = GET_MODE (XEXP (op1, 0));
8343 const struct cpu_cost_table *extra_cost
8344 = aarch64_tune_params.insn_extra_cost;
8345
8346 if (GET_MODE_CLASS (mode) == MODE_INT)
8347 *cost += extra_cost->alu.arith;
8348 else
8349 *cost += extra_cost->fp[mode == DFmode].compare;
8350 }
8351 return true;
8352 }
8353
8354 /* It's a conditional operation based on the status flags,
8355 so it must be some flavor of CSEL. */
8356
8357 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8358 if (GET_CODE (op1) == NEG
8359 || GET_CODE (op1) == NOT
8360 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8361 op1 = XEXP (op1, 0);
8362 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8363 {
8364 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8365 op1 = XEXP (op1, 0);
8366 op2 = XEXP (op2, 0);
8367 }
8368
8369 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8370 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8371 return true;
8372 }
8373
8374 /* We don't know what this is, cost all operands. */
8375 return false;
8376 }
8377
8378 /* Check whether X is a bitfield operation of the form shift + extend that
8379 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8380 operand to which the bitfield operation is applied. Otherwise return
8381 NULL_RTX. */
8382
8383 static rtx
8384 aarch64_extend_bitfield_pattern_p (rtx x)
8385 {
8386 rtx_code outer_code = GET_CODE (x);
8387 machine_mode outer_mode = GET_MODE (x);
8388
8389 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8390 && outer_mode != SImode && outer_mode != DImode)
8391 return NULL_RTX;
8392
8393 rtx inner = XEXP (x, 0);
8394 rtx_code inner_code = GET_CODE (inner);
8395 machine_mode inner_mode = GET_MODE (inner);
8396 rtx op = NULL_RTX;
8397
8398 switch (inner_code)
8399 {
8400 case ASHIFT:
8401 if (CONST_INT_P (XEXP (inner, 1))
8402 && (inner_mode == QImode || inner_mode == HImode))
8403 op = XEXP (inner, 0);
8404 break;
8405 case LSHIFTRT:
8406 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8407 && (inner_mode == QImode || inner_mode == HImode))
8408 op = XEXP (inner, 0);
8409 break;
8410 case ASHIFTRT:
8411 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8412 && (inner_mode == QImode || inner_mode == HImode))
8413 op = XEXP (inner, 0);
8414 break;
8415 default:
8416 break;
8417 }
8418
8419 return op;
8420 }
8421
8422 /* Return true if the mask and a shift amount from an RTX of the form
8423 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8424 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
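/* For example, with SImode, mask 0xff00 and shift amount 8:
   (0xff00 >> 8) + 1 == 0x100 is a power of two and the low 8 bits of the
   mask are clear, so (x << 8) & 0xff00 can become "ubfiz w0, w1, #8, #8"
   (register numbers illustrative).  */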
8425
8426 bool
8427 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8428 rtx shft_amnt)
8429 {
8430 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8431 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8432 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8433 && (INTVAL (mask)
8434 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
8435 }
8436
8437 /* Calculate the cost of calculating X, storing it in *COST. Result
8438 is true if the total cost of the operation has now been calculated. */
8439 static bool
8440 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8441 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8442 {
8443 rtx op0, op1, op2;
8444 const struct cpu_cost_table *extra_cost
8445 = aarch64_tune_params.insn_extra_cost;
8446 int code = GET_CODE (x);
8447 scalar_int_mode int_mode;
8448
8449 /* By default, assume that everything has equivalent cost to the
8450 cheapest instruction. Any additional costs are applied as a delta
8451 above this default. */
8452 *cost = COSTS_N_INSNS (1);
8453
8454 switch (code)
8455 {
8456 case SET:
8457 /* The cost depends entirely on the operands to SET. */
8458 *cost = 0;
8459 op0 = SET_DEST (x);
8460 op1 = SET_SRC (x);
8461
8462 switch (GET_CODE (op0))
8463 {
8464 case MEM:
8465 if (speed)
8466 {
8467 rtx address = XEXP (op0, 0);
8468 if (VECTOR_MODE_P (mode))
8469 *cost += extra_cost->ldst.storev;
8470 else if (GET_MODE_CLASS (mode) == MODE_INT)
8471 *cost += extra_cost->ldst.store;
8472 else if (mode == SFmode)
8473 *cost += extra_cost->ldst.storef;
8474 else if (mode == DFmode)
8475 *cost += extra_cost->ldst.stored;
8476
8477 *cost +=
8478 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8479 0, speed));
8480 }
8481
8482 *cost += rtx_cost (op1, mode, SET, 1, speed);
8483 return true;
8484
8485 case SUBREG:
8486 if (! REG_P (SUBREG_REG (op0)))
8487 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8488
8489 /* Fall through. */
8490 case REG:
8491 /* The cost is one per vector-register copied. */
8492 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8493 {
8494 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8495 *cost = COSTS_N_INSNS (nregs);
8496 }
8497 /* const0_rtx is in general free, but we will use an
8498 instruction to set a register to 0. */
8499 else if (REG_P (op1) || op1 == const0_rtx)
8500 {
8501 /* The cost is 1 per register copied. */
8502 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8503 *cost = COSTS_N_INSNS (nregs);
8504 }
8505 else
8506 /* Cost is just the cost of the RHS of the set. */
8507 *cost += rtx_cost (op1, mode, SET, 1, speed);
8508 return true;
8509
8510 case ZERO_EXTRACT:
8511 case SIGN_EXTRACT:
8512 /* Bit-field insertion. Strip any redundant widening of
8513 the RHS to meet the width of the target. */
8514 if (GET_CODE (op1) == SUBREG)
8515 op1 = SUBREG_REG (op1);
8516 if ((GET_CODE (op1) == ZERO_EXTEND
8517 || GET_CODE (op1) == SIGN_EXTEND)
8518 && CONST_INT_P (XEXP (op0, 1))
8519 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8520 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8521 op1 = XEXP (op1, 0);
8522
8523 if (CONST_INT_P (op1))
8524 {
8525 /* MOV immediate is assumed to always be cheap. */
8526 *cost = COSTS_N_INSNS (1);
8527 }
8528 else
8529 {
8530 /* BFM. */
8531 if (speed)
8532 *cost += extra_cost->alu.bfi;
8533 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8534 }
8535
8536 return true;
8537
8538 default:
8539 /* We can't make sense of this, assume default cost. */
8540 *cost = COSTS_N_INSNS (1);
8541 return false;
8542 }
8543 return false;
8544
8545 case CONST_INT:
8546 /* If an instruction can incorporate a constant within the
8547 instruction, the instruction's expression avoids calling
8548 rtx_cost() on the constant. If rtx_cost() is called on a
8549 constant, then it is usually because the constant must be
8550 moved into a register by one or more instructions.
8551
8552 The exception is constant 0, which can be expressed
8553 as XZR/WZR and is therefore free. The exception to this is
8554 if we have (set (reg) (const0_rtx)) in which case we must cost
8555 the move. However, we can catch that when we cost the SET, so
8556 we don't need to consider that here. */
8557 if (x == const0_rtx)
8558 *cost = 0;
8559 else
8560 {
8561 /* To an approximation, building any other constant is
8562 proportionally expensive to the number of instructions
8563 required to build that constant. This is true whether we
8564 are compiling for SPEED or otherwise. */
8565 if (!is_a <scalar_int_mode> (mode, &int_mode))
8566 int_mode = word_mode;
8567 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8568 (NULL_RTX, x, false, int_mode));
8569 }
8570 return true;
8571
8572 case CONST_DOUBLE:
8573
8574 /* First determine number of instructions to do the move
8575 as an integer constant. */
8576 if (!aarch64_float_const_representable_p (x)
8577 && !aarch64_can_const_movi_rtx_p (x, mode)
8578 && aarch64_float_const_rtx_p (x))
8579 {
8580 unsigned HOST_WIDE_INT ival;
8581 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8582 gcc_assert (succeed);
8583
8584 scalar_int_mode imode = (mode == HFmode
8585 ? SImode
8586 : int_mode_for_mode (mode).require ());
8587 int ncost = aarch64_internal_mov_immediate
8588 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8589 *cost += COSTS_N_INSNS (ncost);
8590 return true;
8591 }
8592
8593 if (speed)
8594 {
8595 /* mov[df,sf]_aarch64. */
8596 if (aarch64_float_const_representable_p (x))
8597 /* FMOV (scalar immediate). */
8598 *cost += extra_cost->fp[mode == DFmode].fpconst;
8599 else if (!aarch64_float_const_zero_rtx_p (x))
8600 {
8601 /* This will be a load from memory. */
8602 if (mode == DFmode)
8603 *cost += extra_cost->ldst.loadd;
8604 else
8605 *cost += extra_cost->ldst.loadf;
8606 }
8607 else
8608 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8609 or MOV v0.s[0], wzr - neither of which is modeled by the
8610 cost tables. Just use the default cost. */
8611 {
8612 }
8613 }
8614
8615 return true;
8616
8617 case MEM:
8618 if (speed)
8619 {
8620 /* For loads we want the base cost of a load, plus an
8621 approximation for the additional cost of the addressing
8622 mode. */
8623 rtx address = XEXP (x, 0);
8624 if (VECTOR_MODE_P (mode))
8625 *cost += extra_cost->ldst.loadv;
8626 else if (GET_MODE_CLASS (mode) == MODE_INT)
8627 *cost += extra_cost->ldst.load;
8628 else if (mode == SFmode)
8629 *cost += extra_cost->ldst.loadf;
8630 else if (mode == DFmode)
8631 *cost += extra_cost->ldst.loadd;
8632
8633 *cost +=
8634 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8635 0, speed));
8636 }
8637
8638 return true;
8639
8640 case NEG:
8641 op0 = XEXP (x, 0);
8642
8643 if (VECTOR_MODE_P (mode))
8644 {
8645 if (speed)
8646 {
8647 /* FNEG. */
8648 *cost += extra_cost->vect.alu;
8649 }
8650 return false;
8651 }
8652
8653 if (GET_MODE_CLASS (mode) == MODE_INT)
8654 {
8655 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8656 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8657 {
8658 /* CSETM. */
8659 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8660 return true;
8661 }
8662
8663 /* Cost this as SUB wzr, X. */
8664 op0 = CONST0_RTX (mode);
8665 op1 = XEXP (x, 0);
8666 goto cost_minus;
8667 }
8668
8669 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8670 {
8671 /* Support (neg(fma...)) as a single instruction only if
8672 sign of zeros is unimportant. This matches the decision
8673 making in aarch64.md. */
8674 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8675 {
8676 /* FNMADD. */
8677 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8678 return true;
8679 }
8680 if (GET_CODE (op0) == MULT)
8681 {
8682 /* FNMUL. */
8683 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8684 return true;
8685 }
8686 if (speed)
8687 /* FNEG. */
8688 *cost += extra_cost->fp[mode == DFmode].neg;
8689 return false;
8690 }
8691
8692 return false;
8693
8694 case CLRSB:
8695 case CLZ:
8696 if (speed)
8697 {
8698 if (VECTOR_MODE_P (mode))
8699 *cost += extra_cost->vect.alu;
8700 else
8701 *cost += extra_cost->alu.clz;
8702 }
8703
8704 return false;
8705
8706 case COMPARE:
8707 op0 = XEXP (x, 0);
8708 op1 = XEXP (x, 1);
8709
8710 if (op1 == const0_rtx
8711 && GET_CODE (op0) == AND)
8712 {
8713 x = op0;
8714 mode = GET_MODE (op0);
8715 goto cost_logic;
8716 }
8717
8718 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8719 {
8720 /* TODO: A write to the CC flags possibly costs extra, this
8721 needs encoding in the cost tables. */
8722
8723 mode = GET_MODE (op0);
8724 /* ANDS. */
8725 if (GET_CODE (op0) == AND)
8726 {
8727 x = op0;
8728 goto cost_logic;
8729 }
8730
8731 if (GET_CODE (op0) == PLUS)
8732 {
8733 /* ADDS (and CMN alias). */
8734 x = op0;
8735 goto cost_plus;
8736 }
8737
8738 if (GET_CODE (op0) == MINUS)
8739 {
8740 /* SUBS. */
8741 x = op0;
8742 goto cost_minus;
8743 }
8744
8745 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8746 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8747 && CONST_INT_P (XEXP (op0, 2)))
8748 {
8749 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8750 Handle it here directly rather than going to cost_logic
8751 since we know the immediate generated for the TST is valid
8752 so we can avoid creating an intermediate rtx for it only
8753 for costing purposes. */
8754 if (speed)
8755 *cost += extra_cost->alu.logical;
8756
8757 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8758 ZERO_EXTRACT, 0, speed);
8759 return true;
8760 }
8761
8762 if (GET_CODE (op1) == NEG)
8763 {
8764 /* CMN. */
8765 if (speed)
8766 *cost += extra_cost->alu.arith;
8767
8768 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8769 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8770 return true;
8771 }
8772
8773 /* CMP.
8774
8775 Compare can freely swap the order of operands, and
8776 canonicalization puts the more complex operation first.
8777 But the integer MINUS logic expects the shift/extend
8778 operation in op1. */
8779 if (! (REG_P (op0)
8780 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8781 {
8782 op0 = XEXP (x, 1);
8783 op1 = XEXP (x, 0);
8784 }
8785 goto cost_minus;
8786 }
8787
8788 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8789 {
8790 /* FCMP. */
8791 if (speed)
8792 *cost += extra_cost->fp[mode == DFmode].compare;
8793
8794 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8795 {
8796 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8797 /* FCMP supports constant 0.0 for no extra cost. */
8798 return true;
8799 }
8800 return false;
8801 }
8802
8803 if (VECTOR_MODE_P (mode))
8804 {
8805 /* Vector compare. */
8806 if (speed)
8807 *cost += extra_cost->vect.alu;
8808
8809 if (aarch64_float_const_zero_rtx_p (op1))
8810 {
8811 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8812 cost. */
8813 return true;
8814 }
8815 return false;
8816 }
8817 return false;
8818
8819 case MINUS:
8820 {
8821 op0 = XEXP (x, 0);
8822 op1 = XEXP (x, 1);
8823
8824 cost_minus:
8825 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8826
8827 /* Detect valid immediates. */
8828 if ((GET_MODE_CLASS (mode) == MODE_INT
8829 || (GET_MODE_CLASS (mode) == MODE_CC
8830 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8831 && CONST_INT_P (op1)
8832 && aarch64_uimm12_shift (INTVAL (op1)))
8833 {
8834 if (speed)
8835 /* SUB(S) (immediate). */
8836 *cost += extra_cost->alu.arith;
8837 return true;
8838 }
8839
8840 /* Look for SUB (extended register). */
8841 if (is_a <scalar_int_mode> (mode, &int_mode)
8842 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8843 {
8844 if (speed)
8845 *cost += extra_cost->alu.extend_arith;
8846
8847 op1 = aarch64_strip_extend (op1, true);
8848 *cost += rtx_cost (op1, VOIDmode,
8849 (enum rtx_code) GET_CODE (op1), 0, speed);
8850 return true;
8851 }
8852
8853 rtx new_op1 = aarch64_strip_extend (op1, false);
8854
8855 /* Cost this as an FMA-alike operation. */
8856 if ((GET_CODE (new_op1) == MULT
8857 || aarch64_shift_p (GET_CODE (new_op1)))
8858 && code != COMPARE)
8859 {
8860 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8861 (enum rtx_code) code,
8862 speed);
8863 return true;
8864 }
8865
8866 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8867
8868 if (speed)
8869 {
8870 if (VECTOR_MODE_P (mode))
8871 {
8872 /* Vector SUB. */
8873 *cost += extra_cost->vect.alu;
8874 }
8875 else if (GET_MODE_CLASS (mode) == MODE_INT)
8876 {
8877 /* SUB(S). */
8878 *cost += extra_cost->alu.arith;
8879 }
8880 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8881 {
8882 /* FSUB. */
8883 *cost += extra_cost->fp[mode == DFmode].addsub;
8884 }
8885 }
8886 return true;
8887 }
8888
8889 case PLUS:
8890 {
8891 rtx new_op0;
8892
8893 op0 = XEXP (x, 0);
8894 op1 = XEXP (x, 1);
8895
8896 cost_plus:
8897 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8898 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8899 {
8900 /* CSINC. */
8901 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8902 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8903 return true;
8904 }
8905
8906 if (GET_MODE_CLASS (mode) == MODE_INT
8907 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8908 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8909 {
8910 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8911
8912 if (speed)
8913 /* ADD (immediate). */
8914 *cost += extra_cost->alu.arith;
8915 return true;
8916 }
8917
8918 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8919
8920 /* Look for ADD (extended register). */
8921 if (is_a <scalar_int_mode> (mode, &int_mode)
8922 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8923 {
8924 if (speed)
8925 *cost += extra_cost->alu.extend_arith;
8926
8927 op0 = aarch64_strip_extend (op0, true);
8928 *cost += rtx_cost (op0, VOIDmode,
8929 (enum rtx_code) GET_CODE (op0), 0, speed);
8930 return true;
8931 }
8932
8933 /* Strip any extend, leave shifts behind as we will
8934 cost them through mult_cost. */
8935 new_op0 = aarch64_strip_extend (op0, false);
8936
8937 if (GET_CODE (new_op0) == MULT
8938 || aarch64_shift_p (GET_CODE (new_op0)))
8939 {
8940 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8941 speed);
8942 return true;
8943 }
8944
8945 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8946
8947 if (speed)
8948 {
8949 if (VECTOR_MODE_P (mode))
8950 {
8951 /* Vector ADD. */
8952 *cost += extra_cost->vect.alu;
8953 }
8954 else if (GET_MODE_CLASS (mode) == MODE_INT)
8955 {
8956 /* ADD. */
8957 *cost += extra_cost->alu.arith;
8958 }
8959 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8960 {
8961 /* FADD. */
8962 *cost += extra_cost->fp[mode == DFmode].addsub;
8963 }
8964 }
8965 return true;
8966 }
8967
8968 case BSWAP:
8969 *cost = COSTS_N_INSNS (1);
8970
8971 if (speed)
8972 {
8973 if (VECTOR_MODE_P (mode))
8974 *cost += extra_cost->vect.alu;
8975 else
8976 *cost += extra_cost->alu.rev;
8977 }
8978 return false;
8979
8980 case IOR:
8981 if (aarch_rev16_p (x))
8982 {
8983 *cost = COSTS_N_INSNS (1);
8984
8985 if (speed)
8986 {
8987 if (VECTOR_MODE_P (mode))
8988 *cost += extra_cost->vect.alu;
8989 else
8990 *cost += extra_cost->alu.rev;
8991 }
8992 return true;
8993 }
8994
8995 if (aarch64_extr_rtx_p (x, &op0, &op1))
8996 {
8997 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8998 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8999 if (speed)
9000 *cost += extra_cost->alu.shift;
9001
9002 return true;
9003 }
9004 /* Fall through. */
9005 case XOR:
9006 case AND:
9007 cost_logic:
9008 op0 = XEXP (x, 0);
9009 op1 = XEXP (x, 1);
9010
9011 if (VECTOR_MODE_P (mode))
9012 {
9013 if (speed)
9014 *cost += extra_cost->vect.alu;
9015 return true;
9016 }
9017
9018 if (code == AND
9019 && GET_CODE (op0) == MULT
9020 && CONST_INT_P (XEXP (op0, 1))
9021 && CONST_INT_P (op1)
9022 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9023 INTVAL (op1)) != 0)
9024 {
9025 /* This is a UBFM/SBFM. */
9026 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9027 if (speed)
9028 *cost += extra_cost->alu.bfx;
9029 return true;
9030 }
9031
9032 if (is_int_mode (mode, &int_mode))
9033 {
9034 if (CONST_INT_P (op1))
9035 {
9036 /* We have a mask + shift version of a UBFIZ
9037 i.e. the *andim_ashift<mode>_bfiz pattern. */
9038 if (GET_CODE (op0) == ASHIFT
9039 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9040 XEXP (op0, 1)))
9041 {
9042 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9043 (enum rtx_code) code, 0, speed);
9044 if (speed)
9045 *cost += extra_cost->alu.bfx;
9046
9047 return true;
9048 }
9049 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9050 {
9051	      /* We possibly get the immediate for free; this is not
9052		 modelled.  */
9053 *cost += rtx_cost (op0, int_mode,
9054 (enum rtx_code) code, 0, speed);
9055 if (speed)
9056 *cost += extra_cost->alu.logical;
9057
9058 return true;
9059 }
9060 }
9061 else
9062 {
9063 rtx new_op0 = op0;
9064
9065 /* Handle ORN, EON, or BIC. */
9066 if (GET_CODE (op0) == NOT)
9067 op0 = XEXP (op0, 0);
9068
9069 new_op0 = aarch64_strip_shift (op0);
9070
9071 /* If we had a shift on op0 then this is a logical-shift-
9072 by-register/immediate operation. Otherwise, this is just
9073 a logical operation. */
9074 if (speed)
9075 {
9076 if (new_op0 != op0)
9077 {
9078 /* Shift by immediate. */
9079 if (CONST_INT_P (XEXP (op0, 1)))
9080 *cost += extra_cost->alu.log_shift;
9081 else
9082 *cost += extra_cost->alu.log_shift_reg;
9083 }
9084 else
9085 *cost += extra_cost->alu.logical;
9086 }
9087
9088 /* In both cases we want to cost both operands. */
9089 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9090 0, speed);
9091 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9092 1, speed);
9093
9094 return true;
9095 }
9096 }
9097 return false;
9098
9099 case NOT:
9100 x = XEXP (x, 0);
9101 op0 = aarch64_strip_shift (x);
9102
9103 if (VECTOR_MODE_P (mode))
9104 {
9105 /* Vector NOT. */
9106 *cost += extra_cost->vect.alu;
9107 return false;
9108 }
9109
9110 /* MVN-shifted-reg. */
9111 if (op0 != x)
9112 {
9113 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9114
9115 if (speed)
9116 *cost += extra_cost->alu.log_shift;
9117
9118 return true;
9119 }
9120 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9121       Handle the second form here, taking care that 'a' in the above can
9122 be a shift. */
9123 else if (GET_CODE (op0) == XOR)
9124 {
9125 rtx newop0 = XEXP (op0, 0);
9126 rtx newop1 = XEXP (op0, 1);
9127 rtx op0_stripped = aarch64_strip_shift (newop0);
9128
9129 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9130 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9131
9132 if (speed)
9133 {
9134 if (op0_stripped != newop0)
9135 *cost += extra_cost->alu.log_shift;
9136 else
9137 *cost += extra_cost->alu.logical;
9138 }
9139
9140 return true;
9141 }
9142 /* MVN. */
9143 if (speed)
9144 *cost += extra_cost->alu.logical;
9145
9146 return false;
9147
9148 case ZERO_EXTEND:
9149
9150 op0 = XEXP (x, 0);
9151 /* If a value is written in SI mode, then zero extended to DI
9152 mode, the operation will in general be free as a write to
9153 a 'w' register implicitly zeroes the upper bits of an 'x'
9154 register. However, if this is
9155
9156 (set (reg) (zero_extend (reg)))
9157
9158 we must cost the explicit register move. */
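     /* As a hedged illustration: on AArch64 a 32-bit register move such as

	  mov  w0, w1

	already clears bits 63:32 of x0, so the DImode zero_extend of an
	SImode value normally needs no additional instruction.  */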
9159 if (mode == DImode
9160 && GET_MODE (op0) == SImode
9161 && outer == SET)
9162 {
9163 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9164
9165 /* If OP_COST is non-zero, then the cost of the zero extend
9166 is effectively the cost of the inner operation. Otherwise
9167 we have a MOV instruction and we take the cost from the MOV
9168 itself. This is true independently of whether we are
9169 optimizing for space or time. */
9170 if (op_cost)
9171 *cost = op_cost;
9172
9173 return true;
9174 }
9175 else if (MEM_P (op0))
9176 {
9177 /* All loads can zero extend to any size for free. */
9178 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9179 return true;
9180 }
9181
9182 op0 = aarch64_extend_bitfield_pattern_p (x);
9183 if (op0)
9184 {
9185 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9186 if (speed)
9187 *cost += extra_cost->alu.bfx;
9188 return true;
9189 }
9190
9191 if (speed)
9192 {
9193 if (VECTOR_MODE_P (mode))
9194 {
9195 /* UMOV. */
9196 *cost += extra_cost->vect.alu;
9197 }
9198 else
9199 {
9200 /* We generate an AND instead of UXTB/UXTH. */
9201 *cost += extra_cost->alu.logical;
9202 }
9203 }
9204 return false;
9205
9206 case SIGN_EXTEND:
9207 if (MEM_P (XEXP (x, 0)))
9208 {
9209 /* LDRSH. */
9210 if (speed)
9211 {
9212 rtx address = XEXP (XEXP (x, 0), 0);
9213 *cost += extra_cost->ldst.load_sign_extend;
9214
9215 *cost +=
9216 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9217 0, speed));
9218 }
9219 return true;
9220 }
9221
9222 op0 = aarch64_extend_bitfield_pattern_p (x);
9223 if (op0)
9224 {
9225 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9226 if (speed)
9227 *cost += extra_cost->alu.bfx;
9228 return true;
9229 }
9230
9231 if (speed)
9232 {
9233 if (VECTOR_MODE_P (mode))
9234 *cost += extra_cost->vect.alu;
9235 else
9236 *cost += extra_cost->alu.extend;
9237 }
9238 return false;
9239
9240 case ASHIFT:
9241 op0 = XEXP (x, 0);
9242 op1 = XEXP (x, 1);
9243
9244 if (CONST_INT_P (op1))
9245 {
9246 if (speed)
9247 {
9248 if (VECTOR_MODE_P (mode))
9249 {
9250 /* Vector shift (immediate). */
9251 *cost += extra_cost->vect.alu;
9252 }
9253 else
9254 {
9255	      /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
9256 aliases. */
9257 *cost += extra_cost->alu.shift;
9258 }
9259 }
9260
9261 /* We can incorporate zero/sign extend for free. */
9262 if (GET_CODE (op0) == ZERO_EXTEND
9263 || GET_CODE (op0) == SIGN_EXTEND)
9264 op0 = XEXP (op0, 0);
9265
9266 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9267 return true;
9268 }
9269 else
9270 {
9271 if (VECTOR_MODE_P (mode))
9272 {
9273 if (speed)
9274 /* Vector shift (register). */
9275 *cost += extra_cost->vect.alu;
9276 }
9277 else
9278 {
9279 if (speed)
9280 /* LSLV. */
9281 *cost += extra_cost->alu.shift_reg;
9282
9283 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9284 && CONST_INT_P (XEXP (op1, 1))
9285 && known_eq (INTVAL (XEXP (op1, 1)),
9286 GET_MODE_BITSIZE (mode) - 1))
9287 {
9288 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9289 /* We already demanded XEXP (op1, 0) to be REG_P, so
9290 don't recurse into it. */
9291 return true;
9292 }
9293 }
9294 return false; /* All arguments need to be in registers. */
9295 }
9296
9297 case ROTATE:
9298 case ROTATERT:
9299 case LSHIFTRT:
9300 case ASHIFTRT:
9301 op0 = XEXP (x, 0);
9302 op1 = XEXP (x, 1);
9303
9304 if (CONST_INT_P (op1))
9305 {
9306 /* ASR (immediate) and friends. */
9307 if (speed)
9308 {
9309 if (VECTOR_MODE_P (mode))
9310 *cost += extra_cost->vect.alu;
9311 else
9312 *cost += extra_cost->alu.shift;
9313 }
9314
9315 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9316 return true;
9317 }
9318 else
9319 {
9320 if (VECTOR_MODE_P (mode))
9321 {
9322 if (speed)
9323 /* Vector shift (register). */
9324 *cost += extra_cost->vect.alu;
9325 }
9326 else
9327 {
9328 if (speed)
9329 /* ASR (register) and friends. */
9330 *cost += extra_cost->alu.shift_reg;
9331
9332 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9333 && CONST_INT_P (XEXP (op1, 1))
9334 && known_eq (INTVAL (XEXP (op1, 1)),
9335 GET_MODE_BITSIZE (mode) - 1))
9336 {
9337 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9338 /* We already demanded XEXP (op1, 0) to be REG_P, so
9339 don't recurse into it. */
9340 return true;
9341 }
9342 }
9343 return false; /* All arguments need to be in registers. */
9344 }
9345
9346 case SYMBOL_REF:
9347
9348 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9349 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9350 {
9351 /* LDR. */
9352 if (speed)
9353 *cost += extra_cost->ldst.load;
9354 }
9355 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9356 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9357 {
9358 /* ADRP, followed by ADD. */
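	  /* Illustrative sequence (symbol name hypothetical):

	       adrp x0, sym
	       add  x0, x0, :lo12:sym  */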
9359 *cost += COSTS_N_INSNS (1);
9360 if (speed)
9361 *cost += 2 * extra_cost->alu.arith;
9362 }
9363 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9364 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9365 {
9366 /* ADR. */
9367 if (speed)
9368 *cost += extra_cost->alu.arith;
9369 }
9370
9371 if (flag_pic)
9372 {
9373 /* One extra load instruction, after accessing the GOT. */
9374 *cost += COSTS_N_INSNS (1);
9375 if (speed)
9376 *cost += extra_cost->ldst.load;
9377 }
9378 return true;
9379
9380 case HIGH:
9381 case LO_SUM:
9382 /* ADRP/ADD (immediate). */
9383 if (speed)
9384 *cost += extra_cost->alu.arith;
9385 return true;
9386
9387 case ZERO_EXTRACT:
9388 case SIGN_EXTRACT:
9389 /* UBFX/SBFX. */
9390 if (speed)
9391 {
9392 if (VECTOR_MODE_P (mode))
9393 *cost += extra_cost->vect.alu;
9394 else
9395 *cost += extra_cost->alu.bfx;
9396 }
9397
9398 /* We can trust that the immediates used will be correct (there
9399 are no by-register forms), so we need only cost op0. */
9400 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9401 return true;
9402
9403 case MULT:
9404 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9405 /* aarch64_rtx_mult_cost always handles recursion to its
9406 operands. */
9407 return true;
9408
9409 case MOD:
9410       /* We can expand signed mod by a power of 2 using a NEGS, two parallel
9411	 ANDs and a CSNEG.  Assume here that the cost of a CSNEG is the same
9412	 as that of an unconditional negate.  This case should only ever be
9413	 reached through the set_smod_pow2_cheap check in expmed.c.  */
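     /* A hedged illustration for SImode "x % 4" (register allocation
	arbitrary):

	  negs  w1, w0
	  and   w0, w0, 3
	  and   w1, w1, 3
	  csneg w0, w0, w1, mi  */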
9414 if (CONST_INT_P (XEXP (x, 1))
9415 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9416 && (mode == SImode || mode == DImode))
9417 {
9418 /* We expand to 4 instructions. Reset the baseline. */
9419 *cost = COSTS_N_INSNS (4);
9420
9421 if (speed)
9422 *cost += 2 * extra_cost->alu.logical
9423 + 2 * extra_cost->alu.arith;
9424
9425 return true;
9426 }
9427
9428 /* Fall-through. */
9429 case UMOD:
9430 if (speed)
9431 {
9432	  /* Slightly prefer UMOD over SMOD.  */
9433 if (VECTOR_MODE_P (mode))
9434 *cost += extra_cost->vect.alu;
9435 else if (GET_MODE_CLASS (mode) == MODE_INT)
9436 *cost += (extra_cost->mult[mode == DImode].add
9437 + extra_cost->mult[mode == DImode].idiv
9438 + (code == MOD ? 1 : 0));
9439 }
9440 return false; /* All arguments need to be in registers. */
9441
9442 case DIV:
9443 case UDIV:
9444 case SQRT:
9445 if (speed)
9446 {
9447 if (VECTOR_MODE_P (mode))
9448 *cost += extra_cost->vect.alu;
9449 else if (GET_MODE_CLASS (mode) == MODE_INT)
9450 /* There is no integer SQRT, so only DIV and UDIV can get
9451 here. */
9452 *cost += (extra_cost->mult[mode == DImode].idiv
9453		  /* Slightly prefer UDIV over SDIV.  */
9454 + (code == DIV ? 1 : 0));
9455 else
9456 *cost += extra_cost->fp[mode == DFmode].div;
9457 }
9458 return false; /* All arguments need to be in registers. */
9459
9460 case IF_THEN_ELSE:
9461 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9462 XEXP (x, 2), cost, speed);
9463
9464 case EQ:
9465 case NE:
9466 case GT:
9467 case GTU:
9468 case LT:
9469 case LTU:
9470 case GE:
9471 case GEU:
9472 case LE:
9473 case LEU:
9474
9475 return false; /* All arguments must be in registers. */
9476
9477 case FMA:
9478 op0 = XEXP (x, 0);
9479 op1 = XEXP (x, 1);
9480 op2 = XEXP (x, 2);
9481
9482 if (speed)
9483 {
9484 if (VECTOR_MODE_P (mode))
9485 *cost += extra_cost->vect.alu;
9486 else
9487 *cost += extra_cost->fp[mode == DFmode].fma;
9488 }
9489
9490 /* FMSUB, FNMADD, and FNMSUB are free. */
9491 if (GET_CODE (op0) == NEG)
9492 op0 = XEXP (op0, 0);
9493
9494 if (GET_CODE (op2) == NEG)
9495 op2 = XEXP (op2, 0);
9496
9497 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9498 and the by-element operand as operand 0. */
9499 if (GET_CODE (op1) == NEG)
9500 op1 = XEXP (op1, 0);
9501
9502 /* Catch vector-by-element operations. The by-element operand can
9503 either be (vec_duplicate (vec_select (x))) or just
9504 (vec_select (x)), depending on whether we are multiplying by
9505 a vector or a scalar.
9506
9507 Canonicalization is not very good in these cases, FMA4 will put the
9508 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9509 if (GET_CODE (op0) == VEC_DUPLICATE)
9510 op0 = XEXP (op0, 0);
9511 else if (GET_CODE (op1) == VEC_DUPLICATE)
9512 op1 = XEXP (op1, 0);
9513
9514 if (GET_CODE (op0) == VEC_SELECT)
9515 op0 = XEXP (op0, 0);
9516 else if (GET_CODE (op1) == VEC_SELECT)
9517 op1 = XEXP (op1, 0);
9518
9519 /* If the remaining parameters are not registers,
9520 get the cost to put them into registers. */
9521 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9522 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9523 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9524 return true;
9525
9526 case FLOAT:
9527 case UNSIGNED_FLOAT:
9528 if (speed)
9529 *cost += extra_cost->fp[mode == DFmode].fromint;
9530 return false;
9531
9532 case FLOAT_EXTEND:
9533 if (speed)
9534 {
9535 if (VECTOR_MODE_P (mode))
9536 {
9537	    /* Vector widening conversion.  */
9538 *cost += extra_cost->vect.alu;
9539 }
9540 else
9541 *cost += extra_cost->fp[mode == DFmode].widen;
9542 }
9543 return false;
9544
9545 case FLOAT_TRUNCATE:
9546 if (speed)
9547 {
9548 if (VECTOR_MODE_P (mode))
9549 {
9550	    /* Vector conversion.  */
9551 *cost += extra_cost->vect.alu;
9552 }
9553 else
9554 *cost += extra_cost->fp[mode == DFmode].narrow;
9555 }
9556 return false;
9557
9558 case FIX:
9559 case UNSIGNED_FIX:
9560 x = XEXP (x, 0);
9561 /* Strip the rounding part. They will all be implemented
9562 by the fcvt* family of instructions anyway. */
9563 if (GET_CODE (x) == UNSPEC)
9564 {
9565 unsigned int uns_code = XINT (x, 1);
9566
9567 if (uns_code == UNSPEC_FRINTA
9568 || uns_code == UNSPEC_FRINTM
9569 || uns_code == UNSPEC_FRINTN
9570 || uns_code == UNSPEC_FRINTP
9571 || uns_code == UNSPEC_FRINTZ)
9572 x = XVECEXP (x, 0, 0);
9573 }
9574
9575 if (speed)
9576 {
9577 if (VECTOR_MODE_P (mode))
9578 *cost += extra_cost->vect.alu;
9579 else
9580 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9581 }
9582
9583 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9584 fixed-point fcvt. */
9585 if (GET_CODE (x) == MULT
9586 && ((VECTOR_MODE_P (mode)
9587 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9588 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9589 {
9590 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9591 0, speed);
9592 return true;
9593 }
9594
9595 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9596 return true;
9597
9598 case ABS:
9599 if (VECTOR_MODE_P (mode))
9600 {
9601 /* ABS (vector). */
9602 if (speed)
9603 *cost += extra_cost->vect.alu;
9604 }
9605 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9606 {
9607 op0 = XEXP (x, 0);
9608
9609 /* FABD, which is analogous to FADD. */
9610 if (GET_CODE (op0) == MINUS)
9611 {
9612 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9613 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9614 if (speed)
9615 *cost += extra_cost->fp[mode == DFmode].addsub;
9616
9617 return true;
9618 }
9619 /* Simple FABS is analogous to FNEG. */
9620 if (speed)
9621 *cost += extra_cost->fp[mode == DFmode].neg;
9622 }
9623 else
9624 {
9625	  /* Integer ABS will either be split into
9626 two arithmetic instructions, or will be an ABS
9627 (scalar), which we don't model. */
9628 *cost = COSTS_N_INSNS (2);
9629 if (speed)
9630 *cost += 2 * extra_cost->alu.arith;
9631 }
9632 return false;
9633
9634 case SMAX:
9635 case SMIN:
9636 if (speed)
9637 {
9638 if (VECTOR_MODE_P (mode))
9639 *cost += extra_cost->vect.alu;
9640 else
9641 {
9642 /* FMAXNM/FMINNM/FMAX/FMIN.
9643 TODO: This may not be accurate for all implementations, but
9644 we do not model this in the cost tables. */
9645 *cost += extra_cost->fp[mode == DFmode].addsub;
9646 }
9647 }
9648 return false;
9649
9650 case UNSPEC:
9651 /* The floating point round to integer frint* instructions. */
9652 if (aarch64_frint_unspec_p (XINT (x, 1)))
9653 {
9654 if (speed)
9655 *cost += extra_cost->fp[mode == DFmode].roundint;
9656
9657 return false;
9658 }
9659
9660 if (XINT (x, 1) == UNSPEC_RBIT)
9661 {
9662 if (speed)
9663 *cost += extra_cost->alu.rev;
9664
9665 return false;
9666 }
9667 break;
9668
9669 case TRUNCATE:
9670
9671 /* Decompose <su>muldi3_highpart. */
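      /* Assembled from the fragments commented below, the full RTL shape
	 being matched is (sign_extend for the signed variant):

	   (truncate:DI
	     (lshiftrt:TI
	       (mult:TI (zero_extend:TI (reg:DI))
			(zero_extend:TI (reg:DI)))
	       (const_int 64)))  */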
9672 if (/* (truncate:DI */
9673 mode == DImode
9674 /* (lshiftrt:TI */
9675 && GET_MODE (XEXP (x, 0)) == TImode
9676 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9677 /* (mult:TI */
9678 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9679 /* (ANY_EXTEND:TI (reg:DI))
9680 (ANY_EXTEND:TI (reg:DI))) */
9681 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9682 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9683 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9684 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9685 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9686 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9687 /* (const_int 64) */
9688 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9689 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9690 {
9691 /* UMULH/SMULH. */
9692 if (speed)
9693 *cost += extra_cost->mult[mode == DImode].extend;
9694 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9695 mode, MULT, 0, speed);
9696 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9697 mode, MULT, 1, speed);
9698 return true;
9699 }
9700
9701 /* Fall through. */
9702 default:
9703 break;
9704 }
9705
9706 if (dump_file
9707 && flag_aarch64_verbose_cost)
9708 fprintf (dump_file,
9709 "\nFailed to cost RTX. Assuming default cost.\n");
9710
9711 return true;
9712 }
9713
9714 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9715 calculated for X. This cost is stored in *COST. Returns true
9716 if the total cost of X was calculated. */
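/* For illustration, with a dump file active and flag_aarch64_verbose_cost
   set, the fprintf below produces lines of the form

     Hot cost: 8 (final)

   after the RTX itself, where the number and "final"/"partial" depend on
   the expression being costed.  */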
9717 static bool
9718 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9719 int param, int *cost, bool speed)
9720 {
9721 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9722
9723 if (dump_file
9724 && flag_aarch64_verbose_cost)
9725 {
9726 print_rtl_single (dump_file, x);
9727 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9728 speed ? "Hot" : "Cold",
9729 *cost, result ? "final" : "partial");
9730 }
9731
9732 return result;
9733 }
9734
9735 static int
9736 aarch64_register_move_cost (machine_mode mode,
9737 reg_class_t from_i, reg_class_t to_i)
9738 {
9739 enum reg_class from = (enum reg_class) from_i;
9740 enum reg_class to = (enum reg_class) to_i;
9741 const struct cpu_regmove_cost *regmove_cost
9742 = aarch64_tune_params.regmove_cost;
9743
9744   /* Tail-call address and pointer regs are equivalent to GENERAL_REGS.  */
9745 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9746 to = GENERAL_REGS;
9747
9748 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9749 from = GENERAL_REGS;
9750
9751   /* A move between a GPR and the stack register costs the same as GP2GP.  */
9752 if ((from == GENERAL_REGS && to == STACK_REG)
9753 || (to == GENERAL_REGS && from == STACK_REG))
9754 return regmove_cost->GP2GP;
9755
9756 /* To/From the stack register, we move via the gprs. */
9757 if (to == STACK_REG || from == STACK_REG)
9758 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9759 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9760
9761 if (known_eq (GET_MODE_SIZE (mode), 16))
9762 {
9763 /* 128-bit operations on general registers require 2 instructions. */
9764 if (from == GENERAL_REGS && to == GENERAL_REGS)
9765 return regmove_cost->GP2GP * 2;
9766 else if (from == GENERAL_REGS)
9767 return regmove_cost->GP2FP * 2;
9768 else if (to == GENERAL_REGS)
9769 return regmove_cost->FP2GP * 2;
9770
9771 /* When AdvSIMD instructions are disabled it is not possible to move
9772 a 128-bit value directly between Q registers. This is handled in
9773 secondary reload. A general register is used as a scratch to move
9774 the upper DI value and the lower DI value is moved directly,
9775 hence the cost is the sum of three moves. */
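      /* Hedged worked example with hypothetical table values: if GP2FP = 5,
	 FP2GP = 6 and FP2FP = 2, a 128-bit FP-to-FP move without SIMD would
	 be costed as 5 + 6 + 2 = 13.  */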
9776 if (! TARGET_SIMD)
9777 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9778
9779 return regmove_cost->FP2FP;
9780 }
9781
9782 if (from == GENERAL_REGS && to == GENERAL_REGS)
9783 return regmove_cost->GP2GP;
9784 else if (from == GENERAL_REGS)
9785 return regmove_cost->GP2FP;
9786 else if (to == GENERAL_REGS)
9787 return regmove_cost->FP2GP;
9788
9789 return regmove_cost->FP2FP;
9790 }
9791
9792 static int
9793 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9794 reg_class_t rclass ATTRIBUTE_UNUSED,
9795 bool in ATTRIBUTE_UNUSED)
9796 {
9797 return aarch64_tune_params.memmov_cost;
9798 }
9799
9800 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9801 to optimize 1.0/sqrt. */
9802
9803 static bool
9804 use_rsqrt_p (machine_mode mode)
9805 {
9806 return (!flag_trapping_math
9807 && flag_unsafe_math_optimizations
9808 && ((aarch64_tune_params.approx_modes->recip_sqrt
9809 & AARCH64_APPROX_MODE (mode))
9810 || flag_mrecip_low_precision_sqrt));
9811 }
9812
9813 /* Function to decide when to use the approximate reciprocal square root
9814 builtin. */
9815
9816 static tree
9817 aarch64_builtin_reciprocal (tree fndecl)
9818 {
9819 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9820
9821 if (!use_rsqrt_p (mode))
9822 return NULL_TREE;
9823 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9824 }
9825
9826 typedef rtx (*rsqrte_type) (rtx, rtx);
9827
9828 /* Select reciprocal square root initial estimate insn depending on machine
9829 mode. */
9830
9831 static rsqrte_type
9832 get_rsqrte_type (machine_mode mode)
9833 {
9834 switch (mode)
9835 {
9836 case E_DFmode: return gen_aarch64_rsqrtedf;
9837 case E_SFmode: return gen_aarch64_rsqrtesf;
9838 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9839 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9840 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9841 default: gcc_unreachable ();
9842 }
9843 }
9844
9845 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9846
9847 /* Select reciprocal square root series step insn depending on machine mode. */
9848
9849 static rsqrts_type
9850 get_rsqrts_type (machine_mode mode)
9851 {
9852 switch (mode)
9853 {
9854 case E_DFmode: return gen_aarch64_rsqrtsdf;
9855 case E_SFmode: return gen_aarch64_rsqrtssf;
9856 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9857 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9858 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9859 default: gcc_unreachable ();
9860 }
9861 }
9862
9863 /* Emit instruction sequence to compute either the approximate square root
9864 or its approximate reciprocal, depending on the flag RECP, and return
9865 whether the sequence was emitted or not. */
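/* A hedged sketch of the underlying math: with E0 = FRSQRTE (x) as the
   initial estimate, each FRSQRTS step refines the estimate as

     E(n+1) = E(n) * (3 - x * E(n)^2) / 2

   and, when RECP is false, the square root itself is then approximated
   as x * E(n).  */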
9866
9867 bool
9868 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9869 {
9870 machine_mode mode = GET_MODE (dst);
9871
9872 if (GET_MODE_INNER (mode) == HFmode)
9873 {
9874 gcc_assert (!recp);
9875 return false;
9876 }
9877
9878 if (!recp)
9879 {
9880 if (!(flag_mlow_precision_sqrt
9881 || (aarch64_tune_params.approx_modes->sqrt
9882 & AARCH64_APPROX_MODE (mode))))
9883 return false;
9884
9885 if (flag_finite_math_only
9886 || flag_trapping_math
9887 || !flag_unsafe_math_optimizations
9888 || optimize_function_for_size_p (cfun))
9889 return false;
9890 }
9891 else
9892 /* Caller assumes we cannot fail. */
9893 gcc_assert (use_rsqrt_p (mode));
9894
9895 machine_mode mmsk = mode_for_int_vector (mode).require ();
9896 rtx xmsk = gen_reg_rtx (mmsk);
9897 if (!recp)
9898 /* When calculating the approximate square root, compare the
9899 argument with 0.0 and create a mask. */
9900 emit_insn (gen_rtx_SET (xmsk,
9901 gen_rtx_NEG (mmsk,
9902 gen_rtx_EQ (mmsk, src,
9903 CONST0_RTX (mode)))));
9904
9905 /* Estimate the approximate reciprocal square root. */
9906 rtx xdst = gen_reg_rtx (mode);
9907 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9908
9909 /* Iterate over the series twice for SF and thrice for DF. */
9910 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9911
9912   /* Optionally iterate over the series once less, trading some
9913      accuracy for faster performance.  */
9914 if ((recp && flag_mrecip_low_precision_sqrt)
9915 || (!recp && flag_mlow_precision_sqrt))
9916 iterations--;
9917
9918 /* Iterate over the series to calculate the approximate reciprocal square
9919 root. */
9920 rtx x1 = gen_reg_rtx (mode);
9921 while (iterations--)
9922 {
9923 rtx x2 = gen_reg_rtx (mode);
9924 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9925
9926 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9927
9928 if (iterations > 0)
9929 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9930 }
9931
9932 if (!recp)
9933 {
9934 /* Qualify the approximate reciprocal square root when the argument is
9935 0.0 by squashing the intermediary result to 0.0. */
9936 rtx xtmp = gen_reg_rtx (mmsk);
9937 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9938 gen_rtx_SUBREG (mmsk, xdst, 0)));
9939 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9940
9941 /* Calculate the approximate square root. */
9942 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9943 }
9944
9945 /* Finalize the approximation. */
9946 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9947
9948 return true;
9949 }
9950
9951 typedef rtx (*recpe_type) (rtx, rtx);
9952
9953 /* Select reciprocal initial estimate insn depending on machine mode. */
9954
9955 static recpe_type
9956 get_recpe_type (machine_mode mode)
9957 {
9958 switch (mode)
9959 {
9960 case E_SFmode: return (gen_aarch64_frecpesf);
9961 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9962 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9963 case E_DFmode: return (gen_aarch64_frecpedf);
9964 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9965 default: gcc_unreachable ();
9966 }
9967 }
9968
9969 typedef rtx (*recps_type) (rtx, rtx, rtx);
9970
9971 /* Select reciprocal series step insn depending on machine mode. */
9972
9973 static recps_type
9974 get_recps_type (machine_mode mode)
9975 {
9976 switch (mode)
9977 {
9978 case E_SFmode: return (gen_aarch64_frecpssf);
9979 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9980 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9981 case E_DFmode: return (gen_aarch64_frecpsdf);
9982 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9983 default: gcc_unreachable ();
9984 }
9985 }
9986
9987 /* Emit the instruction sequence to compute the approximation for the division
9988 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
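/* A hedged sketch of the underlying math: with R0 = FRECPE (den) as the
   initial estimate, each FRECPS step refines the reciprocal as

     R(n+1) = R(n) * (2 - den * R(n))

   and the quotient is then approximated as num * R(n).  */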
9989
9990 bool
9991 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9992 {
9993 machine_mode mode = GET_MODE (quo);
9994
9995 if (GET_MODE_INNER (mode) == HFmode)
9996 return false;
9997
9998 bool use_approx_division_p = (flag_mlow_precision_div
9999 || (aarch64_tune_params.approx_modes->division
10000 & AARCH64_APPROX_MODE (mode)));
10001
10002 if (!flag_finite_math_only
10003 || flag_trapping_math
10004 || !flag_unsafe_math_optimizations
10005 || optimize_function_for_size_p (cfun)
10006 || !use_approx_division_p)
10007 return false;
10008
10009 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10010 return false;
10011
10012 /* Estimate the approximate reciprocal. */
10013 rtx xrcp = gen_reg_rtx (mode);
10014 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10015
10016 /* Iterate over the series twice for SF and thrice for DF. */
10017 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10018
10019   /* Optionally iterate over the series once less, trading some
10020      accuracy for faster performance.  */
10021 if (flag_mlow_precision_div)
10022 iterations--;
10023
10024 /* Iterate over the series to calculate the approximate reciprocal. */
10025 rtx xtmp = gen_reg_rtx (mode);
10026 while (iterations--)
10027 {
10028 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10029
10030 if (iterations > 0)
10031 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10032 }
10033
10034 if (num != CONST1_RTX (mode))
10035 {
10036 /* As the approximate reciprocal of DEN is already calculated, only
10037 calculate the approximate division when NUM is not 1.0. */
10038 rtx xnum = force_reg (mode, num);
10039 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10040 }
10041
10042 /* Finalize the approximation. */
10043 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10044 return true;
10045 }
10046
10047 /* Return the number of instructions that can be issued per cycle. */
10048 static int
10049 aarch64_sched_issue_rate (void)
10050 {
10051 return aarch64_tune_params.issue_rate;
10052 }
10053
10054 static int
10055 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10056 {
10057 int issue_rate = aarch64_sched_issue_rate ();
10058
10059 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10060 }
10061
10062
10063 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10064 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10065 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10066
10067 static int
10068 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10069 int ready_index)
10070 {
10071 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10072 }
10073
10074
10075 /* Vectorizer cost model target hooks. */
10076
10077 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10078 static int
10079 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10080 tree vectype,
10081 int misalign ATTRIBUTE_UNUSED)
10082 {
10083 unsigned elements;
10084 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10085 bool fp = false;
10086
10087 if (vectype != NULL)
10088 fp = FLOAT_TYPE_P (vectype);
10089
10090 switch (type_of_cost)
10091 {
10092 case scalar_stmt:
10093 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10094
10095 case scalar_load:
10096 return costs->scalar_load_cost;
10097
10098 case scalar_store:
10099 return costs->scalar_store_cost;
10100
10101 case vector_stmt:
10102 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10103
10104 case vector_load:
10105 return costs->vec_align_load_cost;
10106
10107 case vector_store:
10108 return costs->vec_store_cost;
10109
10110 case vec_to_scalar:
10111 return costs->vec_to_scalar_cost;
10112
10113 case scalar_to_vec:
10114 return costs->scalar_to_vec_cost;
10115
10116 case unaligned_load:
10117 case vector_gather_load:
10118 return costs->vec_unalign_load_cost;
10119
10120 case unaligned_store:
10121 case vector_scatter_store:
10122 return costs->vec_unalign_store_cost;
10123
10124 case cond_branch_taken:
10125 return costs->cond_taken_branch_cost;
10126
10127 case cond_branch_not_taken:
10128 return costs->cond_not_taken_branch_cost;
10129
10130 case vec_perm:
10131 return costs->vec_permute_cost;
10132
10133 case vec_promote_demote:
10134 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10135
10136 case vec_construct:
10137 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10138 return elements / 2 + 1;
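      /* For example, constructing a 4-element vector is costed as
	 4/2 + 1 = 3 by the formula above (for variable-length vectors
	 ELEMENTS is only an estimate).  */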
10139
10140 default:
10141 gcc_unreachable ();
10142 }
10143 }
10144
10145 /* Implement targetm.vectorize.add_stmt_cost. */
10146 static unsigned
10147 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10148 struct _stmt_vec_info *stmt_info, int misalign,
10149 enum vect_cost_model_location where)
10150 {
10151 unsigned *cost = (unsigned *) data;
10152 unsigned retval = 0;
10153
10154 if (flag_vect_cost_model)
10155 {
10156 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10157 int stmt_cost =
10158 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10159
10160 /* Statements in an inner loop relative to the loop being
10161 vectorized are weighted more heavily. The value here is
10162 arbitrary and could potentially be improved with analysis. */
10163 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10164 count *= 50; /* FIXME */
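	  /* For illustration: a single statement whose base cost is 1 and
	     which sits in a loop nested inside the loop being vectorized
	     therefore contributes 50 to the vect_body bucket.  */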
10165
10166 retval = (unsigned) (count * stmt_cost);
10167 cost[where] += retval;
10168 }
10169
10170 return retval;
10171 }
10172
10173 static void initialize_aarch64_code_model (struct gcc_options *);
10174
10175 /* Parse the TO_PARSE string and put the architecture struct that it
10176 selects into RES and the architectural features into ISA_FLAGS.
10177 Return an aarch64_parse_opt_result describing the parse result.
10178 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
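/* For example (names come from the architecture and extension tables
   defined elsewhere), a TO_PARSE string of "armv8-a+crc" is split at the
   '+': "armv8-a" selects the architecture and "+crc" is handed to
   aarch64_parse_extension.  */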
10179
10180 static enum aarch64_parse_opt_result
10181 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10182 unsigned long *isa_flags)
10183 {
10184 char *ext;
10185 const struct processor *arch;
10186 char *str = (char *) alloca (strlen (to_parse) + 1);
10187 size_t len;
10188
10189 strcpy (str, to_parse);
10190
10191 ext = strchr (str, '+');
10192
10193 if (ext != NULL)
10194 len = ext - str;
10195 else
10196 len = strlen (str);
10197
10198 if (len == 0)
10199 return AARCH64_PARSE_MISSING_ARG;
10200
10201
10202 /* Loop through the list of supported ARCHes to find a match. */
10203 for (arch = all_architectures; arch->name != NULL; arch++)
10204 {
10205 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10206 {
10207 unsigned long isa_temp = arch->flags;
10208
10209 if (ext != NULL)
10210 {
10211 /* TO_PARSE string contains at least one extension. */
10212 enum aarch64_parse_opt_result ext_res
10213 = aarch64_parse_extension (ext, &isa_temp);
10214
10215 if (ext_res != AARCH64_PARSE_OK)
10216 return ext_res;
10217 }
10218 /* Extension parsing was successful. Confirm the result
10219 arch and ISA flags. */
10220 *res = arch;
10221 *isa_flags = isa_temp;
10222 return AARCH64_PARSE_OK;
10223 }
10224 }
10225
10226 /* ARCH name not found in list. */
10227 return AARCH64_PARSE_INVALID_ARG;
10228 }
10229
10230 /* Parse the TO_PARSE string and put the result tuning in RES and the
10231 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10232 describing the parse result. If there is an error parsing, RES and
10233 ISA_FLAGS are left unchanged. */
10234
10235 static enum aarch64_parse_opt_result
10236 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10237 unsigned long *isa_flags)
10238 {
10239 char *ext;
10240 const struct processor *cpu;
10241 char *str = (char *) alloca (strlen (to_parse) + 1);
10242 size_t len;
10243
10244 strcpy (str, to_parse);
10245
10246 ext = strchr (str, '+');
10247
10248 if (ext != NULL)
10249 len = ext - str;
10250 else
10251 len = strlen (str);
10252
10253 if (len == 0)
10254 return AARCH64_PARSE_MISSING_ARG;
10255
10256
10257 /* Loop through the list of supported CPUs to find a match. */
10258 for (cpu = all_cores; cpu->name != NULL; cpu++)
10259 {
10260 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10261 {
10262 unsigned long isa_temp = cpu->flags;
10263
10264
10265 if (ext != NULL)
10266 {
10267 /* TO_PARSE string contains at least one extension. */
10268 enum aarch64_parse_opt_result ext_res
10269 = aarch64_parse_extension (ext, &isa_temp);
10270
10271 if (ext_res != AARCH64_PARSE_OK)
10272 return ext_res;
10273 }
10274	  /* Extension parsing was successful.  Confirm the result
10275 cpu and ISA flags. */
10276 *res = cpu;
10277 *isa_flags = isa_temp;
10278 return AARCH64_PARSE_OK;
10279 }
10280 }
10281
10282 /* CPU name not found in list. */
10283 return AARCH64_PARSE_INVALID_ARG;
10284 }
10285
10286 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10287 Return an aarch64_parse_opt_result describing the parse result.
10288    If the parsing fails, RES does not change.  */
10289
10290 static enum aarch64_parse_opt_result
10291 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10292 {
10293 const struct processor *cpu;
10294 char *str = (char *) alloca (strlen (to_parse) + 1);
10295
10296 strcpy (str, to_parse);
10297
10298 /* Loop through the list of supported CPUs to find a match. */
10299 for (cpu = all_cores; cpu->name != NULL; cpu++)
10300 {
10301 if (strcmp (cpu->name, str) == 0)
10302 {
10303 *res = cpu;
10304 return AARCH64_PARSE_OK;
10305 }
10306 }
10307
10308 /* CPU name not found in list. */
10309 return AARCH64_PARSE_INVALID_ARG;
10310 }
10311
10312 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10313 described in FLAG. If it is, return the index bit for that fusion type.
10314 If not, error (printing OPTION_NAME) and return zero. */
10315
10316 static unsigned int
10317 aarch64_parse_one_option_token (const char *token,
10318 size_t length,
10319 const struct aarch64_flag_desc *flag,
10320 const char *option_name)
10321 {
10322 for (; flag->name != NULL; flag++)
10323 {
10324 if (length == strlen (flag->name)
10325 && !strncmp (flag->name, token, length))
10326 return flag->flag;
10327 }
10328
10329 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10330 return 0;
10331 }
10332
10333 /* Parse OPTION, which is a '.'-separated list of flags to enable.
10334 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10335 default state we inherit from the CPU tuning structures. OPTION_NAME
10336 gives the top-level option we are parsing in the -moverride string,
10337 for use in error messages. */
10338
10339 static unsigned int
10340 aarch64_parse_boolean_options (const char *option,
10341 const struct aarch64_flag_desc *flags,
10342 unsigned int initial_state,
10343 const char *option_name)
10344 {
10345 const char separator = '.';
10346 const char* specs = option;
10347 const char* ntoken = option;
10348 unsigned int found_flags = initial_state;
10349
10350 while ((ntoken = strchr (specs, separator)))
10351 {
10352 size_t token_length = ntoken - specs;
10353 unsigned token_ops = aarch64_parse_one_option_token (specs,
10354 token_length,
10355 flags,
10356 option_name);
10357 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10358 in the token stream, reset the supported operations. So:
10359
10360 adrp+add.cmp+branch.none.adrp+add
10361
10362 would have the result of turning on only adrp+add fusion. */
10363 if (!token_ops)
10364 found_flags = 0;
10365
10366 found_flags |= token_ops;
10367 specs = ++ntoken;
10368 }
10369
10370   /* We ended with a trailing '.'; diagnose the ill-formed string.  */
10371 if (!(*specs))
10372 {
10373 error ("%s string ill-formed\n", option_name);
10374 return 0;
10375 }
10376
10377 /* We still have one more token to parse. */
10378 size_t token_length = strlen (specs);
10379 unsigned token_ops = aarch64_parse_one_option_token (specs,
10380 token_length,
10381 flags,
10382 option_name);
10383 if (!token_ops)
10384 found_flags = 0;
10385
10386 found_flags |= token_ops;
10387 return found_flags;
10388 }
10389
10390 /* Support for overriding instruction fusion. */
10391
10392 static void
10393 aarch64_parse_fuse_string (const char *fuse_string,
10394 struct tune_params *tune)
10395 {
10396 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10397 aarch64_fusible_pairs,
10398 tune->fusible_ops,
10399 "fuse=");
10400 }
10401
10402 /* Support for overriding other tuning flags. */
10403
10404 static void
10405 aarch64_parse_tune_string (const char *tune_string,
10406 struct tune_params *tune)
10407 {
10408 tune->extra_tuning_flags
10409 = aarch64_parse_boolean_options (tune_string,
10410 aarch64_tuning_flags,
10411 tune->extra_tuning_flags,
10412 "tune=");
10413 }
10414
10415 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10416    we understand.  If it is, extract the option string and hand it off to
10417 the appropriate function. */
10418
10419 void
10420 aarch64_parse_one_override_token (const char* token,
10421 size_t length,
10422 struct tune_params *tune)
10423 {
10424 const struct aarch64_tuning_override_function *fn
10425 = aarch64_tuning_override_functions;
10426
10427 const char *option_part = strchr (token, '=');
10428 if (!option_part)
10429 {
10430 error ("tuning string missing in option (%s)", token);
10431 return;
10432 }
10433
10434 /* Get the length of the option name. */
10435 length = option_part - token;
10436 /* Skip the '=' to get to the option string. */
10437 option_part++;
10438
10439 for (; fn->name != NULL; fn++)
10440 {
10441 if (!strncmp (fn->name, token, length))
10442 {
10443 fn->parse_override (option_part, tune);
10444 return;
10445 }
10446 }
10447
10448 error ("unknown tuning option (%s)",token);
10449 return;
10450 }
10451
10452 /* Validate and clamp the requested TLS size for the selected code model.  */
10453
10454 static void
10455 initialize_aarch64_tls_size (struct gcc_options *opts)
10456 {
10457 if (aarch64_tls_size == 0)
10458 aarch64_tls_size = 24;
10459
10460 switch (opts->x_aarch64_cmodel_var)
10461 {
10462 case AARCH64_CMODEL_TINY:
10463       /* Both the default and maximum TLS size allowed under tiny are 1M, which
10464	 needs two instructions to address, so we clamp the size to 24.  */
10465 if (aarch64_tls_size > 24)
10466 aarch64_tls_size = 24;
10467 break;
10468 case AARCH64_CMODEL_SMALL:
10469 /* The maximum TLS size allowed under small is 4G. */
10470 if (aarch64_tls_size > 32)
10471 aarch64_tls_size = 32;
10472 break;
10473 case AARCH64_CMODEL_LARGE:
10474 /* The maximum TLS size allowed under large is 16E.
10475 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10476 if (aarch64_tls_size > 48)
10477 aarch64_tls_size = 48;
10478 break;
10479 default:
10480 gcc_unreachable ();
10481 }
10482
10483 return;
10484 }
10485
10486 /* Parse STRING looking for options in the format:
10487 string :: option:string
10488 option :: name=substring
10489 name :: {a-z}
10490 substring :: defined by option. */
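/* For illustration (flag names come from the tuning tables; the "tune="
   value below is only a placeholder), a -moverride string such as

     fuse=adrp+add.cmp+branch:tune=<flag>

   is split on ':' into "fuse=..." and "tune=..." options, each of which
   is handed to aarch64_parse_one_override_token.  */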
10491
10492 static void
10493 aarch64_parse_override_string (const char* input_string,
10494 struct tune_params* tune)
10495 {
10496 const char separator = ':';
10497 size_t string_length = strlen (input_string) + 1;
10498 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10499 char *string = string_root;
10500 strncpy (string, input_string, string_length);
10501 string[string_length - 1] = '\0';
10502
10503 char* ntoken = string;
10504
10505 while ((ntoken = strchr (string, separator)))
10506 {
10507 size_t token_length = ntoken - string;
10508 /* Make this substring look like a string. */
10509 *ntoken = '\0';
10510 aarch64_parse_one_override_token (string, token_length, tune);
10511 string = ++ntoken;
10512 }
10513
10514 /* One last option to parse. */
10515 aarch64_parse_one_override_token (string, strlen (string), tune);
10516 free (string_root);
10517 }
10518
10519
10520 static void
10521 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10522 {
10523 /* PR 70044: We have to be careful about being called multiple times for the
10524 same function. This means all changes should be repeatable. */
10525
10526 /* If the frame pointer is enabled, set it to a special value that behaves
10527 similar to frame pointer omission. If we don't do this all leaf functions
10528 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10529 If flag_omit_frame_pointer has this special value, we must force the
10530 frame pointer if not in a leaf function. We also need to force it in a
10531 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10532 if (opts->x_flag_omit_frame_pointer == 0)
10533 opts->x_flag_omit_frame_pointer = 2;
10534
10535 /* If not optimizing for size, set the default
10536 alignment to what the target wants. */
10537 if (!opts->x_optimize_size)
10538 {
10539 if (opts->x_align_loops <= 0)
10540 opts->x_align_loops = aarch64_tune_params.loop_align;
10541 if (opts->x_align_jumps <= 0)
10542 opts->x_align_jumps = aarch64_tune_params.jump_align;
10543 if (opts->x_align_functions <= 0)
10544 opts->x_align_functions = aarch64_tune_params.function_align;
10545 }
10546
10547 /* We default to no pc-relative literal loads. */
10548
10549 aarch64_pcrelative_literal_loads = false;
10550
10551 /* If -mpc-relative-literal-loads is set on the command line, this
10552 implies that the user asked for PC relative literal loads. */
10553 if (opts->x_pcrelative_literal_loads == 1)
10554 aarch64_pcrelative_literal_loads = true;
10555
10556 /* In the tiny memory model it makes no sense to disallow PC relative
10557 literal pool loads. */
10558 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10559 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10560 aarch64_pcrelative_literal_loads = true;
10561
10562 /* When enabling the lower precision Newton series for the square root, also
10563 enable it for the reciprocal square root, since the latter is an
10564 intermediary step for the former. */
10565 if (flag_mlow_precision_sqrt)
10566 flag_mrecip_low_precision_sqrt = true;
10567 }
10568
10569 /* 'Unpack' the internal tuning structs and update the options
10570 in OPTS. The caller must have set up selected_tune and selected_arch
10571 as all the other target-specific codegen decisions are
10572 derived from them. */
10573
10574 void
10575 aarch64_override_options_internal (struct gcc_options *opts)
10576 {
10577 aarch64_tune_flags = selected_tune->flags;
10578 aarch64_tune = selected_tune->sched_core;
10579 /* Make a copy of the tuning parameters attached to the core, which
10580 we may later overwrite. */
10581 aarch64_tune_params = *(selected_tune->tune);
10582 aarch64_architecture_version = selected_arch->architecture_version;
10583
10584 if (opts->x_aarch64_override_tune_string)
10585 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10586 &aarch64_tune_params);
10587
10588 /* This target defaults to strict volatile bitfields. */
10589 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10590 opts->x_flag_strict_volatile_bitfields = 1;
10591
10592 initialize_aarch64_code_model (opts);
10593 initialize_aarch64_tls_size (opts);
10594
10595 int queue_depth = 0;
10596 switch (aarch64_tune_params.autoprefetcher_model)
10597 {
10598 case tune_params::AUTOPREFETCHER_OFF:
10599 queue_depth = -1;
10600 break;
10601 case tune_params::AUTOPREFETCHER_WEAK:
10602 queue_depth = 0;
10603 break;
10604 case tune_params::AUTOPREFETCHER_STRONG:
10605 queue_depth = max_insn_queue_index + 1;
10606 break;
10607 default:
10608 gcc_unreachable ();
10609 }
10610
10611 /* We don't mind passing in global_options_set here as we don't use
10612 the *options_set structs anyway. */
10613 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10614 queue_depth,
10615 opts->x_param_values,
10616 global_options_set.x_param_values);
10617
10618 /* Set up parameters to be used in the prefetching algorithm. Do not
10619 override the defaults unless we are tuning for a core we have
10620 researched values for. */
10621 if (aarch64_tune_params.prefetch->num_slots > 0)
10622 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10623 aarch64_tune_params.prefetch->num_slots,
10624 opts->x_param_values,
10625 global_options_set.x_param_values);
10626 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10627 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10628 aarch64_tune_params.prefetch->l1_cache_size,
10629 opts->x_param_values,
10630 global_options_set.x_param_values);
10631 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10632 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10633 aarch64_tune_params.prefetch->l1_cache_line_size,
10634 opts->x_param_values,
10635 global_options_set.x_param_values);
10636 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10637 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10638 aarch64_tune_params.prefetch->l2_cache_size,
10639 opts->x_param_values,
10640 global_options_set.x_param_values);
10641
10642 /* Use the alternative scheduling-pressure algorithm by default. */
10643 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10644 opts->x_param_values,
10645 global_options_set.x_param_values);
10646
10647 /* Enable software prefetching at the specified optimization level for
10648 CPUs whose tuning parameters define a default optimization level
10649 for prefetching. */
10650 if (opts->x_flag_prefetch_loop_arrays < 0
10651 && !opts->x_optimize_size
10652 && aarch64_tune_params.prefetch->default_opt_level >= 0
10653 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10654 opts->x_flag_prefetch_loop_arrays = 1;
10655
10656 aarch64_override_options_after_change_1 (opts);
10657 }
10658
10659 /* Print a hint with a suggestion for a core or architecture name that
10660 most closely resembles what the user passed in STR. ARCH is true if
10661 the user is asking for an architecture name. ARCH is false if the user
10662 is asking for a core name. */
10663
10664 static void
10665 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10666 {
10667 auto_vec<const char *> candidates;
10668 const struct processor *entry = arch ? all_architectures : all_cores;
10669 for (; entry->name != NULL; entry++)
10670 candidates.safe_push (entry->name);
10671
10672 #ifdef HAVE_LOCAL_CPU_DETECT
10673 /* Also add "native" as a possible value. */
10674 if (arch)
10675 candidates.safe_push ("native");
10676 #endif
10677
10678 char *s;
10679 const char *hint = candidates_list_and_hint (str, s, candidates);
10680 if (hint)
10681 inform (input_location, "valid arguments are: %s;"
10682 " did you mean %qs?", s, hint);
10683 else
10684 inform (input_location, "valid arguments are: %s", s);
10685
10686 XDELETEVEC (s);
10687 }
10688
10689 /* Print a hint with a suggestion for a core name that most closely resembles
10690 what the user passed in STR. */
10691
10692 inline static void
10693 aarch64_print_hint_for_core (const char *str)
10694 {
10695 aarch64_print_hint_for_core_or_arch (str, false);
10696 }
10697
10698 /* Print a hint with a suggestion for an architecture name that most closely
10699 resembles what the user passed in STR. */
10700
10701 inline static void
10702 aarch64_print_hint_for_arch (const char *str)
10703 {
10704 aarch64_print_hint_for_core_or_arch (str, true);
10705 }
10706
10707 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10708 specified in STR and throw errors if appropriate. Put the results if
10709 they are valid in RES and ISA_FLAGS. Return whether the option is
10710 valid. */
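/* For example, "-mcpu=cortex-a57+nofp" names the Cortex-A57 core and then
 clears the floating-point feature bit; the "+..." feature modifiers are
 parsed along with the core name by aarch64_parse_cpu. */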
10711
10712 static bool
10713 aarch64_validate_mcpu (const char *str, const struct processor **res,
10714 unsigned long *isa_flags)
10715 {
10716 enum aarch64_parse_opt_result parse_res
10717 = aarch64_parse_cpu (str, res, isa_flags);
10718
10719 if (parse_res == AARCH64_PARSE_OK)
10720 return true;
10721
10722 switch (parse_res)
10723 {
10724 case AARCH64_PARSE_MISSING_ARG:
10725 error ("missing cpu name in %<-mcpu=%s%>", str);
10726 break;
10727 case AARCH64_PARSE_INVALID_ARG:
10728 error ("unknown value %qs for -mcpu", str);
10729 aarch64_print_hint_for_core (str);
10730 break;
10731 case AARCH64_PARSE_INVALID_FEATURE:
10732 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10733 break;
10734 default:
10735 gcc_unreachable ();
10736 }
10737
10738 return false;
10739 }
10740
10741 /* Validate a command-line -march option. Parse the arch and extensions
10742 (if any) specified in STR and throw errors if appropriate. Put the
10743 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10744 option is valid. */
10745
10746 static bool
10747 aarch64_validate_march (const char *str, const struct processor **res,
10748 unsigned long *isa_flags)
10749 {
10750 enum aarch64_parse_opt_result parse_res
10751 = aarch64_parse_arch (str, res, isa_flags);
10752
10753 if (parse_res == AARCH64_PARSE_OK)
10754 return true;
10755
10756 switch (parse_res)
10757 {
10758 case AARCH64_PARSE_MISSING_ARG:
10759 error ("missing arch name in %<-march=%s%>", str);
10760 break;
10761 case AARCH64_PARSE_INVALID_ARG:
10762 error ("unknown value %qs for -march", str);
10763 aarch64_print_hint_for_arch (str);
10764 break;
10765 case AARCH64_PARSE_INVALID_FEATURE:
10766 error ("invalid feature modifier in %<-march=%s%>", str);
10767 break;
10768 default:
10769 gcc_unreachable ();
10770 }
10771
10772 return false;
10773 }
10774
10775 /* Validate a command-line -mtune option. Parse the cpu
10776 specified in STR and throw errors if appropriate. Put the
10777 result, if it is valid, in RES. Return whether the option is
10778 valid. */
10779
10780 static bool
10781 aarch64_validate_mtune (const char *str, const struct processor **res)
10782 {
10783 enum aarch64_parse_opt_result parse_res
10784 = aarch64_parse_tune (str, res);
10785
10786 if (parse_res == AARCH64_PARSE_OK)
10787 return true;
10788
10789 switch (parse_res)
10790 {
10791 case AARCH64_PARSE_MISSING_ARG:
10792 error ("missing cpu name in %<-mtune=%s%>", str);
10793 break;
10794 case AARCH64_PARSE_INVALID_ARG:
10795 error ("unknown value %qs for -mtune", str);
10796 aarch64_print_hint_for_core (str);
10797 break;
10798 default:
10799 gcc_unreachable ();
10800 }
10801 return false;
10802 }
10803
10804 /* Return the CPU corresponding to the enum CPU.
10805 If it doesn't specify a cpu, return the default. */
10806
10807 static const struct processor *
10808 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10809 {
10810 if (cpu != aarch64_none)
10811 return &all_cores[cpu];
10812
10813 /* The & 0x3f is to extract the bottom 6 bits that encode the
10814 default cpu as selected by the --with-cpu GCC configure option
10815 in config.gcc.
10816 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10817 flags mechanism should be reworked to make it more sane. */
10818 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10819 }
10820
10821 /* Return the architecture corresponding to the enum ARCH.
10822 If it doesn't specify a valid architecture, return the default. */
10823
10824 static const struct processor *
10825 aarch64_get_arch (enum aarch64_arch arch)
10826 {
10827 if (arch != aarch64_no_arch)
10828 return &all_architectures[arch];
10829
10830 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10831
10832 return &all_architectures[cpu->arch];
10833 }
10834
10835 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
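/* The enum value encodes the vector width in bits, so e.g.
 -msve-vector-bits=256 yields a VG of 4 (256 / 64), while both "scalable"
 and 128 keep the vector-length-agnostic encoding poly_uint16 (2, 2). */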
10836
10837 static poly_uint16
10838 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10839 {
10840 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10841 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10842 deciding which .md file patterns to use and when deciding whether
10843 something is a legitimate address or constant. */
10844 if (value == SVE_SCALABLE || value == SVE_128)
10845 return poly_uint16 (2, 2);
10846 else
10847 return (int) value / 64;
10848 }
10849
10850 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10851 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10852 tuning structs. In particular it must set selected_tune and
10853 aarch64_isa_flags that define the available ISA features and tuning
10854 decisions. It must also set selected_arch as this will be used to
10855 output the .arch asm tags for each function. */
10856
10857 static void
10858 aarch64_override_options (void)
10859 {
10860 unsigned long cpu_isa = 0;
10861 unsigned long arch_isa = 0;
10862 aarch64_isa_flags = 0;
10863
10864 bool valid_cpu = true;
10865 bool valid_tune = true;
10866 bool valid_arch = true;
10867
10868 selected_cpu = NULL;
10869 selected_arch = NULL;
10870 selected_tune = NULL;
10871
10872 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10873 If either of -march or -mtune is given, they override their
10874 respective component of -mcpu. */
10875 if (aarch64_cpu_string)
10876 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10877 &cpu_isa);
10878
10879 if (aarch64_arch_string)
10880 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10881 &arch_isa);
10882
10883 if (aarch64_tune_string)
10884 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10885
10886 /* If the user did not specify a processor, choose the default
10887 one for them. This will be the CPU set during configuration using
10888 --with-cpu, otherwise it is "generic". */
10889 if (!selected_cpu)
10890 {
10891 if (selected_arch)
10892 {
10893 selected_cpu = &all_cores[selected_arch->ident];
10894 aarch64_isa_flags = arch_isa;
10895 explicit_arch = selected_arch->arch;
10896 }
10897 else
10898 {
10899 /* Get default configure-time CPU. */
10900 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
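 /* The bottom 6 bits of TARGET_CPU_DEFAULT encode the default core
 (see aarch64_get_tune_cpu); the remaining bits carry its default
 ISA flags, which the shift below extracts. */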
10901 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10902 }
10903
10904 if (selected_tune)
10905 explicit_tune_core = selected_tune->ident;
10906 }
10907 /* If both -mcpu and -march are specified, check that they are architecturally
10908 compatible, warn if they're not, and prefer the -march ISA flags. */
10909 else if (selected_arch)
10910 {
10911 if (selected_arch->arch != selected_cpu->arch)
10912 {
10913 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10914 all_architectures[selected_cpu->arch].name,
10915 selected_arch->name);
10916 }
10917 aarch64_isa_flags = arch_isa;
10918 explicit_arch = selected_arch->arch;
10919 explicit_tune_core = selected_tune ? selected_tune->ident
10920 : selected_cpu->ident;
10921 }
10922 else
10923 {
10924 /* -mcpu but no -march. */
10925 aarch64_isa_flags = cpu_isa;
10926 explicit_tune_core = selected_tune ? selected_tune->ident
10927 : selected_cpu->ident;
10928 gcc_assert (selected_cpu);
10929 selected_arch = &all_architectures[selected_cpu->arch];
10930 explicit_arch = selected_arch->arch;
10931 }
10932
10933 /* Set the arch as well, as we will need it when outputting
10934 the .arch directive in assembly. */
10935 if (!selected_arch)
10936 {
10937 gcc_assert (selected_cpu);
10938 selected_arch = &all_architectures[selected_cpu->arch];
10939 }
10940
10941 if (!selected_tune)
10942 selected_tune = selected_cpu;
10943
10944 #ifndef HAVE_AS_MABI_OPTION
10945 /* The compiler may have been configured with 2.23.* binutils, which does
10946 not have support for ILP32. */
10947 if (TARGET_ILP32)
10948 error ("assembler does not support -mabi=ilp32");
10949 #endif
10950
10951 /* Convert -msve-vector-bits to a VG count. */
10952 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10953
10954 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10955 sorry ("return address signing is only supported for -mabi=lp64");
10956
10957 /* Make sure we properly set up the explicit options. */
10958 if ((aarch64_cpu_string && valid_cpu)
10959 || (aarch64_tune_string && valid_tune))
10960 gcc_assert (explicit_tune_core != aarch64_none);
10961
10962 if ((aarch64_cpu_string && valid_cpu)
10963 || (aarch64_arch_string && valid_arch))
10964 gcc_assert (explicit_arch != aarch64_no_arch);
10965
10966 aarch64_override_options_internal (&global_options);
10967
10968 /* Save these options as the default ones in case we push and pop them later
10969 while processing functions with potential target attributes. */
10970 target_option_default_node = target_option_current_node
10971 = build_target_option_node (&global_options);
10972 }
10973
10974 /* Implement targetm.override_options_after_change. */
10975
10976 static void
10977 aarch64_override_options_after_change (void)
10978 {
10979 aarch64_override_options_after_change_1 (&global_options);
10980 }
10981
10982 static struct machine_function *
10983 aarch64_init_machine_status (void)
10984 {
10985 struct machine_function *machine;
10986 machine = ggc_cleared_alloc<machine_function> ();
10987 return machine;
10988 }
10989
10990 void
10991 aarch64_init_expanders (void)
10992 {
10993 init_machine_status = aarch64_init_machine_status;
10994 }
10995
10996 /* A checking mechanism for the implementation of the various code models. */
10997 static void
10998 initialize_aarch64_code_model (struct gcc_options *opts)
10999 {
11000 if (opts->x_flag_pic)
11001 {
11002 switch (opts->x_aarch64_cmodel_var)
11003 {
11004 case AARCH64_CMODEL_TINY:
11005 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11006 break;
11007 case AARCH64_CMODEL_SMALL:
11008 #ifdef HAVE_AS_SMALL_PIC_RELOCS
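 /* -fPIC (flag_pic == 2) selects the small PIC model with a 4G GOT,
 while -fpic selects the 28K-GOT variant (SMALL_SPIC) when the
 assembler supports the small PIC relocations. */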
11009 aarch64_cmodel = (flag_pic == 2
11010 ? AARCH64_CMODEL_SMALL_PIC
11011 : AARCH64_CMODEL_SMALL_SPIC);
11012 #else
11013 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11014 #endif
11015 break;
11016 case AARCH64_CMODEL_LARGE:
11017 sorry ("code model %qs with -f%s", "large",
11018 opts->x_flag_pic > 1 ? "PIC" : "pic");
11019 break;
11020 default:
11021 gcc_unreachable ();
11022 }
11023 }
11024 else
11025 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11026 }
11027
11028 /* Implement TARGET_OPTION_SAVE. */
11029
11030 static void
11031 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11032 {
11033 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11034 }
11035
11036 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11037 using the information saved in PTR. */
11038
11039 static void
11040 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11041 {
11042 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11043 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11044 opts->x_explicit_arch = ptr->x_explicit_arch;
11045 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11046 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11047
11048 aarch64_override_options_internal (opts);
11049 }
11050
11051 /* Implement TARGET_OPTION_PRINT. */
11052
11053 static void
11054 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11055 {
11056 const struct processor *cpu
11057 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11058 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11059 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11060 std::string extension
11061 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11062
11063 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11064 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11065 arch->name, extension.c_str ());
11066 }
11067
11068 static GTY(()) tree aarch64_previous_fndecl;
11069
11070 void
11071 aarch64_reset_previous_fndecl (void)
11072 {
11073 aarch64_previous_fndecl = NULL;
11074 }
11075
11076 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11077 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11078 make sure optab availability predicates are recomputed when necessary. */
11079
11080 void
11081 aarch64_save_restore_target_globals (tree new_tree)
11082 {
11083 if (TREE_TARGET_GLOBALS (new_tree))
11084 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11085 else if (new_tree == target_option_default_node)
11086 restore_target_globals (&default_target_globals);
11087 else
11088 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11089 }
11090
11091 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11092 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11093 of the function, if such exists. This function may be called multiple
11094 times on a single function, so use aarch64_previous_fndecl to avoid
11095 setting up identical state. */
11096
11097 static void
11098 aarch64_set_current_function (tree fndecl)
11099 {
11100 if (!fndecl || fndecl == aarch64_previous_fndecl)
11101 return;
11102
11103 tree old_tree = (aarch64_previous_fndecl
11104 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11105 : NULL_TREE);
11106
11107 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11108
11109 /* If current function has no attributes but the previous one did,
11110 use the default node. */
11111 if (!new_tree && old_tree)
11112 new_tree = target_option_default_node;
11113
11114 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11115 the default have been handled by aarch64_save_restore_target_globals from
11116 aarch64_pragma_target_parse. */
11117 if (old_tree == new_tree)
11118 return;
11119
11120 aarch64_previous_fndecl = fndecl;
11121
11122 /* First set the target options. */
11123 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11124
11125 aarch64_save_restore_target_globals (new_tree);
11126 }
11127
11128 /* Enum describing the various ways we can handle attributes.
11129 In many cases we can reuse the generic option handling machinery. */
11130
11131 enum aarch64_attr_opt_type
11132 {
11133 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11134 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11135 aarch64_attr_enum, /* Attribute sets an enum variable. */
11136 aarch64_attr_custom /* Attribute requires a custom handling function. */
11137 };
11138
11139 /* All the information needed to handle a target attribute.
11140 NAME is the name of the attribute.
11141 ATTR_TYPE specifies the type of behavior of the attribute as described
11142 in the definition of enum aarch64_attr_opt_type.
11143 ALLOW_NEG is true if the attribute supports a "no-" form.
11144 HANDLER is the function that takes the attribute string as an argument.
11145 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11146 OPT_NUM is the enum specifying the option that the attribute modifies.
11147 This is needed for attributes that mirror the behavior of a command-line
11148 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11149 aarch64_attr_enum. */
11150
11151 struct aarch64_attribute_info
11152 {
11153 const char *name;
11154 enum aarch64_attr_opt_type attr_type;
11155 bool allow_neg;
11156 bool (*handler) (const char *);
11157 enum opt_code opt_num;
11158 };
11159
11160 /* Handle the ARCH_STR argument to the arch= target attribute. */
11161
11162 static bool
11163 aarch64_handle_attr_arch (const char *str)
11164 {
11165 const struct processor *tmp_arch = NULL;
11166 enum aarch64_parse_opt_result parse_res
11167 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11168
11169 if (parse_res == AARCH64_PARSE_OK)
11170 {
11171 gcc_assert (tmp_arch);
11172 selected_arch = tmp_arch;
11173 explicit_arch = selected_arch->arch;
11174 return true;
11175 }
11176
11177 switch (parse_res)
11178 {
11179 case AARCH64_PARSE_MISSING_ARG:
11180 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11181 break;
11182 case AARCH64_PARSE_INVALID_ARG:
11183 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11184 aarch64_print_hint_for_arch (str);
11185 break;
11186 case AARCH64_PARSE_INVALID_FEATURE:
11187 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11188 break;
11189 default:
11190 gcc_unreachable ();
11191 }
11192
11193 return false;
11194 }
11195
11196 /* Handle the argument CPU_STR to the cpu= target attribute. */
11197
11198 static bool
11199 aarch64_handle_attr_cpu (const char *str)
11200 {
11201 const struct processor *tmp_cpu = NULL;
11202 enum aarch64_parse_opt_result parse_res
11203 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11204
11205 if (parse_res == AARCH64_PARSE_OK)
11206 {
11207 gcc_assert (tmp_cpu);
11208 selected_tune = tmp_cpu;
11209 explicit_tune_core = selected_tune->ident;
11210
11211 selected_arch = &all_architectures[tmp_cpu->arch];
11212 explicit_arch = selected_arch->arch;
11213 return true;
11214 }
11215
11216 switch (parse_res)
11217 {
11218 case AARCH64_PARSE_MISSING_ARG:
11219 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11220 break;
11221 case AARCH64_PARSE_INVALID_ARG:
11222 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11223 aarch64_print_hint_for_core (str);
11224 break;
11225 case AARCH64_PARSE_INVALID_FEATURE:
11226 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11227 break;
11228 default:
11229 gcc_unreachable ();
11230 }
11231
11232 return false;
11233 }
11234
11235 /* Handle the argument STR to the tune= target attribute. */
11236
11237 static bool
11238 aarch64_handle_attr_tune (const char *str)
11239 {
11240 const struct processor *tmp_tune = NULL;
11241 enum aarch64_parse_opt_result parse_res
11242 = aarch64_parse_tune (str, &tmp_tune);
11243
11244 if (parse_res == AARCH64_PARSE_OK)
11245 {
11246 gcc_assert (tmp_tune);
11247 selected_tune = tmp_tune;
11248 explicit_tune_core = selected_tune->ident;
11249 return true;
11250 }
11251
11252 switch (parse_res)
11253 {
11254 case AARCH64_PARSE_INVALID_ARG:
11255 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11256 aarch64_print_hint_for_core (str);
11257 break;
11258 default:
11259 gcc_unreachable ();
11260 }
11261
11262 return false;
11263 }
11264
11265 /* Parse an architecture extensions target attribute string specified in STR.
11266 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11267 if successful. Update aarch64_isa_flags to reflect the ISA features
11268 modified. */
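/* E.g. "+nothing+fp" first clears every architectural feature bit and then
 re-enables just the floating-point extension on top of that empty set. */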
11269
11270 static bool
11271 aarch64_handle_attr_isa_flags (char *str)
11272 {
11273 enum aarch64_parse_opt_result parse_res;
11274 unsigned long isa_flags = aarch64_isa_flags;
11275
11276 /* We allow "+nothing" at the beginning to clear out all architectural
11277 features if the user wants to handpick specific features. */
11278 if (strncmp ("+nothing", str, 8) == 0)
11279 {
11280 isa_flags = 0;
11281 str += 8;
11282 }
11283
11284 parse_res = aarch64_parse_extension (str, &isa_flags);
11285
11286 if (parse_res == AARCH64_PARSE_OK)
11287 {
11288 aarch64_isa_flags = isa_flags;
11289 return true;
11290 }
11291
11292 switch (parse_res)
11293 {
11294 case AARCH64_PARSE_MISSING_ARG:
11295 error ("missing value in %<target()%> pragma or attribute");
11296 break;
11297
11298 case AARCH64_PARSE_INVALID_FEATURE:
11299 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11300 break;
11301
11302 default:
11303 gcc_unreachable ();
11304 }
11305
11306 return false;
11307 }
11308
11309 /* The target attributes that we support. On top of these we also support just
11310 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11311 handled explicitly in aarch64_process_one_target_attr. */
11312
11313 static const struct aarch64_attribute_info aarch64_attributes[] =
11314 {
11315 { "general-regs-only", aarch64_attr_mask, false, NULL,
11316 OPT_mgeneral_regs_only },
11317 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11318 OPT_mfix_cortex_a53_835769 },
11319 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11320 OPT_mfix_cortex_a53_843419 },
11321 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11322 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11323 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11324 OPT_momit_leaf_frame_pointer },
11325 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11326 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11327 OPT_march_ },
11328 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11329 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11330 OPT_mtune_ },
11331 { "sign-return-address", aarch64_attr_enum, false, NULL,
11332 OPT_msign_return_address_ },
11333 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11334 };
11335
11336 /* Parse ARG_STR which contains the definition of one target attribute.
11337 Show appropriate errors if any or return true if the attribute is valid. */
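/* ARG_STR can take several forms: a bare ISA string such as "+crc", a
 boolean attribute with an optional "no-" prefix such as
 "no-fix-cortex-a53-835769", or an "opt=value" pair such as
 "arch=armv8.1-a" or "cmodel=small". */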
11338
11339 static bool
11340 aarch64_process_one_target_attr (char *arg_str)
11341 {
11342 bool invert = false;
11343
11344 size_t len = strlen (arg_str);
11345
11346 if (len == 0)
11347 {
11348 error ("malformed %<target()%> pragma or attribute");
11349 return false;
11350 }
11351
11352 char *str_to_check = (char *) alloca (len + 1);
11353 strcpy (str_to_check, arg_str);
11354
11355 /* Skip leading whitespace. */
11356 while (*str_to_check == ' ' || *str_to_check == '\t')
11357 str_to_check++;
11358
11359 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11360 It is easier to detect and handle it explicitly here rather than going
11361 through the machinery for the rest of the target attributes in this
11362 function. */
11363 if (*str_to_check == '+')
11364 return aarch64_handle_attr_isa_flags (str_to_check);
11365
11366 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11367 {
11368 invert = true;
11369 str_to_check += 3;
11370 }
11371 char *arg = strchr (str_to_check, '=');
11372
11373 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11374 and point ARG to "foo". */
11375 if (arg)
11376 {
11377 *arg = '\0';
11378 arg++;
11379 }
11380 const struct aarch64_attribute_info *p_attr;
11381 bool found = false;
11382 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11383 {
11384 /* If the names don't match up, or the user has given an argument
11385 to an attribute that doesn't accept one, or didn't give an argument
11386 to an attribute that expects one, fail to match. */
11387 if (strcmp (str_to_check, p_attr->name) != 0)
11388 continue;
11389
11390 found = true;
11391 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11392 || p_attr->attr_type == aarch64_attr_enum;
11393
11394 if (attr_need_arg_p ^ (arg != NULL))
11395 {
11396 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11397 return false;
11398 }
11399
11400 /* If the name matches but the attribute does not allow "no-" versions
11401 then we can't match. */
11402 if (invert && !p_attr->allow_neg)
11403 {
11404 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11405 return false;
11406 }
11407
11408 switch (p_attr->attr_type)
11409 {
11410 /* Has a custom handler registered.
11411 For example, cpu=, arch=, tune=. */
11412 case aarch64_attr_custom:
11413 gcc_assert (p_attr->handler);
11414 if (!p_attr->handler (arg))
11415 return false;
11416 break;
11417
11418 /* Either set or unset a boolean option. */
11419 case aarch64_attr_bool:
11420 {
11421 struct cl_decoded_option decoded;
11422
11423 generate_option (p_attr->opt_num, NULL, !invert,
11424 CL_TARGET, &decoded);
11425 aarch64_handle_option (&global_options, &global_options_set,
11426 &decoded, input_location);
11427 break;
11428 }
11429 /* Set or unset a bit in the target_flags. aarch64_handle_option
11430 should know what mask to apply given the option number. */
11431 case aarch64_attr_mask:
11432 {
11433 struct cl_decoded_option decoded;
11434 /* We only need to specify the option number.
11435 aarch64_handle_option will know which mask to apply. */
11436 decoded.opt_index = p_attr->opt_num;
11437 decoded.value = !invert;
11438 aarch64_handle_option (&global_options, &global_options_set,
11439 &decoded, input_location);
11440 break;
11441 }
11442 /* Use the option setting machinery to set an option to an enum. */
11443 case aarch64_attr_enum:
11444 {
11445 gcc_assert (arg);
11446 bool valid;
11447 int value;
11448 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11449 &value, CL_TARGET);
11450 if (valid)
11451 {
11452 set_option (&global_options, NULL, p_attr->opt_num, value,
11453 NULL, DK_UNSPECIFIED, input_location,
11454 global_dc);
11455 }
11456 else
11457 {
11458 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11459 }
11460 break;
11461 }
11462 default:
11463 gcc_unreachable ();
11464 }
11465 }
11466
11467 /* If we reached here we either have found an attribute and validated
11468 it or didn't match any. If we matched an attribute but its arguments
11469 were malformed we will have returned false already. */
11470 return found;
11471 }
11472
11473 /* Count how many times the character C appears in
11474 NULL-terminated string STR. */
11475
11476 static unsigned int
11477 num_occurences_in_str (char c, char *str)
11478 {
11479 unsigned int res = 0;
11480 while (*str != '\0')
11481 {
11482 if (*str == c)
11483 res++;
11484
11485 str++;
11486 }
11487
11488 return res;
11489 }
11490
11491 /* Parse the tree in ARGS that contains the target attribute information
11492 and update the global target options space. */
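/* For instance, __attribute__ ((target ("arch=armv8-a,strict-align"))) is
 split on the commas below and each piece is handed to
 aarch64_process_one_target_attr. */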
11493
11494 bool
11495 aarch64_process_target_attr (tree args)
11496 {
11497 if (TREE_CODE (args) == TREE_LIST)
11498 {
11499 do
11500 {
11501 tree head = TREE_VALUE (args);
11502 if (head)
11503 {
11504 if (!aarch64_process_target_attr (head))
11505 return false;
11506 }
11507 args = TREE_CHAIN (args);
11508 } while (args);
11509
11510 return true;
11511 }
11512
11513 if (TREE_CODE (args) != STRING_CST)
11514 {
11515 error ("attribute %<target%> argument not a string");
11516 return false;
11517 }
11518
11519 size_t len = strlen (TREE_STRING_POINTER (args));
11520 char *str_to_check = (char *) alloca (len + 1);
11521 strcpy (str_to_check, TREE_STRING_POINTER (args));
11522
11523 if (len == 0)
11524 {
11525 error ("malformed %<target()%> pragma or attribute");
11526 return false;
11527 }
11528
11529 /* Used to catch an empty string between commas, e.g.
11530 attribute ((target ("attr1,,attr2"))). */
11531 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11532
11533 /* Handle multiple target attributes separated by ','. */
11534 char *token = strtok (str_to_check, ",");
11535
11536 unsigned int num_attrs = 0;
11537 while (token)
11538 {
11539 num_attrs++;
11540 if (!aarch64_process_one_target_attr (token))
11541 {
11542 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11543 return false;
11544 }
11545
11546 token = strtok (NULL, ",");
11547 }
11548
11549 if (num_attrs != num_commas + 1)
11550 {
11551 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11552 return false;
11553 }
11554
11555 return true;
11556 }
11557
11558 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11559 process attribute ((target ("..."))). */
11560
11561 static bool
11562 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11563 {
11564 struct cl_target_option cur_target;
11565 bool ret;
11566 tree old_optimize;
11567 tree new_target, new_optimize;
11568 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11569
11570 /* If what we're processing is the current pragma string then the
11571 target option node is already stored in target_option_current_node
11572 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11573 having to re-parse the string. This is especially useful to keep
11574 arm_neon.h compile times down since that header contains a lot
11575 of intrinsics enclosed in pragmas. */
11576 if (!existing_target && args == current_target_pragma)
11577 {
11578 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11579 return true;
11580 }
11581 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11582
11583 old_optimize = build_optimization_node (&global_options);
11584 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11585
11586 /* If the function changed the optimization levels as well as setting
11587 target options, start with the optimizations specified. */
11588 if (func_optimize && func_optimize != old_optimize)
11589 cl_optimization_restore (&global_options,
11590 TREE_OPTIMIZATION (func_optimize));
11591
11592 /* Save the current target options to restore at the end. */
11593 cl_target_option_save (&cur_target, &global_options);
11594
11595 /* If fndecl already has some target attributes applied to it, unpack
11596 them so that we add this attribute on top of them, rather than
11597 overwriting them. */
11598 if (existing_target)
11599 {
11600 struct cl_target_option *existing_options
11601 = TREE_TARGET_OPTION (existing_target);
11602
11603 if (existing_options)
11604 cl_target_option_restore (&global_options, existing_options);
11605 }
11606 else
11607 cl_target_option_restore (&global_options,
11608 TREE_TARGET_OPTION (target_option_current_node));
11609
11610 ret = aarch64_process_target_attr (args);
11611
11612 /* Set up any additional state. */
11613 if (ret)
11614 {
11615 aarch64_override_options_internal (&global_options);
11616 /* Initialize SIMD builtins if we haven't already.
11617 Set current_target_pragma to NULL for the duration so that
11618 the builtin initialization code doesn't try to tag the functions
11619 being built with the attributes specified by any current pragma, thus
11620 going into an infinite recursion. */
11621 if (TARGET_SIMD)
11622 {
11623 tree saved_current_target_pragma = current_target_pragma;
11624 current_target_pragma = NULL;
11625 aarch64_init_simd_builtins ();
11626 current_target_pragma = saved_current_target_pragma;
11627 }
11628 new_target = build_target_option_node (&global_options);
11629 }
11630 else
11631 new_target = NULL;
11632
11633 new_optimize = build_optimization_node (&global_options);
11634
11635 if (fndecl && ret)
11636 {
11637 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11638
11639 if (old_optimize != new_optimize)
11640 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11641 }
11642
11643 cl_target_option_restore (&global_options, &cur_target);
11644
11645 if (old_optimize != new_optimize)
11646 cl_optimization_restore (&global_options,
11647 TREE_OPTIMIZATION (old_optimize));
11648 return ret;
11649 }
11650
11651 /* Helper for aarch64_can_inline_p. CALLER and CALLEE are the values of
11652 tri-bool options (yes, no, don't care) and DEF is the default value;
11653 determine whether inlining should be allowed. */
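/* For instance, the -momit-leaf-frame-pointer check below passes
 DONT_CARE == 2 and DEF == 1, so inlining is rejected only when both
 caller and callee set the option explicitly, they disagree, and the
 callee's setting is not the default. */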
11654
11655 static bool
11656 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11657 int dont_care, int def)
11658 {
11659 /* If the callee doesn't care, always allow inlining. */
11660 if (callee == dont_care)
11661 return true;
11662
11663 /* If the caller doesn't care, always allow inlining. */
11664 if (caller == dont_care)
11665 return true;
11666
11667 /* Otherwise, allow inlining if either the callee and caller values
11668 agree, or if the callee is using the default value. */
11669 return (callee == caller || callee == def);
11670 }
11671
11672 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11673 to inline CALLEE into CALLER based on target-specific info.
11674 Make sure that the caller and callee have compatible architectural
11675 features. Then go through the other possible target attributes
11676 and see if they can block inlining. Try not to reject always_inline
11677 callees unless they are incompatible architecturally. */
11678
11679 static bool
11680 aarch64_can_inline_p (tree caller, tree callee)
11681 {
11682 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11683 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11684
11685 /* If callee has no option attributes, then it is ok to inline. */
11686 if (!callee_tree)
11687 return true;
11688
11689 struct cl_target_option *caller_opts
11690 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11691 : target_option_default_node);
11692
11693 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11694
11695
11696 /* Callee's ISA flags should be a subset of the caller's. */
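 /* For example, a callee compiled with target ("+crypto") must not be
 inlined into a caller built without the crypto extension. */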
11697 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11698 != callee_opts->x_aarch64_isa_flags)
11699 return false;
11700
11701 /* Allow a non-strict-aligned callee to be inlined into a strict-aligned
11702 caller, but reject any other alignment mismatch. */
11703 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11704 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11705 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11706 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11707 return false;
11708
11709 bool always_inline = lookup_attribute ("always_inline",
11710 DECL_ATTRIBUTES (callee));
11711
11712 /* If the architectural features match up and the callee is always_inline
11713 then the other attributes don't matter. */
11714 if (always_inline)
11715 return true;
11716
11717 if (caller_opts->x_aarch64_cmodel_var
11718 != callee_opts->x_aarch64_cmodel_var)
11719 return false;
11720
11721 if (caller_opts->x_aarch64_tls_dialect
11722 != callee_opts->x_aarch64_tls_dialect)
11723 return false;
11724
11725 /* Honour explicit requests to work around errata. */
11726 if (!aarch64_tribools_ok_for_inlining_p (
11727 caller_opts->x_aarch64_fix_a53_err835769,
11728 callee_opts->x_aarch64_fix_a53_err835769,
11729 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11730 return false;
11731
11732 if (!aarch64_tribools_ok_for_inlining_p (
11733 caller_opts->x_aarch64_fix_a53_err843419,
11734 callee_opts->x_aarch64_fix_a53_err843419,
11735 2, TARGET_FIX_ERR_A53_843419))
11736 return false;
11737
11738 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11739 caller and callee and they don't match up, reject inlining. */
11740 if (!aarch64_tribools_ok_for_inlining_p (
11741 caller_opts->x_flag_omit_leaf_frame_pointer,
11742 callee_opts->x_flag_omit_leaf_frame_pointer,
11743 2, 1))
11744 return false;
11745
11746 /* If the callee has specific tuning overrides, respect them. */
11747 if (callee_opts->x_aarch64_override_tune_string != NULL
11748 && caller_opts->x_aarch64_override_tune_string == NULL)
11749 return false;
11750
11751 /* If the user specified tuning override strings for the
11752 caller and callee and they don't match up, reject inlining.
11753 We just do a string compare here, we don't analyze the meaning
11754 of the string, as it would be too costly for little gain. */
11755 if (callee_opts->x_aarch64_override_tune_string
11756 && caller_opts->x_aarch64_override_tune_string
11757 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11758 caller_opts->x_aarch64_override_tune_string) != 0))
11759 return false;
11760
11761 return true;
11762 }
11763
11764 /* Return true if SYMBOL_REF X binds locally. */
11765
11766 static bool
11767 aarch64_symbol_binds_local_p (const_rtx x)
11768 {
11769 return (SYMBOL_REF_DECL (x)
11770 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11771 : SYMBOL_REF_LOCAL_P (x));
11772 }
11773
11774 /* Return true if SYMBOL_REF X is thread-local. */
11775 static bool
11776 aarch64_tls_symbol_p (rtx x)
11777 {
11778 if (! TARGET_HAVE_TLS)
11779 return false;
11780
11781 if (GET_CODE (x) != SYMBOL_REF)
11782 return false;
11783
11784 return SYMBOL_REF_TLS_MODEL (x) != 0;
11785 }
11786
11787 /* Classify a TLS symbol into one of the TLS kinds. */
11788 enum aarch64_symbol_type
11789 aarch64_classify_tls_symbol (rtx x)
11790 {
11791 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11792
11793 switch (tls_kind)
11794 {
11795 case TLS_MODEL_GLOBAL_DYNAMIC:
11796 case TLS_MODEL_LOCAL_DYNAMIC:
11797 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11798
11799 case TLS_MODEL_INITIAL_EXEC:
11800 switch (aarch64_cmodel)
11801 {
11802 case AARCH64_CMODEL_TINY:
11803 case AARCH64_CMODEL_TINY_PIC:
11804 return SYMBOL_TINY_TLSIE;
11805 default:
11806 return SYMBOL_SMALL_TLSIE;
11807 }
11808
11809 case TLS_MODEL_LOCAL_EXEC:
11810 if (aarch64_tls_size == 12)
11811 return SYMBOL_TLSLE12;
11812 else if (aarch64_tls_size == 24)
11813 return SYMBOL_TLSLE24;
11814 else if (aarch64_tls_size == 32)
11815 return SYMBOL_TLSLE32;
11816 else if (aarch64_tls_size == 48)
11817 return SYMBOL_TLSLE48;
11818 else
11819 gcc_unreachable ();
11820
11821 case TLS_MODEL_EMULATED:
11822 case TLS_MODEL_NONE:
11823 return SYMBOL_FORCE_TO_MEM;
11824
11825 default:
11826 gcc_unreachable ();
11827 }
11828 }
11829
11830 /* Return the correct method for accessing X + OFFSET, where X is either
11831 a SYMBOL_REF or LABEL_REF. */
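/* For instance, in the small PIC model a symbol that does not bind locally
 is classified as SYMBOL_SMALL_GOT_4G (or SYMBOL_SMALL_GOT_28K for the
 small-GOT variant), whereas a label in the tiny model is
 SYMBOL_TINY_ABSOLUTE. */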
11832
11833 enum aarch64_symbol_type
11834 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11835 {
11836 if (GET_CODE (x) == LABEL_REF)
11837 {
11838 switch (aarch64_cmodel)
11839 {
11840 case AARCH64_CMODEL_LARGE:
11841 return SYMBOL_FORCE_TO_MEM;
11842
11843 case AARCH64_CMODEL_TINY_PIC:
11844 case AARCH64_CMODEL_TINY:
11845 return SYMBOL_TINY_ABSOLUTE;
11846
11847 case AARCH64_CMODEL_SMALL_SPIC:
11848 case AARCH64_CMODEL_SMALL_PIC:
11849 case AARCH64_CMODEL_SMALL:
11850 return SYMBOL_SMALL_ABSOLUTE;
11851
11852 default:
11853 gcc_unreachable ();
11854 }
11855 }
11856
11857 if (GET_CODE (x) == SYMBOL_REF)
11858 {
11859 if (aarch64_tls_symbol_p (x))
11860 return aarch64_classify_tls_symbol (x);
11861
11862 switch (aarch64_cmodel)
11863 {
11864 case AARCH64_CMODEL_TINY:
11865 /* When we retrieve a symbol + offset address, we have to make sure
11866 the offset does not cause overflow of the final address. But
11867 we have no way of knowing the address of the symbol at compile time,
11868 so we can't accurately say if the distance between the PC and
11869 symbol + offset is outside the addressable range of +/-1M in the
11870 TINY code model. So we rely on images not being greater than
11871 1M, cap the offset at 1M, and anything beyond 1M will have to
11872 be loaded using an alternative mechanism. Furthermore, if the
11873 symbol is a weak reference to something that isn't known to
11874 resolve to a symbol in this module, then force to memory. */
11875 if ((SYMBOL_REF_WEAK (x)
11876 && !aarch64_symbol_binds_local_p (x))
11877 || !IN_RANGE (offset, -1048575, 1048575))
11878 return SYMBOL_FORCE_TO_MEM;
11879 return SYMBOL_TINY_ABSOLUTE;
11880
11881 case AARCH64_CMODEL_SMALL:
11882 /* Same reasoning as the tiny code model, but the offset cap here is
11883 4G. */
11884 if ((SYMBOL_REF_WEAK (x)
11885 && !aarch64_symbol_binds_local_p (x))
11886 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11887 HOST_WIDE_INT_C (4294967264)))
11888 return SYMBOL_FORCE_TO_MEM;
11889 return SYMBOL_SMALL_ABSOLUTE;
11890
11891 case AARCH64_CMODEL_TINY_PIC:
11892 if (!aarch64_symbol_binds_local_p (x))
11893 return SYMBOL_TINY_GOT;
11894 return SYMBOL_TINY_ABSOLUTE;
11895
11896 case AARCH64_CMODEL_SMALL_SPIC:
11897 case AARCH64_CMODEL_SMALL_PIC:
11898 if (!aarch64_symbol_binds_local_p (x))
11899 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11900 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11901 return SYMBOL_SMALL_ABSOLUTE;
11902
11903 case AARCH64_CMODEL_LARGE:
11904 /* This is alright even in PIC code as the constant
11905 pool reference is always PC relative and within
11906 the same translation unit. */
11907 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11908 return SYMBOL_SMALL_ABSOLUTE;
11909 else
11910 return SYMBOL_FORCE_TO_MEM;
11911
11912 default:
11913 gcc_unreachable ();
11914 }
11915 }
11916
11917 /* By default push everything into the constant pool. */
11918 return SYMBOL_FORCE_TO_MEM;
11919 }
11920
11921 bool
11922 aarch64_constant_address_p (rtx x)
11923 {
11924 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11925 }
11926
11927 bool
11928 aarch64_legitimate_pic_operand_p (rtx x)
11929 {
11930 if (GET_CODE (x) == SYMBOL_REF
11931 || (GET_CODE (x) == CONST
11932 && GET_CODE (XEXP (x, 0)) == PLUS
11933 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11934 return false;
11935
11936 return true;
11937 }
11938
11939 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11940 that should be rematerialized rather than spilled. */
11941
11942 static bool
11943 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11944 {
11945 /* Support CSE and rematerialization of common constants. */
11946 if (CONST_INT_P (x)
11947 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11948 || GET_CODE (x) == CONST_VECTOR)
11949 return true;
11950
11951 /* Do not allow vector struct mode constants for Advanced SIMD.
11952 We could support 0 and -1 easily, but they need support in
11953 aarch64-simd.md. */
11954 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11955 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11956 return false;
11957
11958 /* Only accept variable-length vector constants if they can be
11959 handled directly.
11960
11961 ??? It would be possible to handle rematerialization of other
11962 constants via secondary reloads. */
11963 if (vec_flags & VEC_ANY_SVE)
11964 return aarch64_simd_valid_immediate (x, NULL);
11965
11966 if (GET_CODE (x) == HIGH)
11967 x = XEXP (x, 0);
11968
11969 /* Accept polynomial constants that can be calculated by using the
11970 destination of a move as the sole temporary. Constants that
11971 require a second temporary cannot be rematerialized (they can't be
11972 forced to memory and also aren't legitimate constants). */
11973 poly_int64 offset;
11974 if (poly_int_rtx_p (x, &offset))
11975 return aarch64_offset_temporaries (false, offset) <= 1;
11976
11977 /* If an offset is being added to something else, we need to allow the
11978 base to be moved into the destination register, meaning that there
11979 are no free temporaries for the offset. */
11980 x = strip_offset (x, &offset);
11981 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11982 return false;
11983
11984 /* Do not allow const (plus (anchor_symbol, const_int)). */
11985 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11986 return false;
11987
11988 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11989 so spilling them is better than rematerialization. */
11990 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11991 return true;
11992
11993 /* Label references are always constant. */
11994 if (GET_CODE (x) == LABEL_REF)
11995 return true;
11996
11997 return false;
11998 }
11999
12000 rtx
12001 aarch64_load_tp (rtx target)
12002 {
12003 if (!target
12004 || GET_MODE (target) != Pmode
12005 || !register_operand (target, Pmode))
12006 target = gen_reg_rtx (Pmode);
12007
12008 /* Can return in any reg. */
12009 emit_insn (gen_aarch64_load_tp_hard (target));
12010 return target;
12011 }
12012
12013 /* On AAPCS systems, this is the "struct __va_list". */
12014 static GTY(()) tree va_list_type;
12015
12016 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12017 Return the type to use as __builtin_va_list.
12018
12019 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12020
12021 struct __va_list
12022 {
12023 void *__stack;
12024 void *__gr_top;
12025 void *__vr_top;
12026 int __gr_offs;
12027 int __vr_offs;
12028 }; */
12029
12030 static tree
12031 aarch64_build_builtin_va_list (void)
12032 {
12033 tree va_list_name;
12034 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12035
12036 /* Create the type. */
12037 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12038 /* Give it the required name. */
12039 va_list_name = build_decl (BUILTINS_LOCATION,
12040 TYPE_DECL,
12041 get_identifier ("__va_list"),
12042 va_list_type);
12043 DECL_ARTIFICIAL (va_list_name) = 1;
12044 TYPE_NAME (va_list_type) = va_list_name;
12045 TYPE_STUB_DECL (va_list_type) = va_list_name;
12046
12047 /* Create the fields. */
12048 f_stack = build_decl (BUILTINS_LOCATION,
12049 FIELD_DECL, get_identifier ("__stack"),
12050 ptr_type_node);
12051 f_grtop = build_decl (BUILTINS_LOCATION,
12052 FIELD_DECL, get_identifier ("__gr_top"),
12053 ptr_type_node);
12054 f_vrtop = build_decl (BUILTINS_LOCATION,
12055 FIELD_DECL, get_identifier ("__vr_top"),
12056 ptr_type_node);
12057 f_groff = build_decl (BUILTINS_LOCATION,
12058 FIELD_DECL, get_identifier ("__gr_offs"),
12059 integer_type_node);
12060 f_vroff = build_decl (BUILTINS_LOCATION,
12061 FIELD_DECL, get_identifier ("__vr_offs"),
12062 integer_type_node);
12063
12064 /* Tell tree-stdarg pass about our internal offset fields.
12065 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12066 purposes, to identify whether the code is updating va_list internal
12067 offset fields in an irregular way. */
12068 va_list_gpr_counter_field = f_groff;
12069 va_list_fpr_counter_field = f_vroff;
12070
12071 DECL_ARTIFICIAL (f_stack) = 1;
12072 DECL_ARTIFICIAL (f_grtop) = 1;
12073 DECL_ARTIFICIAL (f_vrtop) = 1;
12074 DECL_ARTIFICIAL (f_groff) = 1;
12075 DECL_ARTIFICIAL (f_vroff) = 1;
12076
12077 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12078 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12079 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12080 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12081 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12082
12083 TYPE_FIELDS (va_list_type) = f_stack;
12084 DECL_CHAIN (f_stack) = f_grtop;
12085 DECL_CHAIN (f_grtop) = f_vrtop;
12086 DECL_CHAIN (f_vrtop) = f_groff;
12087 DECL_CHAIN (f_groff) = f_vroff;
12088
12089 /* Compute its layout. */
12090 layout_type (va_list_type);
12091
12092 return va_list_type;
12093 }
12094
12095 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12096 static void
12097 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12098 {
12099 const CUMULATIVE_ARGS *cum;
12100 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12101 tree stack, grtop, vrtop, groff, vroff;
12102 tree t;
12103 int gr_save_area_size = cfun->va_list_gpr_size;
12104 int vr_save_area_size = cfun->va_list_fpr_size;
12105 int vr_offset;
12106
12107 cum = &crtl->args.info;
12108 if (cfun->va_list_gpr_size)
12109 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12110 cfun->va_list_gpr_size);
12111 if (cfun->va_list_fpr_size)
12112 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12113 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12114
12115 if (!TARGET_FLOAT)
12116 {
12117 gcc_assert (cum->aapcs_nvrn == 0);
12118 vr_save_area_size = 0;
12119 }
12120
12121 f_stack = TYPE_FIELDS (va_list_type_node);
12122 f_grtop = DECL_CHAIN (f_stack);
12123 f_vrtop = DECL_CHAIN (f_grtop);
12124 f_groff = DECL_CHAIN (f_vrtop);
12125 f_vroff = DECL_CHAIN (f_groff);
12126
12127 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12128 NULL_TREE);
12129 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12130 NULL_TREE);
12131 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12132 NULL_TREE);
12133 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12134 NULL_TREE);
12135 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12136 NULL_TREE);
12137
12138 /* Emit code to initialize STACK, which points to the next varargs stack
12139 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12140 by named arguments. STACK is 8-byte aligned. */
12141 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12142 if (cum->aapcs_stack_size > 0)
12143 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12144 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12145 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12146
12147 /* Emit code to initialize GRTOP, the top of the GR save area.
12148 virtual_incoming_args_rtx should have been 16-byte aligned. */
12149 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12150 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12151 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12152
12153 /* Emit code to initialize VRTOP, the top of the VR save area.
12154 This address is gr_save_area_bytes below GRTOP, rounded
12155 down to the next 16-byte boundary. */
12156 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12157 vr_offset = ROUND_UP (gr_save_area_size,
12158 STACK_BOUNDARY / BITS_PER_UNIT);
12159
12160 if (vr_offset)
12161 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12162 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12163 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12164
12165 /* Emit code to initialize GROFF, the offset from GRTOP of the
12166 next GPR argument. */
12167 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12168 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12169 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12170
12171 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12172 of the next VR argument. */
12173 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12174 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12175 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12176 }
12177
12178 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12179
12180 static tree
12181 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12182 gimple_seq *post_p ATTRIBUTE_UNUSED)
12183 {
12184 tree addr;
12185 bool indirect_p;
12186 bool is_ha; /* is HFA or HVA. */
12187 bool dw_align; /* double-word align. */
12188 machine_mode ag_mode = VOIDmode;
12189 int nregs;
12190 machine_mode mode;
12191
12192 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12193 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12194 HOST_WIDE_INT size, rsize, adjust, align;
12195 tree t, u, cond1, cond2;
12196
12197 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12198 if (indirect_p)
12199 type = build_pointer_type (type);
12200
12201 mode = TYPE_MODE (type);
12202
12203 f_stack = TYPE_FIELDS (va_list_type_node);
12204 f_grtop = DECL_CHAIN (f_stack);
12205 f_vrtop = DECL_CHAIN (f_grtop);
12206 f_groff = DECL_CHAIN (f_vrtop);
12207 f_vroff = DECL_CHAIN (f_groff);
12208
12209 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12210 f_stack, NULL_TREE);
12211 size = int_size_in_bytes (type);
12212 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12213
12214 dw_align = false;
12215 adjust = 0;
12216 if (aarch64_vfp_is_call_or_return_candidate (mode,
12217 type,
12218 &ag_mode,
12219 &nregs,
12220 &is_ha))
12221 {
12222 /* No frontends can create types with variable-sized modes, so we
12223 shouldn't be asked to pass or return them. */
12224 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12225
12226 /* TYPE passed in fp/simd registers. */
12227 if (!TARGET_FLOAT)
12228 aarch64_err_no_fpadvsimd (mode, "varargs");
12229
12230 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12231 unshare_expr (valist), f_vrtop, NULL_TREE);
12232 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12233 unshare_expr (valist), f_vroff, NULL_TREE);
12234
12235 rsize = nregs * UNITS_PER_VREG;
12236
12237 if (is_ha)
12238 {
12239 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12240 adjust = UNITS_PER_VREG - ag_size;
12241 }
12242 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12243 && size < UNITS_PER_VREG)
12244 {
12245 adjust = UNITS_PER_VREG - size;
12246 }
12247 }
12248 else
12249 {
12250 /* TYPE passed in general registers. */
12251 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12252 unshare_expr (valist), f_grtop, NULL_TREE);
12253 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12254 unshare_expr (valist), f_groff, NULL_TREE);
12255 rsize = ROUND_UP (size, UNITS_PER_WORD);
12256 nregs = rsize / UNITS_PER_WORD;
12257
12258 if (align > 8)
12259 dw_align = true;
12260
12261 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12262 && size < UNITS_PER_WORD)
12263 {
12264 adjust = UNITS_PER_WORD - size;
12265 }
12266 }
12267
12268 /* Get a local temporary for the field value. */
12269 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12270
12271 /* Emit code to branch if off >= 0. */
12272 t = build2 (GE_EXPR, boolean_type_node, off,
12273 build_int_cst (TREE_TYPE (off), 0));
12274 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12275
12276 if (dw_align)
12277 {
12278 /* Emit: offs = (offs + 15) & -16. */
12279 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12280 build_int_cst (TREE_TYPE (off), 15));
12281 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12282 build_int_cst (TREE_TYPE (off), -16));
12283 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12284 }
12285 else
12286 roundup = NULL;
12287
12288 /* Update ap.__[g|v]r_offs */
12289 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12290 build_int_cst (TREE_TYPE (off), rsize));
12291 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12292
12293 /* String up. */
12294 if (roundup)
12295 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12296
12297 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12298 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12299 build_int_cst (TREE_TYPE (f_off), 0));
12300 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12301
12302 /* String up: make sure the assignment happens before the use. */
12303 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12304 COND_EXPR_ELSE (cond1) = t;
12305
12306 /* Prepare the trees handling the argument that is passed on the stack;
12307 the top-level node will be stored in ON_STACK. */
12308 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12309 if (align > 8)
12310 {
12311 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12312 t = fold_convert (intDI_type_node, arg);
12313 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12314 build_int_cst (TREE_TYPE (t), 15));
12315 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12316 build_int_cst (TREE_TYPE (t), -16));
12317 t = fold_convert (TREE_TYPE (arg), t);
12318 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12319 }
12320 else
12321 roundup = NULL;
12322 /* Advance ap.__stack */
12323 t = fold_convert (intDI_type_node, arg);
12324 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12325 build_int_cst (TREE_TYPE (t), size + 7));
12326 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12327 build_int_cst (TREE_TYPE (t), -8));
12328 t = fold_convert (TREE_TYPE (arg), t);
12329 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12330 /* String up roundup and advance. */
12331 if (roundup)
12332 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12333 /* String up with arg */
12334 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12335 /* Big-endianness related address adjustment. */
12336 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12337 && size < UNITS_PER_WORD)
12338 {
12339 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12340 size_int (UNITS_PER_WORD - size));
12341 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12342 }
12343
12344 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12345 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12346
12347 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12348 t = off;
12349 if (adjust)
12350 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12351 build_int_cst (TREE_TYPE (off), adjust));
12352
12353 t = fold_convert (sizetype, t);
12354 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12355
12356 if (is_ha)
12357 {
12358 /* type ha; // treat as "struct {ftype field[n];}"
12359 ... [computing offs]
12360 for (i = 0; i < nregs; ++i, offs += 16)
12361 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12362 return ha; */
12363 int i;
12364 tree tmp_ha, field_t, field_ptr_t;
12365
12366 /* Declare a local variable. */
12367 tmp_ha = create_tmp_var_raw (type, "ha");
12368 gimple_add_tmp_var (tmp_ha);
12369
12370 /* Establish the base type. */
12371 switch (ag_mode)
12372 {
12373 case E_SFmode:
12374 field_t = float_type_node;
12375 field_ptr_t = float_ptr_type_node;
12376 break;
12377 case E_DFmode:
12378 field_t = double_type_node;
12379 field_ptr_t = double_ptr_type_node;
12380 break;
12381 case E_TFmode:
12382 field_t = long_double_type_node;
12383 field_ptr_t = long_double_ptr_type_node;
12384 break;
12385 case E_HFmode:
12386 field_t = aarch64_fp16_type_node;
12387 field_ptr_t = aarch64_fp16_ptr_type_node;
12388 break;
12389 case E_V2SImode:
12390 case E_V4SImode:
12391 {
12392 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12393 field_t = build_vector_type_for_mode (innertype, ag_mode);
12394 field_ptr_t = build_pointer_type (field_t);
12395 }
12396 break;
12397 default:
12398 gcc_assert (0);
12399 }
12400
12401 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
12402 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12403 addr = t;
12404 t = fold_convert (field_ptr_t, addr);
12405 t = build2 (MODIFY_EXPR, field_t,
12406 build1 (INDIRECT_REF, field_t, tmp_ha),
12407 build1 (INDIRECT_REF, field_t, t));
12408
12409 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12410 for (i = 1; i < nregs; ++i)
12411 {
12412 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12413 u = fold_convert (field_ptr_t, addr);
12414 u = build2 (MODIFY_EXPR, field_t,
12415 build2 (MEM_REF, field_t, tmp_ha,
12416 build_int_cst (field_ptr_t,
12417 (i *
12418 int_size_in_bytes (field_t)))),
12419 build1 (INDIRECT_REF, field_t, u));
12420 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12421 }
12422
12423 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12424 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12425 }
12426
12427 COND_EXPR_ELSE (cond2) = t;
12428 addr = fold_convert (build_pointer_type (type), cond1);
12429 addr = build_va_arg_indirect_ref (addr);
12430
12431 if (indirect_p)
12432 addr = build_va_arg_indirect_ref (addr);
12433
12434 return addr;
12435 }
12436
12437 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12438
12439 static void
12440 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12441 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12442 int no_rtl)
12443 {
12444 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12445 CUMULATIVE_ARGS local_cum;
12446 int gr_saved = cfun->va_list_gpr_size;
12447 int vr_saved = cfun->va_list_fpr_size;
12448
12449 /* The caller has advanced CUM up to, but not beyond, the last named
12450 argument. Advance a local copy of CUM past the last "real" named
12451 argument, to find out how many registers are left over. */
12452 local_cum = *cum;
12453 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12454
12455 /* Find out how many registers we need to save.
12456 Honor the tree-stdarg analysis results. */
12457 if (cfun->va_list_gpr_size)
12458 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12459 cfun->va_list_gpr_size / UNITS_PER_WORD);
12460 if (cfun->va_list_fpr_size)
12461 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12462 cfun->va_list_fpr_size / UNITS_PER_VREG);
12463
12464 if (!TARGET_FLOAT)
12465 {
12466 gcc_assert (local_cum.aapcs_nvrn == 0);
12467 vr_saved = 0;
12468 }
12469
12470 if (!no_rtl)
12471 {
12472 if (gr_saved > 0)
12473 {
12474 rtx ptr, mem;
12475
12476 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12477 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12478 - gr_saved * UNITS_PER_WORD);
12479 mem = gen_frame_mem (BLKmode, ptr);
12480 set_mem_alias_set (mem, get_varargs_alias_set ());
12481
12482 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12483 mem, gr_saved);
12484 }
12485 if (vr_saved > 0)
12486 {
12487 /* We can't use move_block_from_reg, because it will use
12488 the wrong mode, storing D regs only. */
12489 machine_mode mode = TImode;
12490 int off, i, vr_start;
12491
12492 /* Set OFF to the offset from virtual_incoming_args_rtx of
12493 the first vector register. The VR save area lies below
12494 the GR one, and is aligned to 16 bytes. */
12495 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12496 STACK_BOUNDARY / BITS_PER_UNIT);
12497 off -= vr_saved * UNITS_PER_VREG;
12498
12499 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12500 for (i = 0; i < vr_saved; ++i)
12501 {
12502 rtx ptr, mem;
12503
12504 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12505 mem = gen_frame_mem (mode, ptr);
12506 set_mem_alias_set (mem, get_varargs_alias_set ());
12507 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12508 off += UNITS_PER_VREG;
12509 }
12510 }
12511 }
12512
12513 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12514 any complication of having crtl->args.pretend_args_size changed. */
12515 cfun->machine->frame.saved_varargs_size
12516 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12517 STACK_BOUNDARY / BITS_PER_UNIT)
12518 + vr_saved * UNITS_PER_VREG);
12519 }
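/* For instance, with gr_saved == 3 (say x5-x7 left over) and vr_saved == 2
   (v6-v7), the code above stores x5-x7 in the 24 bytes just below
   virtual_incoming_args_rtx and q6-q7 in the 32 bytes below the 16-byte
   aligned bottom of that GR area, giving roughly:

       off of first VR slot	   = -ROUND_UP (3 * 8, 16) - 2 * 16 = -64
       frame.saved_varargs_size	   = ROUND_UP (3 * 8, 16) + 2 * 16  = 64.  */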
12520
12521 static void
12522 aarch64_conditional_register_usage (void)
12523 {
12524 int i;
12525 if (!TARGET_FLOAT)
12526 {
12527 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12528 {
12529 fixed_regs[i] = 1;
12530 call_used_regs[i] = 1;
12531 }
12532 }
12533 if (!TARGET_SVE)
12534 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12535 {
12536 fixed_regs[i] = 1;
12537 call_used_regs[i] = 1;
12538 }
12539 }
12540
12541 /* Walk down the type tree of TYPE counting consecutive base elements.
12542 If *MODEP is VOIDmode, then set it to the first valid floating point
12543 type. If a non-floating point type is found, or if a floating point
12544 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12545 otherwise return the count in the sub-tree. */
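/* For example:
     struct { double x, y; }		returns 2 with *MODEP == DFmode
     struct { float re, im; } c[2]	returns 4 with *MODEP == SFmode
     struct { float f; double d; }	returns -1 (mixed element modes).  */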
12546 static int
12547 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12548 {
12549 machine_mode mode;
12550 HOST_WIDE_INT size;
12551
12552 switch (TREE_CODE (type))
12553 {
12554 case REAL_TYPE:
12555 mode = TYPE_MODE (type);
12556 if (mode != DFmode && mode != SFmode
12557 && mode != TFmode && mode != HFmode)
12558 return -1;
12559
12560 if (*modep == VOIDmode)
12561 *modep = mode;
12562
12563 if (*modep == mode)
12564 return 1;
12565
12566 break;
12567
12568 case COMPLEX_TYPE:
12569 mode = TYPE_MODE (TREE_TYPE (type));
12570 if (mode != DFmode && mode != SFmode
12571 && mode != TFmode && mode != HFmode)
12572 return -1;
12573
12574 if (*modep == VOIDmode)
12575 *modep = mode;
12576
12577 if (*modep == mode)
12578 return 2;
12579
12580 break;
12581
12582 case VECTOR_TYPE:
12583 /* Use V2SImode and V4SImode as representatives of all 64-bit
12584 and 128-bit vector types. */
12585 size = int_size_in_bytes (type);
12586 switch (size)
12587 {
12588 case 8:
12589 mode = V2SImode;
12590 break;
12591 case 16:
12592 mode = V4SImode;
12593 break;
12594 default:
12595 return -1;
12596 }
12597
12598 if (*modep == VOIDmode)
12599 *modep = mode;
12600
12601 /* Vector modes are considered to be opaque: two vectors are
12602 equivalent for the purposes of being homogeneous aggregates
12603 if they are the same size. */
12604 if (*modep == mode)
12605 return 1;
12606
12607 break;
12608
12609 case ARRAY_TYPE:
12610 {
12611 int count;
12612 tree index = TYPE_DOMAIN (type);
12613
12614 /* Can't handle incomplete types nor sizes that are not
12615 fixed. */
12616 if (!COMPLETE_TYPE_P (type)
12617 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12618 return -1;
12619
12620 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12621 if (count == -1
12622 || !index
12623 || !TYPE_MAX_VALUE (index)
12624 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12625 || !TYPE_MIN_VALUE (index)
12626 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12627 || count < 0)
12628 return -1;
12629
12630 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12631 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12632
12633 /* There must be no padding. */
12634 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12635 count * GET_MODE_BITSIZE (*modep)))
12636 return -1;
12637
12638 return count;
12639 }
12640
12641 case RECORD_TYPE:
12642 {
12643 int count = 0;
12644 int sub_count;
12645 tree field;
12646
12647 /* Can't handle incomplete types nor sizes that are not
12648 fixed. */
12649 if (!COMPLETE_TYPE_P (type)
12650 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12651 return -1;
12652
12653 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12654 {
12655 if (TREE_CODE (field) != FIELD_DECL)
12656 continue;
12657
12658 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12659 if (sub_count < 0)
12660 return -1;
12661 count += sub_count;
12662 }
12663
12664 /* There must be no padding. */
12665 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12666 count * GET_MODE_BITSIZE (*modep)))
12667 return -1;
12668
12669 return count;
12670 }
12671
12672 case UNION_TYPE:
12673 case QUAL_UNION_TYPE:
12674 {
12675 /* These aren't very interesting except in a degenerate case. */
12676 int count = 0;
12677 int sub_count;
12678 tree field;
12679
12680 /* Can't handle incomplete types nor sizes that are not
12681 fixed. */
12682 if (!COMPLETE_TYPE_P (type)
12683 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12684 return -1;
12685
12686 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12687 {
12688 if (TREE_CODE (field) != FIELD_DECL)
12689 continue;
12690
12691 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12692 if (sub_count < 0)
12693 return -1;
12694 count = count > sub_count ? count : sub_count;
12695 }
12696
12697 /* There must be no padding. */
12698 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12699 count * GET_MODE_BITSIZE (*modep)))
12700 return -1;
12701
12702 return count;
12703 }
12704
12705 default:
12706 break;
12707 }
12708
12709 return -1;
12710 }
12711
12712 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12713 type as described in AAPCS64 \S 4.1.2.
12714
12715 See the comment above aarch64_composite_type_p for the notes on MODE. */
12716
12717 static bool
12718 aarch64_short_vector_p (const_tree type,
12719 machine_mode mode)
12720 {
12721 poly_int64 size = -1;
12722
12723 if (type && TREE_CODE (type) == VECTOR_TYPE)
12724 size = int_size_in_bytes (type);
12725 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12726 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12727 size = GET_MODE_SIZE (mode);
12728
12729 return known_eq (size, 8) || known_eq (size, 16);
12730 }
12731
12732 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12733 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12734 array types. The C99 floating-point complex types are also considered
12735 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12736 types, which are GCC extensions and out of the scope of AAPCS64, are
12737 treated as composite types here as well.
12738
12739 Note that MODE itself is not sufficient in determining whether a type
12740 is such a composite type or not. This is because
12741 stor-layout.c:compute_record_mode may have already changed the MODE
12742 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12743 structure with only one field may have its MODE set to the mode of the
12744 field. Also an integer mode whose size matches the size of the
12745 RECORD_TYPE type may be substituted for the original mode
12746 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12747 solely relied on. */
12748
12749 static bool
12750 aarch64_composite_type_p (const_tree type,
12751 machine_mode mode)
12752 {
12753 if (aarch64_short_vector_p (type, mode))
12754 return false;
12755
12756 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12757 return true;
12758
12759 if (mode == BLKmode
12760 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12761 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12762 return true;
12763
12764 return false;
12765 }
12766
12767 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12768 shall be passed or returned in simd/fp register(s) (providing these
12769 parameter passing registers are available).
12770
12771 Upon successful return, *COUNT returns the number of needed registers,
12772 *BASE_MODE returns the mode of the individual register and when IS_HA
12773 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12774 floating-point aggregate or a homogeneous short-vector aggregate. */
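/* For example, "double _Complex" is accepted with *COUNT == 2 and
   *BASE_MODE == DFmode, while a structure of four float32x4_t members is a
   homogeneous short-vector aggregate with *COUNT == 4 and
   *BASE_MODE == V4SImode (vector element modes are treated as opaque, see
   aapcs_vfp_sub_candidate above).  */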
12775
12776 static bool
12777 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12778 const_tree type,
12779 machine_mode *base_mode,
12780 int *count,
12781 bool *is_ha)
12782 {
12783 machine_mode new_mode = VOIDmode;
12784 bool composite_p = aarch64_composite_type_p (type, mode);
12785
12786 if (is_ha != NULL) *is_ha = false;
12787
12788 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12789 || aarch64_short_vector_p (type, mode))
12790 {
12791 *count = 1;
12792 new_mode = mode;
12793 }
12794 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12795 {
12796 if (is_ha != NULL) *is_ha = true;
12797 *count = 2;
12798 new_mode = GET_MODE_INNER (mode);
12799 }
12800 else if (type && composite_p)
12801 {
12802 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12803
12804 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12805 {
12806 if (is_ha != NULL) *is_ha = true;
12807 *count = ag_count;
12808 }
12809 else
12810 return false;
12811 }
12812 else
12813 return false;
12814
12815 *base_mode = new_mode;
12816 return true;
12817 }
12818
12819 /* Implement TARGET_STRUCT_VALUE_RTX. */
12820
12821 static rtx
12822 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12823 int incoming ATTRIBUTE_UNUSED)
12824 {
12825 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12826 }
12827
12828 /* Implements target hook vector_mode_supported_p. */
12829 static bool
12830 aarch64_vector_mode_supported_p (machine_mode mode)
12831 {
12832 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12833 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12834 }
12835
12836 /* Return appropriate SIMD container
12837 for MODE within a vector of WIDTH bits. */
12838 static machine_mode
12839 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12840 {
12841 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12842 switch (mode)
12843 {
12844 case E_DFmode:
12845 return VNx2DFmode;
12846 case E_SFmode:
12847 return VNx4SFmode;
12848 case E_HFmode:
12849 return VNx8HFmode;
12850 case E_DImode:
12851 return VNx2DImode;
12852 case E_SImode:
12853 return VNx4SImode;
12854 case E_HImode:
12855 return VNx8HImode;
12856 case E_QImode:
12857 return VNx16QImode;
12858 default:
12859 return word_mode;
12860 }
12861
12862 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12863 if (TARGET_SIMD)
12864 {
12865 if (known_eq (width, 128))
12866 switch (mode)
12867 {
12868 case E_DFmode:
12869 return V2DFmode;
12870 case E_SFmode:
12871 return V4SFmode;
12872 case E_HFmode:
12873 return V8HFmode;
12874 case E_SImode:
12875 return V4SImode;
12876 case E_HImode:
12877 return V8HImode;
12878 case E_QImode:
12879 return V16QImode;
12880 case E_DImode:
12881 return V2DImode;
12882 default:
12883 break;
12884 }
12885 else
12886 switch (mode)
12887 {
12888 case E_SFmode:
12889 return V2SFmode;
12890 case E_HFmode:
12891 return V4HFmode;
12892 case E_SImode:
12893 return V2SImode;
12894 case E_HImode:
12895 return V4HImode;
12896 case E_QImode:
12897 return V8QImode;
12898 default:
12899 break;
12900 }
12901 }
12902 return word_mode;
12903 }
12904
12905 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12906 static machine_mode
12907 aarch64_preferred_simd_mode (scalar_mode mode)
12908 {
12909 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12910 return aarch64_simd_container_mode (mode, bits);
12911 }
12912
12913 /* Return a list of possible vector sizes for the vectorizer
12914 to iterate over. */
12915 static void
12916 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12917 {
12918 if (TARGET_SVE)
12919 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12920 sizes->safe_push (16);
12921 sizes->safe_push (8);
12922 }
12923
12924 /* Implement TARGET_MANGLE_TYPE. */
12925
12926 static const char *
12927 aarch64_mangle_type (const_tree type)
12928 {
12929 /* The AArch64 ABI documents say that "__va_list" has to be
12930 mangled as if it is in the "std" namespace. */
12931 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12932 return "St9__va_list";
12933
12934 /* Half-precision float. */
12935 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12936 return "Dh";
12937
12938 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12939 builtin types. */
12940 if (TYPE_NAME (type) != NULL)
12941 return aarch64_mangle_builtin_type (type);
12942
12943 /* Use the default mangling. */
12944 return NULL;
12945 }
12946
12947 /* Find the first rtx_insn before insn that will generate an assembly
12948 instruction. */
12949
12950 static rtx_insn *
12951 aarch64_prev_real_insn (rtx_insn *insn)
12952 {
12953 if (!insn)
12954 return NULL;
12955
12956 do
12957 {
12958 insn = prev_real_insn (insn);
12959 }
12960 while (insn && recog_memoized (insn) < 0);
12961
12962 return insn;
12963 }
12964
12965 static bool
12966 is_madd_op (enum attr_type t1)
12967 {
12968 unsigned int i;
12969 /* A number of these may be AArch32 only. */
12970 enum attr_type mlatypes[] = {
12971 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12972 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12973 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12974 };
12975
12976 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12977 {
12978 if (t1 == mlatypes[i])
12979 return true;
12980 }
12981
12982 return false;
12983 }
12984
12985 /* Check if there is a register dependency between a load and the insn
12986 for which we hold recog_data. */
12987
12988 static bool
12989 dep_between_memop_and_curr (rtx memop)
12990 {
12991 rtx load_reg;
12992 int opno;
12993
12994 gcc_assert (GET_CODE (memop) == SET);
12995
12996 if (!REG_P (SET_DEST (memop)))
12997 return false;
12998
12999 load_reg = SET_DEST (memop);
13000 for (opno = 1; opno < recog_data.n_operands; opno++)
13001 {
13002 rtx operand = recog_data.operand[opno];
13003 if (REG_P (operand)
13004 && reg_overlap_mentioned_p (load_reg, operand))
13005 return true;
13006
13007 }
13008 return false;
13009 }
13010
13011
13012 /* When working around the Cortex-A53 erratum 835769,
13013 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13014 instruction and has a preceding memory instruction such that a NOP
13015 should be inserted between them. */
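/* For example, with -mfix-cortex-a53-835769 a sequence such as

	ldr	x10, [sp, 16]
	madd	x0, x1, x2, x3

   in which the multiply-accumulate does not depend on the load is emitted
   as

	ldr	x10, [sp, 16]
	nop	// between mem op and mult-accumulate
	madd	x0, x1, x2, x3

   (see aarch64_final_prescan_insn below).  */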
13016
13017 bool
13018 aarch64_madd_needs_nop (rtx_insn* insn)
13019 {
13020 enum attr_type attr_type;
13021 rtx_insn *prev;
13022 rtx body;
13023
13024 if (!TARGET_FIX_ERR_A53_835769)
13025 return false;
13026
13027 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13028 return false;
13029
13030 attr_type = get_attr_type (insn);
13031 if (!is_madd_op (attr_type))
13032 return false;
13033
13034 prev = aarch64_prev_real_insn (insn);
13035 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13036 Restore recog state to INSN to avoid state corruption. */
13037 extract_constrain_insn_cached (insn);
13038
13039 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13040 return false;
13041
13042 body = single_set (prev);
13043
13044 /* If the previous insn is a memory op and there is no dependency between
13045 it and the DImode madd, emit a NOP between them. If body is NULL then we
13046 have a complex memory operation, probably a load/store pair.
13047 Be conservative for now and emit a NOP. */
13048 if (GET_MODE (recog_data.operand[0]) == DImode
13049 && (!body || !dep_between_memop_and_curr (body)))
13050 return true;
13051
13052 return false;
13053
13054 }
13055
13056
13057 /* Implement FINAL_PRESCAN_INSN. */
13058
13059 void
13060 aarch64_final_prescan_insn (rtx_insn *insn)
13061 {
13062 if (aarch64_madd_needs_nop (insn))
13063 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13064 }
13065
13066
13067 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13068 instruction. */
13069
13070 bool
13071 aarch64_sve_index_immediate_p (rtx base_or_step)
13072 {
13073 return (CONST_INT_P (base_or_step)
13074 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13075 }
13076
13077 /* Return true if X is a valid immediate for the SVE ADD and SUB
13078 instructions. Negate X first if NEGATE_P is true. */
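/* That is, after reduction to the element size the value must be an
   unsigned 8-bit immediate, optionally shifted left by 8: for 32-bit
   elements #1 and #0x3500 are valid but #0x101 is not.  */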
13079
13080 bool
13081 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13082 {
13083 rtx elt;
13084
13085 if (!const_vec_duplicate_p (x, &elt)
13086 || !CONST_INT_P (elt))
13087 return false;
13088
13089 HOST_WIDE_INT val = INTVAL (elt);
13090 if (negate_p)
13091 val = -val;
13092 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13093
13094 if (val & 0xff)
13095 return IN_RANGE (val, 0, 0xff);
13096 return IN_RANGE (val, 0, 0xff00);
13097 }
13098
13099 /* Return true if X is a valid immediate operand for an SVE logical
13100 instruction such as AND. */
13101
13102 bool
13103 aarch64_sve_bitmask_immediate_p (rtx x)
13104 {
13105 rtx elt;
13106
13107 return (const_vec_duplicate_p (x, &elt)
13108 && CONST_INT_P (elt)
13109 && aarch64_bitmask_imm (INTVAL (elt),
13110 GET_MODE_INNER (GET_MODE (x))));
13111 }
13112
13113 /* Return true if X is a valid immediate for the SVE DUP and CPY
13114 instructions. */
13115
13116 bool
13117 aarch64_sve_dup_immediate_p (rtx x)
13118 {
13119 rtx elt;
13120
13121 if (!const_vec_duplicate_p (x, &elt)
13122 || !CONST_INT_P (elt))
13123 return false;
13124
13125 HOST_WIDE_INT val = INTVAL (elt);
13126 if (val & 0xff)
13127 return IN_RANGE (val, -0x80, 0x7f);
13128 return IN_RANGE (val, -0x8000, 0x7f00);
13129 }
13130
13131 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13132 SIGNED_P says whether the operand is signed rather than unsigned. */
13133
13134 bool
13135 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13136 {
13137 rtx elt;
13138
13139 return (const_vec_duplicate_p (x, &elt)
13140 && CONST_INT_P (elt)
13141 && (signed_p
13142 ? IN_RANGE (INTVAL (elt), -16, 15)
13143 : IN_RANGE (INTVAL (elt), 0, 127)));
13144 }
13145
13146 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13147 instruction. Negate X first if NEGATE_P is true. */
13148
13149 bool
13150 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13151 {
13152 rtx elt;
13153 REAL_VALUE_TYPE r;
13154
13155 if (!const_vec_duplicate_p (x, &elt)
13156 || GET_CODE (elt) != CONST_DOUBLE)
13157 return false;
13158
13159 r = *CONST_DOUBLE_REAL_VALUE (elt);
13160
13161 if (negate_p)
13162 r = real_value_negate (&r);
13163
13164 if (real_equal (&r, &dconst1))
13165 return true;
13166 if (real_equal (&r, &dconsthalf))
13167 return true;
13168 return false;
13169 }
13170
13171 /* Return true if X is a valid immediate operand for an SVE FMUL
13172 instruction. */
13173
13174 bool
13175 aarch64_sve_float_mul_immediate_p (rtx x)
13176 {
13177 rtx elt;
13178
13179 /* GCC will never generate a multiply with an immediate of 2, so there is no
13180 point testing for it (even though it is a valid constant). */
13181 return (const_vec_duplicate_p (x, &elt)
13182 && GET_CODE (elt) == CONST_DOUBLE
13183 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13184 }
13185
13186 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13187 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13188 is nonnull, use it to describe valid immediates. */
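/* For example, for a MOV-type check:
     0x00ab0000	is matched as a 4-byte immediate: 0xab, LSL #16
     0xab00ab00	is matched as a 2-byte immediate: 0xab, LSL #8
     0x00abffff	is matched as 0xab, MSL #16 (the shifted-ones form).  */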
13189 static bool
13190 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13191 simd_immediate_info *info,
13192 enum simd_immediate_check which,
13193 simd_immediate_info::insn_type insn)
13194 {
13195 /* Try a 4-byte immediate with LSL. */
13196 for (unsigned int shift = 0; shift < 32; shift += 8)
13197 if ((val32 & (0xff << shift)) == val32)
13198 {
13199 if (info)
13200 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13201 simd_immediate_info::LSL, shift);
13202 return true;
13203 }
13204
13205 /* Try a 2-byte immediate with LSL. */
13206 unsigned int imm16 = val32 & 0xffff;
13207 if (imm16 == (val32 >> 16))
13208 for (unsigned int shift = 0; shift < 16; shift += 8)
13209 if ((imm16 & (0xff << shift)) == imm16)
13210 {
13211 if (info)
13212 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13213 simd_immediate_info::LSL, shift);
13214 return true;
13215 }
13216
13217 /* Try a 4-byte immediate with MSL, except for cases that MVN
13218 can handle. */
13219 if (which == AARCH64_CHECK_MOV)
13220 for (unsigned int shift = 8; shift < 24; shift += 8)
13221 {
13222 unsigned int low = (1 << shift) - 1;
13223 if (((val32 & (0xff << shift)) | low) == val32)
13224 {
13225 if (info)
13226 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13227 simd_immediate_info::MSL, shift);
13228 return true;
13229 }
13230 }
13231
13232 return false;
13233 }
13234
13235 /* Return true if replicating VAL64 is a valid immediate for the
13236 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13237 use it to describe valid immediates. */
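/* For example, when WHICH is AARCH64_CHECK_MOV, 0x2a2a2a2a2a2a2a2a is
   accepted as a replicated byte (QImode, 0x2a) and 0x00ff00ffffff0000 as a
   bit-to-bytemask (every byte is either 0 or 0xff), while
   0x0123456789abcdef is rejected.  */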
13238 static bool
13239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13240 simd_immediate_info *info,
13241 enum simd_immediate_check which)
13242 {
13243 unsigned int val32 = val64 & 0xffffffff;
13244 unsigned int val16 = val64 & 0xffff;
13245 unsigned int val8 = val64 & 0xff;
13246
13247 if (val32 == (val64 >> 32))
13248 {
13249 if ((which & AARCH64_CHECK_ORR) != 0
13250 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13251 simd_immediate_info::MOV))
13252 return true;
13253
13254 if ((which & AARCH64_CHECK_BIC) != 0
13255 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13256 simd_immediate_info::MVN))
13257 return true;
13258
13259 /* Try using a replicated byte. */
13260 if (which == AARCH64_CHECK_MOV
13261 && val16 == (val32 >> 16)
13262 && val8 == (val16 >> 8))
13263 {
13264 if (info)
13265 *info = simd_immediate_info (QImode, val8);
13266 return true;
13267 }
13268 }
13269
13270 /* Try using a bit-to-bytemask. */
13271 if (which == AARCH64_CHECK_MOV)
13272 {
13273 unsigned int i;
13274 for (i = 0; i < 64; i += 8)
13275 {
13276 unsigned char byte = (val64 >> i) & 0xff;
13277 if (byte != 0 && byte != 0xff)
13278 break;
13279 }
13280 if (i == 64)
13281 {
13282 if (info)
13283 *info = simd_immediate_info (DImode, val64);
13284 return true;
13285 }
13286 }
13287 return false;
13288 }
13289
13290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13291 instruction. If INFO is nonnull, use it to describe valid immediates. */
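/* For example, 0x0101010101010101 is a QImode DUP of #1,
   0x2000200020002000 is an HImode DUP with LSL #8 (0x20 << 8), and
   0x00ff00ff00ff00ff is an HImode DUPM (bitmask) immediate.  */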
13292
13293 static bool
13294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13295 simd_immediate_info *info)
13296 {
13297 scalar_int_mode mode = DImode;
13298 unsigned int val32 = val64 & 0xffffffff;
13299 if (val32 == (val64 >> 32))
13300 {
13301 mode = SImode;
13302 unsigned int val16 = val32 & 0xffff;
13303 if (val16 == (val32 >> 16))
13304 {
13305 mode = HImode;
13306 unsigned int val8 = val16 & 0xff;
13307 if (val8 == (val16 >> 8))
13308 mode = QImode;
13309 }
13310 }
13311 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13312 if (IN_RANGE (val, -0x80, 0x7f))
13313 {
13314 /* DUP with no shift. */
13315 if (info)
13316 *info = simd_immediate_info (mode, val);
13317 return true;
13318 }
13319 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13320 {
13321 /* DUP with LSL #8. */
13322 if (info)
13323 *info = simd_immediate_info (mode, val);
13324 return true;
13325 }
13326 if (aarch64_bitmask_imm (val64, mode))
13327 {
13328 /* DUPM. */
13329 if (info)
13330 *info = simd_immediate_info (mode, val);
13331 return true;
13332 }
13333 return false;
13334 }
13335
13336 /* Return true if OP is a valid SIMD immediate for the operation
13337 described by WHICH. If INFO is nonnull, use it to describe valid
13338 immediates. */
13339 bool
13340 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13341 enum simd_immediate_check which)
13342 {
13343 machine_mode mode = GET_MODE (op);
13344 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13345 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13346 return false;
13347
13348 scalar_mode elt_mode = GET_MODE_INNER (mode);
13349 rtx base, step;
13350 unsigned int n_elts;
13351 if (GET_CODE (op) == CONST_VECTOR
13352 && CONST_VECTOR_DUPLICATE_P (op))
13353 n_elts = CONST_VECTOR_NPATTERNS (op);
13354 else if ((vec_flags & VEC_SVE_DATA)
13355 && const_vec_series_p (op, &base, &step))
13356 {
13357 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13358 if (!aarch64_sve_index_immediate_p (base)
13359 || !aarch64_sve_index_immediate_p (step))
13360 return false;
13361
13362 if (info)
13363 *info = simd_immediate_info (elt_mode, base, step);
13364 return true;
13365 }
13366 else if (GET_CODE (op) == CONST_VECTOR
13367 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13368 /* N_ELTS set above. */;
13369 else
13370 return false;
13371
13372 /* Handle PFALSE and PTRUE. */
13373 if (vec_flags & VEC_SVE_PRED)
13374 return (op == CONST0_RTX (mode)
13375 || op == CONSTM1_RTX (mode));
13376
13377 scalar_float_mode elt_float_mode;
13378 if (n_elts == 1
13379 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13380 {
13381 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13382 if (aarch64_float_const_zero_rtx_p (elt)
13383 || aarch64_float_const_representable_p (elt))
13384 {
13385 if (info)
13386 *info = simd_immediate_info (elt_float_mode, elt);
13387 return true;
13388 }
13389 }
13390
13391 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13392 if (elt_size > 8)
13393 return false;
13394
13395 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13396
13397 /* Expand the vector constant out into a byte vector, with the least
13398 significant byte of the register first. */
13399 auto_vec<unsigned char, 16> bytes;
13400 bytes.reserve (n_elts * elt_size);
13401 for (unsigned int i = 0; i < n_elts; i++)
13402 {
13403 /* The vector is provided in gcc endian-neutral fashion.
13404 For aarch64_be Advanced SIMD, it must be laid out in the vector
13405 register in reverse order. */
13406 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13407 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13408
13409 if (elt_mode != elt_int_mode)
13410 elt = gen_lowpart (elt_int_mode, elt);
13411
13412 if (!CONST_INT_P (elt))
13413 return false;
13414
13415 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13416 for (unsigned int byte = 0; byte < elt_size; byte++)
13417 {
13418 bytes.quick_push (elt_val & 0xff);
13419 elt_val >>= BITS_PER_UNIT;
13420 }
13421 }
13422
13423 /* The immediate must repeat every eight bytes. */
13424 unsigned int nbytes = bytes.length ();
13425 for (unsigned i = 8; i < nbytes; ++i)
13426 if (bytes[i] != bytes[i - 8])
13427 return false;
13428
13429 /* Get the repeating 8-byte value as an integer. No endian correction
13430 is needed here because bytes is already in lsb-first order. */
13431 unsigned HOST_WIDE_INT val64 = 0;
13432 for (unsigned int i = 0; i < 8; i++)
13433 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13434 << (i * BITS_PER_UNIT));
13435
13436 if (vec_flags & VEC_SVE_DATA)
13437 return aarch64_sve_valid_immediate (val64, info);
13438 else
13439 return aarch64_advsimd_valid_immediate (val64, info, which);
13440 }
13441
13442 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13443 has a step in the range of INDEX. Return the index expression if so,
13444 otherwise return null. */
13445 rtx
13446 aarch64_check_zero_based_sve_index_immediate (rtx x)
13447 {
13448 rtx base, step;
13449 if (const_vec_series_p (x, &base, &step)
13450 && base == const0_rtx
13451 && aarch64_sve_index_immediate_p (step))
13452 return step;
13453 return NULL_RTX;
13454 }
13455
13456 /* Check if immediate shift constants are within range. */
13457 bool
13458 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13459 {
13460 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13461 if (left)
13462 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13463 else
13464 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13465 }
13466
13467 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13468 operation of width WIDTH at bit position POS. */
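/* For example, WIDTH == 8 and POS == 16 give the mask 0x00ff0000.  */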
13469
13470 rtx
13471 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13472 {
13473 gcc_assert (CONST_INT_P (width));
13474 gcc_assert (CONST_INT_P (pos));
13475
13476 unsigned HOST_WIDE_INT mask
13477 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13478 return GEN_INT (mask << UINTVAL (pos));
13479 }
13480
13481 bool
13482 aarch64_mov_operand_p (rtx x, machine_mode mode)
13483 {
13484 if (GET_CODE (x) == HIGH
13485 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13486 return true;
13487
13488 if (CONST_INT_P (x))
13489 return true;
13490
13491 if (VECTOR_MODE_P (GET_MODE (x)))
13492 return aarch64_simd_valid_immediate (x, NULL);
13493
13494 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13495 return true;
13496
13497 if (aarch64_sve_cnt_immediate_p (x))
13498 return true;
13499
13500 return aarch64_classify_symbolic_expression (x)
13501 == SYMBOL_TINY_ABSOLUTE;
13502 }
13503
13504 /* Return a const_int vector of VAL. */
13505 rtx
13506 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13507 {
13508 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13509 return gen_const_vec_duplicate (mode, c);
13510 }
13511
13512 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13513
13514 bool
13515 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13516 {
13517 machine_mode vmode;
13518
13519 vmode = aarch64_simd_container_mode (mode, 64);
13520 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13521 return aarch64_simd_valid_immediate (op_v, NULL);
13522 }
13523
13524 /* Construct and return a PARALLEL RTX vector with elements numbering the
13525 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13526 the vector - from the perspective of the architecture. This does not
13527 line up with GCC's perspective on lane numbers, so we end up with
13528 different masks depending on our target endian-ness. The diagram
13529 below may help. We must draw the distinction when building masks
13530 which select one half of the vector. An instruction selecting
13531 architectural low-lanes for a big-endian target, must be described using
13532 a mask selecting GCC high-lanes.
13533
13534 Big-Endian Little-Endian
13535
13536 GCC 0 1 2 3 3 2 1 0
13537 | x | x | x | x | | x | x | x | x |
13538 Architecture 3 2 1 0 3 2 1 0
13539
13540 Low Mask: { 2, 3 } { 0, 1 }
13541 High Mask: { 0, 1 } { 2, 3 }
13542
13543 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13544
13545 rtx
13546 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13547 {
13548 rtvec v = rtvec_alloc (nunits / 2);
13549 int high_base = nunits / 2;
13550 int low_base = 0;
13551 int base;
13552 rtx t1;
13553 int i;
13554
13555 if (BYTES_BIG_ENDIAN)
13556 base = high ? low_base : high_base;
13557 else
13558 base = high ? high_base : low_base;
13559
13560 for (i = 0; i < nunits / 2; i++)
13561 RTVEC_ELT (v, i) = GEN_INT (base + i);
13562
13563 t1 = gen_rtx_PARALLEL (mode, v);
13564 return t1;
13565 }
13566
13567 /* Check OP for validity as a PARALLEL RTX vector with elements
13568 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13569 from the perspective of the architecture. See the diagram above
13570 aarch64_simd_vect_par_cnst_half for more details. */
13571
13572 bool
13573 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13574 bool high)
13575 {
13576 int nelts;
13577 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13578 return false;
13579
13580 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13581 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13582 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13583 int i = 0;
13584
13585 if (count_op != count_ideal)
13586 return false;
13587
13588 for (i = 0; i < count_ideal; i++)
13589 {
13590 rtx elt_op = XVECEXP (op, 0, i);
13591 rtx elt_ideal = XVECEXP (ideal, 0, i);
13592
13593 if (!CONST_INT_P (elt_op)
13594 || INTVAL (elt_ideal) != INTVAL (elt_op))
13595 return false;
13596 }
13597 return true;
13598 }
13599
13600 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13601 HIGH (exclusive). */
13602 void
13603 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13604 const_tree exp)
13605 {
13606 HOST_WIDE_INT lane;
13607 gcc_assert (CONST_INT_P (operand));
13608 lane = INTVAL (operand);
13609
13610 if (lane < low || lane >= high)
13611 {
13612 if (exp)
13613 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13614 else
13615 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13616 }
13617 }
13618
13619 /* Perform endian correction on lane number N, which indexes a vector
13620 of mode MODE, and return the result as an SImode rtx. */
13621
13622 rtx
13623 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13624 {
13625 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13626 }
13627
13628 /* Return TRUE if OP is a valid vector addressing mode. */
13629
13630 bool
13631 aarch64_simd_mem_operand_p (rtx op)
13632 {
13633 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13634 || REG_P (XEXP (op, 0)));
13635 }
13636
13637 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13638
13639 bool
13640 aarch64_sve_ld1r_operand_p (rtx op)
13641 {
13642 struct aarch64_address_info addr;
13643 scalar_mode mode;
13644
13645 return (MEM_P (op)
13646 && is_a <scalar_mode> (GET_MODE (op), &mode)
13647 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13648 && addr.type == ADDRESS_REG_IMM
13649 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13650 }
13651
13652 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13653 The conditions for STR are the same. */
13654 bool
13655 aarch64_sve_ldr_operand_p (rtx op)
13656 {
13657 struct aarch64_address_info addr;
13658
13659 return (MEM_P (op)
13660 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13661 false, ADDR_QUERY_ANY)
13662 && addr.type == ADDRESS_REG_IMM);
13663 }
13664
13665 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13666 We need to be able to access the individual pieces, so the range
13667 is different from LD[234] and ST[234]. */
13668 bool
13669 aarch64_sve_struct_memory_operand_p (rtx op)
13670 {
13671 if (!MEM_P (op))
13672 return false;
13673
13674 machine_mode mode = GET_MODE (op);
13675 struct aarch64_address_info addr;
13676 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13677 ADDR_QUERY_ANY)
13678 || addr.type != ADDRESS_REG_IMM)
13679 return false;
13680
13681 poly_int64 first = addr.const_offset;
13682 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13683 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13684 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13685 }
13686
13687 /* Emit a register copy from operand to operand, taking care not to
13688 early-clobber source registers in the process.
13689
13690 COUNT is the number of components into which the copy needs to be
13691 decomposed. */
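/* For example, copying q1:q2 into q2:q3 must copy q2 to q3 before q1 to q2,
   whereas copying q2:q3 into q1:q2 can proceed in ascending order.  */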
13692 void
13693 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13694 unsigned int count)
13695 {
13696 unsigned int i;
13697 int rdest = REGNO (operands[0]);
13698 int rsrc = REGNO (operands[1]);
13699
13700 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13701 || rdest < rsrc)
13702 for (i = 0; i < count; i++)
13703 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13704 gen_rtx_REG (mode, rsrc + i));
13705 else
13706 for (i = 0; i < count; i++)
13707 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13708 gen_rtx_REG (mode, rsrc + count - i - 1));
13709 }
13710
13711 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13712 one of VSTRUCT modes: OI, CI, or XI. */
13713 int
13714 aarch64_simd_attr_length_rglist (machine_mode mode)
13715 {
13716 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13717 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13718 }
13719
13720 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13721 alignment of a vector to 128 bits. SVE predicates have an alignment of
13722 16 bits. */
13723 static HOST_WIDE_INT
13724 aarch64_simd_vector_alignment (const_tree type)
13725 {
13726 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13727 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13728 be set for non-predicate vectors of booleans. Modes are the most
13729 direct way we have of identifying real SVE predicate types. */
13730 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13731 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13732 return MIN (align, 128);
13733 }
13734
13735 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13736 static HOST_WIDE_INT
13737 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13738 {
13739 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13740 {
13741 /* If the length of the vector is fixed, try to align to that length,
13742 otherwise don't try to align at all. */
13743 HOST_WIDE_INT result;
13744 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13745 result = TYPE_ALIGN (TREE_TYPE (type));
13746 return result;
13747 }
13748 return TYPE_ALIGN (type);
13749 }
13750
13751 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13752 static bool
13753 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13754 {
13755 if (is_packed)
13756 return false;
13757
13758 /* For fixed-length vectors, check that the vectorizer will aim for
13759 full-vector alignment. This isn't true for generic GCC vectors
13760 that are wider than the ABI maximum of 128 bits. */
13761 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13762 && (wi::to_widest (TYPE_SIZE (type))
13763 != aarch64_vectorize_preferred_vector_alignment (type)))
13764 return false;
13765
13766 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13767 return true;
13768 }
13769
13770 /* Return true if the vector misalignment factor is supported by the
13771 target. */
13772 static bool
13773 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13774 const_tree type, int misalignment,
13775 bool is_packed)
13776 {
13777 if (TARGET_SIMD && STRICT_ALIGNMENT)
13778 {
13779 /* Return if movmisalign pattern is not supported for this mode. */
13780 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13781 return false;
13782
13783 /* Misalignment factor is unknown at compile time. */
13784 if (misalignment == -1)
13785 return false;
13786 }
13787 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13788 is_packed);
13789 }
13790
13791 /* If VALS is a vector constant that can be loaded into a register
13792 using DUP, generate instructions to do so and return an RTX to
13793 assign to the register. Otherwise return NULL_RTX. */
13794 static rtx
13795 aarch64_simd_dup_constant (rtx vals)
13796 {
13797 machine_mode mode = GET_MODE (vals);
13798 machine_mode inner_mode = GET_MODE_INNER (mode);
13799 rtx x;
13800
13801 if (!const_vec_duplicate_p (vals, &x))
13802 return NULL_RTX;
13803
13804 /* We can load this constant by using DUP and a constant in a
13805 single ARM register. This will be cheaper than a vector
13806 load. */
13807 x = copy_to_mode_reg (inner_mode, x);
13808 return gen_vec_duplicate (mode, x);
13809 }
13810
13811
13812 /* Generate code to load VALS, which is a PARALLEL containing only
13813 constants (for vec_init) or CONST_VECTOR, efficiently into a
13814 register. Returns an RTX to copy into the register, or NULL_RTX
13815 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13816 static rtx
13817 aarch64_simd_make_constant (rtx vals)
13818 {
13819 machine_mode mode = GET_MODE (vals);
13820 rtx const_dup;
13821 rtx const_vec = NULL_RTX;
13822 int n_const = 0;
13823 int i;
13824
13825 if (GET_CODE (vals) == CONST_VECTOR)
13826 const_vec = vals;
13827 else if (GET_CODE (vals) == PARALLEL)
13828 {
13829 /* A CONST_VECTOR must contain only CONST_INTs and
13830 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13831 Only store valid constants in a CONST_VECTOR. */
13832 int n_elts = XVECLEN (vals, 0);
13833 for (i = 0; i < n_elts; ++i)
13834 {
13835 rtx x = XVECEXP (vals, 0, i);
13836 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13837 n_const++;
13838 }
13839 if (n_const == n_elts)
13840 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13841 }
13842 else
13843 gcc_unreachable ();
13844
13845 if (const_vec != NULL_RTX
13846 && aarch64_simd_valid_immediate (const_vec, NULL))
13847 /* Load using MOVI/MVNI. */
13848 return const_vec;
13849 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13850 /* Loaded using DUP. */
13851 return const_dup;
13852 else if (const_vec != NULL_RTX)
13853 /* Load from constant pool. We can not take advantage of single-cycle
13854 LD1 because we need a PC-relative addressing mode. */
13855 return const_vec;
13856 else
13857 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13858 We can not construct an initializer. */
13859 return NULL_RTX;
13860 }
13861
13862 /* Expand a vector initialisation sequence, such that TARGET is
13863 initialised to contain VALS. */
13864
13865 void
13866 aarch64_expand_vector_init (rtx target, rtx vals)
13867 {
13868 machine_mode mode = GET_MODE (target);
13869 scalar_mode inner_mode = GET_MODE_INNER (mode);
13870 /* The number of vector elements. */
13871 int n_elts = XVECLEN (vals, 0);
13872 /* The number of vector elements which are not constant. */
13873 int n_var = 0;
13874 rtx any_const = NULL_RTX;
13875 /* The first element of vals. */
13876 rtx v0 = XVECEXP (vals, 0, 0);
13877 bool all_same = true;
13878
13879 /* Count the number of variable elements to initialise. */
13880 for (int i = 0; i < n_elts; ++i)
13881 {
13882 rtx x = XVECEXP (vals, 0, i);
13883 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13884 ++n_var;
13885 else
13886 any_const = x;
13887
13888 all_same &= rtx_equal_p (x, v0);
13889 }
13890
13891 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13892 how best to handle this. */
13893 if (n_var == 0)
13894 {
13895 rtx constant = aarch64_simd_make_constant (vals);
13896 if (constant != NULL_RTX)
13897 {
13898 emit_move_insn (target, constant);
13899 return;
13900 }
13901 }
13902
13903 /* Splat a single non-constant element if we can. */
13904 if (all_same)
13905 {
13906 rtx x = copy_to_mode_reg (inner_mode, v0);
13907 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13908 return;
13909 }
13910
13911 enum insn_code icode = optab_handler (vec_set_optab, mode);
13912 gcc_assert (icode != CODE_FOR_nothing);
13913
13914 /* If there are only variable elements, try to optimize
13915 the insertion using dup for the most common element
13916 followed by insertions. */
13917
13918 /* The algorithm will fill matches[*][0] with the earliest matching element,
13919 and matches[X][1] with the count of duplicate elements (if X is the
13920 earliest element which has duplicates). */
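  /* For illustration: given a V4SI PARALLEL { x, y, x, x }, the loop below
     records matches[0] = {0, 3}, matches[1] = {1, 1} and
     matches[2] = matches[3] = {0, 0}, so x is broadcast into every lane
     with DUP and y is then inserted into lane 1.  */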
13921
13922 if (n_var == n_elts && n_elts <= 16)
13923 {
13924 int matches[16][2] = {0};
13925 for (int i = 0; i < n_elts; i++)
13926 {
13927 for (int j = 0; j <= i; j++)
13928 {
13929 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13930 {
13931 matches[i][0] = j;
13932 matches[j][1]++;
13933 break;
13934 }
13935 }
13936 }
13937 int maxelement = 0;
13938 int maxv = 0;
13939 for (int i = 0; i < n_elts; i++)
13940 if (matches[i][1] > maxv)
13941 {
13942 maxelement = i;
13943 maxv = matches[i][1];
13944 }
13945
13946 /* Create a duplicate of the most common element. */
13947 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13948 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13949
13950 /* Insert the rest. */
13951 for (int i = 0; i < n_elts; i++)
13952 {
13953 rtx x = XVECEXP (vals, 0, i);
13954 if (matches[i][0] == maxelement)
13955 continue;
13956 x = copy_to_mode_reg (inner_mode, x);
13957 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13958 }
13959 return;
13960 }
13961
13962 /* Initialise a vector which is part-variable. We want to first try
13963 to build those lanes which are constant in the most efficient way we
13964 can. */
13965 if (n_var != n_elts)
13966 {
13967 rtx copy = copy_rtx (vals);
13968
13969 /* Load constant part of vector. We really don't care what goes into the
13970 parts we will overwrite, but we're more likely to be able to load the
13971 constant efficiently if it has fewer, larger, repeating parts
13972 (see aarch64_simd_valid_immediate). */
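      /* For illustration: given a V4SI PARALLEL { x, 1, y, 1 }, lane 0
	 borrows the constant from lane 0 ^ 1 = 1 and lane 2 then borrows the
	 now-constant lane 2 ^ 2 = 0, so COPY becomes { 1, 1, 1, 1 }, a cheap
	 splat, before the variable lanes are overwritten below.  */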
13973 for (int i = 0; i < n_elts; i++)
13974 {
13975 rtx x = XVECEXP (vals, 0, i);
13976 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13977 continue;
13978 rtx subst = any_const;
13979 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13980 {
13981 /* Look in the copied vector, as more elements are const. */
13982 rtx test = XVECEXP (copy, 0, i ^ bit);
13983 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13984 {
13985 subst = test;
13986 break;
13987 }
13988 }
13989 XVECEXP (copy, 0, i) = subst;
13990 }
13991 aarch64_expand_vector_init (target, copy);
13992 }
13993
13994 /* Insert the variable lanes directly. */
13995 for (int i = 0; i < n_elts; i++)
13996 {
13997 rtx x = XVECEXP (vals, 0, i);
13998 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13999 continue;
14000 x = copy_to_mode_reg (inner_mode, x);
14001 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14002 }
14003 }
14004
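/* Implement TARGET_SHIFT_TRUNCATION_MASK.  For illustration: when
   SHIFT_COUNT_TRUNCATED holds, a scalar SImode shift gets a mask of 31,
   i.e. shift counts are interpreted modulo 32, while vector modes always
   get 0 because the Advanced SIMD shift instructions do not truncate their
   shift counts.  */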
14005 static unsigned HOST_WIDE_INT
14006 aarch64_shift_truncation_mask (machine_mode mode)
14007 {
14008 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14009 return 0;
14010 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14011 }
14012
14013 /* Select a format to encode pointers in exception handling data. */
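/* For illustration: with -mcmodel=small a global symbol is encoded as
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4, a 4-byte signed,
   pc-relative, indirect reference, while code models that cannot assume a
   4GB offset fall back to the 8-byte sdata8 form.  */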
14014 int
14015 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14016 {
14017 int type;
14018 switch (aarch64_cmodel)
14019 {
14020 case AARCH64_CMODEL_TINY:
14021 case AARCH64_CMODEL_TINY_PIC:
14022 case AARCH64_CMODEL_SMALL:
14023 case AARCH64_CMODEL_SMALL_PIC:
14024 case AARCH64_CMODEL_SMALL_SPIC:
14025 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14026 for everything. */
14027 type = DW_EH_PE_sdata4;
14028 break;
14029 default:
14030 /* No assumptions here. 8-byte relocs required. */
14031 type = DW_EH_PE_sdata8;
14032 break;
14033 }
14034 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14035 }
14036
14037 /* The last .arch and .tune assembly strings that we printed. */
14038 static std::string aarch64_last_printed_arch_string;
14039 static std::string aarch64_last_printed_tune_string;
14040
14041 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14042 by the function fndecl. */
14043
14044 void
14045 aarch64_declare_function_name (FILE *stream, const char* name,
14046 tree fndecl)
14047 {
14048 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14049
14050 struct cl_target_option *targ_options;
14051 if (target_parts)
14052 targ_options = TREE_TARGET_OPTION (target_parts);
14053 else
14054 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14055 gcc_assert (targ_options);
14056
14057 const struct processor *this_arch
14058 = aarch64_get_arch (targ_options->x_explicit_arch);
14059
14060 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14061 std::string extension
14062 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14063 this_arch->flags);
14064 /* Only update the assembler .arch string if it is distinct from the last
14065 such string we printed. */
14066 std::string to_print = this_arch->name + extension;
14067 if (to_print != aarch64_last_printed_arch_string)
14068 {
14069 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14070 aarch64_last_printed_arch_string = to_print;
14071 }
14072
14073   /* Print the cpu name we're tuning for in the comments; it might be
14074 useful to readers of the generated asm. Do it only when it changes
14075 from function to function and verbose assembly is requested. */
14076 const struct processor *this_tune
14077 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14078
14079 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14080 {
14081 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14082 this_tune->name);
14083 aarch64_last_printed_tune_string = this_tune->name;
14084 }
14085
14086 /* Don't forget the type directive for ELF. */
14087 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14088 ASM_OUTPUT_LABEL (stream, name);
14089 }
14090
14091 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14092
14093 static void
14094 aarch64_start_file (void)
14095 {
14096 struct cl_target_option *default_options
14097 = TREE_TARGET_OPTION (target_option_default_node);
14098
14099 const struct processor *default_arch
14100 = aarch64_get_arch (default_options->x_explicit_arch);
14101 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14102 std::string extension
14103 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14104 default_arch->flags);
14105
14106 aarch64_last_printed_arch_string = default_arch->name + extension;
14107 aarch64_last_printed_tune_string = "";
14108 asm_fprintf (asm_out_file, "\t.arch %s\n",
14109 aarch64_last_printed_arch_string.c_str ());
14110
14111 default_file_start ();
14112 }
14113
14114 /* Emit load exclusive. */
14115
14116 static void
14117 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14118 rtx mem, rtx model_rtx)
14119 {
14120 rtx (*gen) (rtx, rtx, rtx);
14121
14122 switch (mode)
14123 {
14124 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14125 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14126 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14127 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14128 default:
14129 gcc_unreachable ();
14130 }
14131
14132 emit_insn (gen (rval, mem, model_rtx));
14133 }
14134
14135 /* Emit store exclusive. */
14136
14137 static void
14138 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14139 rtx rval, rtx mem, rtx model_rtx)
14140 {
14141 rtx (*gen) (rtx, rtx, rtx, rtx);
14142
14143 switch (mode)
14144 {
14145 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14146 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14147 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14148 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14149 default:
14150 gcc_unreachable ();
14151 }
14152
14153 emit_insn (gen (bval, rval, mem, model_rtx));
14154 }
14155
14156 /* Mark the previous jump instruction as unlikely. */
14157
14158 static void
14159 aarch64_emit_unlikely_jump (rtx insn)
14160 {
14161 rtx_insn *jump = emit_jump_insn (insn);
14162 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14163 }
14164
14165 /* Expand a compare and swap pattern. */
14166
14167 void
14168 aarch64_expand_compare_and_swap (rtx operands[])
14169 {
14170 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14171 machine_mode mode, cmp_mode;
14172 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14173 int idx;
14174 gen_cas_fn gen;
14175 const gen_cas_fn split_cas[] =
14176 {
14177 gen_aarch64_compare_and_swapqi,
14178 gen_aarch64_compare_and_swaphi,
14179 gen_aarch64_compare_and_swapsi,
14180 gen_aarch64_compare_and_swapdi
14181 };
14182 const gen_cas_fn atomic_cas[] =
14183 {
14184 gen_aarch64_compare_and_swapqi_lse,
14185 gen_aarch64_compare_and_swaphi_lse,
14186 gen_aarch64_compare_and_swapsi_lse,
14187 gen_aarch64_compare_and_swapdi_lse
14188 };
14189
14190 bval = operands[0];
14191 rval = operands[1];
14192 mem = operands[2];
14193 oldval = operands[3];
14194 newval = operands[4];
14195 is_weak = operands[5];
14196 mod_s = operands[6];
14197 mod_f = operands[7];
14198 mode = GET_MODE (mem);
14199 cmp_mode = mode;
14200
14201 /* Normally the succ memory model must be stronger than fail, but in the
14202 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14203 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14204
14205 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14206 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14207 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14208
14209 switch (mode)
14210 {
14211 case E_QImode:
14212 case E_HImode:
14213 /* For short modes, we're going to perform the comparison in SImode,
14214 so do the zero-extension now. */
14215 cmp_mode = SImode;
14216 rval = gen_reg_rtx (SImode);
14217 oldval = convert_modes (SImode, mode, oldval, true);
14218 /* Fall through. */
14219
14220 case E_SImode:
14221 case E_DImode:
14222 /* Force the value into a register if needed. */
14223 if (!aarch64_plus_operand (oldval, mode))
14224 oldval = force_reg (cmp_mode, oldval);
14225 break;
14226
14227 default:
14228 gcc_unreachable ();
14229 }
14230
14231 switch (mode)
14232 {
14233 case E_QImode: idx = 0; break;
14234 case E_HImode: idx = 1; break;
14235 case E_SImode: idx = 2; break;
14236 case E_DImode: idx = 3; break;
14237 default:
14238 gcc_unreachable ();
14239 }
14240 if (TARGET_LSE)
14241 gen = atomic_cas[idx];
14242 else
14243 gen = split_cas[idx];
14244
14245 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14246
14247 if (mode == QImode || mode == HImode)
14248 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14249
14250 x = gen_rtx_REG (CCmode, CC_REGNUM);
14251 x = gen_rtx_EQ (SImode, x, const0_rtx);
14252 emit_insn (gen_rtx_SET (bval, x));
14253 }
14254
14255 /* Test whether the target supports using an atomic load-operate instruction.
14256 CODE is the operation and AFTER is TRUE if the data in memory after the
14257 operation should be returned and FALSE if the data before the operation
14258 should be returned. Returns FALSE if the operation isn't supported by the
14259 architecture. */
14260
14261 bool
14262 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14263 {
14264 if (!TARGET_LSE)
14265 return false;
14266
14267 switch (code)
14268 {
14269 case SET:
14270 case AND:
14271 case IOR:
14272 case XOR:
14273 case MINUS:
14274 case PLUS:
14275 return true;
14276 default:
14277 return false;
14278 }
14279 }
14280
14281 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14282 sequence implementing an atomic operation. */
14283
14284 static void
14285 aarch64_emit_post_barrier (enum memmodel model)
14286 {
14287 const enum memmodel base_model = memmodel_base (model);
14288
14289 if (is_mm_sync (model)
14290 && (base_model == MEMMODEL_ACQUIRE
14291 || base_model == MEMMODEL_ACQ_REL
14292 || base_model == MEMMODEL_SEQ_CST))
14293 {
14294 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14295 }
14296 }
14297
14298 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14299 for the data in memory. EXPECTED is the value expected to be in memory.
14300 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14301 is the memory ordering to use. */
14302
14303 void
14304 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14305 rtx expected, rtx desired,
14306 rtx model)
14307 {
14308 rtx (*gen) (rtx, rtx, rtx, rtx);
14309 machine_mode mode;
14310
14311 mode = GET_MODE (mem);
14312
14313 switch (mode)
14314 {
14315 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14316 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14317 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14318 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14319 default:
14320 gcc_unreachable ();
14321 }
14322
14323 /* Move the expected value into the CAS destination register. */
14324 emit_insn (gen_rtx_SET (rval, expected));
14325
14326 /* Emit the CAS. */
14327 emit_insn (gen (rval, mem, desired, model));
14328
14329 /* Compare the expected value with the value loaded by the CAS, to establish
14330 whether the swap was made. */
14331 aarch64_gen_compare_reg (EQ, rval, expected);
14332 }
14333
14334 /* Split a compare and swap pattern. */
14335
14336 void
14337 aarch64_split_compare_and_swap (rtx operands[])
14338 {
14339 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
14340 gcc_assert (epilogue_completed);
14341
14342 rtx rval, mem, oldval, newval, scratch;
14343 machine_mode mode;
14344 bool is_weak;
14345 rtx_code_label *label1, *label2;
14346 rtx x, cond;
14347 enum memmodel model;
14348 rtx model_rtx;
14349
14350 rval = operands[0];
14351 mem = operands[1];
14352 oldval = operands[2];
14353 newval = operands[3];
14354 is_weak = (operands[4] != const0_rtx);
14355 model_rtx = operands[5];
14356 scratch = operands[7];
14357 mode = GET_MODE (mem);
14358 model = memmodel_from_int (INTVAL (model_rtx));
14359
14360 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14361 loop:
14362 .label1:
14363 LD[A]XR rval, [mem]
14364 CBNZ rval, .label2
14365 ST[L]XR scratch, newval, [mem]
14366 CBNZ scratch, .label1
14367 .label2:
14368 CMP rval, 0. */
14369 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14370
14371 label1 = NULL;
14372 if (!is_weak)
14373 {
14374 label1 = gen_label_rtx ();
14375 emit_label (label1);
14376 }
14377 label2 = gen_label_rtx ();
14378
14379 /* The initial load can be relaxed for a __sync operation since a final
14380 barrier will be emitted to stop code hoisting. */
14381 if (is_mm_sync (model))
14382 aarch64_emit_load_exclusive (mode, rval, mem,
14383 GEN_INT (MEMMODEL_RELAXED));
14384 else
14385 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14386
14387 if (strong_zero_p)
14388 {
14389 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14390 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14391 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14392 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14393 }
14394 else
14395 {
14396 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14397 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14398 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14399 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14400 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14401 }
14402
14403 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14404
14405 if (!is_weak)
14406 {
14407 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14408 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14409 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14410 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14411 }
14412 else
14413 {
14414 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14415 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14416 emit_insn (gen_rtx_SET (cond, x));
14417 }
14418
14419 emit_label (label2);
14420   /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14421 to set the condition flags. If this is not used it will be removed by
14422 later passes. */
14423 if (strong_zero_p)
14424 {
14425 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14426 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14427 emit_insn (gen_rtx_SET (cond, x));
14428 }
14429 /* Emit any final barrier needed for a __sync operation. */
14430 if (is_mm_sync (model))
14431 aarch64_emit_post_barrier (model);
14432 }
14433
14434 /* Emit a BIC instruction. */
14435
14436 static void
14437 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14438 {
14439 rtx shift_rtx = GEN_INT (shift);
14440 rtx (*gen) (rtx, rtx, rtx, rtx);
14441
14442 switch (mode)
14443 {
14444 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14445 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14446 default:
14447 gcc_unreachable ();
14448 }
14449
14450 emit_insn (gen (dst, s2, shift_rtx, s1));
14451 }
14452
14453 /* Emit an atomic swap. */
14454
14455 static void
14456 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14457 rtx mem, rtx model)
14458 {
14459 rtx (*gen) (rtx, rtx, rtx, rtx);
14460
14461 switch (mode)
14462 {
14463 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14464 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14465 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14466 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14467 default:
14468 gcc_unreachable ();
14469 }
14470
14471 emit_insn (gen (dst, mem, value, model));
14472 }
14473
14474 /* Operations supported by aarch64_emit_atomic_load_op. */
14475
14476 enum aarch64_atomic_load_op_code
14477 {
14478 AARCH64_LDOP_PLUS, /* A + B */
14479 AARCH64_LDOP_XOR, /* A ^ B */
14480 AARCH64_LDOP_OR, /* A | B */
14481 AARCH64_LDOP_BIC /* A & ~B */
14482 };
14483
14484 /* Emit an atomic load-operate. */
14485
14486 static void
14487 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14488 machine_mode mode, rtx dst, rtx src,
14489 rtx mem, rtx model)
14490 {
14491 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14492 const aarch64_atomic_load_op_fn plus[] =
14493 {
14494 gen_aarch64_atomic_loadaddqi,
14495 gen_aarch64_atomic_loadaddhi,
14496 gen_aarch64_atomic_loadaddsi,
14497 gen_aarch64_atomic_loadadddi
14498 };
14499 const aarch64_atomic_load_op_fn eor[] =
14500 {
14501 gen_aarch64_atomic_loadeorqi,
14502 gen_aarch64_atomic_loadeorhi,
14503 gen_aarch64_atomic_loadeorsi,
14504 gen_aarch64_atomic_loadeordi
14505 };
14506 const aarch64_atomic_load_op_fn ior[] =
14507 {
14508 gen_aarch64_atomic_loadsetqi,
14509 gen_aarch64_atomic_loadsethi,
14510 gen_aarch64_atomic_loadsetsi,
14511 gen_aarch64_atomic_loadsetdi
14512 };
14513 const aarch64_atomic_load_op_fn bic[] =
14514 {
14515 gen_aarch64_atomic_loadclrqi,
14516 gen_aarch64_atomic_loadclrhi,
14517 gen_aarch64_atomic_loadclrsi,
14518 gen_aarch64_atomic_loadclrdi
14519 };
14520 aarch64_atomic_load_op_fn gen;
14521 int idx = 0;
14522
14523 switch (mode)
14524 {
14525 case E_QImode: idx = 0; break;
14526 case E_HImode: idx = 1; break;
14527 case E_SImode: idx = 2; break;
14528 case E_DImode: idx = 3; break;
14529 default:
14530 gcc_unreachable ();
14531 }
14532
14533 switch (code)
14534 {
14535 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14536 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14537 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14538 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14539 default:
14540 gcc_unreachable ();
14541 }
14542
14543 emit_insn (gen (dst, mem, src, model));
14544 }
14545
14546 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14547 location to store the data read from memory. OUT_RESULT is the location to
14548 store the result of the operation. MEM is the memory location to read and
14549 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14550 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14551 be NULL. */
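/* For illustration: with LSE enabled, a fetch-and-AND is handled below by
   inverting VALUE in a register and emitting an atomic "load clear"
   (A & ~B), and a fetch-and-SUB by negating VALUE and emitting an atomic
   "load add"; when OUT_RESULT is wanted it is recomputed afterwards from
   OUT_DATA and the preprocessed VALUE.  */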
14552
14553 void
14554 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14555 rtx mem, rtx value, rtx model_rtx)
14556 {
14557 machine_mode mode = GET_MODE (mem);
14558 machine_mode wmode = (mode == DImode ? DImode : SImode);
14559 const bool short_mode = (mode < SImode);
14560 aarch64_atomic_load_op_code ldop_code;
14561 rtx src;
14562 rtx x;
14563
14564 if (out_data)
14565 out_data = gen_lowpart (mode, out_data);
14566
14567 if (out_result)
14568 out_result = gen_lowpart (mode, out_result);
14569
14570 /* Make sure the value is in a register, putting it into a destination
14571 register if it needs to be manipulated. */
14572 if (!register_operand (value, mode)
14573 || code == AND || code == MINUS)
14574 {
14575 src = out_result ? out_result : out_data;
14576 emit_move_insn (src, gen_lowpart (mode, value));
14577 }
14578 else
14579 src = value;
14580 gcc_assert (register_operand (src, mode));
14581
14582 /* Preprocess the data for the operation as necessary. If the operation is
14583 a SET then emit a swap instruction and finish. */
14584 switch (code)
14585 {
14586 case SET:
14587 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14588 return;
14589
14590 case MINUS:
14591 /* Negate the value and treat it as a PLUS. */
14592 {
14593 rtx neg_src;
14594
14595 /* Resize the value if necessary. */
14596 if (short_mode)
14597 src = gen_lowpart (wmode, src);
14598
14599 neg_src = gen_rtx_NEG (wmode, src);
14600 emit_insn (gen_rtx_SET (src, neg_src));
14601
14602 if (short_mode)
14603 src = gen_lowpart (mode, src);
14604 }
14605 /* Fall-through. */
14606 case PLUS:
14607 ldop_code = AARCH64_LDOP_PLUS;
14608 break;
14609
14610 case IOR:
14611 ldop_code = AARCH64_LDOP_OR;
14612 break;
14613
14614 case XOR:
14615 ldop_code = AARCH64_LDOP_XOR;
14616 break;
14617
14618 case AND:
14619 {
14620 rtx not_src;
14621
14622 /* Resize the value if necessary. */
14623 if (short_mode)
14624 src = gen_lowpart (wmode, src);
14625
14626 not_src = gen_rtx_NOT (wmode, src);
14627 emit_insn (gen_rtx_SET (src, not_src));
14628
14629 if (short_mode)
14630 src = gen_lowpart (mode, src);
14631 }
14632 ldop_code = AARCH64_LDOP_BIC;
14633 break;
14634
14635 default:
14636 /* The operation can't be done with atomic instructions. */
14637 gcc_unreachable ();
14638 }
14639
14640 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14641
14642 /* If necessary, calculate the data in memory after the update by redoing the
14643 operation from values in registers. */
14644 if (!out_result)
14645 return;
14646
14647 if (short_mode)
14648 {
14649 src = gen_lowpart (wmode, src);
14650 out_data = gen_lowpart (wmode, out_data);
14651 out_result = gen_lowpart (wmode, out_result);
14652 }
14653
14654 x = NULL_RTX;
14655
14656 switch (code)
14657 {
14658 case MINUS:
14659 case PLUS:
14660 x = gen_rtx_PLUS (wmode, out_data, src);
14661 break;
14662 case IOR:
14663 x = gen_rtx_IOR (wmode, out_data, src);
14664 break;
14665 case XOR:
14666 x = gen_rtx_XOR (wmode, out_data, src);
14667 break;
14668 case AND:
14669 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14670 return;
14671 default:
14672 gcc_unreachable ();
14673 }
14674
14675 emit_set_insn (out_result, x);
14676
14677 return;
14678 }
14679
14680 /* Split an atomic operation. */
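/* For illustration: without LSE, an __atomic_fetch_add ends up split into a
   retry loop of roughly the form

       .Lretry:
	 ldxr	old, [mem]
	 add	new, old, value
	 stxr	w_tmp, new, [mem]
	 cbnz	w_tmp, .Lretry

   with the acquire/release variants of the exclusives chosen from MODEL_RTX
   (the register and label names here are purely illustrative).  */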
14681
14682 void
14683 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14684 rtx value, rtx model_rtx, rtx cond)
14685 {
14686 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
14687 gcc_assert (epilogue_completed);
14688
14689 machine_mode mode = GET_MODE (mem);
14690 machine_mode wmode = (mode == DImode ? DImode : SImode);
14691 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14692 const bool is_sync = is_mm_sync (model);
14693 rtx_code_label *label;
14694 rtx x;
14695
14696 /* Split the atomic operation into a sequence. */
14697 label = gen_label_rtx ();
14698 emit_label (label);
14699
14700 if (new_out)
14701 new_out = gen_lowpart (wmode, new_out);
14702 if (old_out)
14703 old_out = gen_lowpart (wmode, old_out);
14704 else
14705 old_out = new_out;
14706 value = simplify_gen_subreg (wmode, value, mode, 0);
14707
14708 /* The initial load can be relaxed for a __sync operation since a final
14709 barrier will be emitted to stop code hoisting. */
14710 if (is_sync)
14711 aarch64_emit_load_exclusive (mode, old_out, mem,
14712 GEN_INT (MEMMODEL_RELAXED));
14713 else
14714 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14715
14716 switch (code)
14717 {
14718 case SET:
14719 new_out = value;
14720 break;
14721
14722 case NOT:
14723 x = gen_rtx_AND (wmode, old_out, value);
14724 emit_insn (gen_rtx_SET (new_out, x));
14725 x = gen_rtx_NOT (wmode, new_out);
14726 emit_insn (gen_rtx_SET (new_out, x));
14727 break;
14728
14729 case MINUS:
14730 if (CONST_INT_P (value))
14731 {
14732 value = GEN_INT (-INTVAL (value));
14733 code = PLUS;
14734 }
14735 /* Fall through. */
14736
14737 default:
14738 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14739 emit_insn (gen_rtx_SET (new_out, x));
14740 break;
14741 }
14742
14743 aarch64_emit_store_exclusive (mode, cond, mem,
14744 gen_lowpart (mode, new_out), model_rtx);
14745
14746 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14747 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14748 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14749 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14750
14751 /* Emit any final barrier needed for a __sync operation. */
14752 if (is_sync)
14753 aarch64_emit_post_barrier (model);
14754 }
14755
14756 static void
14757 aarch64_init_libfuncs (void)
14758 {
14759 /* Half-precision float operations. The compiler handles all operations
14760 with NULL libfuncs by converting to SFmode. */
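  /* For illustration: without native FP16 arithmetic, "a + b" on __fp16
     values is then expanded as __gnu_h2f_ieee conversions of both operands,
     an SFmode addition and a __gnu_f2h_ieee conversion of the result,
     instead of an HFmode libcall.  */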
14761
14762 /* Conversions. */
14763 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14764 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14765
14766 /* Arithmetic. */
14767 set_optab_libfunc (add_optab, HFmode, NULL);
14768 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14769 set_optab_libfunc (smul_optab, HFmode, NULL);
14770 set_optab_libfunc (neg_optab, HFmode, NULL);
14771 set_optab_libfunc (sub_optab, HFmode, NULL);
14772
14773 /* Comparisons. */
14774 set_optab_libfunc (eq_optab, HFmode, NULL);
14775 set_optab_libfunc (ne_optab, HFmode, NULL);
14776 set_optab_libfunc (lt_optab, HFmode, NULL);
14777 set_optab_libfunc (le_optab, HFmode, NULL);
14778 set_optab_libfunc (ge_optab, HFmode, NULL);
14779 set_optab_libfunc (gt_optab, HFmode, NULL);
14780 set_optab_libfunc (unord_optab, HFmode, NULL);
14781 }
14782
14783 /* Target hook for c_mode_for_suffix. */
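/* For illustration: a literal such as 1.0q is given TFmode, the AArch64
   128-bit IEEE quad type; all other suffixes are left to the generic
   handling by returning VOIDmode.  */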
14784 static machine_mode
14785 aarch64_c_mode_for_suffix (char suffix)
14786 {
14787 if (suffix == 'q')
14788 return TFmode;
14789
14790 return VOIDmode;
14791 }
14792
14793 /* We can only represent floating point constants which will fit in
14794 "quarter-precision" values. These values are characterised by
14795 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
14796 by:
14797
14798 (-1)^s * (n/16) * 2^r
14799
14800 Where:
14801 's' is the sign bit.
14802 'n' is an integer in the range 16 <= n <= 31.
14803 'r' is an integer in the range -3 <= r <= 4. */
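/* For illustration: 0.25 = (16/16) * 2^-2 (n = 16, r = -2) and
   31.0 = (31/16) * 2^4 are both representable, so the representable
   magnitudes run from 0.125 up to 31.0; 0.0 is explicitly rejected and
   handled elsewhere.  */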
14804
14805 /* Return true iff X can be represented by a quarter-precision
14806    floating point immediate operand. Note, we cannot represent 0.0. */
14807 bool
14808 aarch64_float_const_representable_p (rtx x)
14809 {
14810 /* This represents our current view of how many bits
14811 make up the mantissa. */
14812 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14813 int exponent;
14814 unsigned HOST_WIDE_INT mantissa, mask;
14815 REAL_VALUE_TYPE r, m;
14816 bool fail;
14817
14818 if (!CONST_DOUBLE_P (x))
14819 return false;
14820
14821 /* We don't support HFmode constants yet. */
14822 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14823 return false;
14824
14825 r = *CONST_DOUBLE_REAL_VALUE (x);
14826
14827 /* We cannot represent infinities, NaNs or +/-zero. We won't
14828 know if we have +zero until we analyse the mantissa, but we
14829 can reject the other invalid values. */
14830 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14831 || REAL_VALUE_MINUS_ZERO (r))
14832 return false;
14833
14834 /* Extract exponent. */
14835 r = real_value_abs (&r);
14836 exponent = REAL_EXP (&r);
14837
14838 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14839 highest (sign) bit, with a fixed binary point at bit point_pos.
14840 m1 holds the low part of the mantissa, m2 the high part.
14841 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14842 bits for the mantissa, this can fail (low bits will be lost). */
14843 real_ldexp (&m, &r, point_pos - exponent);
14844 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14845
14846 /* If the low part of the mantissa has bits set we cannot represent
14847 the value. */
14848 if (w.ulow () != 0)
14849 return false;
14850 /* We have rejected the lower HOST_WIDE_INT, so update our
14851 understanding of how many bits lie in the mantissa and
14852 look only at the high HOST_WIDE_INT. */
14853 mantissa = w.elt (1);
14854 point_pos -= HOST_BITS_PER_WIDE_INT;
14855
14856 /* We can only represent values with a mantissa of the form 1.xxxx. */
14857 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14858 if ((mantissa & mask) != 0)
14859 return false;
14860
14861 /* Having filtered unrepresentable values, we may now remove all
14862 but the highest 5 bits. */
14863 mantissa >>= point_pos - 5;
14864
14865 /* We cannot represent the value 0.0, so reject it. This is handled
14866 elsewhere. */
14867 if (mantissa == 0)
14868 return false;
14869
14870 /* Then, as bit 4 is always set, we can mask it off, leaving
14871 the mantissa in the range [0, 15]. */
14872 mantissa &= ~(1 << 4);
14873 gcc_assert (mantissa <= 15);
14874
14875 /* GCC internally does not use IEEE754-like encoding (where normalized
14876 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14877 Our mantissa values are shifted 4 places to the left relative to
14878 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14879 by 5 places to correct for GCC's representation. */
14880 exponent = 5 - exponent;
14881
14882 return (exponent >= 0 && exponent <= 7);
14883 }
14884
14885 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14886 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14887 output MOVI/MVNI, ORR or BIC immediate. */
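/* For illustration: a V4SI vector with every element equal to 0x100 comes
   back as something like "movi\t%0.4s, 0x1, lsl 8", while a V8HI vector of
   0xfffe elements goes down the MVNI path as "mvni\t%0.8h, 0x1".  */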
14888 char*
14889 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14890 enum simd_immediate_check which)
14891 {
14892 bool is_valid;
14893 static char templ[40];
14894 const char *mnemonic;
14895 const char *shift_op;
14896 unsigned int lane_count = 0;
14897 char element_char;
14898
14899 struct simd_immediate_info info;
14900
14901 /* This will return true to show const_vector is legal for use as either
14902      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14903 It will also update INFO to show how the immediate should be generated.
14904 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14905 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14906 gcc_assert (is_valid);
14907
14908 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14909 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14910
14911 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14912 {
14913 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14914 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14915 move immediate path. */
14916 if (aarch64_float_const_zero_rtx_p (info.value))
14917 info.value = GEN_INT (0);
14918 else
14919 {
14920 const unsigned int buf_size = 20;
14921 char float_buf[buf_size] = {'\0'};
14922 real_to_decimal_for_mode (float_buf,
14923 CONST_DOUBLE_REAL_VALUE (info.value),
14924 buf_size, buf_size, 1, info.elt_mode);
14925
14926 if (lane_count == 1)
14927 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14928 else
14929 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14930 lane_count, element_char, float_buf);
14931 return templ;
14932 }
14933 }
14934
14935 gcc_assert (CONST_INT_P (info.value));
14936
14937 if (which == AARCH64_CHECK_MOV)
14938 {
14939 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14940 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14941 if (lane_count == 1)
14942 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14943 mnemonic, UINTVAL (info.value));
14944 else if (info.shift)
14945 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14946 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14947 element_char, UINTVAL (info.value), shift_op, info.shift);
14948 else
14949 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14950 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14951 element_char, UINTVAL (info.value));
14952 }
14953 else
14954 {
14955 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14956 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14957 if (info.shift)
14958 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14959 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14960 element_char, UINTVAL (info.value), "lsl", info.shift);
14961 else
14962 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14963 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14964 element_char, UINTVAL (info.value));
14965 }
14966 return templ;
14967 }
14968
14969 char*
14970 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14971 {
14972
14973 /* If a floating point number was passed and we desire to use it in an
14974 integer mode do the conversion to integer. */
14975 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14976 {
14977 unsigned HOST_WIDE_INT ival;
14978 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14979 gcc_unreachable ();
14980 immediate = gen_int_mode (ival, mode);
14981 }
14982
14983 machine_mode vmode;
14984   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
14985      a 128-bit vector mode. */
14986 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14987
14988 vmode = aarch64_simd_container_mode (mode, width);
14989 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14990 return aarch64_output_simd_mov_immediate (v_op, width);
14991 }
14992
14993 /* Return the output string to use for moving immediate CONST_VECTOR
14994 into an SVE register. */
14995
14996 char *
14997 aarch64_output_sve_mov_immediate (rtx const_vector)
14998 {
14999 static char templ[40];
15000 struct simd_immediate_info info;
15001 char element_char;
15002
15003 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15004 gcc_assert (is_valid);
15005
15006 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15007
15008 if (info.step)
15009 {
15010 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15011 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15012 element_char, INTVAL (info.value), INTVAL (info.step));
15013 return templ;
15014 }
15015
15016 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15017 {
15018 if (aarch64_float_const_zero_rtx_p (info.value))
15019 info.value = GEN_INT (0);
15020 else
15021 {
15022 const int buf_size = 20;
15023 char float_buf[buf_size] = {};
15024 real_to_decimal_for_mode (float_buf,
15025 CONST_DOUBLE_REAL_VALUE (info.value),
15026 buf_size, buf_size, 1, info.elt_mode);
15027
15028 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15029 element_char, float_buf);
15030 return templ;
15031 }
15032 }
15033
15034 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15035 element_char, INTVAL (info.value));
15036 return templ;
15037 }
15038
15039 /* Return the asm format for a PTRUE instruction whose destination has
15040 mode MODE. SUFFIX is the element size suffix. */
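/* For illustration: for a .s predicate on fixed-length 256-bit SVE
   (GET_MODE_NUNITS constant and equal to 8) this yields "ptrue\t%0.s, vl8";
   for length-agnostic code it yields "ptrue\t%0.s, all".  */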
15041
15042 char *
15043 aarch64_output_ptrue (machine_mode mode, char suffix)
15044 {
15045 unsigned int nunits;
15046 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15047 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15048 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15049 else
15050 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15051 return buf;
15052 }
15053
15054 /* Split operands into moves from op[1] + op[2] into op[0]. */
15055
15056 void
15057 aarch64_split_combinev16qi (rtx operands[3])
15058 {
15059 unsigned int dest = REGNO (operands[0]);
15060 unsigned int src1 = REGNO (operands[1]);
15061 unsigned int src2 = REGNO (operands[2]);
15062 machine_mode halfmode = GET_MODE (operands[1]);
15063 unsigned int halfregs = REG_NREGS (operands[1]);
15064 rtx destlo, desthi;
15065
15066 gcc_assert (halfmode == V16QImode);
15067
15068 if (src1 == dest && src2 == dest + halfregs)
15069 {
15070 /* No-op move. Can't split to nothing; emit something. */
15071 emit_note (NOTE_INSN_DELETED);
15072 return;
15073 }
15074
15075 /* Preserve register attributes for variable tracking. */
15076 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15077 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15078 GET_MODE_SIZE (halfmode));
15079
15080 /* Special case of reversed high/low parts. */
15081 if (reg_overlap_mentioned_p (operands[2], destlo)
15082 && reg_overlap_mentioned_p (operands[1], desthi))
15083 {
15084 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15085 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15086 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15087 }
15088 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15089 {
15090 /* Try to avoid unnecessary moves if part of the result
15091 is in the right place already. */
15092 if (src1 != dest)
15093 emit_move_insn (destlo, operands[1]);
15094 if (src2 != dest + halfregs)
15095 emit_move_insn (desthi, operands[2]);
15096 }
15097 else
15098 {
15099 if (src2 != dest + halfregs)
15100 emit_move_insn (desthi, operands[2]);
15101 if (src1 != dest)
15102 emit_move_insn (destlo, operands[1]);
15103 }
15104 }
15105
15106 /* vec_perm support. */
15107
15108 struct expand_vec_perm_d
15109 {
15110 rtx target, op0, op1;
15111 vec_perm_indices perm;
15112 machine_mode vmode;
15113 unsigned int vec_flags;
15114 bool one_vector_p;
15115 bool testing_p;
15116 };
15117
15118 /* Generate a variable permutation. */
15119
15120 static void
15121 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15122 {
15123 machine_mode vmode = GET_MODE (target);
15124 bool one_vector_p = rtx_equal_p (op0, op1);
15125
15126 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15127 gcc_checking_assert (GET_MODE (op0) == vmode);
15128 gcc_checking_assert (GET_MODE (op1) == vmode);
15129 gcc_checking_assert (GET_MODE (sel) == vmode);
15130 gcc_checking_assert (TARGET_SIMD);
15131
15132 if (one_vector_p)
15133 {
15134 if (vmode == V8QImode)
15135 {
15136 /* Expand the argument to a V16QI mode by duplicating it. */
15137 rtx pair = gen_reg_rtx (V16QImode);
15138 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15139 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15140 }
15141 else
15142 {
15143 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15144 }
15145 }
15146 else
15147 {
15148 rtx pair;
15149
15150 if (vmode == V8QImode)
15151 {
15152 pair = gen_reg_rtx (V16QImode);
15153 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15154 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15155 }
15156 else
15157 {
15158 pair = gen_reg_rtx (OImode);
15159 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15160 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15161 }
15162 }
15163 }
15164
15165 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15166 NELT is the number of elements in the vector. */
15167
15168 void
15169 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15170 unsigned int nelt)
15171 {
15172 machine_mode vmode = GET_MODE (target);
15173 bool one_vector_p = rtx_equal_p (op0, op1);
15174 rtx mask;
15175
15176 /* The TBL instruction does not use a modulo index, so we must take care
15177 of that ourselves. */
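  /* For illustration: with a single V8QI input, a selector byte of 9 is
     reduced to 9 & 7 = 1 by the AND below before the TBL lookup.  */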
15178 mask = aarch64_simd_gen_const_vector_dup (vmode,
15179 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15180 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15181
15182 /* For big-endian, we also need to reverse the index within the vector
15183 (but not which vector). */
15184 if (BYTES_BIG_ENDIAN)
15185 {
15186 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15187 if (!one_vector_p)
15188 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15189 sel = expand_simple_binop (vmode, XOR, sel, mask,
15190 NULL, 0, OPTAB_LIB_WIDEN);
15191 }
15192 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15193 }
15194
15195 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15196
15197 static void
15198 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15199 {
15200 emit_insn (gen_rtx_SET (target,
15201 gen_rtx_UNSPEC (GET_MODE (target),
15202 gen_rtvec (2, op0, op1), code)));
15203 }
15204
15205 /* Expand an SVE vec_perm with the given operands. */
15206
15207 void
15208 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15209 {
15210 machine_mode data_mode = GET_MODE (target);
15211 machine_mode sel_mode = GET_MODE (sel);
15212 /* Enforced by the pattern condition. */
15213 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15214
15215 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15216 size of the two value vectors, i.e. the upper bits of the indices
15217 are effectively ignored. SVE TBL instead produces 0 for any
15218 out-of-range indices, so we need to modulo all the vec_perm indices
15219 to ensure they are all in range. */
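  /* For illustration: with 128-bit SVE and VNx4SI inputs of four elements
     each, a selector value of 9 is reduced to 9 & 7 = 1 below, matching
     vec_perm's wrap-around semantics instead of TBL's zeroing.  */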
15220 rtx sel_reg = force_reg (sel_mode, sel);
15221
15222 /* Check if the sel only references the first values vector. */
15223 if (GET_CODE (sel) == CONST_VECTOR
15224 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15225 {
15226 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15227 return;
15228 }
15229
15230 /* Check if the two values vectors are the same. */
15231 if (rtx_equal_p (op0, op1))
15232 {
15233 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15234 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15235 NULL, 0, OPTAB_DIRECT);
15236 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15237 return;
15238 }
15239
15240   /* Run TBL on each value vector and combine the results. */
15241
15242 rtx res0 = gen_reg_rtx (data_mode);
15243 rtx res1 = gen_reg_rtx (data_mode);
15244 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15245 if (GET_CODE (sel) != CONST_VECTOR
15246 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15247 {
15248 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15249 2 * nunits - 1);
15250 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15251 NULL, 0, OPTAB_DIRECT);
15252 }
15253 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15254 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15255 NULL, 0, OPTAB_DIRECT);
15256 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15257 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15258 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15259 else
15260 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15261 }
15262
15263 /* Recognize patterns suitable for the TRN instructions. */
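/* For illustration: on V4SI the permutation { 0, 4, 2, 6 } is matched here
   as TRN1 and { 1, 5, 3, 7 } as TRN2, indices 4..7 denoting elements of the
   second input.  */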
15264 static bool
15265 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15266 {
15267 HOST_WIDE_INT odd;
15268 poly_uint64 nelt = d->perm.length ();
15269 rtx out, in0, in1, x;
15270 machine_mode vmode = d->vmode;
15271
15272 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15273 return false;
15274
15275 /* Note that these are little-endian tests.
15276 We correct for big-endian later. */
15277 if (!d->perm[0].is_constant (&odd)
15278 || (odd != 0 && odd != 1)
15279 || !d->perm.series_p (0, 2, odd, 2)
15280 || !d->perm.series_p (1, 2, nelt + odd, 2))
15281 return false;
15282
15283 /* Success! */
15284 if (d->testing_p)
15285 return true;
15286
15287 in0 = d->op0;
15288 in1 = d->op1;
15289 /* We don't need a big-endian lane correction for SVE; see the comment
15290 at the head of aarch64-sve.md for details. */
15291 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15292 {
15293 x = in0, in0 = in1, in1 = x;
15294 odd = !odd;
15295 }
15296 out = d->target;
15297
15298 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15299 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15300 return true;
15301 }
15302
15303 /* Recognize patterns suitable for the UZP instructions. */
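/* For illustration: on V4SI the permutation { 0, 2, 4, 6 } is matched here
   as UZP1 and { 1, 3, 5, 7 } as UZP2.  */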
15304 static bool
15305 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15306 {
15307 HOST_WIDE_INT odd;
15308 rtx out, in0, in1, x;
15309 machine_mode vmode = d->vmode;
15310
15311 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15312 return false;
15313
15314 /* Note that these are little-endian tests.
15315 We correct for big-endian later. */
15316 if (!d->perm[0].is_constant (&odd)
15317 || (odd != 0 && odd != 1)
15318 || !d->perm.series_p (0, 1, odd, 2))
15319 return false;
15320
15321 /* Success! */
15322 if (d->testing_p)
15323 return true;
15324
15325 in0 = d->op0;
15326 in1 = d->op1;
15327 /* We don't need a big-endian lane correction for SVE; see the comment
15328 at the head of aarch64-sve.md for details. */
15329 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15330 {
15331 x = in0, in0 = in1, in1 = x;
15332 odd = !odd;
15333 }
15334 out = d->target;
15335
15336 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15337 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15338 return true;
15339 }
15340
15341 /* Recognize patterns suitable for the ZIP instructions. */
15342 static bool
15343 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15344 {
15345 unsigned int high;
15346 poly_uint64 nelt = d->perm.length ();
15347 rtx out, in0, in1, x;
15348 machine_mode vmode = d->vmode;
15349
15350 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15351 return false;
15352
15353 /* Note that these are little-endian tests.
15354 We correct for big-endian later. */
15355 poly_uint64 first = d->perm[0];
15356 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15357 || !d->perm.series_p (0, 2, first, 1)
15358 || !d->perm.series_p (1, 2, first + nelt, 1))
15359 return false;
15360 high = maybe_ne (first, 0U);
15361
15362 /* Success! */
15363 if (d->testing_p)
15364 return true;
15365
15366 in0 = d->op0;
15367 in1 = d->op1;
15368 /* We don't need a big-endian lane correction for SVE; see the comment
15369 at the head of aarch64-sve.md for details. */
15370 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15371 {
15372 x = in0, in0 = in1, in1 = x;
15373 high = !high;
15374 }
15375 out = d->target;
15376
15377 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15378 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15379 return true;
15380 }
15381
15382 /* Recognize patterns for the EXT insn. */
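/* For illustration: on V4SI the permutation { 1, 2, 3, 4 } (indices that
   start in the first vector and increase by one) is matched here and
   emitted as an EXT of the two inputs with element offset 1.  */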
15383
15384 static bool
15385 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15386 {
15387 HOST_WIDE_INT location;
15388 rtx offset;
15389
15390 /* The first element always refers to the first vector.
15391 Check if the extracted indices are increasing by one. */
15392 if (d->vec_flags == VEC_SVE_PRED
15393 || !d->perm[0].is_constant (&location)
15394 || !d->perm.series_p (0, 1, location, 1))
15395 return false;
15396
15397 /* Success! */
15398 if (d->testing_p)
15399 return true;
15400
15401 /* The case where (location == 0) is a no-op for both big- and little-endian,
15402 and is removed by the mid-end at optimization levels -O1 and higher.
15403
15404 We don't need a big-endian lane correction for SVE; see the comment
15405 at the head of aarch64-sve.md for details. */
15406 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15407 {
15408 /* After setup, we want the high elements of the first vector (stored
15409 at the LSB end of the register), and the low elements of the second
15410 vector (stored at the MSB end of the register). So swap. */
15411 std::swap (d->op0, d->op1);
15412 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15413 to_constant () is safe since this is restricted to Advanced SIMD
15414 vectors. */
15415 location = d->perm.length ().to_constant () - location;
15416 }
15417
15418 offset = GEN_INT (location);
15419 emit_set_insn (d->target,
15420 gen_rtx_UNSPEC (d->vmode,
15421 gen_rtvec (3, d->op0, d->op1, offset),
15422 UNSPEC_EXT));
15423 return true;
15424 }
15425
15426 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15427 within each 64-bit, 32-bit or 16-bit granule. */
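/* For illustration: on V8HI the permutation { 1, 0, 3, 2, 5, 4, 7, 6 }
   reverses the 16-bit elements within each 32-bit granule and is matched
   here as REV32.  */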
15428
15429 static bool
15430 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15431 {
15432 HOST_WIDE_INT diff;
15433 unsigned int i, size, unspec;
15434 machine_mode pred_mode;
15435
15436 if (d->vec_flags == VEC_SVE_PRED
15437 || !d->one_vector_p
15438 || !d->perm[0].is_constant (&diff))
15439 return false;
15440
15441 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15442 if (size == 8)
15443 {
15444 unspec = UNSPEC_REV64;
15445 pred_mode = VNx2BImode;
15446 }
15447 else if (size == 4)
15448 {
15449 unspec = UNSPEC_REV32;
15450 pred_mode = VNx4BImode;
15451 }
15452 else if (size == 2)
15453 {
15454 unspec = UNSPEC_REV16;
15455 pred_mode = VNx8BImode;
15456 }
15457 else
15458 return false;
15459
15460 unsigned int step = diff + 1;
15461 for (i = 0; i < step; ++i)
15462 if (!d->perm.series_p (i, step, diff - i, step))
15463 return false;
15464
15465 /* Success! */
15466 if (d->testing_p)
15467 return true;
15468
15469 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15470 if (d->vec_flags == VEC_SVE_DATA)
15471 {
15472 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15473 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15474 UNSPEC_MERGE_PTRUE);
15475 }
15476 emit_set_insn (d->target, src);
15477 return true;
15478 }
15479
15480 /* Recognize patterns for the REV insn, which reverses elements within
15481 a full vector. */
15482
15483 static bool
15484 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15485 {
15486 poly_uint64 nelt = d->perm.length ();
15487
15488 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15489 return false;
15490
15491 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15492 return false;
15493
15494 /* Success! */
15495 if (d->testing_p)
15496 return true;
15497
15498 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15499 emit_set_insn (d->target, src);
15500 return true;
15501 }
15502
15503 static bool
15504 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15505 {
15506 rtx out = d->target;
15507 rtx in0;
15508 HOST_WIDE_INT elt;
15509 machine_mode vmode = d->vmode;
15510 rtx lane;
15511
15512 if (d->vec_flags == VEC_SVE_PRED
15513 || d->perm.encoding ().encoded_nelts () != 1
15514 || !d->perm[0].is_constant (&elt))
15515 return false;
15516
15517 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15518 return false;
15519
15520 /* Success! */
15521 if (d->testing_p)
15522 return true;
15523
15524 /* The generic preparation in aarch64_expand_vec_perm_const_1
15525 swaps the operand order and the permute indices if it finds
15526 d->perm[0] to be in the second operand. Thus, we can always
15527 use d->op0 and need not do any extra arithmetic to get the
15528 correct lane number. */
15529 in0 = d->op0;
15530 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15531
15532 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15533 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15534 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15535 return true;
15536 }
15537
15538 static bool
15539 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15540 {
15541 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15542 machine_mode vmode = d->vmode;
15543
15544 /* Make sure that the indices are constant. */
15545 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15546 for (unsigned int i = 0; i < encoded_nelts; ++i)
15547 if (!d->perm[i].is_constant ())
15548 return false;
15549
15550 if (d->testing_p)
15551 return true;
15552
15553 /* Generic code will try constant permutation twice. Once with the
15554 original mode and again with the elements lowered to QImode.
15555 So wait and don't do the selector expansion ourselves. */
15556 if (vmode != V8QImode && vmode != V16QImode)
15557 return false;
15558
15559 /* to_constant is safe since this routine is specific to Advanced SIMD
15560 vectors. */
15561 unsigned int nelt = d->perm.length ().to_constant ();
15562 for (unsigned int i = 0; i < nelt; ++i)
15563 /* If big-endian and two vectors we end up with a weird mixed-endian
15564 mode on NEON. Reverse the index within each word but not the word
15565 itself. to_constant is safe because we checked is_constant above. */
15566 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15567 ? d->perm[i].to_constant () ^ (nelt - 1)
15568 : d->perm[i].to_constant ());
15569
15570 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15571 sel = force_reg (vmode, sel);
15572
15573 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15574 return true;
15575 }
15576
15577 /* Try to implement D using an SVE TBL instruction. */
15578
15579 static bool
15580 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15581 {
15582 unsigned HOST_WIDE_INT nelt;
15583
15584 /* Permuting two variable-length vectors could overflow the
15585 index range. */
15586 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15587 return false;
15588
15589 if (d->testing_p)
15590 return true;
15591
15592 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15593 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15594 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15595 return true;
15596 }
15597
15598 static bool
15599 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15600 {
15601 /* The pattern matching functions above are written to look for a small
15602 number to begin the sequence (0, 1, N/2). If we begin with an index
15603 from the second operand, we can swap the operands. */
15604 poly_int64 nelt = d->perm.length ();
15605 if (known_ge (d->perm[0], nelt))
15606 {
15607 d->perm.rotate_inputs (1);
15608 std::swap (d->op0, d->op1);
15609 }
15610
15611 if ((d->vec_flags == VEC_ADVSIMD
15612 || d->vec_flags == VEC_SVE_DATA
15613 || d->vec_flags == VEC_SVE_PRED)
15614 && known_gt (nelt, 1))
15615 {
15616 if (aarch64_evpc_rev_local (d))
15617 return true;
15618 else if (aarch64_evpc_rev_global (d))
15619 return true;
15620 else if (aarch64_evpc_ext (d))
15621 return true;
15622 else if (aarch64_evpc_dup (d))
15623 return true;
15624 else if (aarch64_evpc_zip (d))
15625 return true;
15626 else if (aarch64_evpc_uzp (d))
15627 return true;
15628 else if (aarch64_evpc_trn (d))
15629 return true;
15630 if (d->vec_flags == VEC_SVE_DATA)
15631 return aarch64_evpc_sve_tbl (d);
15632 else if (d->vec_flags == VEC_ADVSIMD)
15633 return aarch64_evpc_tbl (d);
15634 }
15635 return false;
15636 }
15637
15638 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15639
15640 static bool
15641 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15642 rtx op1, const vec_perm_indices &sel)
15643 {
15644 struct expand_vec_perm_d d;
15645
15646 /* Check whether the mask can be applied to a single vector. */
15647 if (op0 && rtx_equal_p (op0, op1))
15648 d.one_vector_p = true;
15649 else if (sel.all_from_input_p (0))
15650 {
15651 d.one_vector_p = true;
15652 op1 = op0;
15653 }
15654 else if (sel.all_from_input_p (1))
15655 {
15656 d.one_vector_p = true;
15657 op0 = op1;
15658 }
15659 else
15660 d.one_vector_p = false;
15661
15662 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15663 sel.nelts_per_input ());
15664 d.vmode = vmode;
15665 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15666 d.target = target;
15667 d.op0 = op0;
15668 d.op1 = op1;
15669 d.testing_p = !target;
15670
15671 if (!d.testing_p)
15672 return aarch64_expand_vec_perm_const_1 (&d);
15673
15674 rtx_insn *last = get_last_insn ();
15675 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15676 gcc_assert (last == get_last_insn ());
15677
15678 return ret;
15679 }
15680
15681 /* Generate a byte permute mask for a register of mode MODE,
15682 which has NUNITS units. */
15683
15684 rtx
15685 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15686 {
15687 /* We have to reverse each vector because we don't have
15688 a permuted load that can reverse-load according to ABI rules. */
15689 rtx mask;
15690 rtvec v = rtvec_alloc (16);
15691 unsigned int i, j;
15692 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15693
15694 gcc_assert (BYTES_BIG_ENDIAN);
15695 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15696
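/* For example, for V4SImode (nunits == 4, usize == 4) this builds the byte
   selector { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }. */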
15697 for (i = 0; i < nunits; i++)
15698 for (j = 0; j < usize; j++)
15699 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15700 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15701 return force_reg (V16QImode, mask);
15702 }
15703
15704 /* Return true if X is a valid second operand for the SVE instruction
15705 that implements integer comparison OP_CODE. */
15706
15707 static bool
15708 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15709 {
15710 if (register_operand (x, VOIDmode))
15711 return true;
15712
15713 switch (op_code)
15714 {
15715 case LTU:
15716 case LEU:
15717 case GEU:
15718 case GTU:
15719 return aarch64_sve_cmp_immediate_p (x, false);
15720 case LT:
15721 case LE:
15722 case GE:
15723 case GT:
15724 case NE:
15725 case EQ:
15726 return aarch64_sve_cmp_immediate_p (x, true);
15727 default:
15728 gcc_unreachable ();
15729 }
15730 }
15731
15732 /* Return the UNSPEC_COND_* code for comparison CODE. */
15733
15734 static unsigned int
15735 aarch64_unspec_cond_code (rtx_code code)
15736 {
15737 switch (code)
15738 {
15739 case NE:
15740 return UNSPEC_COND_NE;
15741 case EQ:
15742 return UNSPEC_COND_EQ;
15743 case LT:
15744 return UNSPEC_COND_LT;
15745 case GT:
15746 return UNSPEC_COND_GT;
15747 case LE:
15748 return UNSPEC_COND_LE;
15749 case GE:
15750 return UNSPEC_COND_GE;
15751 case LTU:
15752 return UNSPEC_COND_LO;
15753 case GTU:
15754 return UNSPEC_COND_HI;
15755 case LEU:
15756 return UNSPEC_COND_LS;
15757 case GEU:
15758 return UNSPEC_COND_HS;
15759 case UNORDERED:
15760 return UNSPEC_COND_UO;
15761 default:
15762 gcc_unreachable ();
15763 }
15764 }
15765
15766 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15767 where <X> is the operation associated with comparison CODE. */
15768
15769 static rtx
15770 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15771 rtx pred, rtx op0, rtx op1)
15772 {
15773 rtvec vec = gen_rtvec (3, pred, op0, op1);
15774 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15775 }
15776
15777 /* Expand an SVE integer comparison:
15778
15779 TARGET = CODE (OP0, OP1). */
15780
15781 void
15782 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15783 {
15784 machine_mode pred_mode = GET_MODE (target);
15785 machine_mode data_mode = GET_MODE (op0);
15786
15787 if (!aarch64_sve_cmp_operand_p (code, op1))
15788 op1 = force_reg (data_mode, op1);
15789
15790 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15791 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15792 emit_insn (gen_set_clobber_cc (target, unspec));
15793 }
15794
15795 /* Emit an instruction:
15796
15797 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15798
15799 where <X> is the operation associated with comparison CODE. */
15800
15801 static void
15802 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15803 rtx pred, rtx op0, rtx op1)
15804 {
15805 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15806 emit_set_insn (target, unspec);
15807 }
15808
15809 /* Emit:
15810
15811 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15812 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15813 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15814
15815 where <Xi> is the operation associated with comparison CODEi. */
15816
15817 static void
15818 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15819 machine_mode pred_mode, rtx ptrue,
15820 rtx op0, rtx op1)
15821 {
15822 rtx tmp1 = gen_reg_rtx (pred_mode);
15823 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15824 rtx tmp2 = gen_reg_rtx (pred_mode);
15825 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15826 emit_set_insn (target, gen_rtx_AND (pred_mode,
15827 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15828 ptrue));
15829 }
15830
15831 /* If CAN_INVERT_P, emit an instruction:
15832
15833 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15834
15835 where <X> is the operation associated with comparison CODE. Otherwise
15836 emit:
15837
15838 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15839 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15840
15841 where the second instruction sets TARGET to the inverse of TMP. */
15842
15843 static void
15844 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15845 machine_mode pred_mode, rtx ptrue, rtx pred,
15846 rtx op0, rtx op1, bool can_invert_p)
15847 {
15848 if (can_invert_p)
15849 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15850 else
15851 {
15852 rtx tmp = gen_reg_rtx (pred_mode);
15853 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15854 emit_set_insn (target, gen_rtx_AND (pred_mode,
15855 gen_rtx_NOT (pred_mode, tmp),
15856 ptrue));
15857 }
15858 }
15859
15860 /* Expand an SVE floating-point comparison:
15861
15862 TARGET = CODE (OP0, OP1)
15863
15864 If CAN_INVERT_P is true, the caller can also handle inverted results;
15865 return true if the result is in fact inverted. */
15866
15867 bool
15868 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15869 rtx op0, rtx op1, bool can_invert_p)
15870 {
15871 machine_mode pred_mode = GET_MODE (target);
15872 machine_mode data_mode = GET_MODE (op0);
15873
15874 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15875 switch (code)
15876 {
15877 case UNORDERED:
15878 /* UNORDERED has no immediate form. */
15879 op1 = force_reg (data_mode, op1);
15880 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15881 return false;
15882
15883 case LT:
15884 case LE:
15885 case GT:
15886 case GE:
15887 case EQ:
15888 case NE:
15889 /* There is native support for the comparison. */
15890 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15891 return false;
15892
15893 case ORDERED:
15894 /* There is native support for the inverse comparison. */
15895 op1 = force_reg (data_mode, op1);
15896 aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15897 pred_mode, ptrue, ptrue, op0, op1,
15898 can_invert_p);
15899 return can_invert_p;
15900
15901 case LTGT:
15902 /* This is a trapping operation (LT or GT). */
15903 aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15904 return false;
15905
15906 case UNEQ:
15907 if (!flag_trapping_math)
15908 {
15909 /* This would trap for signaling NaNs. */
15910 op1 = force_reg (data_mode, op1);
15911 aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15912 pred_mode, ptrue, op0, op1);
15913 return false;
15914 }
15915 /* fall through */
15916
15917 case UNLT:
15918 case UNLE:
15919 case UNGT:
15920 case UNGE:
15921 {
15922 rtx ordered = ptrue;
15923 if (flag_trapping_math)
15924 {
15925 /* Only compare the elements that are known to be ordered. */
15926 ordered = gen_reg_rtx (pred_mode);
15927 op1 = force_reg (data_mode, op1);
15928 aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15929 ptrue, ptrue, op0, op1, false);
15930 }
15931 if (code == UNEQ)
15932 code = NE;
15933 else
15934 code = reverse_condition_maybe_unordered (code);
15935 aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15936 ordered, op0, op1, can_invert_p);
15937 return can_invert_p;
15938 }
15939
15940 default:
15941 gcc_unreachable ();
15942 }
15943 }
15944
15945 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15946 of the data being selected and CMP_MODE is the mode of the values being
15947 compared. */
15948
15949 void
15950 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15951 rtx *ops)
15952 {
15953 machine_mode pred_mode
15954 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15955 GET_MODE_SIZE (cmp_mode)).require ();
15956 rtx pred = gen_reg_rtx (pred_mode);
15957 if (FLOAT_MODE_P (cmp_mode))
15958 {
15959 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15960 ops[4], ops[5], true))
15961 std::swap (ops[1], ops[2]);
15962 }
15963 else
15964 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15965
15966 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15967 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15968 }
15969
15970 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15971 true. However due to issues with register allocation it is preferable
15972 to avoid tieing integer scalar and FP scalar modes. Executing integer
15973 operations in general registers is better than treating them as scalar
15974 vector operations. This reduces latency and avoids redundant int<->FP
15975 moves. So tie modes if they are either the same class, or vector modes
15976 with other vector modes, vector structs or any scalar mode. */
15977
15978 static bool
15979 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15980 {
15981 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15982 return true;
15983
15984 /* We specifically want to allow elements of "structure" modes to
15985 be tieable to the structure. This more general condition allows
15986 other rarer situations too. The reason we don't extend this to
15987 predicate modes is that there are no predicate structure modes
15988 nor any specific instructions for extracting part of a predicate
15989 register. */
15990 if (aarch64_vector_data_mode_p (mode1)
15991 && aarch64_vector_data_mode_p (mode2))
15992 return true;
15993
15994 /* Also allow any scalar modes with vectors. */
15995 if (aarch64_vector_mode_supported_p (mode1)
15996 || aarch64_vector_mode_supported_p (mode2))
15997 return true;
15998
15999 return false;
16000 }
16001
16002 /* Return a new RTX holding the result of moving POINTER forward by
16003 AMOUNT bytes. */
16004
16005 static rtx
16006 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16007 {
16008 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16009
16010 return adjust_automodify_address (pointer, GET_MODE (pointer),
16011 next, amount);
16012 }
16013
16014 /* Return a new RTX holding the result of moving POINTER forward by the
16015 size of the mode it points to. */
16016
16017 static rtx
16018 aarch64_progress_pointer (rtx pointer)
16019 {
16020 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16021 }
16022
16023 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16024 MODE bytes. */
16025
16026 static void
16027 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16028 machine_mode mode)
16029 {
16030 rtx reg = gen_reg_rtx (mode);
16031
16032 /* "Cast" the pointers to the correct mode. */
16033 *src = adjust_address (*src, mode, 0);
16034 *dst = adjust_address (*dst, mode, 0);
16035 /* Emit the memcpy. */
16036 emit_move_insn (reg, *src);
16037 emit_move_insn (*dst, reg);
16038 /* Move the pointers forward. */
16039 *src = aarch64_progress_pointer (*src);
16040 *dst = aarch64_progress_pointer (*dst);
16041 }
16042
16043 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16044 we succeed, otherwise return false. */
16045
16046 bool
16047 aarch64_expand_movmem (rtx *operands)
16048 {
16049 unsigned int n;
16050 rtx dst = operands[0];
16051 rtx src = operands[1];
16052 rtx base;
16053 bool speed_p = !optimize_function_for_size_p (cfun);
16054
16055 /* When optimizing for size, give a better estimate of the length of a
16056 memcpy call, but use the default otherwise. */
16057 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16058
16059 /* We can't do anything smart if the amount to copy is not constant. */
16060 if (!CONST_INT_P (operands[2]))
16061 return false;
16062
16063 n = UINTVAL (operands[2]);
16064
16065 /* Try to keep the number of instructions low. For cases below 16 bytes we
16066 need to make at most two moves. For cases above 16 bytes it will be one
16067 move for each 16 byte chunk, then at most two additional moves. */
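/* For example, a 35-byte copy is estimated as (35 / 16) + 2 == 4
   instructions: two 16-byte chunks plus at most two more moves for the
   trailing 3 bytes. */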
16068 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16069 return false;
16070
16071 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16072 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16073
16074 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16075 src = adjust_automodify_address (src, VOIDmode, base, 0);
16076
16077 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16078 1-byte chunk. */
16079 if (n < 4)
16080 {
16081 if (n >= 2)
16082 {
16083 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16084 n -= 2;
16085 }
16086
16087 if (n == 1)
16088 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16089
16090 return true;
16091 }
16092
16093 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16094 4-byte chunk, partially overlapping with the previously copied chunk. */
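/* For example, for n == 6 this copies bytes 0-3 and then bytes 2-5. */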
16095 if (n < 8)
16096 {
16097 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16098 n -= 4;
16099 if (n > 0)
16100 {
16101 int move = n - 4;
16102
16103 src = aarch64_move_pointer (src, move);
16104 dst = aarch64_move_pointer (dst, move);
16105 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16106 }
16107 return true;
16108 }
16109
16110 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16111 them, then (if applicable) an 8-byte chunk. */
16112 while (n >= 8)
16113 {
16114 if (n / 16)
16115 {
16116 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16117 n -= 16;
16118 }
16119 else
16120 {
16121 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16122 n -= 8;
16123 }
16124 }
16125
16126 /* Finish the final bytes of the copy. We can always do this in one
16127 instruction. We either copy the exact amount we need, or partially
16128 overlap with the previous chunk we copied and copy 8 bytes. */
16129 if (n == 0)
16130 return true;
16131 else if (n == 1)
16132 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16133 else if (n == 2)
16134 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16135 else if (n == 4)
16136 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16137 else
16138 {
16139 if (n == 3)
16140 {
16141 src = aarch64_move_pointer (src, -1);
16142 dst = aarch64_move_pointer (dst, -1);
16143 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16144 }
16145 else
16146 {
16147 int move = n - 8;
16148
16149 src = aarch64_move_pointer (src, move);
16150 dst = aarch64_move_pointer (dst, move);
16151 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16152 }
16153 }
16154
16155 return true;
16156 }
16157
16158 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16159 SImode stores. Handle the case when the constant has identical
16160 bottom and top halves. This is beneficial when the two stores can be
16161 merged into an STP and we avoid synthesising potentially expensive
16162 immediates twice. Return true if such a split is possible. */
16163
16164 bool
16165 aarch64_split_dimode_const_store (rtx dst, rtx src)
16166 {
16167 rtx lo = gen_lowpart (SImode, src);
16168 rtx hi = gen_highpart_mode (SImode, DImode, src);
16169
16170 bool size_p = optimize_function_for_size_p (cfun);
16171
16172 if (!rtx_equal_p (lo, hi))
16173 return false;
16174
16175 unsigned int orig_cost
16176 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16177 unsigned int lo_cost
16178 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16179
16180 /* We want to transform:
16181 MOV x1, 49370
16182 MOVK x1, 0x140, lsl 16
16183 MOVK x1, 0xc0da, lsl 32
16184 MOVK x1, 0x140, lsl 48
16185 STR x1, [x0]
16186 into:
16187 MOV w1, 49370
16188 MOVK w1, 0x140, lsl 16
16189 STP w1, w1, [x0]
16190 So we want to perform this only when we save two instructions
16191 or more. When optimizing for size, however, accept any code size
16192 savings we can. */
16193 if (size_p && orig_cost <= lo_cost)
16194 return false;
16195
16196 if (!size_p
16197 && (orig_cost <= lo_cost + 1))
16198 return false;
16199
16200 rtx mem_lo = adjust_address (dst, SImode, 0);
16201 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16202 return false;
16203
16204 rtx tmp_reg = gen_reg_rtx (SImode);
16205 aarch64_expand_mov_immediate (tmp_reg, lo);
16206 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16207 /* Don't emit an explicit store pair as this may not always be profitable.
16208 Let the sched-fusion logic decide whether to merge them. */
16209 emit_move_insn (mem_lo, tmp_reg);
16210 emit_move_insn (mem_hi, tmp_reg);
16211
16212 return true;
16213 }
16214
16215 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
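/* AddressSanitizer instrumentation computes shadow addresses roughly as
   (address >> ASAN_SHADOW_SHIFT) + this offset, so the AArch64 shadow
   region starts at 1 << 36. */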
16216
16217 static unsigned HOST_WIDE_INT
16218 aarch64_asan_shadow_offset (void)
16219 {
16220 return (HOST_WIDE_INT_1 << 36);
16221 }
16222
16223 static rtx
16224 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16225 int code, tree treeop0, tree treeop1)
16226 {
16227 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16228 rtx op0, op1;
16229 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16230 insn_code icode;
16231 struct expand_operand ops[4];
16232
16233 start_sequence ();
16234 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16235
16236 op_mode = GET_MODE (op0);
16237 if (op_mode == VOIDmode)
16238 op_mode = GET_MODE (op1);
16239
16240 switch (op_mode)
16241 {
16242 case E_QImode:
16243 case E_HImode:
16244 case E_SImode:
16245 cmp_mode = SImode;
16246 icode = CODE_FOR_cmpsi;
16247 break;
16248
16249 case E_DImode:
16250 cmp_mode = DImode;
16251 icode = CODE_FOR_cmpdi;
16252 break;
16253
16254 case E_SFmode:
16255 cmp_mode = SFmode;
16256 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16257 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16258 break;
16259
16260 case E_DFmode:
16261 cmp_mode = DFmode;
16262 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16263 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16264 break;
16265
16266 default:
16267 end_sequence ();
16268 return NULL_RTX;
16269 }
16270
16271 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16272 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16273 if (!op0 || !op1)
16274 {
16275 end_sequence ();
16276 return NULL_RTX;
16277 }
16278 *prep_seq = get_insns ();
16279 end_sequence ();
16280
16281 create_fixed_operand (&ops[0], op0);
16282 create_fixed_operand (&ops[1], op1);
16283
16284 start_sequence ();
16285 if (!maybe_expand_insn (icode, 2, ops))
16286 {
16287 end_sequence ();
16288 return NULL_RTX;
16289 }
16290 *gen_seq = get_insns ();
16291 end_sequence ();
16292
16293 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16294 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16295 }
16296
16297 static rtx
16298 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16299 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16300 {
16301 rtx op0, op1, target;
16302 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16303 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16304 insn_code icode;
16305 struct expand_operand ops[6];
16306 int aarch64_cond;
16307
16308 push_to_sequence (*prep_seq);
16309 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16310
16311 op_mode = GET_MODE (op0);
16312 if (op_mode == VOIDmode)
16313 op_mode = GET_MODE (op1);
16314
16315 switch (op_mode)
16316 {
16317 case E_QImode:
16318 case E_HImode:
16319 case E_SImode:
16320 cmp_mode = SImode;
16321 icode = CODE_FOR_ccmpsi;
16322 break;
16323
16324 case E_DImode:
16325 cmp_mode = DImode;
16326 icode = CODE_FOR_ccmpdi;
16327 break;
16328
16329 case E_SFmode:
16330 cmp_mode = SFmode;
16331 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16332 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16333 break;
16334
16335 case E_DFmode:
16336 cmp_mode = DFmode;
16337 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16338 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16339 break;
16340
16341 default:
16342 end_sequence ();
16343 return NULL_RTX;
16344 }
16345
16346 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16347 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16348 if (!op0 || !op1)
16349 {
16350 end_sequence ();
16351 return NULL_RTX;
16352 }
16353 *prep_seq = get_insns ();
16354 end_sequence ();
16355
16356 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16357 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16358
16359 if (bit_code != AND)
16360 {
16361 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16362 GET_MODE (XEXP (prev, 0))),
16363 VOIDmode, XEXP (prev, 0), const0_rtx);
16364 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16365 }
16366
16367 create_fixed_operand (&ops[0], XEXP (prev, 0));
16368 create_fixed_operand (&ops[1], target);
16369 create_fixed_operand (&ops[2], op0);
16370 create_fixed_operand (&ops[3], op1);
16371 create_fixed_operand (&ops[4], prev);
16372 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16373
16374 push_to_sequence (*gen_seq);
16375 if (!maybe_expand_insn (icode, 6, ops))
16376 {
16377 end_sequence ();
16378 return NULL_RTX;
16379 }
16380
16381 *gen_seq = get_insns ();
16382 end_sequence ();
16383
16384 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16385 }
16386
16387 #undef TARGET_GEN_CCMP_FIRST
16388 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16389
16390 #undef TARGET_GEN_CCMP_NEXT
16391 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16392
16393 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16394 instruction fusion of some sort. */
16395
16396 static bool
16397 aarch64_macro_fusion_p (void)
16398 {
16399 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16400 }
16401
16402
16403 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16404 should be kept together during scheduling. */
16405
16406 static bool
16407 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16408 {
16409 rtx set_dest;
16410 rtx prev_set = single_set (prev);
16411 rtx curr_set = single_set (curr);
16412 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16413 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16414
16415 if (!aarch64_macro_fusion_p ())
16416 return false;
16417
16418 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16419 {
16420 /* We are trying to match:
16421 prev (mov) == (set (reg r0) (const_int imm16))
16422 curr (movk) == (set (zero_extract (reg r0)
16423 (const_int 16)
16424 (const_int 16))
16425 (const_int imm16_1)) */
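/* In assembly terms this is roughly:
     mov  x0, #imm16
     movk x0, #imm16_1, lsl 16 */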
16426
16427 set_dest = SET_DEST (curr_set);
16428
16429 if (GET_CODE (set_dest) == ZERO_EXTRACT
16430 && CONST_INT_P (SET_SRC (curr_set))
16431 && CONST_INT_P (SET_SRC (prev_set))
16432 && CONST_INT_P (XEXP (set_dest, 2))
16433 && INTVAL (XEXP (set_dest, 2)) == 16
16434 && REG_P (XEXP (set_dest, 0))
16435 && REG_P (SET_DEST (prev_set))
16436 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16437 {
16438 return true;
16439 }
16440 }
16441
16442 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16443 {
16444
16445 /* We're trying to match:
16446 prev (adrp) == (set (reg r1)
16447 (high (symbol_ref ("SYM"))))
16448 curr (add) == (set (reg r0)
16449 (lo_sum (reg r1)
16450 (symbol_ref ("SYM"))))
16451 Note that r0 need not necessarily be the same as r1, especially
16452 during pre-regalloc scheduling. */
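/* In assembly terms this is roughly:
     adrp x1, SYM
     add  x0, x1, :lo12:SYM */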
16453
16454 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16455 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16456 {
16457 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16458 && REG_P (XEXP (SET_SRC (curr_set), 0))
16459 && REGNO (XEXP (SET_SRC (curr_set), 0))
16460 == REGNO (SET_DEST (prev_set))
16461 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16462 XEXP (SET_SRC (curr_set), 1)))
16463 return true;
16464 }
16465 }
16466
16467 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16468 {
16469
16470 /* We're trying to match:
16471 prev (movk) == (set (zero_extract (reg r0)
16472 (const_int 16)
16473 (const_int 32))
16474 (const_int imm16_1))
16475 curr (movk) == (set (zero_extract (reg r0)
16476 (const_int 16)
16477 (const_int 48))
16478 (const_int imm16_2)) */
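/* In assembly terms this is roughly:
     movk x0, #imm16_1, lsl 32
     movk x0, #imm16_2, lsl 48 */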
16479
16480 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16481 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16482 && REG_P (XEXP (SET_DEST (prev_set), 0))
16483 && REG_P (XEXP (SET_DEST (curr_set), 0))
16484 && REGNO (XEXP (SET_DEST (prev_set), 0))
16485 == REGNO (XEXP (SET_DEST (curr_set), 0))
16486 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16487 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16488 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16489 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16490 && CONST_INT_P (SET_SRC (prev_set))
16491 && CONST_INT_P (SET_SRC (curr_set)))
16492 return true;
16493
16494 }
16495 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16496 {
16497 /* We're trying to match:
16498 prev (adrp) == (set (reg r0)
16499 (high (symbol_ref ("SYM"))))
16500 curr (ldr) == (set (reg r1)
16501 (mem (lo_sum (reg r0)
16502 (symbol_ref ("SYM")))))
16503 or
16504 curr (ldr) == (set (reg r1)
16505 (zero_extend (mem
16506 (lo_sum (reg r0)
16507 (symbol_ref ("SYM")))))) */
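/* In assembly terms this is roughly:
     adrp x0, SYM
     ldr  x1, [x0, :lo12:SYM] */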
16508 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16509 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16510 {
16511 rtx curr_src = SET_SRC (curr_set);
16512
16513 if (GET_CODE (curr_src) == ZERO_EXTEND)
16514 curr_src = XEXP (curr_src, 0);
16515
16516 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16517 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16518 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16519 == REGNO (SET_DEST (prev_set))
16520 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16521 XEXP (SET_SRC (prev_set), 0)))
16522 return true;
16523 }
16524 }
16525
16526 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16527 && aarch_crypto_can_dual_issue (prev, curr))
16528 return true;
16529
16530 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16531 && any_condjump_p (curr))
16532 {
16533 unsigned int condreg1, condreg2;
16534 rtx cc_reg_1;
16535 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16536 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16537
16538 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16539 && prev
16540 && modified_in_p (cc_reg_1, prev))
16541 {
16542 enum attr_type prev_type = get_attr_type (prev);
16543
16544 /* FIXME: this misses some instructions that ThunderX considers simple
16545 arithmetic. Simple shifts are also missed here. */
16546 if (prev_type == TYPE_ALUS_SREG
16547 || prev_type == TYPE_ALUS_IMM
16548 || prev_type == TYPE_LOGICS_REG
16549 || prev_type == TYPE_LOGICS_IMM)
16550 return true;
16551 }
16552 }
16553
16554 if (prev_set
16555 && curr_set
16556 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16557 && any_condjump_p (curr))
16558 {
16559 /* We're trying to match:
16560 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16561 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16562 (const_int 0))
16563 (label_ref ("SYM"))
16564 (pc)) */
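/* In assembly terms this is roughly an ALU instruction such as
   "add w0, w0, #1" followed by "cbz w0, .L1" or "cbnz w0, .L1". */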
16565 if (SET_DEST (curr_set) == (pc_rtx)
16566 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16567 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16568 && REG_P (SET_DEST (prev_set))
16569 && REGNO (SET_DEST (prev_set))
16570 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16571 {
16572 /* Fuse ALU operations followed by conditional branch instruction. */
16573 switch (get_attr_type (prev))
16574 {
16575 case TYPE_ALU_IMM:
16576 case TYPE_ALU_SREG:
16577 case TYPE_ADC_REG:
16578 case TYPE_ADC_IMM:
16579 case TYPE_ADCS_REG:
16580 case TYPE_ADCS_IMM:
16581 case TYPE_LOGIC_REG:
16582 case TYPE_LOGIC_IMM:
16583 case TYPE_CSEL:
16584 case TYPE_ADR:
16585 case TYPE_MOV_IMM:
16586 case TYPE_SHIFT_REG:
16587 case TYPE_SHIFT_IMM:
16588 case TYPE_BFM:
16589 case TYPE_RBIT:
16590 case TYPE_REV:
16591 case TYPE_EXTEND:
16592 return true;
16593
16594 default:;
16595 }
16596 }
16597 }
16598
16599 return false;
16600 }
16601
16602 /* Return true iff the instruction fusion described by OP is enabled. */
16603
16604 bool
16605 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16606 {
16607 return (aarch64_tune_params.fusible_ops & op) != 0;
16608 }
16609
16610 /* If MEM is in the form of [base+offset], extract the two parts
16611 of the address and store them in BASE and OFFSET; otherwise return false
16612 after clearing BASE and OFFSET. */
16613
16614 bool
16615 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16616 {
16617 rtx addr;
16618
16619 gcc_assert (MEM_P (mem));
16620
16621 addr = XEXP (mem, 0);
16622
16623 if (REG_P (addr))
16624 {
16625 *base = addr;
16626 *offset = const0_rtx;
16627 return true;
16628 }
16629
16630 if (GET_CODE (addr) == PLUS
16631 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16632 {
16633 *base = XEXP (addr, 0);
16634 *offset = XEXP (addr, 1);
16635 return true;
16636 }
16637
16638 *base = NULL_RTX;
16639 *offset = NULL_RTX;
16640
16641 return false;
16642 }
16643
16644 /* Types for scheduling fusion. */
16645 enum sched_fusion_type
16646 {
16647 SCHED_FUSION_NONE = 0,
16648 SCHED_FUSION_LD_SIGN_EXTEND,
16649 SCHED_FUSION_LD_ZERO_EXTEND,
16650 SCHED_FUSION_LD,
16651 SCHED_FUSION_ST,
16652 SCHED_FUSION_NUM
16653 };
16654
16655 /* If INSN is a load or store of address in the form of [base+offset],
16656 extract the two parts and store them in BASE and OFFSET. Return the
16657 scheduling fusion type of this INSN. */
16658
16659 static enum sched_fusion_type
16660 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16661 {
16662 rtx x, dest, src;
16663 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16664
16665 gcc_assert (INSN_P (insn));
16666 x = PATTERN (insn);
16667 if (GET_CODE (x) != SET)
16668 return SCHED_FUSION_NONE;
16669
16670 src = SET_SRC (x);
16671 dest = SET_DEST (x);
16672
16673 machine_mode dest_mode = GET_MODE (dest);
16674
16675 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16676 return SCHED_FUSION_NONE;
16677
16678 if (GET_CODE (src) == SIGN_EXTEND)
16679 {
16680 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16681 src = XEXP (src, 0);
16682 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16683 return SCHED_FUSION_NONE;
16684 }
16685 else if (GET_CODE (src) == ZERO_EXTEND)
16686 {
16687 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16688 src = XEXP (src, 0);
16689 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16690 return SCHED_FUSION_NONE;
16691 }
16692
16693 if (GET_CODE (src) == MEM && REG_P (dest))
16694 extract_base_offset_in_addr (src, base, offset);
16695 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16696 {
16697 fusion = SCHED_FUSION_ST;
16698 extract_base_offset_in_addr (dest, base, offset);
16699 }
16700 else
16701 return SCHED_FUSION_NONE;
16702
16703 if (*base == NULL_RTX || *offset == NULL_RTX)
16704 fusion = SCHED_FUSION_NONE;
16705
16706 return fusion;
16707 }
16708
16709 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16710
16711 Currently we only support fusing ldr or str instructions, so FUSION_PRI
16712 and PRI are only calculated for these instructions. For other instructions,
16713 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16714 types of instruction fusion can be added by returning different priorities.
16715
16716 It's important that irrelevant instructions get the largest FUSION_PRI. */
16717
16718 static void
16719 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16720 int *fusion_pri, int *pri)
16721 {
16722 int tmp, off_val;
16723 rtx base, offset;
16724 enum sched_fusion_type fusion;
16725
16726 gcc_assert (INSN_P (insn));
16727
16728 tmp = max_pri - 1;
16729 fusion = fusion_load_store (insn, &base, &offset);
16730 if (fusion == SCHED_FUSION_NONE)
16731 {
16732 *pri = tmp;
16733 *fusion_pri = tmp;
16734 return;
16735 }
16736
16737 /* Set FUSION_PRI according to fusion type and base register. */
16738 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16739
16740 /* Calculate PRI. */
16741 tmp /= 2;
16742
16743 /* INSN with smaller offset goes first. */
16744 off_val = (int)(INTVAL (offset));
16745 if (off_val >= 0)
16746 tmp -= (off_val & 0xfffff);
16747 else
16748 tmp += ((- off_val) & 0xfffff);
16749
16750 *pri = tmp;
16751 return;
16752 }
16753
16754 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16755 Adjust priority of sha1h instructions so they are scheduled before
16756 other SHA1 instructions. */
16757
16758 static int
16759 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16760 {
16761 rtx x = PATTERN (insn);
16762
16763 if (GET_CODE (x) == SET)
16764 {
16765 x = SET_SRC (x);
16766
16767 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16768 return priority + 10;
16769 }
16770
16771 return priority;
16772 }
16773
16774 /* Given OPERANDS of consecutive load/store, check if we can merge
16775 them into ldp/stp. LOAD is true if they are load instructions.
16776 MODE is the mode of memory operands. */
16777
16778 bool
16779 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16780 machine_mode mode)
16781 {
16782 HOST_WIDE_INT offval_1, offval_2, msize;
16783 enum reg_class rclass_1, rclass_2;
16784 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16785
16786 if (load)
16787 {
16788 mem_1 = operands[1];
16789 mem_2 = operands[3];
16790 reg_1 = operands[0];
16791 reg_2 = operands[2];
16792 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16793 if (REGNO (reg_1) == REGNO (reg_2))
16794 return false;
16795 }
16796 else
16797 {
16798 mem_1 = operands[0];
16799 mem_2 = operands[2];
16800 reg_1 = operands[1];
16801 reg_2 = operands[3];
16802 }
16803
16804 /* The mems cannot be volatile. */
16805 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16806 return false;
16807
16808 /* If we have SImode and slow unaligned ldp,
16809 check that the alignment is at least 8 bytes. */
16810 if (mode == SImode
16811 && (aarch64_tune_params.extra_tuning_flags
16812 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16813 && !optimize_size
16814 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16815 return false;
16816
16817 /* Check if the addresses are in the form of [base+offset]. */
16818 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16819 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16820 return false;
16821 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16822 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16823 return false;
16824
16825 /* Check if the bases are the same. */
16826 if (!rtx_equal_p (base_1, base_2))
16827 return false;
16828
16829 offval_1 = INTVAL (offset_1);
16830 offval_2 = INTVAL (offset_2);
16831 /* We should only be trying this for fixed-sized modes. There is no
16832 SVE LDP/STP instruction. */
16833 msize = GET_MODE_SIZE (mode).to_constant ();
16834 /* Check if the offsets are consecutive. */
16835 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16836 return false;
16837
16838 /* Check if the addresses are clobbered by load. */
16839 if (load)
16840 {
16841 if (reg_mentioned_p (reg_1, mem_1))
16842 return false;
16843
16844 /* In increasing order, the last load can clobber the address. */
16845 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16846 return false;
16847 }
16848
16849 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16850 rclass_1 = FP_REGS;
16851 else
16852 rclass_1 = GENERAL_REGS;
16853
16854 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16855 rclass_2 = FP_REGS;
16856 else
16857 rclass_2 = GENERAL_REGS;
16858
16859 /* Check if the registers are of the same class. */
16860 if (rclass_1 != rclass_2)
16861 return false;
16862
16863 return true;
16864 }
16865
16866 /* Given OPERANDS of consecutive load/store, check if we can merge
16867 them into ldp/stp by adjusting the offset. LOAD is true if they
16868 are load instructions. MODE is the mode of memory operands.
16869
16870 Given the following consecutive stores:
16871
16872 str w1, [xb, 0x100]
16873 str w1, [xb, 0x104]
16874 str w1, [xb, 0x108]
16875 str w1, [xb, 0x10c]
16876
16877 Though the offsets are out of the range supported by stp, we can
16878 still pair them after adjusting the offset, like:
16879
16880 add scratch, xb, 0x100
16881 stp w1, w1, [scratch]
16882 stp w1, w1, [scratch, 0x8]
16883
16884 The peephole patterns detecting this opportunity should guarantee
16885 the scratch register is available. */
16886
16887 bool
16888 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16889 scalar_mode mode)
16890 {
16891 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16892 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16893 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16894 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16895
16896 if (load)
16897 {
16898 reg_1 = operands[0];
16899 mem_1 = operands[1];
16900 reg_2 = operands[2];
16901 mem_2 = operands[3];
16902 reg_3 = operands[4];
16903 mem_3 = operands[5];
16904 reg_4 = operands[6];
16905 mem_4 = operands[7];
16906 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16907 && REG_P (reg_3) && REG_P (reg_4));
16908 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16909 return false;
16910 }
16911 else
16912 {
16913 mem_1 = operands[0];
16914 reg_1 = operands[1];
16915 mem_2 = operands[2];
16916 reg_2 = operands[3];
16917 mem_3 = operands[4];
16918 reg_3 = operands[5];
16919 mem_4 = operands[6];
16920 reg_4 = operands[7];
16921 }
16922 /* Skip if the memory operand is by itself valid for ldp/stp. */
16923 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16924 return false;
16925
16926 /* The mems cannot be volatile. */
16927 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16928 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16929 return false;
16930
16931 /* Check if the addresses are in the form of [base+offset]. */
16932 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16933 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16934 return false;
16935 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16936 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16937 return false;
16938 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16939 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16940 return false;
16941 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16942 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16943 return false;
16944
16945 /* Check if the bases are the same. */
16946 if (!rtx_equal_p (base_1, base_2)
16947 || !rtx_equal_p (base_2, base_3)
16948 || !rtx_equal_p (base_3, base_4))
16949 return false;
16950
16951 offval_1 = INTVAL (offset_1);
16952 offval_2 = INTVAL (offset_2);
16953 offval_3 = INTVAL (offset_3);
16954 offval_4 = INTVAL (offset_4);
16955 msize = GET_MODE_SIZE (mode);
16956 /* Check if the offsets are consecutive. */
16957 if ((offval_1 != (offval_2 + msize)
16958 || offval_1 != (offval_3 + msize * 2)
16959 || offval_1 != (offval_4 + msize * 3))
16960 && (offval_4 != (offval_3 + msize)
16961 || offval_4 != (offval_2 + msize * 2)
16962 || offval_4 != (offval_1 + msize * 3)))
16963 return false;
16964
16965 /* Check if the addresses are clobbered by load. */
16966 if (load)
16967 {
16968 if (reg_mentioned_p (reg_1, mem_1)
16969 || reg_mentioned_p (reg_2, mem_2)
16970 || reg_mentioned_p (reg_3, mem_3))
16971 return false;
16972
16973 /* In increasing order, the last load can clobber the address. */
16974 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16975 return false;
16976 }
16977
16978 /* If we have SImode and slow unaligned ldp,
16979 check that the alignment is at least 8 bytes. */
16980 if (mode == SImode
16981 && (aarch64_tune_params.extra_tuning_flags
16982 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16983 && !optimize_size
16984 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16985 return false;
16986
16987 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16988 rclass_1 = FP_REGS;
16989 else
16990 rclass_1 = GENERAL_REGS;
16991
16992 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16993 rclass_2 = FP_REGS;
16994 else
16995 rclass_2 = GENERAL_REGS;
16996
16997 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16998 rclass_3 = FP_REGS;
16999 else
17000 rclass_3 = GENERAL_REGS;
17001
17002 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17003 rclass_4 = FP_REGS;
17004 else
17005 rclass_4 = GENERAL_REGS;
17006
17007 /* Check if the registers are of the same class. */
17008 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17009 return false;
17010
17011 return true;
17012 }
17013
17014 /* Given OPERANDS of consecutive load/store, this function pairs them
17015 into ldp/stp after adjusting the offset. It depends on the fact
17016 that addresses of load/store instructions are in increasing order.
17017 MODE is the mode of memory operands. CODE is the rtl operator
17018 which should be applied to all memory operands; it is SIGN_EXTEND,
17019 ZERO_EXTEND or UNKNOWN. */
17020
17021 bool
17022 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17023 scalar_mode mode, RTX_CODE code)
17024 {
17025 rtx base, offset, t1, t2;
17026 rtx mem_1, mem_2, mem_3, mem_4;
17027 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
17028
17029 if (load)
17030 {
17031 mem_1 = operands[1];
17032 mem_2 = operands[3];
17033 mem_3 = operands[5];
17034 mem_4 = operands[7];
17035 }
17036 else
17037 {
17038 mem_1 = operands[0];
17039 mem_2 = operands[2];
17040 mem_3 = operands[4];
17041 mem_4 = operands[6];
17042 gcc_assert (code == UNKNOWN);
17043 }
17044
17045 extract_base_offset_in_addr (mem_1, &base, &offset);
17046 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
17047
17048 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
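/* For example, an SImode offset of 0x104 gives stp_off_limit == 0x100,
   new_off == 4 and adj_off == 0x100. */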
17049 msize = GET_MODE_SIZE (mode);
17050 stp_off_limit = msize * 0x40;
17051 off_val = INTVAL (offset);
17052 abs_off = (off_val < 0) ? -off_val : off_val;
17053 new_off = abs_off % stp_off_limit;
17054 adj_off = abs_off - new_off;
17055
17056 /* Further adjust to make sure all offsets are OK. */
17057 if ((new_off + msize * 2) >= stp_off_limit)
17058 {
17059 adj_off += stp_off_limit;
17060 new_off -= stp_off_limit;
17061 }
17062
17063 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17064 if (adj_off >= 0x1000)
17065 return false;
17066
17067 if (off_val < 0)
17068 {
17069 adj_off = -adj_off;
17070 new_off = -new_off;
17071 }
17072
17073 /* Create new memory references. */
17074 mem_1 = change_address (mem_1, VOIDmode,
17075 plus_constant (DImode, operands[8], new_off));
17076
17077 /* Check if the adjusted address is OK for ldp/stp. */
17078 if (!aarch64_mem_pair_operand (mem_1, mode))
17079 return false;
17080
17081 msize = GET_MODE_SIZE (mode);
17082 mem_2 = change_address (mem_2, VOIDmode,
17083 plus_constant (DImode,
17084 operands[8],
17085 new_off + msize));
17086 mem_3 = change_address (mem_3, VOIDmode,
17087 plus_constant (DImode,
17088 operands[8],
17089 new_off + msize * 2));
17090 mem_4 = change_address (mem_4, VOIDmode,
17091 plus_constant (DImode,
17092 operands[8],
17093 new_off + msize * 3));
17094
17095 if (code == ZERO_EXTEND)
17096 {
17097 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17098 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17099 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17100 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17101 }
17102 else if (code == SIGN_EXTEND)
17103 {
17104 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17105 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17106 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17107 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17108 }
17109
17110 if (load)
17111 {
17112 operands[1] = mem_1;
17113 operands[3] = mem_2;
17114 operands[5] = mem_3;
17115 operands[7] = mem_4;
17116 }
17117 else
17118 {
17119 operands[0] = mem_1;
17120 operands[2] = mem_2;
17121 operands[4] = mem_3;
17122 operands[6] = mem_4;
17123 }
17124
17125 /* Emit adjusting instruction. */
17126 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17127 /* Emit ldp/stp instructions. */
17128 t1 = gen_rtx_SET (operands[0], operands[1]);
17129 t2 = gen_rtx_SET (operands[2], operands[3]);
17130 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17131 t1 = gen_rtx_SET (operands[4], operands[5]);
17132 t2 = gen_rtx_SET (operands[6], operands[7]);
17133 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17134 return true;
17135 }
17136
17137 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17138 it isn't worth branching around empty masked ops (including masked
17139 stores). */
17140
17141 static bool
17142 aarch64_empty_mask_is_expensive (unsigned)
17143 {
17144 return false;
17145 }
17146
17147 /* Return true if a pseudo register should be created and used to hold
17148 the GOT address for PIC code. */
17149
17150 bool
17151 aarch64_use_pseudo_pic_reg (void)
17152 {
17153 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17154 }
17155
17156 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17157
17158 static int
17159 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17160 {
17161 switch (XINT (x, 1))
17162 {
17163 case UNSPEC_GOTSMALLPIC:
17164 case UNSPEC_GOTSMALLPIC28K:
17165 case UNSPEC_GOTTINYPIC:
17166 return 0;
17167 default:
17168 break;
17169 }
17170
17171 return default_unspec_may_trap_p (x, flags);
17172 }
17173
17174
17175 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17176 return the log2 of that value. Otherwise return -1. */
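/* For example, a CONST_DOUBLE of 8.0 yields 3, while 3.0 and 0.5 yield -1. */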
17177
17178 int
17179 aarch64_fpconst_pow_of_2 (rtx x)
17180 {
17181 const REAL_VALUE_TYPE *r;
17182
17183 if (!CONST_DOUBLE_P (x))
17184 return -1;
17185
17186 r = CONST_DOUBLE_REAL_VALUE (x);
17187
17188 if (REAL_VALUE_NEGATIVE (*r)
17189 || REAL_VALUE_ISNAN (*r)
17190 || REAL_VALUE_ISINF (*r)
17191 || !real_isinteger (r, DFmode))
17192 return -1;
17193
17194 return exact_log2 (real_to_integer (r));
17195 }
17196
17197 /* If X is a vector of equal CONST_DOUBLE values and that value is
17198 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17199
17200 int
17201 aarch64_vec_fpconst_pow_of_2 (rtx x)
17202 {
17203 int nelts;
17204 if (GET_CODE (x) != CONST_VECTOR
17205 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17206 return -1;
17207
17208 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17209 return -1;
17210
17211 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17212 if (firstval <= 0)
17213 return -1;
17214
17215 for (int i = 1; i < nelts; i++)
17216 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17217 return -1;
17218
17219 return firstval;
17220 }
17221
17222 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17223 to float.
17224
17225 __fp16 always promotes through this hook.
17226 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17227 through the generic excess precision logic rather than here. */
17228
17229 static tree
17230 aarch64_promoted_type (const_tree t)
17231 {
17232 if (SCALAR_FLOAT_TYPE_P (t)
17233 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17234 return float_type_node;
17235
17236 return NULL_TREE;
17237 }
17238
17239 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17240
17241 static bool
17242 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17243 optimization_type opt_type)
17244 {
17245 switch (op)
17246 {
17247 case rsqrt_optab:
17248 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17249
17250 default:
17251 return true;
17252 }
17253 }
17254
17255 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17256
17257 static unsigned int
17258 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17259 int *offset)
17260 {
17261 /* Polynomial invariant 1 == (VG / 2) - 1. */
17262 gcc_assert (i == 1);
17263 *factor = 2;
17264 *offset = 1;
17265 return AARCH64_DWARF_VG;
17266 }
17267
17268 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17269 if MODE is HFmode, and punt to the generic implementation otherwise. */
17270
17271 static bool
17272 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17273 {
17274 return (mode == HFmode
17275 ? true
17276 : default_libgcc_floating_mode_supported_p (mode));
17277 }
17278
17279 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17280 if MODE is HFmode, and punt to the generic implementation otherwise. */
17281
17282 static bool
17283 aarch64_scalar_mode_supported_p (scalar_mode mode)
17284 {
17285 return (mode == HFmode
17286 ? true
17287 : default_scalar_mode_supported_p (mode));
17288 }
17289
17290 /* Set the value of FLT_EVAL_METHOD.
17291 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17292
17293 0: evaluate all operations and constants, whose semantic type has at
17294 most the range and precision of type float, to the range and
17295 precision of float; evaluate all other operations and constants to
17296 the range and precision of the semantic type;
17297
17298 N, where _FloatN is a supported interchange floating type
17299 evaluate all operations and constants, whose semantic type has at
17300 most the range and precision of _FloatN type, to the range and
17301 precision of the _FloatN type; evaluate all other operations and
17302 constants to the range and precision of the semantic type;
17303
17304 If we have the ARMv8.2-A extensions then we support _Float16 in native
17305 precision, so we should set this to 16. Otherwise, we support the type,
17306 but want to evaluate expressions in float precision, so set this to
17307 0. */
17308
17309 static enum flt_eval_method
17310 aarch64_excess_precision (enum excess_precision_type type)
17311 {
17312 switch (type)
17313 {
17314 case EXCESS_PRECISION_TYPE_FAST:
17315 case EXCESS_PRECISION_TYPE_STANDARD:
17316 /* We can calculate either in 16-bit range and precision or
17317 32-bit range and precision. Make that decision based on whether
17318 we have native support for the ARMv8.2-A 16-bit floating-point
17319 instructions or not. */
17320 return (TARGET_FP_F16INST
17321 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17322 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17323 case EXCESS_PRECISION_TYPE_IMPLICIT:
17324 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17325 default:
17326 gcc_unreachable ();
17327 }
17328 return FLT_EVAL_METHOD_UNPREDICTABLE;
17329 }
17330
17331 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17332 scheduled for speculative execution. Reject the long-running division
17333 and square-root instructions. */
17334
17335 static bool
17336 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17337 {
17338 switch (get_attr_type (insn))
17339 {
17340 case TYPE_SDIV:
17341 case TYPE_UDIV:
17342 case TYPE_FDIVS:
17343 case TYPE_FDIVD:
17344 case TYPE_FSQRTS:
17345 case TYPE_FSQRTD:
17346 case TYPE_NEON_FP_SQRT_S:
17347 case TYPE_NEON_FP_SQRT_D:
17348 case TYPE_NEON_FP_SQRT_S_Q:
17349 case TYPE_NEON_FP_SQRT_D_Q:
17350 case TYPE_NEON_FP_DIV_S:
17351 case TYPE_NEON_FP_DIV_D:
17352 case TYPE_NEON_FP_DIV_S_Q:
17353 case TYPE_NEON_FP_DIV_D_Q:
17354 return false;
17355 default:
17356 return true;
17357 }
17358 }
17359
17360 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17361
17362 static int
aarch64_compute_pressure_classes(reg_class * classes)17363 aarch64_compute_pressure_classes (reg_class *classes)
17364 {
17365 int i = 0;
17366 classes[i++] = GENERAL_REGS;
17367 classes[i++] = FP_REGS;
17368 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17369 registers need to go in PR_LO_REGS at some point during their
17370 lifetime. Splitting it into two halves has the effect of making
17371 all predicates count against PR_LO_REGS, so that we try whenever
17372 possible to restrict the number of live predicates to 8. This
17373 greatly reduces the amount of spilling in certain loops. */
17374 classes[i++] = PR_LO_REGS;
17375 classes[i++] = PR_HI_REGS;
17376 return i;
17377 }
17378
17379 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17380
17381 static bool
aarch64_can_change_mode_class(machine_mode from,machine_mode to,reg_class_t)17382 aarch64_can_change_mode_class (machine_mode from,
17383 machine_mode to, reg_class_t)
17384 {
17385 if (BYTES_BIG_ENDIAN)
17386 {
17387 bool from_sve_p = aarch64_sve_data_mode_p (from);
17388 bool to_sve_p = aarch64_sve_data_mode_p (to);
17389
17390 /* Don't allow changes between SVE data modes and non-SVE modes.
17391 See the comment at the head of aarch64-sve.md for details. */
17392 if (from_sve_p != to_sve_p)
17393 return false;
17394
17395 /* Don't allow changes in element size: lane 0 of the new vector
17396 would not then be lane 0 of the old vector. See the comment
17397 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17398 description.
17399
17400 In the worst case, this forces a register to be spilled in
17401 one mode and reloaded in the other, which handles the
17402 endianness correctly. */
17403 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17404 return false;
17405 }
17406 return true;
17407 }
17408
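/* For illustration only (a sketch, not taken from a real dump): on a
   big-endian target the hook above rejects a mode change such as

     (subreg:VNx8HI (reg:VNx4SI z0) 0)

   because the element size changes from 4 bytes to 2, so lane 0 of the
   VNx8HI view would not correspond to lane 0 of the original VNx4SI
   value.  Forcing the value through memory instead keeps the lane
   numbering consistent with the in-memory layout.  */
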
/* Implement TARGET_SELECT_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    {
      machine_mode mode = (machine_mode) i;
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_ANY_SVE)
	bitmap_set_bit (modes, i);
    }
}

/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

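/* For illustration only (a hand-written sketch, not compiler output):
   with anchor offsets restricted to the range [-256, 4095], several
   nearby small globals can share a single address computation, e.g.

     adrp	x0, .LANCHOR0
     add	x0, x0, :lo12:.LANCHOR0
     ldr	w1, [x0, #8]	   // object placed at anchor + 8
     ldrb	w2, [x0, #4095]	   // last byte reachable from the anchor

   Objects outside that window are addressed from another anchor or
   individually.  */
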
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"