1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80
/* Classifies an address.

   ADDRESS_REG_IMM
       A simple base register plus immediate offset.

   ADDRESS_REG_WB
       A base register indexed by immediate offset with writeback.

   ADDRESS_REG_REG
       A base register indexed by (optionally scaled) register.

   ADDRESS_REG_UXTW
       A base register indexed by (optionally scaled) zero-extended register.

   ADDRESS_REG_SXTW
       A base register indexed by (optionally scaled) sign-extended register.

   ADDRESS_LO_SUM
       A LO_SUM rtx with a base register and "LO12" symbol relocation.

   ADDRESS_SYMBOLIC:
       A constant symbolic address, in pc-relative literal pool.  */

enum aarch64_address_type {
  ADDRESS_REG_IMM,
  ADDRESS_REG_WB,
  ADDRESS_REG_REG,
  ADDRESS_REG_UXTW,
  ADDRESS_REG_SXTW,
  ADDRESS_LO_SUM,
  ADDRESS_SYMBOLIC
};
113
/* The decomposition of an address into its classified parts; see the
   aarch64_address_type commentary above for the possible forms.  */
struct aarch64_address_info {
  /* Which of the aarch64_address_type forms the address takes.  */
  enum aarch64_address_type type;
  /* The base register, where the form has one.  */
  rtx base;
  /* The immediate or index-register offset, where the form has one.  */
  rtx offset;
  /* The offset as a (poly_int) constant, for constant-offset forms.  */
  poly_int64 const_offset;
  /* Scale applied to the index, as a left-shift amount.  */
  int shift;
  /* Classification of the symbol, for symbolic address forms.  */
  enum aarch64_symbol_type symbol_type;
};
122
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  /* The move instruction used to load the immediate: a plain move of
     the value (MOV) or a move of its bitwise inverse (MVN).  */
  enum insn_type { MOV, MVN };
  /* The kind of shift applied by the move, if any.  */
  enum modifier_type { LSL, MSL };

  /* Default constructor leaves all fields uninitialized; callers are
     expected to fill them in (or use one of the constructors below).  */
  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The value of each element if all elements are the same, or the
     first value if the constant is a series.  */
  rtx value;

  /* The value of the step if the constant is a series, null otherwise.  */
  rtx step;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  /* The kind of shift modifier to use, and the number of bits to shift.
     This is (LSL, 0) if no shift is needed.  */
  modifier_type modifier;
  unsigned int shift;
};
154
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
    modifier (LSL), shift (0)
{}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
    step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
{}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to VALUE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
  : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
    modifier (LSL), shift (0)
{}
182
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

/* Forward declarations of static helper routines.  */
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
unsigned long aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  /* The flag name as written by the user.  */
  const char* name;
  /* The bitmask that the name stands for.  */
  unsigned int flag;
};
230
/* Mapping from fusion-pair names to AARCH64_FUSE_* masks, built from
   aarch64-fusion-pairs.def and used by aarch64_parse_fuse_string.  */
#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

/* Mapping from extra-tuning-flag names to AARCH64_EXTRA_TUNE_* masks,
   built from aarch64-tuning-flags.def and used by
   aarch64_parse_tune_string.  */
#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
250
/* Tuning parameters.  */

/* Generic address-mode cost table.  */
static const struct cpu_addrcost_table generic_addrcost_table =
{
    {
      1, /* hi  */
      0, /* si  */
      0, /* di  */
      1, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

/* Address-mode costs for Exynos M1.  */
static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
    {
      0, /* hi  */
      0, /* si  */
      0, /* di  */
      2, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
  0, /* imm_offset  */
};

/* Address-mode costs for X-Gene 1.  */
static const struct cpu_addrcost_table xgene1_addrcost_table =
{
    {
      1, /* hi  */
      0, /* si  */
      0, /* di  */
      1, /* ti  */
    },
  1, /* pre_modify  */
  0, /* post_modify  */
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
  0, /* imm_offset  */
};

/* Address-mode costs for ThunderX2 T99.  */
static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
    {
      1, /* hi  */
      1, /* si  */
      1, /* di  */
      2, /* ti  */
    },
  0, /* pre_modify  */
  0, /* post_modify  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  0, /* imm_offset  */
};

/* Address-mode costs for QDF24xx.  */
static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
    {
      1, /* hi  */
      1, /* si  */
      1, /* di  */
      2, /* ti  */
    },
  1, /* pre_modify  */
  1, /* post_modify  */
  3, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  2, /* imm_offset  */
};
332
/* Generic register-move costs.  */
static const struct cpu_regmove_cost generic_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

/* Register-move costs for Cortex-A57.  */
static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

/* Register-move costs for Cortex-A53.  */
static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

/* Register-move costs for Exynos M1.  */
static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
  9, /* GP2FP  */
  9, /* FP2GP  */
  1 /* FP2FP  */
};

/* Register-move costs for ThunderX.  */
static const struct cpu_regmove_cost thunderx_regmove_cost =
{
  2, /* GP2GP  */
  2, /* GP2FP  */
  6, /* FP2GP  */
  4 /* FP2FP  */
};

/* Register-move costs for X-Gene 1.  */
static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  8, /* GP2FP  */
  8, /* FP2GP  */
  2 /* FP2FP  */
};

/* Register-move costs for QDF24xx.  */
static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  2, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  6, /* GP2FP  */
  6, /* FP2GP  */
  4 /* FP2FP  */
};

/* Register-move costs for ThunderX2 T99.  */
static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  8, /* GP2FP  */
  8, /* FP2GP  */
  4 /* FP2FP  */
};
408
/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  4, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  5, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  3 /* cond_not_taken_branch_cost  */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Exynos M1 costs for vector insn classes.  */
static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2, /* vec_unalign_store_cost  */
  2, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  6, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  5, /* vec_int_stmt_cost  */
  6, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  8, /* vec_align_load_cost  */
  8, /* vec_unalign_load_cost  */
  4, /* vec_unalign_store_cost  */
  4, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};
547
/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1, /* Predictable.  */
  3 /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_NONE, /* sqrt  */
  AARCH64_APPROX_NONE /* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_ALL, /* sqrt  */
  AARCH64_APPROX_ALL /* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE, /* division  */
  AARCH64_APPROX_NONE, /* sqrt  */
  AARCH64_APPROX_ALL /* recip_sqrt  */
};

/* Prefetch tuning tables.  NOTE(review): -1 appears to mean "use the
   default for this parameter"; cache sizes look like kilobytes and line
   sizes like bytes -- confirm against the consumers of cpu_prefetch_tune.  */

/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  0,			/* num_slots  */
  -1,			/* l1_cache_size  */
  -1,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};

/* Prefetch settings for Exynos M1.  */
static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  0,			/* num_slots  */
  -1,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};

/* Prefetch settings for QDF24xx.  */
static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  4,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  1024,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};

/* Prefetch settings for ThunderX T88.  */
static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  16*1024,		/* l2_cache_size  */
  3			/* default_opt_level  */
};

/* Prefetch settings for ThunderX.  */
static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};

/* Prefetch settings for ThunderX2 T99.  */
static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  -1			/* default_opt_level  */
};
633
/* Generic tuning parameters; also the default tuning set (see
   aarch64_tune_params below).  */
static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  2, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  8,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

/* Tuning parameters for Cortex-A35.  */
static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  1, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

/* Tuning parameters for Cortex-A53.  */
static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  2, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

/* Tuning parameters for Cortex-A57.  */
static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};

/* Tuning parameters for Cortex-A72.  */
static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

/* Tuning parameters for Cortex-A73.  */
static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  2, /* issue_rate.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  16,	/* function_align.  */
  4,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};



/* Tuning parameters for Exynos M1.  */
static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  4,	/* memmov_cost  */
  3,	/* issue_rate  */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  4,	/* function_align.  */
  4,	/* jump_align.  */
  4,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &exynosm1_prefetch_tune
};

/* Tuning parameters for ThunderX T88.  */
static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  6, /* memmov_cost  */
  2, /* issue_rate  */
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  8,	/* function_align.  */
  8,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};

/* Tuning parameters for ThunderX.  */
static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  6, /* memmov_cost  */
  2, /* issue_rate  */
  AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
  8,	/* function_align.  */
  8,	/* jump_align.  */
  8,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};

/* Tuning parameters for X-Gene 1.  */
static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  6, /* memmov_cost  */
  4, /* issue_rate  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  16,	/* function_align.  */
  8,	/* jump_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

/* Tuning parameters for QDF24xx.  */
static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  4, /* issue_rate  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  8,	/* jump_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost  */
  4, /* issue_rate  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  16,	/* function_align.  */
  8,	/* jump_align.  */
  16,	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
  &generic_prefetch_tune
};

/* Tuning parameters for ThunderX2 T99.  */
static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
  16,	/* function_align.  */
  8,	/* jump_align.  */
  16,	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};
971
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  /* Name of the override category as written by the user.  */
  const char* name;
  /* Parser invoked with the user's string and the tuning structure
     to modify in place.  */
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);

/* NULL-terminated table mapping override names to their parsers.  */
static const struct aarch64_tuning_override_function
  aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { NULL, NULL }
};
989
/* A processor implementing AArch64.  */
struct processor
{
  /* Name as accepted on the command line.  */
  const char *const name;
  /* Identifier for the processor itself.  */
  enum aarch64_processor ident;
  /* The core whose scheduling description should be used.  */
  enum aarch64_processor sched_core;
  /* The architecture that the core implements.  */
  enum aarch64_arch arch;
  /* Major revision of that architecture.  */
  unsigned architecture_version;
  /* AARCH64_FL_* feature flags.  */
  const unsigned long flags;
  /* Tuning parameters for this processor (NULL for bare architectures).  */
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64, built from aarch64-arches.def
   and terminated by a NULL-named sentinel entry.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64, built from aarch64-cores.def
   plus an explicit "generic" entry, terminated by a NULL-named
   sentinel entry.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
  all_architectures[AARCH64_ARCH_##ARCH].architecture_version,		\
  FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
    AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};
1023
1024
1025 /* Target specification. These are populated by the -march, -mtune, -mcpu
1026 handling code or by target attributes. */
1027 static const struct processor *selected_arch;
1028 static const struct processor *selected_cpu;
1029 static const struct processor *selected_tune;
1030
1031 /* The current tuning set. */
1032 struct tune_params aarch64_tune_params = generic_tunings;
1033
1034 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1035
1036 /* An ISA extension in the co-processor and main instruction set space. */
1037 struct aarch64_option_extension
1038 {
1039 const char *const name;
1040 const unsigned long flags_on;
1041 const unsigned long flags_off;
1042 };
1043
1044 typedef enum aarch64_cond_code
1045 {
1046 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1047 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1048 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1049 }
1050 aarch64_cc;
1051
1052 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1053
/* The condition codes of the processor, and the inverse function.
   Indexed by aarch64_cc; must stay in the same order as that enum.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
1060
/* Generate code to enable conditional branches in functions over 1 MiB.
   OPERANDS are the insn operands; OPERANDS[POS_LABEL] is the (far) target
   label.  DEST is the prefix used to build a fresh local label and
   BRANCH_FORMAT is the assembler template for the short-range branch
   (presumably with the condition already inverted by the caller — the
   scheme only works if it is).  The emitted sequence is:

	<branch_format> .Lnew	; short branch around the long one
	b	<target>	; unconditional, +/-128 MiB range
   .Lnew:

   Returns "" so it can be used directly from an insn output template.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[128];
  /* Build the local label name from DEST and the fresh label number.  */
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  /* Temporarily redirect OPERANDS[POS_LABEL] at the new local label.  */
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  /* Emit the unconditional branch to the real target, then define the
     local label, after restoring OPERANDS[POS_LABEL].  */
  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
1083
1084 void
aarch64_err_no_fpadvsimd(machine_mode mode,const char * msg)1085 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1086 {
1087 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1088 if (TARGET_GENERAL_REGS_ONLY)
1089 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1090 else
1091 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1092 }
1093
1094 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1095 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1096 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1097 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1098 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1099 irrespectively of its cost results in bad allocations with many redundant
1100 int<->FP moves which are expensive on various cores.
1101 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1102 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1103 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1104 Otherwise set the allocno class depending on the mode.
1105 The result of this is that it is no longer inefficient to have a higher
1106 memory move cost than the register move cost.
1107 */
1108
1109 static reg_class_t
aarch64_ira_change_pseudo_allocno_class(int regno,reg_class_t allocno_class,reg_class_t best_class)1110 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1111 reg_class_t best_class)
1112 {
1113 machine_mode mode;
1114
1115 if (allocno_class != ALL_REGS)
1116 return allocno_class;
1117
1118 if (best_class != ALL_REGS)
1119 return best_class;
1120
1121 mode = PSEUDO_REGNO_MODE (regno);
1122 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1123 }
1124
1125 static unsigned int
aarch64_min_divisions_for_recip_mul(machine_mode mode)1126 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1127 {
1128 if (GET_MODE_UNIT_SIZE (mode) == 4)
1129 return aarch64_tune_params.min_div_recip_mul_sf;
1130 return aarch64_tune_params.min_div_recip_mul_df;
1131 }
1132
1133 /* Return the reassociation width of treeop OPC with mode MODE. */
1134 static int
aarch64_reassociation_width(unsigned opc,machine_mode mode)1135 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1136 {
1137 if (VECTOR_MODE_P (mode))
1138 return aarch64_tune_params.vec_reassoc_width;
1139 if (INTEGRAL_MODE_P (mode))
1140 return aarch64_tune_params.int_reassoc_width;
1141 /* Avoid reassociating floating point addition so we emit more FMAs. */
1142 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1143 return aarch64_tune_params.fp_reassoc_width;
1144 return 1;
1145 }
1146
1147 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1148 unsigned
aarch64_dbx_register_number(unsigned regno)1149 aarch64_dbx_register_number (unsigned regno)
1150 {
1151 if (GP_REGNUM_P (regno))
1152 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1153 else if (regno == SP_REGNUM)
1154 return AARCH64_DWARF_SP;
1155 else if (FP_REGNUM_P (regno))
1156 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1157 else if (PR_REGNUM_P (regno))
1158 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1159 else if (regno == VG_REGNUM)
1160 return AARCH64_DWARF_VG;
1161
1162 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1163 equivalent DWARF register. */
1164 return DWARF_FRAME_REGISTERS;
1165 }
1166
1167 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1168 static bool
aarch64_advsimd_struct_mode_p(machine_mode mode)1169 aarch64_advsimd_struct_mode_p (machine_mode mode)
1170 {
1171 return (TARGET_SIMD
1172 && (mode == OImode || mode == CImode || mode == XImode));
1173 }
1174
1175 /* Return true if MODE is an SVE predicate mode. */
1176 static bool
aarch64_sve_pred_mode_p(machine_mode mode)1177 aarch64_sve_pred_mode_p (machine_mode mode)
1178 {
1179 return (TARGET_SVE
1180 && (mode == VNx16BImode
1181 || mode == VNx8BImode
1182 || mode == VNx4BImode
1183 || mode == VNx2BImode));
1184 }
1185
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
   a structure of 2, 3 or 4 vectors.  */
const unsigned int VEC_STRUCT   = 8;
/* Useful combinations of the above.  VEC_ANY_SVE covers SVE data and
   predicate modes; VEC_ANY_DATA covers all data (non-predicate) modes.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1196
/* Return a set of flags describing the vector properties of mode MODE.
   Ignore modes that are not supported by the current target.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
{
  /* Advanced SIMD structure modes (OI/CI/XI) are integer modes, not
     VECTOR_MODE_P, so they must be recognised before the generic
     vector checks below.  */
  if (aarch64_advsimd_struct_mode_p (mode))
    return VEC_ADVSIMD | VEC_STRUCT;

  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  scalar_mode inner = GET_MODE_INNER (mode);
  if (VECTOR_MODE_P (mode)
      && (inner == QImode
	  || inner == HImode
	  || inner == HFmode
	  || inner == SImode
	  || inner == SFmode
	  || inner == DImode
	  || inner == DFmode))
    {
      if (TARGET_SVE)
	{
	  /* A single full SVE data vector...  */
	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
	    return VEC_SVE_DATA;
	  /* ...or a tuple of 2, 3 or 4 SVE vectors.  */
	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
	    return VEC_SVE_DATA | VEC_STRUCT;
	}

      /* This includes V1DF but not V1DI (which doesn't exist).  */
      if (TARGET_SIMD
	  && (known_eq (GET_MODE_BITSIZE (mode), 64)
	      || known_eq (GET_MODE_BITSIZE (mode), 128)))
	return VEC_ADVSIMD;
    }

  /* Not a vector mode the current target supports.  */
  return 0;
}
1237
1238 /* Return true if MODE is any of the data vector modes, including
1239 structure modes. */
1240 static bool
aarch64_vector_data_mode_p(machine_mode mode)1241 aarch64_vector_data_mode_p (machine_mode mode)
1242 {
1243 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1244 }
1245
1246 /* Return true if MODE is an SVE data vector mode; either a single vector
1247 or a structure of vectors. */
1248 static bool
aarch64_sve_data_mode_p(machine_mode mode)1249 aarch64_sve_data_mode_p (machine_mode mode)
1250 {
1251 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1252 }
1253
1254 /* Implement target hook TARGET_ARRAY_MODE. */
1255 static opt_machine_mode
aarch64_array_mode(machine_mode mode,unsigned HOST_WIDE_INT nelems)1256 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1257 {
1258 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1259 && IN_RANGE (nelems, 2, 4))
1260 return mode_for_vector (GET_MODE_INNER (mode),
1261 GET_MODE_NUNITS (mode) * nelems);
1262
1263 return opt_machine_mode ();
1264 }
1265
1266 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1267 static bool
aarch64_array_mode_supported_p(machine_mode mode,unsigned HOST_WIDE_INT nelems)1268 aarch64_array_mode_supported_p (machine_mode mode,
1269 unsigned HOST_WIDE_INT nelems)
1270 {
1271 if (TARGET_SIMD
1272 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1273 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1274 && (nelems >= 2 && nelems <= 4))
1275 return true;
1276
1277 return false;
1278 }
1279
1280 /* Return the SVE predicate mode to use for elements that have
1281 ELEM_NBYTES bytes, if such a mode exists. */
1282
1283 opt_machine_mode
aarch64_sve_pred_mode(unsigned int elem_nbytes)1284 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1285 {
1286 if (TARGET_SVE)
1287 {
1288 if (elem_nbytes == 1)
1289 return VNx16BImode;
1290 if (elem_nbytes == 2)
1291 return VNx8BImode;
1292 if (elem_nbytes == 4)
1293 return VNx4BImode;
1294 if (elem_nbytes == 8)
1295 return VNx2BImode;
1296 }
1297 return opt_machine_mode ();
1298 }
1299
1300 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1301
1302 static opt_machine_mode
aarch64_get_mask_mode(poly_uint64 nunits,poly_uint64 nbytes)1303 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1304 {
1305 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1306 {
1307 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1308 machine_mode pred_mode;
1309 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1310 return pred_mode;
1311 }
1312
1313 return default_get_mask_mode (nunits, nbytes);
1314 }
1315
/* Implement TARGET_HARD_REGNO_NREGS.  Return how many hard registers
   starting at REGNO are needed to hold a value of mode MODE.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
      /* SVE data modes are an exact multiple of a vector register;
	 use the runtime size rather than the lower bound.  */
      if (aarch64_sve_data_mode_p (mode))
	return exact_div (GET_MODE_SIZE (mode),
			  BYTES_PER_SVE_VECTOR).to_constant ();
      return CEIL (lowest_size, UNITS_PER_VREG);
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      /* A predicate value never spans more than one P register.  */
      return 1;
    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
  gcc_unreachable ();
}
1343
/* Implement TARGET_HARD_REGNO_MODE_OK.  Return true if a value of mode
   MODE may be stored starting at hard register REGNO.  The checks are
   ordered from most to least specific register.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  /* Condition-code modes live only in the CC register.  */
  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  /* SVE predicate modes go in predicate registers and nowhere else;
     conversely, predicate registers hold nothing but predicates.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_PRED)
    return PR_REGNUM_P (regno);

  if (PR_REGNUM_P (regno))
    return 0;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  /* A general register (or an even/odd pair) can hold anything up to
     16 bytes.  */
  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
    return true;

  if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	/* Structure modes occupy consecutive V registers; make sure
	   the last one does not run past V31.  */
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	/* Scalars always fit; vector modes only if the target
	   actually supports them (vec_flags != 0).  */
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
1385
1386 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1387 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1388 clobbers the top 64 bits when restoring the bottom 64 bits. */
1389
1390 static bool
aarch64_hard_regno_call_part_clobbered(unsigned int regno,machine_mode mode)1391 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1392 {
1393 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1394 }
1395
/* Implement REGMODE_NATURAL_SIZE.  Return the size of the independently
   modifiable unit of a register holding MODE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.c.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
1417
1418 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1419 machine_mode
aarch64_hard_regno_caller_save_mode(unsigned regno,unsigned,machine_mode mode)1420 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1421 machine_mode mode)
1422 {
1423 /* The predicate mode determines which bits are significant and
1424 which are "don't care". Decreasing the number of lanes would
1425 lose data while increasing the number of lanes would make bits
1426 unnecessarily significant. */
1427 if (PR_REGNUM_P (regno))
1428 return mode;
1429 if (known_ge (GET_MODE_SIZE (mode), 4))
1430 return mode;
1431 else
1432 return SImode;
1433 }
1434
1435 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1436 that strcpy from constants will be faster. */
1437
1438 static HOST_WIDE_INT
aarch64_constant_alignment(const_tree exp,HOST_WIDE_INT align)1439 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1440 {
1441 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1442 return MAX (align, BITS_PER_WORD);
1443 return align;
1444 }
1445
/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  AArch64 currently has no
   long-call attribute, so this is always false; kept as a hook so
   aarch64_is_long_call_p has a single point of policy.  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}
1453
1454 /* Return true if calls to symbol-ref SYM should be treated as
1455 long-calls (ie called via a register). */
1456 bool
aarch64_is_long_call_p(rtx sym)1457 aarch64_is_long_call_p (rtx sym)
1458 {
1459 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1460 }
1461
1462 /* Return true if calls to symbol-ref SYM should not go through
1463 plt stubs. */
1464
1465 bool
aarch64_is_noplt_call_p(rtx sym)1466 aarch64_is_noplt_call_p (rtx sym)
1467 {
1468 const_tree decl = SYMBOL_REF_DECL (sym);
1469
1470 if (flag_pic
1471 && decl
1472 && (!flag_plt
1473 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1474 && !targetm.binds_local_p (decl))
1475 return true;
1476
1477 return false;
1478 }
1479
1480 /* Return true if the offsets to a zero/sign-extract operation
1481 represent an expression that matches an extend operation. The
1482 operands represent the paramters from
1483
1484 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1485 bool
aarch64_is_extend_from_extract(scalar_int_mode mode,rtx mult_imm,rtx extract_imm)1486 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1487 rtx extract_imm)
1488 {
1489 HOST_WIDE_INT mult_val, extract_val;
1490
1491 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1492 return false;
1493
1494 mult_val = INTVAL (mult_imm);
1495 extract_val = INTVAL (extract_imm);
1496
1497 if (extract_val > 8
1498 && extract_val < GET_MODE_BITSIZE (mode)
1499 && exact_log2 (extract_val & ~7) > 0
1500 && (extract_val & 7) <= 4
1501 && mult_val == (1 << (extract_val & 7)))
1502 return true;
1503
1504 return false;
1505 }
1506
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  Returns the emitted insn.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}
1514
1515 /* X and Y are two things to compare using CODE. Emit the compare insn and
1516 return the rtx for register 0 in the proper mode. */
1517 rtx
aarch64_gen_compare_reg(RTX_CODE code,rtx x,rtx y)1518 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1519 {
1520 machine_mode mode = SELECT_CC_MODE (code, x, y);
1521 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1522
1523 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1524 return cc_reg;
1525 }
1526
/* Build the SYMBOL_REF for __tls_get_addr.  */

/* Cached libfunc symbol; GTY so the GC treats it as a root.  */
static GTY(()) rtx tls_get_addr_libfunc;

/* Return the (lazily created) SYMBOL_REF for __tls_get_addr.  */
rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}
1538
1539 /* Return the TLS model to use for ADDR. */
1540
1541 static enum tls_model
tls_symbolic_operand_type(rtx addr)1542 tls_symbolic_operand_type (rtx addr)
1543 {
1544 enum tls_model tls_kind = TLS_MODEL_NONE;
1545 if (GET_CODE (addr) == CONST)
1546 {
1547 poly_int64 addend;
1548 rtx sym = strip_offset (addr, &addend);
1549 if (GET_CODE (sym) == SYMBOL_REF)
1550 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1551 }
1552 else if (GET_CODE (addr) == SYMBOL_REF)
1553 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1554
1555 return tls_kind;
1556 }
1557
/* We'll allow lo_sum's in addresses in our legitimate addresses
   so that combine would take care of combining addresses where
   necessary, but for generation purposes, we'll generate the address
   as :
   RTL                               Absolute
   tmp = hi (symbol_ref);            adrp  x1, foo
   dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
                                     nop

   PIC                               TLS
   adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
   ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
                                     bl   __tls_get_addr
                                     nop

   Load TLS symbol, depending on TLS mechanism and TLS access model.

   Global Dynamic - Traditional TLS:
   adrp tmp, :tlsgd:imm
   add  dest, tmp, #:tlsgd_lo12:imm
   bl   __tls_get_addr

   Global Dynamic - TLS Descriptors:
   adrp dest, :tlsdesc:imm
   ldr  tmp, [dest, #:tlsdesc_lo12:imm]
   add  dest, dest, #:tlsdesc_lo12:imm
   blr  tmp
   mrs  tp, tpidr_el0
   add  dest, dest, tp

   Initial Exec:
   mrs  tp, tpidr_el0
   adrp tmp, :gottprel:imm
   ldr  dest, [tmp, #:gottprel_lo12:imm]
   add  dest, dest, tp

   Local Exec:
   mrs  tp, tpidr_el0
   add  t0, tp, #:tprel_hi12:imm, lsl #12
   add  t0, t0, #:tprel_lo12_nc:imm
*/

/* Emit the instruction sequence that loads symbolic reference IMM into
   DEST, according to the symbol classification TYPE.  Where the result
   is a register and a REG_EQUIV note is useful for CSE, one is added.  */
static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
				   enum aarch64_symbol_type type)
{
  switch (type)
    {
    case SYMBOL_SMALL_ABSOLUTE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode.  */
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* Use a fresh temporary for the adrp result when possible so
	   that DEST is only set once, by the lo_sum.  */
	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;
	rtx insn;
	rtx mem;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
	   decide rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  For that case no need to generate the first adrp
	   instruction as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allows 32K GOT table size (but we are
	       using the page base as GOT base, the first page may be wasted,
	       in the worst scenario, there is only 28K space for GOT).

	       The generated instruction sequence for accessing a global
	       variable is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction needed.  But we must initialize
	       pic_offset_table_rtx properly.  We generate initialize insn for
	       every global access, and allow CSE to remove all redundant
	       copies.

	       The final instruction sequence will look like the following
	       for multiple accesses to global variables:

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
		 ...  */

	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	    crtl->uses_pic_offset_table = 1;
	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);

	  }

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    else
	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* The operand is expected to be MEM.  Whenever the related insn
	   pattern changed, above code which calculate mem should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_GOT_4G:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).  */

	rtx insn;
	rtx mem;
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
	    else
	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* Mark the GOT load as constant and non-trapping, as for the
	   28k case above.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_TLSGD:
      {
	rtx_insn *insns;
	machine_mode mode = GET_MODE (dest);
	/* The __tls_get_addr result is returned in x0.  */
	rtx result = gen_rtx_REG (mode, R0_REGNUM);

	start_sequence ();
	if (TARGET_ILP32)
	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
	else
	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
	insns = get_insns ();
	end_sequence ();

	RTL_CONST_CALL_P (insns) = 1;
	emit_libcall_block (insns, dest, result, imm);
	return;
      }

    case SYMBOL_SMALL_TLSDESC:
      {
	machine_mode mode = GET_MODE (dest);
	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
	rtx tp;

	gcc_assert (mode == Pmode || mode == ptr_mode);

	/* In ILP32, the got entry is always of SImode size.  Unlike
	   small GOT, the dest is fixed at reg 0.  */
	if (TARGET_ILP32)
	  emit_insn (gen_tlsdesc_small_si (imm));
	else
	  emit_insn (gen_tlsdesc_small_di (imm));
	tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_SMALL_TLSIE:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the got entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different tlsie_small
	   patterns here (two patterns for ILP32).  */
	machine_mode mode = GET_MODE (dest);
	rtx tmp_reg = gen_reg_rtx (mode);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
	    else
	      {
		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
		tp = gen_lowpart (mode, tp);
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
	  }

	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TLSLE12:
    case SYMBOL_TLSLE24:
    case SYMBOL_TLSLE32:
    case SYMBOL_TLSLE48:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode != Pmode)
	  tp = gen_lowpart (mode, tp);

	switch (type)
	  {
	  case SYMBOL_TLSLE12:
	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE24:
	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
		       (dest, tp, imm));
	    break;
	  case SYMBOL_TLSLE32:
	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  case SYMBOL_TLSLE48:
	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
		       (dest, imm));
	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
		       (dest, dest, tp));
	    break;
	  default:
	    gcc_unreachable ();
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    case SYMBOL_TINY_GOT:
      emit_insn (gen_ldr_got_tiny (dest, imm));
      return;

    case SYMBOL_TINY_TLSIE:
      {
	machine_mode mode = GET_MODE (dest);
	rtx tp = aarch64_load_tp (NULL);

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
	    else
	      {
		tp = gen_lowpart (mode, tp);
		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
	      }
	  }
	else
	  {
	    gcc_assert (mode == Pmode);
	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
	  }

	if (REG_P (dest))
	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
	return;
      }

    default:
      gcc_unreachable ();
    }
}
1898
1899 /* Emit a move from SRC to DEST. Assume that the move expanders can
1900 handle all moves if !can_create_pseudo_p (). The distinction is
1901 important because, unlike emit_move_insn, the move expanders know
1902 how to force Pmode objects into the constant pool even when the
1903 constant pool address is not itself legitimate. */
1904 static rtx
aarch64_emit_move(rtx dest,rtx src)1905 aarch64_emit_move (rtx dest, rtx src)
1906 {
1907 return (can_create_pseudo_p ()
1908 ? emit_move_insn (dest, src)
1909 : emit_move_insn_1 (dest, src));
1910 }
1911
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  rtx dst_lo, dst_hi;
  rtx src_lo, src_hi;

  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  These need dedicated insert/extract
	 patterns rather than plain subreg moves.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  src_lo = gen_lowpart (word_mode, src);
	  src_hi = gen_highpart (word_mode, src);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
	    }
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  dst_lo = gen_lowpart (word_mode, dst);
	  dst_hi = gen_highpart (word_mode, dst);

	  if (mode == TImode)
	    {
	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
	    }
	  else
	    {
	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
	    }
	  return;
	}
    }

  dst_lo = gen_lowpart (word_mode, dst);
  dst_hi = gen_highpart (word_mode, dst);
  src_lo = gen_lowpart (word_mode, src);
  /* gen_highpart_mode also handles VOIDmode (constant) sources.  */
  src_hi = gen_highpart_mode (word_mode, mode, src);

  /* At most one pairing may overlap; emit the overlapping half's
     source first so it is read before it is clobbered.  */
  if (reg_overlap_mentioned_p (dst_lo, src_hi))
    {
      aarch64_emit_move (dst_hi, src_hi);
      aarch64_emit_move (dst_lo, src_lo);
    }
  else
    {
      aarch64_emit_move (dst_lo, src_lo);
      aarch64_emit_move (dst_hi, src_hi);
    }
}
1989
1990 bool
aarch64_split_128bit_move_p(rtx dst,rtx src)1991 aarch64_split_128bit_move_p (rtx dst, rtx src)
1992 {
1993 return (! REG_P (src)
1994 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1995 }
1996
1997 /* Split a complex SIMD combine. */
1998
1999 void
aarch64_split_simd_combine(rtx dst,rtx src1,rtx src2)2000 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2001 {
2002 machine_mode src_mode = GET_MODE (src1);
2003 machine_mode dst_mode = GET_MODE (dst);
2004
2005 gcc_assert (VECTOR_MODE_P (dst_mode));
2006 gcc_assert (register_operand (dst, dst_mode)
2007 && register_operand (src1, src_mode)
2008 && register_operand (src2, src_mode));
2009
2010 rtx (*gen) (rtx, rtx, rtx);
2011
2012 switch (src_mode)
2013 {
2014 case E_V8QImode:
2015 gen = gen_aarch64_simd_combinev8qi;
2016 break;
2017 case E_V4HImode:
2018 gen = gen_aarch64_simd_combinev4hi;
2019 break;
2020 case E_V2SImode:
2021 gen = gen_aarch64_simd_combinev2si;
2022 break;
2023 case E_V4HFmode:
2024 gen = gen_aarch64_simd_combinev4hf;
2025 break;
2026 case E_V2SFmode:
2027 gen = gen_aarch64_simd_combinev2sf;
2028 break;
2029 case E_DImode:
2030 gen = gen_aarch64_simd_combinedi;
2031 break;
2032 case E_DFmode:
2033 gen = gen_aarch64_simd_combinedf;
2034 break;
2035 default:
2036 gcc_unreachable ();
2037 }
2038
2039 emit_insn (gen (dst, src1, src2));
2040 return;
2041 }
2042
2043 /* Split a complex SIMD move. */
2044
2045 void
aarch64_split_simd_move(rtx dst,rtx src)2046 aarch64_split_simd_move (rtx dst, rtx src)
2047 {
2048 machine_mode src_mode = GET_MODE (src);
2049 machine_mode dst_mode = GET_MODE (dst);
2050
2051 gcc_assert (VECTOR_MODE_P (dst_mode));
2052
2053 if (REG_P (dst) && REG_P (src))
2054 {
2055 rtx (*gen) (rtx, rtx);
2056
2057 gcc_assert (VECTOR_MODE_P (src_mode));
2058
2059 switch (src_mode)
2060 {
2061 case E_V16QImode:
2062 gen = gen_aarch64_split_simd_movv16qi;
2063 break;
2064 case E_V8HImode:
2065 gen = gen_aarch64_split_simd_movv8hi;
2066 break;
2067 case E_V4SImode:
2068 gen = gen_aarch64_split_simd_movv4si;
2069 break;
2070 case E_V2DImode:
2071 gen = gen_aarch64_split_simd_movv2di;
2072 break;
2073 case E_V8HFmode:
2074 gen = gen_aarch64_split_simd_movv8hf;
2075 break;
2076 case E_V4SFmode:
2077 gen = gen_aarch64_split_simd_movv4sf;
2078 break;
2079 case E_V2DFmode:
2080 gen = gen_aarch64_split_simd_movv2df;
2081 break;
2082 default:
2083 gcc_unreachable ();
2084 }
2085
2086 emit_insn (gen (dst, src));
2087 return;
2088 }
2089 }
2090
2091 bool
aarch64_zero_extend_const_eq(machine_mode xmode,rtx x,machine_mode ymode,rtx y)2092 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2093 machine_mode ymode, rtx y)
2094 {
2095 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2096 gcc_assert (r != NULL);
2097 return rtx_equal_p (x, r);
2098 }
2099
2100
2101 static rtx
aarch64_force_temporary(machine_mode mode,rtx x,rtx value)2102 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2103 {
2104 if (can_create_pseudo_p ())
2105 return force_reg (mode, value);
2106 else
2107 {
2108 gcc_assert (x);
2109 aarch64_emit_move (x, value);
2110 return x;
2111 }
2112 }
2113
2114 /* Return true if we can move VALUE into a register using a single
2115 CNT[BHWD] instruction. */
2116
2117 static bool
aarch64_sve_cnt_immediate_p(poly_int64 value)2118 aarch64_sve_cnt_immediate_p (poly_int64 value)
2119 {
2120 HOST_WIDE_INT factor = value.coeffs[0];
2121 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2122 return (value.coeffs[1] == factor
2123 && IN_RANGE (factor, 2, 16 * 16)
2124 && (factor & 1) == 0
2125 && factor <= 16 * (factor & -factor));
2126 }
2127
2128 /* Likewise for rtx X. */
2129
2130 bool
aarch64_sve_cnt_immediate_p(rtx x)2131 aarch64_sve_cnt_immediate_p (rtx x)
2132 {
2133 poly_int64 value;
2134 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2135 }
2136
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  FACTOR is the number of quadwords.
   NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
   If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  /* Static buffer sized for the longest string this function can print;
     the caller must copy or consume the result before the next call.  */
  static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  /* SHIFT encodes the element size: 1 -> D, 2 -> W, 3 -> H, 4 -> B
     (see the "dwhb" lookup below).  */
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  /* Convert the quadword count into the multiplier for this element size.  */
  factor >>= shift;
  unsigned int written;
  if (factor == 1)
    /* A multiplier of 1 is implicit and is not printed.  */
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
			prefix, suffix, operands, factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}
2172
2173 /* Return the asm string for an instruction with a CNT-like vector size
2174 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2175 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2176 first part of the operands template (the part that comes before the
2177 vector size itself). X is the value of the vector size operand,
2178 as a polynomial integer rtx. */
2179
2180 char *
aarch64_output_sve_cnt_immediate(const char * prefix,const char * operands,rtx x)2181 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2182 rtx x)
2183 {
2184 poly_int64 value = rtx_to_poly_int64 (x);
2185 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2186 return aarch64_output_sve_cnt_immediate (prefix, operands,
2187 value.coeffs[1], 0);
2188 }
2189
2190 /* Return true if we can add VALUE to a register using a single ADDVL
2191 or ADDPL instruction. */
2192
2193 static bool
aarch64_sve_addvl_addpl_immediate_p(poly_int64 value)2194 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2195 {
2196 HOST_WIDE_INT factor = value.coeffs[0];
2197 if (factor == 0 || value.coeffs[1] != factor)
2198 return false;
2199 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2200 and a value of 16 is one vector width. */
2201 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2202 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2203 }
2204
2205 /* Likewise for rtx X. */
2206
2207 bool
aarch64_sve_addvl_addpl_immediate_p(rtx x)2208 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2209 {
2210 poly_int64 value;
2211 return (poly_int_rtx_p (x, &value)
2212 && aarch64_sve_addvl_addpl_immediate_p (value));
2213 }
2214
2215 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2216 and storing the result in operand 0. */
2217
2218 char *
aarch64_output_sve_addvl_addpl(rtx dest,rtx base,rtx offset)2219 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2220 {
2221 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2222 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2223 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2224
2225 /* Use INC or DEC if possible. */
2226 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2227 {
2228 if (aarch64_sve_cnt_immediate_p (offset_value))
2229 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2230 offset_value.coeffs[1], 0);
2231 if (aarch64_sve_cnt_immediate_p (-offset_value))
2232 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2233 -offset_value.coeffs[1], 0);
2234 }
2235
2236 int factor = offset_value.coeffs[1];
2237 if ((factor & 15) == 0)
2238 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2239 else
2240 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2241 return buffer;
2242 }
2243
2244 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2245 instruction. If it is, store the number of elements in each vector
2246 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2247 factor in *FACTOR_OUT (if nonnull). */
2248
2249 bool
aarch64_sve_inc_dec_immediate_p(rtx x,int * factor_out,unsigned int * nelts_per_vq_out)2250 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2251 unsigned int *nelts_per_vq_out)
2252 {
2253 rtx elt;
2254 poly_int64 value;
2255
2256 if (!const_vec_duplicate_p (x, &elt)
2257 || !poly_int_rtx_p (elt, &value))
2258 return false;
2259
2260 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2261 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2262 /* There's no vector INCB. */
2263 return false;
2264
2265 HOST_WIDE_INT factor = value.coeffs[0];
2266 if (value.coeffs[1] != factor)
2267 return false;
2268
2269 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2270 if ((factor % nelts_per_vq) != 0
2271 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2272 return false;
2273
2274 if (factor_out)
2275 *factor_out = factor;
2276 if (nelts_per_vq_out)
2277 *nelts_per_vq_out = nelts_per_vq;
2278 return true;
2279 }
2280
2281 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2282 instruction. */
2283
2284 bool
aarch64_sve_inc_dec_immediate_p(rtx x)2285 aarch64_sve_inc_dec_immediate_p (rtx x)
2286 {
2287 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2288 }
2289
2290 /* Return the asm template for an SVE vector INC or DEC instruction.
2291 OPERANDS gives the operands before the vector count and X is the
2292 value of the vector count operand itself. */
2293
2294 char *
aarch64_output_sve_inc_dec_immediate(const char * operands,rtx x)2295 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2296 {
2297 int factor;
2298 unsigned int nelts_per_vq;
2299 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2300 gcc_unreachable ();
2301 if (factor < 0)
2302 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2303 nelts_per_vq);
2304 else
2305 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2306 nelts_per_vq);
2307 }
2308
/* Return the number of instructions needed to move immediate IMM into
   a register of integer mode MODE.  If GENERATE is true, also emit the
   instructions (MOV/MOVZ/MOVN, bitmask-immediate moves and MOVK-style
   insertions) into DEST.  */

static int
aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
				scalar_int_mode mode)
{
  int i;
  unsigned HOST_WIDE_INT val, val2, mask;
  int one_match, zero_match;
  int num_insns;

  val = INTVAL (imm);

  /* Values that a single MOV/MOVZ/MOVN/bitmask immediate can handle.  */
  if (aarch64_move_imm (val, mode))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, imm));
      return 1;
    }

  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
     (with XXXX non-zero). In that case check to see if the move can be done in
     a smaller mode.  */
  val2 = val & 0xffffffff;
  if (mode == DImode
      && aarch64_move_imm (val2, SImode)
      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
    {
      if (generate)
	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));

      /* Check if we have to emit a second instruction by checking to see
	 if any of the upper 32 bits of the original DI mode value is set.  */
      if (val == val2)
	return 1;

      /* Insert the one non-zero 16-bit chunk of the upper half.  */
      i = (val >> 48) ? 48 : 32;

      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));

      return 2;
    }

  /* 32-bit values (and DImode values with a zero upper half) need at
     most a MOV of the low 16 bits plus one insertion.  */
  if ((val >> 32) == 0 || mode == SImode)
    {
      if (generate)
	{
	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
	  if (mode == SImode)
	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	  else
	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
				       GEN_INT ((val >> 16) & 0xffff)));
	}
      return 2;
    }

  /* Remaining cases are all for DImode.  */

  /* Count how many 16-bit chunks are all-zeros and all-ones.  */
  mask = 0xffff;
  zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
    ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
  one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
    ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);

  if (zero_match != 2 && one_match != 2)
    {
      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
	 For a 64-bit bitmask try whether changing 16 bits to all ones or
	 zeroes creates a valid bitmask.  To check any repeated bitmask,
	 try using 16 bits from the other 32-bit half of val.  */

      for (i = 0; i < 64; i += 16, mask <<= 16)
	{
	  val2 = val & ~mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val | mask;
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	  val2 = val2 & ~mask;
	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
	    break;
	}
      if (i != 64)
	{
	  /* Found a bitmask immediate; fix up chunk I with a MOVK.  */
	  if (generate)
	    {
	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
					 GEN_INT ((val >> i) & 0xffff)));
	    }
	  return 2;
	}
    }

  /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
     are emitted by the initial mov.  If one_match > zero_match, skip set bits,
     otherwise skip zero bits.  */

  num_insns = 1;
  mask = 0xffff;
  val2 = one_match > zero_match ? ~val : val;
  /* I is the lowest 16-bit chunk that the initial MOVZ/MOVN must set.  */
  i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;

  if (generate)
    emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
					   ? (val | ~(mask << i))
					   : (val & (mask << i)))));
  /* Patch the remaining chunks that differ from the background pattern.  */
  for (i += 16; i < 64; i += 16)
    {
      if ((val2 & (mask << i)) == 0)
	continue;
      if (generate)
	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
				   GEN_INT ((val >> i) & 0xffff)));
      num_insns ++;
    }

  return num_insns;
}
2432
2433 /* Return whether imm is a 128-bit immediate which is simple enough to
2434 expand inline. */
2435 bool
aarch64_mov128_immediate(rtx imm)2436 aarch64_mov128_immediate (rtx imm)
2437 {
2438 if (GET_CODE (imm) == CONST_INT)
2439 return true;
2440
2441 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2442
2443 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2444 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2445
2446 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2447 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2448 }
2449
2450
2451 /* Return the number of temporary registers that aarch64_add_offset_1
2452 would need to add OFFSET to a register. */
2453
2454 static unsigned int
aarch64_add_offset_1_temporaries(HOST_WIDE_INT offset)2455 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2456 {
2457 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2458 }
2459
/* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
   a non-polynomial OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
   the immediate again.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */

static void
aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
		      rtx src, HOST_WIDE_INT offset, rtx temp1,
		      bool frame_related_p, bool emit_move_imm)
{
  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));

  HOST_WIDE_INT moffset = abs_hwi (offset);
  rtx_insn *insn;

  /* Zero offset: at most a register copy is needed.  */
  if (!moffset)
    {
      if (!rtx_equal_p (dest, src))
	{
	  insn = emit_insn (gen_rtx_SET (dest, src));
	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
	}
      return;
    }

  /* Single instruction adjustment.  */
  if (aarch64_uimm12_shift (moffset))
    {
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
     and either:

     a) the offset cannot be loaded by a 16-bit move or
     b) there is no spare register into which we can move it.  */
  if (moffset < 0x1000000
      && ((!temp1 && !can_create_pseudo_p ())
	  || !aarch64_move_imm (moffset, mode)))
    {
      HOST_WIDE_INT low_off = moffset & 0xfff;

      /* Both steps move in the same direction as OFFSET, so SP never
	 transiently overshoots the final value.  */
      low_off = offset < 0 ? -low_off : low_off;
      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Emit a move immediate if required and an addition/subtraction.  */
  if (emit_move_imm)
    {
      gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
      temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
    }
  insn = emit_insn (offset < 0
		    ? gen_sub3_insn (dest, src, temp1)
		    : gen_add3_insn (dest, src, temp1)));
  if (frame_related_p)
    {
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      /* Describe the net effect for the unwinder, since the temporary
	 move-immediate is not itself frame-related.  */
      rtx adj = plus_constant (mode, src, offset);
      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
    }
}
2540
/* Return the number of temporary registers that aarch64_add_offset
   would need to move OFFSET into a register or add OFFSET to a register;
   ADD_P is true if we want the latter rather than the former.  */

static unsigned int
aarch64_offset_temporaries (bool add_p, poly_int64 offset)
{
  /* This follows the same structure as aarch64_add_offset.  */
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
    return 0;

  unsigned int count = 0;
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;
  /* POLY_OFFSET is the VG-based part of OFFSET, with the invariant
     part split off into CONSTANT.  */
  poly_int64 poly_offset (factor, factor);
  if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    /* Need one register for the ADDVL/ADDPL result.  */
    count += 1;
  else if (factor != 0)
    {
      factor = abs (factor);
      if (factor > 16 * (factor & -factor))
	/* Need one register for the CNT result and one for the multiplication
	   factor.  If necessary, the second temporary can be reused for the
	   constant part of the offset.  */
	return 2;
      /* Need one register for the CNT result (which might then
	 be shifted).  */
      count += 1;
    }
  return count + aarch64_add_offset_1_temporaries (constant);
}
2573
2574 /* If X can be represented as a poly_int64, return the number
2575 of temporaries that are required to add it to a register.
2576 Return -1 otherwise. */
2577
2578 int
aarch64_add_offset_temporaries(rtx x)2579 aarch64_add_offset_temporaries (rtx x)
2580 {
2581 poly_int64 offset;
2582 if (!poly_int_rtx_p (x, &offset))
2583 return -1;
2584 return aarch64_offset_temporaries (true, offset);
2585 }
2586
/* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
   be set and CFA adjustments added to the generated instructions.

   TEMP1, if nonnull, is a register of mode MODE that can be used as a
   temporary if register allocation is already complete.  This temporary
   register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
   If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
   false to avoid emitting the immediate again.

   TEMP2, if nonnull, is a second temporary register that doesn't
   overlap either DEST or REG.

   Since this function may be used to adjust the stack pointer, we must
   ensure that it cannot cause transient stack deallocation (for example
   by first incrementing SP and then decrementing when adjusting by a
   large immediate).  */

static void
aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
		    poly_int64 offset, rtx temp1, rtx temp2,
		    bool frame_related_p, bool emit_move_imm = true)
{
  gcc_assert (emit_move_imm || temp1 != NULL_RTX)
  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
  gcc_assert (temp1 == NULL_RTX
	      || !frame_related_p
	      || !reg_overlap_mentioned_p (temp1, dest));
  gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));

  /* Try using ADDVL or ADDPL to add the whole value.  */
  if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
    {
      rtx offset_rtx = gen_int_mode (offset, mode);
      rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
      RTX_FRAME_RELATED_P (insn) = frame_related_p;
      return;
    }

  /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
     SVE vector register, over and above the minimum size of 128 bits.
     This is equivalent to half the value returned by CNTD with a
     vector shape of ALL.  */
  HOST_WIDE_INT factor = offset.coeffs[1];
  HOST_WIDE_INT constant = offset.coeffs[0] - factor;

  /* Try using ADDVL or ADDPL to add the VG-based part.  */
  poly_int64 poly_offset (factor, factor);
  if (src != const0_rtx
      && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
    {
      rtx offset_rtx = gen_int_mode (poly_offset, mode);
      if (frame_related_p)
	{
	  /* Accumulate directly into DEST so that each step of a
	     frame-related adjustment is visible to the unwinder.  */
	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
	  RTX_FRAME_RELATED_P (insn) = true;
	  src = dest;
	}
      else
	{
	  /* Compute the intermediate sum into TEMP1 and consume it,
	     promoting TEMP2 to be the remaining free temporary.  */
	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
	  src = aarch64_force_temporary (mode, temp1, addr);
	  temp1 = temp2;
	  temp2 = NULL_RTX;
	}
    }
  /* Otherwise use a CNT-based sequence.  */
  else if (factor != 0)
    {
      /* Use a subtraction if we have a negative factor.  */
      rtx_code code = PLUS;
      if (factor < 0)
	{
	  factor = -factor;
	  code = MINUS;
	}

      /* Calculate CNTD * FACTOR / 2.  First try to fold the division
	 into the multiplication.  */
      rtx val;
      int shift = 0;
      if (factor & 1)
	/* Use a right shift by 1.  */
	shift = -1;
      else
	factor /= 2;
      HOST_WIDE_INT low_bit = factor & -factor;
      if (factor <= 16 * low_bit)
	{
	  if (factor > 16 * 8)
	    {
	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
		 the value with the minimum multiplier and shift it into
		 position.  */
	      int extra_shift = exact_log2 (low_bit);
	      shift += extra_shift;
	      factor >>= extra_shift;
	    }
	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
	}
      else
	{
	  /* Use CNTD, then multiply it by FACTOR.  */
	  val = gen_int_mode (poly_int64 (2, 2), mode);
	  val = aarch64_force_temporary (mode, temp1, val);

	  /* Go back to using a negative multiplication factor if we have
	     no register from which to subtract.  */
	  if (code == MINUS && src == const0_rtx)
	    {
	      factor = -factor;
	      code = PLUS;
	    }
	  rtx coeff1 = gen_int_mode (factor, mode);
	  coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
	  val = gen_rtx_MULT (mode, val, coeff1);
	}

      if (shift > 0)
	{
	  /* Multiply by 1 << SHIFT.  */
	  val = aarch64_force_temporary (mode, temp1, val);
	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
	}
      else if (shift == -1)
	{
	  /* Divide by 2.  */
	  val = aarch64_force_temporary (mode, temp1, val);
	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
	}

      /* Calculate SRC +/- CNTD * FACTOR / 2.  */
      if (src != const0_rtx)
	{
	  val = aarch64_force_temporary (mode, temp1, val);
	  val = gen_rtx_fmt_ee (code, mode, src, val);
	}
      else if (code == MINUS)
	{
	  val = aarch64_force_temporary (mode, temp1, val);
	  val = gen_rtx_NEG (mode, val);
	}

      if (constant == 0 || frame_related_p)
	{
	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
	  if (frame_related_p)
	    {
	      RTX_FRAME_RELATED_P (insn) = true;
	      /* Record the net CFA effect, since the CNT arithmetic above
		 is opaque to the unwinder.  */
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (dest, plus_constant (Pmode, src,
							      poly_offset)));
	    }
	  src = dest;
	  if (constant == 0)
	    return;
	}
      else
	{
	  src = aarch64_force_temporary (mode, temp1, val);
	  temp1 = temp2;
	  temp2 = NULL_RTX;
	}

      /* TEMP1 no longer holds abs (OFFSET), so the constant part must
	 be materialized from scratch.  */
      emit_move_imm = true;
    }

  aarch64_add_offset_1 (mode, dest, src, constant, temp1,
			frame_related_p, emit_move_imm);
}
2757
2758 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2759 than a poly_int64. */
2760
2761 void
aarch64_split_add_offset(scalar_int_mode mode,rtx dest,rtx src,rtx offset_rtx,rtx temp1,rtx temp2)2762 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2763 rtx offset_rtx, rtx temp1, rtx temp2)
2764 {
2765 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2766 temp1, temp2, false);
2767 }
2768
2769 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2770 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2771 if TEMP1 already contains abs (DELTA). */
2772
2773 static inline void
aarch64_add_sp(rtx temp1,rtx temp2,poly_int64 delta,bool emit_move_imm)2774 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2775 {
2776 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2777 temp1, temp2, true, emit_move_imm);
2778 }
2779
2780 /* Subtract DELTA from the stack pointer, marking the instructions
2781 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2782 if nonnull. */
2783
2784 static inline void
aarch64_sub_sp(rtx temp1,rtx temp2,poly_int64 delta,bool frame_related_p)2785 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2786 {
2787 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2788 temp1, temp2, frame_related_p);
2789 }
2790
2791 /* Set DEST to (vec_series BASE STEP). */
2792
2793 static void
aarch64_expand_vec_series(rtx dest,rtx base,rtx step)2794 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2795 {
2796 machine_mode mode = GET_MODE (dest);
2797 scalar_mode inner = GET_MODE_INNER (mode);
2798
2799 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2800 if (!aarch64_sve_index_immediate_p (base))
2801 base = force_reg (inner, base);
2802 if (!aarch64_sve_index_immediate_p (step))
2803 step = force_reg (inner, step);
2804
2805 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2806 }
2807
/* Try to duplicate SRC into SVE register DEST, given that SRC is an
   integer of mode INT_MODE.  Return true on success.  */

static bool
aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
				      rtx src)
{
  /* If the constant is smaller than 128 bits, we can do the move
     using a vector of SRC_MODEs.  */
  if (src_mode != TImode)
    {
      poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
				     GET_MODE_SIZE (src_mode));
      machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
      emit_move_insn (gen_lowpart (dup_mode, dest),
		      gen_const_vec_duplicate (dup_mode, src));
      return true;
    }

  /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
  src = force_const_mem (src_mode, src);
  if (!src)
    /* The constant could not be placed in the constant pool; let the
       caller fall back to another strategy.  */
    return false;

  /* Make sure that the address is legitimate.  */
  if (!aarch64_sve_ld1r_operand_p (src))
    {
      rtx addr = force_reg (Pmode, XEXP (src, 0));
      src = replace_equiv_address (src, addr);
    }

  /* Emit the load-and-replicate under an all-true predicate.  */
  machine_mode mode = GET_MODE (dest);
  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
  emit_insn (gen_rtx_SET (dest, src));
  return true;
}
2847
/* Expand a move of general CONST_VECTOR SRC into DEST, given that it
   isn't a simple duplicate or series.  */

static void
aarch64_expand_sve_const_vector (rtx dest, rtx src)
{
  machine_mode mode = GET_MODE (src);
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
  /* Simple duplicates (npatterns == 1) are handled elsewhere.  */
  gcc_assert (npatterns > 1);

  if (nelts_per_pattern == 1)
    {
      /* The constant is a repeating sequence of at least two elements,
	 where the repeating elements occupy no more than 128 bits.
	 Get an integer representation of the replicated value.  */
      scalar_int_mode int_mode;
      if (BYTES_BIG_ENDIAN)
	/* For now, always use LD1RQ to load the value on big-endian
	   targets, since the handling of smaller integers includes a
	   subreg that is semantically an element reverse.  */
	int_mode = TImode;
      else
	{
	  unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
	  gcc_assert (int_bits <= 128);
	  int_mode = int_mode_for_size (int_bits, 0).require ();
	}
      rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
      if (int_value
	  && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
	return;
    }

  /* Expand each pattern individually.  */
  rtx_vector_builder builder;
  auto_vec<rtx, 16> vectors (npatterns);
  for (unsigned int i = 0; i < npatterns; ++i)
    {
      /* Pattern I consists of elements I, I + NPATTERNS, ... of SRC.  */
      builder.new_vector (mode, 1, nelts_per_pattern);
      for (unsigned int j = 0; j < nelts_per_pattern; ++j)
	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
      vectors.quick_push (force_reg (mode, builder.build ()));
    }

  /* Use permutes to interleave the separate vectors.  */
  while (npatterns > 1)
    {
      /* Each ZIP1 halves the number of vectors still to be merged;
	 the final merge writes directly to DEST.  */
      npatterns /= 2;
      for (unsigned int i = 0; i < npatterns; ++i)
	{
	  rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
	  vectors[i] = tmp;
	}
    }
  gcc_assert (vectors[0] == dest);
}
2907
/* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
   is a pattern that can be used to set DEST to a replicated scalar
   element.

   Handles three broad cases:
   - symbolic constants (SYMBOL_REF/LABEL_REF/CONST/CONST_POLY_INT),
     which are classified and loaded via the appropriate relocation
     or literal-pool sequence;
   - non-CONST_INT vector constants, expanded via series/duplicate/
     literal-pool moves;
   - plain integer constants, handed off to
     aarch64_internal_mov_immediate.  */

void
aarch64_expand_mov_immediate (rtx dest, rtx imm,
			      rtx (*gen_vec_duplicate) (rtx, rtx))
{
  machine_mode mode = GET_MODE (dest);

  /* Check on what type of symbol it is.  */
  scalar_int_mode int_mode;
  if ((GET_CODE (imm) == SYMBOL_REF
       || GET_CODE (imm) == LABEL_REF
       || GET_CODE (imm) == CONST
       || GET_CODE (imm) == CONST_POLY_INT)
      && is_a <scalar_int_mode> (mode, &int_mode))
    {
      rtx mem;
      poly_int64 offset;
      HOST_WIDE_INT const_offset;
      enum aarch64_symbol_type sty;

      /* If we have (const (plus symbol offset)), separate out the offset
	 before we start classifying the symbol.  */
      rtx base = strip_offset (imm, &offset);

      /* We must always add an offset involving VL separately, rather than
	 folding it into the relocation.  */
      if (!offset.is_constant (&const_offset))
	{
	  /* A pure VL-scaled count with no symbolic base can be moved
	     directly (matches an SVE CNT-style immediate).  */
	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
	    emit_insn (gen_rtx_SET (dest, imm));
	  else
	    {
	      /* Do arithmetic on 32-bit values if the result is smaller
		 than that.  */
	      if (partial_subreg_p (int_mode, SImode))
		{
		  /* It is invalid to do symbol calculations in modes
		     narrower than SImode.  */
		  gcc_assert (base == const0_rtx);
		  dest = gen_lowpart (SImode, dest);
		  int_mode = SImode;
		}
	      if (base != const0_rtx)
		{
		  /* Materialize the symbolic base first, then add the
		     VL-dependent offset on top of it.  */
		  base = aarch64_force_temporary (int_mode, dest, base);
		  aarch64_add_offset (int_mode, dest, base, offset,
				      NULL_RTX, NULL_RTX, false);
		}
	      else
		aarch64_add_offset (int_mode, dest, base, offset,
				    dest, NULL_RTX, false);
	    }
	  return;
	}

      sty = aarch64_classify_symbol (base, const_offset);
      switch (sty)
	{
	case SYMBOL_FORCE_TO_MEM:
	  /* If the offset prevents the constant from living in the
	     literal pool, add it separately from the base symbol.  */
	  if (const_offset != 0
	      && targetm.cannot_force_const_mem (int_mode, imm))
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = aarch64_force_temporary (int_mode, dest, base);
	      aarch64_add_offset (int_mode, dest, base, const_offset,
				  NULL_RTX, NULL_RTX, false);
	      return;
	    }

	  mem = force_const_mem (ptr_mode, imm);
	  gcc_assert (mem);

	  /* If we aren't generating PC relative literals, then
	     we need to expand the literal pool access carefully.
	     This is something that needs to be done in a number
	     of places, so could well live as a separate function.  */
	  if (!aarch64_pcrelative_literal_loads)
	    {
	      gcc_assert (can_create_pseudo_p ());
	      base = gen_reg_rtx (ptr_mode);
	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
	      if (ptr_mode != Pmode)
		base = convert_memory_address (Pmode, base);
	      mem = gen_rtx_MEM (ptr_mode, base);
	    }

	  /* Literal-pool entries are ptr_mode wide; widen the load if
	     the destination mode is wider.  */
	  if (int_mode != ptr_mode)
	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);

	  emit_insn (gen_rtx_SET (dest, mem));

	  return;

	case SYMBOL_SMALL_TLSGD:
	case SYMBOL_SMALL_TLSDESC:
	case SYMBOL_SMALL_TLSIE:
	case SYMBOL_SMALL_GOT_28K:
	case SYMBOL_SMALL_GOT_4G:
	case SYMBOL_TINY_GOT:
	case SYMBOL_TINY_TLSIE:
	  /* These relocations cannot carry an addend; apply any offset
	     as a separate addition.  */
	  if (const_offset != 0)
	    {
	      gcc_assert(can_create_pseudo_p ());
	      base = aarch64_force_temporary (int_mode, dest, base);
	      aarch64_add_offset (int_mode, dest, base, const_offset,
				  NULL_RTX, NULL_RTX, false);
	      return;
	    }
	  /* FALLTHRU */

	case SYMBOL_SMALL_ABSOLUTE:
	case SYMBOL_TINY_ABSOLUTE:
	case SYMBOL_TLSLE12:
	case SYMBOL_TLSLE24:
	case SYMBOL_TLSLE32:
	case SYMBOL_TLSLE48:
	  aarch64_load_symref_appropriately (dest, imm, sty);
	  return;

	default:
	  gcc_unreachable ();
	}
    }

  if (!CONST_INT_P (imm))
    {
      rtx base, step, value;
      if (GET_CODE (imm) == HIGH
	  || aarch64_simd_valid_immediate (imm, NULL))
	/* Directly representable as an instruction immediate.  */
	emit_insn (gen_rtx_SET (dest, imm));
      else if (const_vec_series_p (imm, &base, &step))
	aarch64_expand_vec_series (dest, base, step);
      else if (const_vec_duplicate_p (imm, &value))
	{
	  /* If the constant is out of range of an SVE vector move,
	     load it from memory if we can, otherwise move it into
	     a register and use a DUP.  */
	  scalar_mode inner_mode = GET_MODE_INNER (mode);
	  rtx op = force_const_mem (inner_mode, value);
	  if (!op)
	    op = force_reg (inner_mode, value);
	  else if (!aarch64_sve_ld1r_operand_p (op))
	    {
	      /* The literal-pool address is not valid for LD1R;
		 reload it through a register.  */
	      rtx addr = force_reg (Pmode, XEXP (op, 0));
	      op = replace_equiv_address (op, addr);
	    }
	  emit_insn (gen_vec_duplicate (dest, op));
	}
      else if (GET_CODE (imm) == CONST_VECTOR
	       && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
	/* Variable-length (SVE) vector constant.  */
	aarch64_expand_sve_const_vector (dest, imm);
      else
	{
	  rtx mem = force_const_mem (mode, imm);
	  gcc_assert (mem);
	  emit_move_insn (dest, mem);
	}

      return;
    }

  aarch64_internal_mov_immediate (dest, imm, true,
				  as_a <scalar_int_mode> (mode));
}
3075
3076 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3077 that is known to contain PTRUE. */
3078
3079 void
aarch64_emit_sve_pred_move(rtx dest,rtx pred,rtx src)3080 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3081 {
3082 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3083 gen_rtvec (2, pred, src),
3084 UNSPEC_MERGE_PTRUE)));
3085 }
3086
3087 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3088 operand is in memory. In this case we need to use the predicated LD1
3089 and ST1 instead of LDR and STR, both for correctness on big-endian
3090 targets and because LD1 and ST1 support a wider range of addressing modes.
3091 PRED_MODE is the mode of the predicate.
3092
3093 See the comment at the head of aarch64-sve.md for details about the
3094 big-endian handling. */
3095
3096 void
aarch64_expand_sve_mem_move(rtx dest,rtx src,machine_mode pred_mode)3097 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3098 {
3099 machine_mode mode = GET_MODE (dest);
3100 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3101 if (!register_operand (src, mode)
3102 && !register_operand (dest, mode))
3103 {
3104 rtx tmp = gen_reg_rtx (mode);
3105 if (MEM_P (src))
3106 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3107 else
3108 emit_move_insn (tmp, src);
3109 src = tmp;
3110 }
3111 aarch64_emit_sve_pred_move (dest, ptrue, src);
3112 }
3113
3114 /* Called only on big-endian targets. See whether an SVE vector move
3115 from SRC to DEST is effectively a REV[BHW] instruction, because at
3116 least one operand is a subreg of an SVE vector that has wider or
3117 narrower elements. Return true and emit the instruction if so.
3118
3119 For example:
3120
3121 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3122
3123 represents a VIEW_CONVERT between the following vectors, viewed
3124 in memory order:
3125
3126 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3127 R1: { [0], [1], [2], [3], ... }
3128
3129 The high part of lane X in R2 should therefore correspond to lane X*2
3130 of R1, but the register representations are:
3131
3132 msb lsb
3133 R2: ...... [1].high [1].low [0].high [0].low
3134 R1: ...... [3] [2] [1] [0]
3135
3136 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3137 We therefore need a reverse operation to swap the high and low values
3138 around.
3139
3140 This is purely an optimization. Without it we would spill the
3141 subreg operand to the stack in one mode and reload it in the
3142 other mode, which has the same effect as the REV. */
3143
3144 bool
aarch64_maybe_expand_sve_subreg_move(rtx dest,rtx src)3145 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3146 {
3147 gcc_assert (BYTES_BIG_ENDIAN);
3148 if (GET_CODE (dest) == SUBREG)
3149 dest = SUBREG_REG (dest);
3150 if (GET_CODE (src) == SUBREG)
3151 src = SUBREG_REG (src);
3152
3153 /* The optimization handles two single SVE REGs with different element
3154 sizes. */
3155 if (!REG_P (dest)
3156 || !REG_P (src)
3157 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3158 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3159 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3160 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3161 return false;
3162
3163 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3164 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3165 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3166 UNSPEC_REV_SUBREG);
3167 emit_insn (gen_rtx_SET (dest, unspec));
3168 return true;
3169 }
3170
3171 /* Return a copy of X with mode MODE, without changing its other
3172 attributes. Unlike gen_lowpart, this doesn't care whether the
3173 mode change is valid. */
3174
3175 static rtx
aarch64_replace_reg_mode(rtx x,machine_mode mode)3176 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3177 {
3178 if (GET_MODE (x) == mode)
3179 return x;
3180
3181 x = shallow_copy_rtx (x);
3182 set_mode_and_regno (x, mode, REGNO (x));
3183 return x;
3184 }
3185
3186 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3187 operands. */
3188
3189 void
aarch64_split_sve_subreg_move(rtx dest,rtx ptrue,rtx src)3190 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3191 {
3192 /* Decide which REV operation we need. The mode with narrower elements
3193 determines the mode of the operands and the mode with the wider
3194 elements determines the reverse width. */
3195 machine_mode mode_with_wider_elts = GET_MODE (dest);
3196 machine_mode mode_with_narrower_elts = GET_MODE (src);
3197 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3198 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3199 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3200
3201 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3202 unsigned int unspec;
3203 if (wider_bytes == 8)
3204 unspec = UNSPEC_REV64;
3205 else if (wider_bytes == 4)
3206 unspec = UNSPEC_REV32;
3207 else if (wider_bytes == 2)
3208 unspec = UNSPEC_REV16;
3209 else
3210 gcc_unreachable ();
3211 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3212
3213 /* Emit:
3214
3215 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3216 UNSPEC_MERGE_PTRUE))
3217
3218 with the appropriate modes. */
3219 ptrue = gen_lowpart (pred_mode, ptrue);
3220 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3221 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3222 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3223 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3224 UNSPEC_MERGE_PTRUE);
3225 emit_insn (gen_rtx_SET (dest, src));
3226 }
3227
3228 static bool
aarch64_function_ok_for_sibcall(tree decl ATTRIBUTE_UNUSED,tree exp ATTRIBUTE_UNUSED)3229 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3230 tree exp ATTRIBUTE_UNUSED)
3231 {
3232 /* Currently, always true. */
3233 return true;
3234 }
3235
3236 /* Implement TARGET_PASS_BY_REFERENCE. */
3237
3238 static bool
aarch64_pass_by_reference(cumulative_args_t pcum ATTRIBUTE_UNUSED,machine_mode mode,const_tree type,bool named ATTRIBUTE_UNUSED)3239 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3240 machine_mode mode,
3241 const_tree type,
3242 bool named ATTRIBUTE_UNUSED)
3243 {
3244 HOST_WIDE_INT size;
3245 machine_mode dummymode;
3246 int nregs;
3247
3248 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3249 if (mode == BLKmode && type)
3250 size = int_size_in_bytes (type);
3251 else
3252 /* No frontends can create types with variable-sized modes, so we
3253 shouldn't be asked to pass or return them. */
3254 size = GET_MODE_SIZE (mode).to_constant ();
3255
3256 /* Aggregates are passed by reference based on their size. */
3257 if (type && AGGREGATE_TYPE_P (type))
3258 {
3259 size = int_size_in_bytes (type);
3260 }
3261
3262 /* Variable sized arguments are always returned by reference. */
3263 if (size < 0)
3264 return true;
3265
3266 /* Can this be a candidate to be passed in fp/simd register(s)? */
3267 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3268 &dummymode, &nregs,
3269 NULL))
3270 return false;
3271
3272 /* Arguments which are variable sized or larger than 2 registers are
3273 passed by reference unless they are a homogenous floating point
3274 aggregate. */
3275 return size > 2 * UNITS_PER_WORD;
3276 }
3277
3278 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3279 static bool
aarch64_return_in_msb(const_tree valtype)3280 aarch64_return_in_msb (const_tree valtype)
3281 {
3282 machine_mode dummy_mode;
3283 int dummy_int;
3284
3285 /* Never happens in little-endian mode. */
3286 if (!BYTES_BIG_ENDIAN)
3287 return false;
3288
3289 /* Only composite types smaller than or equal to 16 bytes can
3290 be potentially returned in registers. */
3291 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3292 || int_size_in_bytes (valtype) <= 0
3293 || int_size_in_bytes (valtype) > 16)
3294 return false;
3295
3296 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3297 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3298 is always passed/returned in the least significant bits of fp/simd
3299 register(s). */
3300 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3301 &dummy_mode, &dummy_int, NULL))
3302 return false;
3303
3304 return true;
3305 }
3306
3307 /* Implement TARGET_FUNCTION_VALUE.
3308 Define how to find the value returned by a function. */
3309
3310 static rtx
aarch64_function_value(const_tree type,const_tree func,bool outgoing ATTRIBUTE_UNUSED)3311 aarch64_function_value (const_tree type, const_tree func,
3312 bool outgoing ATTRIBUTE_UNUSED)
3313 {
3314 machine_mode mode;
3315 int unsignedp;
3316 int count;
3317 machine_mode ag_mode;
3318
3319 mode = TYPE_MODE (type);
3320 if (INTEGRAL_TYPE_P (type))
3321 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3322
3323 if (aarch64_return_in_msb (type))
3324 {
3325 HOST_WIDE_INT size = int_size_in_bytes (type);
3326
3327 if (size % UNITS_PER_WORD != 0)
3328 {
3329 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3330 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3331 }
3332 }
3333
3334 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3335 &ag_mode, &count, NULL))
3336 {
3337 if (!aarch64_composite_type_p (type, mode))
3338 {
3339 gcc_assert (count == 1 && mode == ag_mode);
3340 return gen_rtx_REG (mode, V0_REGNUM);
3341 }
3342 else
3343 {
3344 int i;
3345 rtx par;
3346
3347 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3348 for (i = 0; i < count; i++)
3349 {
3350 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3351 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3352 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3353 XVECEXP (par, 0, i) = tmp;
3354 }
3355 return par;
3356 }
3357 }
3358 else
3359 return gen_rtx_REG (mode, R0_REGNUM);
3360 }
3361
3362 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3363 Return true if REGNO is the number of a hard register in which the values
3364 of called function may come back. */
3365
3366 static bool
aarch64_function_value_regno_p(const unsigned int regno)3367 aarch64_function_value_regno_p (const unsigned int regno)
3368 {
3369 /* Maximum of 16 bytes can be returned in the general registers. Examples
3370 of 16-byte return values are: 128-bit integers and 16-byte small
3371 structures (excluding homogeneous floating-point aggregates). */
3372 if (regno == R0_REGNUM || regno == R1_REGNUM)
3373 return true;
3374
3375 /* Up to four fp/simd registers can return a function value, e.g. a
3376 homogeneous floating-point aggregate having four members. */
3377 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3378 return TARGET_FLOAT;
3379
3380 return false;
3381 }
3382
3383 /* Implement TARGET_RETURN_IN_MEMORY.
3384
3385 If the type T of the result of a function is such that
3386 void func (T arg)
3387 would require that arg be passed as a value in a register (or set of
3388 registers) according to the parameter passing rules, then the result
3389 is returned in the same registers as would be used for such an
3390 argument. */
3391
3392 static bool
aarch64_return_in_memory(const_tree type,const_tree fndecl ATTRIBUTE_UNUSED)3393 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3394 {
3395 HOST_WIDE_INT size;
3396 machine_mode ag_mode;
3397 int count;
3398
3399 if (!AGGREGATE_TYPE_P (type)
3400 && TREE_CODE (type) != COMPLEX_TYPE
3401 && TREE_CODE (type) != VECTOR_TYPE)
3402 /* Simple scalar types always returned in registers. */
3403 return false;
3404
3405 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3406 type,
3407 &ag_mode,
3408 &count,
3409 NULL))
3410 return false;
3411
3412 /* Types larger than 2 registers returned in memory. */
3413 size = int_size_in_bytes (type);
3414 return (size < 0 || size > 2 * UNITS_PER_WORD);
3415 }
3416
3417 static bool
aarch64_vfp_is_call_candidate(cumulative_args_t pcum_v,machine_mode mode,const_tree type,int * nregs)3418 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3419 const_tree type, int *nregs)
3420 {
3421 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3422 return aarch64_vfp_is_call_or_return_candidate (mode,
3423 type,
3424 &pcum->aapcs_vfp_rmode,
3425 nregs,
3426 NULL);
3427 }
3428
3429 /* Given MODE and TYPE of a function argument, return the alignment in
3430 bits. The idea is to suppress any stronger alignment requested by
3431 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3432 This is a helper function for local use only. */
3433
3434 static unsigned int
aarch64_function_arg_alignment(machine_mode mode,const_tree type)3435 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3436 {
3437 if (!type)
3438 return GET_MODE_ALIGNMENT (mode);
3439
3440 if (integer_zerop (TYPE_SIZE (type)))
3441 return 0;
3442
3443 gcc_assert (TYPE_MODE (type) == mode);
3444
3445 if (!AGGREGATE_TYPE_P (type))
3446 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3447
3448 if (TREE_CODE (type) == ARRAY_TYPE)
3449 return TYPE_ALIGN (TREE_TYPE (type));
3450
3451 unsigned int alignment = 0;
3452 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3453 if (TREE_CODE (field) == FIELD_DECL)
3454 alignment = std::max (alignment, DECL_ALIGN (field));
3455
3456 return alignment;
3457 }
3458
/* Layout a function argument according to the AAPCS64 rules.  The rule
   numbers refer to the rule numbers in the AAPCS64.

   On return, exactly one of PCUM->aapcs_reg (register placement) and
   PCUM->aapcs_stack_words (stack placement) describes where the
   argument goes; the "next" counters record the registers consumed.  */

static void
aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
		    const_tree type,
		    bool named ATTRIBUTE_UNUSED)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  int ncrn, nvrn, nregs;
  bool allocate_ncrn, allocate_nvrn;
  HOST_WIDE_INT size;

  /* We need to do this once per argument.  */
  if (pcum->aapcs_arg_processed)
    return;

  pcum->aapcs_arg_processed = true;

  /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
  if (type)
    size = int_size_in_bytes (type);
  else
    /* No frontends can create types with variable-sized modes, so we
       shouldn't be asked to pass or return them.  */
    size = GET_MODE_SIZE (mode).to_constant ();
  size = ROUND_UP (size, UNITS_PER_WORD);

  allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
  allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
						 mode,
						 type,
						 &nregs);

  /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
     The following code thus handles passing by SIMD/FP registers first.  */

  nvrn = pcum->aapcs_nvrn;

  /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
     and homogenous short-vector aggregates (HVA).  */
  if (allocate_nvrn)
    {
      if (!TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode, "argument");

      if (nvrn + nregs <= NUM_FP_ARG_REGS)
	{
	  pcum->aapcs_nextnvrn = nvrn + nregs;
	  if (!aarch64_composite_type_p (type, mode))
	    {
	      /* A single fp/simd value occupies one vector register.  */
	      gcc_assert (nregs == 1);
	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
	    }
	  else
	    {
	      /* An HFA/HVA spans NREGS consecutive vector registers;
		 describe each piece with a PARALLEL.  */
	      rtx par;
	      int i;
	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
	      for (i = 0; i < nregs; i++)
		{
		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
					 V0_REGNUM + nvrn + i);
		  rtx offset = gen_int_mode
		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
		  XVECEXP (par, 0, i) = tmp;
		}
	      pcum->aapcs_reg = par;
	    }
	  return;
	}
      else
	{
	  /* C.3 NSRN is set to 8.  */
	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
	  goto on_stack;
	}
    }

  ncrn = pcum->aapcs_ncrn;
  nregs = size / UNITS_PER_WORD;

  /* C6 - C9.  though the sign and zero extension semantics are
     handled elsewhere.  This is the case where the argument fits
     entirely general registers.  */
  if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
    {

      gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);

      /* C.8 if the argument has an alignment of 16 then the NGRN is
	 rounded up to the next even number.  */
      if (nregs == 2
	  && ncrn % 2
	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
	     comparison is there because for > 16 * BITS_PER_UNIT
	     alignment nregs should be > 2 and therefore it should be
	     passed by reference rather than value.  */
	  && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
	{
	  ++ncrn;
	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
	}

      /* NREGS can be 0 when e.g. an empty structure is to be passed.
	 A reg is still generated for it, but the caller should be smart
	 enough not to use it.  */
      if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
      else
	{
	  /* A two-register non-integer value; describe each word with
	     a PARALLEL entry at its byte offset.  */
	  rtx par;
	  int i;

	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
	  for (i = 0; i < nregs; i++)
	    {
	      rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
				       GEN_INT (i * UNITS_PER_WORD));
	      XVECEXP (par, 0, i) = tmp;
	    }
	  pcum->aapcs_reg = par;
	}

      pcum->aapcs_nextncrn = ncrn + nregs;
      return;
    }

  /* C.11 */
  pcum->aapcs_nextncrn = NUM_ARG_REGS;

  /* The argument is passed on stack; record the needed number of words for
     this argument and align the total size if necessary.  */
on_stack:
  pcum->aapcs_stack_words = size / UNITS_PER_WORD;

  if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
    pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
				       16 / UNITS_PER_WORD);
  return;
}
3602
3603 /* Implement TARGET_FUNCTION_ARG. */
3604
3605 static rtx
aarch64_function_arg(cumulative_args_t pcum_v,machine_mode mode,const_tree type,bool named)3606 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3607 const_tree type, bool named)
3608 {
3609 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3610 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3611
3612 if (mode == VOIDmode)
3613 return NULL_RTX;
3614
3615 aarch64_layout_arg (pcum_v, mode, type, named);
3616 return pcum->aapcs_reg;
3617 }
3618
3619 void
aarch64_init_cumulative_args(CUMULATIVE_ARGS * pcum,const_tree fntype ATTRIBUTE_UNUSED,rtx libname ATTRIBUTE_UNUSED,const_tree fndecl ATTRIBUTE_UNUSED,unsigned n_named ATTRIBUTE_UNUSED)3620 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3621 const_tree fntype ATTRIBUTE_UNUSED,
3622 rtx libname ATTRIBUTE_UNUSED,
3623 const_tree fndecl ATTRIBUTE_UNUSED,
3624 unsigned n_named ATTRIBUTE_UNUSED)
3625 {
3626 pcum->aapcs_ncrn = 0;
3627 pcum->aapcs_nvrn = 0;
3628 pcum->aapcs_nextncrn = 0;
3629 pcum->aapcs_nextnvrn = 0;
3630 pcum->pcs_variant = ARM_PCS_AAPCS64;
3631 pcum->aapcs_reg = NULL_RTX;
3632 pcum->aapcs_arg_processed = false;
3633 pcum->aapcs_stack_words = 0;
3634 pcum->aapcs_stack_size = 0;
3635
3636 if (!TARGET_FLOAT
3637 && fndecl && TREE_PUBLIC (fndecl)
3638 && fntype && fntype != error_mark_node)
3639 {
3640 const_tree type = TREE_TYPE (fntype);
3641 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3642 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3643 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3644 &mode, &nregs, NULL))
3645 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3646 }
3647 return;
3648 }
3649
3650 static void
aarch64_function_arg_advance(cumulative_args_t pcum_v,machine_mode mode,const_tree type,bool named)3651 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3652 machine_mode mode,
3653 const_tree type,
3654 bool named)
3655 {
3656 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3657 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3658 {
3659 aarch64_layout_arg (pcum_v, mode, type, named);
3660 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3661 != (pcum->aapcs_stack_words != 0));
3662 pcum->aapcs_arg_processed = false;
3663 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3664 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3665 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3666 pcum->aapcs_stack_words = 0;
3667 pcum->aapcs_reg = NULL_RTX;
3668 }
3669 }
3670
3671 bool
aarch64_function_arg_regno_p(unsigned regno)3672 aarch64_function_arg_regno_p (unsigned regno)
3673 {
3674 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3675 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3676 }
3677
3678 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3679 PARM_BOUNDARY bits of alignment, but will be given anything up
3680 to STACK_BOUNDARY bits if the type requires it. This makes sure
3681 that both before and after the layout of each argument, the Next
3682 Stacked Argument Address (NSAA) will have a minimum alignment of
3683 8 bytes. */
3684
3685 static unsigned int
aarch64_function_arg_boundary(machine_mode mode,const_tree type)3686 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3687 {
3688 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3689 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3690 }
3691
3692 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3693
3694 static fixed_size_mode
aarch64_get_reg_raw_mode(int regno)3695 aarch64_get_reg_raw_mode (int regno)
3696 {
3697 if (TARGET_SVE && FP_REGNUM_P (regno))
3698 /* Don't use the SVE part of the register for __builtin_apply and
3699 __builtin_return. The SVE registers aren't used by the normal PCS,
3700 so using them there would be a waste of time. The PCS extensions
3701 for SVE types are fundamentally incompatible with the
3702 __builtin_return/__builtin_apply interface. */
3703 return as_a <fixed_size_mode> (V16QImode);
3704 return default_get_reg_raw_mode (regno);
3705 }
3706
3707 /* Implement TARGET_FUNCTION_ARG_PADDING.
3708
3709 Small aggregate types are placed in the lowest memory address.
3710
3711 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3712
3713 static pad_direction
aarch64_function_arg_padding(machine_mode mode,const_tree type)3714 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3715 {
3716 /* On little-endian targets, the least significant byte of every stack
3717 argument is passed at the lowest byte address of the stack slot. */
3718 if (!BYTES_BIG_ENDIAN)
3719 return PAD_UPWARD;
3720
3721 /* Otherwise, integral, floating-point and pointer types are padded downward:
3722 the least significant byte of a stack argument is passed at the highest
3723 byte address of the stack slot. */
3724 if (type
3725 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3726 || POINTER_TYPE_P (type))
3727 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3728 return PAD_DOWNWARD;
3729
3730 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3731 return PAD_UPWARD;
3732 }
3733
3734 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3735
3736 It specifies padding for the last (may also be the only)
3737 element of a block move between registers and memory. If
3738 assuming the block is in the memory, padding upward means that
3739 the last element is padded after its highest significant byte,
3740 while in downward padding, the last element is padded at the
3741 its least significant byte side.
3742
3743 Small aggregates and small complex types are always padded
3744 upwards.
3745
3746 We don't need to worry about homogeneous floating-point or
3747 short-vector aggregates; their move is not affected by the
3748 padding direction determined here. Regardless of endianness,
3749 each element of such an aggregate is put in the least
3750 significant bits of a fp/simd register.
3751
3752 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3753 register has useful data, and return the opposite if the most
3754 significant byte does. */
3755
3756 bool
aarch64_pad_reg_upward(machine_mode mode,const_tree type,bool first ATTRIBUTE_UNUSED)3757 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3758 bool first ATTRIBUTE_UNUSED)
3759 {
3760
3761 /* Small composite types are always padded upward. */
3762 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3763 {
3764 HOST_WIDE_INT size;
3765 if (type)
3766 size = int_size_in_bytes (type);
3767 else
3768 /* No frontends can create types with variable-sized modes, so we
3769 shouldn't be asked to pass or return them. */
3770 size = GET_MODE_SIZE (mode).to_constant ();
3771 if (size < 2 * UNITS_PER_WORD)
3772 return true;
3773 }
3774
3775 /* Otherwise, use the default padding. */
3776 return !BYTES_BIG_ENDIAN;
3777 }
3778
/* Implement TARGET_LIBGCC_CMP_RETURN_MODE: libgcc comparison helpers
   return their result in SImode.  */

static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}
3784
/* Interval in bytes between successive stack probes, derived from the
   target-independent STACK_CHECK_PROBE_INTERVAL_EXP parameter.  */
#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif

/* The pair of scratch registers used for stack probing.  */
#define PROBE_STACK_FIRST_REG 9
#define PROBE_STACK_SECOND_REG 10
3798
/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
   inclusive.  These are offsets from the current stack pointer.

   Three strategies are used depending on SIZE: a single probe, an
   unrolled sequence of probes, or a run-time probing loop.  SVE
   (non-constant) frame sizes are not yet supported.  */

static void
aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
{
  HOST_WIDE_INT size;
  if (!poly_size.is_constant (&size))
    {
      sorry ("stack probes for SVE frames");
      return;
    }

  rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);

  /* See the same assertion on PROBE_INTERVAL above.  */
  gcc_assert ((first % ARITH_FACTOR) == 0);

  /* See if we have a constant small number of probes to generate.  If so,
     that's the easy case.  */
  if (size <= PROBE_INTERVAL)
    {
      const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);

      /* Point REG1 below the region by a 4096-multiple, then probe at
	 the exact end of the region.  */
      emit_set_insn (reg1,
		     plus_constant (Pmode,
				    stack_pointer_rtx, -(first + base)));
      emit_stack_probe (plus_constant (Pmode, reg1, base - size));
    }

  /* The run-time loop is made up of 8 insns in the generic case while the
     compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
  else if (size <= 4 * PROBE_INTERVAL)
    {
      HOST_WIDE_INT i, rem;

      emit_set_insn (reg1,
		     plus_constant (Pmode,
				    stack_pointer_rtx,
				    -(first + PROBE_INTERVAL)));
      emit_stack_probe (reg1);

      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
	 it exceeds SIZE.  If only two probes are needed, this will not
	 generate any code.  Then probe at FIRST + SIZE.  */
      for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
	{
	  emit_set_insn (reg1,
			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
	  emit_stack_probe (reg1);
	}

      /* REM is the distance from the last probed address to FIRST+SIZE.
	 Probes within 256 bytes can use a signed 9-bit offset directly;
	 larger remainders need another base adjustment.  */
      rem = size - (i - PROBE_INTERVAL);
      if (rem > 256)
	{
	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
	}
      else
	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
    }

  /* Otherwise, do the same as above, but in a loop.  Note that we must be
     extra careful with variables wrapping around because we might be at
     the very top (or the very bottom) of the address space and we have
     to be able to handle this case properly; in particular, we use an
     equality test for the loop condition.  */
  else
    {
      rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);

      /* Step 1: round SIZE to the previous multiple of the interval.  */

      HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;


      /* Step 2: compute initial and final value of the loop counter.  */

      /* TEST_ADDR = SP + FIRST.  */
      emit_set_insn (reg1,
		     plus_constant (Pmode, stack_pointer_rtx, -first));

      /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
      HOST_WIDE_INT adjustment = - (first + rounded_size);
      if (! aarch64_uimm12_shift (adjustment))
	{
	  /* The adjustment doesn't fit a 12-bit (optionally shifted)
	     immediate, so build it in REG2 and add it to SP there.  */
	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
					  true, Pmode);
	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
	}
      else
	emit_set_insn (reg2,
		       plus_constant (Pmode, stack_pointer_rtx, adjustment));

      /* Step 3: the loop

	 do
	   {
	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
	     probe at TEST_ADDR
	   }
	 while (TEST_ADDR != LAST_ADDR)

	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
	 until it is equal to ROUNDED_SIZE.  */

      emit_insn (gen_probe_stack_range (reg1, reg1, reg2));


      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
	 that SIZE is equal to ROUNDED_SIZE.  */

      if (size != rounded_size)
	{
	  HOST_WIDE_INT rem = size - rounded_size;

	  if (rem > 256)
	    {
	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);

	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
	    }
	  else
	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
	}
    }

  /* Make sure nothing is scheduled before we are done.  */
  emit_insn (gen_blockage ());
}
3932
3933 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3934 absolute addresses. */
3935
3936 const char *
aarch64_output_probe_stack_range(rtx reg1,rtx reg2)3937 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3938 {
3939 static int labelno = 0;
3940 char loop_lab[32];
3941 rtx xops[2];
3942
3943 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3944
3945 /* Loop. */
3946 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3947
3948 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3949 xops[0] = reg1;
3950 xops[1] = GEN_INT (PROBE_INTERVAL);
3951 output_asm_insn ("sub\t%0, %0, %1", xops);
3952
3953 /* Probe at TEST_ADDR. */
3954 output_asm_insn ("str\txzr, [%0]", xops);
3955
3956 /* Test if TEST_ADDR == LAST_ADDR. */
3957 xops[1] = reg2;
3958 output_asm_insn ("cmp\t%0, %1", xops);
3959
3960 /* Branch. */
3961 fputs ("\tb.ne\t", asm_out_file);
3962 assemble_name_raw (asm_out_file, loop_lab);
3963 fputc ('\n', asm_out_file);
3964
3965 return "";
3966 }
3967
3968 /* Mark the registers that need to be saved by the callee and calculate
3969 the size of the callee-saved registers area and frame record (both FP
3970 and LR may be omitted). */
static void
aarch64_layout_frame (void)
{
  HOST_WIDE_INT offset = 0;
  int regno, last_fp_reg = INVALID_REGNUM;

  /* Once reload has run the layout is final; don't recompute it.  */
  if (reload_completed && cfun->machine->frame.laid_out)
    return;

  /* Force a frame chain for EH returns so the return address is at FP+8.  */
  cfun->machine->frame.emit_frame_chain
    = frame_pointer_needed || crtl->calls_eh_return;

  /* Emit a frame chain if the frame pointer is enabled.
     If -momit-leaf-frame-pointer is used, do not use a frame chain
     in leaf functions which do not use LR.  */
  if (flag_omit_frame_pointer == 2
      && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
	   && !df_regs_ever_live_p (LR_REGNUM)))
    cfun->machine->frame.emit_frame_chain = true;

  /* Sentinel reg_offset values used while deciding which registers need
     save slots: SLOT_NOT_REQUIRED means no slot; SLOT_REQUIRED means a
     slot is needed but its offset has not been assigned yet.  */
#define SLOT_NOT_REQUIRED (-2)
#define SLOT_REQUIRED (-1)

  /* wb_candidate1/2 are the registers chosen below to be pushed/popped
     with stack-pointer writeback; start with none selected.  */
  cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
  cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;

  /* First mark all the registers that really need to be saved...  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;

  /* ... that includes the eh data registers (if needed)...  */
  if (crtl->calls_eh_return)
    for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
      cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
	= SLOT_REQUIRED;

  /* ... and any callee saved register that dataflow says is live.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& (regno == R30_REGNUM
	    || !call_used_regs[regno]))
      cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (df_regs_ever_live_p (regno)
	&& !call_used_regs[regno])
      {
	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
	/* Remember the highest saved FP register for the alignment-gap
	   optimization below.  */
	last_fp_reg = regno;
      }

  if (cfun->machine->frame.emit_frame_chain)
    {
      /* FP and LR are placed in the linkage record.  */
      cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
      cfun->machine->frame.wb_candidate1 = R29_REGNUM;
      cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
      cfun->machine->frame.wb_candidate2 = R30_REGNUM;
      offset = 2 * UNITS_PER_WORD;
    }

  /* Now assign stack slots for them.  The first two saved registers also
     become the writeback candidates if none were chosen above.  */
  for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
	cfun->machine->frame.reg_offset[regno] = offset;
	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate1 = regno;
	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate2 = regno;
	offset += UNITS_PER_WORD;
      }

  /* Align the start of the FP saves; remember whether that left an
     8-byte gap after the integer saves.  */
  HOST_WIDE_INT max_int_offset = offset;
  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
  bool has_align_gap = offset != max_int_offset;

  for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
    if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
      {
	/* If there is an alignment gap between integer and fp callee-saves,
	   allocate the last fp register to it if possible.  */
	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
	  {
	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
	    break;
	  }

	cfun->machine->frame.reg_offset[regno] = offset;
	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
	  cfun->machine->frame.wb_candidate1 = regno;
	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
		 /* Only pair an FP register with another FP register.  */
		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
	  cfun->machine->frame.wb_candidate2 = regno;
	offset += UNITS_PER_WORD;
      }

  offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);

  cfun->machine->frame.saved_regs_size = offset;

  HOST_WIDE_INT varargs_and_saved_regs_size
    = offset + cfun->machine->frame.saved_varargs_size;

  /* Distance from the incoming SP down to the hard frame pointer:
     varargs save area + callee saves + locals, rounded to the stack
     boundary (see the frame diagram further down in this file).  */
  cfun->machine->frame.hard_fp_offset
    = aligned_upper_bound (varargs_and_saved_regs_size
			   + get_frame_size (),
			   STACK_BOUNDARY / BITS_PER_UNIT);

  /* Both these values are already aligned.  */
  gcc_assert (multiple_p (crtl->outgoing_args_size,
			  STACK_BOUNDARY / BITS_PER_UNIT));
  cfun->machine->frame.frame_size
    = (cfun->machine->frame.hard_fp_offset
       + crtl->outgoing_args_size);

  /* NOTE(review): equals the varargs save area size — presumably the
     locals' offset from the (soft) frame pointer; confirm against the
     elimination-offset code.  */
  cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;

  cfun->machine->frame.initial_adjust = 0;
  cfun->machine->frame.final_adjust = 0;
  cfun->machine->frame.callee_adjust = 0;
  cfun->machine->frame.callee_offset = 0;

  /* Largest SP decrement that can be folded into the callee-save push:
     512 when a pair is pushed (stp writeback immediate range), 256 for a
     single register (str writeback range).  */
  HOST_WIDE_INT max_push_offset = 0;
  if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
    max_push_offset = 512;
  else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
    max_push_offset = 256;

  /* Choose one of four layout strategies, cheapest first.  */
  HOST_WIDE_INT const_size, const_fp_offset;
  if (cfun->machine->frame.frame_size.is_constant (&const_size)
      && const_size < max_push_offset
      && known_eq (crtl->outgoing_args_size, 0))
    {
      /* Simple, small frame with no outgoing arguments:
	 stp reg1, reg2, [sp, -frame_size]!
	 stp reg3, reg4, [sp, 16] */
      cfun->machine->frame.callee_adjust = const_size;
    }
  else if (known_lt (crtl->outgoing_args_size
		     + cfun->machine->frame.saved_regs_size, 512)
	   && !(cfun->calls_alloca
		&& known_lt (cfun->machine->frame.hard_fp_offset,
			     max_push_offset)))
    {
      /* Frame with small outgoing arguments:
	 sub sp, sp, frame_size
	 stp reg1, reg2, [sp, outgoing_args_size]
	 stp reg3, reg4, [sp, outgoing_args_size + 16] */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
      cfun->machine->frame.callee_offset
	= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
    }
  else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
	   && const_fp_offset < max_push_offset)
    {
      /* Frame with large outgoing arguments but a small local area:
	 stp reg1, reg2, [sp, -hard_fp_offset]!
	 stp reg3, reg4, [sp, 16]
	 sub sp, sp, outgoing_args_size */
      cfun->machine->frame.callee_adjust = const_fp_offset;
      cfun->machine->frame.final_adjust
	= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
    }
  else
    {
      /* Frame with large local area and outgoing arguments using frame pointer:
	 sub sp, sp, hard_fp_offset
	 stp x29, x30, [sp, 0]
	 add x29, sp, 0
	 stp reg3, reg4, [sp, 16]
	 sub sp, sp, outgoing_args_size */
      cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
      cfun->machine->frame.final_adjust
	= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
    }

  cfun->machine->frame.laid_out = true;
}
4154
4155 /* Return true if the register REGNO is saved on entry to
4156 the current function. */
4157
static bool
aarch64_register_saved_on_entry (int regno)
{
  /* After aarch64_layout_frame, a register that needs saving has a
     non-negative slot offset; unsaved registers keep the negative
     sentinel values.  */
  return cfun->machine->frame.reg_offset[regno] >= 0;
}
4163
4164 /* Return the next register up from REGNO up to LIMIT for the callee
4165 to save. */
4166
4167 static unsigned
aarch64_next_callee_save(unsigned regno,unsigned limit)4168 aarch64_next_callee_save (unsigned regno, unsigned limit)
4169 {
4170 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4171 regno ++;
4172 return regno;
4173 }
4174
4175 /* Push the register number REGNO of mode MODE to the stack with write-back
4176 adjusting the stack by ADJUSTMENT. */
4177
4178 static void
aarch64_pushwb_single_reg(machine_mode mode,unsigned regno,HOST_WIDE_INT adjustment)4179 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4180 HOST_WIDE_INT adjustment)
4181 {
4182 rtx base_rtx = stack_pointer_rtx;
4183 rtx insn, reg, mem;
4184
4185 reg = gen_rtx_REG (mode, regno);
4186 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4187 plus_constant (Pmode, base_rtx, -adjustment));
4188 mem = gen_frame_mem (mode, mem);
4189
4190 insn = emit_move_insn (mem, reg);
4191 RTX_FRAME_RELATED_P (insn) = 1;
4192 }
4193
4194 /* Generate and return an instruction to store the pair of registers
4195 REG and REG2 of mode MODE to location BASE with write-back adjusting
4196 the stack location BASE by ADJUSTMENT. */
4197
4198 static rtx
aarch64_gen_storewb_pair(machine_mode mode,rtx base,rtx reg,rtx reg2,HOST_WIDE_INT adjustment)4199 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4200 HOST_WIDE_INT adjustment)
4201 {
4202 switch (mode)
4203 {
4204 case E_DImode:
4205 return gen_storewb_pairdi_di (base, base, reg, reg2,
4206 GEN_INT (-adjustment),
4207 GEN_INT (UNITS_PER_WORD - adjustment));
4208 case E_DFmode:
4209 return gen_storewb_pairdf_di (base, base, reg, reg2,
4210 GEN_INT (-adjustment),
4211 GEN_INT (UNITS_PER_WORD - adjustment));
4212 default:
4213 gcc_unreachable ();
4214 }
4215 }
4216
4217 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4218 stack pointer by ADJUSTMENT. */
4219
static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  /* GP registers are pushed in DImode, FP registers in DFmode (only the
     low 64 bits are callee-saved; see the AAPCS64 note in
     aarch64_process_components).  */
  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;

  /* No second register: fall back to a single store with writeback.  */
  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  /* The writeback push is a PARALLEL; only its first element is implicitly
     frame-related, so mark the two register stores (elements 1 and 2)
     explicitly.  NOTE(review): element ordering comes from the
     storewb_pair* patterns — confirm against the machine description.  */
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
4238
4239 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4240 adjusting it by ADJUSTMENT afterwards. */
4241
4242 static rtx
aarch64_gen_loadwb_pair(machine_mode mode,rtx base,rtx reg,rtx reg2,HOST_WIDE_INT adjustment)4243 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4244 HOST_WIDE_INT adjustment)
4245 {
4246 switch (mode)
4247 {
4248 case E_DImode:
4249 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4250 GEN_INT (UNITS_PER_WORD));
4251 case E_DFmode:
4252 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4253 GEN_INT (UNITS_PER_WORD));
4254 default:
4255 gcc_unreachable ();
4256 }
4257 }
4258
4259 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4260 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4261 into CFI_OPS. */
4262
static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  /* GP registers are restored in DImode, FP registers in DFmode (only the
     low 64 bits were saved).  */
  machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
  rtx reg1 = gen_rtx_REG (mode, regno1);

  /* The caller attaches the accumulated restore notes to a later insn.  */
  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      /* Single register: load with post-increment of SP by ADJUSTMENT.  */
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      /* Pair: one load-pair with writeback restores both registers and
	 pops the stack in a single instruction.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
4286
4287 /* Generate and return a store pair instruction of mode MODE to store
4288 register REG1 to MEM1 and register REG2 to MEM2. */
4289
4290 static rtx
aarch64_gen_store_pair(machine_mode mode,rtx mem1,rtx reg1,rtx mem2,rtx reg2)4291 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4292 rtx reg2)
4293 {
4294 switch (mode)
4295 {
4296 case E_DImode:
4297 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4298
4299 case E_DFmode:
4300 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4301
4302 default:
4303 gcc_unreachable ();
4304 }
4305 }
4306
/* Generate and return a load pair instruction of mode MODE to load register
   REG1 from MEM1 and register REG2 from MEM2.  */
4309
4310 static rtx
aarch64_gen_load_pair(machine_mode mode,rtx reg1,rtx mem1,rtx reg2,rtx mem2)4311 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4312 rtx mem2)
4313 {
4314 switch (mode)
4315 {
4316 case E_DImode:
4317 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4318
4319 case E_DFmode:
4320 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4321
4322 default:
4323 gcc_unreachable ();
4324 }
4325 }
4326
4327 /* Return TRUE if return address signing should be enabled for the current
4328 function, otherwise return FALSE. */
4329
bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
     function if its LR is pushed onto stack.  A non-negative reg_offset
     means LR was allocated a save slot by aarch64_layout_frame.  */
  return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
}
4342
4343 /* Emit code to save the callee-saved registers from register number START
4344 to LIMIT to the stack at the location starting at offset START_OFFSET,
4345 skipping any write-back candidates if SKIP_WB is true. */
4346
static void
aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
			   unsigned start, unsigned limit, bool skip_wb)
{
  rtx_insn *insn;
  unsigned regno;
  unsigned regno2;

  /* Walk the registers in [START, LIMIT] that need saving, combining two
     registers with adjacent slots into a single store-pair.  */
  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      rtx reg, mem;
      poly_int64 offset;

      /* Registers pushed with writeback are saved by the caller.  */
      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      /* Shrink-wrapped registers are saved by their component code.  */
      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
						offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      /* Use a store-pair when the next saved register occupies the
	 immediately following slot.  */
      if (regno2 <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
	      == cfun->machine->frame.reg_offset[regno2]))

	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
						     offset));
	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
						    reg2));

	  /* The first part of a frame-related parallel insn is
	     always assumed to be relevant to the frame
	     calculations; subsequent parts, are only
	     frame-related if explicitly marked.  */
	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
	  /* Skip REGNO2 on the next iteration — it has been saved.  */
	  regno = regno2;
	}
      else
	insn = emit_move_insn (mem, reg);

      RTX_FRAME_RELATED_P (insn) = 1;
    }
}
4405
4406 /* Emit code to restore the callee registers of mode MODE from register
4407 number START up to and including LIMIT. Restore from the stack offset
4408 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4409 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4410
static void
aarch64_restore_callee_saves (machine_mode mode,
			      poly_int64 start_offset, unsigned start,
			      unsigned limit, bool skip_wb, rtx *cfi_ops)
{
  rtx base_rtx = stack_pointer_rtx;
  unsigned regno;
  unsigned regno2;
  poly_int64 offset;

  /* Walk the saved registers in [START, LIMIT], combining two registers
     with adjacent slots into a single load-pair.  */
  for (regno = aarch64_next_callee_save (start, limit);
       regno <= limit;
       regno = aarch64_next_callee_save (regno + 1, limit))
    {
      /* Shrink-wrapped registers are restored by their component code.  */
      if (cfun->machine->reg_is_wrapped_separately[regno])
	continue;

      rtx reg, mem;

      /* Registers popped with writeback are restored by the caller.  */
      if (skip_wb
	  && (regno == cfun->machine->frame.wb_candidate1
	      || regno == cfun->machine->frame.wb_candidate2))
	continue;

      reg = gen_rtx_REG (mode, regno);
      offset = start_offset + cfun->machine->frame.reg_offset[regno];
      mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));

      regno2 = aarch64_next_callee_save (regno + 1, limit);

      /* Use a load-pair when the next restored register occupies the
	 immediately following slot.  */
      if (regno2 <= limit
	  && !cfun->machine->reg_is_wrapped_separately[regno2]
	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
	      == cfun->machine->frame.reg_offset[regno2]))
	{
	  rtx reg2 = gen_rtx_REG (mode, regno2);
	  rtx mem2;

	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
	  regno = regno2;
	}
      else
	emit_move_insn (reg, mem);
      /* A REG_CFA_RESTORE note for REG is recorded on both paths (the
	 pair path recorded REG2's note above); the caller attaches the
	 accumulated notes to a later insn.  */
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
    }
}
4461
4462 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4463 of MODE. */
4464
4465 static inline bool
offset_4bit_signed_scaled_p(machine_mode mode,poly_int64 offset)4466 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4467 {
4468 HOST_WIDE_INT multiple;
4469 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4470 && IN_RANGE (multiple, -8, 7));
4471 }
4472
/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of MODE.  */
4475
4476 static inline bool
offset_6bit_unsigned_scaled_p(machine_mode mode,poly_int64 offset)4477 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4478 {
4479 HOST_WIDE_INT multiple;
4480 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4481 && IN_RANGE (multiple, 0, 63));
4482 }
4483
4484 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4485 of MODE. */
4486
4487 bool
aarch64_offset_7bit_signed_scaled_p(machine_mode mode,poly_int64 offset)4488 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4489 {
4490 HOST_WIDE_INT multiple;
4491 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4492 && IN_RANGE (multiple, -64, 63));
4493 }
4494
4495 /* Return true if OFFSET is a signed 9-bit value. */
4496
4497 static inline bool
offset_9bit_signed_unscaled_p(machine_mode mode ATTRIBUTE_UNUSED,poly_int64 offset)4498 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4499 poly_int64 offset)
4500 {
4501 HOST_WIDE_INT const_offset;
4502 return (offset.is_constant (&const_offset)
4503 && IN_RANGE (const_offset, -256, 255));
4504 }
4505
4506 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4507 of MODE. */
4508
4509 static inline bool
offset_9bit_signed_scaled_p(machine_mode mode,poly_int64 offset)4510 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4511 {
4512 HOST_WIDE_INT multiple;
4513 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4514 && IN_RANGE (multiple, -256, 255));
4515 }
4516
4517 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4518 of MODE. */
4519
4520 static inline bool
offset_12bit_unsigned_scaled_p(machine_mode mode,poly_int64 offset)4521 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4522 {
4523 HOST_WIDE_INT multiple;
4524 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4525 && IN_RANGE (multiple, 0, 4095));
4526 }
4527
4528 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4529
static sbitmap
aarch64_get_separate_components (void)
{
  aarch64_layout_frame ();

  /* One component bit per saveable register.  */
  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* The registers we need saved to the frame.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (aarch64_register_saved_on_entry (regno))
      {
	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
	/* Without a frame pointer the slots are addressed from SP, which
	   sits frame_size - hard_fp_offset below the FP-relative base.  */
	if (!frame_pointer_needed)
	  offset += cfun->machine->frame.frame_size
		    - cfun->machine->frame.hard_fp_offset;
	/* Check that we can access the stack slot of the register with one
	   direct load with no adjustments needed.  */
	if (offset_12bit_unsigned_scaled_p (DImode, offset))
	  bitmap_set_bit (components, regno);
      }

  /* Don't mess with the hard frame pointer.  */
  if (frame_pointer_needed)
    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);

  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* If aarch64_layout_frame has chosen registers to store/restore with
     writeback don't interfere with them to avoid having to output explicit
     stack adjustment instructions.  */
  if (reg2 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg2);
  if (reg1 != INVALID_REGNUM)
    bitmap_clear_bit (components, reg1);

  /* LR and SP are always handled by the main prologue/epilogue code.  */
  bitmap_clear_bit (components, LR_REGNUM);
  bitmap_clear_bit (components, SP_REGNUM);

  return components;
}
4571
4572 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4573
static sbitmap
aarch64_components_for_bb (basic_block bb)
{
  bitmap in = DF_LIVE_IN (bb);
  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;

  sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
  bitmap_clear (components);

  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if ((!call_used_regs[regno])
	&& (bitmap_bit_p (in, regno)
	    || bitmap_bit_p (gen, regno)
	    || bitmap_bit_p (kill, regno)))
      {
	unsigned regno2, offset, offset2;
	bitmap_set_bit (components, regno);

	/* If there is a callee-save at an adjacent offset, add it too
	   to increase the use of LDP/STP.  */
	offset = cfun->machine->frame.reg_offset[regno];
	/* The partner is the other half of this slot's aligned 16-byte
	   chunk: the next register when the slot is 16-byte aligned,
	   otherwise the previous one.  */
	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;

	if (regno2 <= LAST_SAVED_REGNUM)
	  {
	    offset2 = cfun->machine->frame.reg_offset[regno2];
	    /* Pair them only when both slots lie in the same 16-byte chunk
	       (offsets equal ignoring bit 3).  The negative sentinel
	       offsets of unsaved registers cannot match here.  */
	    if ((offset & ~8) == (offset2 & ~8))
	      bitmap_set_bit (components, regno2);
	  }
      }

  return components;
}
4609
4610 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4611 Nothing to do for aarch64. */
4612
static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
  /* No component ever needs to be disqualified on AArch64, so this hook
     is intentionally empty.  */
}
4617
4618 /* Return the next set bit in BMP from START onwards. Return the total number
4619 of bits in BMP if no set bit is found at or after START. */
4620
4621 static unsigned int
aarch64_get_next_set_bit(sbitmap bmp,unsigned int start)4622 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4623 {
4624 unsigned int nbits = SBITMAP_SIZE (bmp);
4625 if (start == nbits)
4626 return start;
4627
4628 gcc_assert (start < nbits);
4629 for (unsigned int i = start; i < nbits; i++)
4630 if (bitmap_bit_p (bmp, i))
4631 return i;
4632
4633 return nbits;
4634 }
4635
4636 /* Do the work for aarch64_emit_prologue_components and
4637 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4638 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4639 for these components or the epilogue sequence. That is, it determines
4640 whether we should emit stores or loads and what kind of CFA notes to attach
4641 to the insns. Otherwise the logic for the two sequences is very
4642 similar. */
4643
static void
aarch64_process_components (sbitmap components, bool prologue_p)
{
  /* Save slots are addressed from the hard frame pointer when we have
     one, otherwise from the stack pointer.  */
  rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
			     ? HARD_FRAME_POINTER_REGNUM
			     : STACK_POINTER_REGNUM);

  unsigned last_regno = SBITMAP_SIZE (components);
  unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
  rtx_insn *insn = NULL;

  while (regno != last_regno)
    {
      /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
	 so DFmode for the vector registers is enough.  */
      machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
      rtx reg = gen_rtx_REG (mode, regno);
      poly_int64 offset = cfun->machine->frame.reg_offset[regno];
      /* reg_offset is relative to the FP-based layout; rebase onto SP
	 when there is no frame pointer.  */
      if (!frame_pointer_needed)
	offset += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr = plus_constant (Pmode, ptr_reg, offset);
      rtx mem = gen_frame_mem (mode, addr);

      /* Prologue stores the register; epilogue loads it back.  */
      rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
      unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
      /* No more registers to handle after REGNO.
	 Emit a single save/restore and exit.  */
      if (regno2 == last_regno)
	{
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);
	  break;
	}

      poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
      /* The next register is not of the same class or its offset is not
	 mergeable with the current one into a pair.  */
      if (!satisfies_constraint_Ump (mem)
	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
		       GET_MODE_SIZE (mode)))
	{
	  /* Emit a single save/restore for REGNO and retry with REGNO2.  */
	  insn = emit_insn (set);
	  RTX_FRAME_RELATED_P (insn) = 1;
	  if (prologue_p)
	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
	  else
	    add_reg_note (insn, REG_CFA_RESTORE, reg);

	  regno = regno2;
	  continue;
	}

      /* REGNO2 can be saved/restored in a pair with REGNO.  */
      rtx reg2 = gen_rtx_REG (mode, regno2);
      if (!frame_pointer_needed)
	offset2 += cfun->machine->frame.frame_size
		  - cfun->machine->frame.hard_fp_offset;
      rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
      rtx mem2 = gen_frame_mem (mode, addr2);
      rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
			    : gen_rtx_SET (reg2, mem2);

      if (prologue_p)
	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
      else
	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));

      /* Record CFI for both halves of the pair: save offsets in the
	 prologue, restores in the epilogue.  */
      RTX_FRAME_RELATED_P (insn) = 1;
      if (prologue_p)
	{
	  add_reg_note (insn, REG_CFA_OFFSET, set);
	  add_reg_note (insn, REG_CFA_OFFSET, set2);
	}
      else
	{
	  add_reg_note (insn, REG_CFA_RESTORE, reg);
	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
	}

      regno = aarch64_get_next_set_bit (components, regno2 + 1);
    }
}
4732
4733 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4734
static void
aarch64_emit_prologue_components (sbitmap components)
{
  /* TRUE selects the prologue direction: emit stores with
     REG_CFA_OFFSET notes.  */
  aarch64_process_components (components, true);
}
4740
4741 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4742
static void
aarch64_emit_epilogue_components (sbitmap components)
{
  /* FALSE selects the epilogue direction: emit loads with
     REG_CFA_RESTORE notes.  */
  aarch64_process_components (components, false);
}
4748
4749 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4750
4751 static void
aarch64_set_handled_components(sbitmap components)4752 aarch64_set_handled_components (sbitmap components)
4753 {
4754 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4755 if (bitmap_bit_p (components, regno))
4756 cfun->machine->reg_is_wrapped_separately[regno] = true;
4757 }
4758
4759 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4760 is saved at BASE + OFFSET. */
4761
4762 static void
aarch64_add_cfa_expression(rtx_insn * insn,unsigned int reg,rtx base,poly_int64 offset)4763 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4764 rtx base, poly_int64 offset)
4765 {
4766 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4767 add_reg_note (insn, REG_CFA_EXPRESSION,
4768 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4769 }
4770
4771 /* AArch64 stack frames generated by this compiler look like:
4772
4773 +-------------------------------+
4774 | |
4775 | incoming stack arguments |
4776 | |
4777 +-------------------------------+
4778 | | <-- incoming stack pointer (aligned)
4779 | callee-allocated save area |
4780 | for register varargs |
4781 | |
4782 +-------------------------------+
4783 | local variables | <-- frame_pointer_rtx
4784 | |
4785 +-------------------------------+
4786 | padding0 | \
4787 +-------------------------------+ |
4788 | callee-saved registers | | frame.saved_regs_size
4789 +-------------------------------+ |
4790 | LR' | |
4791 +-------------------------------+ |
4792 | FP' | / <- hard_frame_pointer_rtx (aligned)
4793 +-------------------------------+
4794 | dynamic allocation |
4795 +-------------------------------+
4796 | padding |
4797 +-------------------------------+
4798 | outgoing stack arguments | <-- arg_pointer
4799 | |
4800 +-------------------------------+
4801 | | <-- stack_pointer_rtx (aligned)
4802
4803 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4804 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4805 unchanged. */
4806
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */

void
aarch64_expand_prologue (void)
{
  aarch64_layout_frame ();

  /* Snapshot the layout computed by aarch64_layout_frame.  */
  poly_int64 frame_size = cfun->machine->frame.frame_size;
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
  rtx_insn *insn;

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      insn = emit_insn (gen_pacisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  if (flag_stack_usage_info)
    /* For -fstack-usage, report the compile-time-known lower bound of
       the (possibly SVE-variable) frame size.  */
    current_function_static_stack_size = constant_lower_bound (frame_size);

  /* -fstack-check: probe the pages the frame will occupy before
     committing the allocation.  */
  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  /* IP0/IP1 are free for use as scratch registers in the prologue.  */
  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);

  aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);

  /* Push FP/LR (or the writeback candidates) with a pre-decrement store
     pair when the layout chose that strategy.  */
  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  if (emit_frame_chain)
    {
      poly_int64 reg_offset = callee_adjust;
      if (callee_adjust == 0)
	{
	  /* No writeback push happened, so save FP/LR explicitly at
	     CALLEE_OFFSET before establishing the frame pointer.  */
	  reg1 = R29_REGNUM;
	  reg2 = R30_REGNUM;
	  reg_offset = callee_offset;
	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
	}
      /* Set up the frame pointer: FP = SP + callee_offset.  */
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, callee_offset,
			  ip1_rtx, ip0_rtx, frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
				       callee_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  reg_offset -= callee_offset;
	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
				      reg_offset + UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
				      reg_offset);
	}
      /* Prevent the scheduler from moving later SP-relative accesses
	 above the FP setup.  */
      emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
    }

  /* Save the remaining callee-saved GP and FP/SIMD registers.  */
  aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
			     callee_adjust != 0 || emit_frame_chain);
  /* Allocate the outgoing-arguments area (and any remaining frame).  */
  aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
}
4914
4915 /* Return TRUE if we can use a simple_return insn.
4916
4917 This function checks whether the callee saved stack is empty, which
4918 means no restore actions are need. The pro_and_epilogue will use
4919 this to check whether shrink-wrapping opt is feasible. */
4920
4921 bool
aarch64_use_return_insn_p(void)4922 aarch64_use_return_insn_p (void)
4923 {
4924 if (!reload_completed)
4925 return false;
4926
4927 if (crtl->profile)
4928 return false;
4929
4930 aarch64_layout_frame ();
4931
4932 return known_eq (cfun->machine->frame.frame_size, 0);
4933 }
4934
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prolog sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */
void
aarch64_expand_epilogue (bool for_sibcall)
{
  aarch64_layout_frame ();

  /* Snapshot the layout computed by aarch64_layout_frame.  */
  poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
  poly_int64 final_adjust = cfun->machine->frame.final_adjust;
  poly_int64 callee_offset = cfun->machine->frame.callee_offset;
  unsigned reg1 = cfun->machine->frame.wb_candidate1;
  unsigned reg2 = cfun->machine->frame.wb_candidate2;
  /* Deferred REG_CFA_RESTORE notes, emitted in one batch below.  */
  rtx cfi_ops = NULL;
  rtx_insn *insn;
  /* A stack clash protection prologue may not have left IP0_REGNUM or
     IP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ()
			&& !flag_stack_clash_protection);

  /* We need to add memory barrier to prevent read from deallocated stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ cfun->machine->frame.saved_varargs_size, 0);

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx, -callee_offset,
			ip1_rtx, ip0_rtx, callee_adjust == 0);
  else
    aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
		    !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));

  /* Reload the callee-saved GP and FP/SIMD registers, queueing the
     corresponding CFA-restore notes in CFI_OPS.  */
  aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
				callee_adjust != 0, &cfi_ops);
  aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
				callee_adjust != 0, &cfi_ops);

  if (need_barrier_p)
    emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));

  /* Pop FP/LR (or the writeback candidates) with a post-increment
     load pair when the prologue used that strategy.  */
  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Undo the initial stack allocation.  */
  aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
		  !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are three cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.

	3) On an eh_return path we make extra stack adjustments to update the
	   canonical frame address to be the exception handler's CFA.  We want
	   to authenticate using the CFA of the function which calls eh_return.
	*/
  if (aarch64_return_address_signing_enabled ()
      && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
    {
      insn = emit_insn (gen_autisp ());
      add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return)
    {
      /* We need to unwind the stack by the offset computed by
	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
    }

  /* Keep LR live up to the return so nothing clobbers it.  */
  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!for_sibcall)
    emit_jump_insn (ret_rtx);
}
5061
5062 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5063 normally or return to a previous frame after unwinding.
5064
5065 An EH return uses a single shared return sequence. The epilogue is
5066 exactly like a normal epilogue except that it has an extra input
5067 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5068 that must be applied after the frame has been destroyed. An extra label
5069 is inserted before the epilogue which initializes this register to zero,
5070 and this is the entry point for a normal return.
5071
5072 An actual EH return updates the return address, initializes the stack
5073 adjustment and jumps directly into the epilogue (bypassing the zeroing
5074 of the adjustment). Since the return address is typically saved on the
5075 stack when a function makes a call, the saved LR must be updated outside
5076 the epilogue.
5077
5078 This poses problems as the store is generated well before the epilogue,
5079 so the offset of LR is not known yet. Also optimizations will remove the
5080 store as it appears dead, even after the epilogue is generated (as the
5081 base or offset for loading LR is different in many cases).
5082
5083 To avoid these problems this implementation forces the frame pointer
5084 in eh_return functions so that the location of LR is fixed and known early.
5085 It also marks the store volatile, so no optimization is permitted to
5086 remove the store. */
5087 rtx
aarch64_eh_return_handler_rtx(void)5088 aarch64_eh_return_handler_rtx (void)
5089 {
5090 rtx tmp = gen_frame_mem (Pmode,
5091 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5092
5093 /* Mark the store volatile, so no optimization is permitted to remove it. */
5094 MEM_VOLATILE_P (tmp) = true;
5095 return tmp;
5096 }
5097
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  If VCALL_OFFSET
   is nonzero, also load *(*this + vcall_offset) and add it to "this"
   before tail-calling FUNCTION.  */
static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer maybe bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;

  /* The thunk is emitted directly; pretend reload has already run so
     RTL generation uses hard registers.  */
  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);

  if (vcall_offset == 0)
    /* Simple case: this += delta.  */
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  /* Small deltas can be folded into the vtable load below as a
	     pre-modify address; otherwise add them up front.  */
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, false);
	}

      /* Load the vtable pointer (*this); in ILP32 extend it to Pmode.  */
      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      /* Form the address of the vcall slot within the vtable.  */
      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      /* Load the vcall offset and add it to "this".  */
      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
  SIBLING_CALL_P (insn) = 1;

  /* Run the late compilation machinery by hand to emit the thunk body.  */
  insn = get_insns ();
  shorten_branches (insn);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
5184
5185 static bool
aarch64_tls_referenced_p(rtx x)5186 aarch64_tls_referenced_p (rtx x)
5187 {
5188 if (!TARGET_HAVE_TLS)
5189 return false;
5190 subrtx_iterator::array_type array;
5191 FOR_EACH_SUBRTX (iter, array, x, ALL)
5192 {
5193 const_rtx x = *iter;
5194 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5195 return true;
5196 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5197 TLS offsets, not real symbol references. */
5198 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5199 iter.skip_subrtxes ();
5200 }
5201 return false;
5202 }
5203
5204
5205 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5206 a left shift of 0 or 12 bits. */
5207 bool
aarch64_uimm12_shift(HOST_WIDE_INT val)5208 aarch64_uimm12_shift (HOST_WIDE_INT val)
5209 {
5210 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5211 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5212 );
5213 }
5214
5215
5216 /* Return true if val is an immediate that can be loaded into a
5217 register by a MOVZ instruction. */
5218 static bool
aarch64_movw_imm(HOST_WIDE_INT val,scalar_int_mode mode)5219 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5220 {
5221 if (GET_MODE_SIZE (mode) > 4)
5222 {
5223 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5224 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5225 return 1;
5226 }
5227 else
5228 {
5229 /* Ignore sign extension. */
5230 val &= (HOST_WIDE_INT) 0xffffffff;
5231 }
5232 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5233 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5234 }
5235
5236 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5237 64-bit (DImode) integer. */
5238
5239 static unsigned HOST_WIDE_INT
aarch64_replicate_bitmask_imm(unsigned HOST_WIDE_INT val,machine_mode mode)5240 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5241 {
5242 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5243 while (size < 64)
5244 {
5245 val &= (HOST_WIDE_INT_1U << size) - 1;
5246 val |= val << size;
5247 size *= 2;
5248 }
5249 return val;
5250 }
5251
/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.
   Multiplying a pattern of that width by the corresponding entry
   replicates it across all 64 bits.  */

static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
  {
    0x0000000100000001ull,	/* width 32 */
    0x0001000100010001ull,	/* width 16 */
    0x0101010101010101ull,	/* width 8 */
    0x1111111111111111ull,	/* width 4 */
    0x5555555555555555ull,	/* width 2 */
  };
5262
5263
5264 /* Return true if val is a valid bitmask immediate. */
5265
5266 bool
aarch64_bitmask_imm(HOST_WIDE_INT val_in,machine_mode mode)5267 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5268 {
5269 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5270 int bits;
5271
5272 /* Check for a single sequence of one bits and return quickly if so.
5273 The special cases of all ones and all zeroes returns false. */
5274 val = aarch64_replicate_bitmask_imm (val_in, mode);
5275 tmp = val + (val & -val);
5276
5277 if (tmp == (tmp & -tmp))
5278 return (val + 1) > 1;
5279
5280 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5281 if (mode == SImode)
5282 val = (val << 32) | (val & 0xffffffff);
5283
5284 /* Invert if the immediate doesn't start with a zero bit - this means we
5285 only need to search for sequences of one bits. */
5286 if (val & 1)
5287 val = ~val;
5288
5289 /* Find the first set bit and set tmp to val with the first sequence of one
5290 bits removed. Return success if there is a single sequence of ones. */
5291 first_one = val & -val;
5292 tmp = val & (val + first_one);
5293
5294 if (tmp == 0)
5295 return true;
5296
5297 /* Find the next set bit and compute the difference in bit position. */
5298 next_one = tmp & -tmp;
5299 bits = clz_hwi (first_one) - clz_hwi (next_one);
5300 mask = val ^ tmp;
5301
5302 /* Check the bit position difference is a power of 2, and that the first
5303 sequence of one bits fits within 'bits' bits. */
5304 if ((mask >> bits) != 0 || bits != (bits & -bits))
5305 return false;
5306
5307 /* Check the sequence of one bits is repeated 64/bits times. */
5308 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5309 }
5310
5311 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5312 Assumed precondition: VAL_IN Is not zero. */
5313
5314 unsigned HOST_WIDE_INT
aarch64_and_split_imm1(HOST_WIDE_INT val_in)5315 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5316 {
5317 int lowest_bit_set = ctz_hwi (val_in);
5318 int highest_bit_set = floor_log2 (val_in);
5319 gcc_assert (val_in != 0);
5320
5321 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5322 (HOST_WIDE_INT_1U << lowest_bit_set));
5323 }
5324
5325 /* Create constant where bits outside of lowest bit set to highest bit set
5326 are set to 1. */
5327
5328 unsigned HOST_WIDE_INT
aarch64_and_split_imm2(HOST_WIDE_INT val_in)5329 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5330 {
5331 return val_in | ~aarch64_and_split_imm1 (val_in);
5332 }
5333
5334 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5335
5336 bool
aarch64_and_bitmask_imm(unsigned HOST_WIDE_INT val_in,machine_mode mode)5337 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5338 {
5339 scalar_int_mode int_mode;
5340 if (!is_a <scalar_int_mode> (mode, &int_mode))
5341 return false;
5342
5343 if (aarch64_bitmask_imm (val_in, int_mode))
5344 return false;
5345
5346 if (aarch64_move_imm (val_in, int_mode))
5347 return false;
5348
5349 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5350
5351 return aarch64_bitmask_imm (imm2, int_mode);
5352 }
5353
5354 /* Return true if val is an immediate that can be loaded into a
5355 register in a single instruction. */
5356 bool
aarch64_move_imm(HOST_WIDE_INT val,machine_mode mode)5357 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5358 {
5359 scalar_int_mode int_mode;
5360 if (!is_a <scalar_int_mode> (mode, &int_mode))
5361 return false;
5362
5363 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5364 return 1;
5365 return aarch64_bitmask_imm (val, int_mode);
5366 }
5367
/* Implement TARGET_CANNOT_FORCE_CONST_MEM: return true if X should not
   be spilled to the constant pool.  */
static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  rtx base, offset;

  /* HIGH parts are handled by the corresponding LO_SUM, never pooled.  */
  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT)
      return true;

  /* Split X into symbol + constant offset and classify the symbol.  */
  split_const (x, &base, &offset);
  if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
    {
      if (aarch64_classify_symbol (base, INTVAL (offset))
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  /* TLS references need a dynamic sequence, not a pool load.  */
  return aarch64_tls_referenced_p (x);
}
5396
5397 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5398 The expansion for a table switch is quite expensive due to the number
5399 of instructions, the table lookup and hard to predict indirect jump.
5400 When optimizing for speed, and -O3 enabled, use the per-core tuning if
5401 set, otherwise use tables for > 16 cases as a tradeoff between size and
5402 performance. When optimizing for size, use the default setting. */
5403
5404 static unsigned int
aarch64_case_values_threshold(void)5405 aarch64_case_values_threshold (void)
5406 {
5407 /* Use the specified limit for the number of cases before using jump
5408 tables at higher optimization levels. */
5409 if (optimize > 2
5410 && selected_cpu->tune->max_case_values != 0)
5411 return selected_cpu->tune->max_case_values;
5412 else
5413 return optimize_size ? default_case_values_threshold () : 17;
5414 }
5415
5416 /* Return true if register REGNO is a valid index register.
5417 STRICT_P is true if REG_OK_STRICT is in effect. */
5418
5419 bool
aarch64_regno_ok_for_index_p(int regno,bool strict_p)5420 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5421 {
5422 if (!HARD_REGISTER_NUM_P (regno))
5423 {
5424 if (!strict_p)
5425 return true;
5426
5427 if (!reg_renumber)
5428 return false;
5429
5430 regno = reg_renumber[regno];
5431 }
5432 return GP_REGNUM_P (regno);
5433 }
5434
5435 /* Return true if register REGNO is a valid base register for mode MODE.
5436 STRICT_P is true if REG_OK_STRICT is in effect. */
5437
5438 bool
aarch64_regno_ok_for_base_p(int regno,bool strict_p)5439 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5440 {
5441 if (!HARD_REGISTER_NUM_P (regno))
5442 {
5443 if (!strict_p)
5444 return true;
5445
5446 if (!reg_renumber)
5447 return false;
5448
5449 regno = reg_renumber[regno];
5450 }
5451
5452 /* The fake registers will be eliminated to either the stack or
5453 hard frame pointer, both of which are usually valid base registers.
5454 Reload deals with the cases where the eliminated form isn't valid. */
5455 return (GP_REGNUM_P (regno)
5456 || regno == SP_REGNUM
5457 || regno == FRAME_POINTER_REGNUM
5458 || regno == ARG_POINTER_REGNUM);
5459 }
5460
5461 /* Return true if X is a valid base register for mode MODE.
5462 STRICT_P is true if REG_OK_STRICT is in effect. */
5463
5464 static bool
aarch64_base_register_rtx_p(rtx x,bool strict_p)5465 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5466 {
5467 if (!strict_p
5468 && GET_CODE (x) == SUBREG
5469 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5470 x = SUBREG_REG (x);
5471
5472 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5473 }
5474
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.
   Recognizes the register-index forms the ISA supports: an optionally
   sign/zero-extended register, optionally scaled by the access size
   (expressed in RTL as MULT, ASHIFT, or extract/AND forms of them).  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
			machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  /* Left-shift amount applied to the index; -1 marks a malformed match.  */
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || GET_CODE (x) == SUBREG)
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
	    || GET_CODE (x) == ZERO_EXTEND)
	   && GET_MODE (x) == DImode
	   && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      /* exact_log2 returns -1 for a non-power-of-2 scale, rejected below.  */
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      /* The extract must take exactly the low 32+shift bits from bit 0
	 for this to be an extend-then-scale of a 32-bit value.  */
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      /* The mask must be exactly the shifted 32-bit mask.  */
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
  else if ((GET_CODE (x) == SIGN_EXTRACT
	    || GET_CODE (x) == ZERO_EXTRACT)
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
    {
      type = (GET_CODE (x) == SIGN_EXTRACT)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != 32 + shift
	  || INTVAL (XEXP (x, 2)) != 0)
	shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  /* Before reload, look through paradoxical subregs of GP-mode values.  */
  if (!strict_p
      && GET_CODE (index) == SUBREG
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode))
    {
      /* SVE loads/stores only accept an unextended register index scaled
	 by exactly the element size.  */
      if (type != ADDRESS_REG_REG
	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
	return false;
    }
  else
    {
      /* Otherwise the scale must be 1 or match the access size.  */
      if (shift != 0
	  && !(IN_RANGE (shift, 1, 3)
	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
	return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
5646
5647 /* Return true if MODE is one of the modes for which we
5648 support LDP/STP operations. */
5649
5650 static bool
aarch64_mode_valid_for_sched_fusion_p(machine_mode mode)5651 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5652 {
5653 return mode == SImode || mode == DImode
5654 || mode == SFmode || mode == DFmode
5655 || (aarch64_vector_mode_supported_p (mode)
5656 && known_eq (GET_MODE_SIZE (mode), 8));
5657 }
5658
5659 /* Return true if REGNO is a virtual pointer register, or an eliminable
5660 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5661 include stack_pointer or hard_frame_pointer. */
5662 static bool
virt_or_elim_regno_p(unsigned regno)5663 virt_or_elim_regno_p (unsigned regno)
5664 {
5665 return ((regno >= FIRST_VIRTUAL_REGISTER
5666 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5667 || regno == FRAME_POINTER_REGNUM
5668 || regno == ARG_POINTER_REGNUM);
5669 }
5670
/* Return true if X is a valid address of type TYPE for machine mode MODE.
   If it is, fill in INFO appropriately.  STRICT_P is true if
   REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_address (struct aarch64_address_info *info,
			  rtx x, machine_mode mode, bool strict_p,
			  aarch64_addr_query_type type = ADDR_QUERY_M)
{
  enum rtx_code code = GET_CODE (x);
  rtx op0, op1;
  poly_int64 offset;

  HOST_WIDE_INT const_size;

  /* On BE, we use load/store pair for all large int mode load/stores.
     TI/TFmode may also use a load/store pair.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
  bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
			    || mode == TImode
			    || mode == TFmode
			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));

  /* Register-index addressing is not available for LDP/STP, and is
     otherwise restricted to sufficiently small accesses or to vector
     modes that support it natively.  */
  bool allow_reg_index_p = (!load_store_pair_p
			    && (known_lt (GET_MODE_SIZE (mode), 16)
				|| vec_flags == VEC_ADVSIMD
				|| vec_flags == VEC_SVE_DATA));

  /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
     [Rn, #offset, MUL VL].  */
  if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
      && (code != REG && code != PLUS))
    return false;

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
  if (advsimd_struct_p
      && !BYTES_BIG_ENDIAN
      && (code != POST_INC && code != REG))
    return false;

  gcc_checking_assert (GET_MODE (x) == VOIDmode
		       || SCALAR_INT_MODE_P (GET_MODE (x)));

  switch (code)
    {
    case REG:
    case SUBREG:
      /* A bare base register, with an implicit zero offset.  */
      info->type = ADDRESS_REG_IMM;
      info->base = x;
      info->offset = const0_rtx;
      info->const_offset = 0;
      return aarch64_base_register_rtx_p (x, strict_p);

    case PLUS:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      /* Before reload, accept (virtual or eliminable register
	 + constant) without range-checking the offset, since
	 elimination can still change the offset.  */
      if (! strict_p
	  && REG_P (op0)
	  && virt_or_elim_regno_p (REGNO (op0))
	  && poly_int_rtx_p (op1, &offset))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;
	  info->const_offset = offset;

	  return true;
	}

      /* Base register + constant offset; the offset range depends on
	 the mode and on whether an LDP/STP will be used.  */
      if (maybe_ne (GET_MODE_SIZE (mode), 0)
	  && aarch64_base_register_rtx_p (op0, strict_p)
	  && poly_int_rtx_p (op1, &offset))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;
	  info->const_offset = offset;

	  /* TImode and TFmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X:   7-bit signed scaled offset
	     Q:     9-bit signed offset
	     We conservatively require an offset representable in either mode.
	     When performing the check for pairs of X registers i.e.  LDP/STP
	     pass down DImode since that is the natural size of the LDP/STP
	     instruction memory accesses.  */
	  if (mode == TImode || mode == TFmode)
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && (offset_9bit_signed_unscaled_p (mode, offset)
			|| offset_12bit_unsigned_scaled_p (mode, offset)));

	  /* A 7bit offset check because OImode will emit a ldp/stp
	     instruction (only big endian will get here).
	     For ldp/stp instructions, the offset is scaled for the size of a
	     single element of the pair.  */
	  if (mode == OImode)
	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);

	  /* Three 9/12 bit offsets checks because CImode will emit three
	     ldr/str instructions (only big endian will get here).  */
	  if (mode == CImode)
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
			|| offset_12bit_unsigned_scaled_p (V16QImode,
							   offset + 32)));

	  /* Two 7bit offsets checks because XImode will emit two ldp/stp
	     instructions (only big endian will get here).  */
	  if (mode == XImode)
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (TImode,
							    offset + 32));

	  /* Make "m" use the LD1 offset range for SVE data modes, so
	     that pre-RTL optimizers like ivopts will work to that
	     instead of the wider LDR/STR range.  */
	  if (vec_flags == VEC_SVE_DATA)
	    return (type == ADDR_QUERY_M
		    ? offset_4bit_signed_scaled_p (mode, offset)
		    : offset_9bit_signed_scaled_p (mode, offset));

	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
	    {
	      /* For SVE structure modes, both the first and the last
		 vector of the access must be reachable from the same
		 base, so also check the offset of the final vector.  */
	      poly_int64 end_offset = (offset
				       + GET_MODE_SIZE (mode)
				       - BYTES_PER_SVE_VECTOR);
	      return (type == ADDR_QUERY_M
		      ? offset_4bit_signed_scaled_p (mode, offset)
		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
							 end_offset)));
	    }

	  if (vec_flags == VEC_SVE_PRED)
	    return offset_9bit_signed_scaled_p (mode, offset);

	  if (load_store_pair_p)
	    return ((known_eq (GET_MODE_SIZE (mode), 4)
		     || known_eq (GET_MODE_SIZE (mode), 8))
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return (offset_9bit_signed_unscaled_p (mode, offset)
		    || offset_12bit_unsigned_scaled_p (mode, offset));
	}

      if (allow_reg_index_p)
	{
	  /* Look for base + (scaled/extended) index register.
	     Canonicalization may put the base in either operand, so
	     try both orders.  */
	  if (aarch64_base_register_rtx_p (op0, strict_p)
	      && aarch64_classify_index (info, op1, mode, strict_p))
	    {
	      info->base = op0;
	      return true;
	    }
	  if (aarch64_base_register_rtx_p (op1, strict_p)
	      && aarch64_classify_index (info, op0, mode, strict_p))
	    {
	      info->base = op1;
	      return true;
	    }
	}

      return false;

    case POST_INC:
    case POST_DEC:
    case PRE_INC:
    case PRE_DEC:
      /* Simple writeback forms: the base register is updated by an
	 implicit amount, so there is no explicit offset rtx.  */
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      info->offset = NULL_RTX;
      return aarch64_base_register_rtx_p (info->base, strict_p);

    case POST_MODIFY:
    case PRE_MODIFY:
      /* Writeback with an explicit update expression; only
	 (base + constant) updates of the same base are valid.  */
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      if (GET_CODE (XEXP (x, 1)) == PLUS
	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  info->offset = XEXP (XEXP (x, 1), 1);
	  info->const_offset = offset;

	  /* TImode and TFmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X:   7-bit signed scaled offset
	     Q:     9-bit signed offset
	     We conservatively require an offset representable in either mode.
	     */
	  if (mode == TImode || mode == TFmode)
	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
		    && offset_9bit_signed_unscaled_p (mode, offset));

	  if (load_store_pair_p)
	    return ((known_eq (GET_MODE_SIZE (mode), 4)
		     || known_eq (GET_MODE_SIZE (mode), 8))
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return offset_9bit_signed_unscaled_p (mode, offset);
	}
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      /* load literal: pc-relative constant pool entry.  Only supported
	 for SI mode or larger.  */
      info->type = ADDRESS_SYMBOLIC;

      if (!load_store_pair_p
	  && GET_MODE_SIZE (mode).is_constant (&const_size)
	  && const_size >= 4)
	{
	  rtx sym, addend;

	  split_const (x, &sym, &addend);
	  return ((GET_CODE (sym) == LABEL_REF
		   || (GET_CODE (sym) == SYMBOL_REF
		       && CONSTANT_POOL_ADDRESS_P (sym)
		       && aarch64_pcrelative_literal_loads)));
	}
      return false;

    case LO_SUM:
      /* (lo_sum base sym): base register plus the low part of a
	 symbolic address.  Only small absolute symbols qualify, and
	 the access must be naturally aligned.  */
      info->type = ADDRESS_LO_SUM;
      info->base = XEXP (x, 0);
      info->offset = XEXP (x, 1);
      if (allow_reg_index_p
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  rtx sym, offs;
	  split_const (info->offset, &sym, &offs);
	  if (GET_CODE (sym) == SYMBOL_REF
	      && (aarch64_classify_symbol (sym, INTVAL (offs))
		  == SYMBOL_SMALL_ABSOLUTE))
	    {
	      /* The symbol and offset must be aligned to the access size.  */
	      unsigned int align;

	      if (CONSTANT_POOL_ADDRESS_P (sym))
		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
		{
		  tree exp = SYMBOL_REF_DECL (sym);
		  align = TYPE_ALIGN (TREE_TYPE (exp));
		  align = aarch64_constant_alignment (exp, align);
		}
	      else if (SYMBOL_REF_DECL (sym))
		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
		       && SYMBOL_REF_BLOCK (sym) != NULL)
		align = SYMBOL_REF_BLOCK (sym)->alignment;
	      else
		align = BITS_PER_UNIT;

	      /* Treat a zero-sized reference as if it were DImode for
		 the purposes of the alignment check.  */
	      poly_int64 ref_size = GET_MODE_SIZE (mode);
	      if (known_eq (ref_size, 0))
		ref_size = GET_MODE_SIZE (DImode);

	      return (multiple_p (INTVAL (offs), ref_size)
		      && multiple_p (align / BITS_PER_UNIT, ref_size));
	    }
	}
      return false;

    default:
      return false;
    }
}
5947
5948 /* Return true if the address X is valid for a PRFM instruction.
5949 STRICT_P is true if we should do strict checking with
5950 aarch64_classify_address. */
5951
5952 bool
aarch64_address_valid_for_prefetch_p(rtx x,bool strict_p)5953 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5954 {
5955 struct aarch64_address_info addr;
5956
5957 /* PRFM accepts the same addresses as DImode... */
5958 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5959 if (!res)
5960 return false;
5961
5962 /* ... except writeback forms. */
5963 return addr.type != ADDRESS_REG_WB;
5964 }
5965
5966 bool
aarch64_symbolic_address_p(rtx x)5967 aarch64_symbolic_address_p (rtx x)
5968 {
5969 rtx offset;
5970
5971 split_const (x, &x, &offset);
5972 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5973 }
5974
5975 /* Classify the base of symbolic expression X. */
5976
5977 enum aarch64_symbol_type
aarch64_classify_symbolic_expression(rtx x)5978 aarch64_classify_symbolic_expression (rtx x)
5979 {
5980 rtx offset;
5981
5982 split_const (x, &x, &offset);
5983 return aarch64_classify_symbol (x, INTVAL (offset));
5984 }
5985
5986
5987 /* Return TRUE if X is a legitimate address for accessing memory in
5988 mode MODE. */
5989 static bool
aarch64_legitimate_address_hook_p(machine_mode mode,rtx x,bool strict_p)5990 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5991 {
5992 struct aarch64_address_info addr;
5993
5994 return aarch64_classify_address (&addr, x, mode, strict_p);
5995 }
5996
5997 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5998 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5999 bool
aarch64_legitimate_address_p(machine_mode mode,rtx x,bool strict_p,aarch64_addr_query_type type)6000 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6001 aarch64_addr_query_type type)
6002 {
6003 struct aarch64_address_info addr;
6004
6005 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6006 }
6007
/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  Try to split the
   displacement ORIG_OFFSET for mode MODE into an anchor part *OFFSET1
   and a small in-range remainder *OFFSET2, so that the sum of the two
   equals ORIG_OFFSET.  Return true on success, false if no useful
   split exists.  */

static bool
aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
					 poly_int64 orig_offset,
					 machine_mode mode)
{
  HOST_WIDE_INT size;
  if (GET_MODE_SIZE (mode).is_constant (&size))
    {
      /* Fixed-size (non-SVE-sized) access.  */
      HOST_WIDE_INT const_offset, second_offset;

      /* A general SVE offset is A * VQ + B.  Remove the A component from
	 coefficient 0 in order to get the constant B.  */
      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];

      /* Split an out-of-range address displacement into a base and
	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
	 range otherwise to increase opportunities for sharing the base
	 address of different sizes.  Unaligned accesses use the signed
	 9-bit range, TImode/TFmode use the intersection of signed
	 scaled 7-bit and signed 9-bit offset.  */
      if (mode == TImode || mode == TFmode)
	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
      else if ((const_offset & (size - 1)) != 0)
	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
      else
	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

      /* No split is useful if the remainder is zero or if nothing
	 would be moved into the anchor.  */
      if (second_offset == 0 || known_eq (orig_offset, second_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
  else
    {
      /* Get the mode we should use as the basis of the range.  For structure
	 modes this is the mode of one vector.  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      machine_mode step_mode
	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;

      /* Get the "mul vl" multiplier we'd like to use.  */
      HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
      HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
      if (vec_flags & VEC_SVE_DATA)
	/* LDR supports a 9-bit range, but the move patterns for
	   structure modes require all vectors to be in range of the
	   same base.  The simplest way of accomodating that while still
	   promoting reuse of anchor points between different modes is
	   to use an 8-bit range unconditionally.  */
	vnum = ((vnum + 128) & 255) - 128;
      else
	/* Predicates are only handled singly, so we might as well use
	   the full range.  */
	vnum = ((vnum + 256) & 511) - 256;
      if (vnum == 0)
	return false;

      /* Convert the "mul vl" multiplier into a byte offset.  */
      poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
      if (known_eq (second_offset, orig_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
}
6081
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */
bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{

  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (GET_CODE (value) != CONST_DOUBLE
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  /* real_to_target fills RES with the target image as a sequence of
     32-bit words; a double therefore occupies res[0] and res[1].  */
  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode)
    {
      /* Reassemble the two 32-bit halves in the order determined by
	 the target's endianness.  */
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    /* SFmode/HFmode images fit entirely in the first 32-bit word.  */
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
6123
6124 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6125 single MOV(+MOVK) followed by an FMOV. */
6126 bool
aarch64_float_const_rtx_p(rtx x)6127 aarch64_float_const_rtx_p (rtx x)
6128 {
6129 machine_mode mode = GET_MODE (x);
6130 if (mode == VOIDmode)
6131 return false;
6132
6133 /* Determine whether it's cheaper to write float constants as
6134 mov/movk pairs over ldr/adrp pairs. */
6135 unsigned HOST_WIDE_INT ival;
6136
6137 if (GET_CODE (x) == CONST_DOUBLE
6138 && SCALAR_FLOAT_MODE_P (mode)
6139 && aarch64_reinterpret_float_as_int (x, &ival))
6140 {
6141 scalar_int_mode imode = (mode == HFmode
6142 ? SImode
6143 : int_mode_for_mode (mode).require ());
6144 int num_instr = aarch64_internal_mov_immediate
6145 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6146 return num_instr < 3;
6147 }
6148
6149 return false;
6150 }
6151
6152 /* Return TRUE if rtx X is immediate constant 0.0 */
6153 bool
aarch64_float_const_zero_rtx_p(rtx x)6154 aarch64_float_const_zero_rtx_p (rtx x)
6155 {
6156 if (GET_MODE (x) == VOIDmode)
6157 return false;
6158
6159 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6160 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6161 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6162 }
6163
6164 /* Return TRUE if rtx X is immediate constant that fits in a single
6165 MOVI immediate operation. */
6166 bool
aarch64_can_const_movi_rtx_p(rtx x,machine_mode mode)6167 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6168 {
6169 if (!TARGET_SIMD)
6170 return false;
6171
6172 machine_mode vmode;
6173 scalar_int_mode imode;
6174 unsigned HOST_WIDE_INT ival;
6175
6176 if (GET_CODE (x) == CONST_DOUBLE
6177 && SCALAR_FLOAT_MODE_P (mode))
6178 {
6179 if (!aarch64_reinterpret_float_as_int (x, &ival))
6180 return false;
6181
6182 /* We make a general exception for 0. */
6183 if (aarch64_float_const_zero_rtx_p (x))
6184 return true;
6185
6186 imode = int_mode_for_mode (mode).require ();
6187 }
6188 else if (GET_CODE (x) == CONST_INT
6189 && is_a <scalar_int_mode> (mode, &imode))
6190 ival = INTVAL (x);
6191 else
6192 return false;
6193
6194 /* use a 64 bit mode for everything except for DI/DF mode, where we use
6195 a 128 bit vector mode. */
6196 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6197
6198 vmode = aarch64_simd_container_mode (imode, width);
6199 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6200
6201 return aarch64_simd_valid_immediate (v_op, NULL);
6202 }
6203
6204
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  /* AArch64 has a single flags register; report no second CC register.  */
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
6214
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   SIBCALL indicates whether this function call is normal call or sibling call.
   It will generate different pattern accordingly.  */

void
aarch64_expand_call (rtx result, rtx mem, bool sibcall)
{
  rtx call, callee, tmp;
  rtvec vec;
  machine_mode mode;

  gcc_assert (MEM_P (mem));
  callee = XEXP (mem, 0);
  mode = GET_MODE (callee);
  gcc_assert (mode == Pmode);

  /* Decide if we should generate indirect calls by loading the
     address of the callee into a register before performing
     the branch-and-link.  */
  if (SYMBOL_REF_P (callee)
      ? (aarch64_is_long_call_p (callee)
	 || aarch64_is_noplt_call_p (callee))
      : !REG_P (callee))
    XEXP (mem, 0) = force_reg (mode, callee);

  call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);

  /* If a value is returned, wrap the CALL in a SET of RESULT.  */
  if (result != NULL_RTX)
    call = gen_rtx_SET (result, call);

  /* Sibling calls return directly to the caller's caller; a normal
     call clobbers the link register instead.  */
  if (sibcall)
    tmp = ret_rtx;
  else
    tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));

  /* Emit the call and its return/clobber side as one PARALLEL.  */
  vec = gen_rtvec (2, call, tmp);
  call = gen_rtx_PARALLEL (VOIDmode, vec);

  aarch64_emit_call_insn (call);
}
6258
/* Emit call insn with PAT and do aarch64-specific handling.  */

void
aarch64_emit_call_insn (rtx pat)
{
  rtx insn = emit_call_insn (pat);

  /* Record IP0 and IP1 (x16/x17) as clobbered by the call: the AArch64
     ABI reserves them as intra-procedure-call scratch registers, so
     e.g. linker-inserted veneers may overwrite them.  */
  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
}
6270
/* Implement SELECT_CC_MODE: return the flags mode to use for a
   comparison of CODE applied to operands X and Y.  */

machine_mode
aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
{
  /* All floating point compares return CCFP if it is an equality
     comparison, and CCFPE otherwise.  */
  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
    {
      switch (code)
	{
	case EQ:
	case NE:
	case UNORDERED:
	case ORDERED:
	case UNLT:
	case UNLE:
	case UNGT:
	case UNGE:
	case UNEQ:
	  return CCFPmode;

	case LT:
	case LE:
	case GT:
	case GE:
	case LTGT:
	  return CCFPEmode;

	default:
	  gcc_unreachable ();
	}
    }

  /* Equality comparisons of short modes against zero can be performed
     using the TST instruction with the appropriate bitmask.  */
  if (y == const0_rtx && REG_P (x)
      && (code == EQ || code == NE)
      && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
    return CC_NZmode;

  /* Similarly, comparisons of zero_extends from shorter modes can
     be performed using an ANDS with an immediate mask.  */
  if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
      && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
      && (code == EQ || code == NE))
    return CC_NZmode;

  /* Arithmetic results compared against zero only need the N and Z
     flags, so the flag-setting form of the arithmetic insn suffices.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && y == const0_rtx
      && (code == EQ || code == NE || code == LT || code == GE)
      && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
	  || GET_CODE (x) == NEG
	  || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
	      && CONST_INT_P (XEXP (x, 2)))))
    return CC_NZmode;

  /* A compare with a shifted operand.  Because of canonicalization,
     the comparison will have to be swapped when we emit the assembly
     code.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
      && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
	  || GET_CODE (x) == LSHIFTRT
	  || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
    return CC_SWPmode;

  /* Similarly for a negated operand, but we can only do this for
     equalities.  */
  if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
      && (REG_P (y) || GET_CODE (y) == SUBREG)
      && (code == EQ || code == NE)
      && GET_CODE (x) == NEG)
    return CC_Zmode;

  /* A test for unsigned overflow.  */
  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
      && code == NE
      && GET_CODE (x) == PLUS
      && GET_CODE (y) == ZERO_EXTEND)
    return CC_Cmode;

  /* For everything else, return CCmode.  */
  return CCmode;
}
6355
6356 static int
6357 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6358
6359 int
aarch64_get_condition_code(rtx x)6360 aarch64_get_condition_code (rtx x)
6361 {
6362 machine_mode mode = GET_MODE (XEXP (x, 0));
6363 enum rtx_code comp_code = GET_CODE (x);
6364
6365 if (GET_MODE_CLASS (mode) != MODE_CC)
6366 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6367 return aarch64_get_condition_code_1 (mode, comp_code);
6368 }
6369
/* Return the AArch64 condition-code index encoding comparison
   COMP_CODE under flags mode MODE, or -1 if the combination cannot
   be represented.  */

static int
aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
{
  switch (mode)
    {
    case E_CCFPmode:
    case E_CCFPEmode:
      /* Floating-point flags: unordered results map onto the condition
	 codes that treat the U flag pattern appropriately.  */
      switch (comp_code)
	{
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LS;
	case LT: return AARCH64_MI;
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case ORDERED: return AARCH64_VC;
	case UNORDERED: return AARCH64_VS;
	case UNLT: return AARCH64_LT;
	case UNLE: return AARCH64_LE;
	case UNGT: return AARCH64_HI;
	case UNGE: return AARCH64_PL;
	default: return -1;
	}
      break;

    case E_CCmode:
      /* Full integer comparison: all signed and unsigned orderings.  */
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_GE;
	case GT: return AARCH64_GT;
	case LE: return AARCH64_LE;
	case LT: return AARCH64_LT;
	case GEU: return AARCH64_CS;
	case GTU: return AARCH64_HI;
	case LEU: return AARCH64_LS;
	case LTU: return AARCH64_CC;
	default: return -1;
	}
      break;

    case E_CC_SWPmode:
      /* The operands were swapped when the flags were set, so every
	 ordering condition is mirrored.  */
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_LE;
	case GT: return AARCH64_LT;
	case LE: return AARCH64_GE;
	case LT: return AARCH64_GT;
	case GEU: return AARCH64_LS;
	case GTU: return AARCH64_CC;
	case LEU: return AARCH64_CS;
	case LTU: return AARCH64_HI;
	default: return -1;
	}
      break;

    case E_CC_NZmode:
      /* Only the N and Z flags are valid.  */
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	case GE: return AARCH64_PL;
	case LT: return AARCH64_MI;
	default: return -1;
	}
      break;

    case E_CC_Zmode:
      /* Only the Z flag is valid.  */
      switch (comp_code)
	{
	case NE: return AARCH64_NE;
	case EQ: return AARCH64_EQ;
	default: return -1;
	}
      break;

    case E_CC_Cmode:
      /* Only the C flag (carry/unsigned overflow) is valid.  */
      switch (comp_code)
	{
	case NE: return AARCH64_CS;
	case EQ: return AARCH64_CC;
	default: return -1;
	}
      break;

    default:
      return -1;
    }

  return -1;
}
6464
6465 bool
aarch64_const_vec_all_same_in_range_p(rtx x,HOST_WIDE_INT minval,HOST_WIDE_INT maxval)6466 aarch64_const_vec_all_same_in_range_p (rtx x,
6467 HOST_WIDE_INT minval,
6468 HOST_WIDE_INT maxval)
6469 {
6470 rtx elt;
6471 return (const_vec_duplicate_p (x, &elt)
6472 && CONST_INT_P (elt)
6473 && IN_RANGE (INTVAL (elt), minval, maxval));
6474 }
6475
/* Return true if X is a duplicated CONST_INT vector whose (shared)
   element value equals VAL.  */
bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  /* A single-value range degenerates to an equality test.  */
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
6481
6482 /* Return true if VEC is a constant in which every element is in the range
6483 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6484
6485 static bool
aarch64_const_vec_all_in_range_p(rtx vec,HOST_WIDE_INT minval,HOST_WIDE_INT maxval)6486 aarch64_const_vec_all_in_range_p (rtx vec,
6487 HOST_WIDE_INT minval,
6488 HOST_WIDE_INT maxval)
6489 {
6490 if (GET_CODE (vec) != CONST_VECTOR
6491 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6492 return false;
6493
6494 int nunits;
6495 if (!CONST_VECTOR_STEPPED_P (vec))
6496 nunits = const_vector_encoded_nelts (vec);
6497 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6498 return false;
6499
6500 for (int i = 0; i < nunits; i++)
6501 {
6502 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6503 if (!CONST_INT_P (vec_elem)
6504 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6505 return false;
6506 }
6507 return true;
6508 }
6509
/* Bit positions of the N Z C V flags within an NZCV immediate.  */
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE; the per-entry
   comment records the flag test each condition performs.  */
static const int aarch64_nzcv_codes[] =
{
  0,		/* EQ, Z == 1.  */
  AARCH64_CC_Z,	/* NE, Z == 0.  */
  0,		/* CS, C == 1.  */
  AARCH64_CC_C,	/* CC, C == 0.  */
  0,		/* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,		/* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,		/* HI, C ==1 && Z == 0.  */
  AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V,	/* GE, N == V.  */
  0,		/* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,		/* LE, !(Z == 0 && N == V).  */
  0,		/* AL, Any.  */
  0		/* NV, Any.  */
};
6536
6537 /* Print floating-point vector immediate operand X to F, negating it
6538 first if NEGATE is true. Return true on success, false if it isn't
6539 a constant we can handle. */
6540
6541 static bool
aarch64_print_vector_float_operand(FILE * f,rtx x,bool negate)6542 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6543 {
6544 rtx elt;
6545
6546 if (!const_vec_duplicate_p (x, &elt))
6547 return false;
6548
6549 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6550 if (negate)
6551 r = real_value_negate (&r);
6552
6553 /* We only handle the SVE single-bit immediates here. */
6554 if (real_equal (&r, &dconst0))
6555 asm_fprintf (f, "0.0");
6556 else if (real_equal (&r, &dconst1))
6557 asm_fprintf (f, "1.0");
6558 else if (real_equal (&r, &dconsthalf))
6559 asm_fprintf (f, "0.5");
6560 else
6561 return false;
6562
6563 return true;
6564 }
6565
6566 /* Return the equivalent letter for size. */
6567 static char
sizetochar(int size)6568 sizetochar (int size)
6569 {
6570 switch (size)
6571 {
6572 case 64: return 'd';
6573 case 32: return 's';
6574 case 16: return 'h';
6575 case 8 : return 'b';
6576 default: gcc_unreachable ();
6577 }
6578 }
6579
6580 /* Print operand X to file F in a target specific manner according to CODE.
6581 The acceptable formatting commands given by CODE are:
6582 'c': An integer or symbol address without a preceding #
6583 sign.
6584 'C': Take the duplicated element in a vector constant
6585 and print it in hex.
6586 'D': Take the duplicated element in a vector constant
6587 and print it as an unsigned integer, in decimal.
6588 'e': Print the sign/zero-extend size as a character 8->b,
6589 16->h, 32->w.
6590 'p': Prints N such that 2^N == X (X must be power of 2 and
6591 const int).
6592 'P': Print the number of non-zero bits in X (a const_int).
6593 'H': Print the higher numbered register of a pair (TImode)
6594 of regs.
6595 'm': Print a condition (eq, ne, etc).
6596 'M': Same as 'm', but invert condition.
6597 'N': Take the duplicated element in a vector constant
6598 and print the negative of it in decimal.
6599 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6600 'S/T/U/V': Print a FP/SIMD register name for a register list.
6601 The register printed is the FP/SIMD register name
6602 of X + 0/1/2/3 for S/T/U/V.
6603 'R': Print a scalar FP/SIMD register name + 1.
6604 'X': Print bottom 16 bits of integer constant in hex.
6605 'w/x': Print a general register name or the zero register
6606 (32-bit or 64-bit).
6607 '0': Print a normal operand, if it's a general register,
6608 then we assume DImode.
6609 'k': Print NZCV for conditional compare instructions.
6610 'A': Output address constant representing the first
6611 argument of X, specifying a relocation offset
6612 if appropriate.
6613 'L': Output constant address specified by X
6614 with a relocation offset if appropriate.
6615 'G': Prints address of X, specifying a PC relative
6616 relocation mode if appropriate.
6617 'y': Output address of LDP or STP - this is used for
6618 some LDP/STPs which don't use a PARALLEL in their
6619 pattern (so the mode needs to be adjusted).
6620 'z': Output address of a typical LDP or STP. */
6621
static void
aarch64_print_operand (FILE *f, rtx x, int code)
{
  rtx elt;
  switch (code)
    {
    case 'c':
      /* Print an integer or symbolic address without any # prefix.  */
      switch (GET_CODE (x))
	{
	case CONST_INT:
	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
	  break;

	case SYMBOL_REF:
	  output_addr_const (f, x);
	  break;

	case CONST:
	  /* Accept only (const (plus (symbol_ref ...) ...)) forms.  */
	  if (GET_CODE (XEXP (x, 0)) == PLUS
	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
	    {
	      output_addr_const (f, x);
	      break;
	    }
	  /* Fall through.  */

	default:
	  output_operand_lossage ("unsupported operand for code '%c'", code);
	}
      break;

    case 'e':
      /* Print the sign/zero-extend size suffix (b/h/w) encoded in X.
	 The low three bits are masked off before taking the log, so
	 only 8/16/32 (log2 = 3/4/5) are accepted.  */
      {
	int n;

	if (!CONST_INT_P (x)
	    || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	switch (n)
	  {
	  case 3:
	    fputc ('b', f);
	    break;
	  case 4:
	    fputc ('h', f);
	    break;
	  case 5:
	    fputc ('w', f);
	    break;
	  default:
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
      }
      break;

    case 'p':
      /* Print N such that 2^N == X; X must be a power-of-two const_int.  */
      {
	int n;

	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	asm_fprintf (f, "%d", n);
      }
      break;

    case 'P':
      /* Print the population count (number of set bits) of const_int X.  */
      if (!CONST_INT_P (x))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}

      asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
      break;

    case 'H':
      /* Print the higher-numbered register of a pair (TImode) of regs.  */
      if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}

      asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
      break;

    case 'M':
    case 'm':
      /* Print a condition mnemonic ('m'), or its inverse ('M').  */
      {
	int cond_code;
	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
	if (x == const_true_rtx)
	  {
	    if (code == 'M')
	      fputs ("nv", f);
	    return;
	  }

	if (!COMPARISON_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	cond_code = aarch64_get_condition_code (x);
	gcc_assert (cond_code >= 0);
	if (code == 'M')
	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
	fputs (aarch64_condition_codes[cond_code], f);
      }
      break;

    case 'N':
      /* Print the negated duplicated element of a vector constant.  */
      if (!const_vec_duplicate_p (x, &elt))
	{
	  output_operand_lossage ("invalid vector constant");
	  return;
	}

      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
	asm_fprintf (f, "%wd", -INTVAL (elt));
      else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
	       && aarch64_print_vector_float_operand (f, x, true))
	;
      else
	{
	  output_operand_lossage ("invalid vector constant");
	  return;
	}
      break;

    case 'b':
    case 'h':
    case 's':
    case 'd':
    case 'q':
      /* Print a scalar FP/SIMD register name with the requested
	 width prefix (b/h/s/d/q).  */
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
      break;

    case 'S':
    case 'T':
    case 'U':
    case 'V':
      /* Print a register-list element: the FP/SIMD register X + 0/1/2/3
	 for S/T/U/V, using the 'z' prefix for SVE data modes.  */
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "%c%d",
		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
		   REGNO (x) - V0_REGNUM + (code - 'S'));
      break;

    case 'R':
      /* Print the FP/SIMD register one above X, as a Q register.  */
      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
	{
	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
      break;

    case 'X':
      /* Print the bottom 16 bits of an integer constant in hex.  */
      if (!CONST_INT_P (x))
	{
	  output_operand_lossage ("invalid operand for '%%%c'", code);
	  return;
	}
      asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
      break;

    case 'C':
      {
	/* Print a replicated constant in hex.  */
	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	/* Mask to the element width so only meaningful bits print.  */
	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
      }
      break;

    case 'D':
      {
	/* Print a replicated constant in decimal, treating it as
	   unsigned.  (The value is masked to the element width before
	   printing with %wd.)  */
	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }
	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
      }
      break;

    case 'w':
    case 'x':
      /* Print a general register name (w- or x-form) or the zero
	 register for integer or FP zero constants.  */
      if (x == const0_rtx
	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
	{
	  asm_fprintf (f, "%czr", code);
	  break;
	}

      if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
	{
	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
	  break;
	}

      if (REG_P (x) && REGNO (x) == SP_REGNUM)
	{
	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
	  break;
	}

      /* Fall through */

    case 0:
      /* Print a normal operand, based on its rtx code.  */
      if (x == NULL)
	{
	  output_operand_lossage ("missing operand");
	  return;
	}

      switch (GET_CODE (x))
	{
	case REG:
	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
	    {
	      if (REG_NREGS (x) == 1)
		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
	      else
		{
		  /* Multi-register SVE value: print a {zN.s - zM.s}
		     style register range.  */
		  char suffix
		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
			       REGNO (x) - V0_REGNUM, suffix,
			       END_REGNO (x) - V0_REGNUM - 1, suffix);
		}
	    }
	  else
	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
	  break;

	case MEM:
	  output_address (GET_MODE (x), XEXP (x, 0));
	  break;

	case LABEL_REF:
	case SYMBOL_REF:
	  output_addr_const (asm_out_file, x);
	  break;

	case CONST_INT:
	  asm_fprintf (f, "%wd", INTVAL (x));
	  break;

	case CONST:
	  if (!VECTOR_MODE_P (GET_MODE (x)))
	    {
	      output_addr_const (asm_out_file, x);
	      break;
	    }
	  /* fall through */

	case CONST_VECTOR:
	  if (!const_vec_duplicate_p (x, &elt))
	    {
	      output_operand_lossage ("invalid vector constant");
	      return;
	    }

	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
	    asm_fprintf (f, "%wd", INTVAL (elt));
	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
		   && aarch64_print_vector_float_operand (f, x, false))
	    ;
	  else
	    {
	      output_operand_lossage ("invalid vector constant");
	      return;
	    }
	  break;

	case CONST_DOUBLE:
	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
	     be getting CONST_DOUBLEs holding integers.  */
	  gcc_assert (GET_MODE (x) != VOIDmode);
	  if (aarch64_float_const_zero_rtx_p (x))
	    {
	      fputc ('0', f);
	      break;
	    }
	  else if (aarch64_float_const_representable_p (x))
	    {
	      /* Immediate-encodable FP constant: print its exact
		 decimal form.  */
#define buf_size 20
	      char float_buf[buf_size] = {'\0'};
	      real_to_decimal_for_mode (float_buf,
					CONST_DOUBLE_REAL_VALUE (x),
					buf_size, buf_size,
					1, GET_MODE (x));
	      asm_fprintf (asm_out_file, "%s", float_buf);
	      break;
#undef buf_size
	    }
	  output_operand_lossage ("invalid constant");
	  return;
	default:
	  output_operand_lossage ("invalid operand");
	  return;
	}
      break;

    case 'A':
      /* Print the relocation prefix (e.g. :got:) for the first part of
	 an address, then the address itself.  */
      if (GET_CODE (x) == HIGH)
	x = XEXP (x, 0);

      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_SMALL_GOT_4G:
	  asm_fprintf (asm_out_file, ":got:");
	  break;

	case SYMBOL_SMALL_TLSGD:
	  asm_fprintf (asm_out_file, ":tlsgd:");
	  break;

	case SYMBOL_SMALL_TLSDESC:
	  asm_fprintf (asm_out_file, ":tlsdesc:");
	  break;

	case SYMBOL_SMALL_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel:");
	  break;

	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel:");
	  break;

	case SYMBOL_TINY_GOT:
	  /* Tiny-model GOT addresses never take the 'A' path.  */
	  gcc_unreachable ();
	  break;

	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'L':
      /* Print the low-12-bits relocation prefix (e.g. :lo12:) for X,
	 then the address itself.  */
      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_SMALL_GOT_4G:
	  asm_fprintf (asm_out_file, ":lo12:");
	  break;

	case SYMBOL_SMALL_TLSGD:
	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
	  break;

	case SYMBOL_SMALL_TLSDESC:
	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
	  break;

	case SYMBOL_SMALL_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
	  break;

	case SYMBOL_TLSLE12:
	  asm_fprintf (asm_out_file, ":tprel_lo12:");
	  break;

	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
	  break;

	case SYMBOL_TINY_GOT:
	  asm_fprintf (asm_out_file, ":got:");
	  break;

	case SYMBOL_TINY_TLSIE:
	  asm_fprintf (asm_out_file, ":gottprel:");
	  break;

	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'G':
      /* Print the high-12-bits TLS relocation prefix if needed.  */
      switch (aarch64_classify_symbolic_expression (x))
	{
	case SYMBOL_TLSLE24:
	  asm_fprintf (asm_out_file, ":tprel_hi12:");
	  break;
	default:
	  break;
	}
      output_addr_const (asm_out_file, x);
      break;

    case 'k':
      /* Print the NZCV immediate for a conditional-compare instruction.  */
      {
	HOST_WIDE_INT cond_code;

	if (!CONST_INT_P (x))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	cond_code = INTVAL (x);
	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
      }
      break;

    case 'y':
    case 'z':
      /* Print the address of an LDP/STP operand.  'y' takes a single
	 double-width MEM (16 bytes only) and halves the mode so it
	 looks like a normal LDP/STP; 'z' takes a typical LDP/STP MEM.  */
      {
	machine_mode mode = GET_MODE (x);

	if (GET_CODE (x) != MEM
	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
	  {
	    output_operand_lossage ("invalid operand for '%%%c'", code);
	    return;
	  }

	if (code == 'y')
	  /* LDP/STP which uses a single double-width memory operand.
	     Adjust the mode to appear like a typical LDP/STP.
	     Currently this is supported for 16-byte accesses only.  */
	  mode = DFmode;

	if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
      }
      break;

    default:
      output_operand_lossage ("invalid operand prefix '%%%c'", code);
      return;
    }
}
7085
/* Print address X of a memory access with mode MODE to file F.
   TYPE gives the context required by aarch64_classify_address: a normal
   memory access or an LDP/STP operand.  Return true if the address was
   printed, false if it could not be classified (the caller may then
   fall back to output_addr_const).  */
static bool
aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
				aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;
  unsigned int size;

  /* Check all addresses are Pmode - including ILP32.  */
  if (GET_MODE (x) != Pmode
      && (!CONST_INT_P (x)
	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
    {
      output_operand_lossage ("invalid address mode");
      return false;
    }

  if (aarch64_classify_address (&addr, x, mode, true, type))
    switch (addr.type)
      {
      case ADDRESS_REG_IMM:
	/* [base] or [base, #offset]; SVE modes print the offset as a
	   multiple of the vector/predicate length ("mul vl").  */
	if (known_eq (addr.const_offset, 0))
	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
	else if (aarch64_sve_data_mode_p (mode))
	  {
	    HOST_WIDE_INT vnum
	      = exact_div (addr.const_offset,
			   BYTES_PER_SVE_VECTOR).to_constant ();
	    asm_fprintf (f, "[%s, #%wd, mul vl]",
			 reg_names[REGNO (addr.base)], vnum);
	  }
	else if (aarch64_sve_pred_mode_p (mode))
	  {
	    HOST_WIDE_INT vnum
	      = exact_div (addr.const_offset,
			   BYTES_PER_SVE_PRED).to_constant ();
	    asm_fprintf (f, "[%s, #%wd, mul vl]",
			 reg_names[REGNO (addr.base)], vnum);
	  }
	else
	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
		       INTVAL (addr.offset));
	return true;

      case ADDRESS_REG_REG:
	/* [base, index{, lsl #shift}].  */
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
		       reg_names [REGNO (addr.offset)]);
	else
	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
		       reg_names [REGNO (addr.offset)], addr.shift);
	return true;

      case ADDRESS_REG_UXTW:
	/* [base, wN, uxtw {#shift}] - zero-extended 32-bit index.  */
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM);
	else
	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
	return true;

      case ADDRESS_REG_SXTW:
	/* [base, wN, sxtw {#shift}] - sign-extended 32-bit index.  */
	if (addr.shift == 0)
	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM);
	else
	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
	return true;

      case ADDRESS_REG_WB:
	/* Writeback is only supported for fixed-width modes.  */
	size = GET_MODE_SIZE (mode).to_constant ();
	switch (GET_CODE (x))
	  {
	  case PRE_INC:
	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
	    return true;
	  case POST_INC:
	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
	    return true;
	  case PRE_DEC:
	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
	    return true;
	  case POST_DEC:
	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
	    return true;
	  case PRE_MODIFY:
	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
			 INTVAL (addr.offset));
	    return true;
	  case POST_MODIFY:
	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
			 INTVAL (addr.offset));
	    return true;
	  default:
	    break;
	  }
	break;

      case ADDRESS_LO_SUM:
	/* [base, #:lo12:symbol] - low 12 bits of a symbolic address.  */
	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
	output_addr_const (f, addr.offset);
	asm_fprintf (f, "]");
	return true;

      case ADDRESS_SYMBOLIC:
	output_addr_const (f, x);
	return true;
      }

  return false;
}
7202
/* Print address X of an LDP/STP with mode MODE to file F.
   Thin wrapper around aarch64_print_address_internal that selects the
   LDP/STP address-classification context.  Returns true on success.  */
static bool
aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
{
  return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
}
7209
/* Print address X of a memory access with mode MODE to file F.
   Implements TARGET_PRINT_OPERAND_ADDRESS.  If the address cannot be
   classified, fall back to printing it as a bare constant address.  */
static void
aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
{
  if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
    output_addr_const (f, x);
}
7217
7218 bool
aarch64_label_mentioned_p(rtx x)7219 aarch64_label_mentioned_p (rtx x)
7220 {
7221 const char *fmt;
7222 int i;
7223
7224 if (GET_CODE (x) == LABEL_REF)
7225 return true;
7226
7227 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7228 referencing instruction, but they are constant offsets, not
7229 symbols. */
7230 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7231 return false;
7232
7233 fmt = GET_RTX_FORMAT (GET_CODE (x));
7234 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7235 {
7236 if (fmt[i] == 'E')
7237 {
7238 int j;
7239
7240 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7241 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7242 return 1;
7243 }
7244 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7245 return 1;
7246 }
7247
7248 return 0;
7249 }
7250
7251 /* Implement REGNO_REG_CLASS. */
7252
7253 enum reg_class
aarch64_regno_regclass(unsigned regno)7254 aarch64_regno_regclass (unsigned regno)
7255 {
7256 if (GP_REGNUM_P (regno))
7257 return GENERAL_REGS;
7258
7259 if (regno == SP_REGNUM)
7260 return STACK_REG;
7261
7262 if (regno == FRAME_POINTER_REGNUM
7263 || regno == ARG_POINTER_REGNUM)
7264 return POINTER_REGS;
7265
7266 if (FP_REGNUM_P (regno))
7267 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7268
7269 if (PR_REGNUM_P (regno))
7270 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7271
7272 return NO_REGS;
7273 }
7274
7275 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7276 If OFFSET is out of range, return an offset of an anchor point
7277 that is in range. Return 0 otherwise. */
7278
7279 static HOST_WIDE_INT
aarch64_anchor_offset(HOST_WIDE_INT offset,HOST_WIDE_INT size,machine_mode mode)7280 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7281 machine_mode mode)
7282 {
7283 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7284 if (size > 16)
7285 return (offset + 0x400) & ~0x7f0;
7286
7287 /* For offsets that aren't a multiple of the access size, the limit is
7288 -256...255. */
7289 if (offset & (size - 1))
7290 {
7291 /* BLKmode typically uses LDP of X-registers. */
7292 if (mode == BLKmode)
7293 return (offset + 512) & ~0x3ff;
7294 return (offset + 0x100) & ~0x1ff;
7295 }
7296
7297 /* Small negative offsets are supported. */
7298 if (IN_RANGE (offset, -256, 0))
7299 return 0;
7300
7301 if (mode == TImode || mode == TFmode)
7302 return (offset + 0x100) & ~0x1ff;
7303
7304 /* Use 12-bit offset by access size. */
7305 return offset & (~0xfff * size);
7306 }
7307
/* Implement TARGET_LEGITIMIZE_ADDRESS.  Rewrite X, a candidate address
   for an access of mode MODE, into a (usually cheaper) legitimate form.
   Returns the possibly-rewritten address.  */
static rtx
aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
{
  /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
     where mask is selected by alignment and size of the offset.
     We try to pick as large a range for the offset as possible to
     maximize the chance of a CSE.  However, for aligned addresses
     we limit the range to 4k so that structures with different sized
     elements are likely to use the same base.  We need to be careful
     not to split a CONST for some forms of address expression, otherwise
     it will generate sub-optimal code.  */

  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
    {
      rtx base = XEXP (x, 0);
      rtx offset_rtx = XEXP (x, 1);
      HOST_WIDE_INT offset = INTVAL (offset_rtx);

      if (GET_CODE (base) == PLUS)
	{
	  rtx op0 = XEXP (base, 0);
	  rtx op1 = XEXP (base, 1);

	  /* Force any scaling into a temp for CSE.  */
	  op0 = force_reg (Pmode, op0);
	  op1 = force_reg (Pmode, op1);

	  /* Let the pointer register be in op0.  */
	  if (REG_POINTER (op1))
	    std::swap (op0, op1);

	  /* If the pointer is virtual or frame related, then we know that
	     virtual register instantiation or register elimination is going
	     to apply a second constant.  We want the two constants folded
	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
	  if (virt_or_elim_regno_p (REGNO (op0)))
	    {
	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
				   NULL_RTX, true, OPTAB_DIRECT);
	      return gen_rtx_PLUS (Pmode, base, op1);
	    }

	  /* Otherwise, in order to encourage CSE (and thence loop strength
	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
	  base = expand_binop (Pmode, add_optab, op0, op1,
			       NULL_RTX, true, OPTAB_DIRECT);
	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
	}

      /* Split the constant around an in-range anchor point, but only
	 for fixed-size modes (SVE sizes are not compile-time constants).  */
      HOST_WIDE_INT size;
      if (GET_MODE_SIZE (mode).is_constant (&size))
	{
	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
							     mode);
	  if (base_offset != 0)
	    {
	      base = plus_constant (Pmode, base, base_offset);
	      base = force_operand (base, NULL_RTX);
	      return plus_constant (Pmode, base, offset - base_offset);
	    }
	}
    }

  return x;
}
7373
/* Return the reload icode required for loading a constant pool entry
   of mode MODE into a register via a DImode scratch address register.
   Aborts for modes with no corresponding reload pattern.  */
static enum insn_code
aarch64_constant_pool_reload_icode (machine_mode mode)
{
  switch (mode)
    {
    case E_SFmode:
      return CODE_FOR_aarch64_reload_movcpsfdi;

    case E_DFmode:
      return CODE_FOR_aarch64_reload_movcpdfdi;

    case E_TFmode:
      return CODE_FOR_aarch64_reload_movcptfdi;

    case E_V8QImode:
      return CODE_FOR_aarch64_reload_movcpv8qidi;

    case E_V16QImode:
      return CODE_FOR_aarch64_reload_movcpv16qidi;

    case E_V4HImode:
      return CODE_FOR_aarch64_reload_movcpv4hidi;

    case E_V8HImode:
      return CODE_FOR_aarch64_reload_movcpv8hidi;

    case E_V2SImode:
      return CODE_FOR_aarch64_reload_movcpv2sidi;

    case E_V4SImode:
      return CODE_FOR_aarch64_reload_movcpv4sidi;

    case E_V2DImode:
      return CODE_FOR_aarch64_reload_movcpv2didi;

    case E_V2DFmode:
      return CODE_FOR_aarch64_reload_movcpv2dfdi;

    default:
      gcc_unreachable ();
    }

  /* Not reached; keeps compilers that don't know gcc_unreachable is
     noreturn from warning about a missing return value.  */
  gcc_unreachable ();
}
7419 static reg_class_t
aarch64_secondary_reload(bool in_p ATTRIBUTE_UNUSED,rtx x,reg_class_t rclass,machine_mode mode,secondary_reload_info * sri)7420 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7421 reg_class_t rclass,
7422 machine_mode mode,
7423 secondary_reload_info *sri)
7424 {
7425 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7426 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7427 comment at the head of aarch64-sve.md for more details about the
7428 big-endian handling. */
7429 if (BYTES_BIG_ENDIAN
7430 && reg_class_subset_p (rclass, FP_REGS)
7431 && !((REG_P (x) && HARD_REGISTER_P (x))
7432 || aarch64_simd_valid_immediate (x, NULL))
7433 && aarch64_sve_data_mode_p (mode))
7434 {
7435 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7436 return NO_REGS;
7437 }
7438
7439 /* If we have to disable direct literal pool loads and stores because the
7440 function is too big, then we need a scratch register. */
7441 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7442 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7443 || targetm.vector_mode_supported_p (GET_MODE (x)))
7444 && !aarch64_pcrelative_literal_loads)
7445 {
7446 sri->icode = aarch64_constant_pool_reload_icode (mode);
7447 return NO_REGS;
7448 }
7449
7450 /* Without the TARGET_SIMD instructions we cannot move a Q register
7451 to a Q register directly. We need a scratch. */
7452 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7453 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7454 && reg_class_subset_p (rclass, FP_REGS))
7455 {
7456 if (mode == TFmode)
7457 sri->icode = CODE_FOR_aarch64_reload_movtf;
7458 else if (mode == TImode)
7459 sri->icode = CODE_FOR_aarch64_reload_movti;
7460 return NO_REGS;
7461 }
7462
7463 /* A TFmode or TImode memory access should be handled via an FP_REGS
7464 because AArch64 has richer addressing modes for LDR/STR instructions
7465 than LDP/STP instructions. */
7466 if (TARGET_FLOAT && rclass == GENERAL_REGS
7467 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7468 return FP_REGS;
7469
7470 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7471 return GENERAL_REGS;
7472
7473 return NO_REGS;
7474 }
7475
7476 static bool
aarch64_can_eliminate(const int from ATTRIBUTE_UNUSED,const int to)7477 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7478 {
7479 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7480
7481 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7482 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7483 if (frame_pointer_needed)
7484 return to == HARD_FRAME_POINTER_REGNUM;
7485 return true;
7486 }
7487
/* Implement INITIAL_ELIMINATION_OFFSET.  Return the offset to add when
   replacing register FROM with register TO, based on the frame layout
   (which is computed here if it hasn't been already).  */
poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  /* Make sure cfun->machine->frame is up to date before reading it.  */
  aarch64_layout_frame ();

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset;

      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.hard_fp_offset
	       - cfun->machine->frame.locals_offset;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
	return cfun->machine->frame.frame_size
	       - cfun->machine->frame.locals_offset;
    }

  /* Remaining case: ARG_POINTER_REGNUM to STACK_POINTER_REGNUM.  */
  return cfun->machine->frame.frame_size;
}
7512
7513 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7514 previous frame. */
7515
7516 rtx
aarch64_return_addr(int count,rtx frame ATTRIBUTE_UNUSED)7517 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7518 {
7519 if (count != 0)
7520 return const0_rtx;
7521 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7522 }
7523
7524
/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  Emit the fixed code part
   of a trampoline: two PC-relative loads of the function address and
   static chain, followed by an indirect branch.  Layout: 16 bytes of
   code (two ldrs, br, padding), then the two pointer slots that
   aarch64_trampoline_init fills in.

   The PC-relative targets differ between ABIs because the pointer
   slots differ in size: under ILP32 the slots are 4 bytes, so the
   ldr at offset 0 targets .+16 (slot at 16) and the ldr at offset 4
   also uses .+16 (slot at 20); under LP64 the slots are 8 bytes, so
   the second ldr must use .+20 to reach the slot at 24.  */
static void
aarch64_asm_trampoline_template (FILE *f)
{
  if (TARGET_ILP32)
    {
      asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
      asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
    }
  else
    {
      asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
      asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
    }
  asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
  /* Pad the code to 16 bytes, then reserve the two pointer slots.  */
  assemble_aligned_integer (4, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
  assemble_aligned_integer (POINTER_BYTES, const0_rtx);
}
7543
/* Implement TARGET_TRAMPOLINE_INIT.  Copy the code part of the
   trampoline template into M_TRAMP, store FNDECL's address and
   CHAIN_VALUE into the trailing pointer slots, and flush the
   instruction cache over the trampoline.  */
static void
aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;
  /* Size of the code part emitted by aarch64_asm_trampoline_template.  */
  const int tramp_code_sz = 16;

  /* Don't need to copy the trailing D-words, we fill those in below.  */
  emit_block_move (m_tramp, assemble_trampoline_template (),
		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  if (GET_MODE (fnaddr) != ptr_mode)
    fnaddr = convert_memory_address (ptr_mode, fnaddr);
  emit_move_insn (mem, fnaddr);

  /* The static chain goes in the second pointer slot.  */
  mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
  emit_move_insn (mem, chain_value);

  /* XXX We should really define a "clear_cache" pattern and use
     gen_clear_cache().  */
  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
		     ptr_mode);
}
7570
/* Implement TARGET_CLASS_MAX_NREGS.  Return the maximum number of
   registers of class REGCLASS needed to hold a value of mode MODE.  */
static unsigned char
aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that at least one register in REGCLASS
     can hold MODE, but at the moment we need to handle all modes.
     Just ignore any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  unsigned int nregs;
  switch (regclass)
    {
    case TAILCALL_ADDR_REGS:
    case POINTER_REGS:
    case GENERAL_REGS:
    case ALL_REGS:
    case POINTER_AND_FP_REGS:
    case FP_REGS:
    case FP_LO_REGS:
      /* SVE data modes occupy an exact whole number of vector registers;
	 use that count when the size divides evenly.  */
      if (aarch64_sve_data_mode_p (mode)
	  && constant_multiple_p (GET_MODE_SIZE (mode),
				  BYTES_PER_SVE_VECTOR, &nregs))
	return nregs;
      /* Otherwise divide the (lower bound of the) size by the register
	 width: vector registers for Advanced SIMD modes, X registers
	 for everything else.  */
      return (aarch64_vector_data_mode_p (mode)
	      ? CEIL (lowest_size, UNITS_PER_VREG)
	      : CEIL (lowest_size, UNITS_PER_WORD));
    case STACK_REG:
    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return 1;

    case NO_REGS:
      return 0;

    default:
      break;
    }
  gcc_unreachable ();
}
7610
/* Implement TARGET_PREFERRED_RELOAD_CLASS.  Return the class to use
   when reloading X into a register of class REGCLASS, possibly
   narrowing or rejecting REGCLASS.  */
static reg_class_t
aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
{
  /* The fake pointer registers always reload into general registers.  */
  if (regclass == POINTER_REGS)
    return GENERAL_REGS;

  if (regclass == STACK_REG)
    {
      /* Only pointer-class registers can be moved into SP.  */
      if (REG_P(x)
	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
	  return regclass;

      return NO_REGS;
    }

  /* Register elimination can result in a request for
     SP+constant->FP_REGS.  We cannot support such operations which
     use SP as source and an FP_REG as destination, so reject them
     outright now.  */
  if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
    {
      rtx lhs = XEXP (x, 0);

      /* Look through a possible SUBREG introduced by ILP32.  */
      if (GET_CODE (lhs) == SUBREG)
	lhs = SUBREG_REG (lhs);

      gcc_assert (REG_P (lhs));
      gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
				      POINTER_REGS));
      return NO_REGS;
    }

  return regclass;
}
7646
/* Implement ASM_OUTPUT_LABELREF.  Print NAME to F with the
   user-label prefix (the %U asm_fprintf escape).  */
void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
7652
7653 static void
aarch64_elf_asm_constructor(rtx symbol,int priority)7654 aarch64_elf_asm_constructor (rtx symbol, int priority)
7655 {
7656 if (priority == DEFAULT_INIT_PRIORITY)
7657 default_ctor_section_asm_out_constructor (symbol, priority);
7658 else
7659 {
7660 section *s;
7661 /* While priority is known to be in range [0, 65535], so 18 bytes
7662 would be enough, the compiler might not know that. To avoid
7663 -Wformat-truncation false positive, use a larger size. */
7664 char buf[23];
7665 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7666 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7667 switch_to_section (s);
7668 assemble_align (POINTER_SIZE);
7669 assemble_aligned_integer (POINTER_BYTES, symbol);
7670 }
7671 }
7672
7673 static void
aarch64_elf_asm_destructor(rtx symbol,int priority)7674 aarch64_elf_asm_destructor (rtx symbol, int priority)
7675 {
7676 if (priority == DEFAULT_INIT_PRIORITY)
7677 default_dtor_section_asm_out_destructor (symbol, priority);
7678 else
7679 {
7680 section *s;
7681 /* While priority is known to be in range [0, 65535], so 18 bytes
7682 would be enough, the compiler might not know that. To avoid
7683 -Wformat-truncation false positive, use a larger size. */
7684 char buf[23];
7685 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7686 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7687 switch_to_section (s);
7688 assemble_align (POINTER_SIZE);
7689 assemble_aligned_integer (POINTER_BYTES, symbol);
7690 }
7691 }
7692
/* Output the assembly for a casesi (jump table) dispatch sequence.
   operands[0] is the table base, operands[1] the index, operands[2]
   the jump-table label, operands[3] a scratch register and
   operands[4] the register that receives the table's base address.
   The load width and extend used depend on the jump table's element
   mode.  Returns "" since everything is emitted directly.  */
const char*
aarch64_output_casesi (rtx *operands)
{
  char buf[100];
  char label[100];
  rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
  int index;
  /* Load + add patterns indexed by log2 of the table element size.  */
  static const char *const patterns[4][2] =
  {
    {
      "ldrb\t%w3, [%0,%w1,uxtw]",
      "add\t%3, %4, %w3, sxtb #2"
    },
    {
      "ldrh\t%w3, [%0,%w1,uxtw #1]",
      "add\t%3, %4, %w3, sxth #2"
    },
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    },
    /* We assume that DImode is only generated when not optimizing and
       that we don't really need 64-bit address offsets.  That would
       imply an object file with 8GB of code in a single function!  */
    {
      "ldr\t%w3, [%0,%w1,uxtw #2]",
      "add\t%3, %4, %w3, sxtw #2"
    }
  };

  gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);

  scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
  index = exact_log2 (GET_MODE_SIZE (mode));

  gcc_assert (index >= 0 && index <= 3);

  /* Need to implement table size reduction, by changing the code below.  */
  output_asm_insn (patterns[index][0], operands);
  /* Materialize the table's address PC-relatively into operand 4.  */
  ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
  snprintf (buf, sizeof (buf),
	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
  output_asm_insn (buf, operands);
  output_asm_insn (patterns[index][1], operands);
  output_asm_insn ("br\t%3", operands);
  assemble_label (asm_out_file, label);
  return "";
}
7741
7742
7743 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7744 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7745 operator. */
7746
7747 int
aarch64_uxt_size(int shift,HOST_WIDE_INT mask)7748 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7749 {
7750 if (shift >= 0 && shift <= 3)
7751 {
7752 int size;
7753 for (size = 8; size <= 32; size *= 2)
7754 {
7755 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7756 if (mask == bits << shift)
7757 return size;
7758 }
7759 }
7760 return 0;
7761 }
7762
7763 /* Constant pools are per function only when PC relative
7764 literal loads are true or we are in the large memory
7765 model. */
7766
7767 static inline bool
aarch64_can_use_per_function_literal_pools_p(void)7768 aarch64_can_use_per_function_literal_pools_p (void)
7769 {
7770 return (aarch64_pcrelative_literal_loads
7771 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7772 }
7773
7774 static bool
aarch64_use_blocks_for_constant_p(machine_mode,const_rtx)7775 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7776 {
7777 /* We can't use blocks for constants when we're using a per-function
7778 constant pool. */
7779 return !aarch64_can_use_per_function_literal_pools_p ();
7780 }
7781
7782 /* Select appropriate section for constants depending
7783 on where we place literal pools. */
7784
7785 static section *
aarch64_select_rtx_section(machine_mode mode,rtx x,unsigned HOST_WIDE_INT align)7786 aarch64_select_rtx_section (machine_mode mode,
7787 rtx x,
7788 unsigned HOST_WIDE_INT align)
7789 {
7790 if (aarch64_can_use_per_function_literal_pools_p ())
7791 return function_section (current_function_decl);
7792
7793 return default_elf_select_rtx_section (mode, x, align);
7794 }
7795
7796 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7797 void
aarch64_asm_output_pool_epilogue(FILE * f,const char *,tree,HOST_WIDE_INT offset)7798 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7799 HOST_WIDE_INT offset)
7800 {
7801 /* When using per-function literal pools, we must ensure that any code
7802 section is aligned to the minimal instruction length, lest we get
7803 errors from the assembler re "unaligned instructions". */
7804 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7805 ASM_OUTPUT_ALIGN (f, 2);
7806 }
7807
7808 /* Costs. */
7809
7810 /* Helper function for rtx cost calculation. Strip a shift expression
7811 from X. Returns the inner operand if successful, or the original
7812 expression on failure. */
7813 static rtx
aarch64_strip_shift(rtx x)7814 aarch64_strip_shift (rtx x)
7815 {
7816 rtx op = x;
7817
7818 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7819 we can convert both to ROR during final output. */
7820 if ((GET_CODE (op) == ASHIFT
7821 || GET_CODE (op) == ASHIFTRT
7822 || GET_CODE (op) == LSHIFTRT
7823 || GET_CODE (op) == ROTATERT
7824 || GET_CODE (op) == ROTATE)
7825 && CONST_INT_P (XEXP (op, 1)))
7826 return XEXP (op, 0);
7827
7828 if (GET_CODE (op) == MULT
7829 && CONST_INT_P (XEXP (op, 1))
7830 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7831 return XEXP (op, 0);
7832
7833 return x;
7834 }
7835
7836 /* Helper function for rtx cost calculation. Strip an extend
7837 expression from X. Returns the inner operand if successful, or the
7838 original expression on failure. We deal with a number of possible
7839 canonicalization variations here. If STRIP_SHIFT is true, then
7840 we can strip off a shift also. */
7841 static rtx
aarch64_strip_extend(rtx x,bool strip_shift)7842 aarch64_strip_extend (rtx x, bool strip_shift)
7843 {
7844 scalar_int_mode mode;
7845 rtx op = x;
7846
7847 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7848 return op;
7849
7850 /* Zero and sign extraction of a widened value. */
7851 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7852 && XEXP (op, 2) == const0_rtx
7853 && GET_CODE (XEXP (op, 0)) == MULT
7854 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7855 XEXP (op, 1)))
7856 return XEXP (XEXP (op, 0), 0);
7857
7858 /* It can also be represented (for zero-extend) as an AND with an
7859 immediate. */
7860 if (GET_CODE (op) == AND
7861 && GET_CODE (XEXP (op, 0)) == MULT
7862 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7863 && CONST_INT_P (XEXP (op, 1))
7864 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7865 INTVAL (XEXP (op, 1))) != 0)
7866 return XEXP (XEXP (op, 0), 0);
7867
7868 /* Now handle extended register, as this may also have an optional
7869 left shift by 1..4. */
7870 if (strip_shift
7871 && GET_CODE (op) == ASHIFT
7872 && CONST_INT_P (XEXP (op, 1))
7873 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7874 op = XEXP (op, 0);
7875
7876 if (GET_CODE (op) == ZERO_EXTEND
7877 || GET_CODE (op) == SIGN_EXTEND)
7878 op = XEXP (op, 0);
7879
7880 if (op != x)
7881 return op;
7882
7883 return x;
7884 }
7885
7886 /* Return true iff CODE is a shift supported in combination
7887 with arithmetic instructions. */
7888
7889 static bool
aarch64_shift_p(enum rtx_code code)7890 aarch64_shift_p (enum rtx_code code)
7891 {
7892 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7893 }
7894
7895
7896 /* Return true iff X is a cheap shift without a sign extend. */
7897
7898 static bool
aarch64_cheap_mult_shift_p(rtx x)7899 aarch64_cheap_mult_shift_p (rtx x)
7900 {
7901 rtx op0, op1;
7902
7903 op0 = XEXP (x, 0);
7904 op1 = XEXP (x, 1);
7905
7906 if (!(aarch64_tune_params.extra_tuning_flags
7907 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7908 return false;
7909
7910 if (GET_CODE (op0) == SIGN_EXTEND)
7911 return false;
7912
7913 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7914 && UINTVAL (op1) <= 4)
7915 return true;
7916
7917 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7918 return false;
7919
7920 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7921
7922 if (l2 > 0 && l2 <= 4)
7923 return true;
7924
7925 return false;
7926 }
7927
/* Helper function for rtx cost calculation.  Calculate the cost of
   a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
   Return the calculated cost of the expression, recursing manually in to
   operands where needed.

   X is the MULT/ASHIFT expression, CODE its rtx code (asserted to be
   MULT), OUTER the code of the containing expression (PLUS/MINUS marks a
   fused multiply-add/sub context), and SPEED selects speed vs. size
   costing.  */

static int
aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
{
  rtx op0, op1;
  const struct cpu_cost_table *extra_cost
    = aarch64_tune_params.insn_extra_cost;
  int cost = 0;
  /* A PLUS/MINUS parent means this MULT can fuse into MADD/MSUB etc.  */
  bool compound_p = (outer == PLUS || outer == MINUS);
  machine_mode mode = GET_MODE (x);

  gcc_checking_assert (code == MULT);

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  /* Cost vector operations using their element mode below.  */
  if (VECTOR_MODE_P (mode))
    mode = GET_MODE_INNER (mode);

  /* Integer multiply/fma.  */
  if (GET_MODE_CLASS (mode) == MODE_INT)
    {
      /* The multiply will be canonicalized as a shift, cost it as such.  */
      if (aarch64_shift_p (GET_CODE (x))
	  || (CONST_INT_P (op1)
	      && exact_log2 (INTVAL (op1)) > 0))
	{
	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
	                   || GET_CODE (op0) == SIGN_EXTEND;
	  if (speed)
	    {
	      if (compound_p)
	        {
		  /* If the shift is considered cheap,
		     then don't add any cost. */
		  if (aarch64_cheap_mult_shift_p (x))
		    ;
		  else if (REG_P (op1))
		    /* ARITH + shift-by-register.  */
		    cost += extra_cost->alu.arith_shift_reg;
		  else if (is_extend)
		    /* ARITH + extended register.  We don't have a cost field
		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
		    cost += extra_cost->alu.extend_arith;
		  else
		    /* ARITH + shift-by-immediate.  */
		    cost += extra_cost->alu.arith_shift;
		}
	      else
		/* LSL (immediate).  */
		cost += extra_cost->alu.shift;

	    }
	  /* Strip extends as we will have costed them in the case above.  */
	  if (is_extend)
	    op0 = aarch64_strip_extend (op0, true);

	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);

	  return cost;
	}

      /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
	 compound and let the below cases handle it.  After all, MNEG is a
	 special-case alias of MSUB.  */
      if (GET_CODE (op0) == NEG)
	{
	  op0 = XEXP (op0, 0);
	  compound_p = true;
	}

      /* Integer multiplies or FMAs have zero/sign extending variants.  */
      if ((GET_CODE (op0) == ZERO_EXTEND
	   && GET_CODE (op1) == ZERO_EXTEND)
	  || (GET_CODE (op0) == SIGN_EXTEND
	      && GET_CODE (op1) == SIGN_EXTEND))
	{
	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);

	  if (speed)
	    {
	      /* NOTE: widening multiplies are costed with the mult[0]
		 (SImode) table entry regardless of MODE.  */
	      if (compound_p)
		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
		cost += extra_cost->mult[0].extend_add;
	      else
		/* MUL/SMULL/UMULL.  */
		cost += extra_cost->mult[0].extend;
	    }

	  return cost;
	}

      /* This is either an integer multiply or a MADD.  In both cases
	 we want to recurse and cost the operands.  */
      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);

      if (speed)
	{
	  if (compound_p)
	    /* MADD/MSUB.  */
	    cost += extra_cost->mult[mode == DImode].add;
	  else
	    /* MUL.  */
	    cost += extra_cost->mult[mode == DImode].simple;
	}

      return cost;
    }
  else
    {
      if (speed)
	{
	  /* Floating-point FMA/FMUL can also support negations of the
	     operands, unless the rounding mode is upward or downward in
	     which case FNMUL is different than FMUL with operand negation.  */
	  bool neg0 = GET_CODE (op0) == NEG;
	  bool neg1 = GET_CODE (op1) == NEG;
	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
	    {
	      /* The negations fold into the instruction; cost the
		 operands without the NEG wrappers.  */
	      if (neg0)
		op0 = XEXP (op0, 0);
	      if (neg1)
		op1 = XEXP (op1, 0);
	    }

	  if (compound_p)
	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
	    cost += extra_cost->fp[mode == DFmode].fma;
	  else
	    /* FMUL/FNMUL.  */
	    cost += extra_cost->fp[mode == DFmode].mult;
	}

      cost += rtx_cost (op0, mode, MULT, 0, speed);
      cost += rtx_cost (op1, mode, MULT, 1, speed);
      return cost;
    }
}
8072
/* Implement TARGET_ADDRESS_COST.  Return the tuning-specific cost of the
   address expression X used in mode MODE, in the units of the per-core
   address cost tables.  */
static int
aarch64_address_cost (rtx x,
		      machine_mode mode,
		      addr_space_t as ATTRIBUTE_UNUSED,
		      bool speed)
{
  enum rtx_code c = GET_CODE (x);
  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
  struct aarch64_address_info info;
  int cost = 0;
  /* classify_address need not set info.shift on all paths; clear it so
     the scale-cost check below is well-defined.  */
  info.shift = 0;

  if (!aarch64_classify_address (&info, x, mode, false))
    {
      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
	{
	  /* This is a CONST or SYMBOL ref which will be split
	     in a different way depending on the code model in use.
	     Cost it through the generic infrastructure.  */
	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
	  /* Divide through by the cost of one instruction to
	     bring it to the same units as the address costs.  */
	  cost_symbol_ref /= COSTS_N_INSNS (1);
	  /* The cost is then the cost of preparing the address,
	     followed by an immediate (possibly 0) offset.  */
	  return cost_symbol_ref + addr_cost->imm_offset;
	}
      else
	{
	  /* This is most likely a jump table from a case
	     statement.  */
	  return addr_cost->register_offset;
	}
    }

  /* Base cost by addressing form.  */
  switch (info.type)
    {
      case ADDRESS_LO_SUM:
      case ADDRESS_SYMBOLIC:
      case ADDRESS_REG_IMM:
	cost += addr_cost->imm_offset;
	break;

      case ADDRESS_REG_WB:
	/* Writeback addressing: distinguish pre- from post-modify.  */
	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
	  cost += addr_cost->pre_modify;
	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
	  cost += addr_cost->post_modify;
	else
	  gcc_unreachable ();

	break;

      case ADDRESS_REG_REG:
	cost += addr_cost->register_offset;
	break;

      case ADDRESS_REG_SXTW:
	cost += addr_cost->register_sextend;
	break;

      case ADDRESS_REG_UXTW:
	cost += addr_cost->register_zextend;
	break;

      default:
	gcc_unreachable ();
    }


  /* Add the extra cost of scaling (shifting) the index register.  */
  if (info.shift > 0)
    {
      /* For the sake of calculating the cost of the shifted register
	 component, we can treat same sized modes in the same way.  */
      if (known_eq (GET_MODE_BITSIZE (mode), 16))
	cost += addr_cost->addr_scale_costs.hi;
      else if (known_eq (GET_MODE_BITSIZE (mode), 32))
	cost += addr_cost->addr_scale_costs.si;
      else if (known_eq (GET_MODE_BITSIZE (mode), 64))
	cost += addr_cost->addr_scale_costs.di;
      else
	/* We can't tell, or this is a 128-bit vector.  */
	cost += addr_cost->addr_scale_costs.ti;
    }

  return cost;
}
8160
8161 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8162 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8163 to be taken. */
8164
8165 int
aarch64_branch_cost(bool speed_p,bool predictable_p)8166 aarch64_branch_cost (bool speed_p, bool predictable_p)
8167 {
8168 /* When optimizing for speed, use the cost of unpredictable branches. */
8169 const struct cpu_branch_cost *branch_costs =
8170 aarch64_tune_params.branch_costs;
8171
8172 if (!speed_p || predictable_p)
8173 return branch_costs->predictable;
8174 else
8175 return branch_costs->unpredictable;
8176 }
8177
8178 /* Return true if the RTX X in mode MODE is a zero or sign extract
8179 usable in an ADD or SUB (extended register) instruction. */
8180 static bool
aarch64_rtx_arith_op_extract_p(rtx x,scalar_int_mode mode)8181 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8182 {
8183 /* Catch add with a sign extract.
8184 This is add_<optab><mode>_multp2. */
8185 if (GET_CODE (x) == SIGN_EXTRACT
8186 || GET_CODE (x) == ZERO_EXTRACT)
8187 {
8188 rtx op0 = XEXP (x, 0);
8189 rtx op1 = XEXP (x, 1);
8190 rtx op2 = XEXP (x, 2);
8191
8192 if (GET_CODE (op0) == MULT
8193 && CONST_INT_P (op1)
8194 && op2 == const0_rtx
8195 && CONST_INT_P (XEXP (op0, 1))
8196 && aarch64_is_extend_from_extract (mode,
8197 XEXP (op0, 1),
8198 op1))
8199 {
8200 return true;
8201 }
8202 }
8203 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8204 No shift. */
8205 else if (GET_CODE (x) == SIGN_EXTEND
8206 || GET_CODE (x) == ZERO_EXTEND)
8207 return REG_P (XEXP (x, 0));
8208
8209 return false;
8210 }
8211
8212 static bool
aarch64_frint_unspec_p(unsigned int u)8213 aarch64_frint_unspec_p (unsigned int u)
8214 {
8215 switch (u)
8216 {
8217 case UNSPEC_FRINTZ:
8218 case UNSPEC_FRINTP:
8219 case UNSPEC_FRINTM:
8220 case UNSPEC_FRINTA:
8221 case UNSPEC_FRINTN:
8222 case UNSPEC_FRINTX:
8223 case UNSPEC_FRINTI:
8224 return true;
8225
8226 default:
8227 return false;
8228 }
8229 }
8230
/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  (NOTE(review): on the
   non-scalar-int-mode early exit below the outputs are left untouched,
   not NULLed - callers should only consume them on a true return.)  */

static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  /* EXTR only exists for scalar integer modes.  */
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  /* Looking for (ior (ashift A c1) (lshiftrt B c2)) in either order.  */
  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      /* The two shift amounts must be complementary: together they
	 rotate the full register width.  */
      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
8277
/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
   storing it in *COST.  Result is true if the total cost of the operation
   has now been calculated (so the caller need not recurse further).  */
static bool
aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
{
  rtx inner;
  rtx comparator;
  enum rtx_code cmpcode;

  /* Normalize the condition into (INNER cmpcode COMPARATOR).  A bare
     value is treated as "!= 0".  */
  if (COMPARISON_P (op0))
    {
      inner = XEXP (op0, 0);
      comparator = XEXP (op0, 1);
      cmpcode = GET_CODE (op0);
    }
  else
    {
      inner = op0;
      comparator = const0_rtx;
      cmpcode = NE;
    }

  if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
    {
      /* Conditional branch.  */
      if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
	/* Branch on the flags: base cost only.  */
	return true;
      else
	{
	  if (cmpcode == NE || cmpcode == EQ)
	    {
	      if (comparator == const0_rtx)
		{
		  /* TBZ/TBNZ/CBZ/CBNZ.  */
		  if (GET_CODE (inner) == ZERO_EXTRACT)
		    /* TBZ/TBNZ.  */
		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
				       ZERO_EXTRACT, 0, speed);
		  else
		    /* CBZ/CBNZ.  */
		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);

		  return true;
		}
	    }
	  else if (cmpcode == LT || cmpcode == GE)
	    {
	      /* TBZ/TBNZ.  Sign tests compile to a test of the top bit.  */
	      if (comparator == const0_rtx)
		return true;
	    }
	}
    }
  else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
    {
      /* CCMP.  */
      if (GET_CODE (op1) == COMPARE)
	{
	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
	  if (XEXP (op1, 1) == const0_rtx)
	    *cost += 1;
	  if (speed)
	    {
	      machine_mode mode = GET_MODE (XEXP (op1, 0));
	      const struct cpu_cost_table *extra_cost
		= aarch64_tune_params.insn_extra_cost;

	      if (GET_MODE_CLASS (mode) == MODE_INT)
		*cost += extra_cost->alu.arith;
	      else
		*cost += extra_cost->fp[mode == DFmode].compare;
	    }
	  return true;
	}

      /* It's a conditional operation based on the status flags,
	 so it must be some flavor of CSEL.  */

      /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
      if (GET_CODE (op1) == NEG
	  || GET_CODE (op1) == NOT
	  || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
	op1 = XEXP (op1, 0);
      else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
	{
	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
	  op1 = XEXP (op1, 0);
	  op2 = XEXP (op2, 0);
	}

      *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
      *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
      return true;
    }

  /* We don't know what this is, cost all operands.  */
  return false;
}
8377
8378 /* Check whether X is a bitfield operation of the form shift + extend that
8379 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8380 operand to which the bitfield operation is applied. Otherwise return
8381 NULL_RTX. */
8382
8383 static rtx
aarch64_extend_bitfield_pattern_p(rtx x)8384 aarch64_extend_bitfield_pattern_p (rtx x)
8385 {
8386 rtx_code outer_code = GET_CODE (x);
8387 machine_mode outer_mode = GET_MODE (x);
8388
8389 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8390 && outer_mode != SImode && outer_mode != DImode)
8391 return NULL_RTX;
8392
8393 rtx inner = XEXP (x, 0);
8394 rtx_code inner_code = GET_CODE (inner);
8395 machine_mode inner_mode = GET_MODE (inner);
8396 rtx op = NULL_RTX;
8397
8398 switch (inner_code)
8399 {
8400 case ASHIFT:
8401 if (CONST_INT_P (XEXP (inner, 1))
8402 && (inner_mode == QImode || inner_mode == HImode))
8403 op = XEXP (inner, 0);
8404 break;
8405 case LSHIFTRT:
8406 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8407 && (inner_mode == QImode || inner_mode == HImode))
8408 op = XEXP (inner, 0);
8409 break;
8410 case ASHIFTRT:
8411 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8412 && (inner_mode == QImode || inner_mode == HImode))
8413 op = XEXP (inner, 0);
8414 break;
8415 default:
8416 break;
8417 }
8418
8419 return op;
8420 }
8421
8422 /* Return true if the mask and a shift amount from an RTX of the form
8423 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8424 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8425
8426 bool
aarch64_mask_and_shift_for_ubfiz_p(scalar_int_mode mode,rtx mask,rtx shft_amnt)8427 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8428 rtx shft_amnt)
8429 {
8430 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8431 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8432 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8433 && (INTVAL (mask)
8434 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
8435 }
8436
8437 /* Calculate the cost of calculating X, storing it in *COST. Result
8438 is true if the total cost of the operation has now been calculated. */
8439 static bool
aarch64_rtx_costs(rtx x,machine_mode mode,int outer ATTRIBUTE_UNUSED,int param ATTRIBUTE_UNUSED,int * cost,bool speed)8440 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8441 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8442 {
8443 rtx op0, op1, op2;
8444 const struct cpu_cost_table *extra_cost
8445 = aarch64_tune_params.insn_extra_cost;
8446 int code = GET_CODE (x);
8447 scalar_int_mode int_mode;
8448
8449 /* By default, assume that everything has equivalent cost to the
8450 cheapest instruction. Any additional costs are applied as a delta
8451 above this default. */
8452 *cost = COSTS_N_INSNS (1);
8453
8454 switch (code)
8455 {
8456 case SET:
8457 /* The cost depends entirely on the operands to SET. */
8458 *cost = 0;
8459 op0 = SET_DEST (x);
8460 op1 = SET_SRC (x);
8461
8462 switch (GET_CODE (op0))
8463 {
8464 case MEM:
8465 if (speed)
8466 {
8467 rtx address = XEXP (op0, 0);
8468 if (VECTOR_MODE_P (mode))
8469 *cost += extra_cost->ldst.storev;
8470 else if (GET_MODE_CLASS (mode) == MODE_INT)
8471 *cost += extra_cost->ldst.store;
8472 else if (mode == SFmode)
8473 *cost += extra_cost->ldst.storef;
8474 else if (mode == DFmode)
8475 *cost += extra_cost->ldst.stored;
8476
8477 *cost +=
8478 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8479 0, speed));
8480 }
8481
8482 *cost += rtx_cost (op1, mode, SET, 1, speed);
8483 return true;
8484
8485 case SUBREG:
8486 if (! REG_P (SUBREG_REG (op0)))
8487 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8488
8489 /* Fall through. */
8490 case REG:
8491 /* The cost is one per vector-register copied. */
8492 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8493 {
8494 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8495 *cost = COSTS_N_INSNS (nregs);
8496 }
8497 /* const0_rtx is in general free, but we will use an
8498 instruction to set a register to 0. */
8499 else if (REG_P (op1) || op1 == const0_rtx)
8500 {
8501 /* The cost is 1 per register copied. */
8502 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8503 *cost = COSTS_N_INSNS (nregs);
8504 }
8505 else
8506 /* Cost is just the cost of the RHS of the set. */
8507 *cost += rtx_cost (op1, mode, SET, 1, speed);
8508 return true;
8509
8510 case ZERO_EXTRACT:
8511 case SIGN_EXTRACT:
8512 /* Bit-field insertion. Strip any redundant widening of
8513 the RHS to meet the width of the target. */
8514 if (GET_CODE (op1) == SUBREG)
8515 op1 = SUBREG_REG (op1);
8516 if ((GET_CODE (op1) == ZERO_EXTEND
8517 || GET_CODE (op1) == SIGN_EXTEND)
8518 && CONST_INT_P (XEXP (op0, 1))
8519 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8520 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8521 op1 = XEXP (op1, 0);
8522
8523 if (CONST_INT_P (op1))
8524 {
8525 /* MOV immediate is assumed to always be cheap. */
8526 *cost = COSTS_N_INSNS (1);
8527 }
8528 else
8529 {
8530 /* BFM. */
8531 if (speed)
8532 *cost += extra_cost->alu.bfi;
8533 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8534 }
8535
8536 return true;
8537
8538 default:
8539 /* We can't make sense of this, assume default cost. */
8540 *cost = COSTS_N_INSNS (1);
8541 return false;
8542 }
8543 return false;
8544
8545 case CONST_INT:
8546 /* If an instruction can incorporate a constant within the
8547 instruction, the instruction's expression avoids calling
8548 rtx_cost() on the constant. If rtx_cost() is called on a
8549 constant, then it is usually because the constant must be
8550 moved into a register by one or more instructions.
8551
8552 The exception is constant 0, which can be expressed
8553 as XZR/WZR and is therefore free. The exception to this is
8554 if we have (set (reg) (const0_rtx)) in which case we must cost
8555 the move. However, we can catch that when we cost the SET, so
8556 we don't need to consider that here. */
8557 if (x == const0_rtx)
8558 *cost = 0;
8559 else
8560 {
8561 /* To an approximation, building any other constant is
8562 proportionally expensive to the number of instructions
8563 required to build that constant. This is true whether we
8564 are compiling for SPEED or otherwise. */
8565 if (!is_a <scalar_int_mode> (mode, &int_mode))
8566 int_mode = word_mode;
8567 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8568 (NULL_RTX, x, false, int_mode));
8569 }
8570 return true;
8571
8572 case CONST_DOUBLE:
8573
8574 /* First determine number of instructions to do the move
8575 as an integer constant. */
8576 if (!aarch64_float_const_representable_p (x)
8577 && !aarch64_can_const_movi_rtx_p (x, mode)
8578 && aarch64_float_const_rtx_p (x))
8579 {
8580 unsigned HOST_WIDE_INT ival;
8581 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8582 gcc_assert (succeed);
8583
8584 scalar_int_mode imode = (mode == HFmode
8585 ? SImode
8586 : int_mode_for_mode (mode).require ());
8587 int ncost = aarch64_internal_mov_immediate
8588 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8589 *cost += COSTS_N_INSNS (ncost);
8590 return true;
8591 }
8592
8593 if (speed)
8594 {
8595 /* mov[df,sf]_aarch64. */
8596 if (aarch64_float_const_representable_p (x))
8597 /* FMOV (scalar immediate). */
8598 *cost += extra_cost->fp[mode == DFmode].fpconst;
8599 else if (!aarch64_float_const_zero_rtx_p (x))
8600 {
8601 /* This will be a load from memory. */
8602 if (mode == DFmode)
8603 *cost += extra_cost->ldst.loadd;
8604 else
8605 *cost += extra_cost->ldst.loadf;
8606 }
8607 else
8608 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8609 or MOV v0.s[0], wzr - neither of which are modeled by the
8610 cost tables. Just use the default cost. */
8611 {
8612 }
8613 }
8614
8615 return true;
8616
8617 case MEM:
8618 if (speed)
8619 {
8620 /* For loads we want the base cost of a load, plus an
8621 approximation for the additional cost of the addressing
8622 mode. */
8623 rtx address = XEXP (x, 0);
8624 if (VECTOR_MODE_P (mode))
8625 *cost += extra_cost->ldst.loadv;
8626 else if (GET_MODE_CLASS (mode) == MODE_INT)
8627 *cost += extra_cost->ldst.load;
8628 else if (mode == SFmode)
8629 *cost += extra_cost->ldst.loadf;
8630 else if (mode == DFmode)
8631 *cost += extra_cost->ldst.loadd;
8632
8633 *cost +=
8634 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8635 0, speed));
8636 }
8637
8638 return true;
8639
8640 case NEG:
8641 op0 = XEXP (x, 0);
8642
8643 if (VECTOR_MODE_P (mode))
8644 {
8645 if (speed)
8646 {
8647 /* FNEG. */
8648 *cost += extra_cost->vect.alu;
8649 }
8650 return false;
8651 }
8652
8653 if (GET_MODE_CLASS (mode) == MODE_INT)
8654 {
8655 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8656 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8657 {
8658 /* CSETM. */
8659 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8660 return true;
8661 }
8662
8663 /* Cost this as SUB wzr, X. */
8664 op0 = CONST0_RTX (mode);
8665 op1 = XEXP (x, 0);
8666 goto cost_minus;
8667 }
8668
8669 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8670 {
8671 /* Support (neg(fma...)) as a single instruction only if
8672 sign of zeros is unimportant. This matches the decision
8673 making in aarch64.md. */
8674 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8675 {
8676 /* FNMADD. */
8677 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8678 return true;
8679 }
8680 if (GET_CODE (op0) == MULT)
8681 {
8682 /* FNMUL. */
8683 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8684 return true;
8685 }
8686 if (speed)
8687 /* FNEG. */
8688 *cost += extra_cost->fp[mode == DFmode].neg;
8689 return false;
8690 }
8691
8692 return false;
8693
8694 case CLRSB:
8695 case CLZ:
8696 if (speed)
8697 {
8698 if (VECTOR_MODE_P (mode))
8699 *cost += extra_cost->vect.alu;
8700 else
8701 *cost += extra_cost->alu.clz;
8702 }
8703
8704 return false;
8705
8706 case COMPARE:
8707 op0 = XEXP (x, 0);
8708 op1 = XEXP (x, 1);
8709
8710 if (op1 == const0_rtx
8711 && GET_CODE (op0) == AND)
8712 {
8713 x = op0;
8714 mode = GET_MODE (op0);
8715 goto cost_logic;
8716 }
8717
8718 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8719 {
8720 /* TODO: A write to the CC flags possibly costs extra, this
8721 needs encoding in the cost tables. */
8722
8723 mode = GET_MODE (op0);
8724 /* ANDS. */
8725 if (GET_CODE (op0) == AND)
8726 {
8727 x = op0;
8728 goto cost_logic;
8729 }
8730
8731 if (GET_CODE (op0) == PLUS)
8732 {
8733 /* ADDS (and CMN alias). */
8734 x = op0;
8735 goto cost_plus;
8736 }
8737
8738 if (GET_CODE (op0) == MINUS)
8739 {
8740 /* SUBS. */
8741 x = op0;
8742 goto cost_minus;
8743 }
8744
8745 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8746 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8747 && CONST_INT_P (XEXP (op0, 2)))
8748 {
8749 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8750 Handle it here directly rather than going to cost_logic
8751 since we know the immediate generated for the TST is valid
8752 so we can avoid creating an intermediate rtx for it only
8753 for costing purposes. */
8754 if (speed)
8755 *cost += extra_cost->alu.logical;
8756
8757 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8758 ZERO_EXTRACT, 0, speed);
8759 return true;
8760 }
8761
8762 if (GET_CODE (op1) == NEG)
8763 {
8764 /* CMN. */
8765 if (speed)
8766 *cost += extra_cost->alu.arith;
8767
8768 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8769 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8770 return true;
8771 }
8772
8773 /* CMP.
8774
8775 Compare can freely swap the order of operands, and
8776 canonicalization puts the more complex operation first.
8777 But the integer MINUS logic expects the shift/extend
8778 operation in op1. */
8779 if (! (REG_P (op0)
8780 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8781 {
8782 op0 = XEXP (x, 1);
8783 op1 = XEXP (x, 0);
8784 }
8785 goto cost_minus;
8786 }
8787
8788 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8789 {
8790 /* FCMP. */
8791 if (speed)
8792 *cost += extra_cost->fp[mode == DFmode].compare;
8793
8794 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8795 {
8796 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8797 /* FCMP supports constant 0.0 for no extra cost. */
8798 return true;
8799 }
8800 return false;
8801 }
8802
8803 if (VECTOR_MODE_P (mode))
8804 {
8805 /* Vector compare. */
8806 if (speed)
8807 *cost += extra_cost->vect.alu;
8808
8809 if (aarch64_float_const_zero_rtx_p (op1))
8810 {
8811 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8812 cost. */
8813 return true;
8814 }
8815 return false;
8816 }
8817 return false;
8818
8819 case MINUS:
8820 {
8821 op0 = XEXP (x, 0);
8822 op1 = XEXP (x, 1);
8823
8824 cost_minus:
8825 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8826
8827 /* Detect valid immediates. */
8828 if ((GET_MODE_CLASS (mode) == MODE_INT
8829 || (GET_MODE_CLASS (mode) == MODE_CC
8830 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8831 && CONST_INT_P (op1)
8832 && aarch64_uimm12_shift (INTVAL (op1)))
8833 {
8834 if (speed)
8835 /* SUB(S) (immediate). */
8836 *cost += extra_cost->alu.arith;
8837 return true;
8838 }
8839
8840 /* Look for SUB (extended register). */
8841 if (is_a <scalar_int_mode> (mode, &int_mode)
8842 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8843 {
8844 if (speed)
8845 *cost += extra_cost->alu.extend_arith;
8846
8847 op1 = aarch64_strip_extend (op1, true);
8848 *cost += rtx_cost (op1, VOIDmode,
8849 (enum rtx_code) GET_CODE (op1), 0, speed);
8850 return true;
8851 }
8852
8853 rtx new_op1 = aarch64_strip_extend (op1, false);
8854
8855 /* Cost this as an FMA-alike operation. */
8856 if ((GET_CODE (new_op1) == MULT
8857 || aarch64_shift_p (GET_CODE (new_op1)))
8858 && code != COMPARE)
8859 {
8860 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8861 (enum rtx_code) code,
8862 speed);
8863 return true;
8864 }
8865
8866 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8867
8868 if (speed)
8869 {
8870 if (VECTOR_MODE_P (mode))
8871 {
8872 /* Vector SUB. */
8873 *cost += extra_cost->vect.alu;
8874 }
8875 else if (GET_MODE_CLASS (mode) == MODE_INT)
8876 {
8877 /* SUB(S). */
8878 *cost += extra_cost->alu.arith;
8879 }
8880 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8881 {
8882 /* FSUB. */
8883 *cost += extra_cost->fp[mode == DFmode].addsub;
8884 }
8885 }
8886 return true;
8887 }
8888
8889 case PLUS:
8890 {
8891 rtx new_op0;
8892
8893 op0 = XEXP (x, 0);
8894 op1 = XEXP (x, 1);
8895
8896 cost_plus:
8897 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8898 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8899 {
8900 /* CSINC. */
8901 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8902 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8903 return true;
8904 }
8905
8906 if (GET_MODE_CLASS (mode) == MODE_INT
8907 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8908 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8909 {
8910 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8911
8912 if (speed)
8913 /* ADD (immediate). */
8914 *cost += extra_cost->alu.arith;
8915 return true;
8916 }
8917
8918 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8919
8920 /* Look for ADD (extended register). */
8921 if (is_a <scalar_int_mode> (mode, &int_mode)
8922 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8923 {
8924 if (speed)
8925 *cost += extra_cost->alu.extend_arith;
8926
8927 op0 = aarch64_strip_extend (op0, true);
8928 *cost += rtx_cost (op0, VOIDmode,
8929 (enum rtx_code) GET_CODE (op0), 0, speed);
8930 return true;
8931 }
8932
8933 /* Strip any extend, leave shifts behind as we will
8934 cost them through mult_cost. */
8935 new_op0 = aarch64_strip_extend (op0, false);
8936
8937 if (GET_CODE (new_op0) == MULT
8938 || aarch64_shift_p (GET_CODE (new_op0)))
8939 {
8940 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8941 speed);
8942 return true;
8943 }
8944
8945 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8946
8947 if (speed)
8948 {
8949 if (VECTOR_MODE_P (mode))
8950 {
8951 /* Vector ADD. */
8952 *cost += extra_cost->vect.alu;
8953 }
8954 else if (GET_MODE_CLASS (mode) == MODE_INT)
8955 {
8956 /* ADD. */
8957 *cost += extra_cost->alu.arith;
8958 }
8959 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8960 {
8961 /* FADD. */
8962 *cost += extra_cost->fp[mode == DFmode].addsub;
8963 }
8964 }
8965 return true;
8966 }
8967
8968 case BSWAP:
8969 *cost = COSTS_N_INSNS (1);
8970
8971 if (speed)
8972 {
8973 if (VECTOR_MODE_P (mode))
8974 *cost += extra_cost->vect.alu;
8975 else
8976 *cost += extra_cost->alu.rev;
8977 }
8978 return false;
8979
8980 case IOR:
8981 if (aarch_rev16_p (x))
8982 {
8983 *cost = COSTS_N_INSNS (1);
8984
8985 if (speed)
8986 {
8987 if (VECTOR_MODE_P (mode))
8988 *cost += extra_cost->vect.alu;
8989 else
8990 *cost += extra_cost->alu.rev;
8991 }
8992 return true;
8993 }
8994
8995 if (aarch64_extr_rtx_p (x, &op0, &op1))
8996 {
8997 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8998 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8999 if (speed)
9000 *cost += extra_cost->alu.shift;
9001
9002 return true;
9003 }
9004 /* Fall through. */
9005 case XOR:
9006 case AND:
9007 cost_logic:
9008 op0 = XEXP (x, 0);
9009 op1 = XEXP (x, 1);
9010
9011 if (VECTOR_MODE_P (mode))
9012 {
9013 if (speed)
9014 *cost += extra_cost->vect.alu;
9015 return true;
9016 }
9017
9018 if (code == AND
9019 && GET_CODE (op0) == MULT
9020 && CONST_INT_P (XEXP (op0, 1))
9021 && CONST_INT_P (op1)
9022 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9023 INTVAL (op1)) != 0)
9024 {
9025 /* This is a UBFM/SBFM. */
9026 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9027 if (speed)
9028 *cost += extra_cost->alu.bfx;
9029 return true;
9030 }
9031
9032 if (is_int_mode (mode, &int_mode))
9033 {
9034 if (CONST_INT_P (op1))
9035 {
9036 /* We have a mask + shift version of a UBFIZ
9037 i.e. the *andim_ashift<mode>_bfiz pattern. */
9038 if (GET_CODE (op0) == ASHIFT
9039 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9040 XEXP (op0, 1)))
9041 {
9042 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9043 (enum rtx_code) code, 0, speed);
9044 if (speed)
9045 *cost += extra_cost->alu.bfx;
9046
9047 return true;
9048 }
9049 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9050 {
9051 /* We possibly get the immediate for free, this is not
9052 modelled. */
9053 *cost += rtx_cost (op0, int_mode,
9054 (enum rtx_code) code, 0, speed);
9055 if (speed)
9056 *cost += extra_cost->alu.logical;
9057
9058 return true;
9059 }
9060 }
9061 else
9062 {
9063 rtx new_op0 = op0;
9064
9065 /* Handle ORN, EON, or BIC. */
9066 if (GET_CODE (op0) == NOT)
9067 op0 = XEXP (op0, 0);
9068
9069 new_op0 = aarch64_strip_shift (op0);
9070
9071 /* If we had a shift on op0 then this is a logical-shift-
9072 by-register/immediate operation. Otherwise, this is just
9073 a logical operation. */
9074 if (speed)
9075 {
9076 if (new_op0 != op0)
9077 {
9078 /* Shift by immediate. */
9079 if (CONST_INT_P (XEXP (op0, 1)))
9080 *cost += extra_cost->alu.log_shift;
9081 else
9082 *cost += extra_cost->alu.log_shift_reg;
9083 }
9084 else
9085 *cost += extra_cost->alu.logical;
9086 }
9087
9088 /* In both cases we want to cost both operands. */
9089 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9090 0, speed);
9091 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9092 1, speed);
9093
9094 return true;
9095 }
9096 }
9097 return false;
9098
9099 case NOT:
9100 x = XEXP (x, 0);
9101 op0 = aarch64_strip_shift (x);
9102
9103 if (VECTOR_MODE_P (mode))
9104 {
9105 /* Vector NOT. */
9106 *cost += extra_cost->vect.alu;
9107 return false;
9108 }
9109
9110 /* MVN-shifted-reg. */
9111 if (op0 != x)
9112 {
9113 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9114
9115 if (speed)
9116 *cost += extra_cost->alu.log_shift;
9117
9118 return true;
9119 }
9120 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9121 Handle the second form here taking care that 'a' in the above can
9122 be a shift. */
9123 else if (GET_CODE (op0) == XOR)
9124 {
9125 rtx newop0 = XEXP (op0, 0);
9126 rtx newop1 = XEXP (op0, 1);
9127 rtx op0_stripped = aarch64_strip_shift (newop0);
9128
9129 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9130 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9131
9132 if (speed)
9133 {
9134 if (op0_stripped != newop0)
9135 *cost += extra_cost->alu.log_shift;
9136 else
9137 *cost += extra_cost->alu.logical;
9138 }
9139
9140 return true;
9141 }
9142 /* MVN. */
9143 if (speed)
9144 *cost += extra_cost->alu.logical;
9145
9146 return false;
9147
9148 case ZERO_EXTEND:
9149
9150 op0 = XEXP (x, 0);
9151 /* If a value is written in SI mode, then zero extended to DI
9152 mode, the operation will in general be free as a write to
9153 a 'w' register implicitly zeroes the upper bits of an 'x'
9154 register. However, if this is
9155
9156 (set (reg) (zero_extend (reg)))
9157
9158 we must cost the explicit register move. */
9159 if (mode == DImode
9160 && GET_MODE (op0) == SImode
9161 && outer == SET)
9162 {
9163 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9164
9165 /* If OP_COST is non-zero, then the cost of the zero extend
9166 is effectively the cost of the inner operation. Otherwise
9167 we have a MOV instruction and we take the cost from the MOV
9168 itself. This is true independently of whether we are
9169 optimizing for space or time. */
9170 if (op_cost)
9171 *cost = op_cost;
9172
9173 return true;
9174 }
9175 else if (MEM_P (op0))
9176 {
9177 /* All loads can zero extend to any size for free. */
9178 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9179 return true;
9180 }
9181
9182 op0 = aarch64_extend_bitfield_pattern_p (x);
9183 if (op0)
9184 {
9185 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9186 if (speed)
9187 *cost += extra_cost->alu.bfx;
9188 return true;
9189 }
9190
9191 if (speed)
9192 {
9193 if (VECTOR_MODE_P (mode))
9194 {
9195 /* UMOV. */
9196 *cost += extra_cost->vect.alu;
9197 }
9198 else
9199 {
9200 /* We generate an AND instead of UXTB/UXTH. */
9201 *cost += extra_cost->alu.logical;
9202 }
9203 }
9204 return false;
9205
9206 case SIGN_EXTEND:
9207 if (MEM_P (XEXP (x, 0)))
9208 {
9209 /* LDRSH. */
9210 if (speed)
9211 {
9212 rtx address = XEXP (XEXP (x, 0), 0);
9213 *cost += extra_cost->ldst.load_sign_extend;
9214
9215 *cost +=
9216 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9217 0, speed));
9218 }
9219 return true;
9220 }
9221
9222 op0 = aarch64_extend_bitfield_pattern_p (x);
9223 if (op0)
9224 {
9225 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9226 if (speed)
9227 *cost += extra_cost->alu.bfx;
9228 return true;
9229 }
9230
9231 if (speed)
9232 {
9233 if (VECTOR_MODE_P (mode))
9234 *cost += extra_cost->vect.alu;
9235 else
9236 *cost += extra_cost->alu.extend;
9237 }
9238 return false;
9239
9240 case ASHIFT:
9241 op0 = XEXP (x, 0);
9242 op1 = XEXP (x, 1);
9243
9244 if (CONST_INT_P (op1))
9245 {
9246 if (speed)
9247 {
9248 if (VECTOR_MODE_P (mode))
9249 {
9250 /* Vector shift (immediate). */
9251 *cost += extra_cost->vect.alu;
9252 }
9253 else
9254 {
9255 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9256 aliases. */
9257 *cost += extra_cost->alu.shift;
9258 }
9259 }
9260
9261 /* We can incorporate zero/sign extend for free. */
9262 if (GET_CODE (op0) == ZERO_EXTEND
9263 || GET_CODE (op0) == SIGN_EXTEND)
9264 op0 = XEXP (op0, 0);
9265
9266 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9267 return true;
9268 }
9269 else
9270 {
9271 if (VECTOR_MODE_P (mode))
9272 {
9273 if (speed)
9274 /* Vector shift (register). */
9275 *cost += extra_cost->vect.alu;
9276 }
9277 else
9278 {
9279 if (speed)
9280 /* LSLV. */
9281 *cost += extra_cost->alu.shift_reg;
9282
9283 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9284 && CONST_INT_P (XEXP (op1, 1))
9285 && known_eq (INTVAL (XEXP (op1, 1)),
9286 GET_MODE_BITSIZE (mode) - 1))
9287 {
9288 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9289 /* We already demanded XEXP (op1, 0) to be REG_P, so
9290 don't recurse into it. */
9291 return true;
9292 }
9293 }
9294 return false; /* All arguments need to be in registers. */
9295 }
9296
9297 case ROTATE:
9298 case ROTATERT:
9299 case LSHIFTRT:
9300 case ASHIFTRT:
9301 op0 = XEXP (x, 0);
9302 op1 = XEXP (x, 1);
9303
9304 if (CONST_INT_P (op1))
9305 {
9306 /* ASR (immediate) and friends. */
9307 if (speed)
9308 {
9309 if (VECTOR_MODE_P (mode))
9310 *cost += extra_cost->vect.alu;
9311 else
9312 *cost += extra_cost->alu.shift;
9313 }
9314
9315 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9316 return true;
9317 }
9318 else
9319 {
9320 if (VECTOR_MODE_P (mode))
9321 {
9322 if (speed)
9323 /* Vector shift (register). */
9324 *cost += extra_cost->vect.alu;
9325 }
9326 else
9327 {
9328 if (speed)
9329 /* ASR (register) and friends. */
9330 *cost += extra_cost->alu.shift_reg;
9331
9332 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9333 && CONST_INT_P (XEXP (op1, 1))
9334 && known_eq (INTVAL (XEXP (op1, 1)),
9335 GET_MODE_BITSIZE (mode) - 1))
9336 {
9337 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9338 /* We already demanded XEXP (op1, 0) to be REG_P, so
9339 don't recurse into it. */
9340 return true;
9341 }
9342 }
9343 return false; /* All arguments need to be in registers. */
9344 }
9345
9346 case SYMBOL_REF:
9347
9348 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9349 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9350 {
9351 /* LDR. */
9352 if (speed)
9353 *cost += extra_cost->ldst.load;
9354 }
9355 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9356 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9357 {
9358 /* ADRP, followed by ADD. */
9359 *cost += COSTS_N_INSNS (1);
9360 if (speed)
9361 *cost += 2 * extra_cost->alu.arith;
9362 }
9363 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9364 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9365 {
9366 /* ADR. */
9367 if (speed)
9368 *cost += extra_cost->alu.arith;
9369 }
9370
9371 if (flag_pic)
9372 {
9373 /* One extra load instruction, after accessing the GOT. */
9374 *cost += COSTS_N_INSNS (1);
9375 if (speed)
9376 *cost += extra_cost->ldst.load;
9377 }
9378 return true;
9379
9380 case HIGH:
9381 case LO_SUM:
9382 /* ADRP/ADD (immediate). */
9383 if (speed)
9384 *cost += extra_cost->alu.arith;
9385 return true;
9386
9387 case ZERO_EXTRACT:
9388 case SIGN_EXTRACT:
9389 /* UBFX/SBFX. */
9390 if (speed)
9391 {
9392 if (VECTOR_MODE_P (mode))
9393 *cost += extra_cost->vect.alu;
9394 else
9395 *cost += extra_cost->alu.bfx;
9396 }
9397
9398 /* We can trust that the immediates used will be correct (there
9399 are no by-register forms), so we need only cost op0. */
9400 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9401 return true;
9402
9403 case MULT:
9404 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9405 /* aarch64_rtx_mult_cost always handles recursion to its
9406 operands. */
9407 return true;
9408
9409 case MOD:
9410 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9411 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9412 an unconditional negate. This case should only ever be reached through
9413 the set_smod_pow2_cheap check in expmed.c. */
9414 if (CONST_INT_P (XEXP (x, 1))
9415 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9416 && (mode == SImode || mode == DImode))
9417 {
9418 /* We expand to 4 instructions. Reset the baseline. */
9419 *cost = COSTS_N_INSNS (4);
9420
9421 if (speed)
9422 *cost += 2 * extra_cost->alu.logical
9423 + 2 * extra_cost->alu.arith;
9424
9425 return true;
9426 }
9427
9428 /* Fall-through. */
9429 case UMOD:
9430 if (speed)
9431 {
9432 /* Slighly prefer UMOD over SMOD. */
9433 if (VECTOR_MODE_P (mode))
9434 *cost += extra_cost->vect.alu;
9435 else if (GET_MODE_CLASS (mode) == MODE_INT)
9436 *cost += (extra_cost->mult[mode == DImode].add
9437 + extra_cost->mult[mode == DImode].idiv
9438 + (code == MOD ? 1 : 0));
9439 }
9440 return false; /* All arguments need to be in registers. */
9441
9442 case DIV:
9443 case UDIV:
9444 case SQRT:
9445 if (speed)
9446 {
9447 if (VECTOR_MODE_P (mode))
9448 *cost += extra_cost->vect.alu;
9449 else if (GET_MODE_CLASS (mode) == MODE_INT)
9450 /* There is no integer SQRT, so only DIV and UDIV can get
9451 here. */
9452 *cost += (extra_cost->mult[mode == DImode].idiv
9453 /* Slighly prefer UDIV over SDIV. */
9454 + (code == DIV ? 1 : 0));
9455 else
9456 *cost += extra_cost->fp[mode == DFmode].div;
9457 }
9458 return false; /* All arguments need to be in registers. */
9459
9460 case IF_THEN_ELSE:
9461 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9462 XEXP (x, 2), cost, speed);
9463
9464 case EQ:
9465 case NE:
9466 case GT:
9467 case GTU:
9468 case LT:
9469 case LTU:
9470 case GE:
9471 case GEU:
9472 case LE:
9473 case LEU:
9474
9475 return false; /* All arguments must be in registers. */
9476
9477 case FMA:
9478 op0 = XEXP (x, 0);
9479 op1 = XEXP (x, 1);
9480 op2 = XEXP (x, 2);
9481
9482 if (speed)
9483 {
9484 if (VECTOR_MODE_P (mode))
9485 *cost += extra_cost->vect.alu;
9486 else
9487 *cost += extra_cost->fp[mode == DFmode].fma;
9488 }
9489
9490 /* FMSUB, FNMADD, and FNMSUB are free. */
9491 if (GET_CODE (op0) == NEG)
9492 op0 = XEXP (op0, 0);
9493
9494 if (GET_CODE (op2) == NEG)
9495 op2 = XEXP (op2, 0);
9496
9497 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9498 and the by-element operand as operand 0. */
9499 if (GET_CODE (op1) == NEG)
9500 op1 = XEXP (op1, 0);
9501
9502 /* Catch vector-by-element operations. The by-element operand can
9503 either be (vec_duplicate (vec_select (x))) or just
9504 (vec_select (x)), depending on whether we are multiplying by
9505 a vector or a scalar.
9506
9507 Canonicalization is not very good in these cases, FMA4 will put the
9508 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9509 if (GET_CODE (op0) == VEC_DUPLICATE)
9510 op0 = XEXP (op0, 0);
9511 else if (GET_CODE (op1) == VEC_DUPLICATE)
9512 op1 = XEXP (op1, 0);
9513
9514 if (GET_CODE (op0) == VEC_SELECT)
9515 op0 = XEXP (op0, 0);
9516 else if (GET_CODE (op1) == VEC_SELECT)
9517 op1 = XEXP (op1, 0);
9518
9519 /* If the remaining parameters are not registers,
9520 get the cost to put them into registers. */
9521 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9522 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9523 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9524 return true;
9525
9526 case FLOAT:
9527 case UNSIGNED_FLOAT:
9528 if (speed)
9529 *cost += extra_cost->fp[mode == DFmode].fromint;
9530 return false;
9531
9532 case FLOAT_EXTEND:
9533 if (speed)
9534 {
9535 if (VECTOR_MODE_P (mode))
9536 {
9537 /*Vector truncate. */
9538 *cost += extra_cost->vect.alu;
9539 }
9540 else
9541 *cost += extra_cost->fp[mode == DFmode].widen;
9542 }
9543 return false;
9544
9545 case FLOAT_TRUNCATE:
9546 if (speed)
9547 {
9548 if (VECTOR_MODE_P (mode))
9549 {
9550 /*Vector conversion. */
9551 *cost += extra_cost->vect.alu;
9552 }
9553 else
9554 *cost += extra_cost->fp[mode == DFmode].narrow;
9555 }
9556 return false;
9557
9558 case FIX:
9559 case UNSIGNED_FIX:
9560 x = XEXP (x, 0);
9561 /* Strip the rounding part. They will all be implemented
9562 by the fcvt* family of instructions anyway. */
9563 if (GET_CODE (x) == UNSPEC)
9564 {
9565 unsigned int uns_code = XINT (x, 1);
9566
9567 if (uns_code == UNSPEC_FRINTA
9568 || uns_code == UNSPEC_FRINTM
9569 || uns_code == UNSPEC_FRINTN
9570 || uns_code == UNSPEC_FRINTP
9571 || uns_code == UNSPEC_FRINTZ)
9572 x = XVECEXP (x, 0, 0);
9573 }
9574
9575 if (speed)
9576 {
9577 if (VECTOR_MODE_P (mode))
9578 *cost += extra_cost->vect.alu;
9579 else
9580 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9581 }
9582
9583 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9584 fixed-point fcvt. */
9585 if (GET_CODE (x) == MULT
9586 && ((VECTOR_MODE_P (mode)
9587 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9588 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9589 {
9590 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9591 0, speed);
9592 return true;
9593 }
9594
9595 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9596 return true;
9597
9598 case ABS:
9599 if (VECTOR_MODE_P (mode))
9600 {
9601 /* ABS (vector). */
9602 if (speed)
9603 *cost += extra_cost->vect.alu;
9604 }
9605 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9606 {
9607 op0 = XEXP (x, 0);
9608
9609 /* FABD, which is analogous to FADD. */
9610 if (GET_CODE (op0) == MINUS)
9611 {
9612 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9613 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9614 if (speed)
9615 *cost += extra_cost->fp[mode == DFmode].addsub;
9616
9617 return true;
9618 }
9619 /* Simple FABS is analogous to FNEG. */
9620 if (speed)
9621 *cost += extra_cost->fp[mode == DFmode].neg;
9622 }
9623 else
9624 {
9625 /* Integer ABS will either be split to
9626 two arithmetic instructions, or will be an ABS
9627 (scalar), which we don't model. */
9628 *cost = COSTS_N_INSNS (2);
9629 if (speed)
9630 *cost += 2 * extra_cost->alu.arith;
9631 }
9632 return false;
9633
9634 case SMAX:
9635 case SMIN:
9636 if (speed)
9637 {
9638 if (VECTOR_MODE_P (mode))
9639 *cost += extra_cost->vect.alu;
9640 else
9641 {
9642 /* FMAXNM/FMINNM/FMAX/FMIN.
9643 TODO: This may not be accurate for all implementations, but
9644 we do not model this in the cost tables. */
9645 *cost += extra_cost->fp[mode == DFmode].addsub;
9646 }
9647 }
9648 return false;
9649
9650 case UNSPEC:
9651 /* The floating point round to integer frint* instructions. */
9652 if (aarch64_frint_unspec_p (XINT (x, 1)))
9653 {
9654 if (speed)
9655 *cost += extra_cost->fp[mode == DFmode].roundint;
9656
9657 return false;
9658 }
9659
9660 if (XINT (x, 1) == UNSPEC_RBIT)
9661 {
9662 if (speed)
9663 *cost += extra_cost->alu.rev;
9664
9665 return false;
9666 }
9667 break;
9668
9669 case TRUNCATE:
9670
9671 /* Decompose <su>muldi3_highpart. */
9672 if (/* (truncate:DI */
9673 mode == DImode
9674 /* (lshiftrt:TI */
9675 && GET_MODE (XEXP (x, 0)) == TImode
9676 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9677 /* (mult:TI */
9678 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9679 /* (ANY_EXTEND:TI (reg:DI))
9680 (ANY_EXTEND:TI (reg:DI))) */
9681 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9682 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9683 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9684 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9685 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9686 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9687 /* (const_int 64) */
9688 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9689 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9690 {
9691 /* UMULH/SMULH. */
9692 if (speed)
9693 *cost += extra_cost->mult[mode == DImode].extend;
9694 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9695 mode, MULT, 0, speed);
9696 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9697 mode, MULT, 1, speed);
9698 return true;
9699 }
9700
9701 /* Fall through. */
9702 default:
9703 break;
9704 }
9705
9706 if (dump_file
9707 && flag_aarch64_verbose_cost)
9708 fprintf (dump_file,
9709 "\nFailed to cost RTX. Assuming default cost.\n");
9710
9711 return true;
9712 }
9713
9714 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9715 calculated for X. This cost is stored in *COST. Returns true
9716 if the total cost of X was calculated. */
9717 static bool
aarch64_rtx_costs_wrapper(rtx x,machine_mode mode,int outer,int param,int * cost,bool speed)9718 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9719 int param, int *cost, bool speed)
9720 {
9721 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9722
9723 if (dump_file
9724 && flag_aarch64_verbose_cost)
9725 {
9726 print_rtl_single (dump_file, x);
9727 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9728 speed ? "Hot" : "Cold",
9729 *cost, result ? "final" : "partial");
9730 }
9731
9732 return result;
9733 }
9734
9735 static int
aarch64_register_move_cost(machine_mode mode,reg_class_t from_i,reg_class_t to_i)9736 aarch64_register_move_cost (machine_mode mode,
9737 reg_class_t from_i, reg_class_t to_i)
9738 {
9739 enum reg_class from = (enum reg_class) from_i;
9740 enum reg_class to = (enum reg_class) to_i;
9741 const struct cpu_regmove_cost *regmove_cost
9742 = aarch64_tune_params.regmove_cost;
9743
9744 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9745 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9746 to = GENERAL_REGS;
9747
9748 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9749 from = GENERAL_REGS;
9750
9751 /* Moving between GPR and stack cost is the same as GP2GP. */
9752 if ((from == GENERAL_REGS && to == STACK_REG)
9753 || (to == GENERAL_REGS && from == STACK_REG))
9754 return regmove_cost->GP2GP;
9755
9756 /* To/From the stack register, we move via the gprs. */
9757 if (to == STACK_REG || from == STACK_REG)
9758 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9759 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9760
9761 if (known_eq (GET_MODE_SIZE (mode), 16))
9762 {
9763 /* 128-bit operations on general registers require 2 instructions. */
9764 if (from == GENERAL_REGS && to == GENERAL_REGS)
9765 return regmove_cost->GP2GP * 2;
9766 else if (from == GENERAL_REGS)
9767 return regmove_cost->GP2FP * 2;
9768 else if (to == GENERAL_REGS)
9769 return regmove_cost->FP2GP * 2;
9770
9771 /* When AdvSIMD instructions are disabled it is not possible to move
9772 a 128-bit value directly between Q registers. This is handled in
9773 secondary reload. A general register is used as a scratch to move
9774 the upper DI value and the lower DI value is moved directly,
9775 hence the cost is the sum of three moves. */
9776 if (! TARGET_SIMD)
9777 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9778
9779 return regmove_cost->FP2FP;
9780 }
9781
9782 if (from == GENERAL_REGS && to == GENERAL_REGS)
9783 return regmove_cost->GP2GP;
9784 else if (from == GENERAL_REGS)
9785 return regmove_cost->GP2FP;
9786 else if (to == GENERAL_REGS)
9787 return regmove_cost->FP2GP;
9788
9789 return regmove_cost->FP2FP;
9790 }
9791
9792 static int
aarch64_memory_move_cost(machine_mode mode ATTRIBUTE_UNUSED,reg_class_t rclass ATTRIBUTE_UNUSED,bool in ATTRIBUTE_UNUSED)9793 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9794 reg_class_t rclass ATTRIBUTE_UNUSED,
9795 bool in ATTRIBUTE_UNUSED)
9796 {
9797 return aarch64_tune_params.memmov_cost;
9798 }
9799
9800 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9801 to optimize 1.0/sqrt. */
9802
9803 static bool
use_rsqrt_p(machine_mode mode)9804 use_rsqrt_p (machine_mode mode)
9805 {
9806 return (!flag_trapping_math
9807 && flag_unsafe_math_optimizations
9808 && ((aarch64_tune_params.approx_modes->recip_sqrt
9809 & AARCH64_APPROX_MODE (mode))
9810 || flag_mrecip_low_precision_sqrt));
9811 }
9812
9813 /* Function to decide when to use the approximate reciprocal square root
9814 builtin. */
9815
9816 static tree
aarch64_builtin_reciprocal(tree fndecl)9817 aarch64_builtin_reciprocal (tree fndecl)
9818 {
9819 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9820
9821 if (!use_rsqrt_p (mode))
9822 return NULL_TREE;
9823 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9824 }
9825
9826 typedef rtx (*rsqrte_type) (rtx, rtx);
9827
9828 /* Select reciprocal square root initial estimate insn depending on machine
9829 mode. */
9830
9831 static rsqrte_type
get_rsqrte_type(machine_mode mode)9832 get_rsqrte_type (machine_mode mode)
9833 {
9834 switch (mode)
9835 {
9836 case E_DFmode: return gen_aarch64_rsqrtedf;
9837 case E_SFmode: return gen_aarch64_rsqrtesf;
9838 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9839 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9840 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9841 default: gcc_unreachable ();
9842 }
9843 }
9844
9845 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9846
9847 /* Select reciprocal square root series step insn depending on machine mode. */
9848
9849 static rsqrts_type
get_rsqrts_type(machine_mode mode)9850 get_rsqrts_type (machine_mode mode)
9851 {
9852 switch (mode)
9853 {
9854 case E_DFmode: return gen_aarch64_rsqrtsdf;
9855 case E_SFmode: return gen_aarch64_rsqrtssf;
9856 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9857 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9858 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9859 default: gcc_unreachable ();
9860 }
9861 }
9862
9863 /* Emit instruction sequence to compute either the approximate square root
9864 or its approximate reciprocal, depending on the flag RECP, and return
9865 whether the sequence was emitted or not. */
9866
9867 bool
aarch64_emit_approx_sqrt(rtx dst,rtx src,bool recp)9868 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9869 {
9870 machine_mode mode = GET_MODE (dst);
9871
9872 if (GET_MODE_INNER (mode) == HFmode)
9873 {
9874 gcc_assert (!recp);
9875 return false;
9876 }
9877
9878 if (!recp)
9879 {
9880 if (!(flag_mlow_precision_sqrt
9881 || (aarch64_tune_params.approx_modes->sqrt
9882 & AARCH64_APPROX_MODE (mode))))
9883 return false;
9884
9885 if (flag_finite_math_only
9886 || flag_trapping_math
9887 || !flag_unsafe_math_optimizations
9888 || optimize_function_for_size_p (cfun))
9889 return false;
9890 }
9891 else
9892 /* Caller assumes we cannot fail. */
9893 gcc_assert (use_rsqrt_p (mode));
9894
9895 machine_mode mmsk = mode_for_int_vector (mode).require ();
9896 rtx xmsk = gen_reg_rtx (mmsk);
9897 if (!recp)
9898 /* When calculating the approximate square root, compare the
9899 argument with 0.0 and create a mask. */
9900 emit_insn (gen_rtx_SET (xmsk,
9901 gen_rtx_NEG (mmsk,
9902 gen_rtx_EQ (mmsk, src,
9903 CONST0_RTX (mode)))));
9904
9905 /* Estimate the approximate reciprocal square root. */
9906 rtx xdst = gen_reg_rtx (mode);
9907 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9908
9909 /* Iterate over the series twice for SF and thrice for DF. */
9910 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9911
9912 /* Optionally iterate over the series once less for faster performance
9913 while sacrificing the accuracy. */
9914 if ((recp && flag_mrecip_low_precision_sqrt)
9915 || (!recp && flag_mlow_precision_sqrt))
9916 iterations--;
9917
9918 /* Iterate over the series to calculate the approximate reciprocal square
9919 root. */
9920 rtx x1 = gen_reg_rtx (mode);
9921 while (iterations--)
9922 {
9923 rtx x2 = gen_reg_rtx (mode);
9924 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9925
9926 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9927
9928 if (iterations > 0)
9929 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9930 }
9931
9932 if (!recp)
9933 {
9934 /* Qualify the approximate reciprocal square root when the argument is
9935 0.0 by squashing the intermediary result to 0.0. */
9936 rtx xtmp = gen_reg_rtx (mmsk);
9937 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9938 gen_rtx_SUBREG (mmsk, xdst, 0)));
9939 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9940
9941 /* Calculate the approximate square root. */
9942 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9943 }
9944
9945 /* Finalize the approximation. */
9946 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9947
9948 return true;
9949 }
9950
9951 typedef rtx (*recpe_type) (rtx, rtx);
9952
9953 /* Select reciprocal initial estimate insn depending on machine mode. */
9954
9955 static recpe_type
get_recpe_type(machine_mode mode)9956 get_recpe_type (machine_mode mode)
9957 {
9958 switch (mode)
9959 {
9960 case E_SFmode: return (gen_aarch64_frecpesf);
9961 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9962 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9963 case E_DFmode: return (gen_aarch64_frecpedf);
9964 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9965 default: gcc_unreachable ();
9966 }
9967 }
9968
9969 typedef rtx (*recps_type) (rtx, rtx, rtx);
9970
9971 /* Select reciprocal series step insn depending on machine mode. */
9972
9973 static recps_type
get_recps_type(machine_mode mode)9974 get_recps_type (machine_mode mode)
9975 {
9976 switch (mode)
9977 {
9978 case E_SFmode: return (gen_aarch64_frecpssf);
9979 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9980 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9981 case E_DFmode: return (gen_aarch64_frecpsdf);
9982 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9983 default: gcc_unreachable ();
9984 }
9985 }
9986
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.
   The sequence uses FRECPE to get an initial reciprocal estimate of DEN,
   refines it with FRECPS Newton-Raphson steps, then multiplies by NUM.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  /* No approximate sequence is provided for half-precision.  */
  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
			        || (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  /* The approximation is only valid under unsafe/finite math and it
     costs extra instructions, so require the tuning structs (or
     -mlow-precision-div) to have asked for it explicitly.  */
  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  /* Vector forms of FRECPE/FRECPS need Advanced SIMD.  */
  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn ((*get_recpe_type (mode)) (xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance,
     while sacrificing the accuracy.  */
  if (flag_mlow_precision_div)
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));

      /* On the final iteration the FRECPS correction factor stays in
	 XTMP and is folded into the multiply at the end.  */
      if (iterations > 0)
	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
    }

  /* Finalize the approximation.  */
  emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
  return true;
}
10046
10047 /* Return the number of instructions that can be issued per cycle. */
10048 static int
aarch64_sched_issue_rate(void)10049 aarch64_sched_issue_rate (void)
10050 {
10051 return aarch64_tune_params.issue_rate;
10052 }
10053
10054 static int
aarch64_sched_first_cycle_multipass_dfa_lookahead(void)10055 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10056 {
10057 int issue_rate = aarch64_sched_issue_rate ();
10058
10059 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10060 }
10061
10062
/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  /* Defer entirely to the generic autoprefetcher heuristic.  */
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
10073
10074
10075 /* Vectorizer cost model target hooks. */
10076
10077 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10078 static int
aarch64_builtin_vectorization_cost(enum vect_cost_for_stmt type_of_cost,tree vectype,int misalign ATTRIBUTE_UNUSED)10079 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10080 tree vectype,
10081 int misalign ATTRIBUTE_UNUSED)
10082 {
10083 unsigned elements;
10084 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10085 bool fp = false;
10086
10087 if (vectype != NULL)
10088 fp = FLOAT_TYPE_P (vectype);
10089
10090 switch (type_of_cost)
10091 {
10092 case scalar_stmt:
10093 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10094
10095 case scalar_load:
10096 return costs->scalar_load_cost;
10097
10098 case scalar_store:
10099 return costs->scalar_store_cost;
10100
10101 case vector_stmt:
10102 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10103
10104 case vector_load:
10105 return costs->vec_align_load_cost;
10106
10107 case vector_store:
10108 return costs->vec_store_cost;
10109
10110 case vec_to_scalar:
10111 return costs->vec_to_scalar_cost;
10112
10113 case scalar_to_vec:
10114 return costs->scalar_to_vec_cost;
10115
10116 case unaligned_load:
10117 case vector_gather_load:
10118 return costs->vec_unalign_load_cost;
10119
10120 case unaligned_store:
10121 case vector_scatter_store:
10122 return costs->vec_unalign_store_cost;
10123
10124 case cond_branch_taken:
10125 return costs->cond_taken_branch_cost;
10126
10127 case cond_branch_not_taken:
10128 return costs->cond_not_taken_branch_cost;
10129
10130 case vec_perm:
10131 return costs->vec_permute_cost;
10132
10133 case vec_promote_demote:
10134 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10135
10136 case vec_construct:
10137 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10138 return elements / 2 + 1;
10139
10140 default:
10141 gcc_unreachable ();
10142 }
10143 }
10144
10145 /* Implement targetm.vectorize.add_stmt_cost. */
10146 static unsigned
aarch64_add_stmt_cost(void * data,int count,enum vect_cost_for_stmt kind,struct _stmt_vec_info * stmt_info,int misalign,enum vect_cost_model_location where)10147 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10148 struct _stmt_vec_info *stmt_info, int misalign,
10149 enum vect_cost_model_location where)
10150 {
10151 unsigned *cost = (unsigned *) data;
10152 unsigned retval = 0;
10153
10154 if (flag_vect_cost_model)
10155 {
10156 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10157 int stmt_cost =
10158 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10159
10160 /* Statements in an inner loop relative to the loop being
10161 vectorized are weighted more heavily. The value here is
10162 arbitrary and could potentially be improved with analysis. */
10163 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10164 count *= 50; /* FIXME */
10165
10166 retval = (unsigned) (count * stmt_cost);
10167 cost[where] += retval;
10168 }
10169
10170 return retval;
10171 }
10172
10173 static void initialize_aarch64_code_model (struct gcc_options *);
10174
10175 /* Parse the TO_PARSE string and put the architecture struct that it
10176 selects into RES and the architectural features into ISA_FLAGS.
10177 Return an aarch64_parse_opt_result describing the parse result.
10178 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10179
10180 static enum aarch64_parse_opt_result
aarch64_parse_arch(const char * to_parse,const struct processor ** res,unsigned long * isa_flags)10181 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10182 unsigned long *isa_flags)
10183 {
10184 char *ext;
10185 const struct processor *arch;
10186 char *str = (char *) alloca (strlen (to_parse) + 1);
10187 size_t len;
10188
10189 strcpy (str, to_parse);
10190
10191 ext = strchr (str, '+');
10192
10193 if (ext != NULL)
10194 len = ext - str;
10195 else
10196 len = strlen (str);
10197
10198 if (len == 0)
10199 return AARCH64_PARSE_MISSING_ARG;
10200
10201
10202 /* Loop through the list of supported ARCHes to find a match. */
10203 for (arch = all_architectures; arch->name != NULL; arch++)
10204 {
10205 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10206 {
10207 unsigned long isa_temp = arch->flags;
10208
10209 if (ext != NULL)
10210 {
10211 /* TO_PARSE string contains at least one extension. */
10212 enum aarch64_parse_opt_result ext_res
10213 = aarch64_parse_extension (ext, &isa_temp);
10214
10215 if (ext_res != AARCH64_PARSE_OK)
10216 return ext_res;
10217 }
10218 /* Extension parsing was successful. Confirm the result
10219 arch and ISA flags. */
10220 *res = arch;
10221 *isa_flags = isa_temp;
10222 return AARCH64_PARSE_OK;
10223 }
10224 }
10225
10226 /* ARCH name not found in list. */
10227 return AARCH64_PARSE_INVALID_ARG;
10228 }
10229
/* Parse the TO_PARSE string and put the result tuning in RES and the
   architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
   describing the parse result.  If there is an error parsing, RES and
   ISA_FLAGS are left unchanged.  */

static enum aarch64_parse_opt_result
aarch64_parse_cpu (const char *to_parse, const struct processor **res,
		   unsigned long *isa_flags)
{
  char *ext;
  const struct processor *cpu;
  /* Writable scratch copy of TO_PARSE; the CPU name is everything up to
     the first '+', any remainder is a feature-modifier list.  */
  char *str = (char *) alloca (strlen (to_parse) + 1);
  size_t len;

  strcpy (str, to_parse);

  ext = strchr (str, '+');

  if (ext != NULL)
    len = ext - str;
  else
    len = strlen (str);

  if (len == 0)
    return AARCH64_PARSE_MISSING_ARG;


  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
	{
	  unsigned long isa_temp = cpu->flags;


	  if (ext != NULL)
	    {
	      /* TO_PARSE string contains at least one extension.  */
	      enum aarch64_parse_opt_result ext_res
		= aarch64_parse_extension (ext, &isa_temp);

	      if (ext_res != AARCH64_PARSE_OK)
		return ext_res;
	    }
	  /* Extension parsing was successful.  Confirm the result
	     cpu and ISA flags.  */
	  *res = cpu;
	  *isa_flags = isa_temp;
	  return AARCH64_PARSE_OK;
	}
    }

  /* CPU name not found in list.  */
  return AARCH64_PARSE_INVALID_ARG;
}
10285
10286 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10287 Return an aarch64_parse_opt_result describing the parse result.
10288 If the parsing fails the RES does not change. */
10289
10290 static enum aarch64_parse_opt_result
aarch64_parse_tune(const char * to_parse,const struct processor ** res)10291 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10292 {
10293 const struct processor *cpu;
10294 char *str = (char *) alloca (strlen (to_parse) + 1);
10295
10296 strcpy (str, to_parse);
10297
10298 /* Loop through the list of supported CPUs to find a match. */
10299 for (cpu = all_cores; cpu->name != NULL; cpu++)
10300 {
10301 if (strcmp (cpu->name, str) == 0)
10302 {
10303 *res = cpu;
10304 return AARCH64_PARSE_OK;
10305 }
10306 }
10307
10308 /* CPU name not found in list. */
10309 return AARCH64_PARSE_INVALID_ARG;
10310 }
10311
10312 /* Parse TOKEN, which has length LENGTH to see if it is an option
10313 described in FLAG. If it is, return the index bit for that fusion type.
10314 If not, error (printing OPTION_NAME) and return zero. */
10315
10316 static unsigned int
aarch64_parse_one_option_token(const char * token,size_t length,const struct aarch64_flag_desc * flag,const char * option_name)10317 aarch64_parse_one_option_token (const char *token,
10318 size_t length,
10319 const struct aarch64_flag_desc *flag,
10320 const char *option_name)
10321 {
10322 for (; flag->name != NULL; flag++)
10323 {
10324 if (length == strlen (flag->name)
10325 && !strncmp (flag->name, token, length))
10326 return flag->flag;
10327 }
10328
10329 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10330 return 0;
10331 }
10332
/* Parse OPTION which is a comma-separated list of flags to enable.
   FLAGS gives the list of flags we understand, INITIAL_STATE gives any
   default state we inherit from the CPU tuning structures.  OPTION_NAME
   gives the top-level option we are parsing in the -moverride string,
   for use in error messages.  */

static unsigned int
aarch64_parse_boolean_options (const char *option,
			       const struct aarch64_flag_desc *flags,
			       unsigned int initial_state,
			       const char *option_name)
{
  const char separator = '.';
  const char* specs = option;
  const char* ntoken = option;
  unsigned int found_flags = initial_state;

  /* Walk every '.'-separated token; SPECS tracks the start of the token
     currently being parsed, NTOKEN its terminating separator.  */
  while ((ntoken = strchr (specs, separator)))
    {
      size_t token_length = ntoken - specs;
      unsigned token_ops = aarch64_parse_one_option_token (specs,
							   token_length,
							   flags,
							   option_name);
      /* If we find "none" (or, for simplicity's sake, an error) anywhere
	 in the token stream, reset the supported operations.  So:

	 adrp+add.cmp+branch.none.adrp+add

	 would have the result of turning on only adrp+add fusion.  */
      if (!token_ops)
	found_flags = 0;

      found_flags |= token_ops;
      specs = ++ntoken;
    }

  /* We ended with a comma, print something.  */
  if (!(*specs))
    {
      error ("%s string ill-formed\n", option_name);
      return 0;
    }

  /* We still have one more token to parse.  */
  size_t token_length = strlen (specs);
  unsigned token_ops = aarch64_parse_one_option_token (specs,
						       token_length,
						       flags,
						       option_name);
  if (!token_ops)
    found_flags = 0;

  found_flags |= token_ops;
  return found_flags;
}
10389
10390 /* Support for overriding instruction fusion. */
10391
10392 static void
aarch64_parse_fuse_string(const char * fuse_string,struct tune_params * tune)10393 aarch64_parse_fuse_string (const char *fuse_string,
10394 struct tune_params *tune)
10395 {
10396 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10397 aarch64_fusible_pairs,
10398 tune->fusible_ops,
10399 "fuse=");
10400 }
10401
10402 /* Support for overriding other tuning flags. */
10403
10404 static void
aarch64_parse_tune_string(const char * tune_string,struct tune_params * tune)10405 aarch64_parse_tune_string (const char *tune_string,
10406 struct tune_params *tune)
10407 {
10408 tune->extra_tuning_flags
10409 = aarch64_parse_boolean_options (tune_string,
10410 aarch64_tuning_flags,
10411 tune->extra_tuning_flags,
10412 "tune=");
10413 }
10414
10415 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
10416 we understand. If it is, extract the option string and handoff to
10417 the appropriate function. */
10418
10419 void
aarch64_parse_one_override_token(const char * token,size_t length,struct tune_params * tune)10420 aarch64_parse_one_override_token (const char* token,
10421 size_t length,
10422 struct tune_params *tune)
10423 {
10424 const struct aarch64_tuning_override_function *fn
10425 = aarch64_tuning_override_functions;
10426
10427 const char *option_part = strchr (token, '=');
10428 if (!option_part)
10429 {
10430 error ("tuning string missing in option (%s)", token);
10431 return;
10432 }
10433
10434 /* Get the length of the option name. */
10435 length = option_part - token;
10436 /* Skip the '=' to get to the option string. */
10437 option_part++;
10438
10439 for (; fn->name != NULL; fn++)
10440 {
10441 if (!strncmp (fn->name, token, length))
10442 {
10443 fn->parse_override (option_part, tune);
10444 return;
10445 }
10446 }
10447
10448 error ("unknown tuning option (%s)",token);
10449 return;
10450 }
10451
10452 /* A checking mechanism for the implementation of the tls size. */
10453
10454 static void
initialize_aarch64_tls_size(struct gcc_options * opts)10455 initialize_aarch64_tls_size (struct gcc_options *opts)
10456 {
10457 if (aarch64_tls_size == 0)
10458 aarch64_tls_size = 24;
10459
10460 switch (opts->x_aarch64_cmodel_var)
10461 {
10462 case AARCH64_CMODEL_TINY:
10463 /* Both the default and maximum TLS size allowed under tiny is 1M which
10464 needs two instructions to address, so we clamp the size to 24. */
10465 if (aarch64_tls_size > 24)
10466 aarch64_tls_size = 24;
10467 break;
10468 case AARCH64_CMODEL_SMALL:
10469 /* The maximum TLS size allowed under small is 4G. */
10470 if (aarch64_tls_size > 32)
10471 aarch64_tls_size = 32;
10472 break;
10473 case AARCH64_CMODEL_LARGE:
10474 /* The maximum TLS size allowed under large is 16E.
10475 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10476 if (aarch64_tls_size > 48)
10477 aarch64_tls_size = 48;
10478 break;
10479 default:
10480 gcc_unreachable ();
10481 }
10482
10483 return;
10484 }
10485
10486 /* Parse STRING looking for options in the format:
10487 string :: option:string
10488 option :: name=substring
10489 name :: {a-z}
10490 substring :: defined by option. */
10491
10492 static void
aarch64_parse_override_string(const char * input_string,struct tune_params * tune)10493 aarch64_parse_override_string (const char* input_string,
10494 struct tune_params* tune)
10495 {
10496 const char separator = ':';
10497 size_t string_length = strlen (input_string) + 1;
10498 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10499 char *string = string_root;
10500 strncpy (string, input_string, string_length);
10501 string[string_length - 1] = '\0';
10502
10503 char* ntoken = string;
10504
10505 while ((ntoken = strchr (string, separator)))
10506 {
10507 size_t token_length = ntoken - string;
10508 /* Make this substring look like a string. */
10509 *ntoken = '\0';
10510 aarch64_parse_one_override_token (string, token_length, tune);
10511 string = ++ntoken;
10512 }
10513
10514 /* One last option to parse. */
10515 aarch64_parse_one_override_token (string, strlen (string), tune);
10516 free (string_root);
10517 }
10518
10519
/* Adjust option state in OPTS that depends on other options.  Per the
   PR 70044 note below this can run more than once for the same function,
   so every change here must be idempotent.  */

static void
aarch64_override_options_after_change_1 (struct gcc_options *opts)
{
  /* PR 70044: We have to be careful about being called multiple times for the
     same function.  This means all changes should be repeatable.  */

  /* If the frame pointer is enabled, set it to a special value that behaves
     similar to frame pointer omission.  If we don't do this all leaf functions
     will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
     If flag_omit_frame_pointer has this special value, we must force the
     frame pointer if not in a leaf function.  We also need to force it in a
     leaf function if flag_omit_frame_pointer is not set or if LR is used.  */
  if (opts->x_flag_omit_frame_pointer == 0)
    opts->x_flag_omit_frame_pointer = 2;

  /* If not optimizing for size, set the default
     alignment to what the target wants.  */
  if (!opts->x_optimize_size)
    {
      if (opts->x_align_loops <= 0)
	opts->x_align_loops = aarch64_tune_params.loop_align;
      if (opts->x_align_jumps <= 0)
	opts->x_align_jumps = aarch64_tune_params.jump_align;
      if (opts->x_align_functions <= 0)
	opts->x_align_functions = aarch64_tune_params.function_align;
    }

  /* We default to no pc-relative literal loads.  */

  aarch64_pcrelative_literal_loads = false;

  /* If -mpc-relative-literal-loads is set on the command line, this
     implies that the user asked for PC relative literal loads.  */
  if (opts->x_pcrelative_literal_loads == 1)
    aarch64_pcrelative_literal_loads = true;

  /* In the tiny memory model it makes no sense to disallow PC relative
     literal pool loads.  */
  if (aarch64_cmodel == AARCH64_CMODEL_TINY
      || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
    aarch64_pcrelative_literal_loads = true;

  /* When enabling the lower precision Newton series for the square root, also
     enable it for the reciprocal square root, since the latter is an
     intermediary step for the former.  */
  if (flag_mlow_precision_sqrt)
    flag_mrecip_low_precision_sqrt = true;
}
10568
/* 'Unpack' up the internal tuning structs and update the options
    in OPTS.  The caller must have set up selected_tune and selected_arch
    as all the other target-specific codegen decisions are
    derived from them.  */

void
aarch64_override_options_internal (struct gcc_options *opts)
{
  /* Record the tuning decisions implied by the selected CPU/arch.  */
  aarch64_tune_flags = selected_tune->flags;
  aarch64_tune = selected_tune->sched_core;
  /* Make a copy of the tuning parameters attached to the core, which
     we may later overwrite.  */
  aarch64_tune_params = *(selected_tune->tune);
  aarch64_architecture_version = selected_arch->architecture_version;

  /* Apply any -moverride tweaks on top of the copied tuning params.  */
  if (opts->x_aarch64_override_tune_string)
    aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
				   &aarch64_tune_params);

  /* This target defaults to strict volatile bitfields.  */
  if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
    opts->x_flag_strict_volatile_bitfields = 1;

  initialize_aarch64_code_model (opts);
  initialize_aarch64_tls_size (opts);

  /* Translate the tuning structs' autoprefetcher model into the
     scheduler's autopref queue-depth parameter.  */
  int queue_depth = 0;
  switch (aarch64_tune_params.autoprefetcher_model)
    {
    case tune_params::AUTOPREFETCHER_OFF:
      queue_depth = -1;
      break;
    case tune_params::AUTOPREFETCHER_WEAK:
      queue_depth = 0;
      break;
    case tune_params::AUTOPREFETCHER_STRONG:
      queue_depth = max_insn_queue_index + 1;
      break;
    default:
      gcc_unreachable ();
    }

  /* We don't mind passing in global_options_set here as we don't use
     the *options_set structs anyway.  */
  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
			 queue_depth,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* Set up parameters to be used in prefetching algorithm.  Do not
     override the defaults unless we are tuning for a core we have
     researched values for.  */
  if (aarch64_tune_params.prefetch->num_slots > 0)
    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
			   aarch64_tune_params.prefetch->num_slots,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
			   aarch64_tune_params.prefetch->l1_cache_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
			   aarch64_tune_params.prefetch->l1_cache_line_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);
  if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
    maybe_set_param_value (PARAM_L2_CACHE_SIZE,
			   aarch64_tune_params.prefetch->l2_cache_size,
			   opts->x_param_values,
			   global_options_set.x_param_values);

  /* Use the alternative scheduling-pressure algorithm by default.  */
  maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
			 opts->x_param_values,
			 global_options_set.x_param_values);

  /* Enable sw prefetching at specified optimization level for
     CPUS that have prefetch.  Lower optimization level threshold by 1
     when profiling is enabled.  */
  if (opts->x_flag_prefetch_loop_arrays < 0
      && !opts->x_optimize_size
      && aarch64_tune_params.prefetch->default_opt_level >= 0
      && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
    opts->x_flag_prefetch_loop_arrays = 1;

  aarch64_override_options_after_change_1 (opts);
}
10658
10659 /* Print a hint with a suggestion for a core or architecture name that
10660 most closely resembles what the user passed in STR. ARCH is true if
10661 the user is asking for an architecture name. ARCH is false if the user
10662 is asking for a core name. */
10663
10664 static void
aarch64_print_hint_for_core_or_arch(const char * str,bool arch)10665 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10666 {
10667 auto_vec<const char *> candidates;
10668 const struct processor *entry = arch ? all_architectures : all_cores;
10669 for (; entry->name != NULL; entry++)
10670 candidates.safe_push (entry->name);
10671
10672 #ifdef HAVE_LOCAL_CPU_DETECT
10673 /* Add also "native" as possible value. */
10674 if (arch)
10675 candidates.safe_push ("native");
10676 #endif
10677
10678 char *s;
10679 const char *hint = candidates_list_and_hint (str, s, candidates);
10680 if (hint)
10681 inform (input_location, "valid arguments are: %s;"
10682 " did you mean %qs?", s, hint);
10683 else
10684 inform (input_location, "valid arguments are: %s", s);
10685
10686 XDELETEVEC (s);
10687 }
10688
/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  Thin wrapper around
   aarch64_print_hint_for_core_or_arch.  */

inline static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}
10697
/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  Thin wrapper around
   aarch64_print_hint_for_core_or_arch.  */

inline static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}
10706
10707 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10708 specified in STR and throw errors if appropriate. Put the results if
10709 they are valid in RES and ISA_FLAGS. Return whether the option is
10710 valid. */
10711
10712 static bool
aarch64_validate_mcpu(const char * str,const struct processor ** res,unsigned long * isa_flags)10713 aarch64_validate_mcpu (const char *str, const struct processor **res,
10714 unsigned long *isa_flags)
10715 {
10716 enum aarch64_parse_opt_result parse_res
10717 = aarch64_parse_cpu (str, res, isa_flags);
10718
10719 if (parse_res == AARCH64_PARSE_OK)
10720 return true;
10721
10722 switch (parse_res)
10723 {
10724 case AARCH64_PARSE_MISSING_ARG:
10725 error ("missing cpu name in %<-mcpu=%s%>", str);
10726 break;
10727 case AARCH64_PARSE_INVALID_ARG:
10728 error ("unknown value %qs for -mcpu", str);
10729 aarch64_print_hint_for_core (str);
10730 break;
10731 case AARCH64_PARSE_INVALID_FEATURE:
10732 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10733 break;
10734 default:
10735 gcc_unreachable ();
10736 }
10737
10738 return false;
10739 }
10740
10741 /* Validate a command-line -march option. Parse the arch and extensions
10742 (if any) specified in STR and throw errors if appropriate. Put the
10743 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10744 option is valid. */
10745
10746 static bool
aarch64_validate_march(const char * str,const struct processor ** res,unsigned long * isa_flags)10747 aarch64_validate_march (const char *str, const struct processor **res,
10748 unsigned long *isa_flags)
10749 {
10750 enum aarch64_parse_opt_result parse_res
10751 = aarch64_parse_arch (str, res, isa_flags);
10752
10753 if (parse_res == AARCH64_PARSE_OK)
10754 return true;
10755
10756 switch (parse_res)
10757 {
10758 case AARCH64_PARSE_MISSING_ARG:
10759 error ("missing arch name in %<-march=%s%>", str);
10760 break;
10761 case AARCH64_PARSE_INVALID_ARG:
10762 error ("unknown value %qs for -march", str);
10763 aarch64_print_hint_for_arch (str);
10764 break;
10765 case AARCH64_PARSE_INVALID_FEATURE:
10766 error ("invalid feature modifier in %<-march=%s%>", str);
10767 break;
10768 default:
10769 gcc_unreachable ();
10770 }
10771
10772 return false;
10773 }
10774
10775 /* Validate a command-line -mtune option. Parse the cpu
10776 specified in STR and throw errors if appropriate. Put the
10777 result, if it is valid, in RES. Return whether the option is
10778 valid. */
10779
10780 static bool
aarch64_validate_mtune(const char * str,const struct processor ** res)10781 aarch64_validate_mtune (const char *str, const struct processor **res)
10782 {
10783 enum aarch64_parse_opt_result parse_res
10784 = aarch64_parse_tune (str, res);
10785
10786 if (parse_res == AARCH64_PARSE_OK)
10787 return true;
10788
10789 switch (parse_res)
10790 {
10791 case AARCH64_PARSE_MISSING_ARG:
10792 error ("missing cpu name in %<-mtune=%s%>", str);
10793 break;
10794 case AARCH64_PARSE_INVALID_ARG:
10795 error ("unknown value %qs for -mtune", str);
10796 aarch64_print_hint_for_core (str);
10797 break;
10798 default:
10799 gcc_unreachable ();
10800 }
10801 return false;
10802 }
10803
10804 /* Return the CPU corresponding to the enum CPU.
10805 If it doesn't specify a cpu, return the default. */
10806
10807 static const struct processor *
aarch64_get_tune_cpu(enum aarch64_processor cpu)10808 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10809 {
10810 if (cpu != aarch64_none)
10811 return &all_cores[cpu];
10812
10813 /* The & 0x3f is to extract the bottom 6 bits that encode the
10814 default cpu as selected by the --with-cpu GCC configure option
10815 in config.gcc.
10816 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10817 flags mechanism should be reworked to make it more sane. */
10818 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10819 }
10820
10821 /* Return the architecture corresponding to the enum ARCH.
10822 If it doesn't specify a valid architecture, return the default. */
10823
10824 static const struct processor *
aarch64_get_arch(enum aarch64_arch arch)10825 aarch64_get_arch (enum aarch64_arch arch)
10826 {
10827 if (arch != aarch64_no_arch)
10828 return &all_architectures[arch];
10829
10830 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10831
10832 return &all_architectures[cpu->arch];
10833 }
10834
10835 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10836
10837 static poly_uint16
aarch64_convert_sve_vector_bits(aarch64_sve_vector_bits_enum value)10838 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10839 {
10840 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10841 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10842 deciding which .md file patterns to use and when deciding whether
10843 something is a legitimate address or constant. */
10844 if (value == SVE_SCALABLE || value == SVE_128)
10845 return poly_uint16 (2, 2);
10846 else
10847 return (int) value / 64;
10848 }
10849
/* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
   and is used to parse the -m{cpu,tune,arch} strings and setup the initial
   tuning structs.  In particular it must set selected_tune and
   aarch64_isa_flags that define the available ISA features and tuning
   decisions.  It must also set selected_arch as this will be used to
   output the .arch asm tags for each function.  */

static void
aarch64_override_options (void)
{
  unsigned long cpu_isa = 0;
  unsigned long arch_isa = 0;
  aarch64_isa_flags = 0;

  /* Whether each explicitly-given option parsed cleanly; consulted by the
     sanity-check asserts near the end of this function.  */
  bool valid_cpu = true;
  bool valid_tune = true;
  bool valid_arch = true;

  selected_cpu = NULL;
  selected_arch = NULL;
  selected_tune = NULL;

  /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
     If either of -march or -mtune is given, they override their
     respective component of -mcpu.  */
  if (aarch64_cpu_string)
    valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
				       &cpu_isa);

  if (aarch64_arch_string)
    valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
					 &arch_isa);

  if (aarch64_tune_string)
    valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);

  /* If the user did not specify a processor, choose the default
     one for them.  This will be the CPU set during configuration using
     --with-cpu, otherwise it is "generic".  */
  if (!selected_cpu)
    {
      if (selected_arch)
	{
	  selected_cpu = &all_cores[selected_arch->ident];
	  aarch64_isa_flags = arch_isa;
	  explicit_arch = selected_arch->arch;
	}
      else
	{
	  /* Get default configure-time CPU.  */
	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
	  /* NOTE(review): assumes TARGET_CPU_DEFAULT packs the default
	     CPU's ISA flags in the bits above bit 5 — confirm against the
	     definition of TARGET_CPU_DEFAULT.  */
	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
	}

      if (selected_tune)
	explicit_tune_core = selected_tune->ident;
    }
  /* If both -mcpu and -march are specified check that they are architecturally
     compatible, warn if they're not and prefer the -march ISA flags.  */
  else if (selected_arch)
    {
      if (selected_arch->arch != selected_cpu->arch)
	{
	  warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
		   all_architectures[selected_cpu->arch].name,
		   selected_arch->name);
	}
      aarch64_isa_flags = arch_isa;
      explicit_arch = selected_arch->arch;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
    }
  else
    {
      /* -mcpu but no -march: derive the architecture from the CPU.  */
      aarch64_isa_flags = cpu_isa;
      explicit_tune_core = selected_tune ? selected_tune->ident
					 : selected_cpu->ident;
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
      explicit_arch = selected_arch->arch;
    }

  /* Set the arch as well as we will need it when outputing
     the .arch directive in assembly.  */
  if (!selected_arch)
    {
      gcc_assert (selected_cpu);
      selected_arch = &all_architectures[selected_cpu->arch];
    }

  if (!selected_tune)
    selected_tune = selected_cpu;

#ifndef HAVE_AS_MABI_OPTION
  /* The compiler may have been configured with 2.23.* binutils, which does
     not have support for ILP32.  */
  if (TARGET_ILP32)
    error ("assembler does not support -mabi=ilp32");
#endif

  /* Convert -msve-vector-bits to a VG count.  */
  aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);

  if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
    sorry ("return address signing is only supported for -mabi=lp64");

  /* Make sure we properly set up the explicit options.  */
  if ((aarch64_cpu_string && valid_cpu)
       || (aarch64_tune_string && valid_tune))
    gcc_assert (explicit_tune_core != aarch64_none);

  if ((aarch64_cpu_string && valid_cpu)
       || (aarch64_arch_string && valid_arch))
    gcc_assert (explicit_arch != aarch64_no_arch);

  aarch64_override_options_internal (&global_options);

  /* Save these options as the default ones in case we push and pop them later
     while processing functions with potential target attributes.  */
  target_option_default_node = target_option_current_node
    = build_target_option_node (&global_options);
}
10973
/* Implement targetm.override_options_after_change.  Re-run the parts of
   option processing that depend on optimization flags after those flags
   change (e.g. via optimize attributes or pragmas).  */

static void
aarch64_override_options_after_change (void)
{
  aarch64_override_options_after_change_1 (&global_options);
}
10981
10982 static struct machine_function *
aarch64_init_machine_status(void)10983 aarch64_init_machine_status (void)
10984 {
10985 struct machine_function *machine;
10986 machine = ggc_cleared_alloc<machine_function> ();
10987 return machine;
10988 }
10989
/* Install aarch64_init_machine_status as the allocator for the
   per-function machine_function structure.  */

void
aarch64_init_expanders (void)
{
  init_machine_status = aarch64_init_machine_status;
}
10995
10996 /* A checking mechanism for the implementation of the various code models. */
10997 static void
initialize_aarch64_code_model(struct gcc_options * opts)10998 initialize_aarch64_code_model (struct gcc_options *opts)
10999 {
11000 if (opts->x_flag_pic)
11001 {
11002 switch (opts->x_aarch64_cmodel_var)
11003 {
11004 case AARCH64_CMODEL_TINY:
11005 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11006 break;
11007 case AARCH64_CMODEL_SMALL:
11008 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11009 aarch64_cmodel = (flag_pic == 2
11010 ? AARCH64_CMODEL_SMALL_PIC
11011 : AARCH64_CMODEL_SMALL_SPIC);
11012 #else
11013 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11014 #endif
11015 break;
11016 case AARCH64_CMODEL_LARGE:
11017 sorry ("code model %qs with -f%s", "large",
11018 opts->x_flag_pic > 1 ? "PIC" : "pic");
11019 break;
11020 default:
11021 gcc_unreachable ();
11022 }
11023 }
11024 else
11025 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11026 }
11027
/* Implement TARGET_OPTION_SAVE.  Record in PTR the target-specific state
   from OPTS not covered by the generic option-save machinery; currently
   that is only the -moverride tuning string.  */

static void
aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
{
  ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
}
11035
/* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
   using the information saved in PTR: refresh the selected_tune and
   selected_arch globals from the saved identifiers, then re-run the
   internal override to recompute all dependent state.  */

static void
aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
{
  opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
  selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
  opts->x_explicit_arch = ptr->x_explicit_arch;
  selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
  opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;

  /* Recompute tuning structs etc. from the restored options.  */
  aarch64_override_options_internal (opts);
}
11050
11051 /* Implement TARGET_OPTION_PRINT. */
11052
11053 static void
aarch64_option_print(FILE * file,int indent,struct cl_target_option * ptr)11054 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11055 {
11056 const struct processor *cpu
11057 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11058 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11059 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11060 std::string extension
11061 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11062
11063 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11064 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11065 arch->name, extension.c_str ());
11066 }
11067
/* The FUNCTION_DECL last processed by aarch64_set_current_function;
   used to avoid re-applying identical target state.  GC-rooted via GTY.  */
static GTY(()) tree aarch64_previous_fndecl;

/* Forget the previously processed function so the next call to
   aarch64_set_current_function re-applies target state unconditionally.  */
void
aarch64_reset_previous_fndecl (void)
{
  aarch64_previous_fndecl = NULL;
}
11075
11076 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11077 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11078 make sure optab availability predicates are recomputed when necessary. */
11079
11080 void
aarch64_save_restore_target_globals(tree new_tree)11081 aarch64_save_restore_target_globals (tree new_tree)
11082 {
11083 if (TREE_TARGET_GLOBALS (new_tree))
11084 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11085 else if (new_tree == target_option_default_node)
11086 restore_target_globals (&default_target_globals);
11087 else
11088 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11089 }
11090
/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  /* Nothing to do for a null decl or one we have just processed.  */
  if (!fndecl || fndecl == aarch64_previous_fndecl)
    return;

  tree old_tree = (aarch64_previous_fndecl
		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
		   : NULL_TREE);

  tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  /* If current function has no attributes but the previous one did,
     use the default node.  */
  if (!new_tree && old_tree)
    new_tree = target_option_default_node;

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree)
    return;

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));

  /* Then refresh the target globals so optab availability predicates and
     related state are recomputed for the new options.  */
  aarch64_save_restore_target_globals (new_tree);
}
11127
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};
11138
/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
11159
11160 /* Handle the ARCH_STR argument to the arch= target attribute. */
11161
11162 static bool
aarch64_handle_attr_arch(const char * str)11163 aarch64_handle_attr_arch (const char *str)
11164 {
11165 const struct processor *tmp_arch = NULL;
11166 enum aarch64_parse_opt_result parse_res
11167 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11168
11169 if (parse_res == AARCH64_PARSE_OK)
11170 {
11171 gcc_assert (tmp_arch);
11172 selected_arch = tmp_arch;
11173 explicit_arch = selected_arch->arch;
11174 return true;
11175 }
11176
11177 switch (parse_res)
11178 {
11179 case AARCH64_PARSE_MISSING_ARG:
11180 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11181 break;
11182 case AARCH64_PARSE_INVALID_ARG:
11183 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11184 aarch64_print_hint_for_arch (str);
11185 break;
11186 case AARCH64_PARSE_INVALID_FEATURE:
11187 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11188 break;
11189 default:
11190 gcc_unreachable ();
11191 }
11192
11193 return false;
11194 }
11195
11196 /* Handle the argument CPU_STR to the cpu= target attribute. */
11197
11198 static bool
aarch64_handle_attr_cpu(const char * str)11199 aarch64_handle_attr_cpu (const char *str)
11200 {
11201 const struct processor *tmp_cpu = NULL;
11202 enum aarch64_parse_opt_result parse_res
11203 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11204
11205 if (parse_res == AARCH64_PARSE_OK)
11206 {
11207 gcc_assert (tmp_cpu);
11208 selected_tune = tmp_cpu;
11209 explicit_tune_core = selected_tune->ident;
11210
11211 selected_arch = &all_architectures[tmp_cpu->arch];
11212 explicit_arch = selected_arch->arch;
11213 return true;
11214 }
11215
11216 switch (parse_res)
11217 {
11218 case AARCH64_PARSE_MISSING_ARG:
11219 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11220 break;
11221 case AARCH64_PARSE_INVALID_ARG:
11222 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11223 aarch64_print_hint_for_core (str);
11224 break;
11225 case AARCH64_PARSE_INVALID_FEATURE:
11226 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11227 break;
11228 default:
11229 gcc_unreachable ();
11230 }
11231
11232 return false;
11233 }
11234
11235 /* Handle the argument STR to the tune= target attribute. */
11236
11237 static bool
aarch64_handle_attr_tune(const char * str)11238 aarch64_handle_attr_tune (const char *str)
11239 {
11240 const struct processor *tmp_tune = NULL;
11241 enum aarch64_parse_opt_result parse_res
11242 = aarch64_parse_tune (str, &tmp_tune);
11243
11244 if (parse_res == AARCH64_PARSE_OK)
11245 {
11246 gcc_assert (tmp_tune);
11247 selected_tune = tmp_tune;
11248 explicit_tune_core = selected_tune->ident;
11249 return true;
11250 }
11251
11252 switch (parse_res)
11253 {
11254 case AARCH64_PARSE_INVALID_ARG:
11255 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11256 aarch64_print_hint_for_core (str);
11257 break;
11258 default:
11259 gcc_unreachable ();
11260 }
11261
11262 return false;
11263 }
11264
11265 /* Parse an architecture extensions target attribute string specified in STR.
11266 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11267 if successful. Update aarch64_isa_flags to reflect the ISA features
11268 modified. */
11269
11270 static bool
aarch64_handle_attr_isa_flags(char * str)11271 aarch64_handle_attr_isa_flags (char *str)
11272 {
11273 enum aarch64_parse_opt_result parse_res;
11274 unsigned long isa_flags = aarch64_isa_flags;
11275
11276 /* We allow "+nothing" in the beginning to clear out all architectural
11277 features if the user wants to handpick specific features. */
11278 if (strncmp ("+nothing", str, 8) == 0)
11279 {
11280 isa_flags = 0;
11281 str += 8;
11282 }
11283
11284 parse_res = aarch64_parse_extension (str, &isa_flags);
11285
11286 if (parse_res == AARCH64_PARSE_OK)
11287 {
11288 aarch64_isa_flags = isa_flags;
11289 return true;
11290 }
11291
11292 switch (parse_res)
11293 {
11294 case AARCH64_PARSE_MISSING_ARG:
11295 error ("missing value in %<target()%> pragma or attribute");
11296 break;
11297
11298 case AARCH64_PARSE_INVALID_FEATURE:
11299 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11300 break;
11301
11302 default:
11303 gcc_unreachable ();
11304 }
11305
11306 return false;
11307 }
11308
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  /* Sentinel: a NULL name terminates the lookup loop in
     aarch64_process_one_target_attr.  */
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
11335
/* Parse ARG_STR which contains the definition of one target attribute.
   Show appropriate errors if any or return true if the attribute is valid.  */

static bool
aarch64_process_one_target_attr (char *arg_str)
{
  bool invert = false;

  size_t len = strlen (arg_str);

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Work on a stack copy so the "no-" and "=" handling below can modify
     the string without touching the caller's buffer.  */
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, arg_str);

  /* Skip leading whitespace.  */
  while (*str_to_check == ' ' || *str_to_check == '\t')
    str_to_check++;

  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
  if (*str_to_check == '+')
    return aarch64_handle_attr_isa_flags (str_to_check);

  /* NOTE(review): LEN still counts any leading whitespace skipped above,
     so this guard is slightly loose for inputs with leading blanks —
     confirm whether attribute strings can reach here with such padding.  */
  if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
    {
      invert = true;
      str_to_check += 3;
    }
  char *arg = strchr (str_to_check, '=');

  /* If we found opt=foo then terminate STR_TO_CHECK at the '='
     and point ARG to "foo".  */
  if (arg)
    {
      *arg = '\0';
      arg++;
    }
  const struct aarch64_attribute_info *p_attr;
  bool found = false;
  for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
    {
      /* If the names don't match up, or the user has given an argument
	 to an attribute that doesn't accept one, or didn't give an argument
	 to an attribute that expects one, fail to match.  */
      if (strcmp (str_to_check, p_attr->name) != 0)
	continue;

      found = true;
      bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
			      || p_attr->attr_type == aarch64_attr_enum;

      if (attr_need_arg_p ^ (arg != NULL))
	{
	  /* NOTE(review): this message is also emitted when a required
	     argument is missing, where "does not accept an argument"
	     reads oddly — consider a dedicated diagnostic for that
	     direction.  */
	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
	  return false;
	}

      /* If the name matches but the attribute does not allow "no-" versions
	 then we can't match.  */
      if (invert && !p_attr->allow_neg)
	{
	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
	  return false;
	}

      switch (p_attr->attr_type)
	{
	/* Has a custom handler registered.
	   For example, cpu=, arch=, tune=.  */
	case aarch64_attr_custom:
	  gcc_assert (p_attr->handler);
	  if (!p_attr->handler (arg))
	    return false;
	  break;

	/* Either set or unset a boolean option.  */
	case aarch64_attr_bool:
	  {
	    struct cl_decoded_option decoded;

	    generate_option (p_attr->opt_num, NULL, !invert,
			     CL_TARGET, &decoded);
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Set or unset a bit in the target_flags.  aarch64_handle_option
	   should know what mask to apply given the option number.  */
	case aarch64_attr_mask:
	  {
	    struct cl_decoded_option decoded;
	    /* We only need to specify the option number.
	       aarch64_handle_option will know which mask to apply.  */
	    decoded.opt_index = p_attr->opt_num;
	    decoded.value = !invert;
	    aarch64_handle_option (&global_options, &global_options_set,
				   &decoded, input_location);
	    break;
	  }
	/* Use the option setting machinery to set an option to an enum.  */
	case aarch64_attr_enum:
	  {
	    gcc_assert (arg);
	    bool valid;
	    int value;
	    valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
					   &value, CL_TARGET);
	    if (valid)
	      {
		set_option (&global_options, NULL, p_attr->opt_num, value,
			    NULL, DK_UNSPECIFIED, input_location,
			    global_dc);
	      }
	    else
	      {
		error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
	      }
	    break;
	  }
	default:
	  gcc_unreachable ();
	}
    }

  /* If we reached here we either have found an attribute and validated
     it or didn't match any.  If we matched an attribute but its arguments
     were malformed we will have returned false already.  */
  return found;
}
11472
/* Count how many times the character C appears in
   NUL-terminated string STR.  */

static unsigned int
num_occurences_in_str (char c, char *str)
{
  unsigned int count = 0;

  for (const char *p = str; *p != '\0'; p++)
    if (*p == c)
      count++;

  return count;
}
11490
/* Parse the tree in ARGS that contains the target attribute information
   and update the global target options space.  Return true if every
   attribute parsed successfully, false after diagnosing the first
   failure.  */

bool
aarch64_process_target_attr (tree args)
{
  /* A TREE_LIST carries several attribute strings; recurse on each.  */
  if (TREE_CODE (args) == TREE_LIST)
    {
      do
	{
	  tree head = TREE_VALUE (args);
	  if (head)
	    {
	      if (!aarch64_process_target_attr (head))
		return false;
	    }
	  args = TREE_CHAIN (args);
	} while (args);

      return true;
    }

  if (TREE_CODE (args) != STRING_CST)
    {
      error ("attribute %<target%> argument not a string");
      return false;
    }

  /* Copy onto the stack: strtok below mutates the string in place.  */
  size_t len = strlen (TREE_STRING_POINTER (args));
  char *str_to_check = (char *) alloca (len + 1);
  strcpy (str_to_check, TREE_STRING_POINTER (args));

  if (len == 0)
    {
      error ("malformed %<target()%> pragma or attribute");
      return false;
    }

  /* Used to catch empty spaces between commas i.e.
     attribute ((target ("attr1,,attr2"))).  */
  unsigned int num_commas = num_occurences_in_str (',', str_to_check);

  /* Handle multiple target attributes separated by ','.
     NOTE(review): strtok keeps static state and is not reentrant;
     presumably option parsing is single-threaded here, but strtok_r
     would be the defensive choice — confirm.  */
  char *token = strtok (str_to_check, ",");

  unsigned int num_attrs = 0;
  while (token)
    {
      num_attrs++;
      if (!aarch64_process_one_target_attr (token))
	{
	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
	  return false;
	}

      token = strtok (NULL, ",");
    }

  /* strtok collapses adjacent separators, so a mismatch here means a
     comma-separated field was empty.  */
  if (num_attrs != num_commas + 1)
    {
      error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
      return false;
    }

  return true;
}
11557
11558 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11559 process attribute ((target ("..."))). */
11560
11561 static bool
aarch64_option_valid_attribute_p(tree fndecl,tree,tree args,int)11562 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11563 {
11564 struct cl_target_option cur_target;
11565 bool ret;
11566 tree old_optimize;
11567 tree new_target, new_optimize;
11568 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11569
11570 /* If what we're processing is the current pragma string then the
11571 target option node is already stored in target_option_current_node
11572 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11573 having to re-parse the string. This is especially useful to keep
11574 arm_neon.h compile times down since that header contains a lot
11575 of intrinsics enclosed in pragmas. */
11576 if (!existing_target && args == current_target_pragma)
11577 {
11578 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11579 return true;
11580 }
11581 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11582
11583 old_optimize = build_optimization_node (&global_options);
11584 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11585
11586 /* If the function changed the optimization levels as well as setting
11587 target options, start with the optimizations specified. */
11588 if (func_optimize && func_optimize != old_optimize)
11589 cl_optimization_restore (&global_options,
11590 TREE_OPTIMIZATION (func_optimize));
11591
11592 /* Save the current target options to restore at the end. */
11593 cl_target_option_save (&cur_target, &global_options);
11594
11595 /* If fndecl already has some target attributes applied to it, unpack
11596 them so that we add this attribute on top of them, rather than
11597 overwriting them. */
11598 if (existing_target)
11599 {
11600 struct cl_target_option *existing_options
11601 = TREE_TARGET_OPTION (existing_target);
11602
11603 if (existing_options)
11604 cl_target_option_restore (&global_options, existing_options);
11605 }
11606 else
11607 cl_target_option_restore (&global_options,
11608 TREE_TARGET_OPTION (target_option_current_node));
11609
11610 ret = aarch64_process_target_attr (args);
11611
11612 /* Set up any additional state. */
11613 if (ret)
11614 {
11615 aarch64_override_options_internal (&global_options);
11616 /* Initialize SIMD builtins if we haven't already.
11617 Set current_target_pragma to NULL for the duration so that
11618 the builtin initialization code doesn't try to tag the functions
11619 being built with the attributes specified by any current pragma, thus
11620 going into an infinite recursion. */
11621 if (TARGET_SIMD)
11622 {
11623 tree saved_current_target_pragma = current_target_pragma;
11624 current_target_pragma = NULL;
11625 aarch64_init_simd_builtins ();
11626 current_target_pragma = saved_current_target_pragma;
11627 }
11628 new_target = build_target_option_node (&global_options);
11629 }
11630 else
11631 new_target = NULL;
11632
11633 new_optimize = build_optimization_node (&global_options);
11634
11635 if (fndecl && ret)
11636 {
11637 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11638
11639 if (old_optimize != new_optimize)
11640 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11641 }
11642
11643 cl_target_option_restore (&global_options, &cur_target);
11644
11645 if (old_optimize != new_optimize)
11646 cl_optimization_restore (&global_options,
11647 TREE_OPTIMIZATION (old_optimize));
11648 return ret;
11649 }
11650
/* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
   tri-bool options (yes, no, don't care) and the default value is
   DEF, determine whether to reject inlining.  Return true when inlining
   is acceptable.  */

static bool
aarch64_tribools_ok_for_inlining_p (int caller, int callee,
				    int dont_care, int def)
{
  /* If either side doesn't care, inlining is always fine.  */
  if (callee == dont_care || caller == dont_care)
    return true;

  /* Otherwise the callee must either match the caller exactly or be
     using the default setting.  */
  return callee == caller || callee == def;
}
11671
/* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
   to inline CALLEE into CALLER based on target-specific info.
   Make sure that the caller and callee have compatible architectural
   features.  Then go through the other possible target attributes
   and see if they can block inlining.  Try not to reject always_inline
   callees unless they are incompatible architecturally.  */

static bool
aarch64_can_inline_p (tree caller, tree callee)
{
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  /* If callee has no option attributes, then it is ok to inline.  */
  if (!callee_tree)
    return true;

  /* A caller without attributes uses the global default options.  */
  struct cl_target_option *caller_opts
    = TREE_TARGET_OPTION (caller_tree ? caller_tree
			   : target_option_default_node);

  struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);


  /* Callee's ISA flags should be a subset of the caller's.  */
  if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
       != callee_opts->x_aarch64_isa_flags)
    return false;

  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
  if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
       != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
      && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
    return false;

  bool always_inline = lookup_attribute ("always_inline",
					  DECL_ATTRIBUTES (callee));

  /* If the architectural features match up and the callee is always_inline
     then the other attributes don't matter.  */
  if (always_inline)
    return true;

  /* Code model and TLS dialect affect the code sequences generated, so
     they must agree between caller and callee.  */
  if (caller_opts->x_aarch64_cmodel_var
      != callee_opts->x_aarch64_cmodel_var)
    return false;

  if (caller_opts->x_aarch64_tls_dialect
      != callee_opts->x_aarch64_tls_dialect)
    return false;

  /* Honour explicit requests to workaround errata.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err835769,
	  callee_opts->x_aarch64_fix_a53_err835769,
	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
    return false;

  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_aarch64_fix_a53_err843419,
	  callee_opts->x_aarch64_fix_a53_err843419,
	  2, TARGET_FIX_ERR_A53_843419))
    return false;

  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
  if (!aarch64_tribools_ok_for_inlining_p (
	  caller_opts->x_flag_omit_leaf_frame_pointer,
	  callee_opts->x_flag_omit_leaf_frame_pointer,
	  2, 1))
    return false;

  /* If the callee has specific tuning overrides, respect them.  */
  if (callee_opts->x_aarch64_override_tune_string != NULL
      && caller_opts->x_aarch64_override_tune_string == NULL)
    return false;

  /* If the user specified tuning override strings for the
     caller and callee and they don't match up, reject inlining.
     We just do a string compare here, we don't analyze the meaning
     of the string, as it would be too costly for little gain.  */
  if (callee_opts->x_aarch64_override_tune_string
      && caller_opts->x_aarch64_override_tune_string
      && (strcmp (callee_opts->x_aarch64_override_tune_string,
		  caller_opts->x_aarch64_override_tune_string) != 0))
    return false;

  return true;
}
11763
/* Return true if SYMBOL_REF X binds locally.  Prefer the decl-based
   answer from the binds_local_p target hook when a decl is attached;
   otherwise fall back to the flag recorded on the SYMBOL_REF itself.  */

static bool
aarch64_symbol_binds_local_p (const_rtx x)
{
  return (SYMBOL_REF_DECL (x)
	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
	  : SYMBOL_REF_LOCAL_P (x));
}
11773
11774 /* Return true if SYMBOL_REF X is thread local */
11775 static bool
aarch64_tls_symbol_p(rtx x)11776 aarch64_tls_symbol_p (rtx x)
11777 {
11778 if (! TARGET_HAVE_TLS)
11779 return false;
11780
11781 if (GET_CODE (x) != SYMBOL_REF)
11782 return false;
11783
11784 return SYMBOL_REF_TLS_MODEL (x) != 0;
11785 }
11786
/* Classify a TLS symbol into one of the TLS kinds.  X is the symbol
   whose access model is queried via tls_symbolic_operand_type.  */
enum aarch64_symbol_type
aarch64_classify_tls_symbol (rtx x)
{
  enum tls_model tls_kind = tls_symbolic_operand_type (x);

  switch (tls_kind)
    {
    case TLS_MODEL_GLOBAL_DYNAMIC:
    case TLS_MODEL_LOCAL_DYNAMIC:
      /* Both dynamic models use the same access: TLS descriptors when
	 the target provides them, otherwise the general-dynamic
	 sequence.  */
      return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;

    case TLS_MODEL_INITIAL_EXEC:
      /* Initial-exec accesses depend on the code model's reach.  */
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	case AARCH64_CMODEL_TINY_PIC:
	  return SYMBOL_TINY_TLSIE;
	default:
	  return SYMBOL_SMALL_TLSIE;
	}

    case TLS_MODEL_LOCAL_EXEC:
      /* The local-exec variant is selected by aarch64_tls_size
	 (presumably the -mtls-size option; only these four values are
	 expected here).  */
      if (aarch64_tls_size == 12)
	return SYMBOL_TLSLE12;
      else if (aarch64_tls_size == 24)
	return SYMBOL_TLSLE24;
      else if (aarch64_tls_size == 32)
	return SYMBOL_TLSLE32;
      else if (aarch64_tls_size == 48)
	return SYMBOL_TLSLE48;
      else
	gcc_unreachable ();

    case TLS_MODEL_EMULATED:
    case TLS_MODEL_NONE:
      /* No real TLS access needed; address goes to the constant pool.  */
      return SYMBOL_FORCE_TO_MEM;

    default:
      gcc_unreachable ();
    }
}
11829
/* Return the correct method for accessing X + OFFSET, where X is either
   a SYMBOL_REF or LABEL_REF.  The result depends on the code model,
   whether the symbol binds locally, and whether OFFSET stays inside the
   model's addressable range.  */

enum aarch64_symbol_type
aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
{
  if (GET_CODE (x) == LABEL_REF)
    {
      /* Labels are always defined in this translation unit, so only the
	 code model matters.  */
      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_LARGE:
	  return SYMBOL_FORCE_TO_MEM;

	case AARCH64_CMODEL_TINY_PIC:
	case AARCH64_CMODEL_TINY:
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	case AARCH64_CMODEL_SMALL:
	  return SYMBOL_SMALL_ABSOLUTE;

	default:
	  gcc_unreachable ();
	}
    }

  if (GET_CODE (x) == SYMBOL_REF)
    {
      /* TLS symbols have their own classification scheme.  */
      if (aarch64_tls_symbol_p (x))
	return aarch64_classify_tls_symbol (x);

      switch (aarch64_cmodel)
	{
	case AARCH64_CMODEL_TINY:
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressible range of +/-1M in the
	     TINY code model.  So we rely on images not being greater than
	     1M and cap the offset at 1M and anything beyond 1M will have to
	     be loaded using an alternative mechanism.  Furthermore if the
	     symbol is a weak reference to something that isn't known to
	     resolve to a symbol in this module, then force to memory.  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, -1048575, 1048575))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL:
	  /* Same reasoning as the tiny code model, but the offset cap here is
	     4G.  (The bounds are deliberately asymmetric.)  */
	  if ((SYMBOL_REF_WEAK (x)
	       && !aarch64_symbol_binds_local_p (x))
	      || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
			    HOST_WIDE_INT_C (4294967264)))
	    return SYMBOL_FORCE_TO_MEM;
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_TINY_PIC:
	  /* Non-local symbols must be accessed through the GOT.  */
	  if (!aarch64_symbol_binds_local_p (x))
	    return SYMBOL_TINY_GOT;
	  return SYMBOL_TINY_ABSOLUTE;

	case AARCH64_CMODEL_SMALL_SPIC:
	case AARCH64_CMODEL_SMALL_PIC:
	  /* Non-local symbols go through the GOT; SPIC selects the 28K
	     (small-GOT) relocation variant.  */
	  if (!aarch64_symbol_binds_local_p (x))
	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
	  return SYMBOL_SMALL_ABSOLUTE;

	case AARCH64_CMODEL_LARGE:
	  /* This is alright even in PIC code as the constant
	     pool reference is always PC relative and within
	     the same translation unit.  */
	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
	    return SYMBOL_SMALL_ABSOLUTE;
	  else
	    return SYMBOL_FORCE_TO_MEM;

	default:
	  gcc_unreachable ();
	}
    }

  /* By default push everything into the constant pool.  */
  return SYMBOL_FORCE_TO_MEM;
}
11920
11921 bool
aarch64_constant_address_p(rtx x)11922 aarch64_constant_address_p (rtx x)
11923 {
11924 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11925 }
11926
11927 bool
aarch64_legitimate_pic_operand_p(rtx x)11928 aarch64_legitimate_pic_operand_p (rtx x)
11929 {
11930 if (GET_CODE (x) == SYMBOL_REF
11931 || (GET_CODE (x) == CONST
11932 && GET_CODE (XEXP (x, 0)) == PLUS
11933 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11934 return false;
11935
11936 return true;
11937 }
11938
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
  /* Support CSE and rematerialization of common constants.  */
  if (CONST_INT_P (x)
      || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || GET_CODE (x) == CONST_VECTOR)
    return true;

  /* Do not allow vector struct mode constants for Advanced SIMD.
     We could support 0 and -1 easily, but they need support in
     aarch64-simd.md.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  /* Only accept variable-length vector constants if they can be
     handled directly.

     ??? It would be possible to handle rematerialization of other
     constants via secondary reloads.  */
  if (vec_flags & VEC_ANY_SVE)
    return aarch64_simd_valid_immediate (x, NULL);

  /* Strip a HIGH wrapper so the checks below see the underlying
     expression.  */
  if (GET_CODE (x) == HIGH)
    x = XEXP (x, 0);

  /* Accept polynomial constants that can be calculated by using the
     destination of a move as the sole temporary.  Constants that
     require a second temporary cannot be rematerialized (they can't be
     forced to memory and also aren't legitimate constants).  */
  poly_int64 offset;
  if (poly_int_rtx_p (x, &offset))
    return aarch64_offset_temporaries (false, offset) <= 1;

  /* If an offset is being added to something else, we need to allow the
     base to be moved into the destination register, meaning that there
     are no free temporaries for the offset.  */
  x = strip_offset (x, &offset);
  if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
    return false;

  /* Do not allow const (plus (anchor_symbol, const_int)).  */
  if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
    return false;

  /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
     so spilling them is better than rematerialization.  */
  if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
    return true;

  /* Label references are always constant.  */
  if (GET_CODE (x) == LABEL_REF)
    return true;

  return false;
}
11999
12000 rtx
aarch64_load_tp(rtx target)12001 aarch64_load_tp (rtx target)
12002 {
12003 if (!target
12004 || GET_MODE (target) != Pmode
12005 || !register_operand (target, Pmode))
12006 target = gen_reg_rtx (Pmode);
12007
12008 /* Can return in any reg. */
12009 emit_insn (gen_aarch64_load_tp_hard (target));
12010 return target;
12011 }
12012
/* On AAPCS systems, this is the "struct __va_list" record type built by
   aarch64_build_builtin_va_list below; GTY-marked so it is preserved
   across garbage collection.  */
static GTY(()) tree va_list_type;
12015
/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int __gr_offs;
     int __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;

  /* Create the type.  */
  va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
  /* Give it the required name.  */
  va_list_name = build_decl (BUILTINS_LOCATION,
			     TYPE_DECL,
			     get_identifier ("__va_list"),
			     va_list_type);
  DECL_ARTIFICIAL (va_list_name) = 1;
  TYPE_NAME (va_list_type) = va_list_name;
  TYPE_STUB_DECL (va_list_type) = va_list_name;

  /* Create the fields.  Note: the field order below is relied upon by
     the consumers that walk TYPE_FIELDS / DECL_CHAIN (e.g. va_start and
     va_arg expansion).  */
  f_stack = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__stack"),
			ptr_type_node);
  f_grtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_top"),
			ptr_type_node);
  f_vrtop = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_top"),
			ptr_type_node);
  f_groff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__gr_offs"),
			integer_type_node);
  f_vroff = build_decl (BUILTINS_LOCATION,
			FIELD_DECL, get_identifier ("__vr_offs"),
			integer_type_node);

  /* Tell tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
     purpose to identify whether the code is updating va_list internal
     offset fields through irregular way.  */
  va_list_gpr_counter_field = f_groff;
  va_list_fpr_counter_field = f_vroff;

  DECL_ARTIFICIAL (f_stack) = 1;
  DECL_ARTIFICIAL (f_grtop) = 1;
  DECL_ARTIFICIAL (f_vrtop) = 1;
  DECL_ARTIFICIAL (f_groff) = 1;
  DECL_ARTIFICIAL (f_vroff) = 1;

  DECL_FIELD_CONTEXT (f_stack) = va_list_type;
  DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
  DECL_FIELD_CONTEXT (f_groff) = va_list_type;
  DECL_FIELD_CONTEXT (f_vroff) = va_list_type;

  /* Chain the fields in declaration order.  */
  TYPE_FIELDS (va_list_type) = f_stack;
  DECL_CHAIN (f_stack) = f_grtop;
  DECL_CHAIN (f_grtop) = f_vrtop;
  DECL_CHAIN (f_vrtop) = f_groff;
  DECL_CHAIN (f_groff) = f_vroff;

  /* Compute its layout.  */
  layout_type (va_list_type);

  return va_list_type;
}
12094
/* Implement TARGET_EXPAND_BUILTIN_VA_START.  Initialize the five fields
   of the __va_list object VALIST from the state of the incoming-argument
   registers recorded in crtl->args.info.  NEXTARG is unused.  */
static void
aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
{
  const CUMULATIVE_ARGS *cum;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, grtop, vrtop, groff, vroff;
  tree t;
  int gr_save_area_size = cfun->va_list_gpr_size;
  int vr_save_area_size = cfun->va_list_fpr_size;
  int vr_offset;

  cum = &crtl->args.info;
  /* Clamp the save-area sizes to what remains after the named
     arguments have consumed their registers.  */
  if (cfun->va_list_gpr_size)
    gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
			     cfun->va_list_gpr_size);
  if (cfun->va_list_fpr_size)
    vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
			     * UNITS_PER_VREG, cfun->va_list_fpr_size);

  /* Without FP/SIMD there is no VR save area at all.  */
  if (!TARGET_FLOAT)
    {
      gcc_assert (cum->aapcs_nvrn == 0);
      vr_save_area_size = 0;
    }

  /* The field order here must match aarch64_build_builtin_va_list.  */
  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
		  NULL_TREE);
  grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
		  NULL_TREE);
  vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
		  NULL_TREE);
  groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
		  NULL_TREE);
  vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
		  NULL_TREE);

  /* Emit code to initialize STACK, which points to the next varargs stack
     argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
     by named arguments.  STACK is 8-byte aligned.  */
  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
  if (cum->aapcs_stack_size > 0)
    t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GRTOP, the top of the GR save area.
     virtual_incoming_args_rtx should have been 16 byte aligned.  */
  t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
  t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize VRTOP, the top of the VR save area.
     This address is gr_save_area_bytes below GRTOP, rounded
     down to the next 16-byte boundary.  */
  t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
  vr_offset = ROUND_UP (gr_save_area_size,
			STACK_BOUNDARY / BITS_PER_UNIT);

  if (vr_offset)
    t = fold_build_pointer_plus_hwi (t, -vr_offset);
  t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Emit code to initialize GROFF, the offset from GRTOP of the
     next GPR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);

  /* Likewise emit code to initialize VROFF, the offset from FTOP
     of the next VR argument.  */
  t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
12177
/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  Build the tree for a va_arg
   access of TYPE from the __va_list VALIST.  The emitted code decides at
   run time between the register save area (negative __gr_offs/__vr_offs)
   and the stack overflow area, and returns the address of the fetched
   argument (dereferenced once more when the argument was passed by
   reference).  */

static tree
aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
			      gimple_seq *post_p ATTRIBUTE_UNUSED)
{
  tree addr;
  bool indirect_p;
  bool is_ha;		/* is HFA or HVA.  */
  bool dw_align;	/* double-word align.  */
  machine_mode ag_mode = VOIDmode;
  int nregs;
  machine_mode mode;

  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
  tree stack, f_top, f_off, off, arg, roundup, on_stack;
  HOST_WIDE_INT size, rsize, adjust, align;
  tree t, u, cond1, cond2;

  /* Arguments passed by reference are fetched as a pointer and
     dereferenced at the end.  */
  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
  if (indirect_p)
    type = build_pointer_type (type);

  mode = TYPE_MODE (type);

  /* Field order must match aarch64_build_builtin_va_list.  */
  f_stack = TYPE_FIELDS (va_list_type_node);
  f_grtop = DECL_CHAIN (f_stack);
  f_vrtop = DECL_CHAIN (f_grtop);
  f_groff = DECL_CHAIN (f_vrtop);
  f_vroff = DECL_CHAIN (f_groff);

  stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
		  f_stack, NULL_TREE);
  size = int_size_in_bytes (type);
  align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;

  dw_align = false;
  adjust = 0;
  if (aarch64_vfp_is_call_or_return_candidate (mode,
					       type,
					       &ag_mode,
					       &nregs,
					       &is_ha))
    {
      /* No frontends can create types with variable-sized modes, so we
	 shouldn't be asked to pass or return them.  */
      unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();

      /* TYPE passed in fp/simd registers.  */
      if (!TARGET_FLOAT)
	aarch64_err_no_fpadvsimd (mode, "varargs");

      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
		      unshare_expr (valist), f_vrtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
		      unshare_expr (valist), f_vroff, NULL_TREE);

      rsize = nregs * UNITS_PER_VREG;

      if (is_ha)
	{
	  /* On big-endian, small HFA/HVA members sit at the top of each
	     vector register slot.  */
	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
	    adjust = UNITS_PER_VREG - ag_size;
	}
      else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	       && size < UNITS_PER_VREG)
	{
	  adjust = UNITS_PER_VREG - size;
	}
    }
  else
    {
      /* TYPE passed in general registers.  */
      f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
		      unshare_expr (valist), f_grtop, NULL_TREE);
      f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
		      unshare_expr (valist), f_groff, NULL_TREE);
      rsize = ROUND_UP (size, UNITS_PER_WORD);
      nregs = rsize / UNITS_PER_WORD;

      /* Over-aligned arguments need a 16-byte aligned offset.  */
      if (align > 8)
	dw_align = true;

      if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
	  && size < UNITS_PER_WORD)
	{
	  adjust = UNITS_PER_WORD - size;
	}
    }

  /* Get a local temporary for the field value.  */
  off = get_initialized_tmp_var (f_off, pre_p, NULL);

  /* Emit code to branch if off >= 0.  A non-negative offset means the
     register save area is exhausted and the argument is on the stack.  */
  t = build2 (GE_EXPR, boolean_type_node, off,
	      build_int_cst (TREE_TYPE (off), 0));
  cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);

  if (dw_align)
    {
      /* Emit: offs = (offs + 15) & -16.  */
      t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
		  build_int_cst (TREE_TYPE (off), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
		  build_int_cst (TREE_TYPE (off), -16));
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
    }
  else
    roundup = NULL;

  /* Update ap.__[g|v]r_offs  */
  t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
	      build_int_cst (TREE_TYPE (off), rsize));
  t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);

  /* String up.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);

  /* [cond2] if (ap.__[g|v]r_offs > 0)  — the updated offset went
     positive, so the argument straddled the end of the save area and
     must be taken from the stack instead.  */
  u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
	      build_int_cst (TREE_TYPE (f_off), 0));
  cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);

  /* String up: make sure the assignment happens before the use.  */
  t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
  COND_EXPR_ELSE (cond1) = t;

  /* Prepare the trees handling the argument that is passed on the stack;
     the top level node will store in ON_STACK.  */
  arg = get_initialized_tmp_var (stack, pre_p, NULL);
  if (align > 8)
    {
      /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
      t = fold_convert (intDI_type_node, arg);
      t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), 15));
      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
		  build_int_cst (TREE_TYPE (t), -16));
      t = fold_convert (TREE_TYPE (arg), t);
      roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
    }
  else
    roundup = NULL;
  /* Advance ap.__stack  */
  t = fold_convert (intDI_type_node, arg);
  t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), size + 7));
  t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
	      build_int_cst (TREE_TYPE (t), -8));
  t = fold_convert (TREE_TYPE (arg), t);
  t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
  /* String up roundup and advance.  */
  if (roundup)
    t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
  /* String up with arg  */
  on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
  /* Big-endianness related address adjustment.  */
  if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
      && size < UNITS_PER_WORD)
    {
      t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
		  size_int (UNITS_PER_WORD - size));
      on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
    }

  /* Both "already on stack" and "save area overflowed" take the
     stack path.  */
  COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
  COND_EXPR_THEN (cond2) = unshare_expr (on_stack);

  /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
  t = off;
  if (adjust)
    t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
		build_int_cst (TREE_TYPE (off), adjust));

  t = fold_convert (sizetype, t);
  t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);

  if (is_ha)
    {
      /* type ha; // treat as "struct {ftype field[n];}"
	 ... [computing offs]
	 for (i = 0; i <nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
      int i;
      tree tmp_ha, field_t, field_ptr_t;

      /* Declare a local variable.  */
      tmp_ha = create_tmp_var_raw (type, "ha");
      gimple_add_tmp_var (tmp_ha);

      /* Establish the base type.  */
      switch (ag_mode)
	{
	case E_SFmode:
	  field_t = float_type_node;
	  field_ptr_t = float_ptr_type_node;
	  break;
	case E_DFmode:
	  field_t = double_type_node;
	  field_ptr_t = double_ptr_type_node;
	  break;
	case E_TFmode:
	  field_t = long_double_type_node;
	  field_ptr_t = long_double_ptr_type_node;
	  break;
	case E_HFmode:
	  field_t = aarch64_fp16_type_node;
	  field_ptr_t = aarch64_fp16_ptr_type_node;
	  break;
	case E_V2SImode:
	case E_V4SImode:
	  {
	    tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
	    field_t = build_vector_type_for_mode (innertype, ag_mode);
	    field_ptr_t = build_pointer_type (field_t);
	  }
	  break;
	default:
	  gcc_assert (0);
	}

      /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
      tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
      addr = t;
      t = fold_convert (field_ptr_t, addr);
      t = build2 (MODIFY_EXPR, field_t,
		  build1 (INDIRECT_REF, field_t, tmp_ha),
		  build1 (INDIRECT_REF, field_t, t));

      /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
      for (i = 1; i < nregs; ++i)
	{
	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
	  u = fold_convert (field_ptr_t, addr);
	  u = build2 (MODIFY_EXPR, field_t,
		      build2 (MEM_REF, field_t, tmp_ha,
			      build_int_cst (field_ptr_t,
					     (i *
					      int_size_in_bytes (field_t)))),
		      build1 (INDIRECT_REF, field_t, u));
	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
	}

      u = fold_convert (TREE_TYPE (f_top), tmp_ha);
      t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
    }

  COND_EXPR_ELSE (cond2) = t;
  addr = fold_convert (build_pointer_type (type), cond1);
  addr = build_va_arg_indirect_ref (addr);

  /* Pass-by-reference arguments need one more dereference.  */
  if (indirect_p)
    addr = build_va_arg_indirect_ref (addr);

  return addr;
}
12436
/* Implement TARGET_SETUP_INCOMING_VARARGS.  Save the remaining unnamed
   general and vector argument registers below the incoming-arguments
   area and record the total save-area size in the frame layout.
   PRETEND_SIZE is deliberately left untouched (see comment at end).  */

static void
aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
				tree type, int *pretend_size ATTRIBUTE_UNUSED,
				int no_rtl)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  CUMULATIVE_ARGS local_cum;
  int gr_saved = cfun->va_list_gpr_size;
  int vr_saved = cfun->va_list_fpr_size;

  /* The caller has advanced CUM up to, but not beyond, the last named
     argument.  Advance a local copy of CUM past the last "real" named
     argument, to find out how many registers are left over.  */
  local_cum = *cum;
  aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode,
				type, true);

  /* Found out how many registers we need to save.
     Honor tree-stdvar analysis results.  */
  if (cfun->va_list_gpr_size)
    gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
		    cfun->va_list_gpr_size / UNITS_PER_WORD);
  if (cfun->va_list_fpr_size)
    vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
		    cfun->va_list_fpr_size / UNITS_PER_VREG);

  /* No vector registers to save without FP/SIMD support.  */
  if (!TARGET_FLOAT)
    {
      gcc_assert (local_cum.aapcs_nvrn == 0);
      vr_saved = 0;
    }

  if (!no_rtl)
    {
      if (gr_saved > 0)
	{
	  rtx ptr, mem;

	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
			       - gr_saved * UNITS_PER_WORD);
	  mem = gen_frame_mem (BLKmode, ptr);
	  set_mem_alias_set (mem, get_varargs_alias_set ());

	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
			       mem, gr_saved);
	}
      if (vr_saved > 0)
	{
	  /* We can't use move_block_from_reg, because it will use
	     the wrong mode, storing D regs only.  */
	  machine_mode mode = TImode;
	  int off, i, vr_start;

	  /* Set OFF to the offset from virtual_incoming_args_rtx of
	     the first vector register.  The VR save area lies below
	     the GR one, and is aligned to 16 bytes.  */
	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
			   STACK_BOUNDARY / BITS_PER_UNIT);
	  off -= vr_saved * UNITS_PER_VREG;

	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
	  for (i = 0; i < vr_saved; ++i)
	    {
	      rtx ptr, mem;

	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
	      mem = gen_frame_mem (mode, ptr);
	      set_mem_alias_set (mem, get_varargs_alias_set ());
	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
	      off += UNITS_PER_VREG;
	    }
	}
    }

  /* We don't save the size into *PRETEND_SIZE because we want to avoid
     any complication of having crtl->args.pretend_args_size changed.  */
  cfun->machine->frame.saved_varargs_size
    = (ROUND_UP (gr_saved * UNITS_PER_WORD,
		 STACK_BOUNDARY / BITS_PER_UNIT)
       + vr_saved * UNITS_PER_VREG);
}
12520
12521 static void
aarch64_conditional_register_usage(void)12522 aarch64_conditional_register_usage (void)
12523 {
12524 int i;
12525 if (!TARGET_FLOAT)
12526 {
12527 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12528 {
12529 fixed_regs[i] = 1;
12530 call_used_regs[i] = 1;
12531 }
12532 }
12533 if (!TARGET_SVE)
12534 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12535 {
12536 fixed_regs[i] = 1;
12537 call_used_regs[i] = 1;
12538 }
12539 }
12540
/* Walk down the type tree of TYPE counting consecutive base elements.
   If *MODEP is VOIDmode, then set it to the first valid floating point
   type.  If a non-floating point type is found, or if a floating point
   type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
   otherwise return the count in the sub-tree.  */
static int
aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
{
  machine_mode mode;
  HOST_WIDE_INT size;

  switch (TREE_CODE (type))
    {
    case REAL_TYPE:
      /* A scalar FP type counts as one element, provided it matches any
	 mode already recorded in *MODEP.  */
      mode = TYPE_MODE (type);
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 1;

      break;

    case COMPLEX_TYPE:
      /* A complex FP type counts as two elements of its component mode.  */
      mode = TYPE_MODE (TREE_TYPE (type));
      if (mode != DFmode && mode != SFmode
	  && mode != TFmode && mode != HFmode)
	return -1;

      if (*modep == VOIDmode)
	*modep = mode;

      if (*modep == mode)
	return 2;

      break;

    case VECTOR_TYPE:
      /* Use V2SImode and V4SImode as representatives of all 64-bit
	 and 128-bit vector types.  */
      size = int_size_in_bytes (type);
      switch (size)
	{
	case 8:
	  mode = V2SImode;
	  break;
	case 16:
	  mode = V4SImode;
	  break;
	default:
	  return -1;
	}

      if (*modep == VOIDmode)
	*modep = mode;

      /* Vector modes are considered to be opaque: two vectors are
	 equivalent for the purposes of being homogeneous aggregates
	 if they are the same size.  */
      if (*modep == mode)
	return 1;

      break;

    case ARRAY_TYPE:
      {
	int count;
	tree index = TYPE_DOMAIN (type);

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	/* Element count = element sub-count * number of array elements
	   (derived from the index bounds).  */
	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
	if (count == -1
	    || !index
	    || !TYPE_MAX_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
	    || !TYPE_MIN_VALUE (index)
	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
	    || count < 0)
	  return -1;

	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
		  - tree_to_uhwi (TYPE_MIN_VALUE (index)));

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case RECORD_TYPE:
      {
	/* A struct's element count is the sum over its fields.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count += sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    case UNION_TYPE:
    case QUAL_UNION_TYPE:
      {
	/* These aren't very interesting except in a degenerate case.
	   A union's element count is the maximum over its members.  */
	int count = 0;
	int sub_count;
	tree field;

	/* Can't handle incomplete types nor sizes that are not
	   fixed.  */
	if (!COMPLETE_TYPE_P (type)
	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
	  return -1;

	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
	  {
	    if (TREE_CODE (field) != FIELD_DECL)
	      continue;

	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
	    if (sub_count < 0)
	      return -1;
	    count = count > sub_count ? count : sub_count;
	  }

	/* There must be no padding.  */
	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
		      count * GET_MODE_BITSIZE (*modep)))
	  return -1;

	return count;
      }

    default:
      break;
    }

  return -1;
}
12711
12712 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12713 type as described in AAPCS64 \S 4.1.2.
12714
12715 See the comment above aarch64_composite_type_p for the notes on MODE. */
12716
12717 static bool
aarch64_short_vector_p(const_tree type,machine_mode mode)12718 aarch64_short_vector_p (const_tree type,
12719 machine_mode mode)
12720 {
12721 poly_int64 size = -1;
12722
12723 if (type && TREE_CODE (type) == VECTOR_TYPE)
12724 size = int_size_in_bytes (type);
12725 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12726 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12727 size = GET_MODE_SIZE (mode);
12728
12729 return known_eq (size, 8) || known_eq (size, 16);
12730 }
12731
12732 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12733 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12734 array types. The C99 floating-point complex types are also considered
12735 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12736 types, which are GCC extensions and out of the scope of AAPCS64, are
12737 treated as composite types here as well.
12738
12739 Note that MODE itself is not sufficient in determining whether a type
12740 is such a composite type or not. This is because
12741 stor-layout.c:compute_record_mode may have already changed the MODE
12742 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12743 structure with only one field may have its MODE set to the mode of the
12744 field. Also an integer mode whose size matches the size of the
12745 RECORD_TYPE type may be used to substitute the original mode
12746 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12747 solely relied on. */
12748
12749 static bool
aarch64_composite_type_p(const_tree type,machine_mode mode)12750 aarch64_composite_type_p (const_tree type,
12751 machine_mode mode)
12752 {
12753 if (aarch64_short_vector_p (type, mode))
12754 return false;
12755
12756 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12757 return true;
12758
12759 if (mode == BLKmode
12760 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12761 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12762 return true;
12763
12764 return false;
12765 }
12766
/* Return TRUE if an argument, whose type is described by TYPE and MODE,
   shall be passed or returned in simd/fp register(s) (providing these
   parameter passing registers are available).

   Upon successful return, *COUNT returns the number of needed registers,
   *BASE_MODE returns the mode of the individual register and when IS_HA
   is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
   floating-point aggregate or a homogeneous short-vector aggregate.  */

static bool
aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
					 const_tree type,
					 machine_mode *base_mode,
					 int *count,
					 bool *is_ha)
{
  machine_mode new_mode = VOIDmode;
  bool composite_p = aarch64_composite_type_p (type, mode);

  if (is_ha != NULL) *is_ha = false;

  /* A scalar float or a short vector occupies a single register.  */
  if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
      || aarch64_short_vector_p (type, mode))
    {
      *count = 1;
      new_mode = mode;
    }
  /* A complex float is treated as a homogeneous aggregate of two
     elements of the component float mode.  */
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      if (is_ha != NULL) *is_ha = true;
      *count = 2;
      new_mode = GET_MODE_INNER (mode);
    }
  /* A composite type qualifies only if it is a homogeneous aggregate
     with at most HA_MAX_NUM_FLDS uniform fields.  */
  else if (type && composite_p)
    {
      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);

      if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
	{
	  if (is_ha != NULL) *is_ha = true;
	  *count = ag_count;
	}
      else
	return false;
    }
  else
    return false;

  *base_mode = new_mode;
  return true;
}
12818
12819 /* Implement TARGET_STRUCT_VALUE_RTX. */
12820
12821 static rtx
aarch64_struct_value_rtx(tree fndecl ATTRIBUTE_UNUSED,int incoming ATTRIBUTE_UNUSED)12822 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12823 int incoming ATTRIBUTE_UNUSED)
12824 {
12825 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12826 }
12827
12828 /* Implements target hook vector_mode_supported_p. */
12829 static bool
aarch64_vector_mode_supported_p(machine_mode mode)12830 aarch64_vector_mode_supported_p (machine_mode mode)
12831 {
12832 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12833 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12834 }
12835
/* Return appropriate SIMD container
   for MODE within a vector of WIDTH bits.  */
static machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
{
  /* For full-length SVE vectors, map the element mode to the
     corresponding variable-length SVE vector mode.  */
  if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
    switch (mode)
      {
      case E_DFmode:
	return VNx2DFmode;
      case E_SFmode:
	return VNx4SFmode;
      case E_HFmode:
	return VNx8HFmode;
      case E_DImode:
	return VNx2DImode;
      case E_SImode:
	return VNx4SImode;
      case E_HImode:
	return VNx8HImode;
      case E_QImode:
	return VNx16QImode;
      default:
	return word_mode;
      }

  /* Advanced SIMD only has 64-bit and 128-bit containers.  */
  gcc_assert (known_eq (width, 64) || known_eq (width, 128));
  if (TARGET_SIMD)
    {
      if (known_eq (width, 128))
	switch (mode)
	  {
	  case E_DFmode:
	    return V2DFmode;
	  case E_SFmode:
	    return V4SFmode;
	  case E_HFmode:
	    return V8HFmode;
	  case E_SImode:
	    return V4SImode;
	  case E_HImode:
	    return V8HImode;
	  case E_QImode:
	    return V16QImode;
	  case E_DImode:
	    return V2DImode;
	  default:
	    break;
	  }
      else
	switch (mode)
	  {
	  case E_SFmode:
	    return V2SFmode;
	  case E_HFmode:
	    return V4HFmode;
	  case E_SImode:
	    return V2SImode;
	  case E_HImode:
	    return V4HImode;
	  case E_QImode:
	    return V8QImode;
	  default:
	    break;
	  }
    }
  /* No suitable vector container: fall back to a scalar word.  */
  return word_mode;
}
12904
12905 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12906 static machine_mode
aarch64_preferred_simd_mode(scalar_mode mode)12907 aarch64_preferred_simd_mode (scalar_mode mode)
12908 {
12909 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12910 return aarch64_simd_container_mode (mode, bits);
12911 }
12912
/* Return a list of possible vector sizes for the vectorizer
   to iterate over.  The order is most-preferred first: full SVE
   vectors (when available), then 128-bit and 64-bit Advanced SIMD.  */
static void
aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
{
  if (TARGET_SVE)
    sizes->safe_push (BYTES_PER_SVE_VECTOR);
  sizes->safe_push (16);
  sizes->safe_push (8);
}
12923
/* Implement TARGET_MANGLE_TYPE.  Return the mangled name for TYPE,
   or NULL to use the default mangling.  */

static const char *
aarch64_mangle_type (const_tree type)
{
  /* The AArch64 ABI documents say that "__va_list" has to be
     mangled as if it is in the "std" namespace.  */
  if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
    return "St9__va_list";

  /* Half-precision float.  */
  if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
    return "Dh";

  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
  if (TYPE_NAME (type) != NULL)
    return aarch64_mangle_builtin_type (type);

  /* Use the default mangling.  */
  return NULL;
}
12946
12947 /* Find the first rtx_insn before insn that will generate an assembly
12948 instruction. */
12949
12950 static rtx_insn *
aarch64_prev_real_insn(rtx_insn * insn)12951 aarch64_prev_real_insn (rtx_insn *insn)
12952 {
12953 if (!insn)
12954 return NULL;
12955
12956 do
12957 {
12958 insn = prev_real_insn (insn);
12959 }
12960 while (insn && recog_memoized (insn) < 0);
12961
12962 return insn;
12963 }
12964
12965 static bool
is_madd_op(enum attr_type t1)12966 is_madd_op (enum attr_type t1)
12967 {
12968 unsigned int i;
12969 /* A number of these may be AArch32 only. */
12970 enum attr_type mlatypes[] = {
12971 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12972 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12973 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12974 };
12975
12976 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12977 {
12978 if (t1 == mlatypes[i])
12979 return true;
12980 }
12981
12982 return false;
12983 }
12984
/* Check if there is a register dependency between a load and the insn
   for which we hold recog_data.  MEMOP must be a SET; return true if
   the register loaded by MEMOP overlaps any register operand of the
   current insn.  */

static bool
dep_between_memop_and_curr (rtx memop)
{
  rtx load_reg;
  int opno;

  gcc_assert (GET_CODE (memop) == SET);

  /* Only a load into a plain register can create the dependency we
     care about here.  */
  if (!REG_P (SET_DEST (memop)))
    return false;

  load_reg = SET_DEST (memop);
  /* NOTE(review): the scan starts at operand 1, presumably because
     operand 0 is the current insn's output -- confirm against the
     patterns this is used with.  */
  for (opno = 1; opno < recog_data.n_operands; opno++)
    {
      rtx operand = recog_data.operand[opno];
      if (REG_P (operand)
	  && reg_overlap_mentioned_p (load_reg, operand))
	return true;

    }
  return false;
}
13010
13011
/* When working around the Cortex-A53 erratum 835769,
   given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
   instruction and has a preceding memory instruction such that a NOP
   should be inserted between them.  */

bool
aarch64_madd_needs_nop (rtx_insn* insn)
{
  enum attr_type attr_type;
  rtx_insn *prev;
  rtx body;

  /* Nothing to do unless the workaround is enabled.  */
  if (!TARGET_FIX_ERR_A53_835769)
    return false;

  /* Only recognized instructions have a type attribute to inspect.  */
  if (!INSN_P (insn) || recog_memoized (insn) < 0)
    return false;

  attr_type = get_attr_type (insn);
  if (!is_madd_op (attr_type))
    return false;

  prev = aarch64_prev_real_insn (insn);
  /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
     Restore recog state to INSN to avoid state corruption.  */
  extract_constrain_insn_cached (insn);

  if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
    return false;

  body = single_set (prev);

  /* If the previous insn is a memory op and there is no dependency between
     it and the DImode madd, emit a NOP between them.  If body is NULL then we
     have a complex memory operation, probably a load/store pair.
     Be conservative for now and emit a NOP.  */
  if (GET_MODE (recog_data.operand[0]) == DImode
      && (!body || !dep_between_memop_and_curr (body)))
    return true;

  return false;

}
13055
13056
13057 /* Implement FINAL_PRESCAN_INSN. */
13058
13059 void
aarch64_final_prescan_insn(rtx_insn * insn)13060 aarch64_final_prescan_insn (rtx_insn *insn)
13061 {
13062 if (aarch64_madd_needs_nop (insn))
13063 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13064 }
13065
13066
13067 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13068 instruction. */
13069
13070 bool
aarch64_sve_index_immediate_p(rtx base_or_step)13071 aarch64_sve_index_immediate_p (rtx base_or_step)
13072 {
13073 return (CONST_INT_P (base_or_step)
13074 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13075 }
13076
13077 /* Return true if X is a valid immediate for the SVE ADD and SUB
13078 instructions. Negate X first if NEGATE_P is true. */
13079
13080 bool
aarch64_sve_arith_immediate_p(rtx x,bool negate_p)13081 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13082 {
13083 rtx elt;
13084
13085 if (!const_vec_duplicate_p (x, &elt)
13086 || !CONST_INT_P (elt))
13087 return false;
13088
13089 HOST_WIDE_INT val = INTVAL (elt);
13090 if (negate_p)
13091 val = -val;
13092 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13093
13094 if (val & 0xff)
13095 return IN_RANGE (val, 0, 0xff);
13096 return IN_RANGE (val, 0, 0xff00);
13097 }
13098
13099 /* Return true if X is a valid immediate operand for an SVE logical
13100 instruction such as AND. */
13101
13102 bool
aarch64_sve_bitmask_immediate_p(rtx x)13103 aarch64_sve_bitmask_immediate_p (rtx x)
13104 {
13105 rtx elt;
13106
13107 return (const_vec_duplicate_p (x, &elt)
13108 && CONST_INT_P (elt)
13109 && aarch64_bitmask_imm (INTVAL (elt),
13110 GET_MODE_INNER (GET_MODE (x))));
13111 }
13112
13113 /* Return true if X is a valid immediate for the SVE DUP and CPY
13114 instructions. */
13115
13116 bool
aarch64_sve_dup_immediate_p(rtx x)13117 aarch64_sve_dup_immediate_p (rtx x)
13118 {
13119 rtx elt;
13120
13121 if (!const_vec_duplicate_p (x, &elt)
13122 || !CONST_INT_P (elt))
13123 return false;
13124
13125 HOST_WIDE_INT val = INTVAL (elt);
13126 if (val & 0xff)
13127 return IN_RANGE (val, -0x80, 0x7f);
13128 return IN_RANGE (val, -0x8000, 0x7f00);
13129 }
13130
13131 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13132 SIGNED_P says whether the operand is signed rather than unsigned. */
13133
13134 bool
aarch64_sve_cmp_immediate_p(rtx x,bool signed_p)13135 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13136 {
13137 rtx elt;
13138
13139 return (const_vec_duplicate_p (x, &elt)
13140 && CONST_INT_P (elt)
13141 && (signed_p
13142 ? IN_RANGE (INTVAL (elt), -16, 15)
13143 : IN_RANGE (INTVAL (elt), 0, 127)));
13144 }
13145
13146 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13147 instruction. Negate X first if NEGATE_P is true. */
13148
13149 bool
aarch64_sve_float_arith_immediate_p(rtx x,bool negate_p)13150 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13151 {
13152 rtx elt;
13153 REAL_VALUE_TYPE r;
13154
13155 if (!const_vec_duplicate_p (x, &elt)
13156 || GET_CODE (elt) != CONST_DOUBLE)
13157 return false;
13158
13159 r = *CONST_DOUBLE_REAL_VALUE (elt);
13160
13161 if (negate_p)
13162 r = real_value_negate (&r);
13163
13164 if (real_equal (&r, &dconst1))
13165 return true;
13166 if (real_equal (&r, &dconsthalf))
13167 return true;
13168 return false;
13169 }
13170
13171 /* Return true if X is a valid immediate operand for an SVE FMUL
13172 instruction. */
13173
13174 bool
aarch64_sve_float_mul_immediate_p(rtx x)13175 aarch64_sve_float_mul_immediate_p (rtx x)
13176 {
13177 rtx elt;
13178
13179 /* GCC will never generate a multiply with an immediate of 2, so there is no
13180 point testing for it (even though it is a valid constant). */
13181 return (const_vec_duplicate_p (x, &elt)
13182 && GET_CODE (elt) == CONST_DOUBLE
13183 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13184 }
13185
/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
   for the Advanced SIMD operation described by WHICH and INSN.  If INFO
   is nonnull, use it to describe valid immediates.  */
static bool
aarch64_advsimd_valid_immediate_hs (unsigned int val32,
				    simd_immediate_info *info,
				    enum simd_immediate_check which,
				    simd_immediate_info::insn_type insn)
{
  /* Try a 4-byte immediate with LSL: a single byte anywhere in the word,
     all other bytes zero.  */
  for (unsigned int shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xff << shift)) == val32)
      {
	if (info)
	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
				       simd_immediate_info::LSL, shift);
	return true;
      }

  /* Try a 2-byte immediate with LSL: both halfwords equal, and the
     halfword itself a single byte in either position.  */
  unsigned int imm16 = val32 & 0xffff;
  if (imm16 == (val32 >> 16))
    for (unsigned int shift = 0; shift < 16; shift += 8)
      if ((imm16 & (0xff << shift)) == imm16)
	{
	  if (info)
	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
					 simd_immediate_info::LSL, shift);
	  return true;
	}

  /* Try a 4-byte immediate with MSL (shift filling with ones), except
     for cases that MVN can handle.  */
  if (which == AARCH64_CHECK_MOV)
    for (unsigned int shift = 8; shift < 24; shift += 8)
      {
	/* LOW is the mask of bits below the shifted byte, which MSL
	   requires to be all ones.  */
	unsigned int low = (1 << shift) - 1;
	if (((val32 & (0xff << shift)) | low) == val32)
	  {
	    if (info)
	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
					   simd_immediate_info::MSL, shift);
	    return true;
	  }
      }

  return false;
}
13234
/* Return true if replicating VAL64 is a valid immediate for the
   Advanced SIMD operation described by WHICH.  If INFO is nonnull,
   use it to describe valid immediates.  */
static bool
aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
				 simd_immediate_info *info,
				 enum simd_immediate_check which)
{
  unsigned int val32 = val64 & 0xffffffff;
  unsigned int val16 = val64 & 0xffff;
  unsigned int val8 = val64 & 0xff;

  /* The 2- and 4-byte encodings only apply when both 32-bit halves
     of the 64-bit value are equal.  */
  if (val32 == (val64 >> 32))
    {
      /* MOVI/ORR-style encodings of the value itself.  */
      if ((which & AARCH64_CHECK_ORR) != 0
	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
						 simd_immediate_info::MOV))
	return true;

      /* MVNI/BIC-style encodings of the inverted value.  */
      if ((which & AARCH64_CHECK_BIC) != 0
	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
						 simd_immediate_info::MVN))
	return true;

      /* Try using a replicated byte.  */
      if (which == AARCH64_CHECK_MOV
	  && val16 == (val32 >> 16)
	  && val8 == (val16 >> 8))
	{
	  if (info)
	    *info = simd_immediate_info (QImode, val8);
	  return true;
	}
    }

  /* Try using a bit-to-bytemask: every byte of VAL64 must be
     either 0x00 or 0xff.  */
  if (which == AARCH64_CHECK_MOV)
    {
      unsigned int i;
      for (i = 0; i < 64; i += 8)
	{
	  unsigned char byte = (val64 >> i) & 0xff;
	  if (byte != 0 && byte != 0xff)
	    break;
	}
      if (i == 64)
	{
	  if (info)
	    *info = simd_immediate_info (DImode, val64);
	  return true;
	}
    }
  return false;
}
13289
/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
   instruction.  If INFO is nonnull, use it to describe valid immediates.  */

static bool
aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
			     simd_immediate_info *info)
{
  /* Find the narrowest element width at which VAL64 is a pure
     replication of a single element.  */
  scalar_int_mode mode = DImode;
  unsigned int val32 = val64 & 0xffffffff;
  if (val32 == (val64 >> 32))
    {
      mode = SImode;
      unsigned int val16 = val32 & 0xffff;
      if (val16 == (val32 >> 16))
	{
	  mode = HImode;
	  unsigned int val8 = val16 & 0xff;
	  if (val8 == (val16 >> 8))
	    mode = QImode;
	}
    }
  HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
  if (IN_RANGE (val, -0x80, 0x7f))
    {
      /* DUP with no shift.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
    {
      /* DUP with LSL #8.  NOTE(review): the info record stores the
	 unshifted value; presumably the output routine re-derives the
	 shift from the value -- confirm against the MOV expander.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  if (aarch64_bitmask_imm (val64, mode))
    {
      /* DUPM.  */
      if (info)
	*info = simd_immediate_info (mode, val);
      return true;
    }
  return false;
}
13335
/* Return true if OP is a valid SIMD immediate for the operation
   described by WHICH.  If INFO is nonnull, use it to describe valid
   immediates.  */
bool
aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
			      enum simd_immediate_check which)
{
  machine_mode mode = GET_MODE (op);
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  /* Reject non-vector modes and Advanced SIMD structure (tuple) modes.  */
  if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  scalar_mode elt_mode = GET_MODE_INNER (mode);
  rtx base, step;
  unsigned int n_elts;
  if (GET_CODE (op) == CONST_VECTOR
      && CONST_VECTOR_DUPLICATE_P (op))
    n_elts = CONST_VECTOR_NPATTERNS (op);
  else if ((vec_flags & VEC_SVE_DATA)
	   && const_vec_series_p (op, &base, &step))
    {
      /* A linear series can be loaded with the SVE INDEX instruction.  */
      gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
      if (!aarch64_sve_index_immediate_p (base)
	  || !aarch64_sve_index_immediate_p (step))
	return false;

      if (info)
	*info = simd_immediate_info (elt_mode, base, step);
      return true;
    }
  else if (GET_CODE (op) == CONST_VECTOR
	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
    /* N_ELTS set above.  */;
  else
    return false;

  /* Handle PFALSE and PTRUE.  */
  if (vec_flags & VEC_SVE_PRED)
    return (op == CONST0_RTX (mode)
	    || op == CONSTM1_RTX (mode));

  /* A single floating-point element may be encodable directly.  */
  scalar_float_mode elt_float_mode;
  if (n_elts == 1
      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
      if (aarch64_float_const_zero_rtx_p (elt)
	  || aarch64_float_const_representable_p (elt))
	{
	  if (info)
	    *info = simd_immediate_info (elt_float_mode, elt);
	  return true;
	}
    }

  unsigned int elt_size = GET_MODE_SIZE (elt_mode);
  if (elt_size > 8)
    return false;

  scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();

  /* Expand the vector constant out into a byte vector, with the least
     significant byte of the register first.  */
  auto_vec<unsigned char, 16> bytes;
  bytes.reserve (n_elts * elt_size);
  for (unsigned int i = 0; i < n_elts; i++)
    {
      /* The vector is provided in gcc endian-neutral fashion.
	 For aarch64_be Advanced SIMD, it must be laid out in the vector
	 register in reverse order.  */
      bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
      rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);

      /* View floating-point elements as their integer bit pattern.  */
      if (elt_mode != elt_int_mode)
	elt = gen_lowpart (elt_int_mode, elt);

      if (!CONST_INT_P (elt))
	return false;

      unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
      for (unsigned int byte = 0; byte < elt_size; byte++)
	{
	  bytes.quick_push (elt_val & 0xff);
	  elt_val >>= BITS_PER_UNIT;
	}
    }

  /* The immediate must repeat every eight bytes.  */
  unsigned int nbytes = bytes.length ();
  for (unsigned i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Get the repeating 8-byte value as an integer.  No endian correction
     is needed here because bytes is already in lsb-first order.  */
  unsigned HOST_WIDE_INT val64 = 0;
  for (unsigned int i = 0; i < 8; i++)
    val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
	      << (i * BITS_PER_UNIT));

  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_valid_immediate (val64, info);
  else
    return aarch64_advsimd_valid_immediate (val64, info, which);
}
13441
13442 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13443 has a step in the range of INDEX. Return the index expression if so,
13444 otherwise return null. */
13445 rtx
aarch64_check_zero_based_sve_index_immediate(rtx x)13446 aarch64_check_zero_based_sve_index_immediate (rtx x)
13447 {
13448 rtx base, step;
13449 if (const_vec_series_p (x, &base, &step)
13450 && base == const0_rtx
13451 && aarch64_sve_index_immediate_p (step))
13452 return step;
13453 return NULL_RTX;
13454 }
13455
13456 /* Check of immediate shift constants are within range. */
13457 bool
aarch64_simd_shift_imm_p(rtx x,machine_mode mode,bool left)13458 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13459 {
13460 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13461 if (left)
13462 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13463 else
13464 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13465 }
13466
13467 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13468 operation of width WIDTH at bit position POS. */
13469
13470 rtx
aarch64_mask_from_zextract_ops(rtx width,rtx pos)13471 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13472 {
13473 gcc_assert (CONST_INT_P (width));
13474 gcc_assert (CONST_INT_P (pos));
13475
13476 unsigned HOST_WIDE_INT mask
13477 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13478 return GEN_INT (mask << UINTVAL (pos));
13479 }
13480
13481 bool
aarch64_mov_operand_p(rtx x,machine_mode mode)13482 aarch64_mov_operand_p (rtx x, machine_mode mode)
13483 {
13484 if (GET_CODE (x) == HIGH
13485 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13486 return true;
13487
13488 if (CONST_INT_P (x))
13489 return true;
13490
13491 if (VECTOR_MODE_P (GET_MODE (x)))
13492 return aarch64_simd_valid_immediate (x, NULL);
13493
13494 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13495 return true;
13496
13497 if (aarch64_sve_cnt_immediate_p (x))
13498 return true;
13499
13500 return aarch64_classify_symbolic_expression (x)
13501 == SYMBOL_TINY_ABSOLUTE;
13502 }
13503
13504 /* Return a const_int vector of VAL. */
13505 rtx
aarch64_simd_gen_const_vector_dup(machine_mode mode,HOST_WIDE_INT val)13506 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13507 {
13508 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13509 return gen_const_vec_duplicate (mode, c);
13510 }
13511
13512 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13513
13514 bool
aarch64_simd_scalar_immediate_valid_for_move(rtx op,scalar_int_mode mode)13515 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13516 {
13517 machine_mode vmode;
13518
13519 vmode = aarch64_simd_container_mode (mode, 64);
13520 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13521 return aarch64_simd_valid_immediate (op_v, NULL);
13522 }
13523
13524 /* Construct and return a PARALLEL RTX vector with elements numbering the
13525 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13526 the vector - from the perspective of the architecture. This does not
13527 line up with GCC's perspective on lane numbers, so we end up with
13528 different masks depending on our target endian-ness. The diagram
13529 below may help. We must draw the distinction when building masks
13530 which select one half of the vector. An instruction selecting
13531 architectural low-lanes for a big-endian target, must be described using
13532 a mask selecting GCC high-lanes.
13533
13534 Big-Endian Little-Endian
13535
13536 GCC 0 1 2 3 3 2 1 0
13537 | x | x | x | x | | x | x | x | x |
13538 Architecture 3 2 1 0 3 2 1 0
13539
13540 Low Mask: { 2, 3 } { 0, 1 }
13541 High Mask: { 0, 1 } { 2, 3 }
13542
13543 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13544
13545 rtx
aarch64_simd_vect_par_cnst_half(machine_mode mode,int nunits,bool high)13546 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13547 {
13548 rtvec v = rtvec_alloc (nunits / 2);
13549 int high_base = nunits / 2;
13550 int low_base = 0;
13551 int base;
13552 rtx t1;
13553 int i;
13554
13555 if (BYTES_BIG_ENDIAN)
13556 base = high ? low_base : high_base;
13557 else
13558 base = high ? high_base : low_base;
13559
13560 for (i = 0; i < nunits / 2; i++)
13561 RTVEC_ELT (v, i) = GEN_INT (base + i);
13562
13563 t1 = gen_rtx_PARALLEL (mode, v);
13564 return t1;
13565 }
13566
13567 /* Check OP for validity as a PARALLEL RTX vector with elements
13568 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13569 from the perspective of the architecture. See the diagram above
13570 aarch64_simd_vect_par_cnst_half for more details. */
13571
13572 bool
aarch64_simd_check_vect_par_cnst_half(rtx op,machine_mode mode,bool high)13573 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13574 bool high)
13575 {
13576 int nelts;
13577 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13578 return false;
13579
13580 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13581 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13582 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13583 int i = 0;
13584
13585 if (count_op != count_ideal)
13586 return false;
13587
13588 for (i = 0; i < count_ideal; i++)
13589 {
13590 rtx elt_op = XVECEXP (op, 0, i);
13591 rtx elt_ideal = XVECEXP (ideal, 0, i);
13592
13593 if (!CONST_INT_P (elt_op)
13594 || INTVAL (elt_ideal) != INTVAL (elt_op))
13595 return false;
13596 }
13597 return true;
13598 }
13599
13600 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13601 HIGH (exclusive). */
13602 void
aarch64_simd_lane_bounds(rtx operand,HOST_WIDE_INT low,HOST_WIDE_INT high,const_tree exp)13603 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13604 const_tree exp)
13605 {
13606 HOST_WIDE_INT lane;
13607 gcc_assert (CONST_INT_P (operand));
13608 lane = INTVAL (operand);
13609
13610 if (lane < low || lane >= high)
13611 {
13612 if (exp)
13613 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13614 else
13615 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13616 }
13617 }
13618
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */

rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
13627
13628 /* Return TRUE if OP is a valid vector addressing mode. */
13629
13630 bool
aarch64_simd_mem_operand_p(rtx op)13631 aarch64_simd_mem_operand_p (rtx op)
13632 {
13633 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13634 || REG_P (XEXP (op, 0)));
13635 }
13636
13637 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13638
13639 bool
aarch64_sve_ld1r_operand_p(rtx op)13640 aarch64_sve_ld1r_operand_p (rtx op)
13641 {
13642 struct aarch64_address_info addr;
13643 scalar_mode mode;
13644
13645 return (MEM_P (op)
13646 && is_a <scalar_mode> (GET_MODE (op), &mode)
13647 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13648 && addr.type == ADDRESS_REG_IMM
13649 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13650 }
13651
13652 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13653 The conditions for STR are the same. */
13654 bool
aarch64_sve_ldr_operand_p(rtx op)13655 aarch64_sve_ldr_operand_p (rtx op)
13656 {
13657 struct aarch64_address_info addr;
13658
13659 return (MEM_P (op)
13660 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13661 false, ADDR_QUERY_ANY)
13662 && addr.type == ADDRESS_REG_IMM);
13663 }
13664
13665 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13666 We need to be able to access the individual pieces, so the range
13667 is different from LD[234] and ST[234]. */
13668 bool
aarch64_sve_struct_memory_operand_p(rtx op)13669 aarch64_sve_struct_memory_operand_p (rtx op)
13670 {
13671 if (!MEM_P (op))
13672 return false;
13673
13674 machine_mode mode = GET_MODE (op);
13675 struct aarch64_address_info addr;
13676 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13677 ADDR_QUERY_ANY)
13678 || addr.type != ADDRESS_REG_IMM)
13679 return false;
13680
13681 poly_int64 first = addr.const_offset;
13682 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13683 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13684 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13685 }
13686
13687 /* Emit a register copy from operand to operand, taking care not to
13688 early-clobber source registers in the process.
13689
13690 COUNT is the number of components into which the copy needs to be
13691 decomposed. */
13692 void
aarch64_simd_emit_reg_reg_move(rtx * operands,machine_mode mode,unsigned int count)13693 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13694 unsigned int count)
13695 {
13696 unsigned int i;
13697 int rdest = REGNO (operands[0]);
13698 int rsrc = REGNO (operands[1]);
13699
13700 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13701 || rdest < rsrc)
13702 for (i = 0; i < count; i++)
13703 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13704 gen_rtx_REG (mode, rsrc + i));
13705 else
13706 for (i = 0; i < count; i++)
13707 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13708 gen_rtx_REG (mode, rsrc + count - i - 1));
13709 }
13710
13711 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13712 one of VSTRUCT modes: OI, CI, or XI. */
13713 int
aarch64_simd_attr_length_rglist(machine_mode mode)13714 aarch64_simd_attr_length_rglist (machine_mode mode)
13715 {
13716 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13717 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13718 }
13719
13720 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13721 alignment of a vector to 128 bits. SVE predicates have an alignment of
13722 16 bits. */
13723 static HOST_WIDE_INT
aarch64_simd_vector_alignment(const_tree type)13724 aarch64_simd_vector_alignment (const_tree type)
13725 {
13726 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13727 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13728 be set for non-predicate vectors of booleans. Modes are the most
13729 direct way we have of identifying real SVE predicate types. */
13730 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13731 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13732 return MIN (align, 128);
13733 }
13734
13735 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13736 static HOST_WIDE_INT
aarch64_vectorize_preferred_vector_alignment(const_tree type)13737 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13738 {
13739 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13740 {
13741 /* If the length of the vector is fixed, try to align to that length,
13742 otherwise don't try to align at all. */
13743 HOST_WIDE_INT result;
13744 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13745 result = TYPE_ALIGN (TREE_TYPE (type));
13746 return result;
13747 }
13748 return TYPE_ALIGN (type);
13749 }
13750
13751 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13752 static bool
aarch64_simd_vector_alignment_reachable(const_tree type,bool is_packed)13753 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13754 {
13755 if (is_packed)
13756 return false;
13757
13758 /* For fixed-length vectors, check that the vectorizer will aim for
13759 full-vector alignment. This isn't true for generic GCC vectors
13760 that are wider than the ABI maximum of 128 bits. */
13761 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13762 && (wi::to_widest (TYPE_SIZE (type))
13763 != aarch64_vectorize_preferred_vector_alignment (type)))
13764 return false;
13765
13766 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13767 return true;
13768 }
13769
13770 /* Return true if the vector misalignment factor is supported by the
13771 target. */
13772 static bool
aarch64_builtin_support_vector_misalignment(machine_mode mode,const_tree type,int misalignment,bool is_packed)13773 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13774 const_tree type, int misalignment,
13775 bool is_packed)
13776 {
13777 if (TARGET_SIMD && STRICT_ALIGNMENT)
13778 {
13779 /* Return if movmisalign pattern is not supported for this mode. */
13780 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13781 return false;
13782
13783 /* Misalignment factor is unknown at compile time. */
13784 if (misalignment == -1)
13785 return false;
13786 }
13787 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13788 is_packed);
13789 }
13790
13791 /* If VALS is a vector constant that can be loaded into a register
13792 using DUP, generate instructions to do so and return an RTX to
13793 assign to the register. Otherwise return NULL_RTX. */
13794 static rtx
aarch64_simd_dup_constant(rtx vals)13795 aarch64_simd_dup_constant (rtx vals)
13796 {
13797 machine_mode mode = GET_MODE (vals);
13798 machine_mode inner_mode = GET_MODE_INNER (mode);
13799 rtx x;
13800
13801 if (!const_vec_duplicate_p (vals, &x))
13802 return NULL_RTX;
13803
13804 /* We can load this constant by using DUP and a constant in a
13805 single ARM register. This will be cheaper than a vector
13806 load. */
13807 x = copy_to_mode_reg (inner_mode, x);
13808 return gen_vec_duplicate (mode, x);
13809 }
13810
13811
13812 /* Generate code to load VALS, which is a PARALLEL containing only
13813 constants (for vec_init) or CONST_VECTOR, efficiently into a
13814 register. Returns an RTX to copy into the register, or NULL_RTX
13815 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13816 static rtx
aarch64_simd_make_constant(rtx vals)13817 aarch64_simd_make_constant (rtx vals)
13818 {
13819 machine_mode mode = GET_MODE (vals);
13820 rtx const_dup;
13821 rtx const_vec = NULL_RTX;
13822 int n_const = 0;
13823 int i;
13824
13825 if (GET_CODE (vals) == CONST_VECTOR)
13826 const_vec = vals;
13827 else if (GET_CODE (vals) == PARALLEL)
13828 {
13829 /* A CONST_VECTOR must contain only CONST_INTs and
13830 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13831 Only store valid constants in a CONST_VECTOR. */
13832 int n_elts = XVECLEN (vals, 0);
13833 for (i = 0; i < n_elts; ++i)
13834 {
13835 rtx x = XVECEXP (vals, 0, i);
13836 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13837 n_const++;
13838 }
13839 if (n_const == n_elts)
13840 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13841 }
13842 else
13843 gcc_unreachable ();
13844
13845 if (const_vec != NULL_RTX
13846 && aarch64_simd_valid_immediate (const_vec, NULL))
13847 /* Load using MOVI/MVNI. */
13848 return const_vec;
13849 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13850 /* Loaded using DUP. */
13851 return const_dup;
13852 else if (const_vec != NULL_RTX)
13853 /* Load from constant pool. We can not take advantage of single-cycle
13854 LD1 because we need a PC-relative addressing mode. */
13855 return const_vec;
13856 else
13857 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13858 We can not construct an initializer. */
13859 return NULL_RTX;
13860 }
13861
/* Expand a vector initialisation sequence, such that TARGET is
   initialised to contain VALS.  */

void
aarch64_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode inner_mode = GET_MODE_INNER (mode);
  /* The number of vector elements.  */
  int n_elts = XVECLEN (vals, 0);
  /* The number of vector elements which are not constant.  */
  int n_var = 0;
  /* Some arbitrary constant element of VALS, used as a fallback
     substitute for variable lanes below.  */
  rtx any_const = NULL_RTX;
  /* The first element of vals.  */
  rtx v0 = XVECEXP (vals, 0, 0);
  bool all_same = true;

  /* Count the number of variable elements to initialise.  */
  for (int i = 0; i < n_elts; ++i)
    {
      rtx x = XVECEXP (vals, 0, i);
      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
	++n_var;
      else
	any_const = x;

      all_same &= rtx_equal_p (x, v0);
    }

  /* No variable elements, hand off to aarch64_simd_make_constant which knows
     how best to handle this.  */
  if (n_var == 0)
    {
      rtx constant = aarch64_simd_make_constant (vals);
      if (constant != NULL_RTX)
	{
	  emit_move_insn (target, constant);
	  return;
	}
      /* Otherwise fall through: the generic paths below also handle an
	 all-constant PARALLEL that could not become a CONST_VECTOR.  */
    }

  /* Splat a single non-constant element if we can.  */
  if (all_same)
    {
      rtx x = copy_to_mode_reg (inner_mode, v0);
      aarch64_emit_move (target, gen_vec_duplicate (mode, x));
      return;
    }

  enum insn_code icode = optab_handler (vec_set_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);

  /* If there are only variable elements, try to optimize
     the insertion using dup for the most common element
     followed by insertions.  */

  /* The algorithm will fill matches[*][0] with the earliest matching element,
     and matches[X][1] with the count of duplicate elements (if X is the
     earliest element which has duplicates).  */

  if (n_var == n_elts && n_elts <= 16)
    {
      int matches[16][2] = {0};
      for (int i = 0; i < n_elts; i++)
	{
	  for (int j = 0; j <= i; j++)
	    {
	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
		{
		  matches[i][0] = j;
		  matches[j][1]++;
		  break;
		}
	    }
	}
      /* Find the element with the highest duplicate count.  */
      int maxelement = 0;
      int maxv = 0;
      for (int i = 0; i < n_elts; i++)
	if (matches[i][1] > maxv)
	  {
	    maxelement = i;
	    maxv = matches[i][1];
	  }

      /* Create a duplicate of the most common element.  */
      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
      aarch64_emit_move (target, gen_vec_duplicate (mode, x));

      /* Insert the rest.  Lanes whose earliest match is MAXELEMENT are
	 already correct from the DUP above.  */
      for (int i = 0; i < n_elts; i++)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (matches[i][0] == maxelement)
	    continue;
	  x = copy_to_mode_reg (inner_mode, x);
	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
	}
      return;
    }

  /* Initialise a vector which is part-variable.  We want to first try
     to build those lanes which are constant in the most efficient way we
     can.  */
  if (n_var != n_elts)
    {
      rtx copy = copy_rtx (vals);

      /* Load constant part of vector.  We really don't care what goes into the
	 parts we will overwrite, but we're more likely to be able to load the
	 constant efficiently if it has fewer, larger, repeating parts
	 (see aarch64_simd_valid_immediate).  */
      for (int i = 0; i < n_elts; i++)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	    continue;
	  /* Substitute a nearby constant lane (found by toggling index
	     bits, nearest first) to preserve repeating structure; fall
	     back to ANY_CONST.  */
	  rtx subst = any_const;
	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
	    {
	      /* Look in the copied vector, as more elements are const.  */
	      rtx test = XVECEXP (copy, 0, i ^ bit);
	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
		{
		  subst = test;
		  break;
		}
	    }
	  XVECEXP (copy, 0, i) = subst;
	}
      /* COPY is now all-constant: recurse to load it, then overwrite
	 the variable lanes below.  */
      aarch64_expand_vector_init (target, copy);
    }

  /* Insert the variable lanes directly.  */
  for (int i = 0; i < n_elts; i++)
    {
      rtx x = XVECEXP (vals, 0, i);
      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	continue;
      x = copy_to_mode_reg (inner_mode, x);
      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
    }
}
14004
/* Return the mask that shift counts are implicitly truncated to
   (presumably the TARGET_SHIFT_TRUNCATION_MASK hook — registration not
   visible in this chunk).  Returns 0 (no known truncation) when
   SHIFT_COUNT_TRUNCATED is disabled or MODE is a vector data mode;
   otherwise counts truncate to the element bit-size.  */
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
14012
14013 /* Select a format to encode pointers in exception handling data. */
14014 int
aarch64_asm_preferred_eh_data_format(int code ATTRIBUTE_UNUSED,int global)14015 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14016 {
14017 int type;
14018 switch (aarch64_cmodel)
14019 {
14020 case AARCH64_CMODEL_TINY:
14021 case AARCH64_CMODEL_TINY_PIC:
14022 case AARCH64_CMODEL_SMALL:
14023 case AARCH64_CMODEL_SMALL_PIC:
14024 case AARCH64_CMODEL_SMALL_SPIC:
14025 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14026 for everything. */
14027 type = DW_EH_PE_sdata4;
14028 break;
14029 default:
14030 /* No assumptions here. 8-byte relocs required. */
14031 type = DW_EH_PE_sdata8;
14032 break;
14033 }
14034 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14035 }
14036
/* The last .arch and .tune assembly strings that we printed.  Tracked
   so that consecutive functions with identical settings don't emit
   redundant directives.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;
14040
/* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
   by the function fndecl.  */

void
aarch64_declare_function_name (FILE *stream, const char* name,
				tree fndecl)
{
  /* Prefer the function's own target attribute/pragma settings;
     otherwise fall back to the current global target options.  */
  tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);

  struct cl_target_option *targ_options;
  if (target_parts)
    targ_options = TREE_TARGET_OPTION (target_parts);
  else
    targ_options = TREE_TARGET_OPTION (target_option_current_node);
  gcc_assert (targ_options);

  const struct processor *this_arch
    = aarch64_get_arch (targ_options->x_explicit_arch);

  unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (isa_flags,
						  this_arch->flags);
  /* Only update the assembler .arch string if it is distinct from the last
     such string we printed.  */
  std::string to_print = this_arch->name + extension;
  if (to_print != aarch64_last_printed_arch_string)
    {
      asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
      aarch64_last_printed_arch_string = to_print;
    }

  /* Print the cpu name we're tuning for in the comments, might be
     useful to readers of the generated asm.  Do it only when it changes
     from function to function and verbose assembly is requested.  */
  const struct processor *this_tune
    = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);

  if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
    {
      asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
		   this_tune->name);
      aarch64_last_printed_tune_string = this_tune->name;
    }

  /* Don't forget the type directive for ELF.  */
  ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
  ASM_OUTPUT_LABEL (stream, name);
}
14090
/* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */

static void
aarch64_start_file (void)
{
  /* Base the initial .arch directive on the default (command-line)
     target options.  */
  struct cl_target_option *default_options
    = TREE_TARGET_OPTION (target_option_default_node);

  const struct processor *default_arch
    = aarch64_get_arch (default_options->x_explicit_arch);
  unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
  std::string extension
    = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
						   default_arch->flags);

  /* Record what was printed so aarch64_declare_function_name can skip
     re-emitting an identical .arch directive later.  */
  aarch64_last_printed_arch_string = default_arch->name + extension;
  aarch64_last_printed_tune_string = "";
  asm_fprintf (asm_out_file, "\t.arch %s\n",
	       aarch64_last_printed_arch_string.c_str ());

  default_file_start ();
}
14113
14114 /* Emit load exclusive. */
14115
14116 static void
aarch64_emit_load_exclusive(machine_mode mode,rtx rval,rtx mem,rtx model_rtx)14117 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14118 rtx mem, rtx model_rtx)
14119 {
14120 rtx (*gen) (rtx, rtx, rtx);
14121
14122 switch (mode)
14123 {
14124 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14125 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14126 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14127 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14128 default:
14129 gcc_unreachable ();
14130 }
14131
14132 emit_insn (gen (rval, mem, model_rtx));
14133 }
14134
14135 /* Emit store exclusive. */
14136
14137 static void
aarch64_emit_store_exclusive(machine_mode mode,rtx bval,rtx rval,rtx mem,rtx model_rtx)14138 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14139 rtx rval, rtx mem, rtx model_rtx)
14140 {
14141 rtx (*gen) (rtx, rtx, rtx, rtx);
14142
14143 switch (mode)
14144 {
14145 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14146 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14147 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14148 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14149 default:
14150 gcc_unreachable ();
14151 }
14152
14153 emit_insn (gen (bval, rval, mem, model_rtx));
14154 }
14155
14156 /* Mark the previous jump instruction as unlikely. */
14157
14158 static void
aarch64_emit_unlikely_jump(rtx insn)14159 aarch64_emit_unlikely_jump (rtx insn)
14160 {
14161 rtx_insn *jump = emit_jump_insn (insn);
14162 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14163 }
14164
/* Expand a compare and swap pattern.  */

void
aarch64_expand_compare_and_swap (rtx operands[])
{
  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
  machine_mode mode, cmp_mode;
  typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
  int idx;
  gen_cas_fn gen;
  /* Expanders for the load/store-exclusive splitting implementation,
     indexed QI/HI/SI/DI.  */
  const gen_cas_fn split_cas[] =
  {
    gen_aarch64_compare_and_swapqi,
    gen_aarch64_compare_and_swaphi,
    gen_aarch64_compare_and_swapsi,
    gen_aarch64_compare_and_swapdi
  };
  /* Expanders for the LSE single-instruction implementation, same
     indexing.  */
  const gen_cas_fn atomic_cas[] =
  {
    gen_aarch64_compare_and_swapqi_lse,
    gen_aarch64_compare_and_swaphi_lse,
    gen_aarch64_compare_and_swapsi_lse,
    gen_aarch64_compare_and_swapdi_lse
  };

  bval = operands[0];		/* Boolean success output.  */
  rval = operands[1];		/* Value loaded from memory.  */
  mem = operands[2];
  oldval = operands[3];		/* Expected value.  */
  newval = operands[4];		/* Desired value.  */
  is_weak = operands[5];
  mod_s = operands[6];		/* Success memory model.  */
  mod_f = operands[7];		/* Failure memory model.  */
  mode = GET_MODE (mem);
  cmp_mode = mode;

  /* Normally the succ memory model must be stronger than fail, but in the
     unlikely event of fail being ACQUIRE and succ being RELEASE we need to
     promote succ to ACQ_REL so that we don't lose the acquire semantics.  */

  if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
      && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
    mod_s = GEN_INT (MEMMODEL_ACQ_REL);

  switch (mode)
    {
    case E_QImode:
    case E_HImode:
      /* For short modes, we're going to perform the comparison in SImode,
	 so do the zero-extension now.  */
      cmp_mode = SImode;
      rval = gen_reg_rtx (SImode);
      oldval = convert_modes (SImode, mode, oldval, true);
      /* Fall through.  */

    case E_SImode:
    case E_DImode:
      /* Force the value into a register if needed.  */
      if (!aarch64_plus_operand (oldval, mode))
	oldval = force_reg (cmp_mode, oldval);
      break;

    default:
      gcc_unreachable ();
    }

  /* Map the access mode to an index into the expander tables above.  */
  switch (mode)
    {
    case E_QImode: idx = 0; break;
    case E_HImode: idx = 1; break;
    case E_SImode: idx = 2; break;
    case E_DImode: idx = 3; break;
    default:
      gcc_unreachable ();
    }
  if (TARGET_LSE)
    gen = atomic_cas[idx];
  else
    gen = split_cas[idx];

  emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));

  /* For short modes the comparison used a fresh SImode register; copy
     its low part back into the caller's result operand.  */
  if (mode == QImode || mode == HImode)
    emit_move_insn (operands[1], gen_lowpart (mode, rval));

  /* Derive the boolean success result from the EQ condition flag.  */
  x = gen_rtx_REG (CCmode, CC_REGNUM);
  x = gen_rtx_EQ (SImode, x, const0_rtx);
  emit_insn (gen_rtx_SET (bval, x));
}
14254
14255 /* Test whether the target supports using a atomic load-operate instruction.
14256 CODE is the operation and AFTER is TRUE if the data in memory after the
14257 operation should be returned and FALSE if the data before the operation
14258 should be returned. Returns FALSE if the operation isn't supported by the
14259 architecture. */
14260
14261 bool
aarch64_atomic_ldop_supported_p(enum rtx_code code)14262 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14263 {
14264 if (!TARGET_LSE)
14265 return false;
14266
14267 switch (code)
14268 {
14269 case SET:
14270 case AND:
14271 case IOR:
14272 case XOR:
14273 case MINUS:
14274 case PLUS:
14275 return true;
14276 default:
14277 return false;
14278 }
14279 }
14280
14281 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14282 sequence implementing an atomic operation. */
14283
14284 static void
aarch64_emit_post_barrier(enum memmodel model)14285 aarch64_emit_post_barrier (enum memmodel model)
14286 {
14287 const enum memmodel base_model = memmodel_base (model);
14288
14289 if (is_mm_sync (model)
14290 && (base_model == MEMMODEL_ACQUIRE
14291 || base_model == MEMMODEL_ACQ_REL
14292 || base_model == MEMMODEL_SEQ_CST))
14293 {
14294 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14295 }
14296 }
14297
14298 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14299 for the data in memory. EXPECTED is the value expected to be in memory.
14300 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14301 is the memory ordering to use. */
14302
14303 void
aarch64_gen_atomic_cas(rtx rval,rtx mem,rtx expected,rtx desired,rtx model)14304 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14305 rtx expected, rtx desired,
14306 rtx model)
14307 {
14308 rtx (*gen) (rtx, rtx, rtx, rtx);
14309 machine_mode mode;
14310
14311 mode = GET_MODE (mem);
14312
14313 switch (mode)
14314 {
14315 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14316 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14317 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14318 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14319 default:
14320 gcc_unreachable ();
14321 }
14322
14323 /* Move the expected value into the CAS destination register. */
14324 emit_insn (gen_rtx_SET (rval, expected));
14325
14326 /* Emit the CAS. */
14327 emit_insn (gen (rval, mem, desired, model));
14328
14329 /* Compare the expected value with the value loaded by the CAS, to establish
14330 whether the swap was made. */
14331 aarch64_gen_compare_reg (EQ, rval, expected);
14332 }
14333
/* Split a compare and swap pattern.  */

void
aarch64_split_compare_and_swap (rtx operands[])
{
  rtx rval, mem, oldval, newval, scratch;
  machine_mode mode;
  bool is_weak;
  rtx_code_label *label1, *label2;
  rtx x, cond;
  enum memmodel model;
  rtx model_rtx;

  rval = operands[0];
  mem = operands[1];
  oldval = operands[2];
  newval = operands[3];
  is_weak = (operands[4] != const0_rtx);
  model_rtx = operands[5];
  /* operands[6] is not used here; operands[7] is the scratch register
     that receives the store-exclusive status result.  */
  scratch = operands[7];
  mode = GET_MODE (mem);
  model = memmodel_from_int (INTVAL (model_rtx));

  /* When OLDVAL is zero and we want the strong version we can emit a tighter
     loop:
     .label1:
	LD[A]XR	rval, [mem]
	CBNZ	rval, .label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
     .label2:
	CMP	rval, 0.  */
  bool strong_zero_p = !is_weak && oldval == const0_rtx;

  label1 = NULL;
  if (!is_weak)
    {
      /* A strong CAS retries the exchange from here if the
	 store-exclusive fails.  */
      label1 = gen_label_rtx ();
      emit_label (label1);
    }
  label2 = gen_label_rtx ();

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_mm_sync (model))
    aarch64_emit_load_exclusive (mode, rval, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);

  if (strong_zero_p)
    {
      /* Compare RVAL directly against zero (becomes CBNZ) rather than
	 going through the condition flags.  */
      x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }
  else
    {
      /* Skip the store if the loaded value doesn't match OLDVAL.  */
      cond = aarch64_gen_compare_reg (NE, rval, oldval);
      x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }

  aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);

  if (!is_weak)
    {
      /* Loop back to LABEL1 if the store-exclusive failed
	 (SCRATCH non-zero).  */
      x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
      aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
    }
  else
    {
      /* A weak CAS doesn't loop; expose the store-exclusive status in
	 the condition flags instead.  */
      cond = gen_rtx_REG (CCmode, CC_REGNUM);
      x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
      emit_insn (gen_rtx_SET (cond, x));
    }

  emit_label (label2);
  /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
     to set the condition flags.  If this is not used it will be removed by
     later passes.  */
  if (strong_zero_p)
    {
      cond = gen_rtx_REG (CCmode, CC_REGNUM);
      x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
      emit_insn (gen_rtx_SET (cond, x));
    }
  /* Emit any final barrier needed for a __sync operation.  */
  if (is_mm_sync (model))
    aarch64_emit_post_barrier (model);
}
14430
14431 /* Emit a BIC instruction. */
14432
14433 static void
aarch64_emit_bic(machine_mode mode,rtx dst,rtx s1,rtx s2,int shift)14434 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14435 {
14436 rtx shift_rtx = GEN_INT (shift);
14437 rtx (*gen) (rtx, rtx, rtx, rtx);
14438
14439 switch (mode)
14440 {
14441 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14442 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14443 default:
14444 gcc_unreachable ();
14445 }
14446
14447 emit_insn (gen (dst, s2, shift_rtx, s1));
14448 }
14449
14450 /* Emit an atomic swap. */
14451
14452 static void
aarch64_emit_atomic_swap(machine_mode mode,rtx dst,rtx value,rtx mem,rtx model)14453 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14454 rtx mem, rtx model)
14455 {
14456 rtx (*gen) (rtx, rtx, rtx, rtx);
14457
14458 switch (mode)
14459 {
14460 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14461 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14462 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14463 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14464 default:
14465 gcc_unreachable ();
14466 }
14467
14468 emit_insn (gen (dst, mem, value, model));
14469 }
14470
/* Operations supported by aarch64_emit_atomic_load_op.  Each value
   selects one family of load-operate expanders (loadadd, loadeor,
   loadset, loadclr) in that function's dispatch tables.  */

enum aarch64_atomic_load_op_code
{
  AARCH64_LDOP_PLUS,	/* A + B.  */
  AARCH64_LDOP_XOR,	/* A ^ B.  */
  AARCH64_LDOP_OR,	/* A | B.  */
  AARCH64_LDOP_BIC	/* A & ~B.  */
};
14480
14481 /* Emit an atomic load-operate. */
14482
14483 static void
aarch64_emit_atomic_load_op(enum aarch64_atomic_load_op_code code,machine_mode mode,rtx dst,rtx src,rtx mem,rtx model)14484 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14485 machine_mode mode, rtx dst, rtx src,
14486 rtx mem, rtx model)
14487 {
14488 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14489 const aarch64_atomic_load_op_fn plus[] =
14490 {
14491 gen_aarch64_atomic_loadaddqi,
14492 gen_aarch64_atomic_loadaddhi,
14493 gen_aarch64_atomic_loadaddsi,
14494 gen_aarch64_atomic_loadadddi
14495 };
14496 const aarch64_atomic_load_op_fn eor[] =
14497 {
14498 gen_aarch64_atomic_loadeorqi,
14499 gen_aarch64_atomic_loadeorhi,
14500 gen_aarch64_atomic_loadeorsi,
14501 gen_aarch64_atomic_loadeordi
14502 };
14503 const aarch64_atomic_load_op_fn ior[] =
14504 {
14505 gen_aarch64_atomic_loadsetqi,
14506 gen_aarch64_atomic_loadsethi,
14507 gen_aarch64_atomic_loadsetsi,
14508 gen_aarch64_atomic_loadsetdi
14509 };
14510 const aarch64_atomic_load_op_fn bic[] =
14511 {
14512 gen_aarch64_atomic_loadclrqi,
14513 gen_aarch64_atomic_loadclrhi,
14514 gen_aarch64_atomic_loadclrsi,
14515 gen_aarch64_atomic_loadclrdi
14516 };
14517 aarch64_atomic_load_op_fn gen;
14518 int idx = 0;
14519
14520 switch (mode)
14521 {
14522 case E_QImode: idx = 0; break;
14523 case E_HImode: idx = 1; break;
14524 case E_SImode: idx = 2; break;
14525 case E_DImode: idx = 3; break;
14526 default:
14527 gcc_unreachable ();
14528 }
14529
14530 switch (code)
14531 {
14532 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14533 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14534 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14535 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14536 default:
14537 gcc_unreachable ();
14538 }
14539
14540 emit_insn (gen (dst, mem, src, model));
14541 }
14542
/* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
   location to store the data read from memory.  OUT_RESULT is the location to
   store the result of the operation.  MEM is the memory location to read and
   modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
   operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
   be NULL.  */

void
aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
			 rtx mem, rtx value, rtx model_rtx)
{
  machine_mode mode = GET_MODE (mem);
  /* Sub-word values are manipulated in SImode.  */
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const bool short_mode = (mode < SImode);
  aarch64_atomic_load_op_code ldop_code;
  rtx src;
  rtx x;

  if (out_data)
    out_data = gen_lowpart (mode, out_data);

  if (out_result)
    out_result = gen_lowpart (mode, out_result);

  /* Make sure the value is in a register, putting it into a destination
     register if it needs to be manipulated.  */
  if (!register_operand (value, mode)
      || code == AND || code == MINUS)
    {
      src = out_result ? out_result : out_data;
      emit_move_insn (src, gen_lowpart (mode, value));
    }
  else
    src = value;
  gcc_assert (register_operand (src, mode));

  /* Preprocess the data for the operation as necessary.  If the operation is
     a SET then emit a swap instruction and finish.  */
  switch (code)
    {
    case SET:
      aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
      return;

    case MINUS:
      /* Negate the value and treat it as a PLUS.  */
      {
	rtx neg_src;

	/* Resize the value if necessary.  */
	if (short_mode)
	  src = gen_lowpart (wmode, src);

	neg_src = gen_rtx_NEG (wmode, src);
	emit_insn (gen_rtx_SET (src, neg_src));

	if (short_mode)
	  src = gen_lowpart (mode, src);
      }
      /* Fall-through.  */
    case PLUS:
      ldop_code = AARCH64_LDOP_PLUS;
      break;

    case IOR:
      ldop_code = AARCH64_LDOP_OR;
      break;

    case XOR:
      ldop_code = AARCH64_LDOP_XOR;
      break;

    case AND:
      /* Complement SRC in place so that A AND B can be computed as
	 A BIC (~B).  */
      {
	rtx not_src;

	/* Resize the value if necessary.  */
	if (short_mode)
	  src = gen_lowpart (wmode, src);

	not_src = gen_rtx_NOT (wmode, src);
	emit_insn (gen_rtx_SET (src, not_src));

	if (short_mode)
	  src = gen_lowpart (mode, src);
      }
      ldop_code = AARCH64_LDOP_BIC;
      break;

    default:
      /* The operation can't be done with atomic instructions.  */
      gcc_unreachable ();
    }

  aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);

  /* If necessary, calculate the data in memory after the update by redoing the
     operation from values in registers.  */
  if (!out_result)
    return;

  if (short_mode)
    {
      src = gen_lowpart (wmode, src);
      out_data = gen_lowpart (wmode, out_data);
      out_result = gen_lowpart (wmode, out_result);
    }

  x = NULL_RTX;

  switch (code)
    {
    case MINUS:
      /* SRC already holds the negated value (see above), so a PLUS
	 recomputes the subtraction.  */
    case PLUS:
      x = gen_rtx_PLUS (wmode, out_data, src);
      break;
    case IOR:
      x = gen_rtx_IOR (wmode, out_data, src);
      break;
    case XOR:
      x = gen_rtx_XOR (wmode, out_data, src);
      break;
    case AND:
      /* SRC already holds the complemented value, so BIC recomputes
	 the AND.  */
      aarch64_emit_bic (wmode, out_result, out_data, src, 0);
      return;
    default:
      gcc_unreachable ();
    }

  emit_set_insn (out_result, x);

  return;
}
14676
/* Split an atomic operation.  */

void
aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
			 rtx value, rtx model_rtx, rtx cond)
{
  machine_mode mode = GET_MODE (mem);
  /* Sub-word arithmetic is performed in SImode.  */
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
  const bool is_sync = is_mm_sync (model);
  rtx_code_label *label;
  rtx x;

  /* Split the atomic operation into a sequence.  */
  label = gen_label_rtx ();
  emit_label (label);

  if (new_out)
    new_out = gen_lowpart (wmode, new_out);
  if (old_out)
    old_out = gen_lowpart (wmode, old_out);
  else
    /* The caller doesn't need the old value; load into NEW_OUT and
       operate on it in place.  */
    old_out = new_out;
  value = simplify_gen_subreg (wmode, value, mode, 0);

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
    aarch64_emit_load_exclusive (mode, old_out, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);

  switch (code)
    {
    case SET:
      new_out = value;
      break;

    case NOT:
      /* Computes ~(OLD_OUT & VALUE), i.e. a NAND.  */
      x = gen_rtx_AND (wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      x = gen_rtx_NOT (wmode, new_out);
      emit_insn (gen_rtx_SET (new_out, x));
      break;

    case MINUS:
      if (CONST_INT_P (value))
	{
	  /* Subtracting a constant folds into adding its negation.  */
	  value = GEN_INT (-INTVAL (value));
	  code = PLUS;
	}
      /* Fall through.  */

    default:
      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      break;
    }

  aarch64_emit_store_exclusive (mode, cond, mem,
				gen_lowpart (mode, new_out), model_rtx);

  /* Loop back if the store-exclusive failed (COND non-zero).  */
  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_sync)
    aarch64_emit_post_barrier (model);
}
14749
14750 static void
aarch64_init_libfuncs(void)14751 aarch64_init_libfuncs (void)
14752 {
14753 /* Half-precision float operations. The compiler handles all operations
14754 with NULL libfuncs by converting to SFmode. */
14755
14756 /* Conversions. */
14757 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14758 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14759
14760 /* Arithmetic. */
14761 set_optab_libfunc (add_optab, HFmode, NULL);
14762 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14763 set_optab_libfunc (smul_optab, HFmode, NULL);
14764 set_optab_libfunc (neg_optab, HFmode, NULL);
14765 set_optab_libfunc (sub_optab, HFmode, NULL);
14766
14767 /* Comparisons. */
14768 set_optab_libfunc (eq_optab, HFmode, NULL);
14769 set_optab_libfunc (ne_optab, HFmode, NULL);
14770 set_optab_libfunc (lt_optab, HFmode, NULL);
14771 set_optab_libfunc (le_optab, HFmode, NULL);
14772 set_optab_libfunc (ge_optab, HFmode, NULL);
14773 set_optab_libfunc (gt_optab, HFmode, NULL);
14774 set_optab_libfunc (unord_optab, HFmode, NULL);
14775 }
14776
14777 /* Target hook for c_mode_for_suffix. */
14778 static machine_mode
aarch64_c_mode_for_suffix(char suffix)14779 aarch64_c_mode_for_suffix (char suffix)
14780 {
14781 if (suffix == 'q')
14782 return TFmode;
14783
14784 return VOIDmode;
14785 }
14786
/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

   (-1)^s * (n/16) * 2^r

   Where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */

/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand X.  Note, we cannot represent 0.0.  */
bool
aarch64_float_const_representable_p (rtx x)
{
  /* This represents our current view of how many bits
     make up the mantissa.  */
  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
  int exponent;
  unsigned HOST_WIDE_INT mantissa, mask;
  REAL_VALUE_TYPE r, m;
  bool fail;

  /* Only CONST_DOUBLE rtxes can be floating-point immediates.  */
  if (!CONST_DOUBLE_P (x))
    return false;

  /* We don't support HFmode constants yet.  */
  if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (x);

  /* We cannot represent infinities, NaNs or +/-zero.  We won't
     know if we have +zero until we analyse the mantissa, but we
     can reject the other invalid values.  */
  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
      || REAL_VALUE_MINUS_ZERO (r))
    return false;

  /* Extract exponent.  */
  r = real_value_abs (&r);
  exponent = REAL_EXP (&r);

  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
     highest (sign) bit, with a fixed binary point at bit point_pos.
     m1 holds the low part of the mantissa, m2 the high part.
     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
     bits for the mantissa, this can fail (low bits will be lost).  */
  real_ldexp (&m, &r, point_pos - exponent);
  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);

  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
  /* We have rejected the lower HOST_WIDE_INT, so update our
     understanding of how many bits lie in the mantissa and
     look only at the high HOST_WIDE_INT.  */
  mantissa = w.elt (1);
  point_pos -= HOST_BITS_PER_WIDE_INT;

  /* We can only represent values with a mantissa of the form 1.xxxx.
     Any nonzero bits below the top five indicate a value needing more
     mantissa precision than the encoding provides.  */
  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
  if ((mantissa & mask) != 0)
    return false;

  /* Having filtered unrepresentable values, we may now remove all
     but the highest 5 bits.  */
  mantissa >>= point_pos - 5;

  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);

  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
     Our mantissa values are shifted 4 places to the left relative to
     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
     by 5 places to correct for GCC's representation.  */
  exponent = 5 - exponent;

  /* The encoding's 3-bit exponent field covers 0 <= e <= 7.  */
  return (exponent >= 0 && exponent <= 7);
}
14878
/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
   immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
   output MOVI/MVNI, ORR or BIC immediate.  The returned string lives in a
   static buffer that is overwritten on each call.  */
char*
aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
				   enum simd_immediate_check which)
{
  bool is_valid;
  static char templ[40];
  const char *mnemonic;
  const char *shift_op;
  unsigned int lane_count = 0;
  char element_char;

  struct simd_immediate_info info;

  /* This will return true to show const_vector is legal for use as either
     a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
     It will also update INFO to show how the immediate should be generated.
     WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
  is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
  gcc_assert (is_valid);

  element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
  lane_count = width / GET_MODE_BITSIZE (info.elt_mode);

  if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
    {
      gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
      /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
	 move immediate path.  */
      if (aarch64_float_const_zero_rtx_p (info.value))
	info.value = GEN_INT (0);
      else
	{
	  /* Non-zero FP constant: print it in decimal and emit an FMOV.  */
	  const unsigned int buf_size = 20;
	  char float_buf[buf_size] = {'\0'};
	  real_to_decimal_for_mode (float_buf,
				    CONST_DOUBLE_REAL_VALUE (info.value),
				    buf_size, buf_size, 1, info.elt_mode);

	  if (lane_count == 1)
	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
	  else
	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
		      lane_count, element_char, float_buf);
	  return templ;
	}
    }

  gcc_assert (CONST_INT_P (info.value));

  if (which == AARCH64_CHECK_MOV)
    {
      mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
      shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
      if (lane_count == 1)
	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
		  mnemonic, UINTVAL (info.value));
      else if (info.shift)
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
		  element_char, UINTVAL (info.value), shift_op, info.shift);
      else
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
		  element_char, UINTVAL (info.value));
    }
  else
    {
      /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
      mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
      if (info.shift)
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
		  element_char, UINTVAL (info.value), "lsl", info.shift);
      else
	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
		  element_char, UINTVAL (info.value));
    }
  return templ;
}
14962
14963 char*
aarch64_output_scalar_simd_mov_immediate(rtx immediate,scalar_int_mode mode)14964 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14965 {
14966
14967 /* If a floating point number was passed and we desire to use it in an
14968 integer mode do the conversion to integer. */
14969 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14970 {
14971 unsigned HOST_WIDE_INT ival;
14972 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14973 gcc_unreachable ();
14974 immediate = gen_int_mode (ival, mode);
14975 }
14976
14977 machine_mode vmode;
14978 /* use a 64 bit mode for everything except for DI/DF mode, where we use
14979 a 128 bit vector mode. */
14980 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14981
14982 vmode = aarch64_simd_container_mode (mode, width);
14983 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14984 return aarch64_output_simd_mov_immediate (v_op, width);
14985 }
14986
14987 /* Return the output string to use for moving immediate CONST_VECTOR
14988 into an SVE register. */
14989
14990 char *
aarch64_output_sve_mov_immediate(rtx const_vector)14991 aarch64_output_sve_mov_immediate (rtx const_vector)
14992 {
14993 static char templ[40];
14994 struct simd_immediate_info info;
14995 char element_char;
14996
14997 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14998 gcc_assert (is_valid);
14999
15000 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15001
15002 if (info.step)
15003 {
15004 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15005 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15006 element_char, INTVAL (info.value), INTVAL (info.step));
15007 return templ;
15008 }
15009
15010 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15011 {
15012 if (aarch64_float_const_zero_rtx_p (info.value))
15013 info.value = GEN_INT (0);
15014 else
15015 {
15016 const int buf_size = 20;
15017 char float_buf[buf_size] = {};
15018 real_to_decimal_for_mode (float_buf,
15019 CONST_DOUBLE_REAL_VALUE (info.value),
15020 buf_size, buf_size, 1, info.elt_mode);
15021
15022 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15023 element_char, float_buf);
15024 return templ;
15025 }
15026 }
15027
15028 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15029 element_char, INTVAL (info.value));
15030 return templ;
15031 }
15032
15033 /* Return the asm format for a PTRUE instruction whose destination has
15034 mode MODE. SUFFIX is the element size suffix. */
15035
15036 char *
aarch64_output_ptrue(machine_mode mode,char suffix)15037 aarch64_output_ptrue (machine_mode mode, char suffix)
15038 {
15039 unsigned int nunits;
15040 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15041 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15042 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15043 else
15044 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15045 return buf;
15046 }
15047
/* Split operands into moves from op[1] + op[2] into op[0], where op[0]
   is a register pair holding the concatenation of the two V16QI halves.  */

void
aarch64_split_combinev16qi (rtx operands[3])
{
  unsigned int dest = REGNO (operands[0]);
  unsigned int src1 = REGNO (operands[1]);
  unsigned int src2 = REGNO (operands[2]);
  machine_mode halfmode = GET_MODE (operands[1]);
  unsigned int halfregs = REG_NREGS (operands[1]);
  rtx destlo, desthi;

  gcc_assert (halfmode == V16QImode);

  if (src1 == dest && src2 == dest + halfregs)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Preserve register attributes for variable tracking.  */
  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
			       GET_MODE_SIZE (halfmode));

  /* Special case of reversed high/low parts.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      /* Both destinations clobber a source: swap the two inputs with a
	 three-XOR sequence, which needs no temporary register.  */
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
	 is in the right place already.  */
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
    }
  else
    {
      /* operands[2] overlaps destlo, so write desthi first to avoid
	 clobbering the second source before it is used.  */
      if (src2 != dest + halfregs)
	emit_move_insn (desthi, operands[2]);
      if (src1 != dest)
	emit_move_insn (destlo, operands[1]);
    }
}
15099
/* vec_perm support.  */

/* Describes a constant permutation being expanded: the destination,
   the two input operands and the selector indices.  */
struct expand_vec_perm_d
{
  /* Destination register and the two input vectors.  */
  rtx target, op0, op1;
  /* The permutation indices to apply.  */
  vec_perm_indices perm;
  /* Mode of the vectors being permuted.  */
  machine_mode vmode;
  /* Classification of VMODE, from aarch64_classify_vector_mode.  */
  unsigned int vec_flags;
  /* True if both inputs are (treated as) the same vector.  */
  bool one_vector_p;
  /* True if we should only test whether the permutation is supported,
     without emitting any instructions.  */
  bool testing_p;
};
15111
15112 /* Generate a variable permutation. */
15113
15114 static void
aarch64_expand_vec_perm_1(rtx target,rtx op0,rtx op1,rtx sel)15115 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15116 {
15117 machine_mode vmode = GET_MODE (target);
15118 bool one_vector_p = rtx_equal_p (op0, op1);
15119
15120 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15121 gcc_checking_assert (GET_MODE (op0) == vmode);
15122 gcc_checking_assert (GET_MODE (op1) == vmode);
15123 gcc_checking_assert (GET_MODE (sel) == vmode);
15124 gcc_checking_assert (TARGET_SIMD);
15125
15126 if (one_vector_p)
15127 {
15128 if (vmode == V8QImode)
15129 {
15130 /* Expand the argument to a V16QI mode by duplicating it. */
15131 rtx pair = gen_reg_rtx (V16QImode);
15132 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15133 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15134 }
15135 else
15136 {
15137 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15138 }
15139 }
15140 else
15141 {
15142 rtx pair;
15143
15144 if (vmode == V8QImode)
15145 {
15146 pair = gen_reg_rtx (V16QImode);
15147 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15148 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15149 }
15150 else
15151 {
15152 pair = gen_reg_rtx (OImode);
15153 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15154 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15155 }
15156 }
15157 }
15158
15159 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15160 NELT is the number of elements in the vector. */
15161
15162 void
aarch64_expand_vec_perm(rtx target,rtx op0,rtx op1,rtx sel,unsigned int nelt)15163 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15164 unsigned int nelt)
15165 {
15166 machine_mode vmode = GET_MODE (target);
15167 bool one_vector_p = rtx_equal_p (op0, op1);
15168 rtx mask;
15169
15170 /* The TBL instruction does not use a modulo index, so we must take care
15171 of that ourselves. */
15172 mask = aarch64_simd_gen_const_vector_dup (vmode,
15173 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15174 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15175
15176 /* For big-endian, we also need to reverse the index within the vector
15177 (but not which vector). */
15178 if (BYTES_BIG_ENDIAN)
15179 {
15180 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15181 if (!one_vector_p)
15182 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15183 sel = expand_simple_binop (vmode, XOR, sel, mask,
15184 NULL, 0, OPTAB_LIB_WIDEN);
15185 }
15186 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15187 }
15188
15189 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15190
15191 static void
emit_unspec2(rtx target,int code,rtx op0,rtx op1)15192 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15193 {
15194 emit_insn (gen_rtx_SET (target,
15195 gen_rtx_UNSPEC (GET_MODE (target),
15196 gen_rtvec (2, op0, op1), code)));
15197 }
15198
/* Expand an SVE vec_perm with the given operands: permute OP0/OP1 into
   TARGET according to the element indices in SEL.  */

void
aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
  machine_mode data_mode = GET_MODE (target);
  machine_mode sel_mode = GET_MODE (sel);
  /* Enforced by the pattern condition.  */
  int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();

  /* Note: vec_perm indices are supposed to wrap when they go beyond the
     size of the two value vectors, i.e. the upper bits of the indices
     are effectively ignored.  SVE TBL instead produces 0 for any
     out-of-range indices, so we need to modulo all the vec_perm indices
     to ensure they are all in range.  */
  rtx sel_reg = force_reg (sel_mode, sel);

  /* Check if the sel only references the first values vector.  If so a
     single TBL with the original selector suffices.  */
  if (GET_CODE (sel) == CONST_VECTOR
      && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
    {
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
      return;
    }

  /* Check if the two values vectors are the same: then a single TBL on
     OP0 with the indices reduced modulo NUNITS is enough.  */
  if (rtx_equal_p (op0, op1))
    {
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
					 NULL, 0, OPTAB_DIRECT);
      emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
      return;
    }

  /* Run TBL on for each value vector and combine the results.  Each TBL
     yields zero for lanes whose index targets the other vector, so the
     two partial results can be merged with an OR.  */

  rtx res0 = gen_reg_rtx (data_mode);
  rtx res1 = gen_reg_rtx (data_mode);
  rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
  if (GET_CODE (sel) != CONST_VECTOR
      || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
    {
      /* Indices may exceed 2*NUNITS-1: reduce them modulo 2*NUNITS
	 first so the wrap semantics of vec_perm are respected.  */
      rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
						       2 * nunits - 1);
      sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
				     NULL, 0, OPTAB_DIRECT);
    }
  emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
  /* Shift the selector down by NUNITS for the second vector, so its
     lanes land in range while first-vector lanes go out of range.  */
  rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
				     NULL, 0, OPTAB_DIRECT);
  emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
  if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
    emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
  else
    /* Float vectors have no IOR pattern; use the unspec equivalent.  */
    emit_unspec2 (target, UNSPEC_IORF, res0, res1);
}
15256
/* Recognize patterns suitable for the TRN instructions.  Return true
   (and emit the instruction unless D->testing_p) if D matches.  */
static bool
aarch64_evpc_trn (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT odd;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  /* TRN operates on element sizes up to 64 bits.  */
  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.
     TRN selects { odd, nelt+odd, odd+2, nelt+odd+2, ... }.  */
  if (!d->perm[0].is_constant (&odd)
      || (odd != 0 && odd != 1)
      || !d->perm.series_p (0, 2, odd, 2)
      || !d->perm.series_p (1, 2, nelt + odd, 2))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      odd = !odd;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
  return true;
}
15296
15297 /* Recognize patterns suitable for the UZP instructions. */
15298 static bool
aarch64_evpc_uzp(struct expand_vec_perm_d * d)15299 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15300 {
15301 HOST_WIDE_INT odd;
15302 rtx out, in0, in1, x;
15303 machine_mode vmode = d->vmode;
15304
15305 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15306 return false;
15307
15308 /* Note that these are little-endian tests.
15309 We correct for big-endian later. */
15310 if (!d->perm[0].is_constant (&odd)
15311 || (odd != 0 && odd != 1)
15312 || !d->perm.series_p (0, 1, odd, 2))
15313 return false;
15314
15315 /* Success! */
15316 if (d->testing_p)
15317 return true;
15318
15319 in0 = d->op0;
15320 in1 = d->op1;
15321 /* We don't need a big-endian lane correction for SVE; see the comment
15322 at the head of aarch64-sve.md for details. */
15323 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15324 {
15325 x = in0, in0 = in1, in1 = x;
15326 odd = !odd;
15327 }
15328 out = d->target;
15329
15330 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15331 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15332 return true;
15333 }
15334
/* Recognize patterns suitable for the ZIP instructions.  Return true
   (and emit the instruction unless D->testing_p) if D matches.  */
static bool
aarch64_evpc_zip (struct expand_vec_perm_d *d)
{
  unsigned int high;
  poly_uint64 nelt = d->perm.length ();
  rtx out, in0, in1, x;
  machine_mode vmode = d->vmode;

  /* ZIP operates on element sizes up to 64 bits.  */
  if (GET_MODE_UNIT_SIZE (vmode) > 8)
    return false;

  /* Note that these are little-endian tests.
     We correct for big-endian later.
     ZIP interleaves from element 0 (ZIP1) or nelt/2 (ZIP2):
     { first, nelt+first, first+1, nelt+first+1, ... }.  */
  poly_uint64 first = d->perm[0];
  if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
      || !d->perm.series_p (0, 2, first, 1)
      || !d->perm.series_p (1, 2, first + nelt, 1))
    return false;
  high = maybe_ne (first, 0U);

  /* Success!  */
  if (d->testing_p)
    return true;

  in0 = d->op0;
  in1 = d->op1;
  /* We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
    {
      x = in0, in0 = in1, in1 = x;
      high = !high;
    }
  out = d->target;

  emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
  return true;
}
15375
/* Recognize patterns for the EXT insn, which extracts a contiguous run
   of elements starting partway through the (concatenated) inputs.  */

static bool
aarch64_evpc_ext (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT location;
  rtx offset;

  /* The first element always refers to the first vector.
     Check if the extracted indices are increasing by one.  */
  if (d->vec_flags == VEC_SVE_PRED
      || !d->perm[0].is_constant (&location)
      || !d->perm.series_p (0, 1, location, 1))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The case where (location == 0) is a no-op for both big- and little-endian,
     and is removed by the mid-end at optimization levels -O1 and higher.

     We don't need a big-endian lane correction for SVE; see the comment
     at the head of aarch64-sve.md for details.  */
  if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
    {
      /* After setup, we want the high elements of the first vector (stored
	 at the LSB end of the register), and the low elements of the second
	 vector (stored at the MSB end of the register).  So swap.  */
      std::swap (d->op0, d->op1);
      /* location != 0 (above), so safe to assume (nelt - location) < nelt.
	 to_constant () is safe since this is restricted to Advanced SIMD
	 vectors.  */
      location = d->perm.length ().to_constant () - location;
    }

  offset = GEN_INT (location);
  emit_set_insn (d->target,
		 gen_rtx_UNSPEC (d->vmode,
				 gen_rtvec (3, d->op0, d->op1, offset),
				 UNSPEC_EXT));
  return true;
}
15419
/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
   within each 64-bit, 32-bit or 16-bit granule.  */

static bool
aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
{
  HOST_WIDE_INT diff;
  unsigned int i, size, unspec;
  machine_mode pred_mode;

  if (d->vec_flags == VEC_SVE_PRED
      || !d->one_vector_p
      || !d->perm[0].is_constant (&diff))
    return false;

  /* The first index DIFF determines the granule size in bytes: the
     permutation must reverse (DIFF + 1)-element groups.  */
  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
  if (size == 8)
    {
      unspec = UNSPEC_REV64;
      pred_mode = VNx2BImode;
    }
  else if (size == 4)
    {
      unspec = UNSPEC_REV32;
      pred_mode = VNx4BImode;
    }
  else if (size == 2)
    {
      unspec = UNSPEC_REV16;
      pred_mode = VNx8BImode;
    }
  else
    return false;

  /* Verify every group of STEP elements is fully reversed.  */
  unsigned int step = diff + 1;
  for (i = 0; i < step; ++i)
    if (!d->perm.series_p (i, step, diff - i, step))
      return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
  if (d->vec_flags == VEC_SVE_DATA)
    {
      /* SVE needs an all-true governing predicate of the matching
	 element width.  */
      rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
      src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
			    UNSPEC_MERGE_PTRUE);
    }
  emit_set_insn (d->target, src);
  return true;
}
15473
15474 /* Recognize patterns for the REV insn, which reverses elements within
15475 a full vector. */
15476
15477 static bool
aarch64_evpc_rev_global(struct expand_vec_perm_d * d)15478 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15479 {
15480 poly_uint64 nelt = d->perm.length ();
15481
15482 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15483 return false;
15484
15485 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15486 return false;
15487
15488 /* Success! */
15489 if (d->testing_p)
15490 return true;
15491
15492 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15493 emit_set_insn (d->target, src);
15494 return true;
15495 }
15496
/* Recognize broadcast permutations (all indices equal) and emit a DUP
   of the selected lane, unless D->testing_p.  */
static bool
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
  rtx out = d->target;
  rtx in0;
  HOST_WIDE_INT elt;
  machine_mode vmode = d->vmode;
  rtx lane;

  /* An encoding with a single encoded element means every index is the
     same, i.e. a duplicate of one lane.  */
  if (d->vec_flags == VEC_SVE_PRED
      || d->perm.encoding ().encoded_nelts () != 1
      || !d->perm[0].is_constant (&elt))
    return false;

  /* Lane index range limit for the SVE DUP form.  */
  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
    return false;

  /* Success!  */
  if (d->testing_p)
    return true;

  /* The generic preparation in aarch64_expand_vec_perm_const_1
     swaps the operand order and the permute indices if it finds
     d->perm[0] to be in the second operand.  Thus, we can always
     use d->op0 and need not do any extra arithmetic to get the
     correct lane number.  */
  in0 = d->op0;
  lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */

  rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
  rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
  emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
  return true;
}
15531
/* Implement D as an Advanced SIMD TBL with a constant selector vector,
   unless D->testing_p.  Returns false if TBL cannot be used.  */
static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{
  rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
  machine_mode vmode = d->vmode;

  /* Make sure that the indices are constant.  */
  unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
  for (unsigned int i = 0; i < encoded_nelts; ++i)
    if (!d->perm[i].is_constant ())
      return false;

  if (d->testing_p)
    return true;

  /* Generic code will try constant permutation twice.  Once with the
     original mode and again with the elements lowered to QImode.
     So wait and don't do the selector expansion ourselves.  */
  if (vmode != V8QImode && vmode != V16QImode)
    return false;

  /* to_constant is safe since this routine is specific to Advanced SIMD
     vectors.  */
  unsigned int nelt = d->perm.length ().to_constant ();
  for (unsigned int i = 0; i < nelt; ++i)
    /* If big-endian and two vectors we end up with a weird mixed-endian
       mode on NEON.  Reverse the index within each word but not the word
       itself.  to_constant is safe because we checked is_constant above.  */
    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
			? d->perm[i].to_constant () ^ (nelt - 1)
			: d->perm[i].to_constant ());

  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
  sel = force_reg (vmode, sel);

  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
  return true;
}
15570
15571 /* Try to implement D using an SVE TBL instruction. */
15572
15573 static bool
aarch64_evpc_sve_tbl(struct expand_vec_perm_d * d)15574 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15575 {
15576 unsigned HOST_WIDE_INT nelt;
15577
15578 /* Permuting two variable-length vectors could overflow the
15579 index range. */
15580 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15581 return false;
15582
15583 if (d->testing_p)
15584 return true;
15585
15586 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15587 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15588 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15589 return true;
15590 }
15591
15592 static bool
aarch64_expand_vec_perm_const_1(struct expand_vec_perm_d * d)15593 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15594 {
15595 /* The pattern matching functions above are written to look for a small
15596 number to begin the sequence (0, 1, N/2). If we begin with an index
15597 from the second operand, we can swap the operands. */
15598 poly_int64 nelt = d->perm.length ();
15599 if (known_ge (d->perm[0], nelt))
15600 {
15601 d->perm.rotate_inputs (1);
15602 std::swap (d->op0, d->op1);
15603 }
15604
15605 if ((d->vec_flags == VEC_ADVSIMD
15606 || d->vec_flags == VEC_SVE_DATA
15607 || d->vec_flags == VEC_SVE_PRED)
15608 && known_gt (nelt, 1))
15609 {
15610 if (aarch64_evpc_rev_local (d))
15611 return true;
15612 else if (aarch64_evpc_rev_global (d))
15613 return true;
15614 else if (aarch64_evpc_ext (d))
15615 return true;
15616 else if (aarch64_evpc_dup (d))
15617 return true;
15618 else if (aarch64_evpc_zip (d))
15619 return true;
15620 else if (aarch64_evpc_uzp (d))
15621 return true;
15622 else if (aarch64_evpc_trn (d))
15623 return true;
15624 if (d->vec_flags == VEC_SVE_DATA)
15625 return aarch64_evpc_sve_tbl (d);
15626 else if (d->vec_flags == VEC_SVE_DATA)
15627 return aarch64_evpc_tbl (d);
15628 }
15629 return false;
15630 }
15631
15632 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15633
15634 static bool
aarch64_vectorize_vec_perm_const(machine_mode vmode,rtx target,rtx op0,rtx op1,const vec_perm_indices & sel)15635 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15636 rtx op1, const vec_perm_indices &sel)
15637 {
15638 struct expand_vec_perm_d d;
15639
15640 /* Check whether the mask can be applied to a single vector. */
15641 if (op0 && rtx_equal_p (op0, op1))
15642 d.one_vector_p = true;
15643 else if (sel.all_from_input_p (0))
15644 {
15645 d.one_vector_p = true;
15646 op1 = op0;
15647 }
15648 else if (sel.all_from_input_p (1))
15649 {
15650 d.one_vector_p = true;
15651 op0 = op1;
15652 }
15653 else
15654 d.one_vector_p = false;
15655
15656 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15657 sel.nelts_per_input ());
15658 d.vmode = vmode;
15659 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15660 d.target = target;
15661 d.op0 = op0;
15662 d.op1 = op1;
15663 d.testing_p = !target;
15664
15665 if (!d.testing_p)
15666 return aarch64_expand_vec_perm_const_1 (&d);
15667
15668 rtx_insn *last = get_last_insn ();
15669 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15670 gcc_assert (last == get_last_insn ());
15671
15672 return ret;
15673 }
15674
15675 /* Generate a byte permute mask for a register of mode MODE,
15676 which has NUNITS units. */
15677
15678 rtx
aarch64_reverse_mask(machine_mode mode,unsigned int nunits)15679 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15680 {
15681 /* We have to reverse each vector because we dont have
15682 a permuted load that can reverse-load according to ABI rules. */
15683 rtx mask;
15684 rtvec v = rtvec_alloc (16);
15685 unsigned int i, j;
15686 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15687
15688 gcc_assert (BYTES_BIG_ENDIAN);
15689 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15690
15691 for (i = 0; i < nunits; i++)
15692 for (j = 0; j < usize; j++)
15693 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15694 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15695 return force_reg (V16QImode, mask);
15696 }
15697
15698 /* Return true if X is a valid second operand for the SVE instruction
15699 that implements integer comparison OP_CODE. */
15700
15701 static bool
aarch64_sve_cmp_operand_p(rtx_code op_code,rtx x)15702 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15703 {
15704 if (register_operand (x, VOIDmode))
15705 return true;
15706
15707 switch (op_code)
15708 {
15709 case LTU:
15710 case LEU:
15711 case GEU:
15712 case GTU:
15713 return aarch64_sve_cmp_immediate_p (x, false);
15714 case LT:
15715 case LE:
15716 case GE:
15717 case GT:
15718 case NE:
15719 case EQ:
15720 return aarch64_sve_cmp_immediate_p (x, true);
15721 default:
15722 gcc_unreachable ();
15723 }
15724 }
15725
15726 /* Return the UNSPEC_COND_* code for comparison CODE. */
15727
15728 static unsigned int
aarch64_unspec_cond_code(rtx_code code)15729 aarch64_unspec_cond_code (rtx_code code)
15730 {
15731 switch (code)
15732 {
15733 case NE:
15734 return UNSPEC_COND_NE;
15735 case EQ:
15736 return UNSPEC_COND_EQ;
15737 case LT:
15738 return UNSPEC_COND_LT;
15739 case GT:
15740 return UNSPEC_COND_GT;
15741 case LE:
15742 return UNSPEC_COND_LE;
15743 case GE:
15744 return UNSPEC_COND_GE;
15745 case LTU:
15746 return UNSPEC_COND_LO;
15747 case GTU:
15748 return UNSPEC_COND_HI;
15749 case LEU:
15750 return UNSPEC_COND_LS;
15751 case GEU:
15752 return UNSPEC_COND_HS;
15753 case UNORDERED:
15754 return UNSPEC_COND_UO;
15755 default:
15756 gcc_unreachable ();
15757 }
15758 }
15759
15760 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15761 where <X> is the operation associated with comparison CODE. */
15762
15763 static rtx
aarch64_gen_unspec_cond(rtx_code code,machine_mode pred_mode,rtx pred,rtx op0,rtx op1)15764 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15765 rtx pred, rtx op0, rtx op1)
15766 {
15767 rtvec vec = gen_rtvec (3, pred, op0, op1);
15768 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15769 }
15770
15771 /* Expand an SVE integer comparison:
15772
15773 TARGET = CODE (OP0, OP1). */
15774
15775 void
aarch64_expand_sve_vec_cmp_int(rtx target,rtx_code code,rtx op0,rtx op1)15776 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15777 {
15778 machine_mode pred_mode = GET_MODE (target);
15779 machine_mode data_mode = GET_MODE (op0);
15780
15781 if (!aarch64_sve_cmp_operand_p (code, op1))
15782 op1 = force_reg (data_mode, op1);
15783
15784 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15785 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15786 emit_insn (gen_set_clobber_cc (target, unspec));
15787 }
15788
15789 /* Emit an instruction:
15790
15791 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15792
15793 where <X> is the operation associated with comparison CODE. */
15794
15795 static void
aarch64_emit_unspec_cond(rtx target,rtx_code code,machine_mode pred_mode,rtx pred,rtx op0,rtx op1)15796 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15797 rtx pred, rtx op0, rtx op1)
15798 {
15799 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15800 emit_set_insn (target, unspec);
15801 }
15802
15803 /* Emit:
15804
15805 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15806 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15807 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15808
15809 where <Xi> is the operation associated with comparison CODEi. */
15810
15811 static void
aarch64_emit_unspec_cond_or(rtx target,rtx_code code1,rtx_code code2,machine_mode pred_mode,rtx ptrue,rtx op0,rtx op1)15812 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15813 machine_mode pred_mode, rtx ptrue,
15814 rtx op0, rtx op1)
15815 {
15816 rtx tmp1 = gen_reg_rtx (pred_mode);
15817 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15818 rtx tmp2 = gen_reg_rtx (pred_mode);
15819 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15820 emit_set_insn (target, gen_rtx_AND (pred_mode,
15821 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15822 ptrue));
15823 }
15824
15825 /* If CAN_INVERT_P, emit an instruction:
15826
15827 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15828
15829 where <X> is the operation associated with comparison CODE. Otherwise
15830 emit:
15831
15832 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15833 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15834
15835 where the second instructions sets TARGET to the inverse of TMP. */
15836
15837 static void
aarch64_emit_inverted_unspec_cond(rtx target,rtx_code code,machine_mode pred_mode,rtx ptrue,rtx pred,rtx op0,rtx op1,bool can_invert_p)15838 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15839 machine_mode pred_mode, rtx ptrue, rtx pred,
15840 rtx op0, rtx op1, bool can_invert_p)
15841 {
15842 if (can_invert_p)
15843 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15844 else
15845 {
15846 rtx tmp = gen_reg_rtx (pred_mode);
15847 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15848 emit_set_insn (target, gen_rtx_AND (pred_mode,
15849 gen_rtx_NOT (pred_mode, tmp),
15850 ptrue));
15851 }
15852 }
15853
/* Expand an SVE floating-point comparison:

     TARGET = CODE (OP0, OP1)

   If CAN_INVERT_P is true, the caller can also handle inverted results;
   return true if the result is in fact inverted.  */

bool
aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
				  rtx op0, rtx op1, bool can_invert_p)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);

  /* All comparisons below are governed by an all-true predicate.  */
  rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
  switch (code)
    {
    case UNORDERED:
      /* UNORDERED has no immediate form.  */
      op1 = force_reg (data_mode, op1);
      aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
      return false;

    case LT:
    case LE:
    case GT:
    case GE:
    case EQ:
    case NE:
      /* There is native support for the comparison.  */
      aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
      return false;

    case ORDERED:
      /* There is native support for the inverse comparison.  */
      op1 = force_reg (data_mode, op1);
      aarch64_emit_inverted_unspec_cond (target, UNORDERED,
					 pred_mode, ptrue, ptrue, op0, op1,
					 can_invert_p);
      return can_invert_p;

    case LTGT:
      /* This is a trapping operation (LT or GT).  */
      aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
      return false;

    case UNEQ:
      if (!flag_trapping_math)
	{
	  /* This would trap for signaling NaNs.  */
	  op1 = force_reg (data_mode, op1);
	  aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
				       pred_mode, ptrue, op0, op1);
	  return false;
	}
      /* fall through */

    case UNLT:
    case UNLE:
    case UNGT:
    case UNGE:
      {
	/* Strategy: compute the inverse (ordered) comparison, restricted
	   to the ordered elements when -ftrapping-math, and return the
	   inverted result (or invert it here if the caller can't).  */
	rtx ordered = ptrue;
	if (flag_trapping_math)
	  {
	    /* Only compare the elements that are known to be ordered.  */
	    ordered = gen_reg_rtx (pred_mode);
	    op1 = force_reg (data_mode, op1);
	    aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
					       ptrue, ptrue, op0, op1, false);
	  }
	/* Pick the native comparison whose inverse is CODE; for UNEQ
	   (reached via fallthrough) that inverse is NE.  */
	if (code == UNEQ)
	  code = NE;
	else
	  code = reverse_condition_maybe_unordered (code);
	aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
					   ordered, op0, op1, can_invert_p);
	return can_invert_p;
      }

    default:
      gcc_unreachable ();
    }
}
15938
15939 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15940 of the data being selected and CMP_MODE is the mode of the values being
15941 compared. */
15942
15943 void
aarch64_expand_sve_vcond(machine_mode data_mode,machine_mode cmp_mode,rtx * ops)15944 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15945 rtx *ops)
15946 {
15947 machine_mode pred_mode
15948 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15949 GET_MODE_SIZE (cmp_mode)).require ();
15950 rtx pred = gen_reg_rtx (pred_mode);
15951 if (FLOAT_MODE_P (cmp_mode))
15952 {
15953 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15954 ops[4], ops[5], true))
15955 std::swap (ops[1], ops[2]);
15956 }
15957 else
15958 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15959
15960 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15961 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15962 }
15963
15964 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15965 true. However due to issues with register allocation it is preferable
15966 to avoid tieing integer scalar and FP scalar modes. Executing integer
15967 operations in general registers is better than treating them as scalar
15968 vector operations. This reduces latency and avoids redundant int<->FP
15969 moves. So tie modes if they are either the same class, or vector modes
15970 with other vector modes, vector structs or any scalar mode. */
15971
15972 static bool
aarch64_modes_tieable_p(machine_mode mode1,machine_mode mode2)15973 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15974 {
15975 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15976 return true;
15977
15978 /* We specifically want to allow elements of "structure" modes to
15979 be tieable to the structure. This more general condition allows
15980 other rarer situations too. The reason we don't extend this to
15981 predicate modes is that there are no predicate structure modes
15982 nor any specific instructions for extracting part of a predicate
15983 register. */
15984 if (aarch64_vector_data_mode_p (mode1)
15985 && aarch64_vector_data_mode_p (mode2))
15986 return true;
15987
15988 /* Also allow any scalar modes with vectors. */
15989 if (aarch64_vector_mode_supported_p (mode1)
15990 || aarch64_vector_mode_supported_p (mode2))
15991 return true;
15992
15993 return false;
15994 }
15995
15996 /* Return a new RTX holding the result of moving POINTER forward by
15997 AMOUNT bytes. */
15998
15999 static rtx
aarch64_move_pointer(rtx pointer,poly_int64 amount)16000 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16001 {
16002 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16003
16004 return adjust_automodify_address (pointer, GET_MODE (pointer),
16005 next, amount);
16006 }
16007
16008 /* Return a new RTX holding the result of moving POINTER forward by the
16009 size of the mode it points to. */
16010
16011 static rtx
aarch64_progress_pointer(rtx pointer)16012 aarch64_progress_pointer (rtx pointer)
16013 {
16014 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16015 }
16016
16017 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16018 MODE bytes. */
16019
16020 static void
aarch64_copy_one_block_and_progress_pointers(rtx * src,rtx * dst,machine_mode mode)16021 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16022 machine_mode mode)
16023 {
16024 rtx reg = gen_reg_rtx (mode);
16025
16026 /* "Cast" the pointers to the correct mode. */
16027 *src = adjust_address (*src, mode, 0);
16028 *dst = adjust_address (*dst, mode, 0);
16029 /* Emit the memcpy. */
16030 emit_move_insn (reg, *src);
16031 emit_move_insn (*dst, reg);
16032 /* Move the pointers forward. */
16033 *src = aarch64_progress_pointer (*src);
16034 *dst = aarch64_progress_pointer (*dst);
16035 }
16036
/* Expand movmem, as if from a __builtin_memcpy.  Return true if
   we succeed, otherwise return false.  */

bool
aarch64_expand_movmem (rtx *operands)
{
  unsigned int n;
  rtx dst = operands[0];
  rtx src = operands[1];
  rtx base;
  bool speed_p = !optimize_function_for_size_p (cfun);

  /* When optimizing for size, give a better estimate of the length of a
     memcpy call, but use the default otherwise.  */
  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;

  /* We can't do anything smart if the amount to copy is not constant.  */
  if (!CONST_INT_P (operands[2]))
    return false;

  n = UINTVAL (operands[2]);

  /* Try to keep the number of instructions low.  For cases below 16 bytes we
     need to make at most two moves.  For cases above 16 bytes it will be one
     move for each 16 byte chunk, then at most two additional moves.  */
  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
    return false;

  /* Pull both addresses into registers so the block below can freely
     re-mode and offset the MEMs.  */
  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
  dst = adjust_automodify_address (dst, VOIDmode, base, 0);

  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
  src = adjust_automodify_address (src, VOIDmode, base, 0);

  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
     1-byte chunk.  */
  if (n < 4)
    {
      if (n >= 2)
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
	  n -= 2;
	}

      if (n == 1)
	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);

      return true;
    }

  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
     4-byte chunk, partially overlapping with the previously copied chunk.  */
  if (n < 8)
    {
      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
      n -= 4;
      if (n > 0)
	{
	  /* MOVE is negative or zero here (n is 1-3), so the second
	     4-byte copy is backed up to end exactly at the tail.  */
	  int move = n - 4;

	  src = aarch64_move_pointer (src, move);
	  dst = aarch64_move_pointer (dst, move);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
	}
      return true;
    }

  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
     them, then (if applicable) an 8-byte chunk.  */
  while (n >= 8)
    {
      if (n / 16)
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
	  n -= 16;
	}
      else
	{
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
	  n -= 8;
	}
    }

  /* Finish the final bytes of the copy.  We can always do this in one
     instruction.  We either copy the exact amount we need, or partially
     overlap with the previous chunk we copied and copy 8-bytes.  */
  if (n == 0)
    return true;
  else if (n == 1)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
  else if (n == 2)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
  else if (n == 4)
    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
  else
    {
      if (n == 3)
	{
	  /* Back up one byte and do an overlapping 4-byte copy.  */
	  src = aarch64_move_pointer (src, -1);
	  dst = aarch64_move_pointer (dst, -1);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
	}
      else
	{
	  /* n is 5, 6 or 7: back up so an 8-byte copy ends at the tail.  */
	  int move = n - 8;

	  src = aarch64_move_pointer (src, move);
	  dst = aarch64_move_pointer (dst, move);
	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
	}
    }

  return true;
}
16151
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  bool size_p = optimize_function_for_size_p (cfun);

  /* The transformation only applies when both halves are equal.  */
  if (!rtx_equal_p (lo, hi))
    return false;

  /* Instruction counts for synthesising the full DImode immediate
     versus just the SImode low half (false = don't emit, just cost).  */
  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this only when we save two instructions
   or more.  When optimizing for size, however, accept any code size
   savings we can.  */
  if (size_p && orig_cost <= lo_cost)
    return false;

  if (!size_p
      && (orig_cost <= lo_cost + 1))
    return false;

  /* Both SImode stores must be valid STP halves.  */
  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
16208
16209 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16210
16211 static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset(void)16212 aarch64_asan_shadow_offset (void)
16213 {
16214 return (HOST_WIDE_INT_1 << 36);
16215 }
16216
/* Implement TARGET_GEN_CCMP_FIRST (see the #define further down).
   Expand the first comparison CODE (TREEOP0, TREEOP1) of a ccmp chain.
   Operand-preparation insns are appended to *PREP_SEQ and the compare
   itself to *GEN_SEQ.  Return an rtx comparing the CC register against
   zero, or NULL_RTX if the comparison cannot be handled.  */

static rtx
aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
			int code, tree treeop0, tree treeop1)
{
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  rtx op0, op1;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[4];

  /* Expand the operands inside a sequence so they can be replayed by
     the caller via *PREP_SEQ.  */
  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  /* Pick the compare pattern for the operand mode; unsupported modes
     abandon the sequence and return NULL_RTX.  */
  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_cmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_cmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  /* Legitimize the operands for pattern operands 0 and 1.  */
  op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  create_fixed_operand (&ops[0], op0);
  create_fixed_operand (&ops[1], op1);

  /* Emit the compare itself into *GEN_SEQ.  */
  start_sequence ();
  if (!maybe_expand_insn (icode, 2, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
}
16290
/* Implement TARGET_GEN_CCMP_NEXT (see the #define further down).
   Expand a subsequent conditional compare CMP_CODE (TREEOP0, TREEOP1)
   in a ccmp chain, combined with the previous comparison PREV by
   BIT_CODE (AND or, presumably, IOR -- TODO confirm against the
   middle-end caller).  Insns go to *PREP_SEQ/*GEN_SEQ as in
   aarch64_gen_ccmp_first.  Return the resulting comparison rtx, or
   NULL_RTX on failure.  */

static rtx
aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
{
  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  struct expand_operand ops[6];
  int aarch64_cond;

  /* Continue the caller's preparation sequence.  */
  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  /* Pick the conditional-compare pattern for the operand mode.  */
  switch (op_mode)
    {
    case E_QImode:
    case E_HImode:
    case E_SImode:
      cmp_mode = SImode;
      icode = CODE_FOR_ccmpsi;
      break;

    case E_DImode:
      cmp_mode = DImode;
      icode = CODE_FOR_ccmpdi;
      break;

    case E_SFmode:
      cmp_mode = SFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
      break;

    case E_DFmode:
      cmp_mode = DFmode;
      cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
      icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
      break;

    default:
      end_sequence ();
      return NULL_RTX;
    }

  /* The ccmp patterns take the data operands as operands 2 and 3.  */
  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *prep_seq = get_insns ();
  end_sequence ();

  target = gen_rtx_REG (cc_mode, CC_REGNUM);
  aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);

  if (bit_code != AND)
    {
      /* For a non-AND combination, reverse the previous comparison and
	 invert the condition under which this ccmp executes.  */
      prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
						GET_MODE (XEXP (prev, 0))),
			     VOIDmode, XEXP (prev, 0), const0_rtx);
      aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
    }

  create_fixed_operand (&ops[0], XEXP (prev, 0));
  create_fixed_operand (&ops[1], target);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], prev);
  create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));

  /* Emit the conditional compare into *GEN_SEQ.  */
  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 6, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = get_insns ();
  end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
16380
16381 #undef TARGET_GEN_CCMP_FIRST
16382 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16383
16384 #undef TARGET_GEN_CCMP_NEXT
16385 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16386
16387 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16388 instruction fusion of some sort. */
16389
16390 static bool
aarch64_macro_fusion_p(void)16391 aarch64_macro_fusion_p (void)
16392 {
16393 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16394 }
16395
16396
/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
   should be kept together during scheduling.  Each clause below checks one
   fusion pair against aarch64_tune_params.fusible_ops.  */

static bool
aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
{
  rtx set_dest;
  rtx prev_set = single_set (prev);
  rtx curr_set = single_set (curr);
  /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
  bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);

  if (!aarch64_macro_fusion_p ())
    return false;

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
    {
      /* We are trying to match:
         prev (mov)  == (set (reg r0) (const_int imm16))
         curr (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 16))
                             (const_int imm16_1))  */

      set_dest = SET_DEST (curr_set);

      if (GET_CODE (set_dest) == ZERO_EXTRACT
	  && CONST_INT_P (SET_SRC (curr_set))
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (XEXP (set_dest, 2))
	  && INTVAL (XEXP (set_dest, 2)) == 16
	  && REG_P (XEXP (set_dest, 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
	{
	  return true;
	}
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
    {

      /* We're trying to match:
         prev (adrp) == (set (reg r1)
	                     (high (symbol_ref ("SYM"))))
         curr (add) == (set (reg r0)
		            (lo_sum (reg r1)
		                    (symbol_ref ("SYM"))))
         Note that r0 need not necessarily be the same as r1, especially
         during pre-regalloc scheduling.  */

      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
	      && REG_P (XEXP (SET_SRC (curr_set), 0))
	      && REGNO (XEXP (SET_SRC (curr_set), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
			      XEXP (SET_SRC (curr_set), 1)))
	    return true;
	}
    }

  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
    {

      /* We're trying to match:
         prev (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 32))
                             (const_int imm16_1))
         curr (movk) == (set (zero_extract (reg r0)
                                           (const_int 16)
                                           (const_int 48))
                             (const_int imm16_2))  */

      if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
	  && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
	  && REG_P (XEXP (SET_DEST (prev_set), 0))
	  && REG_P (XEXP (SET_DEST (curr_set), 0))
	  && REGNO (XEXP (SET_DEST (prev_set), 0))
	     == REGNO (XEXP (SET_DEST (curr_set), 0))
	  && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
	  && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
	  && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
	  && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
	  && CONST_INT_P (SET_SRC (prev_set))
	  && CONST_INT_P (SET_SRC (curr_set)))
	return true;

    }
  if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
    {
      /* We're trying to match:
          prev (adrp) == (set (reg r0)
                              (high (symbol_ref ("SYM"))))
          curr (ldr) == (set (reg r1)
                             (mem (lo_sum (reg r0)
                                             (symbol_ref ("SYM")))))
                 or
          curr (ldr) == (set (reg r1)
                             (zero_extend (mem
                                           (lo_sum (reg r0)
                                                   (symbol_ref ("SYM"))))))  */
      if (satisfies_constraint_Ush (SET_SRC (prev_set))
	  && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
	{
	  rtx curr_src = SET_SRC (curr_set);

	  /* Look through a zero-extending load.  */
	  if (GET_CODE (curr_src) == ZERO_EXTEND)
	    curr_src = XEXP (curr_src, 0);

	  if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
	      && REG_P (XEXP (XEXP (curr_src, 0), 0))
	      && REGNO (XEXP (XEXP (curr_src, 0), 0))
		 == REGNO (SET_DEST (prev_set))
	      && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
			      XEXP (SET_SRC (prev_set), 0)))
	    return true;
	}
    }

  if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
      && aarch_crypto_can_dual_issue (prev, curr))
    return true;

  if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
      && any_condjump_p (curr))
    {
      unsigned int condreg1, condreg2;
      rtx cc_reg_1;
      aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
      cc_reg_1 = gen_rtx_REG (CCmode, condreg1);

      /* Fuse a flag-setting instruction with the conditional branch
	 that consumes the flags.  */
      if (reg_referenced_p (cc_reg_1, PATTERN (curr))
	  && prev
	  && modified_in_p (cc_reg_1, prev))
	{
	  enum attr_type prev_type = get_attr_type (prev);

	  /* FIXME: this misses some which is considered simple arthematic
	     instructions for ThunderX.  Simple shifts are missed here.  */
	  if (prev_type == TYPE_ALUS_SREG
	      || prev_type == TYPE_ALUS_IMM
	      || prev_type == TYPE_LOGICS_REG
	      || prev_type == TYPE_LOGICS_IMM)
	    return true;
	}
    }

  if (prev_set
      && curr_set
      && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
      && any_condjump_p (curr))
    {
      /* We're trying to match:
	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
	  curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
							 (const_int 0))
						 (label_ref ("SYM"))
						 (pc))  */
      if (SET_DEST (curr_set) == (pc_rtx)
	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
	  && REG_P (SET_DEST (prev_set))
	  && REGNO (SET_DEST (prev_set))
	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
	{
	  /* Fuse ALU operations followed by conditional branch instruction.  */
	  switch (get_attr_type (prev))
	    {
	    case TYPE_ALU_IMM:
	    case TYPE_ALU_SREG:
	    case TYPE_ADC_REG:
	    case TYPE_ADC_IMM:
	    case TYPE_ADCS_REG:
	    case TYPE_ADCS_IMM:
	    case TYPE_LOGIC_REG:
	    case TYPE_LOGIC_IMM:
	    case TYPE_CSEL:
	    case TYPE_ADR:
	    case TYPE_MOV_IMM:
	    case TYPE_SHIFT_REG:
	    case TYPE_SHIFT_IMM:
	    case TYPE_BFM:
	    case TYPE_RBIT:
	    case TYPE_REV:
	    case TYPE_EXTEND:
	      return true;

	    default:;
	    }
	}
    }

  return false;
}
16595
16596 /* Return true iff the instruction fusion described by OP is enabled. */
16597
16598 bool
aarch64_fusion_enabled_p(enum aarch64_fusion_pairs op)16599 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16600 {
16601 return (aarch64_tune_params.fusible_ops & op) != 0;
16602 }
16603
16604 /* If MEM is in the form of [base+offset], extract the two parts
16605 of address and set to BASE and OFFSET, otherwise return false
16606 after clearing BASE and OFFSET. */
16607
16608 bool
extract_base_offset_in_addr(rtx mem,rtx * base,rtx * offset)16609 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16610 {
16611 rtx addr;
16612
16613 gcc_assert (MEM_P (mem));
16614
16615 addr = XEXP (mem, 0);
16616
16617 if (REG_P (addr))
16618 {
16619 *base = addr;
16620 *offset = const0_rtx;
16621 return true;
16622 }
16623
16624 if (GET_CODE (addr) == PLUS
16625 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16626 {
16627 *base = XEXP (addr, 0);
16628 *offset = XEXP (addr, 1);
16629 return true;
16630 }
16631
16632 *base = NULL_RTX;
16633 *offset = NULL_RTX;
16634
16635 return false;
16636 }
16637
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,	/* Not a fusion candidate.  */
  SCHED_FUSION_LD_SIGN_EXTEND,	/* Sign-extending SImode load.  */
  SCHED_FUSION_LD_ZERO_EXTEND,	/* Zero-extending SImode load.  */
  SCHED_FUSION_LD,		/* Plain load.  */
  SCHED_FUSION_ST,		/* Store.  */
  SCHED_FUSION_NUM		/* Number of fusion types.  */
};
16648
/* If INSN is a load or store of address in the form of [base+offset],
   extract the two parts and set to BASE and OFFSET.  Return scheduling
   fusion type this INSN is.  */

static enum sched_fusion_type
fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
{
  rtx x, dest, src;
  enum sched_fusion_type fusion = SCHED_FUSION_LD;

  gcc_assert (INSN_P (insn));
  x = PATTERN (insn);
  /* Only single-SET instructions can be load/store fusion candidates.  */
  if (GET_CODE (x) != SET)
    return SCHED_FUSION_NONE;

  src = SET_SRC (x);
  dest = SET_DEST (x);

  machine_mode dest_mode = GET_MODE (dest);

  if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
    return SCHED_FUSION_NONE;

  if (GET_CODE (src) == SIGN_EXTEND)
    {
      fusion = SCHED_FUSION_LD_SIGN_EXTEND;
      src = XEXP (src, 0);
      /* Only an SImode memory source qualifies as an extending load.  */
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }
  else if (GET_CODE (src) == ZERO_EXTEND)
    {
      fusion = SCHED_FUSION_LD_ZERO_EXTEND;
      src = XEXP (src, 0);
      if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
	return SCHED_FUSION_NONE;
    }

  /* Classify as load (mem -> reg) or store (reg or zero -> mem).  */
  if (GET_CODE (src) == MEM && REG_P (dest))
    extract_base_offset_in_addr (src, base, offset);
  else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
    {
      fusion = SCHED_FUSION_ST;
      extract_base_offset_in_addr (dest, base, offset);
    }
  else
    return SCHED_FUSION_NONE;

  /* Addresses not of the form [base+offset] cannot fuse.  */
  if (*base == NULL_RTX || *offset == NULL_RTX)
    fusion = SCHED_FUSION_NONE;

  return fusion;
}
16702
16703 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16704
16705 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
16706 and PRI are only calculated for these instructions. For other instruction,
16707 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16708 type instruction fusion can be added by returning different priorities.
16709
16710 It's important that irrelevant instructions get the largest FUSION_PRI. */
16711
16712 static void
aarch64_sched_fusion_priority(rtx_insn * insn,int max_pri,int * fusion_pri,int * pri)16713 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16714 int *fusion_pri, int *pri)
16715 {
16716 int tmp, off_val;
16717 rtx base, offset;
16718 enum sched_fusion_type fusion;
16719
16720 gcc_assert (INSN_P (insn));
16721
16722 tmp = max_pri - 1;
16723 fusion = fusion_load_store (insn, &base, &offset);
16724 if (fusion == SCHED_FUSION_NONE)
16725 {
16726 *pri = tmp;
16727 *fusion_pri = tmp;
16728 return;
16729 }
16730
16731 /* Set FUSION_PRI according to fusion type and base register. */
16732 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16733
16734 /* Calculate PRI. */
16735 tmp /= 2;
16736
16737 /* INSN with smaller offset goes first. */
16738 off_val = (int)(INTVAL (offset));
16739 if (off_val >= 0)
16740 tmp -= (off_val & 0xfffff);
16741 else
16742 tmp += ((- off_val) & 0xfffff);
16743
16744 *pri = tmp;
16745 return;
16746 }
16747
16748 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16749 Adjust priority of sha1h instructions so they are scheduled before
16750 other SHA1 instructions. */
16751
16752 static int
aarch64_sched_adjust_priority(rtx_insn * insn,int priority)16753 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16754 {
16755 rtx x = PATTERN (insn);
16756
16757 if (GET_CODE (x) == SET)
16758 {
16759 x = SET_SRC (x);
16760
16761 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16762 return priority + 10;
16763 }
16764
16765 return priority;
16766 }
16767
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp.  LOAD is true if they are load instructions.
   MODE is the mode of memory operands.  */

bool
aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
				machine_mode mode)
{
  HOST_WIDE_INT offval_1, offval_2, msize;
  enum reg_class rclass_1, rclass_2;
  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;

  if (load)
    {
      /* Loads: operands are (reg, mem, reg, mem).  */
      mem_1 = operands[1];
      mem_2 = operands[3];
      reg_1 = operands[0];
      reg_2 = operands[2];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2));
      /* An ldp cannot load both values into the same register.  */
      if (REGNO (reg_1) == REGNO (reg_2))
	return false;
    }
  else
    {
      /* Stores: operands are (mem, reg, mem, reg).  */
      mem_1 = operands[0];
      mem_2 = operands[2];
      reg_1 = operands[1];
      reg_2 = operands[3];
    }

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
    return false;

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;

  /* Check if the bases are same.  */
  if (!rtx_equal_p (base_1, base_2))
    return false;

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  /* We should only be trying this for fixed-sized modes.  There is no
     SVE LDP/STP instruction.  */
  msize = GET_MODE_SIZE (mode).to_constant ();
  /* Check if the offsets are consecutive (in either address order).  */
  if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
	return false;
    }

  /* Classify each register as FP or general; ldp/stp cannot mix the
     two register files.  */
  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2)
    return false;

  return true;
}
16859
/* Given OPERANDS of consecutive load/store, check if we can merge
   them into ldp/stp by adjusting the offset.  LOAD is true if they
   are load instructions.  MODE is the mode of memory operands.

   Given below consecutive stores:

     str  w1, [xb, 0x100]
     str  w1, [xb, 0x104]
     str  w1, [xb, 0x108]
     str  w1, [xb, 0x10c]

   Though the offsets are out of the range supported by stp, we can
   still pair them after adjusting the offset, like:

     add  scratch, xb, 0x100
     stp  w1, w1, [scratch]
     stp  w1, w1, [scratch, 0x8]

   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */

bool
aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
				       scalar_mode mode)
{
  enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
  HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
  rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
  rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;

  if (load)
    {
      /* Loads alternate (reg, mem) pairs.  */
      reg_1 = operands[0];
      mem_1 = operands[1];
      reg_2 = operands[2];
      mem_2 = operands[3];
      reg_3 = operands[4];
      mem_3 = operands[5];
      reg_4 = operands[6];
      mem_4 = operands[7];
      gcc_assert (REG_P (reg_1) && REG_P (reg_2)
		  && REG_P (reg_3) && REG_P (reg_4));
      /* Each resulting ldp needs two distinct destination registers.  */
      if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
	return false;
    }
  else
    {
      /* Stores alternate (mem, reg) pairs.  */
      mem_1 = operands[0];
      reg_1 = operands[1];
      mem_2 = operands[2];
      reg_2 = operands[3];
      mem_3 = operands[4];
      reg_3 = operands[5];
      mem_4 = operands[6];
      reg_4 = operands[7];
    }
  /* Skip if memory operand is by itself valid for ldp/stp.  */
  if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
    return false;

  /* The mems cannot be volatile.  */
  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
      || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
    return false;

  /* Check if the addresses are in the form of [base+offset].  */
  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
  if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
    return false;
  extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
  if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
    return false;

  /* Check if the bases are same.  */
  if (!rtx_equal_p (base_1, base_2)
      || !rtx_equal_p (base_2, base_3)
      || !rtx_equal_p (base_3, base_4))
    return false;

  offval_1 = INTVAL (offset_1);
  offval_2 = INTVAL (offset_2);
  offval_3 = INTVAL (offset_3);
  offval_4 = INTVAL (offset_4);
  msize = GET_MODE_SIZE (mode);
  /* Check if the offsets are consecutive; accept the accesses in
     either address order.  */
  if ((offval_1 != (offval_2 + msize)
       || offval_1 != (offval_3 + msize * 2)
       || offval_1 != (offval_4 + msize * 3))
      && (offval_4 != (offval_3 + msize)
	  || offval_4 != (offval_2 + msize * 2)
	  || offval_4 != (offval_1 + msize * 3)))
    return false;

  /* Check if the addresses are clobbered by load.  */
  if (load)
    {
      if (reg_mentioned_p (reg_1, mem_1)
	  || reg_mentioned_p (reg_2, mem_2)
	  || reg_mentioned_p (reg_3, mem_3))
	return false;

      /* In increasing order, the last load can clobber the address.  */
      if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
	return false;
    }

  /* If we have SImode and slow unaligned ldp,
     check the alignment to be at least 8 byte.  */
  if (mode == SImode
      && (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
      && !optimize_size
      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
    return false;

  /* Classify each register as FP or general; ldp/stp cannot mix the
     two register files.  */
  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
    rclass_1 = FP_REGS;
  else
    rclass_1 = GENERAL_REGS;

  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
    rclass_2 = FP_REGS;
  else
    rclass_2 = GENERAL_REGS;

  if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
    rclass_3 = FP_REGS;
  else
    rclass_3 = GENERAL_REGS;

  if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
    rclass_4 = FP_REGS;
  else
    rclass_4 = GENERAL_REGS;

  /* Check if the registers are of same class.  */
  if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
    return false;

  return true;
}
17007
/* Given OPERANDS of consecutive load/store, this function pairs them
   into ldp/stp after adjusting the offset.  It depends on the fact
   that addresses of load/store instructions are in increasing order.
   MODE is the mode of memory operands.  CODE is the rtl operator
   which should be applied to all memory operands, it's SIGN_EXTEND,
   ZERO_EXTEND or UNKNOWN.  */

bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
			     scalar_mode mode, RTX_CODE code)
{
  rtx base, offset, t1, t2;
  rtx mem_1, mem_2, mem_3, mem_4;
  HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;

  if (load)
    {
      mem_1 = operands[1];
      mem_2 = operands[3];
      mem_3 = operands[5];
      mem_4 = operands[7];
    }
  else
    {
      mem_1 = operands[0];
      mem_2 = operands[2];
      mem_3 = operands[4];
      mem_4 = operands[6];
      /* Extending loads have no store counterpart.  */
      gcc_assert (code == UNKNOWN);
    }

  /* The caller guarantees [base+offset] form; see
     aarch64_operands_adjust_ok_for_ldpstp.  */
  extract_base_offset_in_addr (mem_1, &base, &offset);
  gcc_assert (base != NULL_RTX && offset != NULL_RTX);

  /* Adjust offset thus it can fit in ldp/stp instruction.  */
  msize = GET_MODE_SIZE (mode);
  /* Keep the residual offset below 0x40 units of the access size.  */
  stp_off_limit = msize * 0x40;
  off_val = INTVAL (offset);
  abs_off = (off_val < 0) ? -off_val : off_val;
  new_off = abs_off % stp_off_limit;
  adj_off = abs_off - new_off;

  /* Further adjust to make sure all offsets are OK.  */
  if ((new_off + msize * 2) >= stp_off_limit)
    {
      adj_off += stp_off_limit;
      new_off -= stp_off_limit;
    }

  /* Make sure the adjustment can be done with ADD/SUB instructions.  */
  if (adj_off >= 0x1000)
    return false;

  /* Restore the sign of the original offset.  */
  if (off_val < 0)
    {
      adj_off = -adj_off;
      new_off = -new_off;
    }

  /* Create new memory references rebased on the scratch register
     (operands[8]).  */
  mem_1 = change_address (mem_1, VOIDmode,
			  plus_constant (DImode, operands[8], new_off));

  /* Check if the adjusted address is OK for ldp/stp.  */
  if (!aarch64_mem_pair_operand (mem_1, mode))
    return false;

  msize = GET_MODE_SIZE (mode);
  mem_2 = change_address (mem_2, VOIDmode,
			  plus_constant (DImode,
					 operands[8],
					 new_off + msize));
  mem_3 = change_address (mem_3, VOIDmode,
			  plus_constant (DImode,
					 operands[8],
					 new_off + msize * 2));
  mem_4 = change_address (mem_4, VOIDmode,
			  plus_constant (DImode,
					 operands[8],
					 new_off + msize * 3));

  /* Re-wrap extending loads in their extension operator.  */
  if (code == ZERO_EXTEND)
    {
      mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
    }
  else if (code == SIGN_EXTEND)
    {
      mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
      mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
      mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
      mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
    }

  /* Write the adjusted memory references back into OPERANDS.  */
  if (load)
    {
      operands[1] = mem_1;
      operands[3] = mem_2;
      operands[5] = mem_3;
      operands[7] = mem_4;
    }
  else
    {
      operands[0] = mem_1;
      operands[2] = mem_2;
      operands[4] = mem_3;
      operands[6] = mem_4;
    }

  /* Emit adjusting instruction.  */
  emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
  /* Emit ldp/stp instructions.  */
  t1 = gen_rtx_SET (operands[0], operands[1]);
  t2 = gen_rtx_SET (operands[2], operands[3]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  t1 = gen_rtx_SET (operands[4], operands[5]);
  t2 = gen_rtx_SET (operands[6], operands[7]);
  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
  return true;
}
17130
/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  For now, assume
   that executing an empty masked operation (including a masked store)
   is cheaper than branching around it, so never report the empty mask
   as expensive.  */

static bool
aarch64_empty_mask_is_expensive (unsigned)
{
  return false;
}
17140
17141 /* Return 1 if pseudo register should be created and used to hold
17142 GOT address for PIC code. */
17143
17144 bool
aarch64_use_pseudo_pic_reg(void)17145 aarch64_use_pseudo_pic_reg (void)
17146 {
17147 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17148 }
17149
17150 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17151
17152 static int
aarch64_unspec_may_trap_p(const_rtx x,unsigned flags)17153 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17154 {
17155 switch (XINT (x, 1))
17156 {
17157 case UNSPEC_GOTSMALLPIC:
17158 case UNSPEC_GOTSMALLPIC28K:
17159 case UNSPEC_GOTTINYPIC:
17160 return 0;
17161 default:
17162 break;
17163 }
17164
17165 return default_unspec_may_trap_p (x, flags);
17166 }
17167
17168
17169 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17170 return the log2 of that value. Otherwise return -1. */
17171
17172 int
aarch64_fpconst_pow_of_2(rtx x)17173 aarch64_fpconst_pow_of_2 (rtx x)
17174 {
17175 const REAL_VALUE_TYPE *r;
17176
17177 if (!CONST_DOUBLE_P (x))
17178 return -1;
17179
17180 r = CONST_DOUBLE_REAL_VALUE (x);
17181
17182 if (REAL_VALUE_NEGATIVE (*r)
17183 || REAL_VALUE_ISNAN (*r)
17184 || REAL_VALUE_ISINF (*r)
17185 || !real_isinteger (r, DFmode))
17186 return -1;
17187
17188 return exact_log2 (real_to_integer (r));
17189 }
17190
17191 /* If X is a vector of equal CONST_DOUBLE values and that value is
17192 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17193
17194 int
aarch64_vec_fpconst_pow_of_2(rtx x)17195 aarch64_vec_fpconst_pow_of_2 (rtx x)
17196 {
17197 int nelts;
17198 if (GET_CODE (x) != CONST_VECTOR
17199 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17200 return -1;
17201
17202 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17203 return -1;
17204
17205 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17206 if (firstval <= 0)
17207 return -1;
17208
17209 for (int i = 1; i < nelts; i++)
17210 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17211 return -1;
17212
17213 return firstval;
17214 }
17215
17216 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17217 to float.
17218
17219 __fp16 always promotes through this hook.
17220 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17221 through the generic excess precision logic rather than here. */
17222
17223 static tree
aarch64_promoted_type(const_tree t)17224 aarch64_promoted_type (const_tree t)
17225 {
17226 if (SCALAR_FLOAT_TYPE_P (t)
17227 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17228 return float_type_node;
17229
17230 return NULL_TREE;
17231 }
17232
17233 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17234
17235 static bool
aarch64_optab_supported_p(int op,machine_mode mode1,machine_mode,optimization_type opt_type)17236 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17237 optimization_type opt_type)
17238 {
17239 switch (op)
17240 {
17241 case rsqrt_optab:
17242 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17243
17244 default:
17245 return true;
17246 }
17247 }
17248
17249 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17250
17251 static unsigned int
aarch64_dwarf_poly_indeterminate_value(unsigned int i,unsigned int * factor,int * offset)17252 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17253 int *offset)
17254 {
17255 /* Polynomial invariant 1 == (VG / 2) - 1. */
17256 gcc_assert (i == 1);
17257 *factor = 2;
17258 *offset = 1;
17259 return AARCH64_DWARF_VG;
17260 }
17261
17262 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17263 if MODE is HFmode, and punt to the generic implementation otherwise. */
17264
17265 static bool
aarch64_libgcc_floating_mode_supported_p(scalar_float_mode mode)17266 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17267 {
17268 return (mode == HFmode
17269 ? true
17270 : default_libgcc_floating_mode_supported_p (mode));
17271 }
17272
17273 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17274 if MODE is HFmode, and punt to the generic implementation otherwise. */
17275
17276 static bool
aarch64_scalar_mode_supported_p(scalar_mode mode)17277 aarch64_scalar_mode_supported_p (scalar_mode mode)
17278 {
17279 return (mode == HFmode
17280 ? true
17281 : default_scalar_mode_supported_p (mode));
17282 }
17283
17284 /* Set the value of FLT_EVAL_METHOD.
17285 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17286
17287 0: evaluate all operations and constants, whose semantic type has at
17288 most the range and precision of type float, to the range and
17289 precision of float; evaluate all other operations and constants to
17290 the range and precision of the semantic type;
17291
17292 N, where _FloatN is a supported interchange floating type
17293 evaluate all operations and constants, whose semantic type has at
17294 most the range and precision of _FloatN type, to the range and
17295 precision of the _FloatN type; evaluate all other operations and
17296 constants to the range and precision of the semantic type;
17297
17298 If we have the ARMv8.2-A extensions then we support _Float16 in native
17299 precision, so we should set this to 16. Otherwise, we support the type,
17300 but want to evaluate expressions in float precision, so set this to
17301 0. */
17302
17303 static enum flt_eval_method
aarch64_excess_precision(enum excess_precision_type type)17304 aarch64_excess_precision (enum excess_precision_type type)
17305 {
17306 switch (type)
17307 {
17308 case EXCESS_PRECISION_TYPE_FAST:
17309 case EXCESS_PRECISION_TYPE_STANDARD:
17310 /* We can calculate either in 16-bit range and precision or
17311 32-bit range and precision. Make that decision based on whether
17312 we have native support for the ARMv8.2-A 16-bit floating-point
17313 instructions or not. */
17314 return (TARGET_FP_F16INST
17315 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17316 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17317 case EXCESS_PRECISION_TYPE_IMPLICIT:
17318 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17319 default:
17320 gcc_unreachable ();
17321 }
17322 return FLT_EVAL_METHOD_UNPREDICTABLE;
17323 }
17324
/* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
   scheduled for speculative execution.  Reject the long-running division
   and square-root instructions.  */

static bool
aarch64_sched_can_speculate_insn (rtx_insn *insn)
{
  switch (get_attr_type (insn))
    {
    /* Scalar integer divides.  */
    case TYPE_SDIV:
    case TYPE_UDIV:
    /* Scalar FP divides and square roots.  */
    case TYPE_FDIVS:
    case TYPE_FDIVD:
    case TYPE_FSQRTS:
    case TYPE_FSQRTD:
    /* Neon FP square roots, including the 128-bit ("_Q") variants.  */
    case TYPE_NEON_FP_SQRT_S:
    case TYPE_NEON_FP_SQRT_D:
    case TYPE_NEON_FP_SQRT_S_Q:
    case TYPE_NEON_FP_SQRT_D_Q:
    /* Neon FP divides.  */
    case TYPE_NEON_FP_DIV_S:
    case TYPE_NEON_FP_DIV_D:
    case TYPE_NEON_FP_DIV_S_Q:
    case TYPE_NEON_FP_DIV_D_Q:
      return false;
    default:
      return true;
    }
}
17353
17354 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17355
17356 static int
aarch64_compute_pressure_classes(reg_class * classes)17357 aarch64_compute_pressure_classes (reg_class *classes)
17358 {
17359 int i = 0;
17360 classes[i++] = GENERAL_REGS;
17361 classes[i++] = FP_REGS;
17362 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17363 registers need to go in PR_LO_REGS at some point during their
17364 lifetime. Splitting it into two halves has the effect of making
17365 all predicates count against PR_LO_REGS, so that we try whenever
17366 possible to restrict the number of live predicates to 8. This
17367 greatly reduces the amount of spilling in certain loops. */
17368 classes[i++] = PR_LO_REGS;
17369 classes[i++] = PR_HI_REGS;
17370 return i;
17371 }
17372
17373 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17374
17375 static bool
aarch64_can_change_mode_class(machine_mode from,machine_mode to,reg_class_t)17376 aarch64_can_change_mode_class (machine_mode from,
17377 machine_mode to, reg_class_t)
17378 {
17379 if (BYTES_BIG_ENDIAN)
17380 {
17381 bool from_sve_p = aarch64_sve_data_mode_p (from);
17382 bool to_sve_p = aarch64_sve_data_mode_p (to);
17383
17384 /* Don't allow changes between SVE data modes and non-SVE modes.
17385 See the comment at the head of aarch64-sve.md for details. */
17386 if (from_sve_p != to_sve_p)
17387 return false;
17388
17389 /* Don't allow changes in element size: lane 0 of the new vector
17390 would not then be lane 0 of the old vector. See the comment
17391 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17392 description.
17393
17394 In the worst case, this forces a register to be spilled in
17395 one mode and reloaded in the other, which handles the
17396 endianness correctly. */
17397 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17398 return false;
17399 }
17400 return true;
17401 }
17402
17403 /* Implement TARGET_EARLY_REMAT_MODES. */
17404
17405 static void
aarch64_select_early_remat_modes(sbitmap modes)17406 aarch64_select_early_remat_modes (sbitmap modes)
17407 {
17408 /* SVE values are not normally live across a call, so it should be
17409 worth doing early rematerialization even in VL-specific mode. */
17410 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17411 {
17412 machine_mode mode = (machine_mode) i;
17413 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17414 if (vec_flags & VEC_ANY_SVE)
17415 bitmap_set_bit (modes, i);
17416 }
17417 }
17418
17419 /* Target-specific selftests. */
17420
17421 #if CHECKING_P
17422
namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  /* The function name must have been read from the dump.  */
  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  /* Insn 1 in the dump is a NOTE.  */
  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  /* Insn 15 is an INSN whose pattern is a USE.  */
  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx: hard register 0 in SImode.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest
17462
17463 #endif /* #if CHECKING_P */
17464
17465 #undef TARGET_ADDRESS_COST
17466 #define TARGET_ADDRESS_COST aarch64_address_cost
17467
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
17472 #undef TARGET_ALIGN_ANON_BITFIELD
17473 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17474
17475 #undef TARGET_ASM_ALIGNED_DI_OP
17476 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17477
17478 #undef TARGET_ASM_ALIGNED_HI_OP
17479 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17480
17481 #undef TARGET_ASM_ALIGNED_SI_OP
17482 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17483
17484 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17485 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17486 hook_bool_const_tree_hwi_hwi_const_tree_true
17487
17488 #undef TARGET_ASM_FILE_START
17489 #define TARGET_ASM_FILE_START aarch64_start_file
17490
17491 #undef TARGET_ASM_OUTPUT_MI_THUNK
17492 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17493
17494 #undef TARGET_ASM_SELECT_RTX_SECTION
17495 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17496
17497 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17498 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17499
17500 #undef TARGET_BUILD_BUILTIN_VA_LIST
17501 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17502
17503 #undef TARGET_CALLEE_COPIES
17504 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17505
17506 #undef TARGET_CAN_ELIMINATE
17507 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17508
17509 #undef TARGET_CAN_INLINE_P
17510 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17511
17512 #undef TARGET_CANNOT_FORCE_CONST_MEM
17513 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17514
17515 #undef TARGET_CASE_VALUES_THRESHOLD
17516 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17517
17518 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17519 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17520
17521 /* Only the least significant bit is used for initialization guard
17522 variables. */
17523 #undef TARGET_CXX_GUARD_MASK_BIT
17524 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17525
17526 #undef TARGET_C_MODE_FOR_SUFFIX
17527 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17528
17529 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17530 #undef TARGET_DEFAULT_TARGET_FLAGS
17531 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17532 #endif
17533
17534 #undef TARGET_CLASS_MAX_NREGS
17535 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17536
/* Built-in function support: declaration lookup, expansion and folding
   all dispatch to the aarch64-specific implementations.  */
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

/* Argument-passing hooks.  */
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

/* The raw result mode and raw argument mode come from the same helper.  */
#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins
17589
/* Register allocation, address legitimacy, libgcc interface and cost
   hooks.  */
#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

/* Use the generic hook: variable-sized types must be passed in memory.  */
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17622
/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

/* Command-line option handling and save/restore/print support for the
   "target" attribute and pragma.  */
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17668
/* Varargs setup, aggregate return conventions and RTX costs.  */
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

/* Instruction-scheduler tuning hooks.  */
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

/* Separate shrink-wrapping: hooks for saving/restoring prologue and
   epilogue components (e.g. individual register saves) independently.  */
#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components
17724
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

/* Vector mode queries and vectorizer cost/configuration hooks.  */
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

/* NOTE: deliberately defined with an empty expansion.  */
#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv
17768
/* Section anchor support.  */

/* Minimum anchor offset: matches the -256 lower bound of the signed
   9-byte-offset addressing forms.  */
#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

/* Vector alignment hooks.  */
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

/* Loop-masking (predication) support for the vectorizer.  */
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
17801
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

/* Condition-code register handling.  */
#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

/* Scheduler priority and macro-fusion hooks.  */
#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

/* NOTE: uses the aarch_ (not aarch64_) helper shared with the arm port.  */
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

/* Assembly-output operand printing.  */
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17863
/* Hard-register/mode queries.  */
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

/* Register the port's selftests, but only in checking builds.  */
#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
17892
/* Instantiate the target hook vector from the TARGET_* macros defined
   above; TARGET_INITIALIZER picks up each override.  */
struct gcc_target targetm = TARGET_INITIALIZER;
17894
17895 #include "gt-aarch64.h"
17896