1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 unsigned long aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
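/* The -mbranch-protection= option string most recently accepted by the
   parser, and the maximum length we allow for such a string.  */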
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
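/* Table mapping fusion pair names to their AARCH64_FUSE_* flags.  Entries
   come from aarch64-fusion-pairs.def, with explicit "none" and "all"
   entries added.  */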
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
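/* Table mapping extra tuning option names to their AARCH64_EXTRA_TUNE_*
   flags, generated from aarch64-tuning-flags.def with "none" and "all"
   entries added.  */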
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_addrcost_table a64fx_addrcost_table =
321 {
322 {
323 1, /* hi */
324 1, /* si */
325 1, /* di */
326 2, /* ti */
327 },
328 0, /* pre_modify */
329 0, /* post_modify */
330 2, /* register_offset */
331 3, /* register_sextend */
332 3, /* register_zextend */
333 0, /* imm_offset */
334 };
335
336 static const struct cpu_regmove_cost generic_regmove_cost =
337 {
338 1, /* GP2GP */
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
341 5, /* GP2FP */
342 5, /* FP2GP */
343 2 /* FP2FP */
344 };
345
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
347 {
348 1, /* GP2GP */
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
351 5, /* GP2FP */
352 5, /* FP2GP */
353 2 /* FP2FP */
354 };
355
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
357 {
358 1, /* GP2GP */
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
361 5, /* GP2FP */
362 5, /* FP2GP */
363 2 /* FP2FP */
364 };
365
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
367 {
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost (actual costs are 4 and 9). */
371 9, /* GP2FP */
372 9, /* FP2GP */
373 1 /* FP2FP */
374 };
375
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
377 {
378 2, /* GP2GP */
379 2, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 2 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
395 {
396 2, /* GP2GP */
397 /* Avoid the use of int<->fp moves for spilling. */
398 6, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of int<->fp moves for spilling. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 4 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost tsv110_regmove_cost =
413 {
414 1, /* GP2GP */
415 /* Avoid the use of slow int<->fp moves for spilling by setting
416 their cost higher than memmov_cost. */
417 2, /* GP2FP */
418 3, /* FP2GP */
419 2 /* FP2FP */
420 };
421
422 static const struct cpu_regmove_cost a64fx_regmove_cost =
423 {
424 1, /* GP2GP */
425 /* Avoid the use of slow int<->fp moves for spilling by setting
426 their cost higher than memmov_cost. */
427 5, /* GP2FP */
428 7, /* FP2GP */
429 2 /* FP2FP */
430 };
431
432 /* Generic costs for vector insn classes. */
433 static const struct cpu_vector_cost generic_vector_cost =
434 {
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 1, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 1, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 2, /* vec_permute_cost */
442 1, /* vec_to_scalar_cost */
443 1, /* scalar_to_vec_cost */
444 1, /* vec_align_load_cost */
445 1, /* vec_unalign_load_cost */
446 1, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 1 /* cond_not_taken_branch_cost */
450 };
451
452 /* QDF24XX costs for vector insn classes. */
453 static const struct cpu_vector_cost qdf24xx_vector_cost =
454 {
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 1, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 1, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 2, /* vec_permute_cost */
462 1, /* vec_to_scalar_cost */
463 1, /* scalar_to_vec_cost */
464 1, /* vec_align_load_cost */
465 1, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 3, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
470 };
471
472 /* ThunderX costs for vector insn classes. */
473 static const struct cpu_vector_cost thunderx_vector_cost =
474 {
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 3, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 4, /* vec_int_stmt_cost */
480 1, /* vec_fp_stmt_cost */
481 4, /* vec_permute_cost */
482 2, /* vec_to_scalar_cost */
483 2, /* scalar_to_vec_cost */
484 3, /* vec_align_load_cost */
485 5, /* vec_unalign_load_cost */
486 5, /* vec_unalign_store_cost */
487 1, /* vec_store_cost */
488 3, /* cond_taken_branch_cost */
489 3 /* cond_not_taken_branch_cost */
490 };
491
492 static const struct cpu_vector_cost tsv110_vector_cost =
493 {
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 3, /* vec_to_scalar_cost */
502 2, /* scalar_to_vec_cost */
503 5, /* vec_align_load_cost */
504 5, /* vec_unalign_load_cost */
505 1, /* vec_unalign_store_cost */
506 1, /* vec_store_cost */
507 1, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
509 };
510
511 /* Costs for vector insn classes for Cortex-A57. */
512 static const struct cpu_vector_cost cortexa57_vector_cost =
513 {
514 1, /* scalar_int_stmt_cost */
515 1, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 2, /* vec_int_stmt_cost */
519 2, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 8, /* vec_to_scalar_cost */
522 8, /* scalar_to_vec_cost */
523 4, /* vec_align_load_cost */
524 4, /* vec_unalign_load_cost */
525 1, /* vec_unalign_store_cost */
526 1, /* vec_store_cost */
527 1, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
529 };
530
531 static const struct cpu_vector_cost exynosm1_vector_cost =
532 {
533 1, /* scalar_int_stmt_cost */
534 1, /* scalar_fp_stmt_cost */
535 5, /* scalar_load_cost */
536 1, /* scalar_store_cost */
537 3, /* vec_int_stmt_cost */
538 3, /* vec_fp_stmt_cost */
539 3, /* vec_permute_cost */
540 3, /* vec_to_scalar_cost */
541 3, /* scalar_to_vec_cost */
542 5, /* vec_align_load_cost */
543 5, /* vec_unalign_load_cost */
544 1, /* vec_unalign_store_cost */
545 1, /* vec_store_cost */
546 1, /* cond_taken_branch_cost */
547 1 /* cond_not_taken_branch_cost */
548 };
549
550 /* Costs for vector insn classes for X-Gene 1. */
551 static const struct cpu_vector_cost xgene1_vector_cost =
552 {
553 1, /* scalar_int_stmt_cost */
554 1, /* scalar_fp_stmt_cost */
555 5, /* scalar_load_cost */
556 1, /* scalar_store_cost */
557 2, /* vec_int_stmt_cost */
558 2, /* vec_fp_stmt_cost */
559 2, /* vec_permute_cost */
560 4, /* vec_to_scalar_cost */
561 4, /* scalar_to_vec_cost */
562 10, /* vec_align_load_cost */
563 10, /* vec_unalign_load_cost */
564 2, /* vec_unalign_store_cost */
565 2, /* vec_store_cost */
566 2, /* cond_taken_branch_cost */
567 1 /* cond_not_taken_branch_cost */
568 };
569
570 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
571 static const struct cpu_vector_cost thunderx2t99_vector_cost =
572 {
573 1, /* scalar_int_stmt_cost */
574 6, /* scalar_fp_stmt_cost */
575 4, /* scalar_load_cost */
576 1, /* scalar_store_cost */
577 5, /* vec_int_stmt_cost */
578 6, /* vec_fp_stmt_cost */
579 3, /* vec_permute_cost */
580 6, /* vec_to_scalar_cost */
581 5, /* scalar_to_vec_cost */
582 8, /* vec_align_load_cost */
583 8, /* vec_unalign_load_cost */
584 4, /* vec_unalign_store_cost */
585 4, /* vec_store_cost */
586 2, /* cond_taken_branch_cost */
587 1 /* cond_not_taken_branch_cost */
588 };
589
590 static const struct cpu_vector_cost a64fx_vector_cost =
591 {
592 1, /* scalar_int_stmt_cost */
593 5, /* scalar_fp_stmt_cost */
594 4, /* scalar_load_cost */
595 1, /* scalar_store_cost */
596 2, /* vec_int_stmt_cost */
597 5, /* vec_fp_stmt_cost */
598 3, /* vec_permute_cost */
599 13, /* vec_to_scalar_cost */
600 4, /* scalar_to_vec_cost */
601 6, /* vec_align_load_cost */
602 6, /* vec_unalign_load_cost */
603 1, /* vec_unalign_store_cost */
604 1, /* vec_store_cost */
605 3, /* cond_taken_branch_cost */
606 1 /* cond_not_taken_branch_cost */
607 };
608
609 /* Generic costs for branch instructions. */
610 static const struct cpu_branch_cost generic_branch_cost =
611 {
612 1, /* Predictable. */
613 3 /* Unpredictable. */
614 };
615
616 /* Generic approximation modes. */
617 static const cpu_approx_modes generic_approx_modes =
618 {
619 AARCH64_APPROX_NONE, /* division */
620 AARCH64_APPROX_NONE, /* sqrt */
621 AARCH64_APPROX_NONE /* recip_sqrt */
622 };
623
624 /* Approximation modes for Exynos M1. */
625 static const cpu_approx_modes exynosm1_approx_modes =
626 {
627 AARCH64_APPROX_NONE, /* division */
628 AARCH64_APPROX_ALL, /* sqrt */
629 AARCH64_APPROX_ALL /* recip_sqrt */
630 };
631
632 /* Approximation modes for X-Gene 1. */
633 static const cpu_approx_modes xgene1_approx_modes =
634 {
635 AARCH64_APPROX_NONE, /* division */
636 AARCH64_APPROX_NONE, /* sqrt */
637 AARCH64_APPROX_ALL /* recip_sqrt */
638 };
639
640 /* Generic prefetch settings (which disable prefetch). */
641 static const cpu_prefetch_tune generic_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 -1, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune exynosm1_prefetch_tune =
653 {
654 0, /* num_slots */
655 -1, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 -1, /* l2_cache_size */
658 true, /* prefetch_dynamic_strides */
659 -1, /* minimum_stride */
660 -1 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
664 {
665 4, /* num_slots */
666 32, /* l1_cache_size */
667 64, /* l1_cache_line_size */
668 512, /* l2_cache_size */
669 false, /* prefetch_dynamic_strides */
670 2048, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 16*1024, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 3 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 128, /* l1_cache_line_size */
690 -1, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
697 {
698 8, /* num_slots */
699 32, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 256, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune tsv110_prefetch_tune =
708 {
709 0, /* num_slots */
710 64, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 512, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const cpu_prefetch_tune xgene1_prefetch_tune =
719 {
720 8, /* num_slots */
721 32, /* l1_cache_size */
722 64, /* l1_cache_line_size */
723 256, /* l2_cache_size */
724 true, /* prefetch_dynamic_strides */
725 -1, /* minimum_stride */
726 -1 /* default_opt_level */
727 };
728
729 static const cpu_prefetch_tune a64fx_prefetch_tune =
730 {
731 8, /* num_slots */
732 64, /* l1_cache_size */
733 256, /* l1_cache_line_size */
734 32768, /* l2_cache_size */
735 true, /* prefetch_dynamic_strides */
736 -1, /* minimum_stride */
737 -1 /* default_opt_level */
738 };
739
740 static const struct tune_params generic_tunings =
741 {
742 &cortexa57_extra_costs,
743 &generic_addrcost_table,
744 &generic_regmove_cost,
745 &generic_vector_cost,
746 &generic_branch_cost,
747 &generic_approx_modes,
748 SVE_NOT_IMPLEMENTED, /* sve_width */
749 4, /* memmov_cost */
750 2, /* issue_rate */
751 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
752 "8", /* function_align. */
753 "4", /* jump_align. */
754 "8", /* loop_align. */
755 2, /* int_reassoc_width. */
756 4, /* fp_reassoc_width. */
757 1, /* vec_reassoc_width. */
758 2, /* min_div_recip_mul_sf. */
759 2, /* min_div_recip_mul_df. */
760 0, /* max_case_values. */
761 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
762 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
763 &generic_prefetch_tune
764 };
765
766 static const struct tune_params cortexa35_tunings =
767 {
768 &cortexa53_extra_costs,
769 &generic_addrcost_table,
770 &cortexa53_regmove_cost,
771 &generic_vector_cost,
772 &generic_branch_cost,
773 &generic_approx_modes,
774 SVE_NOT_IMPLEMENTED, /* sve_width */
775 4, /* memmov_cost */
776 1, /* issue_rate */
777 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
778 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
779 "16", /* function_align. */
780 "4", /* jump_align. */
781 "8", /* loop_align. */
782 2, /* int_reassoc_width. */
783 4, /* fp_reassoc_width. */
784 1, /* vec_reassoc_width. */
785 2, /* min_div_recip_mul_sf. */
786 2, /* min_div_recip_mul_df. */
787 0, /* max_case_values. */
788 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
789 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
790 &generic_prefetch_tune
791 };
792
793 static const struct tune_params cortexa53_tunings =
794 {
795 &cortexa53_extra_costs,
796 &generic_addrcost_table,
797 &cortexa53_regmove_cost,
798 &generic_vector_cost,
799 &generic_branch_cost,
800 &generic_approx_modes,
801 SVE_NOT_IMPLEMENTED, /* sve_width */
802 4, /* memmov_cost */
803 2, /* issue_rate */
804 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
805 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
806 "16", /* function_align. */
807 "4", /* jump_align. */
808 "8", /* loop_align. */
809 2, /* int_reassoc_width. */
810 4, /* fp_reassoc_width. */
811 1, /* vec_reassoc_width. */
812 2, /* min_div_recip_mul_sf. */
813 2, /* min_div_recip_mul_df. */
814 0, /* max_case_values. */
815 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
816 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
817 &generic_prefetch_tune
818 };
819
820 static const struct tune_params cortexa57_tunings =
821 {
822 &cortexa57_extra_costs,
823 &generic_addrcost_table,
824 &cortexa57_regmove_cost,
825 &cortexa57_vector_cost,
826 &generic_branch_cost,
827 &generic_approx_modes,
828 SVE_NOT_IMPLEMENTED, /* sve_width */
829 4, /* memmov_cost */
830 3, /* issue_rate */
831 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
832 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
833 "16", /* function_align. */
834 "4", /* jump_align. */
835 "8", /* loop_align. */
836 2, /* int_reassoc_width. */
837 4, /* fp_reassoc_width. */
838 1, /* vec_reassoc_width. */
839 2, /* min_div_recip_mul_sf. */
840 2, /* min_div_recip_mul_df. */
841 0, /* max_case_values. */
842 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
843 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
844 &generic_prefetch_tune
845 };
846
847 static const struct tune_params cortexa72_tunings =
848 {
849 &cortexa57_extra_costs,
850 &generic_addrcost_table,
851 &cortexa57_regmove_cost,
852 &cortexa57_vector_cost,
853 &generic_branch_cost,
854 &generic_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
859 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
860 "16", /* function_align. */
861 "4", /* jump_align. */
862 "8", /* loop_align. */
863 2, /* int_reassoc_width. */
864 4, /* fp_reassoc_width. */
865 1, /* vec_reassoc_width. */
866 2, /* min_div_recip_mul_sf. */
867 2, /* min_div_recip_mul_df. */
868 0, /* max_case_values. */
869 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
870 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
871 &generic_prefetch_tune
872 };
873
874 static const struct tune_params cortexa73_tunings =
875 {
876 &cortexa57_extra_costs,
877 &generic_addrcost_table,
878 &cortexa57_regmove_cost,
879 &cortexa57_vector_cost,
880 &generic_branch_cost,
881 &generic_approx_modes,
882 SVE_NOT_IMPLEMENTED, /* sve_width */
883 4, /* memmov_cost. */
884 2, /* issue_rate. */
885 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
886 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
887 "16", /* function_align. */
888 "4", /* jump_align. */
889 "8", /* loop_align. */
890 2, /* int_reassoc_width. */
891 4, /* fp_reassoc_width. */
892 1, /* vec_reassoc_width. */
893 2, /* min_div_recip_mul_sf. */
894 2, /* min_div_recip_mul_df. */
895 0, /* max_case_values. */
896 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
897 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
898 &generic_prefetch_tune
899 };
900
901
902
903 static const struct tune_params exynosm1_tunings =
904 {
905 &exynosm1_extra_costs,
906 &exynosm1_addrcost_table,
907 &exynosm1_regmove_cost,
908 &exynosm1_vector_cost,
909 &generic_branch_cost,
910 &exynosm1_approx_modes,
911 SVE_NOT_IMPLEMENTED, /* sve_width */
912 4, /* memmov_cost */
913 3, /* issue_rate */
914 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
915 "4", /* function_align. */
916 "4", /* jump_align. */
917 "4", /* loop_align. */
918 2, /* int_reassoc_width. */
919 4, /* fp_reassoc_width. */
920 1, /* vec_reassoc_width. */
921 2, /* min_div_recip_mul_sf. */
922 2, /* min_div_recip_mul_df. */
923 48, /* max_case_values. */
924 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
925 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
926 &exynosm1_prefetch_tune
927 };
928
929 static const struct tune_params thunderxt88_tunings =
930 {
931 &thunderx_extra_costs,
932 &generic_addrcost_table,
933 &thunderx_regmove_cost,
934 &thunderx_vector_cost,
935 &generic_branch_cost,
936 &generic_approx_modes,
937 SVE_NOT_IMPLEMENTED, /* sve_width */
938 6, /* memmov_cost */
939 2, /* issue_rate */
940 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
941 "8", /* function_align. */
942 "8", /* jump_align. */
943 "8", /* loop_align. */
944 2, /* int_reassoc_width. */
945 4, /* fp_reassoc_width. */
946 1, /* vec_reassoc_width. */
947 2, /* min_div_recip_mul_sf. */
948 2, /* min_div_recip_mul_df. */
949 0, /* max_case_values. */
950 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
951 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
952 &thunderxt88_prefetch_tune
953 };
954
955 static const struct tune_params thunderx_tunings =
956 {
957 &thunderx_extra_costs,
958 &generic_addrcost_table,
959 &thunderx_regmove_cost,
960 &thunderx_vector_cost,
961 &generic_branch_cost,
962 &generic_approx_modes,
963 SVE_NOT_IMPLEMENTED, /* sve_width */
964 6, /* memmov_cost */
965 2, /* issue_rate */
966 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
967 "8", /* function_align. */
968 "8", /* jump_align. */
969 "8", /* loop_align. */
970 2, /* int_reassoc_width. */
971 4, /* fp_reassoc_width. */
972 1, /* vec_reassoc_width. */
973 2, /* min_div_recip_mul_sf. */
974 2, /* min_div_recip_mul_df. */
975 0, /* max_case_values. */
976 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
977 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
978 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
979 &thunderx_prefetch_tune
980 };
981
982 static const struct tune_params tsv110_tunings =
983 {
984 &tsv110_extra_costs,
985 &tsv110_addrcost_table,
986 &tsv110_regmove_cost,
987 &tsv110_vector_cost,
988 &generic_branch_cost,
989 &generic_approx_modes,
990 SVE_NOT_IMPLEMENTED, /* sve_width */
991 4, /* memmov_cost */
992 4, /* issue_rate */
993 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
994 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
995 "16", /* function_align. */
996 "4", /* jump_align. */
997 "8", /* loop_align. */
998 2, /* int_reassoc_width. */
999 4, /* fp_reassoc_width. */
1000 1, /* vec_reassoc_width. */
1001 2, /* min_div_recip_mul_sf. */
1002 2, /* min_div_recip_mul_df. */
1003 0, /* max_case_values. */
1004 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1005 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1006 &tsv110_prefetch_tune
1007 };
1008
1009 static const struct tune_params xgene1_tunings =
1010 {
1011 &xgene1_extra_costs,
1012 &xgene1_addrcost_table,
1013 &xgene1_regmove_cost,
1014 &xgene1_vector_cost,
1015 &generic_branch_cost,
1016 &xgene1_approx_modes,
1017 SVE_NOT_IMPLEMENTED, /* sve_width */
1018 6, /* memmov_cost */
1019 4, /* issue_rate */
1020 AARCH64_FUSE_NOTHING, /* fusible_ops */
1021 "16", /* function_align. */
1022 "16", /* jump_align. */
1023 "16", /* loop_align. */
1024 2, /* int_reassoc_width. */
1025 4, /* fp_reassoc_width. */
1026 1, /* vec_reassoc_width. */
1027 2, /* min_div_recip_mul_sf. */
1028 2, /* min_div_recip_mul_df. */
1029 17, /* max_case_values. */
1030 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1031 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1032 &xgene1_prefetch_tune
1033 };
1034
1035 static const struct tune_params emag_tunings =
1036 {
1037 &xgene1_extra_costs,
1038 &xgene1_addrcost_table,
1039 &xgene1_regmove_cost,
1040 &xgene1_vector_cost,
1041 &generic_branch_cost,
1042 &xgene1_approx_modes,
1043 SVE_NOT_IMPLEMENTED,
1044 6, /* memmov_cost */
1045 4, /* issue_rate */
1046 AARCH64_FUSE_NOTHING, /* fusible_ops */
1047 "16", /* function_align. */
1048 "16", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 17, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1058 &xgene1_prefetch_tune
1059 };
1060
1061 static const struct tune_params qdf24xx_tunings =
1062 {
1063 &qdf24xx_extra_costs,
1064 &qdf24xx_addrcost_table,
1065 &qdf24xx_regmove_cost,
1066 &qdf24xx_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost */
1071 4, /* issue_rate */
1072 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1073 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 2, /* int_reassoc_width. */
1078 4, /* fp_reassoc_width. */
1079 1, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1085 &qdf24xx_prefetch_tune
1086 };
1087
1088 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1089 for now. */
1090 static const struct tune_params saphira_tunings =
1091 {
1092 &generic_extra_costs,
1093 &generic_addrcost_table,
1094 &generic_regmove_cost,
1095 &generic_vector_cost,
1096 &generic_branch_cost,
1097 &generic_approx_modes,
1098 SVE_NOT_IMPLEMENTED, /* sve_width */
1099 4, /* memmov_cost */
1100 4, /* issue_rate */
1101 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1102 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1103 "16", /* function_align. */
1104 "8", /* jump_align. */
1105 "16", /* loop_align. */
1106 2, /* int_reassoc_width. */
1107 4, /* fp_reassoc_width. */
1108 1, /* vec_reassoc_width. */
1109 2, /* min_div_recip_mul_sf. */
1110 2, /* min_div_recip_mul_df. */
1111 0, /* max_case_values. */
1112 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1113 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1114 &generic_prefetch_tune
1115 };
1116
1117 static const struct tune_params thunderx2t99_tunings =
1118 {
1119 &thunderx2t99_extra_costs,
1120 &thunderx2t99_addrcost_table,
1121 &thunderx2t99_regmove_cost,
1122 &thunderx2t99_vector_cost,
1123 &generic_branch_cost,
1124 &generic_approx_modes,
1125 SVE_NOT_IMPLEMENTED, /* sve_width */
1126 4, /* memmov_cost. */
1127 4, /* issue_rate. */
1128 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1129 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1130 "16", /* function_align. */
1131 "8", /* jump_align. */
1132 "16", /* loop_align. */
1133 3, /* int_reassoc_width. */
1134 2, /* fp_reassoc_width. */
1135 2, /* vec_reassoc_width. */
1136 2, /* min_div_recip_mul_sf. */
1137 2, /* min_div_recip_mul_df. */
1138 0, /* max_case_values. */
1139 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1140 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1141 &thunderx2t99_prefetch_tune
1142 };
1143
1144 static const struct tune_params neoversen1_tunings =
1145 {
1146 &cortexa57_extra_costs,
1147 &generic_addrcost_table,
1148 &generic_regmove_cost,
1149 &cortexa57_vector_cost,
1150 &generic_branch_cost,
1151 &generic_approx_modes,
1152 SVE_NOT_IMPLEMENTED, /* sve_width */
1153 4, /* memmov_cost */
1154 3, /* issue_rate */
1155 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1156 "32:16", /* function_align. */
1157 "32:16", /* jump_align. */
1158 "32:16", /* loop_align. */
1159 2, /* int_reassoc_width. */
1160 4, /* fp_reassoc_width. */
1161 2, /* vec_reassoc_width. */
1162 2, /* min_div_recip_mul_sf. */
1163 2, /* min_div_recip_mul_df. */
1164 0, /* max_case_values. */
1165 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1166 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1167 &generic_prefetch_tune
1168 };
1169
1170 static const struct tune_params neoversev1_tunings =
1171 {
1172 &cortexa57_extra_costs,
1173 &generic_addrcost_table,
1174 &generic_regmove_cost,
1175 &cortexa57_vector_cost,
1176 &generic_branch_cost,
1177 &generic_approx_modes,
1178 SVE_256, /* sve_width */
1179 4, /* memmov_cost */
1180 3, /* issue_rate */
1181 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1182 "32:16", /* function_align. */
1183 "32:16", /* jump_align. */
1184 "32:16", /* loop_align. */
1185 2, /* int_reassoc_width. */
1186 4, /* fp_reassoc_width. */
1187 2, /* vec_reassoc_width. */
1188 2, /* min_div_recip_mul_sf. */
1189 2, /* min_div_recip_mul_df. */
1190 0, /* max_case_values. */
1191 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1192 (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC), /* tune_flags. */
1193 &generic_prefetch_tune
1194 };
1195
1196 static const struct tune_params neoversen2_tunings =
1197 {
1198 &cortexa57_extra_costs,
1199 &generic_addrcost_table,
1200 &generic_regmove_cost,
1201 &cortexa57_vector_cost,
1202 &generic_branch_cost,
1203 &generic_approx_modes,
1204 SVE_128, /* sve_width */
1205 4, /* memmov_cost */
1206 3, /* issue_rate */
1207 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1208 "32:16", /* function_align. */
1209 "32:16", /* jump_align. */
1210 "32:16", /* loop_align. */
1211 2, /* int_reassoc_width. */
1212 4, /* fp_reassoc_width. */
1213 2, /* vec_reassoc_width. */
1214 2, /* min_div_recip_mul_sf. */
1215 2, /* min_div_recip_mul_df. */
1216 0, /* max_case_values. */
1217 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1218 (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC), /* tune_flags. */
1219 &generic_prefetch_tune
1220 };
1221
1222 static const struct tune_params a64fx_tunings =
1223 {
1224 &a64fx_extra_costs,
1225 &a64fx_addrcost_table,
1226 &a64fx_regmove_cost,
1227 &a64fx_vector_cost,
1228 &generic_branch_cost,
1229 &generic_approx_modes,
1230 SVE_512, /* sve_width */
1231 4, /* memmov_cost */
1232 7, /* issue_rate */
1233 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1234 "32", /* function_align. */
1235 "16", /* jump_align. */
1236 "32", /* loop_align. */
1237 4, /* int_reassoc_width. */
1238 2, /* fp_reassoc_width. */
1239 2, /* vec_reassoc_width. */
1240 2, /* min_div_recip_mul_sf. */
1241 2, /* min_div_recip_mul_df. */
1242 0, /* max_case_values. */
1243 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1244 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1245 &a64fx_prefetch_tune
1246 };
1247
1248 /* Support for fine-grained override of the tuning structures. */
1249 struct aarch64_tuning_override_function
1250 {
1251 const char* name;
1252 void (*parse_override)(const char*, struct tune_params*);
1253 };
1254
1255 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1256 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1257 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1258
1259 static const struct aarch64_tuning_override_function
1260 aarch64_tuning_override_functions[] =
1261 {
1262 { "fuse", aarch64_parse_fuse_string },
1263 { "tune", aarch64_parse_tune_string },
1264 { "sve_width", aarch64_parse_sve_width_string },
1265 { NULL, NULL }
1266 };
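/* For example, an option such as -moverride=sve_width=256 selects the
   "sve_width" entry above and hands the value string to
   aarch64_parse_sve_width_string.  */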
1267
1268 /* A processor implementing AArch64. */
1269 struct processor
1270 {
1271 const char *const name;
1272 enum aarch64_processor ident;
1273 enum aarch64_processor sched_core;
1274 enum aarch64_arch arch;
1275 unsigned architecture_version;
1276 const unsigned long flags;
1277 const struct tune_params *const tune;
1278 };
1279
1280 /* Architectures implementing AArch64. */
1281 static const struct processor all_architectures[] =
1282 {
1283 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1284 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1285 #include "aarch64-arches.def"
1286 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1287 };
1288
1289 /* Processor cores implementing AArch64. */
1290 static const struct processor all_cores[] =
1291 {
1292 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1293 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1294 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1295 FLAGS, &COSTS##_tunings},
1296 #include "aarch64-cores.def"
1297 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1298 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1299 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1300 };
1301
1302
1303 /* Target specification. These are populated by the -march, -mtune, -mcpu
1304 handling code or by target attributes. */
1305 static const struct processor *selected_arch;
1306 static const struct processor *selected_cpu;
1307 static const struct processor *selected_tune;
1308
1309 /* The current tuning set. */
1310 struct tune_params aarch64_tune_params = generic_tunings;
1311
1312 /* Table of machine attributes. */
1313 static const struct attribute_spec aarch64_attribute_table[] =
1314 {
1315 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1316 affects_type_identity, handler, exclude } */
1317 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1318 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1319 };
1320
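/* The ISA feature flags of the selected CPU, or 0 if no CPU has been
   selected yet.  */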
1321 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1322
1323 /* An ISA extension in the co-processor and main instruction set space. */
1324 struct aarch64_option_extension
1325 {
1326 const char *const name;
1327 const unsigned long flags_on;
1328 const unsigned long flags_off;
1329 };
1330
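/* Condition codes, in the order of the AArch64 4-bit condition field
   encoding (EQ = 0b0000 through NV = 0b1111).  */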
1331 typedef enum aarch64_cond_code
1332 {
1333 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1334 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1335 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1336 }
1337 aarch64_cc;
1338
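/* Adjacent condition codes form complementary pairs (EQ/NE, CS/CC, ...,
   GT/LE), so flipping the low bit of a code yields its inverse.  */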
1339 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1340
1341 struct aarch64_branch_protect_type
1342 {
1343 /* The type's name that the user passes to the branch-protection option
1344 string. */
1345 const char* name;
1346 /* Function to handle the protection type and set global variables.
1347 First argument is the string token corresponding with this type and the
1348 second argument is the next token in the option string.
1349 Return values:
1350 * AARCH64_PARSE_OK: Handling was successful.
1351 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1352 should print an error.
1353 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1354 own error. */
1355 enum aarch64_parse_opt_result (*handler)(char*, char*);
1356 /* A list of types that can follow this type in the option string. */
1357 const aarch64_branch_protect_type* subtypes;
1358 unsigned int num_subtypes;
1359 };
1360
1361 static enum aarch64_parse_opt_result
1362 aarch64_handle_no_branch_protection (char* str, char* rest)
1363 {
1364 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1365 aarch64_enable_bti = 0;
1366 if (rest)
1367 {
1368 error ("unexpected %<%s%> after %<%s%>", rest, str);
1369 return AARCH64_PARSE_INVALID_FEATURE;
1370 }
1371 return AARCH64_PARSE_OK;
1372 }
1373
1374 static enum aarch64_parse_opt_result
1375 aarch64_handle_standard_branch_protection (char* str, char* rest)
1376 {
1377 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1378 aarch64_enable_bti = 1;
1379 if (rest)
1380 {
1381 error ("unexpected %<%s%> after %<%s%>", rest, str);
1382 return AARCH64_PARSE_INVALID_FEATURE;
1383 }
1384 return AARCH64_PARSE_OK;
1385 }
1386
1387 static enum aarch64_parse_opt_result
1388 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1389 char* rest ATTRIBUTE_UNUSED)
1390 {
1391 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1392 return AARCH64_PARSE_OK;
1393 }
1394
1395 static enum aarch64_parse_opt_result
1396 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1397 char* rest ATTRIBUTE_UNUSED)
1398 {
1399 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1400 return AARCH64_PARSE_OK;
1401 }
1402
1403 static enum aarch64_parse_opt_result
1404 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1405 char* rest ATTRIBUTE_UNUSED)
1406 {
1407 aarch64_enable_bti = 1;
1408 return AARCH64_PARSE_OK;
1409 }
1410
1411 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1412 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1413 { NULL, NULL, NULL, 0 }
1414 };
1415
1416 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1417 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1418 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1419 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1420 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1421 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1422 { NULL, NULL, NULL, 0 }
1423 };
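/* For example, the option string "pac-ret+leaf+bti" walks this table to
   enable return-address signing for all functions (including leaf
   functions) plus branch target identification.  */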
1424
1425 /* The condition codes of the processor, and the inverse function. */
1426 static const char * const aarch64_condition_codes[] =
1427 {
1428 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1429 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1430 };
1431
1432 /* Generate code to enable conditional branches in functions over 1 MiB. */
1433 const char *
1434 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1435 const char * branch_format)
1436 {
1437 rtx_code_label * tmp_label = gen_label_rtx ();
1438 char label_buf[256];
1439 char buffer[128];
1440 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1441 CODE_LABEL_NUMBER (tmp_label));
1442 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1443 rtx dest_label = operands[pos_label];
1444 operands[pos_label] = tmp_label;
1445
1446 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1447 output_asm_insn (buffer, operands);
1448
1449 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1450 operands[pos_label] = dest_label;
1451 output_asm_insn (buffer, operands);
1452 return "";
1453 }
1454
1455 void
1456 aarch64_err_no_fpadvsimd (machine_mode mode)
1457 {
1458 if (TARGET_GENERAL_REGS_ONLY)
1459 if (FLOAT_MODE_P (mode))
1460 error ("%qs is incompatible with the use of floating-point types",
1461 "-mgeneral-regs-only");
1462 else
1463 error ("%qs is incompatible with the use of vector types",
1464 "-mgeneral-regs-only");
1465 else
1466 if (FLOAT_MODE_P (mode))
1467 error ("%qs feature modifier is incompatible with the use of"
1468 " floating-point types", "+nofp");
1469 else
1470 error ("%qs feature modifier is incompatible with the use of"
1471 " vector types", "+nofp");
1472 }
1473
1474 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1475 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1476 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1477 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1478 and GENERAL_REGS is lower than the memory cost (in this case the best class
1479 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1480 cost results in bad allocations with many redundant int<->FP moves which
1481 are expensive on various cores.
1482 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1483 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1484 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1485 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1486 The result of this is that it is no longer inefficient to have a higher
1487 memory move cost than the register move cost.
1488 */
1489
1490 static reg_class_t
1491 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1492 reg_class_t best_class)
1493 {
1494 machine_mode mode;
1495
1496 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1497 || !reg_class_subset_p (FP_REGS, allocno_class))
1498 return allocno_class;
1499
1500 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1501 || !reg_class_subset_p (FP_REGS, best_class))
1502 return best_class;
1503
1504 mode = PSEUDO_REGNO_MODE (regno);
1505 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1506 }
1507
1508 static unsigned int
1509 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1510 {
1511 if (GET_MODE_UNIT_SIZE (mode) == 4)
1512 return aarch64_tune_params.min_div_recip_mul_sf;
1513 return aarch64_tune_params.min_div_recip_mul_df;
1514 }
1515
1516 /* Return the reassociation width of treeop OPC with mode MODE. */
1517 static int
1518 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1519 {
1520 if (VECTOR_MODE_P (mode))
1521 return aarch64_tune_params.vec_reassoc_width;
1522 if (INTEGRAL_MODE_P (mode))
1523 return aarch64_tune_params.int_reassoc_width;
1524 /* Avoid reassociating floating point addition so we emit more FMAs. */
1525 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1526 return aarch64_tune_params.fp_reassoc_width;
1527 return 1;
1528 }
1529
1530 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1531 unsigned
1532 aarch64_dbx_register_number (unsigned regno)
1533 {
1534 if (GP_REGNUM_P (regno))
1535 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1536 else if (regno == SP_REGNUM)
1537 return AARCH64_DWARF_SP;
1538 else if (FP_REGNUM_P (regno))
1539 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1540 else if (PR_REGNUM_P (regno))
1541 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1542 else if (regno == VG_REGNUM)
1543 return AARCH64_DWARF_VG;
1544
1545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1546 equivalent DWARF register. */
1547 return DWARF_FRAME_REGISTERS;
1548 }
1549
1550 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1551 static bool
1552 aarch64_advsimd_struct_mode_p (machine_mode mode)
1553 {
1554 return (TARGET_SIMD
1555 && (mode == OImode || mode == CImode || mode == XImode));
1556 }
1557
1558 /* Return true if MODE is an SVE predicate mode. */
1559 static bool
1560 aarch64_sve_pred_mode_p (machine_mode mode)
1561 {
1562 return (TARGET_SVE
1563 && (mode == VNx16BImode
1564 || mode == VNx8BImode
1565 || mode == VNx4BImode
1566 || mode == VNx2BImode));
1567 }
1568
1569 /* Three mutually-exclusive flags describing a vector or predicate type. */
1570 const unsigned int VEC_ADVSIMD = 1;
1571 const unsigned int VEC_SVE_DATA = 2;
1572 const unsigned int VEC_SVE_PRED = 4;
1573 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1574 a structure of 2, 3 or 4 vectors. */
1575 const unsigned int VEC_STRUCT = 8;
1576 /* Useful combinations of the above. */
1577 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1578 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1579
1580 /* Return a set of flags describing the vector properties of mode MODE.
1581 Ignore modes that are not supported by the current target. */
1582 static unsigned int
1583 aarch64_classify_vector_mode (machine_mode mode)
1584 {
1585 if (aarch64_advsimd_struct_mode_p (mode))
1586 return VEC_ADVSIMD | VEC_STRUCT;
1587
1588 if (aarch64_sve_pred_mode_p (mode))
1589 return VEC_SVE_PRED;
1590
1591 scalar_mode inner = GET_MODE_INNER (mode);
1592 if (VECTOR_MODE_P (mode)
1593 && (inner == QImode
1594 || inner == HImode
1595 || inner == HFmode
1596 || inner == SImode
1597 || inner == SFmode
1598 || inner == DImode
1599 || inner == DFmode))
1600 {
1601 if (TARGET_SVE)
1602 {
1603 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1604 return VEC_SVE_DATA;
1605 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1606 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1607 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1608 return VEC_SVE_DATA | VEC_STRUCT;
1609 }
1610
1611 /* This includes V1DF but not V1DI (which doesn't exist). */
1612 if (TARGET_SIMD
1613 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1614 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1615 return VEC_ADVSIMD;
1616 }
1617
1618 return 0;
1619 }
1620
1621 /* Return true if MODE is any of the data vector modes, including
1622 structure modes. */
1623 static bool
1624 aarch64_vector_data_mode_p (machine_mode mode)
1625 {
1626 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1627 }
1628
1629 /* Return true if MODE is an SVE data vector mode; either a single vector
1630 or a structure of vectors. */
1631 static bool
1632 aarch64_sve_data_mode_p (machine_mode mode)
1633 {
1634 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1635 }
1636
1637 /* Implement target hook TARGET_ARRAY_MODE. */
1638 static opt_machine_mode
1639 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1640 {
1641 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1642 && IN_RANGE (nelems, 2, 4))
1643 return mode_for_vector (GET_MODE_INNER (mode),
1644 GET_MODE_NUNITS (mode) * nelems);
1645
1646 return opt_machine_mode ();
1647 }
1648
1649 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1650 static bool
1651 aarch64_array_mode_supported_p (machine_mode mode,
1652 unsigned HOST_WIDE_INT nelems)
1653 {
1654 if (TARGET_SIMD
1655 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1656 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1657 && (nelems >= 2 && nelems <= 4))
1658 return true;
1659
1660 return false;
1661 }
1662
1663 /* Return the SVE predicate mode to use for elements that have
1664 ELEM_NBYTES bytes, if such a mode exists. */
1665
1666 opt_machine_mode
1667 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1668 {
1669 if (TARGET_SVE)
1670 {
1671 if (elem_nbytes == 1)
1672 return VNx16BImode;
1673 if (elem_nbytes == 2)
1674 return VNx8BImode;
1675 if (elem_nbytes == 4)
1676 return VNx4BImode;
1677 if (elem_nbytes == 8)
1678 return VNx2BImode;
1679 }
1680 return opt_machine_mode ();
1681 }
1682
1683 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1684
1685 static opt_machine_mode
1686 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1687 {
1688 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1689 {
1690 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1691 machine_mode pred_mode;
1692 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1693 return pred_mode;
1694 }
1695
1696 return default_get_mask_mode (nunits, nbytes);
1697 }
1698
1699 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1700 prefer to use the first arithmetic operand as the else value if
1701 the else value doesn't matter, since that exactly matches the SVE
1702 destructive merging form. For ternary operations we could either
1703 pick the first operand and use FMAD-like instructions or the last
1704 operand and use FMLA-like instructions; the latter seems more
1705 natural. */
1706
1707 static tree
1708 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1709 {
1710 return nops == 3 ? ops[2] : ops[0];
1711 }
1712
1713 /* Implement TARGET_HARD_REGNO_NREGS. */
1714
1715 static unsigned int
1716 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1717 {
1718 /* ??? Logically we should only need to provide a value when
1719 HARD_REGNO_MODE_OK says that the combination is valid,
1720 but at the moment we need to handle all modes. Just ignore
1721 any runtime parts for registers that can't store them. */
1722 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1723 switch (aarch64_regno_regclass (regno))
1724 {
1725 case FP_REGS:
1726 case FP_LO_REGS:
1727 if (aarch64_sve_data_mode_p (mode))
1728 return exact_div (GET_MODE_SIZE (mode),
1729 BYTES_PER_SVE_VECTOR).to_constant ();
1730 return CEIL (lowest_size, UNITS_PER_VREG);
1731 case PR_REGS:
1732 case PR_LO_REGS:
1733 case PR_HI_REGS:
1734 return 1;
1735 default:
1736 return CEIL (lowest_size, UNITS_PER_WORD);
1737 }
1738 gcc_unreachable ();
1739 }
1740
1741 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1742
1743 static bool
1744 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1745 {
1746 if (GET_MODE_CLASS (mode) == MODE_CC)
1747 return regno == CC_REGNUM;
1748
1749 if (regno == VG_REGNUM)
1750 /* This must have the same size as _Unwind_Word. */
1751 return mode == DImode;
1752
1753 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1754 if (vec_flags & VEC_SVE_PRED)
1755 return PR_REGNUM_P (regno);
1756
1757 if (PR_REGNUM_P (regno))
1758 return 0;
1759
1760 if (regno == SP_REGNUM)
1761 /* The purpose of comparing with ptr_mode is to support the
1762 global register variable associated with the stack pointer
1763 register via the syntax of asm ("wsp") in ILP32. */
1764 return mode == Pmode || mode == ptr_mode;
1765
1766 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1767 return mode == Pmode;
1768
1769 if (GP_REGNUM_P (regno))
1770 {
1771 if (known_le (GET_MODE_SIZE (mode), 8))
1772 return true;
1773 else if (known_le (GET_MODE_SIZE (mode), 16))
1774 return (regno & 1) == 0;
1775 }
1776 else if (FP_REGNUM_P (regno))
1777 {
1778 if (vec_flags & VEC_STRUCT)
1779 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1780 else
1781 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1782 }
1783
1784 return false;
1785 }
1786
1787 /* Return true if this is a definition of a vectorized simd function. */
1788
1789 static bool
1790 aarch64_simd_decl_p (tree fndecl)
1791 {
1792 tree fntype;
1793
1794 if (fndecl == NULL)
1795 return false;
1796 fntype = TREE_TYPE (fndecl);
1797 if (fntype == NULL)
1798 return false;
1799
1800 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1801 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1802 return true;
1803
1804 return false;
1805 }
1806
1807 /* Return the mode a register save/restore should use. DImode for integer
1808 registers, DFmode for FP registers in non-SIMD functions (they only save
1809 the bottom half of a 128 bit register), or TFmode for FP registers in
1810 SIMD functions. */
1811
1812 static machine_mode
1813 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1814 {
1815 return GP_REGNUM_P (regno)
1816 ? E_DImode
1817 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1818 }
1819
1820 /* Return true if the instruction is a call to a SIMD function, false
1821 if it is not a SIMD function or if we do not know anything about
1822 the function. */
1823
1824 static bool
1825 aarch64_simd_call_p (rtx_insn *insn)
1826 {
1827 rtx symbol;
1828 rtx call;
1829 tree fndecl;
1830
1831 gcc_assert (CALL_P (insn));
1832 call = get_call_rtx_from (insn);
1833 symbol = XEXP (XEXP (call, 0), 0);
1834 if (GET_CODE (symbol) != SYMBOL_REF)
1835 return false;
1836 fndecl = SYMBOL_REF_DECL (symbol);
1837 if (!fndecl)
1838 return false;
1839
1840 return aarch64_simd_decl_p (fndecl);
1841 }
1842
1843 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1844 a function that uses the SIMD ABI, take advantage of the extra
1845 call-preserved registers that the ABI provides. */
1846
1847 void
1848 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1849 HARD_REG_SET *return_set)
1850 {
1851 if (aarch64_simd_call_p (insn))
1852 {
1853 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1854 if (FP_SIMD_SAVED_REGNUM_P (regno))
1855 CLEAR_HARD_REG_BIT (*return_set, regno);
1856 }
1857 }
1858
1859 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1860 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1861 clobbers the top 64 bits when restoring the bottom 64 bits. */
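/* For example, a TFmode value in V8 survives a call to a SIMD function
   but is partially clobbered by a standard call, which preserves only
   the low 64 bits of V8-V15.  */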
1862
1863 static bool
1864 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1865 machine_mode mode)
1866 {
1867 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1868 return FP_REGNUM_P (regno)
1869 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1870 }
1871
1872 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1873
1874 rtx_insn *
1875 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1876 {
1877 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1878
1879 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1880 return call_1;
1881 else
1882 return call_2;
1883 }
1884
1885 /* Implement REGMODE_NATURAL_SIZE. */
1886 poly_uint64
1887 aarch64_regmode_natural_size (machine_mode mode)
1888 {
1889 /* The natural size for SVE data modes is one SVE data vector,
1890 and similarly for predicates. We can't independently modify
1891 anything smaller than that. */
1892 /* ??? For now, only do this for variable-width SVE registers.
1893 Doing it for constant-sized registers breaks lower-subreg.c. */
1894 /* ??? And once that's fixed, we should probably have similar
1895 code for Advanced SIMD. */
1896 if (!aarch64_sve_vg.is_constant ())
1897 {
1898 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1899 if (vec_flags & VEC_SVE_PRED)
1900 return BYTES_PER_SVE_PRED;
1901 if (vec_flags & VEC_SVE_DATA)
1902 return BYTES_PER_SVE_VECTOR;
1903 }
1904 return UNITS_PER_WORD;
1905 }
1906
1907 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1908 machine_mode
1909 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1910 machine_mode mode)
1911 {
1912 /* The predicate mode determines which bits are significant and
1913 which are "don't care". Decreasing the number of lanes would
1914 lose data while increasing the number of lanes would make bits
1915 unnecessarily significant. */
1916 if (PR_REGNUM_P (regno))
1917 return mode;
1918 if (known_ge (GET_MODE_SIZE (mode), 4))
1919 return mode;
1920 else
1921 return SImode;
1922 }
1923
1924 /* Return true if I's bits are consecutive ones from the MSB. */
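/* For example, I == 0xffffffffffff0000 gives -I == 0x10000, a power of
   two, so exact_log2 returns 16 rather than HOST_WIDE_INT_M1 and the
   function returns true.  */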
1925 bool
1926 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1927 {
1928 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1929 }
1930
1931 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1932 that strcpy from constants will be faster. */
1933
1934 static HOST_WIDE_INT
1935 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1936 {
1937 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1938 return MAX (align, BITS_PER_WORD);
1939 return align;
1940 }
1941
1942 /* Return true if calls to DECL should be treated as
1943 long-calls (i.e. called via a register). */
1944 static bool
1945 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1946 {
1947 return false;
1948 }
1949
1950 /* Return true if calls to symbol-ref SYM should be treated as
1951 long-calls (i.e. called via a register). */
1952 bool
1953 aarch64_is_long_call_p (rtx sym)
1954 {
1955 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1956 }
1957
1958 /* Return true if calls to symbol-ref SYM should not go through
1959 plt stubs. */
1960
1961 bool
1962 aarch64_is_noplt_call_p (rtx sym)
1963 {
1964 const_tree decl = SYMBOL_REF_DECL (sym);
1965
1966 if (flag_pic
1967 && decl
1968 && (!flag_plt
1969 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1970 && !targetm.binds_local_p (decl))
1971 return true;
1972
1973 return false;
1974 }
1975
1976 /* Return true if the offsets to a zero/sign-extract operation
1977 represent an expression that matches an extend operation. The
1978 operands represent the parameters from
1979
1980 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
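/* For example, in DImode a MULT_IMM of 4 and an EXTRACT_IMM of 34 pass
   the test below: 34 & ~7 is 32 (a power of two), 34 & 7 is 2, and
   4 == 1 << 2, corresponding to a 32-bit extend combined with a left
   shift by 2.  */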
1981 bool
1982 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1983 rtx extract_imm)
1984 {
1985 HOST_WIDE_INT mult_val, extract_val;
1986
1987 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1988 return false;
1989
1990 mult_val = INTVAL (mult_imm);
1991 extract_val = INTVAL (extract_imm);
1992
1993 if (extract_val > 8
1994 && extract_val < GET_MODE_BITSIZE (mode)
1995 && exact_log2 (extract_val & ~7) > 0
1996 && (extract_val & 7) <= 4
1997 && mult_val == (1 << (extract_val & 7)))
1998 return true;
1999
2000 return false;
2001 }
2002
2003 /* Emit an insn that's a simple single-set. Both the operands must be
2004 known to be valid. */
2005 inline static rtx_insn *
2006 emit_set_insn (rtx x, rtx y)
2007 {
2008 return emit_insn (gen_rtx_SET (x, y));
2009 }
2010
2011 /* X and Y are two things to compare using CODE. Emit the compare insn and
2012 return the rtx for the CC register in the appropriate mode. */
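/* For a TImode comparison (which must be NE here), the test is split
   into a CMP of the low doublewords followed by a conditional compare
   (CCMP) of the high doublewords, so that the final EQ condition holds
   only if both halves compare equal.  */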
2013 rtx
2014 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2015 {
2016 machine_mode cmp_mode = GET_MODE (x);
2017 machine_mode cc_mode;
2018 rtx cc_reg;
2019
2020 if (cmp_mode == TImode)
2021 {
2022 gcc_assert (code == NE);
2023
2024 cc_mode = CCmode;
2025 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2026
2027 rtx x_lo = operand_subword (x, 0, 0, TImode);
2028 rtx y_lo = operand_subword (y, 0, 0, TImode);
2029 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2030
2031 rtx x_hi = operand_subword (x, 1, 0, TImode);
2032 rtx y_hi = operand_subword (y, 1, 0, TImode);
2033 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2034 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2035 GEN_INT (AARCH64_EQ)));
2036 }
2037 else
2038 {
2039 cc_mode = SELECT_CC_MODE (code, x, y);
2040 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2041 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2042 }
2043 return cc_reg;
2044 }
2045
2046 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2047
2048 static rtx
2049 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2050 machine_mode y_mode)
2051 {
2052 if (y_mode == E_QImode || y_mode == E_HImode)
2053 {
2054 if (CONST_INT_P (y))
2055 {
2056 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2057 y_mode = SImode;
2058 }
2059 else
2060 {
2061 rtx t, cc_reg;
2062 machine_mode cc_mode;
2063
2064 t = gen_rtx_ZERO_EXTEND (SImode, y);
2065 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2066 cc_mode = CC_SWPmode;
2067 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2068 emit_set_insn (cc_reg, t);
2069 return cc_reg;
2070 }
2071 }
2072
2073 if (!aarch64_plus_operand (y, y_mode))
2074 y = force_reg (y_mode, y);
2075
2076 return aarch64_gen_compare_reg (code, x, y);
2077 }
2078
2079 /* Build the SYMBOL_REF for __tls_get_addr. */
2080
2081 static GTY(()) rtx tls_get_addr_libfunc;
2082
2083 rtx
2084 aarch64_tls_get_addr (void)
2085 {
2086 if (!tls_get_addr_libfunc)
2087 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2088 return tls_get_addr_libfunc;
2089 }
2090
2091 /* Return the TLS model to use for ADDR. */
2092
2093 static enum tls_model
2094 tls_symbolic_operand_type (rtx addr)
2095 {
2096 enum tls_model tls_kind = TLS_MODEL_NONE;
2097 if (GET_CODE (addr) == CONST)
2098 {
2099 poly_int64 addend;
2100 rtx sym = strip_offset (addr, &addend);
2101 if (GET_CODE (sym) == SYMBOL_REF)
2102 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2103 }
2104 else if (GET_CODE (addr) == SYMBOL_REF)
2105 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2106
2107 return tls_kind;
2108 }
2109
2110 /* We allow LO_SUMs in our legitimate addresses so that combine can
2111 take care of combining addresses where necessary, but for generation
2112 purposes we generate the address
2113 as:
2114 RTL Absolute
2115 tmp = hi (symbol_ref); adrp x1, foo
2116 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2117 nop
2118
2119 PIC TLS
2120 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2121 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2122 bl __tls_get_addr
2123 nop
2124
2125 Load TLS symbol, depending on TLS mechanism and TLS access model.
2126
2127 Global Dynamic - Traditional TLS:
2128 adrp tmp, :tlsgd:imm
2129 add dest, tmp, #:tlsgd_lo12:imm
2130 bl __tls_get_addr
2131
2132 Global Dynamic - TLS Descriptors:
2133 adrp dest, :tlsdesc:imm
2134 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2135 add dest, dest, #:tlsdesc_lo12:imm
2136 blr tmp
2137 mrs tp, tpidr_el0
2138 add dest, dest, tp
2139
2140 Initial Exec:
2141 mrs tp, tpidr_el0
2142 adrp tmp, :gottprel:imm
2143 ldr dest, [tmp, #:gottprel_lo12:imm]
2144 add dest, dest, tp
2145
2146 Local Exec:
2147 mrs tp, tpidr_el0
2148 add t0, tp, #:tprel_hi12:imm, lsl #12
2149 add t0, t0, #:tprel_lo12_nc:imm
2150 */
2151
2152 static void
2153 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2154 enum aarch64_symbol_type type)
2155 {
2156 switch (type)
2157 {
2158 case SYMBOL_SMALL_ABSOLUTE:
2159 {
2160 /* In ILP32, the mode of dest can be either SImode or DImode. */
2161 rtx tmp_reg = dest;
2162 machine_mode mode = GET_MODE (dest);
2163
2164 gcc_assert (mode == Pmode || mode == ptr_mode);
2165
2166 if (can_create_pseudo_p ())
2167 tmp_reg = gen_reg_rtx (mode);
2168
2169 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2170 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2171 return;
2172 }
2173
2174 case SYMBOL_TINY_ABSOLUTE:
2175 emit_insn (gen_rtx_SET (dest, imm));
2176 return;
2177
2178 case SYMBOL_SMALL_GOT_28K:
2179 {
2180 machine_mode mode = GET_MODE (dest);
2181 rtx gp_rtx = pic_offset_table_rtx;
2182 rtx insn;
2183 rtx mem;
2184
2185 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2186 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
2187 decide rtx costs, in which case pic_offset_table_rtx is not
2188 initialized. In that case there is no need to generate the first adrp
2189 instruction, as the final cost for a global variable access is
2190 one instruction. */
2191 if (gp_rtx != NULL)
2192 {
2193 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2194 use the page base as the GOT base, the first page may be wasted; in
2195 the worst case there is only 28K of space for the GOT).
2196
2197 The generated instruction sequence for accessing a global variable
2198 is:
2199
2200 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2201
2202 Only one instruction is needed. But we must initialize
2203 pic_offset_table_rtx properly. We generate an initialization insn for
2204 every global access, and allow CSE to remove all redundant ones.
2205
2206 The final instruction sequence will look like the following
2207 for multiple global variable accesses.
2208
2209 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2210
2211 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2212 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2213 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2214 ... */
2215
2216 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2217 crtl->uses_pic_offset_table = 1;
2218 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2219
2220 if (mode != GET_MODE (gp_rtx))
2221 gp_rtx = gen_lowpart (mode, gp_rtx);
2222
2223 }
2224
2225 if (mode == ptr_mode)
2226 {
2227 if (mode == DImode)
2228 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2229 else
2230 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2231
2232 mem = XVECEXP (SET_SRC (insn), 0, 0);
2233 }
2234 else
2235 {
2236 gcc_assert (mode == Pmode);
2237
2238 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2239 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2240 }
2241
2242 /* The operand is expected to be a MEM. Whenever the related insn
2243 pattern changes, the code above that calculates MEM should be
2244 updated. */
2245 gcc_assert (GET_CODE (mem) == MEM);
2246 MEM_READONLY_P (mem) = 1;
2247 MEM_NOTRAP_P (mem) = 1;
2248 emit_insn (insn);
2249 return;
2250 }
2251
2252 case SYMBOL_SMALL_GOT_4G:
2253 {
2254 /* In ILP32, the mode of dest can be either SImode or DImode,
2255 while the got entry is always of SImode size. The mode of
2256 dest depends on how dest is used: if dest is assigned to a
2257 pointer (e.g. in the memory), it has SImode; it may have
2258 DImode if dest is dereferenced to access the memory.
2259 This is why we have to handle three different ldr_got_small
2260 patterns here (two patterns for ILP32). */
2261
2262 rtx insn;
2263 rtx mem;
2264 rtx tmp_reg = dest;
2265 machine_mode mode = GET_MODE (dest);
2266
2267 if (can_create_pseudo_p ())
2268 tmp_reg = gen_reg_rtx (mode);
2269
2270 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2271 if (mode == ptr_mode)
2272 {
2273 if (mode == DImode)
2274 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2275 else
2276 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2277
2278 mem = XVECEXP (SET_SRC (insn), 0, 0);
2279 }
2280 else
2281 {
2282 gcc_assert (mode == Pmode);
2283
2284 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2285 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2286 }
2287
2288 gcc_assert (GET_CODE (mem) == MEM);
2289 MEM_READONLY_P (mem) = 1;
2290 MEM_NOTRAP_P (mem) = 1;
2291 emit_insn (insn);
2292 return;
2293 }
2294
2295 case SYMBOL_SMALL_TLSGD:
2296 {
2297 rtx_insn *insns;
2298 machine_mode mode = GET_MODE (dest);
2299 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2300
2301 start_sequence ();
2302 if (TARGET_ILP32)
2303 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2304 else
2305 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2306 insns = get_insns ();
2307 end_sequence ();
2308
2309 RTL_CONST_CALL_P (insns) = 1;
2310 emit_libcall_block (insns, dest, result, imm);
2311 return;
2312 }
2313
2314 case SYMBOL_SMALL_TLSDESC:
2315 {
2316 machine_mode mode = GET_MODE (dest);
2317 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2318 rtx tp;
2319
2320 gcc_assert (mode == Pmode || mode == ptr_mode);
2321
2322 /* In ILP32, the got entry is always of SImode size. Unlike
2323 small GOT, the dest is fixed at reg 0. */
2324 if (TARGET_ILP32)
2325 emit_insn (gen_tlsdesc_small_si (imm));
2326 else
2327 emit_insn (gen_tlsdesc_small_di (imm));
2328 tp = aarch64_load_tp (NULL);
2329
2330 if (mode != Pmode)
2331 tp = gen_lowpart (mode, tp);
2332
2333 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2334 if (REG_P (dest))
2335 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2336 return;
2337 }
2338
2339 case SYMBOL_SMALL_TLSIE:
2340 {
2341 /* In ILP32, the mode of dest can be either SImode or DImode,
2342 while the got entry is always of SImode size. The mode of
2343 dest depends on how dest is used: if dest is assigned to a
2344 pointer (e.g. in the memory), it has SImode; it may have
2345 DImode if dest is dereferenced to access the memory.
2346 This is why we have to handle three different tlsie_small
2347 patterns here (two patterns for ILP32). */
2348 machine_mode mode = GET_MODE (dest);
2349 rtx tmp_reg = gen_reg_rtx (mode);
2350 rtx tp = aarch64_load_tp (NULL);
2351
2352 if (mode == ptr_mode)
2353 {
2354 if (mode == DImode)
2355 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2356 else
2357 {
2358 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2359 tp = gen_lowpart (mode, tp);
2360 }
2361 }
2362 else
2363 {
2364 gcc_assert (mode == Pmode);
2365 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2366 }
2367
2368 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2369 if (REG_P (dest))
2370 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2371 return;
2372 }
2373
2374 case SYMBOL_TLSLE12:
2375 case SYMBOL_TLSLE24:
2376 case SYMBOL_TLSLE32:
2377 case SYMBOL_TLSLE48:
2378 {
2379 machine_mode mode = GET_MODE (dest);
2380 rtx tp = aarch64_load_tp (NULL);
2381
2382 if (mode != Pmode)
2383 tp = gen_lowpart (mode, tp);
2384
2385 switch (type)
2386 {
2387 case SYMBOL_TLSLE12:
2388 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2389 (dest, tp, imm));
2390 break;
2391 case SYMBOL_TLSLE24:
2392 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2393 (dest, tp, imm));
2394 break;
2395 case SYMBOL_TLSLE32:
2396 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2397 (dest, imm));
2398 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2399 (dest, dest, tp));
2400 break;
2401 case SYMBOL_TLSLE48:
2402 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2403 (dest, imm));
2404 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2405 (dest, dest, tp));
2406 break;
2407 default:
2408 gcc_unreachable ();
2409 }
2410
2411 if (REG_P (dest))
2412 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2413 return;
2414 }
2415
2416 case SYMBOL_TINY_GOT:
2417 emit_insn (gen_ldr_got_tiny (dest, imm));
2418 return;
2419
2420 case SYMBOL_TINY_TLSIE:
2421 {
2422 machine_mode mode = GET_MODE (dest);
2423 rtx tp = aarch64_load_tp (NULL);
2424
2425 if (mode == ptr_mode)
2426 {
2427 if (mode == DImode)
2428 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2429 else
2430 {
2431 tp = gen_lowpart (mode, tp);
2432 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2433 }
2434 }
2435 else
2436 {
2437 gcc_assert (mode == Pmode);
2438 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2439 }
2440
2441 if (REG_P (dest))
2442 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2443 return;
2444 }
2445
2446 default:
2447 gcc_unreachable ();
2448 }
2449 }
2450
2451 /* Emit a move from SRC to DEST. Assume that the move expanders can
2452 handle all moves if !can_create_pseudo_p (). The distinction is
2453 important because, unlike emit_move_insn, the move expanders know
2454 how to force Pmode objects into the constant pool even when the
2455 constant pool address is not itself legitimate. */
2456 static rtx
2457 aarch64_emit_move (rtx dest, rtx src)
2458 {
2459 return (can_create_pseudo_p ()
2460 ? emit_move_insn (dest, src)
2461 : emit_move_insn_1 (dest, src));
2462 }
2463
2464 /* Apply UNOPTAB to OP and store the result in DEST. */
2465
2466 static void
2467 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2468 {
2469 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2470 if (dest != tmp)
2471 emit_move_insn (dest, tmp);
2472 }
2473
2474 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2475
2476 static void
2477 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2478 {
2479 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2480 OPTAB_DIRECT);
2481 if (dest != tmp)
2482 emit_move_insn (dest, tmp);
2483 }
2484
2485 /* Split a 128-bit move operation into two 64-bit move operations,
2486 taking care to handle partial overlap of register to register
2487 copies. Special cases are needed when moving between GP regs and
2488 FP regs. SRC can be a register, constant or memory; DST a register
2489 or memory. If either operand is memory it must not have any side
2490 effects. */
2491 void
2492 aarch64_split_128bit_move (rtx dst, rtx src)
2493 {
2494 rtx dst_lo, dst_hi;
2495 rtx src_lo, src_hi;
2496
2497 machine_mode mode = GET_MODE (dst);
2498
2499 gcc_assert (mode == TImode || mode == TFmode);
2500 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2501 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2502
2503 if (REG_P (dst) && REG_P (src))
2504 {
2505 int src_regno = REGNO (src);
2506 int dst_regno = REGNO (dst);
2507
2508 /* Handle FP <-> GP regs. */
2509 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2510 {
2511 src_lo = gen_lowpart (word_mode, src);
2512 src_hi = gen_highpart (word_mode, src);
2513
2514 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2515 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2516 return;
2517 }
2518 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2519 {
2520 dst_lo = gen_lowpart (word_mode, dst);
2521 dst_hi = gen_highpart (word_mode, dst);
2522
2523 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2524 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2525 return;
2526 }
2527 }
2528
2529 dst_lo = gen_lowpart (word_mode, dst);
2530 dst_hi = gen_highpart (word_mode, dst);
2531 src_lo = gen_lowpart (word_mode, src);
2532 src_hi = gen_highpart_mode (word_mode, mode, src);
2533
2534 /* At most one pairing may overlap. */
2535 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2536 {
2537 aarch64_emit_move (dst_hi, src_hi);
2538 aarch64_emit_move (dst_lo, src_lo);
2539 }
2540 else
2541 {
2542 aarch64_emit_move (dst_lo, src_lo);
2543 aarch64_emit_move (dst_hi, src_hi);
2544 }
2545 }
2546
2547 bool
2548 aarch64_split_128bit_move_p (rtx dst, rtx src)
2549 {
2550 return (! REG_P (src)
2551 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2552 }
2553
2554 /* Split a complex SIMD combine. */
2555
2556 void
2557 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2558 {
2559 machine_mode src_mode = GET_MODE (src1);
2560 machine_mode dst_mode = GET_MODE (dst);
2561
2562 gcc_assert (VECTOR_MODE_P (dst_mode));
2563 gcc_assert (register_operand (dst, dst_mode)
2564 && register_operand (src1, src_mode)
2565 && register_operand (src2, src_mode));
2566
2567 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2568 return;
2569 }
2570
2571 /* Split a complex SIMD move. */
2572
2573 void
2574 aarch64_split_simd_move (rtx dst, rtx src)
2575 {
2576 machine_mode src_mode = GET_MODE (src);
2577 machine_mode dst_mode = GET_MODE (dst);
2578
2579 gcc_assert (VECTOR_MODE_P (dst_mode));
2580
2581 if (REG_P (dst) && REG_P (src))
2582 {
2583 gcc_assert (VECTOR_MODE_P (src_mode));
2584 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2585 }
2586 }
2587
2588 bool
2589 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2590 machine_mode ymode, rtx y)
2591 {
2592 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2593 gcc_assert (r != NULL);
2594 return rtx_equal_p (x, r);
2595 }
2596
2597
2598 static rtx
2599 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2600 {
2601 if (can_create_pseudo_p ())
2602 return force_reg (mode, value);
2603 else
2604 {
2605 gcc_assert (x);
2606 aarch64_emit_move (x, value);
2607 return x;
2608 }
2609 }
2610
2611 /* Return true if we can move VALUE into a register using a single
2612 CNT[BHWD] instruction. */
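/* For example, the number of bytes in an SVE vector, poly_int64 (16, 16),
   satisfies this test and can be loaded with a single CNTB.  */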
2613
2614 static bool
2615 aarch64_sve_cnt_immediate_p (poly_int64 value)
2616 {
2617 HOST_WIDE_INT factor = value.coeffs[0];
2618 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2619 return (value.coeffs[1] == factor
2620 && IN_RANGE (factor, 2, 16 * 16)
2621 && (factor & 1) == 0
2622 && factor <= 16 * (factor & -factor));
2623 }
2624
2625 /* Likewise for rtx X. */
2626
2627 bool
2628 aarch64_sve_cnt_immediate_p (rtx x)
2629 {
2630 poly_int64 value;
2631 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2632 }
2633
2634 /* Return the asm string for an instruction with a CNT-like vector size
2635 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2636 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2637 first part of the operands template (the part that comes before the
2638 vector size itself). FACTOR is the number of quadwords.
2639 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2640 If it is zero, we can use any element size. */
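/* For example, with PREFIX "cnt", a FACTOR of 32 and a NELTS_PER_VQ of 0
   select byte elements and print as something like
   "cntb\t<operands>, all, mul #2".  */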
2641
2642 static char *
2643 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2644 unsigned int factor,
2645 unsigned int nelts_per_vq)
2646 {
2647 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2648
2649 if (nelts_per_vq == 0)
2650 /* There is some overlap in the ranges of the four CNT instructions.
2651 Here we always use the smallest possible element size, so that the
2652 multiplier is 1 wherever possible. */
2653 nelts_per_vq = factor & -factor;
2654 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2655 gcc_assert (IN_RANGE (shift, 1, 4));
2656 char suffix = "dwhb"[shift - 1];
2657
2658 factor >>= shift;
2659 unsigned int written;
2660 if (factor == 1)
2661 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2662 prefix, suffix, operands);
2663 else
2664 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2665 prefix, suffix, operands, factor);
2666 gcc_assert (written < sizeof (buffer));
2667 return buffer;
2668 }
2669
2670 /* Return the asm string for an instruction with a CNT-like vector size
2671 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2672 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2673 first part of the operands template (the part that comes before the
2674 vector size itself). X is the value of the vector size operand,
2675 as a polynomial integer rtx. */
2676
2677 char *
2678 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2679 rtx x)
2680 {
2681 poly_int64 value = rtx_to_poly_int64 (x);
2682 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2683 return aarch64_output_sve_cnt_immediate (prefix, operands,
2684 value.coeffs[1], 0);
2685 }
2686
2687 /* Return true if we can add VALUE to a register using a single ADDVL
2688 or ADDPL instruction. */
2689
2690 static bool
2691 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2692 {
2693 HOST_WIDE_INT factor = value.coeffs[0];
2694 if (factor == 0 || value.coeffs[1] != factor)
2695 return false;
2696 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2697 and a value of 16 is one vector width. */
2698 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2699 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2700 }
2701
2702 /* Likewise for rtx X. */
2703
2704 bool
2705 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2706 {
2707 poly_int64 value;
2708 return (poly_int_rtx_p (x, &value)
2709 && aarch64_sve_addvl_addpl_immediate_p (value));
2710 }
2711
2712 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2713 and storing the result in operand 0. */
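/* For example, when DEST and BASE differ, an offset of three vector
   lengths (coefficient 48) is printed as "addvl\t%x0, %x1, #3", while
   three predicate lengths (coefficient 6) give "addpl\t%x0, %x1, #3".  */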
2714
2715 char *
2716 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2717 {
2718 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2719 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2720 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2721
2722 /* Use INC or DEC if possible. */
2723 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2724 {
2725 if (aarch64_sve_cnt_immediate_p (offset_value))
2726 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2727 offset_value.coeffs[1], 0);
2728 if (aarch64_sve_cnt_immediate_p (-offset_value))
2729 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2730 -offset_value.coeffs[1], 0);
2731 }
2732
2733 int factor = offset_value.coeffs[1];
2734 if ((factor & 15) == 0)
2735 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2736 else
2737 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2738 return buffer;
2739 }
2740
2741 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2742 instruction. If it is, store the number of elements in each vector
2743 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2744 factor in *FACTOR_OUT (if nonnull). */
2745
2746 bool
2747 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2748 unsigned int *nelts_per_vq_out)
2749 {
2750 rtx elt;
2751 poly_int64 value;
2752
2753 if (!const_vec_duplicate_p (x, &elt)
2754 || !poly_int_rtx_p (elt, &value))
2755 return false;
2756
2757 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2758 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2759 /* There's no vector INCB. */
2760 return false;
2761
2762 HOST_WIDE_INT factor = value.coeffs[0];
2763 if (value.coeffs[1] != factor)
2764 return false;
2765
2766 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2767 if ((factor % nelts_per_vq) != 0
2768 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2769 return false;
2770
2771 if (factor_out)
2772 *factor_out = factor;
2773 if (nelts_per_vq_out)
2774 *nelts_per_vq_out = nelts_per_vq;
2775 return true;
2776 }
2777
2778 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2779 instruction. */
2780
2781 bool
2782 aarch64_sve_inc_dec_immediate_p (rtx x)
2783 {
2784 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2785 }
2786
2787 /* Return the asm template for an SVE vector INC or DEC instruction.
2788 OPERANDS gives the operands before the vector count and X is the
2789 value of the vector count operand itself. */
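/* For example, a VNx4SI duplicate of poly_int64 (8, 8) -- two full
   vectors' worth of 32-bit elements -- is printed as an INCW with
   "all, mul #2".  */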
2790
2791 char *
2792 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2793 {
2794 int factor;
2795 unsigned int nelts_per_vq;
2796 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2797 gcc_unreachable ();
2798 if (factor < 0)
2799 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2800 nelts_per_vq);
2801 else
2802 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2803 nelts_per_vq);
2804 }
2805
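/* Either emit (if GENERATE) or just count the instructions needed to
   move immediate IMM into integer register DEST of mode MODE, returning
   the instruction count.  For example, 0x0000123400005678 is built with
   a MOV of 0x5678 followed by a MOVK of 0x1234 into bits [47:32], so
   the function returns 2.  */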
2806 static int
2807 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2808 scalar_int_mode mode)
2809 {
2810 int i;
2811 unsigned HOST_WIDE_INT val, val2, mask;
2812 int one_match, zero_match;
2813 int num_insns;
2814
2815 val = INTVAL (imm);
2816
2817 if (aarch64_move_imm (val, mode))
2818 {
2819 if (generate)
2820 emit_insn (gen_rtx_SET (dest, imm));
2821 return 1;
2822 }
2823
2824 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2825 (with XXXX non-zero). In that case check to see if the move can be done in
2826 a smaller mode. */
2827 val2 = val & 0xffffffff;
2828 if (mode == DImode
2829 && aarch64_move_imm (val2, SImode)
2830 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2831 {
2832 if (generate)
2833 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2834
2835 /* Check whether we have to emit a second instruction by seeing
2836 if any of the upper 32 bits of the original DImode value are set. */
2837 if (val == val2)
2838 return 1;
2839
2840 i = (val >> 48) ? 48 : 32;
2841
2842 if (generate)
2843 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2844 GEN_INT ((val >> i) & 0xffff)));
2845
2846 return 2;
2847 }
2848
2849 if ((val >> 32) == 0 || mode == SImode)
2850 {
2851 if (generate)
2852 {
2853 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2854 if (mode == SImode)
2855 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2856 GEN_INT ((val >> 16) & 0xffff)));
2857 else
2858 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2859 GEN_INT ((val >> 16) & 0xffff)));
2860 }
2861 return 2;
2862 }
2863
2864 /* Remaining cases are all for DImode. */
2865
2866 mask = 0xffff;
2867 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2868 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2869 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2870 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2871
2872 if (zero_match != 2 && one_match != 2)
2873 {
2874 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2875 For a 64-bit bitmask try whether changing 16 bits to all ones or
2876 zeroes creates a valid bitmask. To check any repeated bitmask,
2877 try using 16 bits from the other 32-bit half of val. */
2878
2879 for (i = 0; i < 64; i += 16, mask <<= 16)
2880 {
2881 val2 = val & ~mask;
2882 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2883 break;
2884 val2 = val | mask;
2885 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2886 break;
2887 val2 = val2 & ~mask;
2888 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2889 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2890 break;
2891 }
2892 if (i != 64)
2893 {
2894 if (generate)
2895 {
2896 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2897 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2898 GEN_INT ((val >> i) & 0xffff)));
2899 }
2900 return 2;
2901 }
2902 }
2903
2904 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2905 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2906 otherwise skip zero bits. */
2907
2908 num_insns = 1;
2909 mask = 0xffff;
2910 val2 = one_match > zero_match ? ~val : val;
2911 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2912
2913 if (generate)
2914 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2915 ? (val | ~(mask << i))
2916 : (val & (mask << i)))));
2917 for (i += 16; i < 64; i += 16)
2918 {
2919 if ((val2 & (mask << i)) == 0)
2920 continue;
2921 if (generate)
2922 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2923 GEN_INT ((val >> i) & 0xffff)));
2924 num_insns ++;
2925 }
2926
2927 return num_insns;
2928 }
2929
2930 /* Return whether imm is a 128-bit immediate which is simple enough to
2931 expand inline. */
2932 bool
2933 aarch64_mov128_immediate (rtx imm)
2934 {
2935 if (GET_CODE (imm) == CONST_INT)
2936 return true;
2937
2938 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2939
2940 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2941 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2942
2943 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2944 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2945 }
2946
2947
2948 /* Return the number of temporary registers that aarch64_add_offset_1
2949 would need to add OFFSET to a register. */
2950
2951 static unsigned int
2952 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2953 {
2954 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
2955 }
2956
2957 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2958 a non-polynomial OFFSET. MODE is the mode of the addition.
2959 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2960 be set and CFA adjustments added to the generated instructions.
2961
2962 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2963 temporary if register allocation is already complete. This temporary
2964 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2965 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2966 the immediate again.
2967
2968 Since this function may be used to adjust the stack pointer, we must
2969 ensure that it cannot cause transient stack deallocation (for example
2970 by first incrementing SP and then decrementing when adjusting by a
2971 large immediate). */
2972
2973 static void
2974 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2975 rtx src, HOST_WIDE_INT offset, rtx temp1,
2976 bool frame_related_p, bool emit_move_imm)
2977 {
2978 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2979 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2980
2981 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
2982 rtx_insn *insn;
2983
2984 if (!moffset)
2985 {
2986 if (!rtx_equal_p (dest, src))
2987 {
2988 insn = emit_insn (gen_rtx_SET (dest, src));
2989 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2990 }
2991 return;
2992 }
2993
2994 /* Single instruction adjustment. */
2995 if (aarch64_uimm12_shift (moffset))
2996 {
2997 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2998 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2999 return;
3000 }
3001
3002 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3003 and either:
3004
3005 a) the offset cannot be loaded by a 16-bit move or
3006 b) there is no spare register into which we can move it. */
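/* For example, with no usable temporary an adjustment of 0x123456 is
   emitted roughly as "add dest, src, #0x456" followed by
   "add dest, dest, #0x123000".  */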
3007 if (moffset < 0x1000000
3008 && ((!temp1 && !can_create_pseudo_p ())
3009 || !aarch64_move_imm (moffset, mode)))
3010 {
3011 HOST_WIDE_INT low_off = moffset & 0xfff;
3012
3013 low_off = offset < 0 ? -low_off : low_off;
3014 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3015 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3016 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3017 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3018 return;
3019 }
3020
3021 /* Emit a move immediate if required and an addition/subtraction. */
3022 if (emit_move_imm)
3023 {
3024 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3025 temp1 = aarch64_force_temporary (mode, temp1,
3026 gen_int_mode (moffset, mode));
3027 }
3028 insn = emit_insn (offset < 0
3029 ? gen_sub3_insn (dest, src, temp1)
3030 : gen_add3_insn (dest, src, temp1));
3031 if (frame_related_p)
3032 {
3033 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3034 rtx adj = plus_constant (mode, src, offset);
3035 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3036 }
3037 }
3038
3039 /* Return the number of temporary registers that aarch64_add_offset
3040 would need to move OFFSET into a register or add OFFSET to a register;
3041 ADD_P is true if we want the latter rather than the former. */
3042
3043 static unsigned int
3044 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3045 {
3046 /* This follows the same structure as aarch64_add_offset. */
3047 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3048 return 0;
3049
3050 unsigned int count = 0;
3051 HOST_WIDE_INT factor = offset.coeffs[1];
3052 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3053 poly_int64 poly_offset (factor, factor);
3054 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3055 /* Need one register for the ADDVL/ADDPL result. */
3056 count += 1;
3057 else if (factor != 0)
3058 {
3059 factor = abs (factor);
3060 if (factor > 16 * (factor & -factor))
3061 /* Need one register for the CNT result and one for the multiplication
3062 factor. If necessary, the second temporary can be reused for the
3063 constant part of the offset. */
3064 return 2;
3065 /* Need one register for the CNT result (which might then
3066 be shifted). */
3067 count += 1;
3068 }
3069 return count + aarch64_add_offset_1_temporaries (constant);
3070 }
3071
3072 /* If X can be represented as a poly_int64, return the number
3073 of temporaries that are required to add it to a register.
3074 Return -1 otherwise. */
3075
3076 int
3077 aarch64_add_offset_temporaries (rtx x)
3078 {
3079 poly_int64 offset;
3080 if (!poly_int_rtx_p (x, &offset))
3081 return -1;
3082 return aarch64_offset_temporaries (true, offset);
3083 }
3084
3085 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3086 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3087 be set and CFA adjustments added to the generated instructions.
3088
3089 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3090 temporary if register allocation is already complete. This temporary
3091 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3092 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3093 false to avoid emitting the immediate again.
3094
3095 TEMP2, if nonnull, is a second temporary register that doesn't
3096 overlap either DEST or REG.
3097
3098 Since this function may be used to adjust the stack pointer, we must
3099 ensure that it cannot cause transient stack deallocation (for example
3100 by first incrementing SP and then decrementing when adjusting by a
3101 large immediate). */
3102
3103 static void
3104 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3105 poly_int64 offset, rtx temp1, rtx temp2,
3106 bool frame_related_p, bool emit_move_imm = true)
3107 {
3108 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3109 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3110 gcc_assert (temp1 == NULL_RTX
3111 || !frame_related_p
3112 || !reg_overlap_mentioned_p (temp1, dest));
3113 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3114
3115 /* Try using ADDVL or ADDPL to add the whole value. */
3116 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3117 {
3118 rtx offset_rtx = gen_int_mode (offset, mode);
3119 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3120 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3121 return;
3122 }
3123
3124 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3125 SVE vector register, over and above the minimum size of 128 bits.
3126 This is equivalent to half the value returned by CNTD with a
3127 vector shape of ALL. */
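/* For example, an offset of two SVE vectors plus 8 bytes is the
   poly_int64 (40, 32); FACTOR below is then 32 and CONSTANT is 8.  */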
3128 HOST_WIDE_INT factor = offset.coeffs[1];
3129 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3130
3131 /* Try using ADDVL or ADDPL to add the VG-based part. */
3132 poly_int64 poly_offset (factor, factor);
3133 if (src != const0_rtx
3134 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3135 {
3136 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3137 if (frame_related_p)
3138 {
3139 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3140 RTX_FRAME_RELATED_P (insn) = true;
3141 src = dest;
3142 }
3143 else
3144 {
3145 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3146 src = aarch64_force_temporary (mode, temp1, addr);
3147 temp1 = temp2;
3148 temp2 = NULL_RTX;
3149 }
3150 }
3151 /* Otherwise use a CNT-based sequence. */
3152 else if (factor != 0)
3153 {
3154 /* Use a subtraction if we have a negative factor. */
3155 rtx_code code = PLUS;
3156 if (factor < 0)
3157 {
3158 factor = -factor;
3159 code = MINUS;
3160 }
3161
3162 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3163 into the multiplication. */
3164 rtx val;
3165 int shift = 0;
3166 if (factor & 1)
3167 /* Use a right shift by 1. */
3168 shift = -1;
3169 else
3170 factor /= 2;
3171 HOST_WIDE_INT low_bit = factor & -factor;
3172 if (factor <= 16 * low_bit)
3173 {
3174 if (factor > 16 * 8)
3175 {
3176 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3177 the value with the minimum multiplier and shift it into
3178 position. */
3179 int extra_shift = exact_log2 (low_bit);
3180 shift += extra_shift;
3181 factor >>= extra_shift;
3182 }
3183 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3184 }
3185 else
3186 {
3187 /* Use CNTD, then multiply it by FACTOR. */
3188 val = gen_int_mode (poly_int64 (2, 2), mode);
3189 val = aarch64_force_temporary (mode, temp1, val);
3190
3191 /* Go back to using a negative multiplication factor if we have
3192 no register from which to subtract. */
3193 if (code == MINUS && src == const0_rtx)
3194 {
3195 factor = -factor;
3196 code = PLUS;
3197 }
3198 rtx coeff1 = gen_int_mode (factor, mode);
3199 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3200 val = gen_rtx_MULT (mode, val, coeff1);
3201 }
3202
3203 if (shift > 0)
3204 {
3205 /* Multiply by 1 << SHIFT. */
3206 val = aarch64_force_temporary (mode, temp1, val);
3207 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3208 }
3209 else if (shift == -1)
3210 {
3211 /* Divide by 2. */
3212 val = aarch64_force_temporary (mode, temp1, val);
3213 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3214 }
3215
3216 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3217 if (src != const0_rtx)
3218 {
3219 val = aarch64_force_temporary (mode, temp1, val);
3220 val = gen_rtx_fmt_ee (code, mode, src, val);
3221 }
3222 else if (code == MINUS)
3223 {
3224 val = aarch64_force_temporary (mode, temp1, val);
3225 val = gen_rtx_NEG (mode, val);
3226 }
3227
3228 if (constant == 0 || frame_related_p)
3229 {
3230 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3231 if (frame_related_p)
3232 {
3233 RTX_FRAME_RELATED_P (insn) = true;
3234 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3235 gen_rtx_SET (dest, plus_constant (Pmode, src,
3236 poly_offset)));
3237 }
3238 src = dest;
3239 if (constant == 0)
3240 return;
3241 }
3242 else
3243 {
3244 src = aarch64_force_temporary (mode, temp1, val);
3245 temp1 = temp2;
3246 temp2 = NULL_RTX;
3247 }
3248
3249 emit_move_imm = true;
3250 }
3251
3252 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3253 frame_related_p, emit_move_imm);
3254 }
3255
3256 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3257 than a poly_int64. */
3258
3259 void
3260 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3261 rtx offset_rtx, rtx temp1, rtx temp2)
3262 {
3263 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3264 temp1, temp2, false);
3265 }
3266
3267 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3268 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3269 if TEMP1 already contains abs (DELTA). */
3270
3271 static inline void
3272 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3273 {
3274 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3275 temp1, temp2, true, emit_move_imm);
3276 }
3277
3278 /* Subtract DELTA from the stack pointer, marking the instructions
3279 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3280 if nonnull. */
3281
3282 static inline void
3283 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3284 bool emit_move_imm = true)
3285 {
3286 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3287 temp1, temp2, frame_related_p, emit_move_imm);
3288 }
3289
3290 /* Set DEST to (vec_series BASE STEP). */
3291
3292 static void
3293 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3294 {
3295 machine_mode mode = GET_MODE (dest);
3296 scalar_mode inner = GET_MODE_INNER (mode);
3297
3298 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3299 if (!aarch64_sve_index_immediate_p (base))
3300 base = force_reg (inner, base);
3301 if (!aarch64_sve_index_immediate_p (step))
3302 step = force_reg (inner, step);
3303
3304 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3305 }
3306
3307 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3308 integer of mode INT_MODE. Return true on success. */
3309
3310 static bool
3311 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3312 rtx src)
3313 {
3314 /* If the constant is smaller than 128 bits, we can do the move
3315 using a vector of SRC_MODEs. */
3316 if (src_mode != TImode)
3317 {
3318 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3319 GET_MODE_SIZE (src_mode));
3320 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3321 emit_move_insn (gen_lowpart (dup_mode, dest),
3322 gen_const_vec_duplicate (dup_mode, src));
3323 return true;
3324 }
3325
3326 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3327 src = force_const_mem (src_mode, src);
3328 if (!src)
3329 return false;
3330
3331 /* Make sure that the address is legitimate. */
3332 if (!aarch64_sve_ld1r_operand_p (src))
3333 {
3334 rtx addr = force_reg (Pmode, XEXP (src, 0));
3335 src = replace_equiv_address (src, addr);
3336 }
3337
3338 machine_mode mode = GET_MODE (dest);
3339 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3340 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3341 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3342 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3343 emit_insn (gen_rtx_SET (dest, src));
3344 return true;
3345 }
3346
3347 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3348 isn't a simple duplicate or series. */
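/* For example, the constant { 0, 16, 1, 17, 2, 18, ... } has two
   patterns of three elements each; it is built as the two single-pattern
   vectors { 0, 1, 2, ... } and { 16, 17, 18, ... }, which are then
   interleaved with ZIP1.  */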
3349
3350 static void
3351 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3352 {
3353 machine_mode mode = GET_MODE (src);
3354 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3355 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3356 gcc_assert (npatterns > 1);
3357
3358 if (nelts_per_pattern == 1)
3359 {
3360 /* The constant is a repeating sequence of at least two elements,
3361 where the repeating elements occupy no more than 128 bits.
3362 Get an integer representation of the replicated value. */
3363 scalar_int_mode int_mode;
3364 if (BYTES_BIG_ENDIAN)
3365 /* For now, always use LD1RQ to load the value on big-endian
3366 targets, since the handling of smaller integers includes a
3367 subreg that is semantically an element reverse. */
3368 int_mode = TImode;
3369 else
3370 {
3371 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3372 gcc_assert (int_bits <= 128);
3373 int_mode = int_mode_for_size (int_bits, 0).require ();
3374 }
3375 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3376 if (int_value
3377 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3378 return;
3379 }
3380
3381 /* Expand each pattern individually. */
3382 rtx_vector_builder builder;
3383 auto_vec<rtx, 16> vectors (npatterns);
3384 for (unsigned int i = 0; i < npatterns; ++i)
3385 {
3386 builder.new_vector (mode, 1, nelts_per_pattern);
3387 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3388 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3389 vectors.quick_push (force_reg (mode, builder.build ()));
3390 }
3391
3392 /* Use permutes to interleave the separate vectors. */
3393 while (npatterns > 1)
3394 {
3395 npatterns /= 2;
3396 for (unsigned int i = 0; i < npatterns; ++i)
3397 {
3398 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3399 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3400 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3401 vectors[i] = tmp;
3402 }
3403 }
3404 gcc_assert (vectors[0] == dest);
3405 }
3406
3407 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3408 is a pattern that can be used to set DEST to a replicated scalar
3409 element. */
3410
3411 void
3412 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3413 rtx (*gen_vec_duplicate) (rtx, rtx))
3414 {
3415 machine_mode mode = GET_MODE (dest);
3416
3417 /* Check on what type of symbol it is. */
3418 scalar_int_mode int_mode;
3419 if ((GET_CODE (imm) == SYMBOL_REF
3420 || GET_CODE (imm) == LABEL_REF
3421 || GET_CODE (imm) == CONST
3422 || GET_CODE (imm) == CONST_POLY_INT)
3423 && is_a <scalar_int_mode> (mode, &int_mode))
3424 {
3425 rtx mem;
3426 poly_int64 offset;
3427 HOST_WIDE_INT const_offset;
3428 enum aarch64_symbol_type sty;
3429
3430 /* If we have (const (plus symbol offset)), separate out the offset
3431 before we start classifying the symbol. */
3432 rtx base = strip_offset (imm, &offset);
3433
3434 /* We must always add an offset involving VL separately, rather than
3435 folding it into the relocation. */
3436 if (!offset.is_constant (&const_offset))
3437 {
3438 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3439 emit_insn (gen_rtx_SET (dest, imm));
3440 else
3441 {
3442 /* Do arithmetic on 32-bit values if the result is smaller
3443 than that. */
3444 if (partial_subreg_p (int_mode, SImode))
3445 {
3446 /* It is invalid to do symbol calculations in modes
3447 narrower than SImode. */
3448 gcc_assert (base == const0_rtx);
3449 dest = gen_lowpart (SImode, dest);
3450 int_mode = SImode;
3451 }
3452 if (base != const0_rtx)
3453 {
3454 base = aarch64_force_temporary (int_mode, dest, base);
3455 aarch64_add_offset (int_mode, dest, base, offset,
3456 NULL_RTX, NULL_RTX, false);
3457 }
3458 else
3459 aarch64_add_offset (int_mode, dest, base, offset,
3460 dest, NULL_RTX, false);
3461 }
3462 return;
3463 }
3464
3465 sty = aarch64_classify_symbol (base, const_offset);
3466 switch (sty)
3467 {
3468 case SYMBOL_FORCE_TO_MEM:
3469 if (const_offset != 0
3470 && targetm.cannot_force_const_mem (int_mode, imm))
3471 {
3472 gcc_assert (can_create_pseudo_p ());
3473 base = aarch64_force_temporary (int_mode, dest, base);
3474 aarch64_add_offset (int_mode, dest, base, const_offset,
3475 NULL_RTX, NULL_RTX, false);
3476 return;
3477 }
3478
3479 mem = force_const_mem (ptr_mode, imm);
3480 gcc_assert (mem);
3481
3482 /* If we aren't generating PC relative literals, then
3483 we need to expand the literal pool access carefully.
3484 This is something that needs to be done in a number
3485 of places, so could well live as a separate function. */
3486 if (!aarch64_pcrelative_literal_loads)
3487 {
3488 gcc_assert (can_create_pseudo_p ());
3489 base = gen_reg_rtx (ptr_mode);
3490 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3491 if (ptr_mode != Pmode)
3492 base = convert_memory_address (Pmode, base);
3493 mem = gen_rtx_MEM (ptr_mode, base);
3494 }
3495
3496 if (int_mode != ptr_mode)
3497 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3498
3499 emit_insn (gen_rtx_SET (dest, mem));
3500
3501 return;
3502
3503 case SYMBOL_SMALL_TLSGD:
3504 case SYMBOL_SMALL_TLSDESC:
3505 case SYMBOL_SMALL_TLSIE:
3506 case SYMBOL_SMALL_GOT_28K:
3507 case SYMBOL_SMALL_GOT_4G:
3508 case SYMBOL_TINY_GOT:
3509 case SYMBOL_TINY_TLSIE:
3510 if (const_offset != 0)
3511 {
3512 gcc_assert (can_create_pseudo_p ());
3513 base = aarch64_force_temporary (int_mode, dest, base);
3514 aarch64_add_offset (int_mode, dest, base, const_offset,
3515 NULL_RTX, NULL_RTX, false);
3516 return;
3517 }
3518 /* FALLTHRU */
3519
3520 case SYMBOL_SMALL_ABSOLUTE:
3521 case SYMBOL_TINY_ABSOLUTE:
3522 case SYMBOL_TLSLE12:
3523 case SYMBOL_TLSLE24:
3524 case SYMBOL_TLSLE32:
3525 case SYMBOL_TLSLE48:
3526 aarch64_load_symref_appropriately (dest, imm, sty);
3527 return;
3528
3529 default:
3530 gcc_unreachable ();
3531 }
3532 }
3533
3534 if (!CONST_INT_P (imm))
3535 {
3536 rtx base, step, value;
3537 if (GET_CODE (imm) == HIGH
3538 || aarch64_simd_valid_immediate (imm, NULL))
3539 emit_insn (gen_rtx_SET (dest, imm));
3540 else if (const_vec_series_p (imm, &base, &step))
3541 aarch64_expand_vec_series (dest, base, step);
3542 else if (const_vec_duplicate_p (imm, &value))
3543 {
3544 /* If the constant is out of range of an SVE vector move,
3545 load it from memory if we can, otherwise move it into
3546 a register and use a DUP. */
3547 scalar_mode inner_mode = GET_MODE_INNER (mode);
3548 rtx op = force_const_mem (inner_mode, value);
3549 if (!op)
3550 op = force_reg (inner_mode, value);
3551 else if (!aarch64_sve_ld1r_operand_p (op))
3552 {
3553 rtx addr = force_reg (Pmode, XEXP (op, 0));
3554 op = replace_equiv_address (op, addr);
3555 }
3556 emit_insn (gen_vec_duplicate (dest, op));
3557 }
3558 else if (GET_CODE (imm) == CONST_VECTOR
3559 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3560 aarch64_expand_sve_const_vector (dest, imm);
3561 else
3562 {
3563 rtx mem = force_const_mem (mode, imm);
3564 gcc_assert (mem);
3565 emit_move_insn (dest, mem);
3566 }
3567
3568 return;
3569 }
3570
3571 aarch64_internal_mov_immediate (dest, imm, true,
3572 as_a <scalar_int_mode> (mode));
3573 }
3574
3575 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3576 that is known to contain PTRUE. */
3577
3578 void
3579 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3580 {
3581 expand_operand ops[3];
3582 machine_mode mode = GET_MODE (dest);
3583 create_output_operand (&ops[0], dest, mode);
3584 create_input_operand (&ops[1], pred, GET_MODE (pred));
3585 create_input_operand (&ops[2], src, mode);
3586 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3587 }
3588
3589 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3590 operand is in memory. In this case we need to use the predicated LD1
3591 and ST1 instead of LDR and STR, both for correctness on big-endian
3592 targets and because LD1 and ST1 support a wider range of addressing modes.
3593 PRED_MODE is the mode of the predicate.
3594
3595 See the comment at the head of aarch64-sve.md for details about the
3596 big-endian handling. */
3597
3598 void
3599 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3600 {
3601 machine_mode mode = GET_MODE (dest);
3602 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3603 if (!register_operand (src, mode)
3604 && !register_operand (dest, mode))
3605 {
3606 rtx tmp = gen_reg_rtx (mode);
3607 if (MEM_P (src))
3608 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3609 else
3610 emit_move_insn (tmp, src);
3611 src = tmp;
3612 }
3613 aarch64_emit_sve_pred_move (dest, ptrue, src);
3614 }
3615
3616 /* Called only on big-endian targets. See whether an SVE vector move
3617 from SRC to DEST is effectively a REV[BHW] instruction, because at
3618 least one operand is a subreg of an SVE vector that has wider or
3619 narrower elements. Return true and emit the instruction if so.
3620
3621 For example:
3622
3623 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3624
3625 represents a VIEW_CONVERT between the following vectors, viewed
3626 in memory order:
3627
3628 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3629 R1: { [0], [1], [2], [3], ... }
3630
3631 The high part of lane X in R2 should therefore correspond to lane X*2
3632 of R1, but the register representations are:
3633
3634 msb lsb
3635 R2: ...... [1].high [1].low [0].high [0].low
3636 R1: ...... [3] [2] [1] [0]
3637
3638 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3639 We therefore need a reverse operation to swap the high and low values
3640 around.
3641
3642 This is purely an optimization. Without it we would spill the
3643 subreg operand to the stack in one mode and reload it in the
3644 other mode, which has the same effect as the REV. */
3645
3646 bool
3647 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3648 {
3649 gcc_assert (BYTES_BIG_ENDIAN);
3650 if (GET_CODE (dest) == SUBREG)
3651 dest = SUBREG_REG (dest);
3652 if (GET_CODE (src) == SUBREG)
3653 src = SUBREG_REG (src);
3654
3655 /* The optimization handles two single SVE REGs with different element
3656 sizes. */
3657 if (!REG_P (dest)
3658 || !REG_P (src)
3659 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3660 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3661 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3662 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3663 return false;
3664
3665 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3666 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3667 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3668 UNSPEC_REV_SUBREG);
3669 emit_insn (gen_rtx_SET (dest, unspec));
3670 return true;
3671 }
3672
3673 /* Return a copy of X with mode MODE, without changing its other
3674 attributes. Unlike gen_lowpart, this doesn't care whether the
3675 mode change is valid. */
3676
3677 static rtx
3678 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3679 {
3680 if (GET_MODE (x) == mode)
3681 return x;
3682
3683 x = shallow_copy_rtx (x);
3684 set_mode_and_regno (x, mode, REGNO (x));
3685 return x;
3686 }
3687
3688 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3689 operands. */
3690
3691 void
3692 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3693 {
3694 /* Decide which REV operation we need. The mode with narrower elements
3695 determines the mode of the operands and the mode with the wider
3696 elements determines the reverse width. */
3697 machine_mode mode_with_wider_elts = GET_MODE (dest);
3698 machine_mode mode_with_narrower_elts = GET_MODE (src);
3699 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3700 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3701 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3702
3703 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3704 unsigned int unspec;
3705 if (wider_bytes == 8)
3706 unspec = UNSPEC_REV64;
3707 else if (wider_bytes == 4)
3708 unspec = UNSPEC_REV32;
3709 else if (wider_bytes == 2)
3710 unspec = UNSPEC_REV16;
3711 else
3712 gcc_unreachable ();
3713 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3714
3715 /* Emit:
3716
3717 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3718 UNSPEC_MERGE_PTRUE))
3719
3720 with the appropriate modes. */
3721 ptrue = gen_lowpart (pred_mode, ptrue);
3722 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3723 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3724 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3725 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3726 UNSPEC_MERGE_PTRUE);
3727 emit_insn (gen_rtx_SET (dest, src));
3728 }
3729
3730 static bool
3731 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3732 tree exp ATTRIBUTE_UNUSED)
3733 {
3734 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3735 return false;
3736
3737 return true;
3738 }
3739
3740 /* Implement TARGET_PASS_BY_REFERENCE. */
3741
3742 static bool
3743 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3744 machine_mode mode,
3745 const_tree type,
3746 bool named ATTRIBUTE_UNUSED)
3747 {
3748 HOST_WIDE_INT size;
3749 machine_mode dummymode;
3750 int nregs;
3751
3752 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3753 if (mode == BLKmode && type)
3754 size = int_size_in_bytes (type);
3755 else
3756 /* No frontends can create types with variable-sized modes, so we
3757 shouldn't be asked to pass or return them. */
3758 size = GET_MODE_SIZE (mode).to_constant ();
3759
3760 /* Aggregates are passed by reference based on their size. */
3761 if (type && AGGREGATE_TYPE_P (type))
3762 {
3763 size = int_size_in_bytes (type);
3764 }
3765
3766 /* Variable-sized arguments are always passed by reference. */
3767 if (size < 0)
3768 return true;
3769
3770 /* Can this be a candidate to be passed in fp/simd register(s)? */
3771 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3772 &dummymode, &nregs,
3773 NULL))
3774 return false;
3775
3776 /* Arguments which are variable sized or larger than 2 registers are
3777 passed by reference unless they are a homogeneous floating-point
3778 aggregate. */
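  /* Illustrative examples (assuming the standard AAPCS64 layout): a
     'struct { double a, b, c, d; }' is a homogeneous floating-point
     aggregate, so the candidate check above returns false and it is passed
     in FP/SIMD registers; a 24-byte 'struct { long long a, b, c; }' is
     larger than 2 * UNITS_PER_WORD == 16 bytes and is passed by reference.  */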
3779 return size > 2 * UNITS_PER_WORD;
3780 }
3781
3782 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3783 static bool
3784 aarch64_return_in_msb (const_tree valtype)
3785 {
3786 machine_mode dummy_mode;
3787 int dummy_int;
3788
3789 /* Never happens in little-endian mode. */
3790 if (!BYTES_BIG_ENDIAN)
3791 return false;
3792
3793 /* Only composite types smaller than or equal to 16 bytes can
3794 be potentially returned in registers. */
3795 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3796 || int_size_in_bytes (valtype) <= 0
3797 || int_size_in_bytes (valtype) > 16)
3798 return false;
3799
3800 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3801 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3802 is always passed/returned in the least significant bits of fp/simd
3803 register(s). */
3804 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3805 &dummy_mode, &dummy_int, NULL))
3806 return false;
3807
3808 return true;
3809 }
3810
3811 /* Implement TARGET_FUNCTION_VALUE.
3812 Define how to find the value returned by a function. */
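/* Illustrative examples (assuming the standard AAPCS64 rules): a scalar
   integer is returned in the first GP register (R0_REGNUM); an HFA such as
   'struct { double a, b; }' produces the PARALLEL built below, containing
   two DFmode registers (V0_REGNUM and V0_REGNUM + 1, i.e. d0 and d1) at
   byte offsets 0 and 8.  */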
3813
3814 static rtx
3815 aarch64_function_value (const_tree type, const_tree func,
3816 bool outgoing ATTRIBUTE_UNUSED)
3817 {
3818 machine_mode mode;
3819 int unsignedp;
3820 int count;
3821 machine_mode ag_mode;
3822
3823 mode = TYPE_MODE (type);
3824 if (INTEGRAL_TYPE_P (type))
3825 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3826
3827 if (aarch64_return_in_msb (type))
3828 {
3829 HOST_WIDE_INT size = int_size_in_bytes (type);
3830
3831 if (size % UNITS_PER_WORD != 0)
3832 {
3833 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3834 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3835 }
3836 }
3837
3838 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3839 &ag_mode, &count, NULL))
3840 {
3841 if (!aarch64_composite_type_p (type, mode))
3842 {
3843 gcc_assert (count == 1 && mode == ag_mode);
3844 return gen_rtx_REG (mode, V0_REGNUM);
3845 }
3846 else
3847 {
3848 int i;
3849 rtx par;
3850
3851 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3852 for (i = 0; i < count; i++)
3853 {
3854 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3855 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3856 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3857 XVECEXP (par, 0, i) = tmp;
3858 }
3859 return par;
3860 }
3861 }
3862 else
3863 return gen_rtx_REG (mode, R0_REGNUM);
3864 }
3865
3866 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3867 Return true if REGNO is the number of a hard register in which the values
3868 of called function may come back. */
3869
3870 static bool
3871 aarch64_function_value_regno_p (const unsigned int regno)
3872 {
3873 /* Maximum of 16 bytes can be returned in the general registers. Examples
3874 of 16-byte return values are: 128-bit integers and 16-byte small
3875 structures (excluding homogeneous floating-point aggregates). */
3876 if (regno == R0_REGNUM || regno == R1_REGNUM)
3877 return true;
3878
3879 /* Up to four fp/simd registers can return a function value, e.g. a
3880 homogeneous floating-point aggregate having four members. */
3881 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3882 return TARGET_FLOAT;
3883
3884 return false;
3885 }
3886
3887 /* Implement TARGET_RETURN_IN_MEMORY.
3888
3889 If the type T of the result of a function is such that
3890 void func (T arg)
3891 would require that arg be passed as a value in a register (or set of
3892 registers) according to the parameter passing rules, then the result
3893 is returned in the same registers as would be used for such an
3894 argument. */
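/* Illustrative examples (assuming the standard AAPCS64 rules): a 16-byte
   'struct { long long a, b; }' fits in two GP registers and is returned in
   x0/x1, so the hook returns false; a 24-byte 'struct { long long a, b, c; }'
   exceeds 2 * UNITS_PER_WORD and is returned in memory, with the caller
   supplying the result address (in x8 under AAPCS64).  */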
3895
3896 static bool
3897 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3898 {
3899 HOST_WIDE_INT size;
3900 machine_mode ag_mode;
3901 int count;
3902
3903 if (!AGGREGATE_TYPE_P (type)
3904 && TREE_CODE (type) != COMPLEX_TYPE
3905 && TREE_CODE (type) != VECTOR_TYPE)
3906 /* Simple scalar types are always returned in registers. */
3907 return false;
3908
3909 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3910 type,
3911 &ag_mode,
3912 &count,
3913 NULL))
3914 return false;
3915
3916 /* Types larger than 2 registers are returned in memory. */
3917 size = int_size_in_bytes (type);
3918 return (size < 0 || size > 2 * UNITS_PER_WORD);
3919 }
3920
3921 static bool
3922 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3923 const_tree type, int *nregs)
3924 {
3925 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3926 return aarch64_vfp_is_call_or_return_candidate (mode,
3927 type,
3928 &pcum->aapcs_vfp_rmode,
3929 nregs,
3930 NULL);
3931 }
3932
3933 /* Given MODE and TYPE of a function argument, return the alignment in
3934 bits. The idea is to suppress any stronger alignment requested by
3935 the user and opt for the natural alignment (specified in AAPCS64 \S
3936 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3937 calculated in versions of GCC prior to GCC-9. This is a helper
3938 function for local use only. */
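/* Illustrative examples (not exhaustive): an '__int128' argument, or a
   'struct { __int128 x; }', yields 128 bits here, which later drives rule
   C.8 (even GP register numbering) and 16-byte stack slot alignment, while a
   plain 'int' yields 32 bits.  The bit-field handling below is the part that
   changed in GCC 9.1: the declared type of a bit-field can now raise the
   alignment, and ABI_BREAK flags arguments whose alignment differs from what
   older releases computed so that -Wpsabi can warn about them.  */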
3939
3940 static unsigned int
3941 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3942 bool *abi_break)
3943 {
3944 *abi_break = false;
3945 if (!type)
3946 return GET_MODE_ALIGNMENT (mode);
3947
3948 if (integer_zerop (TYPE_SIZE (type)))
3949 return 0;
3950
3951 gcc_assert (TYPE_MODE (type) == mode);
3952
3953 if (!AGGREGATE_TYPE_P (type))
3954 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3955
3956 if (TREE_CODE (type) == ARRAY_TYPE)
3957 return TYPE_ALIGN (TREE_TYPE (type));
3958
3959 unsigned int alignment = 0;
3960 unsigned int bitfield_alignment = 0;
3961 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3962 if (TREE_CODE (field) == FIELD_DECL)
3963 {
3964 alignment = std::max (alignment, DECL_ALIGN (field));
3965 if (DECL_BIT_FIELD_TYPE (field))
3966 bitfield_alignment
3967 = std::max (bitfield_alignment,
3968 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3969 }
3970
3971 if (bitfield_alignment > alignment)
3972 {
3973 *abi_break = true;
3974 return bitfield_alignment;
3975 }
3976
3977 return alignment;
3978 }
3979
3980 /* Layout a function argument according to the AAPCS64 rules. The rule
3981 numbers refer to the rule numbers in the AAPCS64. */
3982
3983 static void
3984 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3985 const_tree type,
3986 bool named ATTRIBUTE_UNUSED)
3987 {
3988 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3989 int ncrn, nvrn, nregs;
3990 bool allocate_ncrn, allocate_nvrn;
3991 HOST_WIDE_INT size;
3992 bool abi_break;
3993
3994 /* We need to do this once per argument. */
3995 if (pcum->aapcs_arg_processed)
3996 return;
3997
3998 pcum->aapcs_arg_processed = true;
3999
4000 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4001 if (type)
4002 size = int_size_in_bytes (type);
4003 else
4004 /* No frontends can create types with variable-sized modes, so we
4005 shouldn't be asked to pass or return them. */
4006 size = GET_MODE_SIZE (mode).to_constant ();
4007 size = ROUND_UP (size, UNITS_PER_WORD);
4008
4009 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4010 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4011 mode,
4012 type,
4013 &nregs);
4014
4015 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4016 The following code thus handles passing by SIMD/FP registers first. */
4017
4018 nvrn = pcum->aapcs_nvrn;
4019
4020 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4021 and homogeneous short-vector aggregates (HVA). */
4022 if (allocate_nvrn)
4023 {
4024 if (!TARGET_FLOAT)
4025 aarch64_err_no_fpadvsimd (mode);
4026
4027 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4028 {
4029 pcum->aapcs_nextnvrn = nvrn + nregs;
4030 if (!aarch64_composite_type_p (type, mode))
4031 {
4032 gcc_assert (nregs == 1);
4033 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4034 }
4035 else
4036 {
4037 rtx par;
4038 int i;
4039 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4040 for (i = 0; i < nregs; i++)
4041 {
4042 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4043 V0_REGNUM + nvrn + i);
4044 rtx offset = gen_int_mode
4045 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4046 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4047 XVECEXP (par, 0, i) = tmp;
4048 }
4049 pcum->aapcs_reg = par;
4050 }
4051 return;
4052 }
4053 else
4054 {
4055 /* C.3 NSRN is set to 8. */
4056 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4057 goto on_stack;
4058 }
4059 }
4060
4061 ncrn = pcum->aapcs_ncrn;
4062 nregs = size / UNITS_PER_WORD;
4063
4064 /* C6 - C9, though the sign and zero extension semantics are
4065 handled elsewhere. This is the case where the argument fits
4066 entirely in general registers. */
4067 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4068 {
4069 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4070
4071 /* C.8 if the argument has an alignment of 16 then the NGRN is
4072 rounded up to the next even number. */
4073 if (nregs == 2
4074 && ncrn % 2
4075 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4076 comparison is there because for > 16 * BITS_PER_UNIT
4077 alignment nregs should be > 2 and therefore it should be
4078 passed by reference rather than value. */
4079 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4080 == 16 * BITS_PER_UNIT))
4081 {
4082 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4083 inform (input_location, "parameter passing for argument of type "
4084 "%qT changed in GCC 9.1", type);
4085 ++ncrn;
4086 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4087 }
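      /* Example of the C.8 adjustment above (illustrative): for
	 'void f (int a, __int128 b)', A is passed in w0, leaving NCRN == 1;
	 B needs two registers and 16-byte alignment, so NCRN is bumped to 2
	 and B is passed in x2/x3, with x1 left unused.  */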
4088
4089 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4090 A reg is still generated for it, but the caller should be smart
4091 enough not to use it. */
4092 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4093 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4094 else
4095 {
4096 rtx par;
4097 int i;
4098
4099 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4100 for (i = 0; i < nregs; i++)
4101 {
4102 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4103 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4104 GEN_INT (i * UNITS_PER_WORD));
4105 XVECEXP (par, 0, i) = tmp;
4106 }
4107 pcum->aapcs_reg = par;
4108 }
4109
4110 pcum->aapcs_nextncrn = ncrn + nregs;
4111 return;
4112 }
4113
4114 /* C.11 */
4115 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4116
4117 /* The argument is passed on the stack; record the needed number of words for
4118 this argument and align the total size if necessary. */
4119 on_stack:
4120 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4121
4122 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4123 == 16 * BITS_PER_UNIT)
4124 {
4125 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4126 if (pcum->aapcs_stack_size != new_size)
4127 {
4128 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4129 inform (input_location, "parameter passing for argument of type "
4130 "%qT changed in GCC 9.1", type);
4131 pcum->aapcs_stack_size = new_size;
4132 }
4133 }
4134 return;
4135 }
4136
4137 /* Implement TARGET_FUNCTION_ARG. */
4138
4139 static rtx
4140 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4141 const_tree type, bool named)
4142 {
4143 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4144 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4145
4146 if (mode == VOIDmode)
4147 return NULL_RTX;
4148
4149 aarch64_layout_arg (pcum_v, mode, type, named);
4150 return pcum->aapcs_reg;
4151 }
4152
4153 void
4154 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4155 const_tree fntype ATTRIBUTE_UNUSED,
4156 rtx libname ATTRIBUTE_UNUSED,
4157 const_tree fndecl ATTRIBUTE_UNUSED,
4158 unsigned n_named ATTRIBUTE_UNUSED)
4159 {
4160 pcum->aapcs_ncrn = 0;
4161 pcum->aapcs_nvrn = 0;
4162 pcum->aapcs_nextncrn = 0;
4163 pcum->aapcs_nextnvrn = 0;
4164 pcum->pcs_variant = ARM_PCS_AAPCS64;
4165 pcum->aapcs_reg = NULL_RTX;
4166 pcum->aapcs_arg_processed = false;
4167 pcum->aapcs_stack_words = 0;
4168 pcum->aapcs_stack_size = 0;
4169
4170 if (!TARGET_FLOAT
4171 && fndecl && TREE_PUBLIC (fndecl)
4172 && fntype && fntype != error_mark_node)
4173 {
4174 const_tree type = TREE_TYPE (fntype);
4175 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4176 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4177 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4178 &mode, &nregs, NULL))
4179 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4180 }
4181 return;
4182 }
4183
4184 static void
4185 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4186 machine_mode mode,
4187 const_tree type,
4188 bool named)
4189 {
4190 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4191 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4192 {
4193 aarch64_layout_arg (pcum_v, mode, type, named);
4194 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4195 != (pcum->aapcs_stack_words != 0));
4196 pcum->aapcs_arg_processed = false;
4197 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4198 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4199 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4200 pcum->aapcs_stack_words = 0;
4201 pcum->aapcs_reg = NULL_RTX;
4202 }
4203 }
4204
4205 bool
4206 aarch64_function_arg_regno_p (unsigned regno)
4207 {
4208 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4209 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4210 }
4211
4212 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4213 PARM_BOUNDARY bits of alignment, but will be given anything up
4214 to STACK_BOUNDARY bits if the type requires it. This makes sure
4215 that both before and after the layout of each argument, the Next
4216 Stacked Argument Address (NSAA) will have a minimum alignment of
4217 8 bytes. */
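/* Illustrative values (assuming PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128):
   a 'char' argument is bumped up to 64 bits, an '__int128' gets 128 bits, and
   user over-alignment requested on the type is not honoured, since
   aarch64_function_arg_alignment above uses only the natural field alignment;
   nothing can exceed STACK_BOUNDARY.  */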
4218
4219 static unsigned int
4220 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4221 {
4222 bool abi_break;
4223 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4224 &abi_break);
4225 if (abi_break && warn_psabi)
4226 inform (input_location, "parameter passing for argument of type "
4227 "%qT changed in GCC 9.1", type);
4228
4229 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4230 }
4231
4232 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4233
4234 static fixed_size_mode
4235 aarch64_get_reg_raw_mode (int regno)
4236 {
4237 if (TARGET_SVE && FP_REGNUM_P (regno))
4238 /* Don't use the SVE part of the register for __builtin_apply and
4239 __builtin_return. The SVE registers aren't used by the normal PCS,
4240 so using them there would be a waste of time. The PCS extensions
4241 for SVE types are fundamentally incompatible with the
4242 __builtin_return/__builtin_apply interface. */
4243 return as_a <fixed_size_mode> (V16QImode);
4244 return default_get_reg_raw_mode (regno);
4245 }
4246
4247 /* Implement TARGET_FUNCTION_ARG_PADDING.
4248
4249 Small aggregate types are placed in the lowest memory address.
4250
4251 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
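/* Illustrative examples: on big-endian targets a 'short' stack argument is
   padded downward, so its bytes occupy the highest addresses of the 8-byte
   slot, whereas a small 'struct { char c; }' is padded upward; on
   little-endian targets everything is PAD_UPWARD.  */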
4252
4253 static pad_direction
4254 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4255 {
4256 /* On little-endian targets, the least significant byte of every stack
4257 argument is passed at the lowest byte address of the stack slot. */
4258 if (!BYTES_BIG_ENDIAN)
4259 return PAD_UPWARD;
4260
4261 /* Otherwise, integral, floating-point and pointer types are padded downward:
4262 the least significant byte of a stack argument is passed at the highest
4263 byte address of the stack slot. */
4264 if (type
4265 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4266 || POINTER_TYPE_P (type))
4267 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4268 return PAD_DOWNWARD;
4269
4270 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4271 return PAD_UPWARD;
4272 }
4273
4274 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4275
4276 It specifies padding for the last (possibly the only)
4277 element of a block move between registers and memory. Assuming
4278 the block is in memory, padding upward means that the last
4279 element is padded after its most significant byte, while with
4280 downward padding the last element is padded on its least
4281 significant byte side.
4282
4283 Small aggregates and small complex types are always padded
4284 upwards.
4285
4286 We don't need to worry about homogeneous floating-point or
4287 short-vector aggregates; their move is not affected by the
4288 padding direction determined here. Regardless of endianness,
4289 each element of such an aggregate is put in the least
4290 significant bits of a fp/simd register.
4291
4292 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4293 register has useful data, and return the opposite if the most
4294 significant byte does. */
4295
4296 bool
4297 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4298 bool first ATTRIBUTE_UNUSED)
4299 {
4300
4301 /* Small composite types are always padded upward. */
4302 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4303 {
4304 HOST_WIDE_INT size;
4305 if (type)
4306 size = int_size_in_bytes (type);
4307 else
4308 /* No frontends can create types with variable-sized modes, so we
4309 shouldn't be asked to pass or return them. */
4310 size = GET_MODE_SIZE (mode).to_constant ();
4311 if (size < 2 * UNITS_PER_WORD)
4312 return true;
4313 }
4314
4315 /* Otherwise, use the default padding. */
4316 return !BYTES_BIG_ENDIAN;
4317 }
4318
4319 static scalar_int_mode
4320 aarch64_libgcc_cmp_return_mode (void)
4321 {
4322 return SImode;
4323 }
4324
4325 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4326
4327 /* We use the 12-bit shifted immediate arithmetic instructions so values
4328 must be multiple of (1 << 12), i.e. 4096. */
4329 #define ARITH_FACTOR 4096
4330
4331 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4332 #error Cannot use simple address calculation for stack probing
4333 #endif
4334
4335 /* The pair of scratch registers used for stack probing. */
4336 #define PROBE_STACK_FIRST_REG R9_REGNUM
4337 #define PROBE_STACK_SECOND_REG R10_REGNUM
4338
4339 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4340 inclusive. These are offsets from the current stack pointer. */
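/* Rough example of the simplest case (illustrative, assuming FIRST == 0 and
   SIZE == PROBE_INTERVAL == 4096): the code below emits the equivalent of

	sub	x9, sp, #4096
	str	xzr, [x9]

   i.e. one probe at the far end of the region.  */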
4341
4342 static void
4343 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4344 {
4345 HOST_WIDE_INT size;
4346 if (!poly_size.is_constant (&size))
4347 {
4348 sorry ("stack probes for SVE frames");
4349 return;
4350 }
4351
4352 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4353
4354 /* See the same assertion on PROBE_INTERVAL above. */
4355 gcc_assert ((first % ARITH_FACTOR) == 0);
4356
4357 /* See if we have a constant small number of probes to generate. If so,
4358 that's the easy case. */
4359 if (size <= PROBE_INTERVAL)
4360 {
4361 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4362
4363 emit_set_insn (reg1,
4364 plus_constant (Pmode,
4365 stack_pointer_rtx, -(first + base)));
4366 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4367 }
4368
4369 /* The run-time loop is made up of 8 insns in the generic case while the
4370 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4371 else if (size <= 4 * PROBE_INTERVAL)
4372 {
4373 HOST_WIDE_INT i, rem;
4374
4375 emit_set_insn (reg1,
4376 plus_constant (Pmode,
4377 stack_pointer_rtx,
4378 -(first + PROBE_INTERVAL)));
4379 emit_stack_probe (reg1);
4380
4381 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4382 it exceeds SIZE. If only two probes are needed, this will not
4383 generate any code. Then probe at FIRST + SIZE. */
4384 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4385 {
4386 emit_set_insn (reg1,
4387 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4388 emit_stack_probe (reg1);
4389 }
4390
4391 rem = size - (i - PROBE_INTERVAL);
4392 if (rem > 256)
4393 {
4394 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4395
4396 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4397 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4398 }
4399 else
4400 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4401 }
4402
4403 /* Otherwise, do the same as above, but in a loop. Note that we must be
4404 extra careful with variables wrapping around because we might be at
4405 the very top (or the very bottom) of the address space and we have
4406 to be able to handle this case properly; in particular, we use an
4407 equality test for the loop condition. */
4408 else
4409 {
4410 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4411
4412 /* Step 1: round SIZE to the previous multiple of the interval. */
4413
4414 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4415
4416
4417 /* Step 2: compute initial and final value of the loop counter. */
4418
4419 /* TEST_ADDR = SP + FIRST. */
4420 emit_set_insn (reg1,
4421 plus_constant (Pmode, stack_pointer_rtx, -first));
4422
4423 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4424 HOST_WIDE_INT adjustment = - (first + rounded_size);
4425 if (! aarch64_uimm12_shift (adjustment))
4426 {
4427 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4428 true, Pmode);
4429 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4430 }
4431 else
4432 emit_set_insn (reg2,
4433 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4434
4435 /* Step 3: the loop
4436
4437 do
4438 {
4439 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4440 probe at TEST_ADDR
4441 }
4442 while (TEST_ADDR != LAST_ADDR)
4443
4444 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4445 until it is equal to ROUNDED_SIZE. */
4446
4447 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4448
4449
4450 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4451 that SIZE is equal to ROUNDED_SIZE. */
4452
4453 if (size != rounded_size)
4454 {
4455 HOST_WIDE_INT rem = size - rounded_size;
4456
4457 if (rem > 256)
4458 {
4459 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4460
4461 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4462 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4463 }
4464 else
4465 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4466 }
4467 }
4468
4469 /* Make sure nothing is scheduled before we are done. */
4470 emit_insn (gen_blockage ());
4471 }
4472
4473 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4474 absolute addresses. */
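/* For the non-stack-clash case, with REG1 in x9 and REG2 in x10 (illustrative
   register choices), the loop printed below is roughly:

	.LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0
*/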
4475
4476 const char *
4477 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4478 {
4479 static int labelno = 0;
4480 char loop_lab[32];
4481 rtx xops[2];
4482
4483 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4484
4485 /* Loop. */
4486 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4487
4488 HOST_WIDE_INT stack_clash_probe_interval
4489 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4490
4491 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4492 xops[0] = reg1;
4493 HOST_WIDE_INT interval;
4494 if (flag_stack_clash_protection)
4495 interval = stack_clash_probe_interval;
4496 else
4497 interval = PROBE_INTERVAL;
4498
4499 gcc_assert (aarch64_uimm12_shift (interval));
4500 xops[1] = GEN_INT (interval);
4501
4502 output_asm_insn ("sub\t%0, %0, %1", xops);
4503
4504 /* If doing stack clash protection then we probe up by the ABI specified
4505 amount. We do this because we're dropping full pages at a time in the
4506 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4507 if (flag_stack_clash_protection)
4508 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4509 else
4510 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4511
4512 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4513 by this amount for each iteration. */
4514 output_asm_insn ("str\txzr, [%0, %1]", xops);
4515
4516 /* Test if TEST_ADDR == LAST_ADDR. */
4517 xops[1] = reg2;
4518 output_asm_insn ("cmp\t%0, %1", xops);
4519
4520 /* Branch. */
4521 fputs ("\tb.ne\t", asm_out_file);
4522 assemble_name_raw (asm_out_file, loop_lab);
4523 fputc ('\n', asm_out_file);
4524
4525 return "";
4526 }
4527
4528 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4529 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4530 of GUARD_SIZE. When a probe is emitted it is done at most
4531 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4532 at most MIN_PROBE_THRESHOLD. By the end of this function
4533 BASE = BASE - ADJUSTMENT. */
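/* Sketch of the emitted sequence (illustrative register choices; the real
   operands come from the calling pattern):

	.SVLPSPL0:
	cmp	x11, #guard		// adjustment vs. residual guard
	b.lt	.SVLPEND0
	sub	x12, x12, #guard	// base -= guard
	str	xzr, [x12, 0]		// probe at the new base
	sub	x11, x11, #guard	// adjustment -= guard
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x12, x12, x11		// base -= remaining adjustment
*/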
4534
4535 const char *
4536 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4537 rtx min_probe_threshold, rtx guard_size)
4538 {
4539 /* This function is not allowed to use any instruction generation function
4540 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4541 so instead emit the code you want using output_asm_insn. */
4542 gcc_assert (flag_stack_clash_protection);
4543 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4544 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4545
4546 /* The minimum required allocation before the residual requires probing. */
4547 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4548
4549 /* Clamp the value down to the nearest value that can be used with a cmp. */
4550 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4551 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4552
4553 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4554 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4555
4556 static int labelno = 0;
4557 char loop_start_lab[32];
4558 char loop_end_lab[32];
4559 rtx xops[2];
4560
4561 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4562 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4563
4564 /* Emit loop start label. */
4565 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4566
4567 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4568 xops[0] = adjustment;
4569 xops[1] = probe_offset_value_rtx;
4570 output_asm_insn ("cmp\t%0, %1", xops);
4571
4572 /* Branch to end if not enough adjustment to probe. */
4573 fputs ("\tb.lt\t", asm_out_file);
4574 assemble_name_raw (asm_out_file, loop_end_lab);
4575 fputc ('\n', asm_out_file);
4576
4577 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4578 xops[0] = base;
4579 xops[1] = probe_offset_value_rtx;
4580 output_asm_insn ("sub\t%0, %0, %1", xops);
4581
4582 /* Probe at BASE. */
4583 xops[1] = const0_rtx;
4584 output_asm_insn ("str\txzr, [%0, %1]", xops);
4585
4586 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4587 xops[0] = adjustment;
4588 xops[1] = probe_offset_value_rtx;
4589 output_asm_insn ("sub\t%0, %0, %1", xops);
4590
4591 /* Branch to start if still more bytes to allocate. */
4592 fputs ("\tb\t", asm_out_file);
4593 assemble_name_raw (asm_out_file, loop_start_lab);
4594 fputc ('\n', asm_out_file);
4595
4596 /* Loop end label: the remaining adjustment needs no probe. */
4597 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4598
4599 /* BASE = BASE - ADJUSTMENT. */
4600 xops[0] = base;
4601 xops[1] = adjustment;
4602 output_asm_insn ("sub\t%0, %0, %1", xops);
4603 return "";
4604 }
4605
4606 /* Determine whether a frame chain needs to be generated. */
4607 static bool
4608 aarch64_needs_frame_chain (void)
4609 {
4610 /* Force a frame chain for EH returns so the return address is at FP+8. */
4611 if (frame_pointer_needed || crtl->calls_eh_return)
4612 return true;
4613
4614 /* A leaf function cannot have calls or write LR. */
4615 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4616
4617 /* Don't use a frame chain in leaf functions if leaf frame pointers
4618 are disabled. */
4619 if (flag_omit_leaf_frame_pointer && is_leaf)
4620 return false;
4621
4622 return aarch64_use_frame_pointer;
4623 }
4624
4625 /* Mark the registers that need to be saved by the callee and calculate
4626 the size of the callee-saved registers area and frame record (both FP
4627 and LR may be omitted). */
4628 static void
4629 aarch64_layout_frame (void)
4630 {
4631 HOST_WIDE_INT offset = 0;
4632 int regno, last_fp_reg = INVALID_REGNUM;
4633 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4634
4635 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4636
4637 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4638 the mid-end is doing. */
4639 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4640
4641 #define SLOT_NOT_REQUIRED (-2)
4642 #define SLOT_REQUIRED (-1)
4643
4644 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4645 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4646
4647 /* If this is a non-leaf simd function with calls we assume that
4648 at least one of those calls is to a non-simd function and thus
4649 we must save V8 to V23 in the prologue. */
4650
4651 if (simd_function && !crtl->is_leaf)
4652 {
4653 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4654 if (FP_SIMD_SAVED_REGNUM_P (regno))
4655 df_set_regs_ever_live (regno, true);
4656 }
4657
4658 /* First mark all the registers that really need to be saved... */
4659 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4660 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4661
4662 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4663 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4664
4665 /* ... that includes the eh data registers (if needed)... */
4666 if (crtl->calls_eh_return)
4667 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4668 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4669 = SLOT_REQUIRED;
4670
4671 /* ... and any callee saved register that dataflow says is live. */
4672 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4673 if (df_regs_ever_live_p (regno)
4674 && (regno == R30_REGNUM
4675 || !call_used_regs[regno]))
4676 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4677
4678 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4679 if (df_regs_ever_live_p (regno)
4680 && (!call_used_regs[regno]
4681 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4682 {
4683 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4684 last_fp_reg = regno;
4685 }
4686
4687 if (cfun->machine->frame.emit_frame_chain)
4688 {
4689 /* FP and LR are placed in the linkage record. */
4690 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4691 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4692 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4693 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4694 offset = 2 * UNITS_PER_WORD;
4695 }
4696
4697 /* With stack-clash, LR must be saved in non-leaf functions. */
4698 gcc_assert (crtl->is_leaf
4699 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4700 != SLOT_NOT_REQUIRED));
4701
4702 /* Now assign stack slots for them. */
4703 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4704 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4705 {
4706 cfun->machine->frame.reg_offset[regno] = offset;
4707 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4708 cfun->machine->frame.wb_candidate1 = regno;
4709 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4710 cfun->machine->frame.wb_candidate2 = regno;
4711 offset += UNITS_PER_WORD;
4712 }
4713
4714 HOST_WIDE_INT max_int_offset = offset;
4715 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4716 bool has_align_gap = offset != max_int_offset;
4717
4718 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4719 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4720 {
4721 /* If there is an alignment gap between integer and fp callee-saves,
4722 allocate the last fp register to it if possible. */
4723 if (regno == last_fp_reg
4724 && has_align_gap
4725 && !simd_function
4726 && (offset & 8) == 0)
4727 {
4728 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4729 break;
4730 }
4731
4732 cfun->machine->frame.reg_offset[regno] = offset;
4733 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4734 cfun->machine->frame.wb_candidate1 = regno;
4735 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4736 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4737 cfun->machine->frame.wb_candidate2 = regno;
4738 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4739 }
4740
4741 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4742
4743 cfun->machine->frame.saved_regs_size = offset;
4744
4745 HOST_WIDE_INT varargs_and_saved_regs_size
4746 = offset + cfun->machine->frame.saved_varargs_size;
4747
4748 cfun->machine->frame.hard_fp_offset
4749 = aligned_upper_bound (varargs_and_saved_regs_size
4750 + get_frame_size (),
4751 STACK_BOUNDARY / BITS_PER_UNIT);
4752
4753 /* Both these values are already aligned. */
4754 gcc_assert (multiple_p (crtl->outgoing_args_size,
4755 STACK_BOUNDARY / BITS_PER_UNIT));
4756 cfun->machine->frame.frame_size
4757 = (cfun->machine->frame.hard_fp_offset
4758 + crtl->outgoing_args_size);
4759
4760 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4761
4762 cfun->machine->frame.initial_adjust = 0;
4763 cfun->machine->frame.final_adjust = 0;
4764 cfun->machine->frame.callee_adjust = 0;
4765 cfun->machine->frame.callee_offset = 0;
4766
4767 HOST_WIDE_INT max_push_offset = 0;
4768 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4769 max_push_offset = 512;
4770 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4771 max_push_offset = 256;
4772
4773 HOST_WIDE_INT const_size, const_fp_offset;
4774 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4775 && const_size < max_push_offset
4776 && known_eq (crtl->outgoing_args_size, 0))
4777 {
4778 /* Simple, small frame with no outgoing arguments:
4779 stp reg1, reg2, [sp, -frame_size]!
4780 stp reg3, reg4, [sp, 16] */
4781 cfun->machine->frame.callee_adjust = const_size;
4782 }
4783 else if (known_lt (crtl->outgoing_args_size
4784 + cfun->machine->frame.saved_regs_size, 512)
4785 && !(cfun->calls_alloca
4786 && known_lt (cfun->machine->frame.hard_fp_offset,
4787 max_push_offset)))
4788 {
4789 /* Frame with small outgoing arguments:
4790 sub sp, sp, frame_size
4791 stp reg1, reg2, [sp, outgoing_args_size]
4792 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4793 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4794 cfun->machine->frame.callee_offset
4795 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4796 }
4797 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4798 && const_fp_offset < max_push_offset)
4799 {
4800 /* Frame with large outgoing arguments but a small local area:
4801 stp reg1, reg2, [sp, -hard_fp_offset]!
4802 stp reg3, reg4, [sp, 16]
4803 sub sp, sp, outgoing_args_size */
4804 cfun->machine->frame.callee_adjust = const_fp_offset;
4805 cfun->machine->frame.final_adjust
4806 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4807 }
4808 else
4809 {
4810 /* Frame with large local area and outgoing arguments using frame pointer:
4811 sub sp, sp, hard_fp_offset
4812 stp x29, x30, [sp, 0]
4813 add x29, sp, 0
4814 stp reg3, reg4, [sp, 16]
4815 sub sp, sp, outgoing_args_size */
4816 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4817 cfun->machine->frame.final_adjust
4818 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4819 }
4820
4821 cfun->machine->frame.laid_out = true;
4822 }
4823
4824 /* Return true if the register REGNO is saved on entry to
4825 the current function. */
4826
4827 static bool
4828 aarch64_register_saved_on_entry (int regno)
4829 {
4830 return cfun->machine->frame.reg_offset[regno] >= 0;
4831 }
4832
4833 /* Return the next register, scanning upwards from REGNO up to LIMIT,
4834 that the callee needs to save. */
4835
4836 static unsigned
4837 aarch64_next_callee_save (unsigned regno, unsigned limit)
4838 {
4839 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4840 regno ++;
4841 return regno;
4842 }
4843
4844 /* Push the register number REGNO of mode MODE to the stack with write-back
4845 adjusting the stack by ADJUSTMENT. */
4846
4847 static void
4848 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4849 HOST_WIDE_INT adjustment)
4850 {
4851 rtx base_rtx = stack_pointer_rtx;
4852 rtx insn, reg, mem;
4853
4854 reg = gen_rtx_REG (mode, regno);
4855 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4856 plus_constant (Pmode, base_rtx, -adjustment));
4857 mem = gen_frame_mem (mode, mem);
4858
4859 insn = emit_move_insn (mem, reg);
4860 RTX_FRAME_RELATED_P (insn) = 1;
4861 }
4862
4863 /* Generate and return an instruction to store the pair of registers
4864 REG and REG2 of mode MODE to location BASE with write-back adjusting
4865 the stack location BASE by ADJUSTMENT. */
4866
4867 static rtx
4868 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4869 HOST_WIDE_INT adjustment)
4870 {
4871 switch (mode)
4872 {
4873 case E_DImode:
4874 return gen_storewb_pairdi_di (base, base, reg, reg2,
4875 GEN_INT (-adjustment),
4876 GEN_INT (UNITS_PER_WORD - adjustment));
4877 case E_DFmode:
4878 return gen_storewb_pairdf_di (base, base, reg, reg2,
4879 GEN_INT (-adjustment),
4880 GEN_INT (UNITS_PER_WORD - adjustment));
4881 case E_TFmode:
4882 return gen_storewb_pairtf_di (base, base, reg, reg2,
4883 GEN_INT (-adjustment),
4884 GEN_INT (UNITS_PER_VREG - adjustment));
4885 default:
4886 gcc_unreachable ();
4887 }
4888 }
4889
4890 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4891 stack pointer by ADJUSTMENT. */
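/* For example (illustrative): pushing x29 and x30 with ADJUSTMENT == 16 in
   DImode generates the usual prologue store

	stp	x29, x30, [sp, #-16]!

   while a single register goes through aarch64_pushwb_single_reg instead.  */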
4892
4893 static void
4894 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4895 {
4896 rtx_insn *insn;
4897 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4898
4899 if (regno2 == INVALID_REGNUM)
4900 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4901
4902 rtx reg1 = gen_rtx_REG (mode, regno1);
4903 rtx reg2 = gen_rtx_REG (mode, regno2);
4904
4905 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4906 reg2, adjustment));
4907 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4908 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4909 RTX_FRAME_RELATED_P (insn) = 1;
4910 }
4911
4912 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4913 adjusting it by ADJUSTMENT afterwards. */
4914
4915 static rtx
4916 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4917 HOST_WIDE_INT adjustment)
4918 {
4919 switch (mode)
4920 {
4921 case E_DImode:
4922 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4923 GEN_INT (UNITS_PER_WORD));
4924 case E_DFmode:
4925 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4926 GEN_INT (UNITS_PER_WORD));
4927 case E_TFmode:
4928 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4929 GEN_INT (UNITS_PER_VREG));
4930 default:
4931 gcc_unreachable ();
4932 }
4933 }
4934
4935 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4936 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4937 into CFI_OPS. */
4938
4939 static void
4940 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4941 rtx *cfi_ops)
4942 {
4943 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4944 rtx reg1 = gen_rtx_REG (mode, regno1);
4945
4946 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4947
4948 if (regno2 == INVALID_REGNUM)
4949 {
4950 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4951 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4952 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4953 }
4954 else
4955 {
4956 rtx reg2 = gen_rtx_REG (mode, regno2);
4957 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4958 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4959 reg2, adjustment));
4960 }
4961 }
4962
4963 /* Generate and return a store pair instruction of mode MODE to store
4964 register REG1 to MEM1 and register REG2 to MEM2. */
4965
4966 static rtx
4967 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4968 rtx reg2)
4969 {
4970 switch (mode)
4971 {
4972 case E_DImode:
4973 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4974
4975 case E_DFmode:
4976 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4977
4978 case E_TFmode:
4979 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4980
4981 default:
4982 gcc_unreachable ();
4983 }
4984 }
4985
4986 /* Generate and return a load pair instruction of mode MODE to load register
4987 REG1 from MEM1 and register REG2 from MEM2. */
4988
4989 static rtx
4990 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4991 rtx mem2)
4992 {
4993 switch (mode)
4994 {
4995 case E_DImode:
4996 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4997
4998 case E_DFmode:
4999 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5000
5001 case E_TFmode:
5002 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5003
5004 default:
5005 gcc_unreachable ();
5006 }
5007 }
5008
5009 /* Return TRUE if return address signing should be enabled for the current
5010 function, otherwise return FALSE. */
5011
5012 bool
5013 aarch64_return_address_signing_enabled (void)
5014 {
5015 /* This function should only be called after the frame is laid out. */
5016 gcc_assert (cfun->machine->frame.laid_out);
5017
5018 /* Turn return address signing off in any function that uses
5019 __builtin_eh_return. The address passed to __builtin_eh_return
5020 is not signed so either it has to be signed (with original sp)
5021 or the code path that uses it has to avoid authenticating it.
5022 Currently eh return introduces a return to anywhere gadget, no
5023 matter what we do here since it uses ret with user provided
5024 address. An ideal fix for that is to use indirect branch which
5025 can be protected with BTI j (to some extent). */
5026 if (crtl->calls_eh_return)
5027 return false;
5028
5029 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
5030 function if its LR is pushed onto the stack. */
5031 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5032 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5033 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5034 }
5035
5036 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5037 bool
5038 aarch64_bti_enabled (void)
5039 {
5040 return (aarch64_enable_bti == 1);
5041 }
5042
5043 /* Emit code to save the callee-saved registers from register number START
5044 to LIMIT to the stack at the location starting at offset START_OFFSET,
5045 skipping any write-back candidates if SKIP_WB is true. */
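/* For example (illustrative): if x19 and x20 both need saving and their
   frame offsets differ by exactly GET_MODE_SIZE (DImode) == 8, the loop
   below pairs them into a single "stp x19, x20, [sp, <offset>]"; otherwise
   each register is stored with its own "str".  */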
5046
5047 static void
5048 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5049 unsigned start, unsigned limit, bool skip_wb)
5050 {
5051 rtx_insn *insn;
5052 unsigned regno;
5053 unsigned regno2;
5054
5055 for (regno = aarch64_next_callee_save (start, limit);
5056 regno <= limit;
5057 regno = aarch64_next_callee_save (regno + 1, limit))
5058 {
5059 rtx reg, mem;
5060 poly_int64 offset;
5061 int offset_diff;
5062
5063 if (skip_wb
5064 && (regno == cfun->machine->frame.wb_candidate1
5065 || regno == cfun->machine->frame.wb_candidate2))
5066 continue;
5067
5068 if (cfun->machine->reg_is_wrapped_separately[regno])
5069 continue;
5070
5071 reg = gen_rtx_REG (mode, regno);
5072 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5073 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5074 offset));
5075
5076 regno2 = aarch64_next_callee_save (regno + 1, limit);
5077 offset_diff = cfun->machine->frame.reg_offset[regno2]
5078 - cfun->machine->frame.reg_offset[regno];
5079
5080 if (regno2 <= limit
5081 && !cfun->machine->reg_is_wrapped_separately[regno2]
5082 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5083 {
5084 rtx reg2 = gen_rtx_REG (mode, regno2);
5085 rtx mem2;
5086
5087 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5088 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5089 offset));
5090 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5091 reg2));
5092
5093 /* The first part of a frame-related parallel insn is
5094 always assumed to be relevant to the frame
5095 calculations; subsequent parts are only
5096 frame-related if explicitly marked. */
5097 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5098 regno = regno2;
5099 }
5100 else
5101 insn = emit_move_insn (mem, reg);
5102
5103 RTX_FRAME_RELATED_P (insn) = 1;
5104 }
5105 }
5106
5107 /* Emit code to restore the callee-saved registers of mode MODE from register
5108 number START up to and including LIMIT. Restore from the stack offset
5109 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5110 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5111
5112 static void
5113 aarch64_restore_callee_saves (machine_mode mode,
5114 poly_int64 start_offset, unsigned start,
5115 unsigned limit, bool skip_wb, rtx *cfi_ops)
5116 {
5117 rtx base_rtx = stack_pointer_rtx;
5118 unsigned regno;
5119 unsigned regno2;
5120 poly_int64 offset;
5121
5122 for (regno = aarch64_next_callee_save (start, limit);
5123 regno <= limit;
5124 regno = aarch64_next_callee_save (regno + 1, limit))
5125 {
5126 if (cfun->machine->reg_is_wrapped_separately[regno])
5127 continue;
5128
5129 rtx reg, mem;
5130 int offset_diff;
5131
5132 if (skip_wb
5133 && (regno == cfun->machine->frame.wb_candidate1
5134 || regno == cfun->machine->frame.wb_candidate2))
5135 continue;
5136
5137 reg = gen_rtx_REG (mode, regno);
5138 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5139 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5140
5141 regno2 = aarch64_next_callee_save (regno + 1, limit);
5142 offset_diff = cfun->machine->frame.reg_offset[regno2]
5143 - cfun->machine->frame.reg_offset[regno];
5144
5145 if (regno2 <= limit
5146 && !cfun->machine->reg_is_wrapped_separately[regno2]
5147 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5148 {
5149 rtx reg2 = gen_rtx_REG (mode, regno2);
5150 rtx mem2;
5151
5152 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5153 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5154 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5155
5156 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5157 regno = regno2;
5158 }
5159 else
5160 emit_move_insn (reg, mem);
5161 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5162 }
5163 }
5164
5165 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5166 of MODE. */
5167
5168 static inline bool
5169 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5170 {
5171 HOST_WIDE_INT multiple;
5172 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5173 && IN_RANGE (multiple, -8, 7));
5174 }
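/* Worked example for these scaled-offset predicates: for DImode
   (GET_MODE_SIZE == 8), offset_4bit_signed_scaled_p accepts byte offsets
   -64, -56, ..., 56 (multiples of 8 whose scaled value lies in [-8, 7]);
   an offset of 60 is rejected because it is not a multiple of the mode
   size.  The predicates below differ only in the accepted range of the
   scaled value.  */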
5175
5176 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5177 of MODE. */
5178
5179 static inline bool
5180 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5181 {
5182 HOST_WIDE_INT multiple;
5183 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5184 && IN_RANGE (multiple, 0, 63));
5185 }
5186
5187 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5188 of MODE. */
5189
5190 bool
5191 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5192 {
5193 HOST_WIDE_INT multiple;
5194 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5195 && IN_RANGE (multiple, -64, 63));
5196 }
5197
5198 /* Return true if OFFSET is a signed 9-bit value. */
5199
5200 bool
5201 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5202 poly_int64 offset)
5203 {
5204 HOST_WIDE_INT const_offset;
5205 return (offset.is_constant (&const_offset)
5206 && IN_RANGE (const_offset, -256, 255));
5207 }
5208
5209 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5210 of MODE. */
5211
5212 static inline bool
5213 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5214 {
5215 HOST_WIDE_INT multiple;
5216 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5217 && IN_RANGE (multiple, -256, 255));
5218 }
5219
5220 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5221 of MODE. */
5222
5223 static inline bool
5224 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5225 {
5226 HOST_WIDE_INT multiple;
5227 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5228 && IN_RANGE (multiple, 0, 4095));
5229 }
5230
5231 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5232
5233 static sbitmap
5234 aarch64_get_separate_components (void)
5235 {
5236 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5237 bitmap_clear (components);
5238
5239 /* The registers we need saved to the frame. */
5240 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5241 if (aarch64_register_saved_on_entry (regno))
5242 {
5243 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5244 if (!frame_pointer_needed)
5245 offset += cfun->machine->frame.frame_size
5246 - cfun->machine->frame.hard_fp_offset;
5247 /* Check that we can access the stack slot of the register with one
5248 direct load with no adjustments needed. */
5249 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5250 bitmap_set_bit (components, regno);
5251 }
5252
5253 /* Don't mess with the hard frame pointer. */
5254 if (frame_pointer_needed)
5255 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5256
5257 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5258 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5259 /* If registers have been chosen to be stored/restored with
5260 writeback, don't interfere with them to avoid having to output explicit
5261 stack adjustment instructions. */
5262 if (reg2 != INVALID_REGNUM)
5263 bitmap_clear_bit (components, reg2);
5264 if (reg1 != INVALID_REGNUM)
5265 bitmap_clear_bit (components, reg1);
5266
5267 bitmap_clear_bit (components, LR_REGNUM);
5268 bitmap_clear_bit (components, SP_REGNUM);
5269
5270 return components;
5271 }
5272
5273 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5274
5275 static sbitmap
5276 aarch64_components_for_bb (basic_block bb)
5277 {
5278 bitmap in = DF_LIVE_IN (bb);
5279 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5280 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5281 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5282
5283 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5284 bitmap_clear (components);
5285
5286 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5287 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5288 if ((!call_used_regs[regno]
5289 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5290 && (bitmap_bit_p (in, regno)
5291 || bitmap_bit_p (gen, regno)
5292 || bitmap_bit_p (kill, regno)))
5293 {
5294 unsigned regno2, offset, offset2;
5295 bitmap_set_bit (components, regno);
5296
5297 /* If there is a callee-save at an adjacent offset, add it too
5298 to increase the use of LDP/STP. */
5299 offset = cfun->machine->frame.reg_offset[regno];
5300 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
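/* Illustrative example: if x19 sits at offset 16 (bit 3 clear) its
   potential partner is x20, and if x20 sits at offset 24 (bit 3 set)
   its potential partner is x19; the check below then accepts the pair
   only when both offsets fall in the same 16-byte slot, i.e. when
   (offset & ~8) == (offset2 & ~8).  */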
5301
5302 if (regno2 <= LAST_SAVED_REGNUM)
5303 {
5304 offset2 = cfun->machine->frame.reg_offset[regno2];
5305 if ((offset & ~8) == (offset2 & ~8))
5306 bitmap_set_bit (components, regno2);
5307 }
5308 }
5309
5310 return components;
5311 }
5312
5313 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5314 Nothing to do for aarch64. */
5315
5316 static void
5317 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5318 {
5319 }
5320
5321 /* Return the next set bit in BMP from START onwards. Return the total number
5322 of bits in BMP if no set bit is found at or after START. */
5323
5324 static unsigned int
5325 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5326 {
5327 unsigned int nbits = SBITMAP_SIZE (bmp);
5328 if (start == nbits)
5329 return start;
5330
5331 gcc_assert (start < nbits);
5332 for (unsigned int i = start; i < nbits; i++)
5333 if (bitmap_bit_p (bmp, i))
5334 return i;
5335
5336 return nbits;
5337 }
5338
5339 /* Do the work for aarch64_emit_prologue_components and
5340 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5341 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5342 for these components or the epilogue sequence. That is, it determines
5343 whether we should emit stores or loads and what kind of CFA notes to attach
5344 to the insns. Otherwise the logic for the two sequences is very
5345 similar. */
5346
5347 static void
5348 aarch64_process_components (sbitmap components, bool prologue_p)
5349 {
5350 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5351 ? HARD_FRAME_POINTER_REGNUM
5352 : STACK_POINTER_REGNUM);
5353
5354 unsigned last_regno = SBITMAP_SIZE (components);
5355 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5356 rtx_insn *insn = NULL;
5357
5358 while (regno != last_regno)
5359 {
5360 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5361 so DFmode for the vector registers is enough. For simd functions
5362 we want to save the low 128 bits. */
5363 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5364
5365 rtx reg = gen_rtx_REG (mode, regno);
5366 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5367 if (!frame_pointer_needed)
5368 offset += cfun->machine->frame.frame_size
5369 - cfun->machine->frame.hard_fp_offset;
5370 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5371 rtx mem = gen_frame_mem (mode, addr);
5372
5373 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5374 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5375 /* No more registers to handle after REGNO.
5376 Emit a single save/restore and exit. */
5377 if (regno2 == last_regno)
5378 {
5379 insn = emit_insn (set);
5380 RTX_FRAME_RELATED_P (insn) = 1;
5381 if (prologue_p)
5382 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5383 else
5384 add_reg_note (insn, REG_CFA_RESTORE, reg);
5385 break;
5386 }
5387
5388 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5389 /* The next register is not of the same class or its offset is not
5390 mergeable with the current one into a pair. */
5391 if (!satisfies_constraint_Ump (mem)
5392 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5393 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5394 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5395 GET_MODE_SIZE (mode)))
5396 {
5397 insn = emit_insn (set);
5398 RTX_FRAME_RELATED_P (insn) = 1;
5399 if (prologue_p)
5400 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5401 else
5402 add_reg_note (insn, REG_CFA_RESTORE, reg);
5403
5404 regno = regno2;
5405 continue;
5406 }
5407
5408 /* REGNO2 can be saved/restored in a pair with REGNO. */
5409 rtx reg2 = gen_rtx_REG (mode, regno2);
5410 if (!frame_pointer_needed)
5411 offset2 += cfun->machine->frame.frame_size
5412 - cfun->machine->frame.hard_fp_offset;
5413 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5414 rtx mem2 = gen_frame_mem (mode, addr2);
5415 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5416 : gen_rtx_SET (reg2, mem2);
5417
5418 if (prologue_p)
5419 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5420 else
5421 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5422
5423 RTX_FRAME_RELATED_P (insn) = 1;
5424 if (prologue_p)
5425 {
5426 add_reg_note (insn, REG_CFA_OFFSET, set);
5427 add_reg_note (insn, REG_CFA_OFFSET, set2);
5428 }
5429 else
5430 {
5431 add_reg_note (insn, REG_CFA_RESTORE, reg);
5432 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5433 }
5434
5435 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5436 }
5437 }
5438
5439 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5440
5441 static void
5442 aarch64_emit_prologue_components (sbitmap components)
5443 {
5444 aarch64_process_components (components, true);
5445 }
5446
5447 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5448
5449 static void
5450 aarch64_emit_epilogue_components (sbitmap components)
5451 {
5452 aarch64_process_components (components, false);
5453 }
5454
5455 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5456
5457 static void
5458 aarch64_set_handled_components (sbitmap components)
5459 {
5460 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5461 if (bitmap_bit_p (components, regno))
5462 cfun->machine->reg_is_wrapped_separately[regno] = true;
5463 }
5464
5465 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5466 determine the probe offset for alloca. */
5467
5468 static HOST_WIDE_INT
5469 aarch64_stack_clash_protection_alloca_probe_range (void)
5470 {
5471 return STACK_CLASH_CALLER_GUARD;
5472 }
5473
5474
5475 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5476 registers. If POLY_SIZE is not large enough to require a probe this function
5477 will only adjust the stack. When allocating the stack space
5478 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5479 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5480 arguments. If we are, then we ensure that any allocation larger than the ABI
5481 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5482 maintained.
5483
5484 We emit barriers after each stack adjustment to prevent optimizations from
5485 breaking the invariant that we never drop the stack more than a page. This
5486 invariant is needed to make it easier to correctly handle asynchronous
5487 events, e.g. if we were to allow the stack to be dropped by more than a page
5488 and then have multiple probes up and we take a signal somewhere in between
5489 then the signal handler doesn't know the state of the stack and can make no
5490 assumptions about which pages have been probed. */
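/* As a worked example of the thresholds computed below (assuming the
   default 64KB guard and a 1KB STACK_CLASH_CALLER_GUARD): a non-final
   adjustment only needs probing once it exceeds 65536 - 1024 = 64512
   bytes; anything smaller is handled by a plain stack-pointer
   subtraction.  */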
5491
5492 static void
5493 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5494 poly_int64 poly_size,
5495 bool frame_related_p,
5496 bool final_adjustment_p)
5497 {
5498 HOST_WIDE_INT guard_size
5499 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5500 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5501 /* When doing the final adjustment for the outgoing argument size we can't
5502 assume that LR was saved at position 0. So subtract its offset from the
5503 ABI safe buffer so that we don't accidentally allow an adjustment that
5504 would result in an allocation larger than the ABI buffer without
5505 probing. */
5506 HOST_WIDE_INT min_probe_threshold
5507 = final_adjustment_p
5508 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5509 : guard_size - guard_used_by_caller;
5510
5511 poly_int64 frame_size = cfun->machine->frame.frame_size;
5512
5513 /* We should always have a positive probe threshold. */
5514 gcc_assert (min_probe_threshold > 0);
5515
5516 if (flag_stack_clash_protection && !final_adjustment_p)
5517 {
5518 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5519 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5520
5521 if (known_eq (frame_size, 0))
5522 {
5523 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5524 }
5525 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5526 && known_lt (final_adjust, guard_used_by_caller))
5527 {
5528 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5529 }
5530 }
5531
5532 /* If SIZE is not large enough to require probing, just adjust the stack and
5533 exit. */
5534 if (known_lt (poly_size, min_probe_threshold)
5535 || !flag_stack_clash_protection)
5536 {
5537 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5538 return;
5539 }
5540
5541 HOST_WIDE_INT size;
5542 /* Handle the SVE non-constant case first. */
5543 if (!poly_size.is_constant (&size))
5544 {
5545 if (dump_file)
5546 {
5547 fprintf (dump_file, "Stack clash SVE prologue: ");
5548 print_dec (poly_size, dump_file);
5549 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5550 }
5551
5552 /* First calculate the amount of bytes we're actually spilling. */
5553 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5554 poly_size, temp1, temp2, false, true);
5555
5556 rtx_insn *insn = get_last_insn ();
5557
5558 if (frame_related_p)
5559 {
5560 /* This is done to provide unwinding information for the stack
5561 adjustments we're about to do. However, to prevent the optimizers
5562 from removing the R11 move and leaving the CFA note (which would be
5563 very wrong) we tie the old and new stack pointer together.
5564 The tie will expand to nothing but the optimizers will not touch
5565 the instruction. */
5566 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5567 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5568 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5569
5570 /* We want the CFA independent of the stack pointer for the
5571 duration of the loop. */
5572 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5573 RTX_FRAME_RELATED_P (insn) = 1;
5574 }
5575
5576 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5577 rtx guard_const = gen_int_mode (guard_size, Pmode);
5578
5579 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5580 stack_pointer_rtx, temp1,
5581 probe_const, guard_const));
5582
5583 /* Now reset the CFA register if needed. */
5584 if (frame_related_p)
5585 {
5586 add_reg_note (insn, REG_CFA_DEF_CFA,
5587 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5588 gen_int_mode (poly_size, Pmode)));
5589 RTX_FRAME_RELATED_P (insn) = 1;
5590 }
5591
5592 return;
5593 }
5594
5595 if (dump_file)
5596 fprintf (dump_file,
5597 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5598 " bytes, probing will be required.\n", size);
5599
5600 /* Round size to the nearest multiple of guard_size, and calculate the
5601 residual as the difference between the original size and the rounded
5602 size. */
5603 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5604 HOST_WIDE_INT residual = size - rounded_size;
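/* Illustrative numbers only: with a 64KB guard and a 150000-byte
   allocation, rounded_size is 131072 and residual is 18928; the rounded
   part is allocated and probed one guard-sized page at a time and the
   remainder is dealt with after this block.  */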
5605
5606 /* We can handle a small number of allocations/probes inline. Otherwise
5607 punt to a loop. */
5608 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5609 {
5610 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5611 {
5612 aarch64_sub_sp (NULL, temp2, guard_size, true);
5613 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5614 guard_used_by_caller));
5615 emit_insn (gen_blockage ());
5616 }
5617 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5618 }
5619 else
5620 {
5621 /* Compute the ending address. */
5622 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5623 temp1, NULL, false, true);
5624 rtx_insn *insn = get_last_insn ();
5625
5626 /* For the initial allocation, we don't have a frame pointer
5627 set up, so we always need CFI notes. If we're doing the
5628 final allocation, then we may have a frame pointer, in which
5629 case it is the CFA, otherwise we need CFI notes.
5630
5631 We can determine which allocation we are doing by looking at
5632 the value of FRAME_RELATED_P since the final allocations are not
5633 frame related. */
5634 if (frame_related_p)
5635 {
5636 /* We want the CFA independent of the stack pointer for the
5637 duration of the loop. */
5638 add_reg_note (insn, REG_CFA_DEF_CFA,
5639 plus_constant (Pmode, temp1, rounded_size));
5640 RTX_FRAME_RELATED_P (insn) = 1;
5641 }
5642
5643 /* This allocates and probes the stack. Note that this re-uses some of
5644 the existing Ada stack protection code. However we are guaranteed not
5645 to enter the non-loop or residual branches of that code.
5646
5647 The non-loop part won't be entered because if our allocation amount
5648 doesn't require a loop, the case above would handle it.
5649
5650 The residual amount won't be entered because TEMP1 is a multiple of
5651 the allocation size. The residual will always be 0. As such, the only
5652 part we are actually using from that code is the loop setup. The
5653 actual probing is done in aarch64_output_probe_stack_range. */
5654 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5655 stack_pointer_rtx, temp1));
5656
5657 /* Now reset the CFA register if needed. */
5658 if (frame_related_p)
5659 {
5660 add_reg_note (insn, REG_CFA_DEF_CFA,
5661 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5662 RTX_FRAME_RELATED_P (insn) = 1;
5663 }
5664
5665 emit_insn (gen_blockage ());
5666 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5667 }
5668
5669 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5670 be probed. This maintains the requirement that each page is probed at
5671 least once. For initial probing we probe only if the allocation is
5672 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5673 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5674 GUARD_SIZE. This means that for any allocation that is large enough to
5675 trigger a probe here, we'll have at least one, and if they're not large
5676 enough for this code to emit anything for them, the page would have been
5677 probed by the saving of FP/LR either by this function or any callees. If
5678 we don't have any callees then we won't have more stack adjustments and so
5679 are still safe. */
5680 if (residual)
5681 {
5682 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5683 /* If we're doing final adjustments, and we've done any full page
5684 allocations then any residual needs to be probed. */
5685 if (final_adjustment_p && rounded_size != 0)
5686 min_probe_threshold = 0;
5687 /* If doing a small final adjustment, we always probe at offset 0.
5688 This is done to avoid issues when LR is not at position 0 or when
5689 the final adjustment is smaller than the probing offset. */
5690 else if (final_adjustment_p && rounded_size == 0)
5691 residual_probe_offset = 0;
5692
5693 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5694 if (residual >= min_probe_threshold)
5695 {
5696 if (dump_file)
5697 fprintf (dump_file,
5698 "Stack clash AArch64 prologue residuals: "
5699 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5700 "\n", residual);
5701
5702 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5703 residual_probe_offset));
5704 emit_insn (gen_blockage ());
5705 }
5706 }
5707 }
5708
5709 /* Return 1 if the register is used by the epilogue. We need to say the
5710 return register is used, but only after epilogue generation is complete.
5711 Note that in the case of sibcalls, the values "used by the epilogue" are
5712 considered live at the start of the called function.
5713
5714 For SIMD functions we need to return 1 for FP registers that are saved and
5715 restored by a function but are not zero in call_used_regs. If we do not do
5716 this, optimizations may remove the restore of the register. */
5717
5718 int
5719 aarch64_epilogue_uses (int regno)
5720 {
5721 if (epilogue_completed)
5722 {
5723 if (regno == LR_REGNUM)
5724 return 1;
5725 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5726 return 1;
5727 }
5728 return 0;
5729 }
5730
5731 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5732 is saved at BASE + OFFSET. */
5733
5734 static void
5735 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5736 rtx base, poly_int64 offset)
5737 {
5738 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5739 add_reg_note (insn, REG_CFA_EXPRESSION,
5740 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5741 }
5742
5743 /* AArch64 stack frames generated by this compiler look like:
5744
5745 +-------------------------------+
5746 | |
5747 | incoming stack arguments |
5748 | |
5749 +-------------------------------+
5750 | | <-- incoming stack pointer (aligned)
5751 | callee-allocated save area |
5752 | for register varargs |
5753 | |
5754 +-------------------------------+
5755 | local variables | <-- frame_pointer_rtx
5756 | |
5757 +-------------------------------+
5758 | padding | \
5759 +-------------------------------+ |
5760 | callee-saved registers | | frame.saved_regs_size
5761 +-------------------------------+ |
5762 | LR' | |
5763 +-------------------------------+ |
5764 | FP' | / <- hard_frame_pointer_rtx (aligned)
5765 +-------------------------------+
5766 | dynamic allocation |
5767 +-------------------------------+
5768 | padding |
5769 +-------------------------------+
5770 | outgoing stack arguments | <-- arg_pointer
5771 | |
5772 +-------------------------------+
5773 | | <-- stack_pointer_rtx (aligned)
5774
5775 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5776 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5777 unchanged.
5778
5779 By default for stack-clash we assume the guard is at least 64KB, but this
5780 value is configurable to either 4KB or 64KB. We also force the guard size to
5781 be the same as the probing interval and both values are kept in sync.
5782
5783 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5784 on the guard size) of stack space without probing.
5785
5786 When probing is needed, we emit a probe at the start of the prologue
5787 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5788
5789 We have to track how much space has been allocated and the only stores
5790 to the stack we track as implicit probes are the FP/LR stores.
5791
5792 For outgoing arguments we probe if the size is larger than 1KB, such that
5793 the ABI specified buffer is maintained for the next callee.
5794
5795 The following registers are reserved during frame layout and should not be
5796 used for any other purpose:
5797
5798 - r11: Used by stack clash protection when SVE is enabled.
5799 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5800 - r14 and r15: Used for speculation tracking.
5801 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5802 - r30(LR), r29(FP): Used by standard frame layout.
5803
5804 These registers must be avoided in frame layout related code unless the
5805 explicit intention is to interact with one of the features listed above. */
5806
5807 /* Generate the prologue instructions for entry into a function.
5808 Establish the stack frame by decreasing the stack pointer with a
5809 properly calculated size and, if necessary, create a frame record
5810 filled with the values of LR and previous frame pointer. The
5811 current FP is also set up if it is in use. */
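/* A minimal illustrative sketch (not the output of any particular
   function): for a small frame that needs a frame record and one extra
   callee-saved register, the expansion below typically produces
   something like
       stp  x29, x30, [sp, -32]!
       mov  x29, sp
       str  x19, [sp, 16]
   The exact sequence depends on the layout chosen by
   aarch64_layout_frame.  */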
5812
5813 void
5814 aarch64_expand_prologue (void)
5815 {
5816 poly_int64 frame_size = cfun->machine->frame.frame_size;
5817 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5818 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5819 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5820 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5821 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5822 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5823 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5824 rtx_insn *insn;
5825
5826 /* Sign return address for functions. */
5827 if (aarch64_return_address_signing_enabled ())
5828 {
5829 insn = emit_insn (gen_pacisp ());
5830 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5831 RTX_FRAME_RELATED_P (insn) = 1;
5832 }
5833
5834 if (flag_stack_usage_info)
5835 current_function_static_stack_size = constant_lower_bound (frame_size);
5836
5837 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5838 {
5839 if (crtl->is_leaf && !cfun->calls_alloca)
5840 {
5841 if (maybe_gt (frame_size, PROBE_INTERVAL)
5842 && maybe_gt (frame_size, get_stack_check_protect ()))
5843 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5844 (frame_size
5845 - get_stack_check_protect ()));
5846 }
5847 else if (maybe_gt (frame_size, 0))
5848 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5849 }
5850
5851 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5852 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5853
5854 /* In theory we should never have both an initial adjustment
5855 and a callee save adjustment. Verify that is the case since the
5856 code below does not handle it for -fstack-clash-protection. */
5857 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5858
5859 /* Will only probe if the initial adjustment is larger than the guard
5860 less the amount of the guard reserved for use by the caller's
5861 outgoing args. */
5862 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5863 true, false);
5864
5865 if (callee_adjust != 0)
5866 aarch64_push_regs (reg1, reg2, callee_adjust);
5867
5868 if (emit_frame_chain)
5869 {
5870 poly_int64 reg_offset = callee_adjust;
5871 if (callee_adjust == 0)
5872 {
5873 reg1 = R29_REGNUM;
5874 reg2 = R30_REGNUM;
5875 reg_offset = callee_offset;
5876 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5877 }
5878 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5879 stack_pointer_rtx, callee_offset,
5880 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5881 if (frame_pointer_needed && !frame_size.is_constant ())
5882 {
5883 /* Variable-sized frames need to describe the save slot
5884 address using DW_CFA_expression rather than DW_CFA_offset.
5885 This means that, without taking further action, the
5886 locations of the registers that we've already saved would
5887 remain based on the stack pointer even after we redefine
5888 the CFA based on the frame pointer. We therefore need new
5889 DW_CFA_expressions to re-express the save slots with addresses
5890 based on the frame pointer. */
5891 rtx_insn *insn = get_last_insn ();
5892 gcc_assert (RTX_FRAME_RELATED_P (insn));
5893
5894 /* Add an explicit CFA definition if this was previously
5895 implicit. */
5896 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5897 {
5898 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5899 callee_offset);
5900 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5901 gen_rtx_SET (hard_frame_pointer_rtx, src));
5902 }
5903
5904 /* Change the save slot expressions for the registers that
5905 we've already saved. */
5906 reg_offset -= callee_offset;
5907 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5908 reg_offset + UNITS_PER_WORD);
5909 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5910 reg_offset);
5911 }
5912 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5913 }
5914
5915 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5916 callee_adjust != 0 || emit_frame_chain);
5917 if (aarch64_simd_decl_p (cfun->decl))
5918 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5919 callee_adjust != 0 || emit_frame_chain);
5920 else
5921 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5922 callee_adjust != 0 || emit_frame_chain);
5923
5924 /* We may need to probe the final adjustment if it is larger than the guard
5925 that is assumed by the callee. */
5926 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5927 !frame_pointer_needed, true);
5928 }
5929
5930 /* Return TRUE if we can use a simple_return insn.
5931
5932 This function checks whether the callee saved stack is empty, which
5933 means no restore actions are needed. The pro_and_epilogue will use
5934 this to check whether shrink-wrapping opt is feasible. */
5935
5936 bool
5937 aarch64_use_return_insn_p (void)
5938 {
5939 if (!reload_completed)
5940 return false;
5941
5942 if (crtl->profile)
5943 return false;
5944
5945 return known_eq (cfun->machine->frame.frame_size, 0);
5946 }
5947
5948 /* Return false for non-leaf SIMD functions in order to avoid
5949 shrink-wrapping them. Doing this will lose the necessary
5950 save/restore of FP registers. */
5951
5952 bool
5953 aarch64_use_simple_return_insn_p (void)
5954 {
5955 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5956 return false;
5957
5958 return true;
5959 }
5960
5961 /* Generate the epilogue instructions for returning from a function.
5962 This is almost exactly the reverse of the prolog sequence, except
5963 that we need to insert barriers to avoid scheduling loads that read
5964 from a deallocated stack, and we optimize the unwind records by
5965 emitting them all together if possible. */
5966 void
5967 aarch64_expand_epilogue (bool for_sibcall)
5968 {
5969 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5970 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5971 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5972 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5973 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5974 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5975 rtx cfi_ops = NULL;
5976 rtx_insn *insn;
5977 /* A stack clash protection prologue may not have left EP0_REGNUM or
5978 EP1_REGNUM in a usable state. The same is true for allocations
5979 with an SVE component, since we then need both temporary registers
5980 for each allocation. For stack clash we are in a usable state if
5981 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5982 HOST_WIDE_INT guard_size
5983 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5984 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5985
5986 /* We can re-use the registers when the allocation amount is smaller than
5987 guard_size - guard_used_by_caller because we won't be doing any probes
5988 then. In such situations the register should remain live with the correct
5989 value. */
5990 bool can_inherit_p = (initial_adjust.is_constant ()
5991 && final_adjust.is_constant ())
5992 && (!flag_stack_clash_protection
5993 || known_lt (initial_adjust,
5994 guard_size - guard_used_by_caller));
5995
5996 /* We need a memory barrier to prevent reads from the deallocated stack. */
5997 bool need_barrier_p
5998 = maybe_ne (get_frame_size ()
5999 + cfun->machine->frame.saved_varargs_size, 0);
6000
6001 /* Emit a barrier to prevent loads from a deallocated stack. */
6002 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6003 || cfun->calls_alloca
6004 || crtl->calls_eh_return)
6005 {
6006 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6007 need_barrier_p = false;
6008 }
6009
6010 /* Restore the stack pointer from the frame pointer if it may not
6011 be the same as the stack pointer. */
6012 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6013 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6014 if (frame_pointer_needed
6015 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6016 /* If writeback is used when restoring callee-saves, the CFA
6017 is restored on the instruction doing the writeback. */
6018 aarch64_add_offset (Pmode, stack_pointer_rtx,
6019 hard_frame_pointer_rtx, -callee_offset,
6020 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6021 else
6022 /* The case where we need to re-use the register here is very rare, so
6023 avoid the complicated condition and just always emit a move if the
6024 immediate doesn't fit. */
6025 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6026
6027 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6028 callee_adjust != 0, &cfi_ops);
6029 if (aarch64_simd_decl_p (cfun->decl))
6030 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6031 callee_adjust != 0, &cfi_ops);
6032 else
6033 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6034 callee_adjust != 0, &cfi_ops);
6035
6036 if (need_barrier_p)
6037 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6038
6039 if (callee_adjust != 0)
6040 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6041
6042 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6043 {
6044 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6045 insn = get_last_insn ();
6046 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6047 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6048 RTX_FRAME_RELATED_P (insn) = 1;
6049 cfi_ops = NULL;
6050 }
6051
6052 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6053 add restriction on emit_move optimization to leaf functions. */
6054 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6055 (!can_inherit_p || !crtl->is_leaf
6056 || df_regs_ever_live_p (EP0_REGNUM)));
6057
6058 if (cfi_ops)
6059 {
6060 /* Emit delayed restores and reset the CFA to be SP. */
6061 insn = get_last_insn ();
6062 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6063 REG_NOTES (insn) = cfi_ops;
6064 RTX_FRAME_RELATED_P (insn) = 1;
6065 }
6066
6067 /* We prefer to emit the combined return/authenticate instruction RETAA,
6068 however there are three cases in which we must instead emit an explicit
6069 authentication instruction.
6070
6071 1) Sibcalls don't return in a normal way, so if we're about to call one
6072 we must authenticate.
6073
6074 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6075 generating code for !TARGET_ARMV8_3 we can't use it and must
6076 explicitly authenticate.
6077
6078 3) On an eh_return path we make extra stack adjustments to update the
6079 canonical frame address to be the exception handler's CFA. We want
6080 to authenticate using the CFA of the function which calls eh_return.
6081 */
6082 if (aarch64_return_address_signing_enabled ()
6083 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6084 {
6085 insn = emit_insn (gen_autisp ());
6086 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6087 RTX_FRAME_RELATED_P (insn) = 1;
6088 }
6089
6090 /* Stack adjustment for exception handler. */
6091 if (crtl->calls_eh_return)
6092 {
6093 /* We need to unwind the stack by the offset computed by
6094 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6095 to be SP; letting the CFA move during this adjustment
6096 is just as correct as retaining the CFA from the body
6097 of the function. Therefore, do nothing special. */
6098 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6099 }
6100
6101 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6102 if (!for_sibcall)
6103 emit_jump_insn (ret_rtx);
6104 }
6105
6106 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6107 normally or return to a previous frame after unwinding.
6108
6109 An EH return uses a single shared return sequence. The epilogue is
6110 exactly like a normal epilogue except that it has an extra input
6111 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6112 that must be applied after the frame has been destroyed. An extra label
6113 is inserted before the epilogue which initializes this register to zero,
6114 and this is the entry point for a normal return.
6115
6116 An actual EH return updates the return address, initializes the stack
6117 adjustment and jumps directly into the epilogue (bypassing the zeroing
6118 of the adjustment). Since the return address is typically saved on the
6119 stack when a function makes a call, the saved LR must be updated outside
6120 the epilogue.
6121
6122 This poses problems as the store is generated well before the epilogue,
6123 so the offset of LR is not known yet. Also optimizations will remove the
6124 store as it appears dead, even after the epilogue is generated (as the
6125 base or offset for loading LR is different in many cases).
6126
6127 To avoid these problems this implementation forces the frame pointer
6128 in eh_return functions so that the location of LR is fixed and known early.
6129 It also marks the store volatile, so no optimization is permitted to
6130 remove the store. */
6131 rtx
6132 aarch64_eh_return_handler_rtx (void)
6133 {
6134 rtx tmp = gen_frame_mem (Pmode,
6135 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6136
6137 /* Mark the store volatile, so no optimization is permitted to remove it. */
6138 MEM_VOLATILE_P (tmp) = true;
6139 return tmp;
6140 }
6141
6142 /* Output code to add DELTA to the first argument, and then jump
6143 to FUNCTION. Used for C++ multiple inheritance. */
6144 static void
6145 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6146 HOST_WIDE_INT delta,
6147 HOST_WIDE_INT vcall_offset,
6148 tree function)
6149 {
6150 /* The this pointer is always in x0. Note that this differs from
6151 Arm where the this pointer may be bumped to r1 if r0 is required
6152 to return a pointer to an aggregate. On AArch64 a result value
6153 pointer will be in x8. */
6154 int this_regno = R0_REGNUM;
6155 rtx this_rtx, temp0, temp1, addr, funexp;
6156 rtx_insn *insn;
6157
6158 if (aarch64_bti_enabled ())
6159 emit_insn (gen_bti_c());
6160
6161 reload_completed = 1;
6162 emit_note (NOTE_INSN_PROLOGUE_END);
6163
6164 this_rtx = gen_rtx_REG (Pmode, this_regno);
6165 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6166 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6167
6168 if (vcall_offset == 0)
6169 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6170 else
6171 {
6172 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6173
6174 addr = this_rtx;
6175 if (delta != 0)
6176 {
6177 if (delta >= -256 && delta < 256)
6178 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6179 plus_constant (Pmode, this_rtx, delta));
6180 else
6181 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6182 temp1, temp0, false);
6183 }
6184
6185 if (Pmode == ptr_mode)
6186 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6187 else
6188 aarch64_emit_move (temp0,
6189 gen_rtx_ZERO_EXTEND (Pmode,
6190 gen_rtx_MEM (ptr_mode, addr)));
6191
6192 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6193 addr = plus_constant (Pmode, temp0, vcall_offset);
6194 else
6195 {
6196 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6197 Pmode);
6198 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6199 }
6200
6201 if (Pmode == ptr_mode)
6202 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
6203 else
6204 aarch64_emit_move (temp1,
6205 gen_rtx_SIGN_EXTEND (Pmode,
6206 gen_rtx_MEM (ptr_mode, addr)));
6207
6208 emit_insn (gen_add2_insn (this_rtx, temp1));
6209 }
6210
6211 /* Generate a tail call to the target function. */
6212 if (!TREE_USED (function))
6213 {
6214 assemble_external (function);
6215 TREE_USED (function) = 1;
6216 }
6217 funexp = XEXP (DECL_RTL (function), 0);
6218 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6219 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6220 SIBLING_CALL_P (insn) = 1;
6221
6222 insn = get_insns ();
6223 shorten_branches (insn);
6224 final_start_function (insn, file, 1);
6225 final (insn, file, 1);
6226 final_end_function ();
6227
6228 /* Stop pretending to be a post-reload pass. */
6229 reload_completed = 0;
6230 }
6231
6232 static bool
6233 aarch64_tls_referenced_p (rtx x)
6234 {
6235 if (!TARGET_HAVE_TLS)
6236 return false;
6237 subrtx_iterator::array_type array;
6238 FOR_EACH_SUBRTX (iter, array, x, ALL)
6239 {
6240 const_rtx x = *iter;
6241 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6242 return true;
6243 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6244 TLS offsets, not real symbol references. */
6245 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6246 iter.skip_subrtxes ();
6247 }
6248 return false;
6249 }
6250
6251
6252 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6253 a left shift of 0 or 12 bits. */
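/* For example: 0xabc and 0xabc000 both satisfy this test (shift 0 and
   shift 12 respectively), whereas 0xabc00 does not because its set bits
   straddle the two 12-bit windows.  */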
6254 bool
6255 aarch64_uimm12_shift (HOST_WIDE_INT val)
6256 {
6257 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6258 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6259 );
6260 }
6261
6262 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6263 that can be created with a left shift of 0 or 12. */
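/* For example: 0x123456 does not fit in the low 12 bits, so the result is
   0x123000, i.e. the value rounded down to a multiple of 0x1000 that the
   shifted form can encode.  */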
6264 static HOST_WIDE_INT
6265 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6266 {
6267 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6268 handle correctly. */
6269 gcc_assert ((val & 0xffffff) == val);
6270
6271 if (((val & 0xfff) << 0) == val)
6272 return val;
6273
6274 return val & (0xfff << 12);
6275 }
6276
6277 /* Return true if val is an immediate that can be loaded into a
6278 register by a MOVZ instruction. */
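/* For example: 0x12340000 is accepted for DImode because all of its set
   bits fall within a single 16-bit field (a MOVZ with LSL 16), whereas
   0x12340001 is rejected because its set bits span two fields.  */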
6279 static bool
6280 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6281 {
6282 if (GET_MODE_SIZE (mode) > 4)
6283 {
6284 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6285 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6286 return 1;
6287 }
6288 else
6289 {
6290 /* Ignore sign extension. */
6291 val &= (HOST_WIDE_INT) 0xffffffff;
6292 }
6293 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6294 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6295 }
6296
6297 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6298 64-bit (DImode) integer. */
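/* For example: replicating the QImode value 0xab gives
   0xabababababababab, and the HImode value 0x00ff gives
   0x00ff00ff00ff00ff.  */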
6299
6300 static unsigned HOST_WIDE_INT
6301 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6302 {
6303 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6304 while (size < 64)
6305 {
6306 val &= (HOST_WIDE_INT_1U << size) - 1;
6307 val |= val << size;
6308 size *= 2;
6309 }
6310 return val;
6311 }
6312
6313 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6314
6315 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6316 {
6317 0x0000000100000001ull,
6318 0x0001000100010001ull,
6319 0x0101010101010101ull,
6320 0x1111111111111111ull,
6321 0x5555555555555555ull,
6322 };
6323
6324
6325 /* Return true if val is a valid bitmask immediate. */
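/* For example: 0x00ff00ff00ff00ff is a valid bitmask immediate (a run of
   eight ones repeated in every 16-bit chunk), whereas 0x1234 is not,
   since its set bits do not form a single rotated run; all-zeros and
   all-ones are rejected as well.  */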
6326
6327 bool
6328 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6329 {
6330 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6331 int bits;
6332
6333 /* Check for a single sequence of one bits and return quickly if so.
6334 The special cases of all ones and all zeroes return false. */
6335 val = aarch64_replicate_bitmask_imm (val_in, mode);
6336 tmp = val + (val & -val);
6337
6338 if (tmp == (tmp & -tmp))
6339 return (val + 1) > 1;
6340
6341 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6342 if (mode == SImode)
6343 val = (val << 32) | (val & 0xffffffff);
6344
6345 /* Invert if the immediate doesn't start with a zero bit - this means we
6346 only need to search for sequences of one bits. */
6347 if (val & 1)
6348 val = ~val;
6349
6350 /* Find the first set bit and set tmp to val with the first sequence of one
6351 bits removed. Return success if there is a single sequence of ones. */
6352 first_one = val & -val;
6353 tmp = val & (val + first_one);
6354
6355 if (tmp == 0)
6356 return true;
6357
6358 /* Find the next set bit and compute the difference in bit position. */
6359 next_one = tmp & -tmp;
6360 bits = clz_hwi (first_one) - clz_hwi (next_one);
6361 mask = val ^ tmp;
6362
6363 /* Check the bit position difference is a power of 2, and that the first
6364 sequence of one bits fits within 'bits' bits. */
6365 if ((mask >> bits) != 0 || bits != (bits & -bits))
6366 return false;
6367
6368 /* Check the sequence of one bits is repeated 64/bits times. */
6369 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6370 }
6371
6372 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6373 Assumed precondition: VAL_IN is not zero. */
6374
6375 unsigned HOST_WIDE_INT
6376 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6377 {
6378 int lowest_bit_set = ctz_hwi (val_in);
6379 int highest_bit_set = floor_log2 (val_in);
6380 gcc_assert (val_in != 0);
6381
6382 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6383 (HOST_WIDE_INT_1U << lowest_bit_set));
6384 }
6385
6386 /* Create constant where bits outside of lowest bit set to highest bit set
6387 are set to 1. */
6388
6389 unsigned HOST_WIDE_INT
6390 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6391 {
6392 return val_in | ~aarch64_and_split_imm1 (val_in);
6393 }
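/* Illustrative example of the split (values chosen for exposition): for
   VAL_IN == 0x00f0ff00, aarch64_and_split_imm1 gives 0x00ffff00 (ones
   from the lowest to the highest set bit) and aarch64_and_split_imm2
   gives 0xfffffffffff0ffff.  ANDing with those two masks in sequence is
   equivalent to ANDing with the original value, and both masks are
   encodable as bitmask immediates even though 0x00f0ff00 itself is
   not.  */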
6394
6395 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6396
6397 bool
6398 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6399 {
6400 scalar_int_mode int_mode;
6401 if (!is_a <scalar_int_mode> (mode, &int_mode))
6402 return false;
6403
6404 if (aarch64_bitmask_imm (val_in, int_mode))
6405 return false;
6406
6407 if (aarch64_move_imm (val_in, int_mode))
6408 return false;
6409
6410 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6411
6412 return aarch64_bitmask_imm (imm2, int_mode);
6413 }
6414
6415 /* Return true if val is an immediate that can be loaded into a
6416 register in a single instruction. */
6417 bool
6418 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6419 {
6420 scalar_int_mode int_mode;
6421 if (!is_a <scalar_int_mode> (mode, &int_mode))
6422 return false;
6423
6424 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6425 return 1;
6426 return aarch64_bitmask_imm (val, int_mode);
6427 }
6428
6429 static bool
6430 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6431 {
6432 rtx base, offset;
6433
6434 if (GET_CODE (x) == HIGH)
6435 return true;
6436
6437 /* There's no way to calculate VL-based values using relocations. */
6438 subrtx_iterator::array_type array;
6439 FOR_EACH_SUBRTX (iter, array, x, ALL)
6440 if (GET_CODE (*iter) == CONST_POLY_INT)
6441 return true;
6442
6443 split_const (x, &base, &offset);
6444 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6445 {
6446 if (aarch64_classify_symbol (base, INTVAL (offset))
6447 != SYMBOL_FORCE_TO_MEM)
6448 return true;
6449 else
6450 /* Avoid generating a 64-bit relocation in ILP32; leave
6451 to aarch64_expand_mov_immediate to handle it properly. */
6452 return mode != ptr_mode;
6453 }
6454
6455 return aarch64_tls_referenced_p (x);
6456 }
6457
6458 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6459 The expansion for a table switch is quite expensive due to the number
6460 of instructions, the table lookup and the hard-to-predict indirect jump.
6461 When optimizing for speed with -O3 enabled, use the per-core tuning if
6462 set, otherwise use tables for > 16 cases as a tradeoff between size and
6463 performance. When optimizing for size, use the default setting. */
6464
6465 static unsigned int
6466 aarch64_case_values_threshold (void)
6467 {
6468 /* Use the specified limit for the number of cases before using jump
6469 tables at higher optimization levels. */
6470 if (optimize > 2
6471 && selected_cpu->tune->max_case_values != 0)
6472 return selected_cpu->tune->max_case_values;
6473 else
6474 return optimize_size ? default_case_values_threshold () : 17;
6475 }
6476
6477 /* Return true if register REGNO is a valid index register.
6478 STRICT_P is true if REG_OK_STRICT is in effect. */
6479
6480 bool
6481 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6482 {
6483 if (!HARD_REGISTER_NUM_P (regno))
6484 {
6485 if (!strict_p)
6486 return true;
6487
6488 if (!reg_renumber)
6489 return false;
6490
6491 regno = reg_renumber[regno];
6492 }
6493 return GP_REGNUM_P (regno);
6494 }
6495
6496 /* Return true if register REGNO is a valid base register for mode MODE.
6497 STRICT_P is true if REG_OK_STRICT is in effect. */
6498
6499 bool
6500 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6501 {
6502 if (!HARD_REGISTER_NUM_P (regno))
6503 {
6504 if (!strict_p)
6505 return true;
6506
6507 if (!reg_renumber)
6508 return false;
6509
6510 regno = reg_renumber[regno];
6511 }
6512
6513 /* The fake registers will be eliminated to either the stack or
6514 hard frame pointer, both of which are usually valid base registers.
6515 Reload deals with the cases where the eliminated form isn't valid. */
6516 return (GP_REGNUM_P (regno)
6517 || regno == SP_REGNUM
6518 || regno == FRAME_POINTER_REGNUM
6519 || regno == ARG_POINTER_REGNUM);
6520 }
6521
6522 /* Return true if X is a valid base register for mode MODE.
6523 STRICT_P is true if REG_OK_STRICT is in effect. */
6524
6525 static bool
6526 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6527 {
6528 if (!strict_p
6529 && GET_CODE (x) == SUBREG
6530 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6531 x = SUBREG_REG (x);
6532
6533 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6534 }
6535
6536 /* Return true if address offset is a valid index. If it is, fill in INFO
6537 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
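/* For example (illustrative only): an index of the form
   (mult:DI (sign_extend:DI (reg:SI x1)) (const_int 8)) is classified
   below as ADDRESS_REG_SXTW with a shift of 3, corresponding to an
   addressing form such as [x0, w1, sxtw 3].  */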
6538
6539 static bool
6540 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6541 machine_mode mode, bool strict_p)
6542 {
6543 enum aarch64_address_type type;
6544 rtx index;
6545 int shift;
6546
6547 /* (reg:P) */
6548 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6549 && GET_MODE (x) == Pmode)
6550 {
6551 type = ADDRESS_REG_REG;
6552 index = x;
6553 shift = 0;
6554 }
6555 /* (sign_extend:DI (reg:SI)) */
6556 else if ((GET_CODE (x) == SIGN_EXTEND
6557 || GET_CODE (x) == ZERO_EXTEND)
6558 && GET_MODE (x) == DImode
6559 && GET_MODE (XEXP (x, 0)) == SImode)
6560 {
6561 type = (GET_CODE (x) == SIGN_EXTEND)
6562 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6563 index = XEXP (x, 0);
6564 shift = 0;
6565 }
6566 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6567 else if (GET_CODE (x) == MULT
6568 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6569 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6570 && GET_MODE (XEXP (x, 0)) == DImode
6571 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6572 && CONST_INT_P (XEXP (x, 1)))
6573 {
6574 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6575 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6576 index = XEXP (XEXP (x, 0), 0);
6577 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6578 }
6579 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6580 else if (GET_CODE (x) == ASHIFT
6581 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6582 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6583 && GET_MODE (XEXP (x, 0)) == DImode
6584 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6585 && CONST_INT_P (XEXP (x, 1)))
6586 {
6587 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6588 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6589 index = XEXP (XEXP (x, 0), 0);
6590 shift = INTVAL (XEXP (x, 1));
6591 }
6592 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6593 else if ((GET_CODE (x) == SIGN_EXTRACT
6594 || GET_CODE (x) == ZERO_EXTRACT)
6595 && GET_MODE (x) == DImode
6596 && GET_CODE (XEXP (x, 0)) == MULT
6597 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6598 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6599 {
6600 type = (GET_CODE (x) == SIGN_EXTRACT)
6601 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6602 index = XEXP (XEXP (x, 0), 0);
6603 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6604 if (INTVAL (XEXP (x, 1)) != 32 + shift
6605 || INTVAL (XEXP (x, 2)) != 0)
6606 shift = -1;
6607 }
6608 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6609 (const_int 0xffffffff<<shift)) */
6610 else if (GET_CODE (x) == AND
6611 && GET_MODE (x) == DImode
6612 && GET_CODE (XEXP (x, 0)) == MULT
6613 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6614 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6615 && CONST_INT_P (XEXP (x, 1)))
6616 {
6617 type = ADDRESS_REG_UXTW;
6618 index = XEXP (XEXP (x, 0), 0);
6619 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6620 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6621 shift = -1;
6622 }
6623 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6624 else if ((GET_CODE (x) == SIGN_EXTRACT
6625 || GET_CODE (x) == ZERO_EXTRACT)
6626 && GET_MODE (x) == DImode
6627 && GET_CODE (XEXP (x, 0)) == ASHIFT
6628 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6629 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6630 {
6631 type = (GET_CODE (x) == SIGN_EXTRACT)
6632 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6633 index = XEXP (XEXP (x, 0), 0);
6634 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6635 if (INTVAL (XEXP (x, 1)) != 32 + shift
6636 || INTVAL (XEXP (x, 2)) != 0)
6637 shift = -1;
6638 }
6639 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6640 (const_int 0xffffffff<<shift)) */
6641 else if (GET_CODE (x) == AND
6642 && GET_MODE (x) == DImode
6643 && GET_CODE (XEXP (x, 0)) == ASHIFT
6644 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6645 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6646 && CONST_INT_P (XEXP (x, 1)))
6647 {
6648 type = ADDRESS_REG_UXTW;
6649 index = XEXP (XEXP (x, 0), 0);
6650 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6651 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6652 shift = -1;
6653 }
6654 /* (mult:P (reg:P) (const_int scale)) */
6655 else if (GET_CODE (x) == MULT
6656 && GET_MODE (x) == Pmode
6657 && GET_MODE (XEXP (x, 0)) == Pmode
6658 && CONST_INT_P (XEXP (x, 1)))
6659 {
6660 type = ADDRESS_REG_REG;
6661 index = XEXP (x, 0);
6662 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6663 }
6664 /* (ashift:P (reg:P) (const_int shift)) */
6665 else if (GET_CODE (x) == ASHIFT
6666 && GET_MODE (x) == Pmode
6667 && GET_MODE (XEXP (x, 0)) == Pmode
6668 && CONST_INT_P (XEXP (x, 1)))
6669 {
6670 type = ADDRESS_REG_REG;
6671 index = XEXP (x, 0);
6672 shift = INTVAL (XEXP (x, 1));
6673 }
6674 else
6675 return false;
6676
6677 if (!strict_p
6678 && GET_CODE (index) == SUBREG
6679 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6680 index = SUBREG_REG (index);
6681
6682 if (aarch64_sve_data_mode_p (mode))
6683 {
6684 if (type != ADDRESS_REG_REG
6685 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6686 return false;
6687 }
6688 else
6689 {
6690 if (shift != 0
6691 && !(IN_RANGE (shift, 1, 3)
6692 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6693 return false;
6694 }
6695
6696 if (REG_P (index)
6697 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6698 {
6699 info->type = type;
6700 info->offset = index;
6701 info->shift = shift;
6702 return true;
6703 }
6704
6705 return false;
6706 }
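
/* Illustrative sketch of a classification (the RTL below is schematic):
   for a DImode access, the index expression
     (ashift:DI (reg:DI x1) (const_int 3))
   is accepted as ADDRESS_REG_REG with shift 3, since 1 << 3 matches the
   8-byte access size; it corresponds to the operand [xN, x1, lsl 3].
   A (sign_extend:DI (reg:SI w1)) index is classified as ADDRESS_REG_SXTW
   with shift 0 instead.  */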
6707
6708 /* Return true if MODE is one of the modes for which we
6709 support LDP/STP operations. */
6710
6711 static bool
6712 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6713 {
6714 return mode == SImode || mode == DImode
6715 || mode == SFmode || mode == DFmode
6716 || (aarch64_vector_mode_supported_p (mode)
6717 && (known_eq (GET_MODE_SIZE (mode), 8)
6718 || (known_eq (GET_MODE_SIZE (mode), 16)
6719 && (aarch64_tune_params.extra_tuning_flags
6720 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6721 }
6722
6723 /* Return true if REGNO is a virtual pointer register, or an eliminable
6724 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6725 include stack_pointer or hard_frame_pointer. */
6726 static bool
6727 virt_or_elim_regno_p (unsigned regno)
6728 {
6729 return ((regno >= FIRST_VIRTUAL_REGISTER
6730 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6731 || regno == FRAME_POINTER_REGNUM
6732 || regno == ARG_POINTER_REGNUM);
6733 }
6734
6735 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6736 If it is, fill in INFO appropriately. STRICT_P is true if
6737 REG_OK_STRICT is in effect. */
6738
6739 bool
6740 aarch64_classify_address (struct aarch64_address_info *info,
6741 rtx x, machine_mode mode, bool strict_p,
6742 aarch64_addr_query_type type)
6743 {
6744 enum rtx_code code = GET_CODE (x);
6745 rtx op0, op1;
6746 poly_int64 offset;
6747
6748 HOST_WIDE_INT const_size;
6749
6750 /* On BE, we use load/store pair for all large int mode load/stores.
6751 TI/TFmode may also use a load/store pair. */
6752 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6753 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6754 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6755 || type == ADDR_QUERY_LDP_STP_N
6756 || mode == TImode
6757 || mode == TFmode
6758 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6759
6760 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
6761 size of the memory being loaded/stored, and the mode used for the
6762 address calculation is half of that size. */
6763 if (type == ADDR_QUERY_LDP_STP_N
6764 && known_eq (GET_MODE_SIZE (mode), 16))
6765 mode = DFmode;
6766
6767 bool allow_reg_index_p = (!load_store_pair_p
6768 && (known_lt (GET_MODE_SIZE (mode), 16)
6769 || vec_flags == VEC_ADVSIMD
6770 || vec_flags == VEC_SVE_DATA));
6771
6772 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6773 [Rn, #offset, MUL VL]. */
6774 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6775 && (code != REG && code != PLUS))
6776 return false;
6777
6778 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6779 REG addressing. */
6780 if (advsimd_struct_p
6781 && !BYTES_BIG_ENDIAN
6782 && (code != POST_INC && code != REG))
6783 return false;
6784
6785 gcc_checking_assert (GET_MODE (x) == VOIDmode
6786 || SCALAR_INT_MODE_P (GET_MODE (x)));
6787
6788 switch (code)
6789 {
6790 case REG:
6791 case SUBREG:
6792 info->type = ADDRESS_REG_IMM;
6793 info->base = x;
6794 info->offset = const0_rtx;
6795 info->const_offset = 0;
6796 return aarch64_base_register_rtx_p (x, strict_p);
6797
6798 case PLUS:
6799 op0 = XEXP (x, 0);
6800 op1 = XEXP (x, 1);
6801
6802 if (! strict_p
6803 && REG_P (op0)
6804 && virt_or_elim_regno_p (REGNO (op0))
6805 && poly_int_rtx_p (op1, &offset))
6806 {
6807 info->type = ADDRESS_REG_IMM;
6808 info->base = op0;
6809 info->offset = op1;
6810 info->const_offset = offset;
6811
6812 return true;
6813 }
6814
6815 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6816 && aarch64_base_register_rtx_p (op0, strict_p)
6817 && poly_int_rtx_p (op1, &offset))
6818 {
6819 info->type = ADDRESS_REG_IMM;
6820 info->base = op0;
6821 info->offset = op1;
6822 info->const_offset = offset;
6823
6824 /* TImode and TFmode values are allowed in both pairs of X
6825 registers and individual Q registers. The available
6826 address modes are:
6827 X,X: 7-bit signed scaled offset
6828 Q: 9-bit signed offset
6829 We conservatively require an offset representable in either mode.
6830 When performing the check for pairs of X registers i.e. LDP/STP
6831 pass down DImode since that is the natural size of the LDP/STP
6832 instruction memory accesses. */
6833 if (mode == TImode || mode == TFmode)
6834 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6835 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6836 || offset_12bit_unsigned_scaled_p (mode, offset)));
6837
6838 /* A 7-bit offset check because OImode will emit an ldp/stp
6839 instruction (only big endian will get here).
6840 For ldp/stp instructions, the offset is scaled by the size of a
6841 single element of the pair. */
6842 if (mode == OImode)
6843 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6844
6845 /* Three 9/12-bit offset checks because CImode will emit three
6846 ldr/str instructions (only big endian will get here). */
6847 if (mode == CImode)
6848 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6849 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6850 offset + 32)
6851 || offset_12bit_unsigned_scaled_p (V16QImode,
6852 offset + 32)));
6853
6854 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6855 instructions (only big endian will get here). */
6856 if (mode == XImode)
6857 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6858 && aarch64_offset_7bit_signed_scaled_p (TImode,
6859 offset + 32));
6860
6861 /* Make "m" use the LD1 offset range for SVE data modes, so
6862 that pre-RTL optimizers like ivopts will work to that
6863 instead of the wider LDR/STR range. */
6864 if (vec_flags == VEC_SVE_DATA)
6865 return (type == ADDR_QUERY_M
6866 ? offset_4bit_signed_scaled_p (mode, offset)
6867 : offset_9bit_signed_scaled_p (mode, offset));
6868
6869 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6870 {
6871 poly_int64 end_offset = (offset
6872 + GET_MODE_SIZE (mode)
6873 - BYTES_PER_SVE_VECTOR);
6874 return (type == ADDR_QUERY_M
6875 ? offset_4bit_signed_scaled_p (mode, offset)
6876 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6877 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6878 end_offset)));
6879 }
6880
6881 if (vec_flags == VEC_SVE_PRED)
6882 return offset_9bit_signed_scaled_p (mode, offset);
6883
6884 if (load_store_pair_p)
6885 return ((known_eq (GET_MODE_SIZE (mode), 4)
6886 || known_eq (GET_MODE_SIZE (mode), 8)
6887 || known_eq (GET_MODE_SIZE (mode), 16))
6888 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6889 else
6890 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6891 || offset_12bit_unsigned_scaled_p (mode, offset));
6892 }
6893
6894 if (allow_reg_index_p)
6895 {
6896 /* Look for base + (scaled/extended) index register. */
6897 if (aarch64_base_register_rtx_p (op0, strict_p)
6898 && aarch64_classify_index (info, op1, mode, strict_p))
6899 {
6900 info->base = op0;
6901 return true;
6902 }
6903 if (aarch64_base_register_rtx_p (op1, strict_p)
6904 && aarch64_classify_index (info, op0, mode, strict_p))
6905 {
6906 info->base = op1;
6907 return true;
6908 }
6909 }
6910
6911 return false;
6912
6913 case POST_INC:
6914 case POST_DEC:
6915 case PRE_INC:
6916 case PRE_DEC:
6917 info->type = ADDRESS_REG_WB;
6918 info->base = XEXP (x, 0);
6919 info->offset = NULL_RTX;
6920 return aarch64_base_register_rtx_p (info->base, strict_p);
6921
6922 case POST_MODIFY:
6923 case PRE_MODIFY:
6924 info->type = ADDRESS_REG_WB;
6925 info->base = XEXP (x, 0);
6926 if (GET_CODE (XEXP (x, 1)) == PLUS
6927 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6928 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6929 && aarch64_base_register_rtx_p (info->base, strict_p))
6930 {
6931 info->offset = XEXP (XEXP (x, 1), 1);
6932 info->const_offset = offset;
6933
6934 /* TImode and TFmode values are allowed in both pairs of X
6935 registers and individual Q registers. The available
6936 address modes are:
6937 X,X: 7-bit signed scaled offset
6938 Q: 9-bit signed offset
6939 We conservatively require an offset representable in either mode.
6940 */
6941 if (mode == TImode || mode == TFmode)
6942 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6943 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6944
6945 if (load_store_pair_p)
6946 return ((known_eq (GET_MODE_SIZE (mode), 4)
6947 || known_eq (GET_MODE_SIZE (mode), 8)
6948 || known_eq (GET_MODE_SIZE (mode), 16))
6949 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6950 else
6951 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6952 }
6953 return false;
6954
6955 case CONST:
6956 case SYMBOL_REF:
6957 case LABEL_REF:
6958 /* load literal: pc-relative constant pool entry. Only supported
6959 for SI mode or larger. */
6960 info->type = ADDRESS_SYMBOLIC;
6961
6962 if (!load_store_pair_p
6963 && GET_MODE_SIZE (mode).is_constant (&const_size)
6964 && const_size >= 4)
6965 {
6966 rtx sym, addend;
6967
6968 split_const (x, &sym, &addend);
6969 return ((GET_CODE (sym) == LABEL_REF
6970 || (GET_CODE (sym) == SYMBOL_REF
6971 && CONSTANT_POOL_ADDRESS_P (sym)
6972 && aarch64_pcrelative_literal_loads)));
6973 }
6974 return false;
6975
6976 case LO_SUM:
6977 info->type = ADDRESS_LO_SUM;
6978 info->base = XEXP (x, 0);
6979 info->offset = XEXP (x, 1);
6980 if (allow_reg_index_p
6981 && aarch64_base_register_rtx_p (info->base, strict_p))
6982 {
6983 rtx sym, offs;
6984 split_const (info->offset, &sym, &offs);
6985 if (GET_CODE (sym) == SYMBOL_REF
6986 && (aarch64_classify_symbol (sym, INTVAL (offs))
6987 == SYMBOL_SMALL_ABSOLUTE))
6988 {
6989 /* The symbol and offset must be aligned to the access size. */
6990 unsigned int align;
6991
6992 if (CONSTANT_POOL_ADDRESS_P (sym))
6993 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6994 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6995 {
6996 tree exp = SYMBOL_REF_DECL (sym);
6997 align = TYPE_ALIGN (TREE_TYPE (exp));
6998 align = aarch64_constant_alignment (exp, align);
6999 }
7000 else if (SYMBOL_REF_DECL (sym))
7001 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7002 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7003 && SYMBOL_REF_BLOCK (sym) != NULL)
7004 align = SYMBOL_REF_BLOCK (sym)->alignment;
7005 else
7006 align = BITS_PER_UNIT;
7007
7008 poly_int64 ref_size = GET_MODE_SIZE (mode);
7009 if (known_eq (ref_size, 0))
7010 ref_size = GET_MODE_SIZE (DImode);
7011
7012 return (multiple_p (INTVAL (offs), ref_size)
7013 && multiple_p (align / BITS_PER_UNIT, ref_size));
7014 }
7015 }
7016 return false;
7017
7018 default:
7019 return false;
7020 }
7021 }
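
/* As an illustration of the scalar immediate-offset ranges checked above:
   for a DImode access the unscaled signed 9-bit form covers offsets
   -256..255, while the scaled unsigned 12-bit form covers multiples of 8
   from 0 to 32760 (4095 * 8).  An offset of 32760 therefore classifies as
   ADDRESS_REG_IMM, whereas 32761 is rejected by both checks.  */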
7022
7023 /* Return true if the address X is valid for a PRFM instruction.
7024 STRICT_P is true if we should do strict checking with
7025 aarch64_classify_address. */
7026
7027 bool
7028 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7029 {
7030 struct aarch64_address_info addr;
7031
7032 /* PRFM accepts the same addresses as DImode... */
7033 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7034 if (!res)
7035 return false;
7036
7037 /* ... except writeback forms. */
7038 return addr.type != ADDRESS_REG_WB;
7039 }
7040
7041 bool
7042 aarch64_symbolic_address_p (rtx x)
7043 {
7044 rtx offset;
7045
7046 split_const (x, &x, &offset);
7047 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7048 }
7049
7050 /* Classify the base of symbolic expression X. */
7051
7052 enum aarch64_symbol_type
7053 aarch64_classify_symbolic_expression (rtx x)
7054 {
7055 rtx offset;
7056
7057 split_const (x, &x, &offset);
7058 return aarch64_classify_symbol (x, INTVAL (offset));
7059 }
7060
7061
7062 /* Return TRUE if X is a legitimate address for accessing memory in
7063 mode MODE. */
7064 static bool
7065 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7066 {
7067 struct aarch64_address_info addr;
7068
7069 return aarch64_classify_address (&addr, x, mode, strict_p);
7070 }
7071
7072 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7073 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7074 bool
7075 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7076 aarch64_addr_query_type type)
7077 {
7078 struct aarch64_address_info addr;
7079
7080 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7081 }
7082
7083 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7084
7085 static bool
7086 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7087 poly_int64 orig_offset,
7088 machine_mode mode)
7089 {
7090 HOST_WIDE_INT size;
7091 if (GET_MODE_SIZE (mode).is_constant (&size))
7092 {
7093 HOST_WIDE_INT const_offset, second_offset;
7094
7095 /* A general SVE offset is A * VQ + B. Remove the A component from
7096 coefficient 0 in order to get the constant B. */
7097 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7098
7099 /* Split an out-of-range address displacement into a base and
7100 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7101 range otherwise to increase opportunities for sharing the base
7102 address of different sizes. Unaligned accesses use the signed
7103 9-bit range, TImode/TFmode use the intersection of signed
7104 scaled 7-bit and signed 9-bit offset. */
7105 if (mode == TImode || mode == TFmode)
7106 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7107 else if ((const_offset & (size - 1)) != 0)
7108 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7109 else
7110 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7111
7112 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7113 return false;
7114
7115 /* Split the offset into second_offset and the rest. */
7116 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7117 *offset2 = gen_int_mode (second_offset, Pmode);
7118 return true;
7119 }
7120 else
7121 {
7122 /* Get the mode we should use as the basis of the range. For structure
7123 modes this is the mode of one vector. */
7124 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7125 machine_mode step_mode
7126 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7127
7128 /* Get the "mul vl" multiplier we'd like to use. */
7129 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7130 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7131 if (vec_flags & VEC_SVE_DATA)
7132 /* LDR supports a 9-bit range, but the move patterns for
7133 structure modes require all vectors to be in range of the
7134 same base.  The simplest way of accommodating that while still
7135 promoting reuse of anchor points between different modes is
7136 to use an 8-bit range unconditionally. */
7137 vnum = ((vnum + 128) & 255) - 128;
7138 else
7139 /* Predicates are only handled singly, so we might as well use
7140 the full range. */
7141 vnum = ((vnum + 256) & 511) - 256;
7142 if (vnum == 0)
7143 return false;
7144
7145 /* Convert the "mul vl" multiplier into a byte offset. */
7146 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7147 if (known_eq (second_offset, orig_offset))
7148 return false;
7149
7150 /* Split the offset into second_offset and the rest. */
7151 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7152 *offset2 = gen_int_mode (second_offset, Pmode);
7153 return true;
7154 }
7155 }
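
/* Worked example for the constant-size path above: a DImode access at
   byte offset 0x12348 is aligned, so second_offset = 0x12348 & 0x3ffc
   = 0x2348.  The displacement is split as *offset1 = 0x10000 and
   *offset2 = 0x2348; the 0x10000 anchor can then be shared with nearby
   accesses while 0x2348 stays within the scaled 12-bit LDR/STR range.  */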
7156
7157 /* Return the binary representation of floating point constant VALUE in INTVAL.
7158 If the value cannot be converted, return false without setting INTVAL.
7159 The conversion is done in the mode of VALUE. */
7160 bool
7161 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7162 {
7163
7164 /* We make a general exception for 0. */
7165 if (aarch64_float_const_zero_rtx_p (value))
7166 {
7167 *intval = 0;
7168 return true;
7169 }
7170
7171 scalar_float_mode mode;
7172 if (GET_CODE (value) != CONST_DOUBLE
7173 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7174 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7175 /* Only support up to DF mode. */
7176 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7177 return false;
7178
7179 unsigned HOST_WIDE_INT ival = 0;
7180
7181 long res[2];
7182 real_to_target (res,
7183 CONST_DOUBLE_REAL_VALUE (value),
7184 REAL_MODE_FORMAT (mode));
7185
7186 if (mode == DFmode)
7187 {
7188 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7189 ival = zext_hwi (res[order], 32);
7190 ival |= (zext_hwi (res[1 - order], 32) << 32);
7191 }
7192 else
7193 ival = zext_hwi (res[0], 32);
7194
7195 *intval = ival;
7196 return true;
7197 }
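
/* For instance, the DFmode constant 1.0 has the IEEE-754 bit pattern
   0x3ff0000000000000; real_to_target returns it as two 32-bit halves,
   which the code above reassembles in the right order so that
   *intval == 0x3ff0000000000000 on both endiannesses.  */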
7198
7199 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7200 single MOV(+MOVK) followed by an FMOV. */
7201 bool
7202 aarch64_float_const_rtx_p (rtx x)
7203 {
7204 machine_mode mode = GET_MODE (x);
7205 if (mode == VOIDmode)
7206 return false;
7207
7208 /* Determine whether it is cheaper to write float constants as
7209 mov/movk pairs than as ldr/adrp pairs. */
7210 unsigned HOST_WIDE_INT ival;
7211
7212 if (GET_CODE (x) == CONST_DOUBLE
7213 && SCALAR_FLOAT_MODE_P (mode)
7214 && aarch64_reinterpret_float_as_int (x, &ival))
7215 {
7216 scalar_int_mode imode = (mode == HFmode
7217 ? SImode
7218 : int_mode_for_mode (mode).require ());
7219 int num_instr = aarch64_internal_mov_immediate
7220 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7221 return num_instr < 3;
7222 }
7223
7224 return false;
7225 }
7226
7227 /* Return TRUE if rtx X is the immediate constant 0.0. */
7228 bool
7229 aarch64_float_const_zero_rtx_p (rtx x)
7230 {
7231 if (GET_MODE (x) == VOIDmode)
7232 return false;
7233
7234 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7235 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7236 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7237 }
7238
7239 /* Return TRUE if rtx X is an immediate constant that fits in a single
7240 MOVI immediate operation. */
7241 bool
7242 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7243 {
7244 if (!TARGET_SIMD)
7245 return false;
7246
7247 machine_mode vmode;
7248 scalar_int_mode imode;
7249 unsigned HOST_WIDE_INT ival;
7250
7251 if (GET_CODE (x) == CONST_DOUBLE
7252 && SCALAR_FLOAT_MODE_P (mode))
7253 {
7254 if (!aarch64_reinterpret_float_as_int (x, &ival))
7255 return false;
7256
7257 /* We make a general exception for 0. */
7258 if (aarch64_float_const_zero_rtx_p (x))
7259 return true;
7260
7261 imode = int_mode_for_mode (mode).require ();
7262 }
7263 else if (GET_CODE (x) == CONST_INT
7264 && is_a <scalar_int_mode> (mode, &imode))
7265 ival = INTVAL (x);
7266 else
7267 return false;
7268
7269 /* Use a 64-bit vector mode for everything except DI/DF mode, where we
7270 use a 128-bit vector mode. */
7271 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7272
7273 vmode = aarch64_simd_container_mode (imode, width);
7274 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7275
7276 return aarch64_simd_valid_immediate (v_op, NULL);
7277 }
7278
7279
7280 /* Return the fixed registers used for condition codes. */
7281
7282 static bool
7283 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7284 {
7285 *p1 = CC_REGNUM;
7286 *p2 = INVALID_REGNUM;
7287 return true;
7288 }
7289
7290 /* This function is used by the call expanders of the machine description.
7291 RESULT is the register in which the result is returned. It's NULL for
7292 "call" and "sibcall".
7293 MEM is the location of the function call.
7294 SIBCALL indicates whether this function call is a normal call or a sibling call.
7295 A different pattern is generated accordingly. */
7296
7297 void
7298 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7299 {
7300 rtx call, callee, tmp;
7301 rtvec vec;
7302 machine_mode mode;
7303
7304 gcc_assert (MEM_P (mem));
7305 callee = XEXP (mem, 0);
7306 mode = GET_MODE (callee);
7307 gcc_assert (mode == Pmode);
7308
7309 /* Decide if we should generate indirect calls by loading the
7310 address of the callee into a register before performing
7311 the branch-and-link. */
7312 if (SYMBOL_REF_P (callee)
7313 ? (aarch64_is_long_call_p (callee)
7314 || aarch64_is_noplt_call_p (callee))
7315 : !REG_P (callee))
7316 XEXP (mem, 0) = force_reg (mode, callee);
7317
7318 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7319
7320 if (result != NULL_RTX)
7321 call = gen_rtx_SET (result, call);
7322
7323 if (sibcall)
7324 tmp = ret_rtx;
7325 else
7326 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7327
7328 vec = gen_rtvec (2, call, tmp);
7329 call = gen_rtx_PARALLEL (VOIDmode, vec);
7330
7331 aarch64_emit_call_insn (call);
7332 }
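
/* Schematic sketch of the RTL emitted for a normal call that returns a
   value (register numbers and modes elided):

     (parallel [(set (reg x0) (call (mem (symbol_ref "foo")) (const_int 0)))
                (clobber (reg LR_REGNUM))])

   A sibcall uses (return) in place of the LR clobber.  */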
7333
7334 /* Emit call insn with PAT and do aarch64-specific handling. */
7335
7336 void
7337 aarch64_emit_call_insn (rtx pat)
7338 {
7339 rtx insn = emit_call_insn (pat);
7340
7341 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7342 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7343 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7344 }
7345
7346 machine_mode
7347 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7348 {
7349 machine_mode mode_x = GET_MODE (x);
7350 rtx_code code_x = GET_CODE (x);
7351
7352 /* All floating point compares return CCFP if it is an equality
7353 comparison, and CCFPE otherwise. */
7354 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7355 {
7356 switch (code)
7357 {
7358 case EQ:
7359 case NE:
7360 case UNORDERED:
7361 case ORDERED:
7362 case UNLT:
7363 case UNLE:
7364 case UNGT:
7365 case UNGE:
7366 case UNEQ:
7367 return CCFPmode;
7368
7369 case LT:
7370 case LE:
7371 case GT:
7372 case GE:
7373 case LTGT:
7374 return CCFPEmode;
7375
7376 default:
7377 gcc_unreachable ();
7378 }
7379 }
7380
7381 /* Equality comparisons of short modes against zero can be performed
7382 using the TST instruction with the appropriate bitmask. */
7383 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7384 && (code == EQ || code == NE)
7385 && (mode_x == HImode || mode_x == QImode))
7386 return CC_NZmode;
7387
7388 /* Similarly, comparisons of zero_extends from shorter modes can
7389 be performed using an ANDS with an immediate mask. */
7390 if (y == const0_rtx && code_x == ZERO_EXTEND
7391 && (mode_x == SImode || mode_x == DImode)
7392 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7393 && (code == EQ || code == NE))
7394 return CC_NZmode;
7395
7396 if ((mode_x == SImode || mode_x == DImode)
7397 && y == const0_rtx
7398 && (code == EQ || code == NE || code == LT || code == GE)
7399 && (code_x == PLUS || code_x == MINUS || code_x == AND
7400 || code_x == NEG
7401 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7402 && CONST_INT_P (XEXP (x, 2)))))
7403 return CC_NZmode;
7404
7405 /* A compare with a shifted operand. Because of canonicalization,
7406 the comparison will have to be swapped when we emit the assembly
7407 code. */
7408 if ((mode_x == SImode || mode_x == DImode)
7409 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7410 && (code_x == ASHIFT || code_x == ASHIFTRT
7411 || code_x == LSHIFTRT
7412 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7413 return CC_SWPmode;
7414
7415 /* Similarly for a negated operand, but we can only do this for
7416 equalities. */
7417 if ((mode_x == SImode || mode_x == DImode)
7418 && (REG_P (y) || GET_CODE (y) == SUBREG)
7419 && (code == EQ || code == NE)
7420 && code_x == NEG)
7421 return CC_Zmode;
7422
7423 /* A test for unsigned overflow from an addition. */
7424 if ((mode_x == DImode || mode_x == TImode)
7425 && (code == LTU || code == GEU)
7426 && code_x == PLUS
7427 && rtx_equal_p (XEXP (x, 0), y))
7428 return CC_Cmode;
7429
7430 /* A test for unsigned overflow from an add with carry. */
7431 if ((mode_x == DImode || mode_x == TImode)
7432 && (code == LTU || code == GEU)
7433 && code_x == PLUS
7434 && CONST_SCALAR_INT_P (y)
7435 && (rtx_mode_t (y, mode_x)
7436 == (wi::shwi (1, mode_x)
7437 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7438 return CC_ADCmode;
7439
7440 /* A test for signed overflow. */
7441 if ((mode_x == DImode || mode_x == TImode)
7442 && code == NE
7443 && code_x == PLUS
7444 && GET_CODE (y) == SIGN_EXTEND)
7445 return CC_Vmode;
7446
7447 /* For everything else, return CCmode. */
7448 return CCmode;
7449 }
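
/* Example: the comparison (eq (zero_extend:SI (reg:HI 0)) (const_int 0))
   returns CC_NZmode, allowing it to be emitted as a single ANDS/TST with
   a 0xffff mask rather than a separate extend followed by a compare.  */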
7450
7451 static int
7452 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7453
7454 int
7455 aarch64_get_condition_code (rtx x)
7456 {
7457 machine_mode mode = GET_MODE (XEXP (x, 0));
7458 enum rtx_code comp_code = GET_CODE (x);
7459
7460 if (GET_MODE_CLASS (mode) != MODE_CC)
7461 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7462 return aarch64_get_condition_code_1 (mode, comp_code);
7463 }
7464
7465 static int
7466 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7467 {
7468 switch (mode)
7469 {
7470 case E_CCFPmode:
7471 case E_CCFPEmode:
7472 switch (comp_code)
7473 {
7474 case GE: return AARCH64_GE;
7475 case GT: return AARCH64_GT;
7476 case LE: return AARCH64_LS;
7477 case LT: return AARCH64_MI;
7478 case NE: return AARCH64_NE;
7479 case EQ: return AARCH64_EQ;
7480 case ORDERED: return AARCH64_VC;
7481 case UNORDERED: return AARCH64_VS;
7482 case UNLT: return AARCH64_LT;
7483 case UNLE: return AARCH64_LE;
7484 case UNGT: return AARCH64_HI;
7485 case UNGE: return AARCH64_PL;
7486 default: return -1;
7487 }
7488 break;
7489
7490 case E_CCmode:
7491 switch (comp_code)
7492 {
7493 case NE: return AARCH64_NE;
7494 case EQ: return AARCH64_EQ;
7495 case GE: return AARCH64_GE;
7496 case GT: return AARCH64_GT;
7497 case LE: return AARCH64_LE;
7498 case LT: return AARCH64_LT;
7499 case GEU: return AARCH64_CS;
7500 case GTU: return AARCH64_HI;
7501 case LEU: return AARCH64_LS;
7502 case LTU: return AARCH64_CC;
7503 default: return -1;
7504 }
7505 break;
7506
7507 case E_CC_SWPmode:
7508 switch (comp_code)
7509 {
7510 case NE: return AARCH64_NE;
7511 case EQ: return AARCH64_EQ;
7512 case GE: return AARCH64_LE;
7513 case GT: return AARCH64_LT;
7514 case LE: return AARCH64_GE;
7515 case LT: return AARCH64_GT;
7516 case GEU: return AARCH64_LS;
7517 case GTU: return AARCH64_CC;
7518 case LEU: return AARCH64_CS;
7519 case LTU: return AARCH64_HI;
7520 default: return -1;
7521 }
7522 break;
7523
7524 case E_CC_NZmode:
7525 switch (comp_code)
7526 {
7527 case NE: return AARCH64_NE;
7528 case EQ: return AARCH64_EQ;
7529 case GE: return AARCH64_PL;
7530 case LT: return AARCH64_MI;
7531 default: return -1;
7532 }
7533 break;
7534
7535 case E_CC_Zmode:
7536 switch (comp_code)
7537 {
7538 case NE: return AARCH64_NE;
7539 case EQ: return AARCH64_EQ;
7540 default: return -1;
7541 }
7542 break;
7543
7544 case E_CC_Cmode:
7545 switch (comp_code)
7546 {
7547 case LTU: return AARCH64_CS;
7548 case GEU: return AARCH64_CC;
7549 default: return -1;
7550 }
7551 break;
7552
7553 case E_CC_ADCmode:
7554 switch (comp_code)
7555 {
7556 case GEU: return AARCH64_CS;
7557 case LTU: return AARCH64_CC;
7558 default: return -1;
7559 }
7560 break;
7561
7562 case E_CC_Vmode:
7563 switch (comp_code)
7564 {
7565 case NE: return AARCH64_VS;
7566 case EQ: return AARCH64_VC;
7567 default: return -1;
7568 }
7569 break;
7570
7571 default:
7572 return -1;
7573 }
7574
7575 return -1;
7576 }
7577
7578 bool
7579 aarch64_const_vec_all_same_in_range_p (rtx x,
7580 HOST_WIDE_INT minval,
7581 HOST_WIDE_INT maxval)
7582 {
7583 rtx elt;
7584 return (const_vec_duplicate_p (x, &elt)
7585 && CONST_INT_P (elt)
7586 && IN_RANGE (INTVAL (elt), minval, maxval));
7587 }
7588
7589 bool
7590 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7591 {
7592 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7593 }
7594
7595 /* Return true if VEC is a constant in which every element is in the range
7596 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7597
7598 static bool
7599 aarch64_const_vec_all_in_range_p (rtx vec,
7600 HOST_WIDE_INT minval,
7601 HOST_WIDE_INT maxval)
7602 {
7603 if (GET_CODE (vec) != CONST_VECTOR
7604 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7605 return false;
7606
7607 int nunits;
7608 if (!CONST_VECTOR_STEPPED_P (vec))
7609 nunits = const_vector_encoded_nelts (vec);
7610 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7611 return false;
7612
7613 for (int i = 0; i < nunits; i++)
7614 {
7615 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7616 if (!CONST_INT_P (vec_elem)
7617 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7618 return false;
7619 }
7620 return true;
7621 }
7622
7623 /* N Z C V. */
7624 #define AARCH64_CC_V 1
7625 #define AARCH64_CC_C (1 << 1)
7626 #define AARCH64_CC_Z (1 << 2)
7627 #define AARCH64_CC_N (1 << 3)
7628
7629 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7630 static const int aarch64_nzcv_codes[] =
7631 {
7632 0, /* EQ, Z == 1. */
7633 AARCH64_CC_Z, /* NE, Z == 0. */
7634 0, /* CS, C == 1. */
7635 AARCH64_CC_C, /* CC, C == 0. */
7636 0, /* MI, N == 1. */
7637 AARCH64_CC_N, /* PL, N == 0. */
7638 0, /* VS, V == 1. */
7639 AARCH64_CC_V, /* VC, V == 0. */
7640 0, /* HI, C == 1 && Z == 0. */
7641 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7642 AARCH64_CC_V, /* GE, N == V. */
7643 0, /* LT, N != V. */
7644 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7645 0, /* LE, !(Z == 0 && N == V). */
7646 0, /* AL, Any. */
7647 0 /* NV, Any. */
7648 };
7649
7650 /* Print floating-point vector immediate operand X to F, negating it
7651 first if NEGATE is true. Return true on success, false if it isn't
7652 a constant we can handle. */
7653
7654 static bool
7655 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7656 {
7657 rtx elt;
7658
7659 if (!const_vec_duplicate_p (x, &elt))
7660 return false;
7661
7662 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7663 if (negate)
7664 r = real_value_negate (&r);
7665
7666 /* We only handle the SVE single-bit immediates here. */
7667 if (real_equal (&r, &dconst0))
7668 asm_fprintf (f, "0.0");
7669 else if (real_equal (&r, &dconst1))
7670 asm_fprintf (f, "1.0");
7671 else if (real_equal (&r, &dconsthalf))
7672 asm_fprintf (f, "0.5");
7673 else
7674 return false;
7675
7676 return true;
7677 }
7678
7679 /* Return the equivalent letter for size. */
7680 static char
7681 sizetochar (int size)
7682 {
7683 switch (size)
7684 {
7685 case 64: return 'd';
7686 case 32: return 's';
7687 case 16: return 'h';
7688 case 8 : return 'b';
7689 default: gcc_unreachable ();
7690 }
7691 }
7692
7693 /* Print operand X to file F in a target specific manner according to CODE.
7694 The acceptable formatting commands given by CODE are:
7695 'c': An integer or symbol address without a preceding #
7696 sign.
7697 'C': Take the duplicated element in a vector constant
7698 and print it in hex.
7699 'D': Take the duplicated element in a vector constant
7700 and print it as an unsigned integer, in decimal.
7701 'e': Print the sign/zero-extend size as a character 8->b,
7702 16->h, 32->w.
7703 'p': Prints N such that 2^N == X (X must be a power of 2 and
7704 a const_int).
7705 'P': Print the number of non-zero bits in X (a const_int).
7706 'H': Print the higher numbered register of a pair (TImode)
7707 of regs.
7708 'm': Print a condition (eq, ne, etc).
7709 'M': Same as 'm', but invert condition.
7710 'N': Take the duplicated element in a vector constant
7711 and print the negative of it in decimal.
7712 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7713 'S/T/U/V': Print a FP/SIMD register name for a register list.
7714 The register printed is the FP/SIMD register name
7715 of X + 0/1/2/3 for S/T/U/V.
7716 'R': Print a scalar Integer/FP/SIMD register name + 1.
7717 'X': Print bottom 16 bits of integer constant in hex.
7718 'w/x': Print a general register name or the zero register
7719 (32-bit or 64-bit).
7720 '0': Print a normal operand; if it is a general register,
7721 DImode is assumed.
7722 'k': Print NZCV for conditional compare instructions.
7723 'A': Output address constant representing the first
7724 argument of X, specifying a relocation offset
7725 if appropriate.
7726 'L': Output constant address specified by X
7727 with a relocation offset if appropriate.
7728 'G': Prints address of X, specifying a PC relative
7729 relocation mode if appropriate.
7730 'y': Output address of LDP or STP - this is used for
7731 some LDP/STPs which don't use a PARALLEL in their
7732 pattern (so the mode needs to be adjusted).
7733 'z': Output address of a typical LDP or STP. */
7734
7735 static void
7736 aarch64_print_operand (FILE *f, rtx x, int code)
7737 {
7738 rtx elt;
7739 switch (code)
7740 {
7741 case 'c':
7742 switch (GET_CODE (x))
7743 {
7744 case CONST_INT:
7745 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7746 break;
7747
7748 case SYMBOL_REF:
7749 output_addr_const (f, x);
7750 break;
7751
7752 case CONST:
7753 if (GET_CODE (XEXP (x, 0)) == PLUS
7754 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7755 {
7756 output_addr_const (f, x);
7757 break;
7758 }
7759 /* Fall through. */
7760
7761 default:
7762 output_operand_lossage ("unsupported operand for code '%c'", code);
7763 }
7764 break;
7765
7766 case 'e':
7767 {
7768 int n;
7769
7770 if (!CONST_INT_P (x)
7771 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7772 {
7773 output_operand_lossage ("invalid operand for '%%%c'", code);
7774 return;
7775 }
7776
7777 switch (n)
7778 {
7779 case 3:
7780 fputc ('b', f);
7781 break;
7782 case 4:
7783 fputc ('h', f);
7784 break;
7785 case 5:
7786 fputc ('w', f);
7787 break;
7788 default:
7789 output_operand_lossage ("invalid operand for '%%%c'", code);
7790 return;
7791 }
7792 }
7793 break;
7794
7795 case 'p':
7796 {
7797 int n;
7798
7799 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7800 {
7801 output_operand_lossage ("invalid operand for '%%%c'", code);
7802 return;
7803 }
7804
7805 asm_fprintf (f, "%d", n);
7806 }
7807 break;
7808
7809 case 'P':
7810 if (!CONST_INT_P (x))
7811 {
7812 output_operand_lossage ("invalid operand for '%%%c'", code);
7813 return;
7814 }
7815
7816 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7817 break;
7818
7819 case 'H':
7820 if (x == const0_rtx)
7821 {
7822 asm_fprintf (f, "xzr");
7823 break;
7824 }
7825
7826 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7827 {
7828 output_operand_lossage ("invalid operand for '%%%c'", code);
7829 return;
7830 }
7831
7832 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7833 break;
7834
7835 case 'M':
7836 case 'm':
7837 {
7838 int cond_code;
7839 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7840 if (x == const_true_rtx)
7841 {
7842 if (code == 'M')
7843 fputs ("nv", f);
7844 return;
7845 }
7846
7847 if (!COMPARISON_P (x))
7848 {
7849 output_operand_lossage ("invalid operand for '%%%c'", code);
7850 return;
7851 }
7852
7853 cond_code = aarch64_get_condition_code (x);
7854 gcc_assert (cond_code >= 0);
7855 if (code == 'M')
7856 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7857 fputs (aarch64_condition_codes[cond_code], f);
7858 }
7859 break;
7860
7861 case 'N':
7862 if (!const_vec_duplicate_p (x, &elt))
7863 {
7864 output_operand_lossage ("invalid vector constant");
7865 return;
7866 }
7867
7868 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7869 asm_fprintf (f, "%wd", -INTVAL (elt));
7870 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7871 && aarch64_print_vector_float_operand (f, x, true))
7872 ;
7873 else
7874 {
7875 output_operand_lossage ("invalid vector constant");
7876 return;
7877 }
7878 break;
7879
7880 case 'b':
7881 case 'h':
7882 case 's':
7883 case 'd':
7884 case 'q':
7885 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7886 {
7887 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7888 return;
7889 }
7890 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7891 break;
7892
7893 case 'S':
7894 case 'T':
7895 case 'U':
7896 case 'V':
7897 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7898 {
7899 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7900 return;
7901 }
7902 asm_fprintf (f, "%c%d",
7903 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7904 REGNO (x) - V0_REGNUM + (code - 'S'));
7905 break;
7906
7907 case 'R':
7908 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
7909 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7910 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7911 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
7912 else
7913 output_operand_lossage ("incompatible register operand for '%%%c'",
7914 code);
7915 break;
7916
7917 case 'X':
7918 if (!CONST_INT_P (x))
7919 {
7920 output_operand_lossage ("invalid operand for '%%%c'", code);
7921 return;
7922 }
7923 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7924 break;
7925
7926 case 'C':
7927 {
7928 /* Print a replicated constant in hex. */
7929 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7930 {
7931 output_operand_lossage ("invalid operand for '%%%c'", code);
7932 return;
7933 }
7934 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7935 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7936 }
7937 break;
7938
7939 case 'D':
7940 {
7941 /* Print a replicated constant in decimal, treating it as
7942 unsigned. */
7943 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7944 {
7945 output_operand_lossage ("invalid operand for '%%%c'", code);
7946 return;
7947 }
7948 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7949 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7950 }
7951 break;
7952
7953 case 'w':
7954 case 'x':
7955 if (x == const0_rtx
7956 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7957 {
7958 asm_fprintf (f, "%czr", code);
7959 break;
7960 }
7961
7962 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7963 {
7964 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7965 break;
7966 }
7967
7968 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7969 {
7970 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7971 break;
7972 }
7973
7974 /* Fall through */
7975
7976 case 0:
7977 if (x == NULL)
7978 {
7979 output_operand_lossage ("missing operand");
7980 return;
7981 }
7982
7983 switch (GET_CODE (x))
7984 {
7985 case REG:
7986 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7987 {
7988 if (REG_NREGS (x) == 1)
7989 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7990 else
7991 {
7992 char suffix
7993 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7994 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7995 REGNO (x) - V0_REGNUM, suffix,
7996 END_REGNO (x) - V0_REGNUM - 1, suffix);
7997 }
7998 }
7999 else
8000 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8001 break;
8002
8003 case MEM:
8004 output_address (GET_MODE (x), XEXP (x, 0));
8005 break;
8006
8007 case LABEL_REF:
8008 case SYMBOL_REF:
8009 output_addr_const (asm_out_file, x);
8010 break;
8011
8012 case CONST_INT:
8013 asm_fprintf (f, "%wd", INTVAL (x));
8014 break;
8015
8016 case CONST:
8017 if (!VECTOR_MODE_P (GET_MODE (x)))
8018 {
8019 output_addr_const (asm_out_file, x);
8020 break;
8021 }
8022 /* fall through */
8023
8024 case CONST_VECTOR:
8025 if (!const_vec_duplicate_p (x, &elt))
8026 {
8027 output_operand_lossage ("invalid vector constant");
8028 return;
8029 }
8030
8031 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8032 asm_fprintf (f, "%wd", INTVAL (elt));
8033 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8034 && aarch64_print_vector_float_operand (f, x, false))
8035 ;
8036 else
8037 {
8038 output_operand_lossage ("invalid vector constant");
8039 return;
8040 }
8041 break;
8042
8043 case CONST_DOUBLE:
8044 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8045 be getting CONST_DOUBLEs holding integers. */
8046 gcc_assert (GET_MODE (x) != VOIDmode);
8047 if (aarch64_float_const_zero_rtx_p (x))
8048 {
8049 fputc ('0', f);
8050 break;
8051 }
8052 else if (aarch64_float_const_representable_p (x))
8053 {
8054 #define buf_size 20
8055 char float_buf[buf_size] = {'\0'};
8056 real_to_decimal_for_mode (float_buf,
8057 CONST_DOUBLE_REAL_VALUE (x),
8058 buf_size, buf_size,
8059 1, GET_MODE (x));
8060 asm_fprintf (asm_out_file, "%s", float_buf);
8061 break;
8062 #undef buf_size
8063 }
8064 output_operand_lossage ("invalid constant");
8065 return;
8066 default:
8067 output_operand_lossage ("invalid operand");
8068 return;
8069 }
8070 break;
8071
8072 case 'A':
8073 if (GET_CODE (x) == HIGH)
8074 x = XEXP (x, 0);
8075
8076 switch (aarch64_classify_symbolic_expression (x))
8077 {
8078 case SYMBOL_SMALL_GOT_4G:
8079 asm_fprintf (asm_out_file, ":got:");
8080 break;
8081
8082 case SYMBOL_SMALL_TLSGD:
8083 asm_fprintf (asm_out_file, ":tlsgd:");
8084 break;
8085
8086 case SYMBOL_SMALL_TLSDESC:
8087 asm_fprintf (asm_out_file, ":tlsdesc:");
8088 break;
8089
8090 case SYMBOL_SMALL_TLSIE:
8091 asm_fprintf (asm_out_file, ":gottprel:");
8092 break;
8093
8094 case SYMBOL_TLSLE24:
8095 asm_fprintf (asm_out_file, ":tprel:");
8096 break;
8097
8098 case SYMBOL_TINY_GOT:
8099 gcc_unreachable ();
8100 break;
8101
8102 default:
8103 break;
8104 }
8105 output_addr_const (asm_out_file, x);
8106 break;
8107
8108 case 'L':
8109 switch (aarch64_classify_symbolic_expression (x))
8110 {
8111 case SYMBOL_SMALL_GOT_4G:
8112 asm_fprintf (asm_out_file, ":lo12:");
8113 break;
8114
8115 case SYMBOL_SMALL_TLSGD:
8116 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8117 break;
8118
8119 case SYMBOL_SMALL_TLSDESC:
8120 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8121 break;
8122
8123 case SYMBOL_SMALL_TLSIE:
8124 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8125 break;
8126
8127 case SYMBOL_TLSLE12:
8128 asm_fprintf (asm_out_file, ":tprel_lo12:");
8129 break;
8130
8131 case SYMBOL_TLSLE24:
8132 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8133 break;
8134
8135 case SYMBOL_TINY_GOT:
8136 asm_fprintf (asm_out_file, ":got:");
8137 break;
8138
8139 case SYMBOL_TINY_TLSIE:
8140 asm_fprintf (asm_out_file, ":gottprel:");
8141 break;
8142
8143 default:
8144 break;
8145 }
8146 output_addr_const (asm_out_file, x);
8147 break;
8148
8149 case 'G':
8150 switch (aarch64_classify_symbolic_expression (x))
8151 {
8152 case SYMBOL_TLSLE24:
8153 asm_fprintf (asm_out_file, ":tprel_hi12:");
8154 break;
8155 default:
8156 break;
8157 }
8158 output_addr_const (asm_out_file, x);
8159 break;
8160
8161 case 'k':
8162 {
8163 HOST_WIDE_INT cond_code;
8164
8165 if (!CONST_INT_P (x))
8166 {
8167 output_operand_lossage ("invalid operand for '%%%c'", code);
8168 return;
8169 }
8170
8171 cond_code = INTVAL (x);
8172 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8173 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8174 }
8175 break;
8176
8177 case 'y':
8178 case 'z':
8179 {
8180 machine_mode mode = GET_MODE (x);
8181
8182 if (GET_CODE (x) != MEM
8183 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8184 {
8185 output_operand_lossage ("invalid operand for '%%%c'", code);
8186 return;
8187 }
8188
8189 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8190 code == 'y'
8191 ? ADDR_QUERY_LDP_STP_N
8192 : ADDR_QUERY_LDP_STP))
8193 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8194 }
8195 break;
8196
8197 default:
8198 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8199 return;
8200 }
8201 }
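
/* A few sample expansions of the codes above: the 'w' and 'x' codes print
   "w1" and "x1" for general register 1 and "wzr"/"xzr" for const0_rtx;
   the 'H' code on a TImode value held in the pair starting at x4 prints
   "x5".  */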
8202
8203 /* Print address 'x' of a memory access with mode 'mode'.
8204 'type' gives the aarch64_addr_query_type context required by
8205 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
8206 static bool
8207 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8208 aarch64_addr_query_type type)
8209 {
8210 struct aarch64_address_info addr;
8211 unsigned int size;
8212
8213 /* Check that all addresses are Pmode, including under ILP32. */
8214 if (GET_MODE (x) != Pmode
8215 && (!CONST_INT_P (x)
8216 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8217 {
8218 output_operand_lossage ("invalid address mode");
8219 return false;
8220 }
8221
8222 if (aarch64_classify_address (&addr, x, mode, true, type))
8223 switch (addr.type)
8224 {
8225 case ADDRESS_REG_IMM:
8226 if (known_eq (addr.const_offset, 0))
8227 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8228 else if (aarch64_sve_data_mode_p (mode))
8229 {
8230 HOST_WIDE_INT vnum
8231 = exact_div (addr.const_offset,
8232 BYTES_PER_SVE_VECTOR).to_constant ();
8233 asm_fprintf (f, "[%s, #%wd, mul vl]",
8234 reg_names[REGNO (addr.base)], vnum);
8235 }
8236 else if (aarch64_sve_pred_mode_p (mode))
8237 {
8238 HOST_WIDE_INT vnum
8239 = exact_div (addr.const_offset,
8240 BYTES_PER_SVE_PRED).to_constant ();
8241 asm_fprintf (f, "[%s, #%wd, mul vl]",
8242 reg_names[REGNO (addr.base)], vnum);
8243 }
8244 else
8245 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8246 INTVAL (addr.offset));
8247 return true;
8248
8249 case ADDRESS_REG_REG:
8250 if (addr.shift == 0)
8251 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8252 reg_names [REGNO (addr.offset)]);
8253 else
8254 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8255 reg_names [REGNO (addr.offset)], addr.shift);
8256 return true;
8257
8258 case ADDRESS_REG_UXTW:
8259 if (addr.shift == 0)
8260 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8261 REGNO (addr.offset) - R0_REGNUM);
8262 else
8263 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8264 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8265 return true;
8266
8267 case ADDRESS_REG_SXTW:
8268 if (addr.shift == 0)
8269 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8270 REGNO (addr.offset) - R0_REGNUM);
8271 else
8272 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8273 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8274 return true;
8275
8276 case ADDRESS_REG_WB:
8277 /* Writeback is only supported for fixed-width modes. */
8278 size = GET_MODE_SIZE (mode).to_constant ();
8279 switch (GET_CODE (x))
8280 {
8281 case PRE_INC:
8282 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8283 return true;
8284 case POST_INC:
8285 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8286 return true;
8287 case PRE_DEC:
8288 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8289 return true;
8290 case POST_DEC:
8291 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8292 return true;
8293 case PRE_MODIFY:
8294 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8295 INTVAL (addr.offset));
8296 return true;
8297 case POST_MODIFY:
8298 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8299 INTVAL (addr.offset));
8300 return true;
8301 default:
8302 break;
8303 }
8304 break;
8305
8306 case ADDRESS_LO_SUM:
8307 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8308 output_addr_const (f, addr.offset);
8309 asm_fprintf (f, "]");
8310 return true;
8311
8312 case ADDRESS_SYMBOLIC:
8313 output_addr_const (f, x);
8314 return true;
8315 }
8316
8317 return false;
8318 }
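
/* Sample outputs for the cases above: a DImode POST_INC address on base
   x0 prints as "[x0], 8", a PRE_DEC as "[x0, -8]!", and an SVE
   ADDRESS_REG_IMM offset of one vector prints as "[x0, #1, mul vl]".  */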
8319
8320 /* Print address 'x' of a memory access with mode 'mode'. */
8321 static void
8322 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8323 {
8324 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8325 output_addr_const (f, x);
8326 }
8327
8328 bool
8329 aarch64_label_mentioned_p (rtx x)
8330 {
8331 const char *fmt;
8332 int i;
8333
8334 if (GET_CODE (x) == LABEL_REF)
8335 return true;
8336
8337 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8338 referencing instruction, but they are constant offsets, not
8339 symbols. */
8340 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8341 return false;
8342
8343 fmt = GET_RTX_FORMAT (GET_CODE (x));
8344 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8345 {
8346 if (fmt[i] == 'E')
8347 {
8348 int j;
8349
8350 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8351 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8352 return 1;
8353 }
8354 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8355 return 1;
8356 }
8357
8358 return 0;
8359 }
8360
8361 /* Implement REGNO_REG_CLASS. */
8362
8363 enum reg_class
8364 aarch64_regno_regclass (unsigned regno)
8365 {
8366 if (STUB_REGNUM_P (regno))
8367 return STUB_REGS;
8368
8369 if (GP_REGNUM_P (regno))
8370 return GENERAL_REGS;
8371
8372 if (regno == SP_REGNUM)
8373 return STACK_REG;
8374
8375 if (regno == FRAME_POINTER_REGNUM
8376 || regno == ARG_POINTER_REGNUM)
8377 return POINTER_REGS;
8378
8379 if (FP_REGNUM_P (regno))
8380 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8381
8382 if (PR_REGNUM_P (regno))
8383 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8384
8385 return NO_REGS;
8386 }
8387
8388 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8389 If OFFSET is out of range, return an offset of an anchor point
8390 that is in range. Return 0 otherwise. */
8391
8392 static HOST_WIDE_INT
8393 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8394 machine_mode mode)
8395 {
8396 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8397 if (size > 16)
8398 return (offset + 0x400) & ~0x7f0;
8399
8400 /* For offsets that aren't a multiple of the access size, the limit is
8401 -256...255. */
8402 if (offset & (size - 1))
8403 {
8404 /* BLKmode typically uses LDP of X-registers. */
8405 if (mode == BLKmode)
8406 return (offset + 512) & ~0x3ff;
8407 return (offset + 0x100) & ~0x1ff;
8408 }
8409
8410 /* Small negative offsets are supported. */
8411 if (IN_RANGE (offset, -256, 0))
8412 return 0;
8413
8414 if (mode == TImode || mode == TFmode)
8415 return (offset + 0x100) & ~0x1ff;
8416
8417 /* Use the 12-bit unsigned offset, scaled by the access size. */
8418 return offset & (~0xfff * size);
8419 }
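
/* Worked example: for a DImode access (size 8) at aligned offset 0x12348,
   none of the earlier cases apply, so the result is
   0x12348 & (~0xfff * 8) = 0x12348 & ~0x7fff = 0x10000.  The anchor is a
   multiple of 32768 and the remaining offset (here 0x2348, at most 32760)
   still fits the scaled unsigned 12-bit LDR/STR immediate.  */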
8420
8421 static rtx
8422 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8423 {
8424 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8425 where mask is selected by alignment and size of the offset.
8426 We try to pick as large a range for the offset as possible to
8427 maximize the chance of a CSE. However, for aligned addresses
8428 we limit the range to 4k so that structures with different sized
8429 elements are likely to use the same base. We need to be careful
8430 not to split a CONST for some forms of address expression, otherwise
8431 it will generate sub-optimal code. */
8432
8433 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8434 {
8435 rtx base = XEXP (x, 0);
8436 rtx offset_rtx = XEXP (x, 1);
8437 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8438
8439 if (GET_CODE (base) == PLUS)
8440 {
8441 rtx op0 = XEXP (base, 0);
8442 rtx op1 = XEXP (base, 1);
8443
8444 /* Force any scaling into a temp for CSE. */
8445 op0 = force_reg (Pmode, op0);
8446 op1 = force_reg (Pmode, op1);
8447
8448 /* Let the pointer register be in op0. */
8449 if (REG_POINTER (op1))
8450 std::swap (op0, op1);
8451
8452 /* If the pointer is virtual or frame related, then we know that
8453 virtual register instantiation or register elimination is going
8454 to apply a second constant. We want the two constants folded
8455 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8456 if (virt_or_elim_regno_p (REGNO (op0)))
8457 {
8458 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8459 NULL_RTX, true, OPTAB_DIRECT);
8460 return gen_rtx_PLUS (Pmode, base, op1);
8461 }
8462
8463 /* Otherwise, in order to encourage CSE (and thence loop strength
8464 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8465 base = expand_binop (Pmode, add_optab, op0, op1,
8466 NULL_RTX, true, OPTAB_DIRECT);
8467 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8468 }
8469
8470 HOST_WIDE_INT size;
8471 if (GET_MODE_SIZE (mode).is_constant (&size))
8472 {
8473 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8474 mode);
8475 if (base_offset != 0)
8476 {
8477 base = plus_constant (Pmode, base, base_offset);
8478 base = force_operand (base, NULL_RTX);
8479 return plus_constant (Pmode, base, offset - base_offset);
8480 }
8481 }
8482 }
8483
8484 return x;
8485 }
8486
8487 static reg_class_t
8488 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8489 reg_class_t rclass,
8490 machine_mode mode,
8491 secondary_reload_info *sri)
8492 {
8493 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8494 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8495 comment at the head of aarch64-sve.md for more details about the
8496 big-endian handling. */
8497 if (BYTES_BIG_ENDIAN
8498 && reg_class_subset_p (rclass, FP_REGS)
8499 && !((REG_P (x) && HARD_REGISTER_P (x))
8500 || aarch64_simd_valid_immediate (x, NULL))
8501 && aarch64_sve_data_mode_p (mode))
8502 {
8503 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8504 return NO_REGS;
8505 }
8506
8507 /* If we have to disable direct literal pool loads and stores because the
8508 function is too big, then we need a scratch register. */
8509 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8510 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8511 || targetm.vector_mode_supported_p (GET_MODE (x)))
8512 && !aarch64_pcrelative_literal_loads)
8513 {
8514 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8515 return NO_REGS;
8516 }
8517
8518 /* Without the TARGET_SIMD instructions we cannot move a Q register
8519 to a Q register directly. We need a scratch. */
8520 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8521 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8522 && reg_class_subset_p (rclass, FP_REGS))
8523 {
8524 sri->icode = code_for_aarch64_reload_mov (mode);
8525 return NO_REGS;
8526 }
8527
8528 /* A TFmode or TImode memory access should be handled via FP_REGS
8529 because AArch64 has richer addressing modes for LDR/STR instructions
8530 than for LDP/STP instructions. */
8531 if (TARGET_FLOAT && rclass == GENERAL_REGS
8532 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8533 return FP_REGS;
8534
8535 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
8536 return GENERAL_REGS;
8537
8538 return NO_REGS;
8539 }
8540
8541 static bool
8542 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8543 {
8544 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8545
8546 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8547 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8548 if (frame_pointer_needed)
8549 return to == HARD_FRAME_POINTER_REGNUM;
8550 return true;
8551 }
8552
8553 poly_int64
8554 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8555 {
8556 if (to == HARD_FRAME_POINTER_REGNUM)
8557 {
8558 if (from == ARG_POINTER_REGNUM)
8559 return cfun->machine->frame.hard_fp_offset;
8560
8561 if (from == FRAME_POINTER_REGNUM)
8562 return cfun->machine->frame.hard_fp_offset
8563 - cfun->machine->frame.locals_offset;
8564 }
8565
8566 if (to == STACK_POINTER_REGNUM)
8567 {
8568 if (from == FRAME_POINTER_REGNUM)
8569 return cfun->machine->frame.frame_size
8570 - cfun->machine->frame.locals_offset;
8571 }
8572
8573 return cfun->machine->frame.frame_size;
8574 }
8575
8576
8577 /* Get return address without mangling. */
8578
8579 rtx
8580 aarch64_return_addr_rtx (void)
8581 {
8582 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
8583 /* Note: aarch64_return_address_signing_enabled only
8584 works after cfun->machine->frame.laid_out is set,
8585 so here we don't know if the return address will
8586 be signed or not. */
8587 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
8588 emit_move_insn (lr, val);
8589 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
8590 return lr;
8591 }
8592
8593
8594 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8595 previous frame. */
8596
8597 rtx
8598 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8599 {
8600 if (count != 0)
8601 return const0_rtx;
8602 return aarch64_return_addr_rtx ();
8603 }
8604
8605
8606 static void
8607 aarch64_asm_trampoline_template (FILE *f)
8608 {
8609 int offset1 = 24;
8610 int offset2 = 28;
8611
8612 if (aarch64_bti_enabled ())
8613 {
8614 asm_fprintf (f, "\thint\t34 // bti c\n");
8615 offset1 -= 4;
8616 offset2 -= 4;
8617 }
8618
8619 if (TARGET_ILP32)
8620 {
8621 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8622 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8623 offset1);
8624 }
8625 else
8626 {
8627 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8628 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8629 offset2);
8630 }
8631 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8632
8633 /* We always emit a speculation barrier.
8634 This is because the same trampoline template is used for every nested
8635 function. Since nested functions are not particularly common or
8636 performance-critical, we don't worry too much about the extra
8637 instructions that have to be copied around.
8638 This is not yet a problem, since we have not yet implemented
8639 function-specific attributes to choose between hardening against
8640 straight-line speculation or not, but such function-specific attributes
8641 are likely to appear in the future. */
8642 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
8643
8644 /* The trampoline needs an extra padding instruction. If BTI is
8645 enabled, the padding instruction is replaced by the BTI instruction
8646 at the beginning. */
8647 if (!aarch64_bti_enabled ())
8648 assemble_aligned_integer (4, const0_rtx);
8649
8650 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8651 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8652 }
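
/* Illustration (added commentary; not part of the original GCC source):
   for LP64 without BTI the emitted template is roughly

	ldr	x17, .+24	// IP1 <- target function address
	ldr	x18, .+28	// static chain register <- chain value
	br	x17
	dsb	sy
	isb
	.word	0		// padding
	.xword	0		// patched with the function address
	.xword	0		// patched with the static chain value

   so the two loads pick up the data words that aarch64_trampoline_init
   below fills in at offsets 24 and 32.  */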
8653
8654 static void
8655 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8656 {
8657 rtx fnaddr, mem, a_tramp;
8658 const int tramp_code_sz = 24;
8659
8660 /* Don't need to copy the trailing D-words, we fill those in below. */
8661 /* We create our own memory address in Pmode so that `emit_block_move` can
8662 use parts of the backend which expect Pmode addresses. */
8663 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
8664 emit_block_move (gen_rtx_MEM (BLKmode, temp),
8665 assemble_trampoline_template (),
8666 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8667 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8668 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8669 if (GET_MODE (fnaddr) != ptr_mode)
8670 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8671 emit_move_insn (mem, fnaddr);
8672
8673 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8674 emit_move_insn (mem, chain_value);
8675
8676 /* XXX We should really define a "clear_cache" pattern and use
8677 gen_clear_cache(). */
8678 a_tramp = XEXP (m_tramp, 0);
8679 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8680 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8681 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8682 ptr_mode);
8683 }
8684
8685 static unsigned char
8686 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8687 {
8688 /* ??? Logically we should only need to provide a value when
8689 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8690 can hold MODE, but at the moment we need to handle all modes.
8691 Just ignore any runtime parts for registers that can't store them. */
8692 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8693 unsigned int nregs;
8694 switch (regclass)
8695 {
8696 case STUB_REGS:
8697 case TAILCALL_ADDR_REGS:
8698 case POINTER_REGS:
8699 case GENERAL_REGS:
8700 case ALL_REGS:
8701 case POINTER_AND_FP_REGS:
8702 case FP_REGS:
8703 case FP_LO_REGS:
8704 if (aarch64_sve_data_mode_p (mode)
8705 && constant_multiple_p (GET_MODE_SIZE (mode),
8706 BYTES_PER_SVE_VECTOR, &nregs))
8707 return nregs;
8708 return (aarch64_vector_data_mode_p (mode)
8709 ? CEIL (lowest_size, UNITS_PER_VREG)
8710 : CEIL (lowest_size, UNITS_PER_WORD));
8711 case STACK_REG:
8712 case PR_REGS:
8713 case PR_LO_REGS:
8714 case PR_HI_REGS:
8715 return 1;
8716
8717 case NO_REGS:
8718 return 0;
8719
8720 default:
8721 break;
8722 }
8723 gcc_unreachable ();
8724 }
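
/* Illustrative examples (added commentary; not part of the original GCC
   source): V4SImode (16 bytes) in FP_REGS needs CEIL (16, UNITS_PER_VREG)
   = 1 vector register, while TImode (16 bytes) in GENERAL_REGS needs
   CEIL (16, UNITS_PER_WORD) = 2 X-registers.  An SVE data mode such as
   VNx4SImode occupies exactly one Z-register, so the constant_multiple_p
   path returns 1 regardless of the runtime vector length.  */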
8725
8726 static reg_class_t
8727 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8728 {
8729 if (regclass == POINTER_REGS)
8730 return GENERAL_REGS;
8731
8732 if (regclass == STACK_REG)
8733 {
8734 if (REG_P (x)
8735 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8736 return regclass;
8737
8738 return NO_REGS;
8739 }
8740
8741 /* Register elimination can result in a request for
8742 SP+constant->FP_REGS. We cannot support such operations, which
8743 use SP as source and an FP_REG as destination, so reject them
8744 outright. */
8745 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8746 {
8747 rtx lhs = XEXP (x, 0);
8748
8749 /* Look through a possible SUBREG introduced by ILP32. */
8750 if (GET_CODE (lhs) == SUBREG)
8751 lhs = SUBREG_REG (lhs);
8752
8753 gcc_assert (REG_P (lhs));
8754 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8755 POINTER_REGS));
8756 return NO_REGS;
8757 }
8758
8759 return regclass;
8760 }
8761
8762 void
8763 aarch64_asm_output_labelref (FILE *f, const char *name)
8764 {
8765 asm_fprintf (f, "%U%s", name);
8766 }
8767
8768 static void
8769 aarch64_elf_asm_constructor (rtx symbol, int priority)
8770 {
8771 if (priority == DEFAULT_INIT_PRIORITY)
8772 default_ctor_section_asm_out_constructor (symbol, priority);
8773 else
8774 {
8775 section *s;
8776 /* Although priority is known to be in the range [0, 65535], and so
8777 18 bytes would be enough, the compiler might not know that. To avoid
8778 a -Wformat-truncation false positive, use a larger size. */
8779 char buf[23];
8780 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8781 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8782 switch_to_section (s);
8783 assemble_align (POINTER_SIZE);
8784 assemble_aligned_integer (POINTER_BYTES, symbol);
8785 }
8786 }
8787
8788 static void
8789 aarch64_elf_asm_destructor (rtx symbol, int priority)
8790 {
8791 if (priority == DEFAULT_INIT_PRIORITY)
8792 default_dtor_section_asm_out_destructor (symbol, priority);
8793 else
8794 {
8795 section *s;
8796 /* Although priority is known to be in the range [0, 65535], and so
8797 18 bytes would be enough, the compiler might not know that. To avoid
8798 a -Wformat-truncation false positive, use a larger size. */
8799 char buf[23];
8800 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8801 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8802 switch_to_section (s);
8803 assemble_align (POINTER_SIZE);
8804 assemble_aligned_integer (POINTER_BYTES, symbol);
8805 }
8806 }
8807
8808 const char*
8809 aarch64_output_casesi (rtx *operands)
8810 {
8811 char buf[100];
8812 char label[100];
8813 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8814 int index;
8815 static const char *const patterns[4][2] =
8816 {
8817 {
8818 "ldrb\t%w3, [%0,%w1,uxtw]",
8819 "add\t%3, %4, %w3, sxtb #2"
8820 },
8821 {
8822 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8823 "add\t%3, %4, %w3, sxth #2"
8824 },
8825 {
8826 "ldr\t%w3, [%0,%w1,uxtw #2]",
8827 "add\t%3, %4, %w3, sxtw #2"
8828 },
8829 /* We assume that DImode is only generated when not optimizing and
8830 that we don't really need 64-bit address offsets. That would
8831 imply an object file with 8GB of code in a single function! */
8832 {
8833 "ldr\t%w3, [%0,%w1,uxtw #2]",
8834 "add\t%3, %4, %w3, sxtw #2"
8835 }
8836 };
8837
8838 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8839
8840 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8841 index = exact_log2 (GET_MODE_SIZE (mode));
8842
8843 gcc_assert (index >= 0 && index <= 3);
8844
8845 /* Need to implement table size reduction, by changing the code below. */
8846 output_asm_insn (patterns[index][0], operands);
8847 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8848 snprintf (buf, sizeof (buf),
8849 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8850 output_asm_insn (buf, operands);
8851 output_asm_insn (patterns[index][1], operands);
8852 output_asm_insn ("br\t%3", operands);
8853 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
8854 operands);
8855 assemble_label (asm_out_file, label);
8856 return "";
8857 }
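
/* Illustration (added commentary; not part of the original GCC source):
   for a HImode dispatch table (index 1) the emitted sequence is

	ldrh	%w3, [%0, %w1, uxtw #1]
	adr	%4, .Lrtx<N>
	add	%3, %4, %w3, sxth #2
	br	%3
	<SLS speculation barrier, if enabled>
   .Lrtx<N>:

   i.e. the table entry is loaded, scaled by 4 and added to the address of
   the label that immediately follows the dispatch code.  */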
8858
8859
8860 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8861 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8862 operator. */
8863
8864 int
8865 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8866 {
8867 if (shift >= 0 && shift <= 3)
8868 {
8869 int size;
8870 for (size = 8; size <= 32; size *= 2)
8871 {
8872 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8873 if (mask == bits << shift)
8874 return size;
8875 }
8876 }
8877 return 0;
8878 }
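
/* Illustrative examples (added commentary; not part of the original GCC
   source): aarch64_uxt_size (0, 0xff) == 8 (plain UXTB),
   aarch64_uxt_size (2, 0x3fc) == 8 (UXTB with LSL #2) and
   aarch64_uxt_size (0, 0xffff) == 16 (UXTH); a mask such as 0xff00 with
   shift 1 matches none of the patterns and yields 0.  */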
8879
8880 /* Constant pools are per-function only when PC-relative
8881 literal loads are enabled or we are using the large
8882 code model. */
8883
8884 static inline bool
8885 aarch64_can_use_per_function_literal_pools_p (void)
8886 {
8887 return (aarch64_pcrelative_literal_loads
8888 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8889 }
8890
8891 static bool
8892 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8893 {
8894 /* We can't use blocks for constants when we're using a per-function
8895 constant pool. */
8896 return !aarch64_can_use_per_function_literal_pools_p ();
8897 }
8898
8899 /* Select appropriate section for constants depending
8900 on where we place literal pools. */
8901
8902 static section *
8903 aarch64_select_rtx_section (machine_mode mode,
8904 rtx x,
8905 unsigned HOST_WIDE_INT align)
8906 {
8907 if (aarch64_can_use_per_function_literal_pools_p ())
8908 return function_section (current_function_decl);
8909
8910 return default_elf_select_rtx_section (mode, x, align);
8911 }
8912
8913 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8914 void
8915 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8916 HOST_WIDE_INT offset)
8917 {
8918 /* When using per-function literal pools, we must ensure that any code
8919 section is aligned to the minimal instruction length, lest we get
8920 errors from the assembler re "unaligned instructions". */
8921 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8922 ASM_OUTPUT_ALIGN (f, 2);
8923 }
8924
8925 /* Costs. */
8926
8927 /* Helper function for rtx cost calculation. Strip a shift expression
8928 from X. Returns the inner operand if successful, or the original
8929 expression on failure. */
8930 static rtx
8931 aarch64_strip_shift (rtx x)
8932 {
8933 rtx op = x;
8934
8935 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8936 we can convert both to ROR during final output. */
8937 if ((GET_CODE (op) == ASHIFT
8938 || GET_CODE (op) == ASHIFTRT
8939 || GET_CODE (op) == LSHIFTRT
8940 || GET_CODE (op) == ROTATERT
8941 || GET_CODE (op) == ROTATE)
8942 && CONST_INT_P (XEXP (op, 1)))
8943 return XEXP (op, 0);
8944
8945 if (GET_CODE (op) == MULT
8946 && CONST_INT_P (XEXP (op, 1))
8947 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8948 return XEXP (op, 0);
8949
8950 return x;
8951 }
8952
8953 /* Helper function for rtx cost calculation. Strip an extend
8954 expression from X. Returns the inner operand if successful, or the
8955 original expression on failure. We deal with a number of possible
8956 canonicalization variations here. If STRIP_SHIFT is true, then
8957 we can strip off a shift also. */
8958 static rtx
8959 aarch64_strip_extend (rtx x, bool strip_shift)
8960 {
8961 scalar_int_mode mode;
8962 rtx op = x;
8963
8964 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8965 return op;
8966
8967 /* Zero and sign extraction of a widened value. */
8968 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8969 && XEXP (op, 2) == const0_rtx
8970 && GET_CODE (XEXP (op, 0)) == MULT
8971 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8972 XEXP (op, 1)))
8973 return XEXP (XEXP (op, 0), 0);
8974
8975 /* It can also be represented (for zero-extend) as an AND with an
8976 immediate. */
8977 if (GET_CODE (op) == AND
8978 && GET_CODE (XEXP (op, 0)) == MULT
8979 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8980 && CONST_INT_P (XEXP (op, 1))
8981 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8982 INTVAL (XEXP (op, 1))) != 0)
8983 return XEXP (XEXP (op, 0), 0);
8984
8985 /* Now handle extended register, as this may also have an optional
8986 left shift by 1..4. */
8987 if (strip_shift
8988 && GET_CODE (op) == ASHIFT
8989 && CONST_INT_P (XEXP (op, 1))
8990 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8991 op = XEXP (op, 0);
8992
8993 if (GET_CODE (op) == ZERO_EXTEND
8994 || GET_CODE (op) == SIGN_EXTEND)
8995 op = XEXP (op, 0);
8996
8997 if (op != x)
8998 return op;
8999
9000 return x;
9001 }
9002
9003 /* Return true iff CODE is a shift supported in combination
9004 with arithmetic instructions. */
9005
9006 static bool
9007 aarch64_shift_p (enum rtx_code code)
9008 {
9009 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9010 }
9011
9012
9013 /* Return true iff X is a cheap shift without a sign extend. */
9014
9015 static bool
9016 aarch64_cheap_mult_shift_p (rtx x)
9017 {
9018 rtx op0, op1;
9019
9020 op0 = XEXP (x, 0);
9021 op1 = XEXP (x, 1);
9022
9023 if (!(aarch64_tune_params.extra_tuning_flags
9024 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9025 return false;
9026
9027 if (GET_CODE (op0) == SIGN_EXTEND)
9028 return false;
9029
9030 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9031 && UINTVAL (op1) <= 4)
9032 return true;
9033
9034 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9035 return false;
9036
9037 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9038
9039 if (l2 > 0 && l2 <= 4)
9040 return true;
9041
9042 return false;
9043 }
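
/* Illustration (added commentary; not part of the original GCC source):
   with the AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND flag set, (ashift x 3)
   and (mult x 16) are considered cheap (shift amounts of at most 4),
   whereas (mult x 32) is not, and any form whose first operand is a
   SIGN_EXTEND is rejected.  */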
9044
9045 /* Helper function for rtx cost calculation. Calculate the cost of
9046 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9047 Return the calculated cost of the expression, recursing manually in to
9048 operands where needed. */
9049
9050 static int
9051 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9052 {
9053 rtx op0, op1;
9054 const struct cpu_cost_table *extra_cost
9055 = aarch64_tune_params.insn_extra_cost;
9056 int cost = 0;
9057 bool compound_p = (outer == PLUS || outer == MINUS);
9058 machine_mode mode = GET_MODE (x);
9059
9060 gcc_checking_assert (code == MULT);
9061
9062 op0 = XEXP (x, 0);
9063 op1 = XEXP (x, 1);
9064
9065 if (VECTOR_MODE_P (mode))
9066 mode = GET_MODE_INNER (mode);
9067
9068 /* Integer multiply/fma. */
9069 if (GET_MODE_CLASS (mode) == MODE_INT)
9070 {
9071 /* The multiply will be canonicalized as a shift, cost it as such. */
9072 if (aarch64_shift_p (GET_CODE (x))
9073 || (CONST_INT_P (op1)
9074 && exact_log2 (INTVAL (op1)) > 0))
9075 {
9076 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9077 || GET_CODE (op0) == SIGN_EXTEND;
9078 if (speed)
9079 {
9080 if (compound_p)
9081 {
9082 /* If the shift is considered cheap,
9083 then don't add any cost. */
9084 if (aarch64_cheap_mult_shift_p (x))
9085 ;
9086 else if (REG_P (op1))
9087 /* ARITH + shift-by-register. */
9088 cost += extra_cost->alu.arith_shift_reg;
9089 else if (is_extend)
9090 /* ARITH + extended register. We don't have a cost field
9091 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9092 cost += extra_cost->alu.extend_arith;
9093 else
9094 /* ARITH + shift-by-immediate. */
9095 cost += extra_cost->alu.arith_shift;
9096 }
9097 else
9098 /* LSL (immediate). */
9099 cost += extra_cost->alu.shift;
9100
9101 }
9102 /* Strip extends as we will have costed them in the case above. */
9103 if (is_extend)
9104 op0 = aarch64_strip_extend (op0, true);
9105
9106 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9107
9108 return cost;
9109 }
9110
9111 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9112 compound and let the below cases handle it. After all, MNEG is a
9113 special-case alias of MSUB. */
9114 if (GET_CODE (op0) == NEG)
9115 {
9116 op0 = XEXP (op0, 0);
9117 compound_p = true;
9118 }
9119
9120 /* Integer multiplies or FMAs have zero/sign extending variants. */
9121 if ((GET_CODE (op0) == ZERO_EXTEND
9122 && GET_CODE (op1) == ZERO_EXTEND)
9123 || (GET_CODE (op0) == SIGN_EXTEND
9124 && GET_CODE (op1) == SIGN_EXTEND))
9125 {
9126 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9127 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9128
9129 if (speed)
9130 {
9131 if (compound_p)
9132 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9133 cost += extra_cost->mult[0].extend_add;
9134 else
9135 /* MUL/SMULL/UMULL. */
9136 cost += extra_cost->mult[0].extend;
9137 }
9138
9139 return cost;
9140 }
9141
9142 /* This is either an integer multiply or a MADD. In both cases
9143 we want to recurse and cost the operands. */
9144 cost += rtx_cost (op0, mode, MULT, 0, speed);
9145 cost += rtx_cost (op1, mode, MULT, 1, speed);
9146
9147 if (speed)
9148 {
9149 if (compound_p)
9150 /* MADD/MSUB. */
9151 cost += extra_cost->mult[mode == DImode].add;
9152 else
9153 /* MUL. */
9154 cost += extra_cost->mult[mode == DImode].simple;
9155 }
9156
9157 return cost;
9158 }
9159 else
9160 {
9161 if (speed)
9162 {
9163 /* Floating-point FMA/FMUL can also support negations of the
9164 operands, unless the rounding mode is upward or downward, in
9165 which case FNMUL differs from FMUL with operand negation. */
9166 bool neg0 = GET_CODE (op0) == NEG;
9167 bool neg1 = GET_CODE (op1) == NEG;
9168 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9169 {
9170 if (neg0)
9171 op0 = XEXP (op0, 0);
9172 if (neg1)
9173 op1 = XEXP (op1, 0);
9174 }
9175
9176 if (compound_p)
9177 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9178 cost += extra_cost->fp[mode == DFmode].fma;
9179 else
9180 /* FMUL/FNMUL. */
9181 cost += extra_cost->fp[mode == DFmode].mult;
9182 }
9183
9184 cost += rtx_cost (op0, mode, MULT, 0, speed);
9185 cost += rtx_cost (op1, mode, MULT, 1, speed);
9186 return cost;
9187 }
9188 }
9189
9190 static int
9191 aarch64_address_cost (rtx x,
9192 machine_mode mode,
9193 addr_space_t as ATTRIBUTE_UNUSED,
9194 bool speed)
9195 {
9196 enum rtx_code c = GET_CODE (x);
9197 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9198 struct aarch64_address_info info;
9199 int cost = 0;
9200 info.shift = 0;
9201
9202 if (!aarch64_classify_address (&info, x, mode, false))
9203 {
9204 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9205 {
9206 /* This is a CONST or SYMBOL ref which will be split
9207 in a different way depending on the code model in use.
9208 Cost it through the generic infrastructure. */
9209 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9210 /* Divide through by the cost of one instruction to
9211 bring it to the same units as the address costs. */
9212 cost_symbol_ref /= COSTS_N_INSNS (1);
9213 /* The cost is then the cost of preparing the address,
9214 followed by an immediate (possibly 0) offset. */
9215 return cost_symbol_ref + addr_cost->imm_offset;
9216 }
9217 else
9218 {
9219 /* This is most likely a jump table from a case
9220 statement. */
9221 return addr_cost->register_offset;
9222 }
9223 }
9224
9225 switch (info.type)
9226 {
9227 case ADDRESS_LO_SUM:
9228 case ADDRESS_SYMBOLIC:
9229 case ADDRESS_REG_IMM:
9230 cost += addr_cost->imm_offset;
9231 break;
9232
9233 case ADDRESS_REG_WB:
9234 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9235 cost += addr_cost->pre_modify;
9236 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9237 cost += addr_cost->post_modify;
9238 else
9239 gcc_unreachable ();
9240
9241 break;
9242
9243 case ADDRESS_REG_REG:
9244 cost += addr_cost->register_offset;
9245 break;
9246
9247 case ADDRESS_REG_SXTW:
9248 cost += addr_cost->register_sextend;
9249 break;
9250
9251 case ADDRESS_REG_UXTW:
9252 cost += addr_cost->register_zextend;
9253 break;
9254
9255 default:
9256 gcc_unreachable ();
9257 }
9258
9259
9260 if (info.shift > 0)
9261 {
9262 /* For the sake of calculating the cost of the shifted register
9263 component, we can treat same sized modes in the same way. */
9264 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9265 cost += addr_cost->addr_scale_costs.hi;
9266 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9267 cost += addr_cost->addr_scale_costs.si;
9268 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9269 cost += addr_cost->addr_scale_costs.di;
9270 else
9271 /* We can't tell, or this is a 128-bit vector. */
9272 cost += addr_cost->addr_scale_costs.ti;
9273 }
9274
9275 return cost;
9276 }
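
/* Illustrative example (added commentary; not part of the original GCC
   source): an address such as (plus (reg X) (ashift (reg Y) (const_int 2)))
   used for an SImode access would typically be classified as
   ADDRESS_REG_REG with a shift of 2, so it costs register_offset plus
   addr_scale_costs.si; a plain (plus (reg X) (const_int 8)) costs only
   imm_offset.  */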
9277
9278 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9279 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9280 to be taken. */
9281
9282 int
9283 aarch64_branch_cost (bool speed_p, bool predictable_p)
9284 {
9285 /* When optimizing for speed, use the cost of unpredictable branches. */
9286 const struct cpu_branch_cost *branch_costs =
9287 aarch64_tune_params.branch_costs;
9288
9289 if (!speed_p || predictable_p)
9290 return branch_costs->predictable;
9291 else
9292 return branch_costs->unpredictable;
9293 }
9294
9295 /* Return true if the RTX X in mode MODE is a zero or sign extract
9296 usable in an ADD or SUB (extended register) instruction. */
9297 static bool
9298 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9299 {
9300 /* Catch add with a sign extract.
9301 This is add_<optab><mode>_multp2. */
9302 if (GET_CODE (x) == SIGN_EXTRACT
9303 || GET_CODE (x) == ZERO_EXTRACT)
9304 {
9305 rtx op0 = XEXP (x, 0);
9306 rtx op1 = XEXP (x, 1);
9307 rtx op2 = XEXP (x, 2);
9308
9309 if (GET_CODE (op0) == MULT
9310 && CONST_INT_P (op1)
9311 && op2 == const0_rtx
9312 && CONST_INT_P (XEXP (op0, 1))
9313 && aarch64_is_extend_from_extract (mode,
9314 XEXP (op0, 1),
9315 op1))
9316 {
9317 return true;
9318 }
9319 }
9320 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9321 No shift. */
9322 else if (GET_CODE (x) == SIGN_EXTEND
9323 || GET_CODE (x) == ZERO_EXTEND)
9324 return REG_P (XEXP (x, 0));
9325
9326 return false;
9327 }
9328
9329 static bool
9330 aarch64_frint_unspec_p (unsigned int u)
9331 {
9332 switch (u)
9333 {
9334 case UNSPEC_FRINTZ:
9335 case UNSPEC_FRINTP:
9336 case UNSPEC_FRINTM:
9337 case UNSPEC_FRINTA:
9338 case UNSPEC_FRINTN:
9339 case UNSPEC_FRINTX:
9340 case UNSPEC_FRINTI:
9341 return true;
9342
9343 default:
9344 return false;
9345 }
9346 }
9347
9348 /* Return true iff X is an rtx that will match an extr instruction,
9349 i.e. as described in the *extr<mode>5_insn family of patterns.
9350 RES_OP0 and RES_OP1 will be set to the operands of the shifts involved
9351 on success and will be NULL_RTX otherwise. */
9352
9353 static bool
9354 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9355 {
9356 rtx op0, op1;
9357 scalar_int_mode mode;
9358 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9359 return false;
9360
9361 *res_op0 = NULL_RTX;
9362 *res_op1 = NULL_RTX;
9363
9364 if (GET_CODE (x) != IOR)
9365 return false;
9366
9367 op0 = XEXP (x, 0);
9368 op1 = XEXP (x, 1);
9369
9370 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9371 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9372 {
9373 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9374 if (GET_CODE (op1) == ASHIFT)
9375 std::swap (op0, op1);
9376
9377 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9378 return false;
9379
9380 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9381 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9382
9383 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9384 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9385 {
9386 *res_op0 = XEXP (op0, 0);
9387 *res_op1 = XEXP (op1, 0);
9388 return true;
9389 }
9390 }
9391
9392 return false;
9393 }
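
/* Illustration (added commentary; not part of the original GCC source):
   in DImode, (ior (ashift a (const_int 48)) (lshiftrt b (const_int 16)))
   satisfies 48 + 16 == 64, so *RES_OP0 = a and *RES_OP1 = b; the rtx
   corresponds to "extr xd, xa, xb, #16".  */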
9394
9395 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9396 storing it in *COST. Result is true if the total cost of the operation
9397 has now been calculated. */
9398 static bool
9399 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9400 {
9401 rtx inner;
9402 rtx comparator;
9403 enum rtx_code cmpcode;
9404
9405 if (COMPARISON_P (op0))
9406 {
9407 inner = XEXP (op0, 0);
9408 comparator = XEXP (op0, 1);
9409 cmpcode = GET_CODE (op0);
9410 }
9411 else
9412 {
9413 inner = op0;
9414 comparator = const0_rtx;
9415 cmpcode = NE;
9416 }
9417
9418 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9419 {
9420 /* Conditional branch. */
9421 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9422 return true;
9423 else
9424 {
9425 if (cmpcode == NE || cmpcode == EQ)
9426 {
9427 if (comparator == const0_rtx)
9428 {
9429 /* TBZ/TBNZ/CBZ/CBNZ. */
9430 if (GET_CODE (inner) == ZERO_EXTRACT)
9431 /* TBZ/TBNZ. */
9432 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9433 ZERO_EXTRACT, 0, speed);
9434 else
9435 /* CBZ/CBNZ. */
9436 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9437
9438 return true;
9439 }
9440 }
9441 else if (cmpcode == LT || cmpcode == GE)
9442 {
9443 /* TBZ/TBNZ. */
9444 if (comparator == const0_rtx)
9445 return true;
9446 }
9447 }
9448 }
9449 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9450 {
9451 /* CCMP. */
9452 if (GET_CODE (op1) == COMPARE)
9453 {
9454 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9455 if (XEXP (op1, 1) == const0_rtx)
9456 *cost += 1;
9457 if (speed)
9458 {
9459 machine_mode mode = GET_MODE (XEXP (op1, 0));
9460 const struct cpu_cost_table *extra_cost
9461 = aarch64_tune_params.insn_extra_cost;
9462
9463 if (GET_MODE_CLASS (mode) == MODE_INT)
9464 *cost += extra_cost->alu.arith;
9465 else
9466 *cost += extra_cost->fp[mode == DFmode].compare;
9467 }
9468 return true;
9469 }
9470
9471 /* It's a conditional operation based on the status flags,
9472 so it must be some flavor of CSEL. */
9473
9474 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9475 if (GET_CODE (op1) == NEG
9476 || GET_CODE (op1) == NOT
9477 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9478 op1 = XEXP (op1, 0);
9479 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9480 {
9481 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9482 op1 = XEXP (op1, 0);
9483 op2 = XEXP (op2, 0);
9484 }
9485
9486 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9487 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9488 return true;
9489 }
9490
9491 /* We don't know what this is, cost all operands. */
9492 return false;
9493 }
9494
9495 /* Check whether X is a bitfield operation of the form shift + extend that
9496 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9497 operand to which the bitfield operation is applied. Otherwise return
9498 NULL_RTX. */
9499
9500 static rtx
9501 aarch64_extend_bitfield_pattern_p (rtx x)
9502 {
9503 rtx_code outer_code = GET_CODE (x);
9504 machine_mode outer_mode = GET_MODE (x);
9505
9506 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9507 && outer_mode != SImode && outer_mode != DImode)
9508 return NULL_RTX;
9509
9510 rtx inner = XEXP (x, 0);
9511 rtx_code inner_code = GET_CODE (inner);
9512 machine_mode inner_mode = GET_MODE (inner);
9513 rtx op = NULL_RTX;
9514
9515 switch (inner_code)
9516 {
9517 case ASHIFT:
9518 if (CONST_INT_P (XEXP (inner, 1))
9519 && (inner_mode == QImode || inner_mode == HImode))
9520 op = XEXP (inner, 0);
9521 break;
9522 case LSHIFTRT:
9523 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9524 && (inner_mode == QImode || inner_mode == HImode))
9525 op = XEXP (inner, 0);
9526 break;
9527 case ASHIFTRT:
9528 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9529 && (inner_mode == QImode || inner_mode == HImode))
9530 op = XEXP (inner, 0);
9531 break;
9532 default:
9533 break;
9534 }
9535
9536 return op;
9537 }
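
/* Illustrative examples (added commentary; not part of the original GCC
   source): (zero_extend:SI (lshiftrt:HI r (const_int 3))) and
   (sign_extend:DI (ashiftrt:HI r (const_int 5))) both return R (they map
   to UBFX/SBFX), as does (zero_extend:SI (ashift:QI r (const_int 2)))
   for the UBFIZ form; an inner operand wider than HImode does not
   match.  */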
9538
9539 /* Return true if the mask and a shift amount from an RTX of the form
9540 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9541 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9542
9543 bool
9544 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9545 rtx shft_amnt)
9546 {
9547 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9548 && INTVAL (mask) > 0
9549 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9550 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
9551 && (UINTVAL (mask)
9552 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
9553 }
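
/* Illustrative example (added commentary; not part of the original GCC
   source): in SImode, mask 0x7f8 with shift 3 is accepted because
   (0x7f8 >> 3) + 1 == 0x100 is a power of two and no bits below the
   shift amount are set; mask 0x7fa is rejected because bit 1 lies below
   the shift amount.  */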
9554
9555 /* Return true if the masks and a shift amount from an RTX of the form
9556 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9557 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9558
9559 bool
9560 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9561 unsigned HOST_WIDE_INT mask1,
9562 unsigned HOST_WIDE_INT shft_amnt,
9563 unsigned HOST_WIDE_INT mask2)
9564 {
9565 unsigned HOST_WIDE_INT t;
9566
9567 /* Verify that there is no overlap in what bits are set in the two masks. */
9568 if (mask1 != ~mask2)
9569 return false;
9570
9571 /* Verify that mask2 is not all zeros or ones. */
9572 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9573 return false;
9574
9575 /* The shift amount should always be less than the mode size. */
9576 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9577
9578 /* Verify that the mask being shifted is contiguous and would be in the
9579 least significant bits after shifting by shft_amnt. */
9580 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9581 return (t == (t & -t));
9582 }
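
/* Illustrative example (added commentary; not part of the original GCC
   source): for DImode with shift 8, mask2 == 0xff00 and mask1 == ~0xff00
   are accepted because 0xff00 + (1 << 8) == 0x10000 is a power of two,
   i.e. mask2 is a contiguous run of bits starting at the shift amount;
   mask2 == 0xf0f00 is rejected because the run is not contiguous.  */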
9583
9584 /* Calculate the cost of calculating X, storing it in *COST. Result
9585 is true if the total cost of the operation has now been calculated. */
9586 static bool
9587 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9588 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9589 {
9590 rtx op0, op1, op2;
9591 const struct cpu_cost_table *extra_cost
9592 = aarch64_tune_params.insn_extra_cost;
9593 int code = GET_CODE (x);
9594 scalar_int_mode int_mode;
9595
9596 /* By default, assume that everything has equivalent cost to the
9597 cheapest instruction. Any additional costs are applied as a delta
9598 above this default. */
9599 *cost = COSTS_N_INSNS (1);
9600
9601 switch (code)
9602 {
9603 case SET:
9604 /* The cost depends entirely on the operands to SET. */
9605 *cost = 0;
9606 op0 = SET_DEST (x);
9607 op1 = SET_SRC (x);
9608
9609 switch (GET_CODE (op0))
9610 {
9611 case MEM:
9612 if (speed)
9613 {
9614 rtx address = XEXP (op0, 0);
9615 if (VECTOR_MODE_P (mode))
9616 *cost += extra_cost->ldst.storev;
9617 else if (GET_MODE_CLASS (mode) == MODE_INT)
9618 *cost += extra_cost->ldst.store;
9619 else if (mode == SFmode)
9620 *cost += extra_cost->ldst.storef;
9621 else if (mode == DFmode)
9622 *cost += extra_cost->ldst.stored;
9623
9624 *cost +=
9625 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9626 0, speed));
9627 }
9628
9629 *cost += rtx_cost (op1, mode, SET, 1, speed);
9630 return true;
9631
9632 case SUBREG:
9633 if (! REG_P (SUBREG_REG (op0)))
9634 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9635
9636 /* Fall through. */
9637 case REG:
9638 /* The cost is one per vector-register copied. */
9639 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9640 {
9641 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9642 *cost = COSTS_N_INSNS (nregs);
9643 }
9644 /* const0_rtx is in general free, but we will use an
9645 instruction to set a register to 0. */
9646 else if (REG_P (op1) || op1 == const0_rtx)
9647 {
9648 /* The cost is 1 per register copied. */
9649 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9650 *cost = COSTS_N_INSNS (nregs);
9651 }
9652 else
9653 /* Cost is just the cost of the RHS of the set. */
9654 *cost += rtx_cost (op1, mode, SET, 1, speed);
9655 return true;
9656
9657 case ZERO_EXTRACT:
9658 case SIGN_EXTRACT:
9659 /* Bit-field insertion. Strip any redundant widening of
9660 the RHS to meet the width of the target. */
9661 if (GET_CODE (op1) == SUBREG)
9662 op1 = SUBREG_REG (op1);
9663 if ((GET_CODE (op1) == ZERO_EXTEND
9664 || GET_CODE (op1) == SIGN_EXTEND)
9665 && CONST_INT_P (XEXP (op0, 1))
9666 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9667 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9668 op1 = XEXP (op1, 0);
9669
9670 if (CONST_INT_P (op1))
9671 {
9672 /* MOV immediate is assumed to always be cheap. */
9673 *cost = COSTS_N_INSNS (1);
9674 }
9675 else
9676 {
9677 /* BFM. */
9678 if (speed)
9679 *cost += extra_cost->alu.bfi;
9680 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9681 }
9682
9683 return true;
9684
9685 default:
9686 /* We can't make sense of this, assume default cost. */
9687 *cost = COSTS_N_INSNS (1);
9688 return false;
9689 }
9690 return false;
9691
9692 case CONST_INT:
9693 /* If an instruction can incorporate a constant within the
9694 instruction, the instruction's expression avoids calling
9695 rtx_cost() on the constant. If rtx_cost() is called on a
9696 constant, then it is usually because the constant must be
9697 moved into a register by one or more instructions.
9698
9699 The exception is constant 0, which can be expressed
9700 as XZR/WZR and is therefore free. The caveat is that if we
9701 have (set (reg) (const0_rtx)), we must cost the move itself.
9702 However, we can catch that when we cost the SET, so
9703 we don't need to consider it here. */
9704 if (x == const0_rtx)
9705 *cost = 0;
9706 else
9707 {
9708 /* To an approximation, building any other constant is
9709 proportionally expensive to the number of instructions
9710 required to build that constant. This is true whether we
9711 are compiling for SPEED or otherwise. */
9712 if (!is_a <scalar_int_mode> (mode, &int_mode))
9713 int_mode = word_mode;
9714 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9715 (NULL_RTX, x, false, int_mode));
9716 }
9717 return true;
9718
9719 case CONST_DOUBLE:
9720
9721 /* First determine number of instructions to do the move
9722 as an integer constant. */
9723 if (!aarch64_float_const_representable_p (x)
9724 && !aarch64_can_const_movi_rtx_p (x, mode)
9725 && aarch64_float_const_rtx_p (x))
9726 {
9727 unsigned HOST_WIDE_INT ival;
9728 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9729 gcc_assert (succeed);
9730
9731 scalar_int_mode imode = (mode == HFmode
9732 ? SImode
9733 : int_mode_for_mode (mode).require ());
9734 int ncost = aarch64_internal_mov_immediate
9735 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9736 *cost += COSTS_N_INSNS (ncost);
9737 return true;
9738 }
9739
9740 if (speed)
9741 {
9742 /* mov[df,sf]_aarch64. */
9743 if (aarch64_float_const_representable_p (x))
9744 /* FMOV (scalar immediate). */
9745 *cost += extra_cost->fp[mode == DFmode].fpconst;
9746 else if (!aarch64_float_const_zero_rtx_p (x))
9747 {
9748 /* This will be a load from memory. */
9749 if (mode == DFmode)
9750 *cost += extra_cost->ldst.loadd;
9751 else
9752 *cost += extra_cost->ldst.loadf;
9753 }
9754 else
9755 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9756 or MOV v0.s[0], wzr - neither of which is modeled by the
9757 cost tables. Just use the default cost. */
9758 {
9759 }
9760 }
9761
9762 return true;
9763
9764 case MEM:
9765 if (speed)
9766 {
9767 /* For loads we want the base cost of a load, plus an
9768 approximation for the additional cost of the addressing
9769 mode. */
9770 rtx address = XEXP (x, 0);
9771 if (VECTOR_MODE_P (mode))
9772 *cost += extra_cost->ldst.loadv;
9773 else if (GET_MODE_CLASS (mode) == MODE_INT)
9774 *cost += extra_cost->ldst.load;
9775 else if (mode == SFmode)
9776 *cost += extra_cost->ldst.loadf;
9777 else if (mode == DFmode)
9778 *cost += extra_cost->ldst.loadd;
9779
9780 *cost +=
9781 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9782 0, speed));
9783 }
9784
9785 return true;
9786
9787 case NEG:
9788 op0 = XEXP (x, 0);
9789
9790 if (VECTOR_MODE_P (mode))
9791 {
9792 if (speed)
9793 {
9794 /* FNEG. */
9795 *cost += extra_cost->vect.alu;
9796 }
9797 return false;
9798 }
9799
9800 if (GET_MODE_CLASS (mode) == MODE_INT)
9801 {
9802 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9803 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9804 {
9805 /* CSETM. */
9806 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9807 return true;
9808 }
9809
9810 /* Cost this as SUB wzr, X. */
9811 op0 = CONST0_RTX (mode);
9812 op1 = XEXP (x, 0);
9813 goto cost_minus;
9814 }
9815
9816 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9817 {
9818 /* Support (neg(fma...)) as a single instruction only if
9819 sign of zeros is unimportant. This matches the decision
9820 making in aarch64.md. */
9821 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9822 {
9823 /* FNMADD. */
9824 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9825 return true;
9826 }
9827 if (GET_CODE (op0) == MULT)
9828 {
9829 /* FNMUL. */
9830 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9831 return true;
9832 }
9833 if (speed)
9834 /* FNEG. */
9835 *cost += extra_cost->fp[mode == DFmode].neg;
9836 return false;
9837 }
9838
9839 return false;
9840
9841 case CLRSB:
9842 case CLZ:
9843 if (speed)
9844 {
9845 if (VECTOR_MODE_P (mode))
9846 *cost += extra_cost->vect.alu;
9847 else
9848 *cost += extra_cost->alu.clz;
9849 }
9850
9851 return false;
9852
9853 case COMPARE:
9854 op0 = XEXP (x, 0);
9855 op1 = XEXP (x, 1);
9856
9857 if (op1 == const0_rtx
9858 && GET_CODE (op0) == AND)
9859 {
9860 x = op0;
9861 mode = GET_MODE (op0);
9862 goto cost_logic;
9863 }
9864
9865 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9866 {
9867 /* TODO: A write to the CC flags possibly costs extra, this
9868 needs encoding in the cost tables. */
9869
9870 mode = GET_MODE (op0);
9871 /* ANDS. */
9872 if (GET_CODE (op0) == AND)
9873 {
9874 x = op0;
9875 goto cost_logic;
9876 }
9877
9878 if (GET_CODE (op0) == PLUS)
9879 {
9880 /* ADDS (and CMN alias). */
9881 x = op0;
9882 goto cost_plus;
9883 }
9884
9885 if (GET_CODE (op0) == MINUS)
9886 {
9887 /* SUBS. */
9888 x = op0;
9889 goto cost_minus;
9890 }
9891
9892 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9893 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9894 && CONST_INT_P (XEXP (op0, 2)))
9895 {
9896 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9897 Handle it here directly rather than going to cost_logic
9898 since we know the immediate generated for the TST is valid
9899 so we can avoid creating an intermediate rtx for it only
9900 for costing purposes. */
9901 if (speed)
9902 *cost += extra_cost->alu.logical;
9903
9904 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9905 ZERO_EXTRACT, 0, speed);
9906 return true;
9907 }
9908
9909 if (GET_CODE (op1) == NEG)
9910 {
9911 /* CMN. */
9912 if (speed)
9913 *cost += extra_cost->alu.arith;
9914
9915 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9916 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9917 return true;
9918 }
9919
9920 /* CMP.
9921
9922 Compare can freely swap the order of operands, and
9923 canonicalization puts the more complex operation first.
9924 But the integer MINUS logic expects the shift/extend
9925 operation in op1. */
9926 if (! (REG_P (op0)
9927 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9928 {
9929 op0 = XEXP (x, 1);
9930 op1 = XEXP (x, 0);
9931 }
9932 goto cost_minus;
9933 }
9934
9935 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9936 {
9937 /* FCMP. */
9938 if (speed)
9939 *cost += extra_cost->fp[mode == DFmode].compare;
9940
9941 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9942 {
9943 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9944 /* FCMP supports constant 0.0 for no extra cost. */
9945 return true;
9946 }
9947 return false;
9948 }
9949
9950 if (VECTOR_MODE_P (mode))
9951 {
9952 /* Vector compare. */
9953 if (speed)
9954 *cost += extra_cost->vect.alu;
9955
9956 if (aarch64_float_const_zero_rtx_p (op1))
9957 {
9958 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9959 cost. */
9960 return true;
9961 }
9962 return false;
9963 }
9964 return false;
9965
9966 case MINUS:
9967 {
9968 op0 = XEXP (x, 0);
9969 op1 = XEXP (x, 1);
9970
9971 cost_minus:
9972 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9973
9974 /* Detect valid immediates. */
9975 if ((GET_MODE_CLASS (mode) == MODE_INT
9976 || (GET_MODE_CLASS (mode) == MODE_CC
9977 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9978 && CONST_INT_P (op1)
9979 && aarch64_uimm12_shift (INTVAL (op1)))
9980 {
9981 if (speed)
9982 /* SUB(S) (immediate). */
9983 *cost += extra_cost->alu.arith;
9984 return true;
9985 }
9986
9987 /* Look for SUB (extended register). */
9988 if (is_a <scalar_int_mode> (mode, &int_mode)
9989 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9990 {
9991 if (speed)
9992 *cost += extra_cost->alu.extend_arith;
9993
9994 op1 = aarch64_strip_extend (op1, true);
9995 *cost += rtx_cost (op1, VOIDmode,
9996 (enum rtx_code) GET_CODE (op1), 0, speed);
9997 return true;
9998 }
9999
10000 rtx new_op1 = aarch64_strip_extend (op1, false);
10001
10002 /* Cost this as an FMA-alike operation. */
10003 if ((GET_CODE (new_op1) == MULT
10004 || aarch64_shift_p (GET_CODE (new_op1)))
10005 && code != COMPARE)
10006 {
10007 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10008 (enum rtx_code) code,
10009 speed);
10010 return true;
10011 }
10012
10013 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10014
10015 if (speed)
10016 {
10017 if (VECTOR_MODE_P (mode))
10018 {
10019 /* Vector SUB. */
10020 *cost += extra_cost->vect.alu;
10021 }
10022 else if (GET_MODE_CLASS (mode) == MODE_INT)
10023 {
10024 /* SUB(S). */
10025 *cost += extra_cost->alu.arith;
10026 }
10027 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10028 {
10029 /* FSUB. */
10030 *cost += extra_cost->fp[mode == DFmode].addsub;
10031 }
10032 }
10033 return true;
10034 }
10035
10036 case PLUS:
10037 {
10038 rtx new_op0;
10039
10040 op0 = XEXP (x, 0);
10041 op1 = XEXP (x, 1);
10042
10043 cost_plus:
10044 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10045 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10046 {
10047 /* CSINC. */
10048 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10049 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10050 return true;
10051 }
10052
10053 if (GET_MODE_CLASS (mode) == MODE_INT
10054 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10055 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10056 {
10057 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10058
10059 if (speed)
10060 /* ADD (immediate). */
10061 *cost += extra_cost->alu.arith;
10062 return true;
10063 }
10064
10065 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10066
10067 /* Look for ADD (extended register). */
10068 if (is_a <scalar_int_mode> (mode, &int_mode)
10069 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10070 {
10071 if (speed)
10072 *cost += extra_cost->alu.extend_arith;
10073
10074 op0 = aarch64_strip_extend (op0, true);
10075 *cost += rtx_cost (op0, VOIDmode,
10076 (enum rtx_code) GET_CODE (op0), 0, speed);
10077 return true;
10078 }
10079
10080 /* Strip any extend, leave shifts behind as we will
10081 cost them through mult_cost. */
10082 new_op0 = aarch64_strip_extend (op0, false);
10083
10084 if (GET_CODE (new_op0) == MULT
10085 || aarch64_shift_p (GET_CODE (new_op0)))
10086 {
10087 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10088 speed);
10089 return true;
10090 }
10091
10092 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10093
10094 if (speed)
10095 {
10096 if (VECTOR_MODE_P (mode))
10097 {
10098 /* Vector ADD. */
10099 *cost += extra_cost->vect.alu;
10100 }
10101 else if (GET_MODE_CLASS (mode) == MODE_INT)
10102 {
10103 /* ADD. */
10104 *cost += extra_cost->alu.arith;
10105 }
10106 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10107 {
10108 /* FADD. */
10109 *cost += extra_cost->fp[mode == DFmode].addsub;
10110 }
10111 }
10112 return true;
10113 }
10114
10115 case BSWAP:
10116 *cost = COSTS_N_INSNS (1);
10117
10118 if (speed)
10119 {
10120 if (VECTOR_MODE_P (mode))
10121 *cost += extra_cost->vect.alu;
10122 else
10123 *cost += extra_cost->alu.rev;
10124 }
10125 return false;
10126
10127 case IOR:
10128 if (aarch_rev16_p (x))
10129 {
10130 *cost = COSTS_N_INSNS (1);
10131
10132 if (speed)
10133 {
10134 if (VECTOR_MODE_P (mode))
10135 *cost += extra_cost->vect.alu;
10136 else
10137 *cost += extra_cost->alu.rev;
10138 }
10139 return true;
10140 }
10141
10142 if (aarch64_extr_rtx_p (x, &op0, &op1))
10143 {
10144 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10145 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10146 if (speed)
10147 *cost += extra_cost->alu.shift;
10148
10149 return true;
10150 }
10151 /* Fall through. */
10152 case XOR:
10153 case AND:
10154 cost_logic:
10155 op0 = XEXP (x, 0);
10156 op1 = XEXP (x, 1);
10157
10158 if (VECTOR_MODE_P (mode))
10159 {
10160 if (speed)
10161 *cost += extra_cost->vect.alu;
10162 return true;
10163 }
10164
10165 if (code == AND
10166 && GET_CODE (op0) == MULT
10167 && CONST_INT_P (XEXP (op0, 1))
10168 && CONST_INT_P (op1)
10169 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10170 INTVAL (op1)) != 0)
10171 {
10172 /* This is a UBFM/SBFM. */
10173 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10174 if (speed)
10175 *cost += extra_cost->alu.bfx;
10176 return true;
10177 }
10178
10179 if (is_int_mode (mode, &int_mode))
10180 {
10181 if (CONST_INT_P (op1))
10182 {
10183 /* We have a mask + shift version of a UBFIZ
10184 i.e. the *andim_ashift<mode>_bfiz pattern. */
10185 if (GET_CODE (op0) == ASHIFT
10186 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10187 XEXP (op0, 1)))
10188 {
10189 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10190 (enum rtx_code) code, 0, speed);
10191 if (speed)
10192 *cost += extra_cost->alu.bfx;
10193
10194 return true;
10195 }
10196 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10197 {
10198 /* We possibly get the immediate for free; this is not
10199 modelled. */
10200 *cost += rtx_cost (op0, int_mode,
10201 (enum rtx_code) code, 0, speed);
10202 if (speed)
10203 *cost += extra_cost->alu.logical;
10204
10205 return true;
10206 }
10207 }
10208 else
10209 {
10210 rtx new_op0 = op0;
10211
10212 /* Handle ORN, EON, or BIC. */
10213 if (GET_CODE (op0) == NOT)
10214 op0 = XEXP (op0, 0);
10215
10216 new_op0 = aarch64_strip_shift (op0);
10217
10218 /* If we had a shift on op0 then this is a logical-shift-
10219 by-register/immediate operation. Otherwise, this is just
10220 a logical operation. */
10221 if (speed)
10222 {
10223 if (new_op0 != op0)
10224 {
10225 /* Shift by immediate. */
10226 if (CONST_INT_P (XEXP (op0, 1)))
10227 *cost += extra_cost->alu.log_shift;
10228 else
10229 *cost += extra_cost->alu.log_shift_reg;
10230 }
10231 else
10232 *cost += extra_cost->alu.logical;
10233 }
10234
10235 /* In both cases we want to cost both operands. */
10236 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10237 0, speed);
10238 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10239 1, speed);
10240
10241 return true;
10242 }
10243 }
10244 return false;
10245
10246 case NOT:
10247 x = XEXP (x, 0);
10248 op0 = aarch64_strip_shift (x);
10249
10250 if (VECTOR_MODE_P (mode))
10251 {
10252 /* Vector NOT. */
10253 *cost += extra_cost->vect.alu;
10254 return false;
10255 }
10256
10257 /* MVN-shifted-reg. */
10258 if (op0 != x)
10259 {
10260 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10261
10262 if (speed)
10263 *cost += extra_cost->alu.log_shift;
10264
10265 return true;
10266 }
10267 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10268 Handle the second form here taking care that 'a' in the above can
10269 be a shift. */
10270 else if (GET_CODE (op0) == XOR)
10271 {
10272 rtx newop0 = XEXP (op0, 0);
10273 rtx newop1 = XEXP (op0, 1);
10274 rtx op0_stripped = aarch64_strip_shift (newop0);
10275
10276 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10277 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10278
10279 if (speed)
10280 {
10281 if (op0_stripped != newop0)
10282 *cost += extra_cost->alu.log_shift;
10283 else
10284 *cost += extra_cost->alu.logical;
10285 }
10286
10287 return true;
10288 }
10289 /* MVN. */
10290 if (speed)
10291 *cost += extra_cost->alu.logical;
10292
10293 return false;
10294
10295 case ZERO_EXTEND:
10296
10297 op0 = XEXP (x, 0);
10298 /* If a value is written in SI mode and then zero-extended to DI
10299 mode, the operation will in general be free, as a write to
10300 a 'w' register implicitly zeroes the upper bits of an 'x'
10301 register. However, if this is
10302
10303 (set (reg) (zero_extend (reg)))
10304
10305 we must cost the explicit register move. */
10306 if (mode == DImode
10307 && GET_MODE (op0) == SImode
10308 && outer == SET)
10309 {
10310 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10311
10312 /* If OP_COST is non-zero, then the cost of the zero extend
10313 is effectively the cost of the inner operation. Otherwise
10314 we have a MOV instruction and we take the cost from the MOV
10315 itself. This is true independently of whether we are
10316 optimizing for space or time. */
10317 if (op_cost)
10318 *cost = op_cost;
10319
10320 return true;
10321 }
10322 else if (MEM_P (op0))
10323 {
10324 /* All loads can zero extend to any size for free. */
10325 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10326 return true;
10327 }
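	  /* Illustrative example (not from the original comment): a narrow
	     load such as

	       ldrb	w0, [x1]

	     already clears the upper bits of the destination register, so
	     widening the loaded value needs no extra instruction.  */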
10328
10329 op0 = aarch64_extend_bitfield_pattern_p (x);
10330 if (op0)
10331 {
10332 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10333 if (speed)
10334 *cost += extra_cost->alu.bfx;
10335 return true;
10336 }
10337
10338 if (speed)
10339 {
10340 if (VECTOR_MODE_P (mode))
10341 {
10342 /* UMOV. */
10343 *cost += extra_cost->vect.alu;
10344 }
10345 else
10346 {
10347 /* We generate an AND instead of UXTB/UXTH. */
10348 *cost += extra_cost->alu.logical;
10349 }
10350 }
10351 return false;
10352
10353 case SIGN_EXTEND:
10354 if (MEM_P (XEXP (x, 0)))
10355 {
10356 /* LDRSH. */
10357 if (speed)
10358 {
10359 rtx address = XEXP (XEXP (x, 0), 0);
10360 *cost += extra_cost->ldst.load_sign_extend;
10361
10362 *cost +=
10363 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10364 0, speed));
10365 }
10366 return true;
10367 }
10368
10369 op0 = aarch64_extend_bitfield_pattern_p (x);
10370 if (op0)
10371 {
10372 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10373 if (speed)
10374 *cost += extra_cost->alu.bfx;
10375 return true;
10376 }
10377
10378 if (speed)
10379 {
10380 if (VECTOR_MODE_P (mode))
10381 *cost += extra_cost->vect.alu;
10382 else
10383 *cost += extra_cost->alu.extend;
10384 }
10385 return false;
10386
10387 case ASHIFT:
10388 op0 = XEXP (x, 0);
10389 op1 = XEXP (x, 1);
10390
10391 if (CONST_INT_P (op1))
10392 {
10393 if (speed)
10394 {
10395 if (VECTOR_MODE_P (mode))
10396 {
10397 /* Vector shift (immediate). */
10398 *cost += extra_cost->vect.alu;
10399 }
10400 else
10401 {
10402 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10403 aliases. */
10404 *cost += extra_cost->alu.shift;
10405 }
10406 }
10407
10408 /* We can incorporate zero/sign extend for free. */
10409 if (GET_CODE (op0) == ZERO_EXTEND
10410 || GET_CODE (op0) == SIGN_EXTEND)
10411 op0 = XEXP (op0, 0);
10412
10413 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10414 return true;
10415 }
10416 else
10417 {
10418 if (VECTOR_MODE_P (mode))
10419 {
10420 if (speed)
10421 /* Vector shift (register). */
10422 *cost += extra_cost->vect.alu;
10423 }
10424 else
10425 {
10426 if (speed)
10427 /* LSLV. */
10428 *cost += extra_cost->alu.shift_reg;
10429
10430 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10431 && CONST_INT_P (XEXP (op1, 1))
10432 && known_eq (INTVAL (XEXP (op1, 1)),
10433 GET_MODE_BITSIZE (mode) - 1))
10434 {
10435 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10436 /* We already demanded XEXP (op1, 0) to be REG_P, so
10437 don't recurse into it. */
10438 return true;
10439 }
10440 }
10441 return false; /* All arguments need to be in registers. */
10442 }
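	  /* Sketch of the masked-shift case above: a 32-bit variable shift
	     only uses the low five bits of the count, so a count written as
	     (and:SI (reg) (const_int 31)) needs no separate AND instruction
	     and only OP0 is costed.  */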
10443
10444 case ROTATE:
10445 case ROTATERT:
10446 case LSHIFTRT:
10447 case ASHIFTRT:
10448 op0 = XEXP (x, 0);
10449 op1 = XEXP (x, 1);
10450
10451 if (CONST_INT_P (op1))
10452 {
10453 /* ASR (immediate) and friends. */
10454 if (speed)
10455 {
10456 if (VECTOR_MODE_P (mode))
10457 *cost += extra_cost->vect.alu;
10458 else
10459 *cost += extra_cost->alu.shift;
10460 }
10461
10462 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10463 return true;
10464 }
10465 else
10466 {
10467 if (VECTOR_MODE_P (mode))
10468 {
10469 if (speed)
10470 /* Vector shift (register). */
10471 *cost += extra_cost->vect.alu;
10472 }
10473 else
10474 {
10475 if (speed)
10476 /* ASR (register) and friends. */
10477 *cost += extra_cost->alu.shift_reg;
10478
10479 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10480 && CONST_INT_P (XEXP (op1, 1))
10481 && known_eq (INTVAL (XEXP (op1, 1)),
10482 GET_MODE_BITSIZE (mode) - 1))
10483 {
10484 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10485 /* We already demanded XEXP (op1, 0) to be REG_P, so
10486 don't recurse into it. */
10487 return true;
10488 }
10489 }
10490 return false; /* All arguments need to be in registers. */
10491 }
10492
10493 case SYMBOL_REF:
10494
10495 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10496 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10497 {
10498 /* LDR. */
10499 if (speed)
10500 *cost += extra_cost->ldst.load;
10501 }
10502 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10503 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10504 {
10505 /* ADRP, followed by ADD. */
10506 *cost += COSTS_N_INSNS (1);
10507 if (speed)
10508 *cost += 2 * extra_cost->alu.arith;
10509 }
10510 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10511 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10512 {
10513 /* ADR. */
10514 if (speed)
10515 *cost += extra_cost->alu.arith;
10516 }
10517
10518 if (flag_pic)
10519 {
10520 /* One extra load instruction, after accessing the GOT. */
10521 *cost += COSTS_N_INSNS (1);
10522 if (speed)
10523 *cost += extra_cost->ldst.load;
10524 }
10525 return true;
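	/* For reference, typical materialization sequences costed above for
	   a symbol "sym" are roughly:

	     tiny:	adr	x0, sym
	     small:	adrp	x0, sym
			add	x0, x0, :lo12:sym
	     large:	ldr	x0, =sym	(literal-pool load)

	   Illustrative only; the exact sequence depends on the code model
	   and on whether PIC/GOT accesses are needed.  */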
10526
10527 case HIGH:
10528 case LO_SUM:
10529 /* ADRP/ADD (immediate). */
10530 if (speed)
10531 *cost += extra_cost->alu.arith;
10532 return true;
10533
10534 case ZERO_EXTRACT:
10535 case SIGN_EXTRACT:
10536 /* UBFX/SBFX. */
10537 if (speed)
10538 {
10539 if (VECTOR_MODE_P (mode))
10540 *cost += extra_cost->vect.alu;
10541 else
10542 *cost += extra_cost->alu.bfx;
10543 }
10544
10545 /* We can trust that the immediates used will be correct (there
10546 are no by-register forms), so we need only cost op0. */
10547 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10548 return true;
10549
10550 case MULT:
10551 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10552 /* aarch64_rtx_mult_cost always handles recursion to its
10553 operands. */
10554 return true;
10555
10556 case MOD:
10557 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10558 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10559 an unconditional negate. This case should only ever be reached through
10560 the set_smod_pow2_cheap check in expmed.c. */
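	 /* As an illustration (assuming a SImode "x % 4"), the expansion
	    referred to above is along the lines of:

	      negs	w1, w0
	      and	w0, w0, #3
	      and	w1, w1, #3
	      csneg	w0, w0, w1, mi

	    i.e. one NEGS, two ANDs and one CSNEG, matching the four
	    instructions costed here.  */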
10561 if (CONST_INT_P (XEXP (x, 1))
10562 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10563 && (mode == SImode || mode == DImode))
10564 {
10565 /* We expand to 4 instructions. Reset the baseline. */
10566 *cost = COSTS_N_INSNS (4);
10567
10568 if (speed)
10569 *cost += 2 * extra_cost->alu.logical
10570 + 2 * extra_cost->alu.arith;
10571
10572 return true;
10573 }
10574
10575 /* Fall-through. */
10576 case UMOD:
10577 if (speed)
10578 {
10579 /* Slightly prefer UMOD over SMOD. */
10580 if (VECTOR_MODE_P (mode))
10581 *cost += extra_cost->vect.alu;
10582 else if (GET_MODE_CLASS (mode) == MODE_INT)
10583 *cost += (extra_cost->mult[mode == DImode].add
10584 + extra_cost->mult[mode == DImode].idiv
10585 + (code == MOD ? 1 : 0));
10586 }
10587 return false; /* All arguments need to be in registers. */
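	/* The cost above reflects the usual expansion of a modulo into a
	   division followed by a multiply-subtract, for example:

	     sdiv	w2, w0, w1
	     msub	w0, w2, w1, w0

	   hence the sum of the idiv cost and the multiply-add cost.  */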
10588
10589 case DIV:
10590 case UDIV:
10591 case SQRT:
10592 if (speed)
10593 {
10594 if (VECTOR_MODE_P (mode))
10595 *cost += extra_cost->vect.alu;
10596 else if (GET_MODE_CLASS (mode) == MODE_INT)
10597 /* There is no integer SQRT, so only DIV and UDIV can get
10598 here. */
10599 *cost += (extra_cost->mult[mode == DImode].idiv
10600 /* Slightly prefer UDIV over SDIV. */
10601 + (code == DIV ? 1 : 0));
10602 else
10603 *cost += extra_cost->fp[mode == DFmode].div;
10604 }
10605 return false; /* All arguments need to be in registers. */
10606
10607 case IF_THEN_ELSE:
10608 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10609 XEXP (x, 2), cost, speed);
10610
10611 case EQ:
10612 case NE:
10613 case GT:
10614 case GTU:
10615 case LT:
10616 case LTU:
10617 case GE:
10618 case GEU:
10619 case LE:
10620 case LEU:
10621
10622 return false; /* All arguments must be in registers. */
10623
10624 case FMA:
10625 op0 = XEXP (x, 0);
10626 op1 = XEXP (x, 1);
10627 op2 = XEXP (x, 2);
10628
10629 if (speed)
10630 {
10631 if (VECTOR_MODE_P (mode))
10632 *cost += extra_cost->vect.alu;
10633 else
10634 *cost += extra_cost->fp[mode == DFmode].fma;
10635 }
10636
10637 /* FMSUB, FNMADD, and FNMSUB are free. */
10638 if (GET_CODE (op0) == NEG)
10639 op0 = XEXP (op0, 0);
10640
10641 if (GET_CODE (op2) == NEG)
10642 op2 = XEXP (op2, 0);
10643
10644 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10645 and the by-element operand as operand 0. */
10646 if (GET_CODE (op1) == NEG)
10647 op1 = XEXP (op1, 0);
10648
10649 /* Catch vector-by-element operations. The by-element operand can
10650 either be (vec_duplicate (vec_select (x))) or just
10651 (vec_select (x)), depending on whether we are multiplying by
10652 a vector or a scalar.
10653
10654 Canonicalization is not very good in these cases: FMA4 will put the
10655 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10656 if (GET_CODE (op0) == VEC_DUPLICATE)
10657 op0 = XEXP (op0, 0);
10658 else if (GET_CODE (op1) == VEC_DUPLICATE)
10659 op1 = XEXP (op1, 0);
10660
10661 if (GET_CODE (op0) == VEC_SELECT)
10662 op0 = XEXP (op0, 0);
10663 else if (GET_CODE (op1) == VEC_SELECT)
10664 op1 = XEXP (op1, 0);
10665
10666 /* If the remaining parameters are not registers,
10667 get the cost to put them into registers. */
10668 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10669 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10670 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10671 return true;
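	/* For example, (fma (neg a) b c), i.e. c - a * b, maps onto a single
	   FMSUB instruction, which is why stripping the NEGs above does not
	   add anything to the cost.  */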
10672
10673 case FLOAT:
10674 case UNSIGNED_FLOAT:
10675 if (speed)
10676 *cost += extra_cost->fp[mode == DFmode].fromint;
10677 return false;
10678
10679 case FLOAT_EXTEND:
10680 if (speed)
10681 {
10682 if (VECTOR_MODE_P (mode))
10683 {
10684 /* Vector widening conversion. */
10685 *cost += extra_cost->vect.alu;
10686 }
10687 else
10688 *cost += extra_cost->fp[mode == DFmode].widen;
10689 }
10690 return false;
10691
10692 case FLOAT_TRUNCATE:
10693 if (speed)
10694 {
10695 if (VECTOR_MODE_P (mode))
10696 {
10697 /* Vector narrowing conversion. */
10698 *cost += extra_cost->vect.alu;
10699 }
10700 else
10701 *cost += extra_cost->fp[mode == DFmode].narrow;
10702 }
10703 return false;
10704
10705 case FIX:
10706 case UNSIGNED_FIX:
10707 x = XEXP (x, 0);
10708 /* Strip the rounding part. They will all be implemented
10709 by the fcvt* family of instructions anyway. */
10710 if (GET_CODE (x) == UNSPEC)
10711 {
10712 unsigned int uns_code = XINT (x, 1);
10713
10714 if (uns_code == UNSPEC_FRINTA
10715 || uns_code == UNSPEC_FRINTM
10716 || uns_code == UNSPEC_FRINTN
10717 || uns_code == UNSPEC_FRINTP
10718 || uns_code == UNSPEC_FRINTZ)
10719 x = XVECEXP (x, 0, 0);
10720 }
10721
10722 if (speed)
10723 {
10724 if (VECTOR_MODE_P (mode))
10725 *cost += extra_cost->vect.alu;
10726 else
10727 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10728 }
10729
10730 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10731 fixed-point fcvt. */
10732 if (GET_CODE (x) == MULT
10733 && ((VECTOR_MODE_P (mode)
10734 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10735 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10736 {
10737 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10738 0, speed);
10739 return true;
10740 }
10741
10742 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10743 return true;
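	/* For instance, a multiply by 16.0 followed by a conversion to
	   integer can be emitted as one fixed-point conversion such as

	     fcvtzs	w0, s0, #4

	   (4 fractional bits == scale by 2^4), which is why only the inner
	   operand is costed in that case.  */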
10744
10745 case ABS:
10746 if (VECTOR_MODE_P (mode))
10747 {
10748 /* ABS (vector). */
10749 if (speed)
10750 *cost += extra_cost->vect.alu;
10751 }
10752 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10753 {
10754 op0 = XEXP (x, 0);
10755
10756 /* FABD, which is analogous to FADD. */
10757 if (GET_CODE (op0) == MINUS)
10758 {
10759 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10760 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10761 if (speed)
10762 *cost += extra_cost->fp[mode == DFmode].addsub;
10763
10764 return true;
10765 }
10766 /* Simple FABS is analogous to FNEG. */
10767 if (speed)
10768 *cost += extra_cost->fp[mode == DFmode].neg;
10769 }
10770 else
10771 {
10772 /* Integer ABS will either be split into
10773 two arithmetic instructions, or will be an ABS
10774 (scalar), which we don't model. */
10775 *cost = COSTS_N_INSNS (2);
10776 if (speed)
10777 *cost += 2 * extra_cost->alu.arith;
10778 }
10779 return false;
10780
10781 case SMAX:
10782 case SMIN:
10783 if (speed)
10784 {
10785 if (VECTOR_MODE_P (mode))
10786 *cost += extra_cost->vect.alu;
10787 else
10788 {
10789 /* FMAXNM/FMINNM/FMAX/FMIN.
10790 TODO: This may not be accurate for all implementations, but
10791 we do not model this in the cost tables. */
10792 *cost += extra_cost->fp[mode == DFmode].addsub;
10793 }
10794 }
10795 return false;
10796
10797 case UNSPEC:
10798 /* The floating point round to integer frint* instructions. */
10799 if (aarch64_frint_unspec_p (XINT (x, 1)))
10800 {
10801 if (speed)
10802 *cost += extra_cost->fp[mode == DFmode].roundint;
10803
10804 return false;
10805 }
10806
10807 if (XINT (x, 1) == UNSPEC_RBIT)
10808 {
10809 if (speed)
10810 *cost += extra_cost->alu.rev;
10811
10812 return false;
10813 }
10814 break;
10815
10816 case TRUNCATE:
10817
10818 /* Decompose <su>muldi3_highpart. */
10819 if (/* (truncate:DI */
10820 mode == DImode
10821 /* (lshiftrt:TI */
10822 && GET_MODE (XEXP (x, 0)) == TImode
10823 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10824 /* (mult:TI */
10825 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10826 /* (ANY_EXTEND:TI (reg:DI))
10827 (ANY_EXTEND:TI (reg:DI))) */
10828 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10829 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10830 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10831 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10832 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10833 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10834 /* (const_int 64) */
10835 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10836 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10837 {
10838 /* UMULH/SMULH. */
10839 if (speed)
10840 *cost += extra_cost->mult[mode == DImode].extend;
10841 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10842 mode, MULT, 0, speed);
10843 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10844 mode, MULT, 1, speed);
10845 return true;
10846 }
10847
10848 /* Fall through. */
10849 default:
10850 break;
10851 }
10852
10853 if (dump_file
10854 && flag_aarch64_verbose_cost)
10855 fprintf (dump_file,
10856 "\nFailed to cost RTX. Assuming default cost.\n");
10857
10858 return true;
10859 }
10860
10861 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10862 calculated for X. This cost is stored in *COST. Returns true
10863 if the total cost of X was calculated. */
10864 static bool
10865 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10866 int param, int *cost, bool speed)
10867 {
10868 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10869
10870 if (dump_file
10871 && flag_aarch64_verbose_cost)
10872 {
10873 print_rtl_single (dump_file, x);
10874 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10875 speed ? "Hot" : "Cold",
10876 *cost, result ? "final" : "partial");
10877 }
10878
10879 return result;
10880 }
10881
10882 static int
10883 aarch64_register_move_cost (machine_mode mode,
10884 reg_class_t from_i, reg_class_t to_i)
10885 {
10886 enum reg_class from = (enum reg_class) from_i;
10887 enum reg_class to = (enum reg_class) to_i;
10888 const struct cpu_regmove_cost *regmove_cost
10889 = aarch64_tune_params.regmove_cost;
10890
10891 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10892 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
10893 || to == STUB_REGS)
10894 to = GENERAL_REGS;
10895
10896 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
10897 || from == STUB_REGS)
10898 from = GENERAL_REGS;
10899
10900 /* Moving between GPR and stack cost is the same as GP2GP. */
10901 if ((from == GENERAL_REGS && to == STACK_REG)
10902 || (to == GENERAL_REGS && from == STACK_REG))
10903 return regmove_cost->GP2GP;
10904
10905 /* To/From the stack register, we move via the gprs. */
10906 if (to == STACK_REG || from == STACK_REG)
10907 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10908 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10909
10910 if (known_eq (GET_MODE_SIZE (mode), 16))
10911 {
10912 /* 128-bit operations on general registers require 2 instructions. */
10913 if (from == GENERAL_REGS && to == GENERAL_REGS)
10914 return regmove_cost->GP2GP * 2;
10915 else if (from == GENERAL_REGS)
10916 return regmove_cost->GP2FP * 2;
10917 else if (to == GENERAL_REGS)
10918 return regmove_cost->FP2GP * 2;
10919
10920 /* When AdvSIMD instructions are disabled it is not possible to move
10921 a 128-bit value directly between Q registers. This is handled in
10922 secondary reload. A general register is used as a scratch to move
10923 the upper DI value and the lower DI value is moved directly,
10924 hence the cost is the sum of three moves. */
10925 if (! TARGET_SIMD)
10926 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10927
10928 return regmove_cost->FP2FP;
10929 }
10930
10931 if (from == GENERAL_REGS && to == GENERAL_REGS)
10932 return regmove_cost->GP2GP;
10933 else if (from == GENERAL_REGS)
10934 return regmove_cost->GP2FP;
10935 else if (to == GENERAL_REGS)
10936 return regmove_cost->FP2GP;
10937
10938 return regmove_cost->FP2FP;
10939 }
10940
10941 static int
10942 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10943 reg_class_t rclass ATTRIBUTE_UNUSED,
10944 bool in ATTRIBUTE_UNUSED)
10945 {
10946 return aarch64_tune_params.memmov_cost;
10947 }
10948
10949 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10950 to optimize 1.0/sqrt. */
10951
10952 static bool
10953 use_rsqrt_p (machine_mode mode)
10954 {
10955 return (!flag_trapping_math
10956 && flag_unsafe_math_optimizations
10957 && ((aarch64_tune_params.approx_modes->recip_sqrt
10958 & AARCH64_APPROX_MODE (mode))
10959 || flag_mrecip_low_precision_sqrt));
10960 }
10961
10962 /* Function to decide when to use the approximate reciprocal square root
10963 builtin. */
10964
10965 static tree
10966 aarch64_builtin_reciprocal (tree fndecl)
10967 {
10968 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10969
10970 if (!use_rsqrt_p (mode))
10971 return NULL_TREE;
10972 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10973 }
10974
10975 /* Emit instruction sequence to compute either the approximate square root
10976 or its approximate reciprocal, depending on the flag RECP, and return
10977 whether the sequence was emitted or not. */
10978
10979 bool
10980 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10981 {
10982 machine_mode mode = GET_MODE (dst);
10983
10984 if (GET_MODE_INNER (mode) == HFmode)
10985 {
10986 gcc_assert (!recp);
10987 return false;
10988 }
10989
10990 if (!recp)
10991 {
10992 if (!(flag_mlow_precision_sqrt
10993 || (aarch64_tune_params.approx_modes->sqrt
10994 & AARCH64_APPROX_MODE (mode))))
10995 return false;
10996
10997 if (flag_finite_math_only
10998 || flag_trapping_math
10999 || !flag_unsafe_math_optimizations
11000 || optimize_function_for_size_p (cfun))
11001 return false;
11002 }
11003 else
11004 /* Caller assumes we cannot fail. */
11005 gcc_assert (use_rsqrt_p (mode));
11006
11007 machine_mode mmsk = mode_for_int_vector (mode).require ();
11008 rtx xmsk = gen_reg_rtx (mmsk);
11009 if (!recp)
11010 /* When calculating the approximate square root, compare the
11011 argument with 0.0 and create a mask. */
11012 emit_insn (gen_rtx_SET (xmsk,
11013 gen_rtx_NEG (mmsk,
11014 gen_rtx_EQ (mmsk, src,
11015 CONST0_RTX (mode)))));
11016
11017 /* Estimate the approximate reciprocal square root. */
11018 rtx xdst = gen_reg_rtx (mode);
11019 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11020
11021 /* Iterate over the series twice for SF and thrice for DF. */
11022 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11023
11024 /* Optionally iterate over the series once less for faster performance
11025 while sacrificing the accuracy. */
11026 if ((recp && flag_mrecip_low_precision_sqrt)
11027 || (!recp && flag_mlow_precision_sqrt))
11028 iterations--;
11029
11030 /* Iterate over the series to calculate the approximate reciprocal square
11031 root. */
11032 rtx x1 = gen_reg_rtx (mode);
11033 while (iterations--)
11034 {
11035 rtx x2 = gen_reg_rtx (mode);
11036 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11037
11038 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11039
11040 if (iterations > 0)
11041 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11042 }
11043
11044 if (!recp)
11045 {
11046 /* Qualify the approximate reciprocal square root when the argument is
11047 0.0 by squashing the intermediary result to 0.0. */
11048 rtx xtmp = gen_reg_rtx (mmsk);
11049 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11050 gen_rtx_SUBREG (mmsk, xdst, 0)));
11051 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11052
11053 /* Calculate the approximate square root. */
11054 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11055 }
11056
11057 /* Finalize the approximation. */
11058 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11059
11060 return true;
11061 }
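/* A note on the iteration above (a standard Newton-Raphson sketch, stated
   here for reference): each FRSQRTS step refines an estimate x of
   1/sqrt(d) as

     x' = x * (3 - d * x * x) / 2

   roughly doubling the number of correct bits per step, which is why two
   steps are used for SFmode and three for DFmode.  */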
11062
11063 /* Emit the instruction sequence to compute the approximation for the division
11064 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11065
11066 bool
11067 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11068 {
11069 machine_mode mode = GET_MODE (quo);
11070
11071 if (GET_MODE_INNER (mode) == HFmode)
11072 return false;
11073
11074 bool use_approx_division_p = (flag_mlow_precision_div
11075 || (aarch64_tune_params.approx_modes->division
11076 & AARCH64_APPROX_MODE (mode)));
11077
11078 if (!flag_finite_math_only
11079 || flag_trapping_math
11080 || !flag_unsafe_math_optimizations
11081 || optimize_function_for_size_p (cfun)
11082 || !use_approx_division_p)
11083 return false;
11084
11085 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11086 return false;
11087
11088 /* Estimate the approximate reciprocal. */
11089 rtx xrcp = gen_reg_rtx (mode);
11090 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11091
11092 /* Iterate over the series twice for SF and thrice for DF. */
11093 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11094
11095 /* Optionally iterate over the series once less for faster performance,
11096 while sacrificing the accuracy. */
11097 if (flag_mlow_precision_div)
11098 iterations--;
11099
11100 /* Iterate over the series to calculate the approximate reciprocal. */
11101 rtx xtmp = gen_reg_rtx (mode);
11102 while (iterations--)
11103 {
11104 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11105
11106 if (iterations > 0)
11107 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11108 }
11109
11110 if (num != CONST1_RTX (mode))
11111 {
11112 /* As the approximate reciprocal of DEN is already calculated, only
11113 calculate the approximate division when NUM is not 1.0. */
11114 rtx xnum = force_reg (mode, num);
11115 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11116 }
11117
11118 /* Finalize the approximation. */
11119 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11120 return true;
11121 }
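/* Similarly, the FRECPS-based loop above refines an estimate x of 1/den
   with the Newton-Raphson step

     x' = x * (2 - den * x)

   (illustrative formula only), again roughly doubling the precision per
   iteration before the final multiply by the numerator.  */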
11122
11123 /* Return the number of instructions that can be issued per cycle. */
11124 static int
11125 aarch64_sched_issue_rate (void)
11126 {
11127 return aarch64_tune_params.issue_rate;
11128 }
11129
11130 static int
11131 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11132 {
11133 int issue_rate = aarch64_sched_issue_rate ();
11134
11135 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11136 }
11137
11138
11139 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11140 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11141 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11142
11143 static int
11144 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11145 int ready_index)
11146 {
11147 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11148 }
11149
11150
11151 /* Vectorizer cost model target hooks. */
11152
11153 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11154 static int
11155 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11156 tree vectype,
11157 int misalign ATTRIBUTE_UNUSED)
11158 {
11159 unsigned elements;
11160 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11161 bool fp = false;
11162
11163 if (vectype != NULL)
11164 fp = FLOAT_TYPE_P (vectype);
11165
11166 switch (type_of_cost)
11167 {
11168 case scalar_stmt:
11169 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11170
11171 case scalar_load:
11172 return costs->scalar_load_cost;
11173
11174 case scalar_store:
11175 return costs->scalar_store_cost;
11176
11177 case vector_stmt:
11178 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11179
11180 case vector_load:
11181 return costs->vec_align_load_cost;
11182
11183 case vector_store:
11184 return costs->vec_store_cost;
11185
11186 case vec_to_scalar:
11187 return costs->vec_to_scalar_cost;
11188
11189 case scalar_to_vec:
11190 return costs->scalar_to_vec_cost;
11191
11192 case unaligned_load:
11193 case vector_gather_load:
11194 return costs->vec_unalign_load_cost;
11195
11196 case unaligned_store:
11197 case vector_scatter_store:
11198 return costs->vec_unalign_store_cost;
11199
11200 case cond_branch_taken:
11201 return costs->cond_taken_branch_cost;
11202
11203 case cond_branch_not_taken:
11204 return costs->cond_not_taken_branch_cost;
11205
11206 case vec_perm:
11207 return costs->vec_permute_cost;
11208
11209 case vec_promote_demote:
11210 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11211
11212 case vec_construct:
11213 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11214 return elements / 2 + 1;
11215
11216 default:
11217 gcc_unreachable ();
11218 }
11219 }
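/* Worked example for the vec_construct case above: with an estimated
   4-element vector the returned cost is 4 / 2 + 1 = 3 units, and an
   8-element vector costs 5 units.  */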
11220
11221 /* Implement targetm.vectorize.add_stmt_cost. */
11222 static unsigned
11223 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11224 struct _stmt_vec_info *stmt_info, int misalign,
11225 enum vect_cost_model_location where)
11226 {
11227 unsigned *cost = (unsigned *) data;
11228 unsigned retval = 0;
11229
11230 if (flag_vect_cost_model)
11231 {
11232 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11233 int stmt_cost =
11234 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11235
11236 /* Statements in an inner loop relative to the loop being
11237 vectorized are weighted more heavily. The value here is
11238 arbitrary and could potentially be improved with analysis. */
11239 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11240 count *= 50; /* FIXME */
11241
11242 retval = (unsigned) (count * stmt_cost);
11243 cost[where] += retval;
11244 }
11245
11246 return retval;
11247 }
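/* For example, with the factor above a statement whose base cost is 1
   inside an inner loop contributes 50 units to the vect_body bucket.  */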
11248
11249 static void initialize_aarch64_code_model (struct gcc_options *);
11250
11251 /* Parse the TO_PARSE string and put the architecture struct that it
11252 selects into RES and the architectural features into ISA_FLAGS.
11253 Return an aarch64_parse_opt_result describing the parse result.
11254 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11255 When the TO_PARSE string contains an invalid extension,
11256 a copy of the string is created and stored to INVALID_EXTENSION. */
11257
11258 static enum aarch64_parse_opt_result
11259 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11260 unsigned long *isa_flags, std::string *invalid_extension)
11261 {
11262 const char *ext;
11263 const struct processor *arch;
11264 size_t len;
11265
11266 ext = strchr (to_parse, '+');
11267
11268 if (ext != NULL)
11269 len = ext - to_parse;
11270 else
11271 len = strlen (to_parse);
11272
11273 if (len == 0)
11274 return AARCH64_PARSE_MISSING_ARG;
11275
11276
11277 /* Loop through the list of supported ARCHes to find a match. */
11278 for (arch = all_architectures; arch->name != NULL; arch++)
11279 {
11280 if (strlen (arch->name) == len
11281 && strncmp (arch->name, to_parse, len) == 0)
11282 {
11283 unsigned long isa_temp = arch->flags;
11284
11285 if (ext != NULL)
11286 {
11287 /* TO_PARSE string contains at least one extension. */
11288 enum aarch64_parse_opt_result ext_res
11289 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11290
11291 if (ext_res != AARCH64_PARSE_OK)
11292 return ext_res;
11293 }
11294 /* Extension parsing was successful. Confirm the result
11295 arch and ISA flags. */
11296 *res = arch;
11297 *isa_flags = isa_temp;
11298 return AARCH64_PARSE_OK;
11299 }
11300 }
11301
11302 /* ARCH name not found in list. */
11303 return AARCH64_PARSE_INVALID_ARG;
11304 }
11305
11306 /* Parse the TO_PARSE string and put the result tuning in RES and the
11307 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11308 describing the parse result. If there is an error parsing, RES and
11309 ISA_FLAGS are left unchanged.
11310 When the TO_PARSE string contains an invalid extension,
11311 a copy of the string is created and stored to INVALID_EXTENSION. */
11312
11313 static enum aarch64_parse_opt_result
11314 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11315 unsigned long *isa_flags, std::string *invalid_extension)
11316 {
11317 const char *ext;
11318 const struct processor *cpu;
11319 size_t len;
11320
11321 ext = strchr (to_parse, '+');
11322
11323 if (ext != NULL)
11324 len = ext - to_parse;
11325 else
11326 len = strlen (to_parse);
11327
11328 if (len == 0)
11329 return AARCH64_PARSE_MISSING_ARG;
11330
11331
11332 /* Loop through the list of supported CPUs to find a match. */
11333 for (cpu = all_cores; cpu->name != NULL; cpu++)
11334 {
11335 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11336 {
11337 unsigned long isa_temp = cpu->flags;
11338
11339
11340 if (ext != NULL)
11341 {
11342 /* TO_PARSE string contains at least one extension. */
11343 enum aarch64_parse_opt_result ext_res
11344 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11345
11346 if (ext_res != AARCH64_PARSE_OK)
11347 return ext_res;
11348 }
11349 /* Extension parsing was successful. Confirm the result
11350 cpu and ISA flags. */
11351 *res = cpu;
11352 *isa_flags = isa_temp;
11353 return AARCH64_PARSE_OK;
11354 }
11355 }
11356
11357 /* CPU name not found in list. */
11358 return AARCH64_PARSE_INVALID_ARG;
11359 }
11360
11361 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11362 Return an aarch64_parse_opt_result describing the parse result.
11363 If the parsing fails the RES does not change. */
11364
11365 static enum aarch64_parse_opt_result
11366 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11367 {
11368 const struct processor *cpu;
11369
11370 /* Loop through the list of supported CPUs to find a match. */
11371 for (cpu = all_cores; cpu->name != NULL; cpu++)
11372 {
11373 if (strcmp (cpu->name, to_parse) == 0)
11374 {
11375 *res = cpu;
11376 return AARCH64_PARSE_OK;
11377 }
11378 }
11379
11380 /* CPU name not found in list. */
11381 return AARCH64_PARSE_INVALID_ARG;
11382 }
11383
11384 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11385 described in FLAG. If it is, return the index bit for that fusion type.
11386 If not, error (printing OPTION_NAME) and return zero. */
11387
11388 static unsigned int
11389 aarch64_parse_one_option_token (const char *token,
11390 size_t length,
11391 const struct aarch64_flag_desc *flag,
11392 const char *option_name)
11393 {
11394 for (; flag->name != NULL; flag++)
11395 {
11396 if (length == strlen (flag->name)
11397 && !strncmp (flag->name, token, length))
11398 return flag->flag;
11399 }
11400
11401 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11402 return 0;
11403 }
11404
11405 /* Parse OPTION which is a comma-separated list of flags to enable.
11406 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11407 default state we inherit from the CPU tuning structures. OPTION_NAME
11408 gives the top-level option we are parsing in the -moverride string,
11409 for use in error messages. */
11410
11411 static unsigned int
11412 aarch64_parse_boolean_options (const char *option,
11413 const struct aarch64_flag_desc *flags,
11414 unsigned int initial_state,
11415 const char *option_name)
11416 {
11417 const char separator = '.';
11418 const char* specs = option;
11419 const char* ntoken = option;
11420 unsigned int found_flags = initial_state;
11421
11422 while ((ntoken = strchr (specs, separator)))
11423 {
11424 size_t token_length = ntoken - specs;
11425 unsigned token_ops = aarch64_parse_one_option_token (specs,
11426 token_length,
11427 flags,
11428 option_name);
11429 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11430 in the token stream, reset the supported operations. So:
11431
11432 adrp+add.cmp+branch.none.adrp+add
11433
11434 would have the result of turning on only adrp+add fusion. */
11435 if (!token_ops)
11436 found_flags = 0;
11437
11438 found_flags |= token_ops;
11439 specs = ++ntoken;
11440 }
11441
11442 /* If the string ended with a trailing separator, it is ill-formed. */
11443 if (!(*specs))
11444 {
11445 error ("%s string ill-formed\n", option_name);
11446 return 0;
11447 }
11448
11449 /* We still have one more token to parse. */
11450 size_t token_length = strlen (specs);
11451 unsigned token_ops = aarch64_parse_one_option_token (specs,
11452 token_length,
11453 flags,
11454 option_name);
11455 if (!token_ops)
11456 found_flags = 0;
11457
11458 found_flags |= token_ops;
11459 return found_flags;
11460 }
11461
11462 /* Support for overriding instruction fusion. */
11463
11464 static void
11465 aarch64_parse_fuse_string (const char *fuse_string,
11466 struct tune_params *tune)
11467 {
11468 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11469 aarch64_fusible_pairs,
11470 tune->fusible_ops,
11471 "fuse=");
11472 }
11473
11474 /* Support for overriding other tuning flags. */
11475
11476 static void
11477 aarch64_parse_tune_string (const char *tune_string,
11478 struct tune_params *tune)
11479 {
11480 tune->extra_tuning_flags
11481 = aarch64_parse_boolean_options (tune_string,
11482 aarch64_tuning_flags,
11483 tune->extra_tuning_flags,
11484 "tune=");
11485 }
11486
11487 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
11488 Accept the valid SVE vector widths allowed by
11489 aarch64_sve_vector_bits_enum and use it to override sve_width
11490 in TUNE. */
11491
11492 static void
11493 aarch64_parse_sve_width_string (const char *tune_string,
11494 struct tune_params *tune)
11495 {
11496 int width = -1;
11497
11498 int n = sscanf (tune_string, "%d", &width);
11499 if (n == EOF)
11500 {
11501 error ("invalid format for sve_width");
11502 return;
11503 }
11504 switch (width)
11505 {
11506 case SVE_128:
11507 case SVE_256:
11508 case SVE_512:
11509 case SVE_1024:
11510 case SVE_2048:
11511 break;
11512 default:
11513 error ("invalid sve_width value: %d", width);
11514 }
11515 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11516 }
11517
11518 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11519 we understand. If it is, extract the option string and hand it off to
11520 the appropriate function. */
11521
11522 void
11523 aarch64_parse_one_override_token (const char* token,
11524 size_t length,
11525 struct tune_params *tune)
11526 {
11527 const struct aarch64_tuning_override_function *fn
11528 = aarch64_tuning_override_functions;
11529
11530 const char *option_part = strchr (token, '=');
11531 if (!option_part)
11532 {
11533 error ("tuning string missing in option (%s)", token);
11534 return;
11535 }
11536
11537 /* Get the length of the option name. */
11538 length = option_part - token;
11539 /* Skip the '=' to get to the option string. */
11540 option_part++;
11541
11542 for (; fn->name != NULL; fn++)
11543 {
11544 if (!strncmp (fn->name, token, length))
11545 {
11546 fn->parse_override (option_part, tune);
11547 return;
11548 }
11549 }
11550
11551 error ("unknown tuning option (%s)",token);
11552 return;
11553 }
11554
11555 /* Validate and clamp the TLS size according to the code model in OPTS. */
11556
11557 static void
11558 initialize_aarch64_tls_size (struct gcc_options *opts)
11559 {
11560 if (aarch64_tls_size == 0)
11561 aarch64_tls_size = 24;
11562
11563 switch (opts->x_aarch64_cmodel_var)
11564 {
11565 case AARCH64_CMODEL_TINY:
11566 /* Both the default and maximum TLS size allowed under tiny are 1M, which
11567 needs two instructions to address, so we clamp the size to 24. */
11568 if (aarch64_tls_size > 24)
11569 aarch64_tls_size = 24;
11570 break;
11571 case AARCH64_CMODEL_SMALL:
11572 /* The maximum TLS size allowed under small is 4G. */
11573 if (aarch64_tls_size > 32)
11574 aarch64_tls_size = 32;
11575 break;
11576 case AARCH64_CMODEL_LARGE:
11577 /* The maximum TLS size allowed under large is 16E.
11578 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
11579 if (aarch64_tls_size > 48)
11580 aarch64_tls_size = 48;
11581 break;
11582 default:
11583 gcc_unreachable ();
11584 }
11585
11586 return;
11587 }
11588
11589 /* Parse STRING looking for options in the format:
11590 string :: option:string
11591 option :: name=substring
11592 name :: {a-z}
11593 substring :: defined by option. */
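/* For example (using option names handled elsewhere in this file), a
   string such as

     fuse=adrp+add.cmp+branch:sve_width=256

   is split on ':' into two options, each of which is then handed to its
   parser below.  This is only an illustrative sketch of the grammar, not
   an exhaustive list of the accepted options.  */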
11594
11595 static void
11596 aarch64_parse_override_string (const char* input_string,
11597 struct tune_params* tune)
11598 {
11599 const char separator = ':';
11600 size_t string_length = strlen (input_string) + 1;
11601 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11602 char *string = string_root;
11603 strncpy (string, input_string, string_length);
11604 string[string_length - 1] = '\0';
11605
11606 char* ntoken = string;
11607
11608 while ((ntoken = strchr (string, separator)))
11609 {
11610 size_t token_length = ntoken - string;
11611 /* Make this substring look like a string. */
11612 *ntoken = '\0';
11613 aarch64_parse_one_override_token (string, token_length, tune);
11614 string = ++ntoken;
11615 }
11616
11617 /* One last option to parse. */
11618 aarch64_parse_one_override_token (string, strlen (string), tune);
11619 free (string_root);
11620 }
11621
11622
11623 static void
11624 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11625 {
11626 if (accepted_branch_protection_string)
11627 {
11628 opts->x_aarch64_branch_protection_string
11629 = xstrdup (accepted_branch_protection_string);
11630 }
11631
11632 /* PR 70044: We have to be careful about being called multiple times for the
11633 same function. This means all changes should be repeatable. */
11634
11635 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11636 Disable the frame pointer flag so the mid-end will not use a frame
11637 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11638 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11639 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11640 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11641 if (opts->x_flag_omit_frame_pointer == 0)
11642 opts->x_flag_omit_frame_pointer = 2;
11643
11644 /* If not optimizing for size, set the default
11645 alignment to what the target wants. */
11646 if (!opts->x_optimize_size)
11647 {
11648 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11649 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11650 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11651 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11652 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11653 opts->x_str_align_functions = aarch64_tune_params.function_align;
11654 }
11655
11656 /* We default to no pc-relative literal loads. */
11657
11658 aarch64_pcrelative_literal_loads = false;
11659
11660 /* If -mpc-relative-literal-loads is set on the command line, this
11661 implies that the user asked for PC relative literal loads. */
11662 if (opts->x_pcrelative_literal_loads == 1)
11663 aarch64_pcrelative_literal_loads = true;
11664
11665 /* In the tiny memory model it makes no sense to disallow PC relative
11666 literal pool loads. */
11667 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11668 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11669 aarch64_pcrelative_literal_loads = true;
11670
11671 /* When enabling the lower precision Newton series for the square root, also
11672 enable it for the reciprocal square root, since the latter is an
11673 intermediary step for the former. */
11674 if (flag_mlow_precision_sqrt)
11675 flag_mrecip_low_precision_sqrt = true;
11676 }
11677
11678 /* 'Unpack' the internal tuning structs and update the options
11679 in OPTS. The caller must have set up selected_tune and selected_arch
11680 as all the other target-specific codegen decisions are
11681 derived from them. */
11682
11683 void
11684 aarch64_override_options_internal (struct gcc_options *opts)
11685 {
11686 aarch64_tune_flags = selected_tune->flags;
11687 aarch64_tune = selected_tune->sched_core;
11688 /* Make a copy of the tuning parameters attached to the core, which
11689 we may later overwrite. */
11690 aarch64_tune_params = *(selected_tune->tune);
11691 aarch64_architecture_version = selected_arch->architecture_version;
11692
11693 if (opts->x_aarch64_override_tune_string)
11694 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11695 &aarch64_tune_params);
11696
11697 /* This target defaults to strict volatile bitfields. */
11698 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11699 opts->x_flag_strict_volatile_bitfields = 1;
11700
11701 if (aarch64_stack_protector_guard == SSP_GLOBAL
11702 && opts->x_aarch64_stack_protector_guard_offset_str)
11703 {
11704 error ("incompatible options %<-mstack-protector-guard=global%> and "
11705 "%<-mstack-protector-guard-offset=%s%>",
11706 aarch64_stack_protector_guard_offset_str);
11707 }
11708
11709 if (aarch64_stack_protector_guard == SSP_SYSREG
11710 && !(opts->x_aarch64_stack_protector_guard_offset_str
11711 && opts->x_aarch64_stack_protector_guard_reg_str))
11712 {
11713 error ("both %<-mstack-protector-guard-offset%> and "
11714 "%<-mstack-protector-guard-reg%> must be used "
11715 "with %<-mstack-protector-guard=sysreg%>");
11716 }
11717
11718 if (opts->x_aarch64_stack_protector_guard_reg_str)
11719 {
11720 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11721 error ("specify a system register with a small string length.");
11722 }
11723
11724 if (opts->x_aarch64_stack_protector_guard_offset_str)
11725 {
11726 char *end;
11727 const char *str = aarch64_stack_protector_guard_offset_str;
11728 errno = 0;
11729 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11730 if (!*str || *end || errno)
11731 error ("%qs is not a valid offset in %qs", str,
11732 "-mstack-protector-guard-offset=");
11733 aarch64_stack_protector_guard_offset = offs;
11734 }
11735
11736 initialize_aarch64_code_model (opts);
11737 initialize_aarch64_tls_size (opts);
11738
11739 int queue_depth = 0;
11740 switch (aarch64_tune_params.autoprefetcher_model)
11741 {
11742 case tune_params::AUTOPREFETCHER_OFF:
11743 queue_depth = -1;
11744 break;
11745 case tune_params::AUTOPREFETCHER_WEAK:
11746 queue_depth = 0;
11747 break;
11748 case tune_params::AUTOPREFETCHER_STRONG:
11749 queue_depth = max_insn_queue_index + 1;
11750 break;
11751 default:
11752 gcc_unreachable ();
11753 }
11754
11755 /* We don't mind passing in global_options_set here as we don't use
11756 the *options_set structs anyway. */
11757 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11758 queue_depth,
11759 opts->x_param_values,
11760 global_options_set.x_param_values);
11761
11762 /* Set up parameters to be used in prefetching algorithm. Do not
11763 override the defaults unless we are tuning for a core we have
11764 researched values for. */
11765 if (aarch64_tune_params.prefetch->num_slots > 0)
11766 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11767 aarch64_tune_params.prefetch->num_slots,
11768 opts->x_param_values,
11769 global_options_set.x_param_values);
11770 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11771 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11772 aarch64_tune_params.prefetch->l1_cache_size,
11773 opts->x_param_values,
11774 global_options_set.x_param_values);
11775 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11776 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11777 aarch64_tune_params.prefetch->l1_cache_line_size,
11778 opts->x_param_values,
11779 global_options_set.x_param_values);
11780 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11781 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11782 aarch64_tune_params.prefetch->l2_cache_size,
11783 opts->x_param_values,
11784 global_options_set.x_param_values);
11785 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11786 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11787 0,
11788 opts->x_param_values,
11789 global_options_set.x_param_values);
11790 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11791 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11792 aarch64_tune_params.prefetch->minimum_stride,
11793 opts->x_param_values,
11794 global_options_set.x_param_values);
11795
11796 /* Use the alternative scheduling-pressure algorithm by default. */
11797 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11798 opts->x_param_values,
11799 global_options_set.x_param_values);
11800
11801 /* If the user hasn't changed it via configure then set the default to 64 KB
11802 for the backend. */
11803 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11804 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11805 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11806 opts->x_param_values,
11807 global_options_set.x_param_values);
11808
11809 /* Validate the guard size. */
11810 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11811
11812 /* Enforce that interval is the same size as size so the mid-end does the
11813 right thing. */
11814 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11815 guard_size,
11816 opts->x_param_values,
11817 global_options_set.x_param_values);
11818
11819 /* The maybe_set calls won't update the value if the user has explicitly set
11820 one. Which means we need to validate that probing interval and guard size
11821 are equal. */
11822 int probe_interval
11823 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11824 if (guard_size != probe_interval)
11825 error ("stack clash guard size %<%d%> must be equal to probing interval "
11826 "%<%d%>", guard_size, probe_interval);
11827
11828 /* Enable sw prefetching at specified optimization level for
11829 CPUS that have prefetch. Lower optimization level threshold by 1
11830 when profiling is enabled. */
11831 if (opts->x_flag_prefetch_loop_arrays < 0
11832 && !opts->x_optimize_size
11833 && aarch64_tune_params.prefetch->default_opt_level >= 0
11834 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11835 opts->x_flag_prefetch_loop_arrays = 1;
11836
11837 if (opts->x_aarch64_arch_string == NULL)
11838 opts->x_aarch64_arch_string = selected_arch->name;
11839 if (opts->x_aarch64_cpu_string == NULL)
11840 opts->x_aarch64_cpu_string = selected_cpu->name;
11841 if (opts->x_aarch64_tune_string == NULL)
11842 opts->x_aarch64_tune_string = selected_tune->name;
11843
11844 aarch64_override_options_after_change_1 (opts);
11845 }
11846
11847 /* Print a hint with a suggestion for a core or architecture name that
11848 most closely resembles what the user passed in STR. ARCH is true if
11849 the user is asking for an architecture name. ARCH is false if the user
11850 is asking for a core name. */
11851
11852 static void
11853 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11854 {
11855 auto_vec<const char *> candidates;
11856 const struct processor *entry = arch ? all_architectures : all_cores;
11857 for (; entry->name != NULL; entry++)
11858 candidates.safe_push (entry->name);
11859
11860 #ifdef HAVE_LOCAL_CPU_DETECT
11861 /* Add also "native" as possible value. */
11862 if (arch)
11863 candidates.safe_push ("native");
11864 #endif
11865
11866 char *s;
11867 const char *hint = candidates_list_and_hint (str, s, candidates);
11868 if (hint)
11869 inform (input_location, "valid arguments are: %s;"
11870 " did you mean %qs?", s, hint);
11871 else
11872 inform (input_location, "valid arguments are: %s", s);
11873
11874 XDELETEVEC (s);
11875 }
11876
11877 /* Print a hint with a suggestion for a core name that most closely resembles
11878 what the user passed in STR. */
11879
11880 inline static void
11881 aarch64_print_hint_for_core (const char *str)
11882 {
11883 aarch64_print_hint_for_core_or_arch (str, false);
11884 }
11885
11886 /* Print a hint with a suggestion for an architecture name that most closely
11887 resembles what the user passed in STR. */
11888
11889 inline static void
11890 aarch64_print_hint_for_arch (const char *str)
11891 {
11892 aarch64_print_hint_for_core_or_arch (str, true);
11893 }
11894
11895
11896 /* Print a hint with a suggestion for an extension name
11897 that most closely resembles what the user passed in STR. */
11898
11899 void
11900 aarch64_print_hint_for_extensions (const std::string &str)
11901 {
11902 auto_vec<const char *> candidates;
11903 aarch64_get_all_extension_candidates (&candidates);
11904 char *s;
11905 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11906 if (hint)
11907 inform (input_location, "valid arguments are: %s;"
11908 " did you mean %qs?", s, hint);
11909 else
11910 inform (input_location, "valid arguments are: %s", s);
11911
11912 XDELETEVEC (s);
11913 }
11914
11915 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11916 specified in STR and throw errors if appropriate. Put the results if
11917 they are valid in RES and ISA_FLAGS. Return whether the option is
11918 valid. */
11919
11920 static bool
11921 aarch64_validate_mcpu (const char *str, const struct processor **res,
11922 unsigned long *isa_flags)
11923 {
11924 std::string invalid_extension;
11925 enum aarch64_parse_opt_result parse_res
11926 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11927
11928 if (parse_res == AARCH64_PARSE_OK)
11929 return true;
11930
11931 switch (parse_res)
11932 {
11933 case AARCH64_PARSE_MISSING_ARG:
11934 error ("missing cpu name in %<-mcpu=%s%>", str);
11935 break;
11936 case AARCH64_PARSE_INVALID_ARG:
11937 error ("unknown value %qs for %<-mcpu%>", str);
11938 aarch64_print_hint_for_core (str);
11939 break;
11940 case AARCH64_PARSE_INVALID_FEATURE:
11941 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11942 invalid_extension.c_str (), str);
11943 aarch64_print_hint_for_extensions (invalid_extension);
11944 break;
11945 default:
11946 gcc_unreachable ();
11947 }
11948
11949 return false;
11950 }
11951
11952 /* Straight line speculation indicators. */
11953 enum aarch64_sls_hardening_type
11954 {
11955 SLS_NONE = 0,
11956 SLS_RETBR = 1,
11957 SLS_BLR = 2,
11958 SLS_ALL = 3,
11959 };
11960 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
11961
11962 /* Return whether we should mitigate Straight Line Speculation for the RET
11963 and BR instructions. */
11964 bool
11965 aarch64_harden_sls_retbr_p (void)
11966 {
11967 return aarch64_sls_hardening & SLS_RETBR;
11968 }
11969
11970 /* Return whether we should mitigate Straight Line Speculation for the BLR
11971 instruction. */
11972 bool
11973 aarch64_harden_sls_blr_p (void)
11974 {
11975 return aarch64_sls_hardening & SLS_BLR;
11976 }
11977
11978 /* For now we only allow setting these options globally; in the future we
11979 may allow setting them per function. */
11980 static void
11981 aarch64_validate_sls_mitigation (const char *const_str)
11982 {
11983 char *token_save = NULL;
11984 char *str = NULL;
11985
11986 if (strcmp (const_str, "none") == 0)
11987 {
11988 aarch64_sls_hardening = SLS_NONE;
11989 return;
11990 }
11991 if (strcmp (const_str, "all") == 0)
11992 {
11993 aarch64_sls_hardening = SLS_ALL;
11994 return;
11995 }
11996
11997 char *str_root = xstrdup (const_str);
11998 str = strtok_r (str_root, ",", &token_save);
11999 if (!str)
12000 error ("invalid argument given to %<-mharden-sls=%>");
12001
12002 int temp = SLS_NONE;
12003 while (str)
12004 {
12005 if (strcmp (str, "blr") == 0)
12006 temp |= SLS_BLR;
12007 else if (strcmp (str, "retbr") == 0)
12008 temp |= SLS_RETBR;
12009 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
12010 {
12011 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
12012 break;
12013 }
12014 else
12015 {
12016 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
12017 break;
12018 }
12019 str = strtok_r (NULL, ",", &token_save);
12020 }
12021 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
12022 free (str_root);
12023 }
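/* For example, "-mharden-sls=retbr,blr" enables both mitigations and is
   equivalent to "-mharden-sls=all"; "none" and "all" must be given on
   their own.  (Illustrative summary of the parsing above.)  */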
12024
12025 /* Parses CONST_STR for branch protection features specified in
12026 aarch64_branch_protect_types, and sets any global variables required. Returns
12027 the parsing result and assigns LAST_STR to the last processed token from
12028 CONST_STR so that it can be used for error reporting. */
12029
12030 static enum
12031 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12032 char** last_str)
12033 {
12034 char *str_root = xstrdup (const_str);
12035 char* token_save = NULL;
12036 char *str = strtok_r (str_root, "+", &token_save);
12037 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12038 if (!str)
12039 res = AARCH64_PARSE_MISSING_ARG;
12040 else
12041 {
12042 char *next_str = strtok_r (NULL, "+", &token_save);
12043 /* Reset the branch protection features to their defaults. */
12044 aarch64_handle_no_branch_protection (NULL, NULL);
12045
12046 while (str && res == AARCH64_PARSE_OK)
12047 {
12048 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12049 bool found = false;
12050 /* Search for this type. */
12051 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12052 {
12053 if (strcmp (str, type->name) == 0)
12054 {
12055 found = true;
12056 res = type->handler (str, next_str);
12057 str = next_str;
12058 next_str = strtok_r (NULL, "+", &token_save);
12059 }
12060 else
12061 type++;
12062 }
12063 if (found && res == AARCH64_PARSE_OK)
12064 {
12065 bool found_subtype = true;
12066 /* Loop through each token until we find one that isn't a
12067 subtype. */
12068 while (found_subtype)
12069 {
12070 found_subtype = false;
12071 const aarch64_branch_protect_type *subtype = type->subtypes;
12072 /* Search for the subtype. */
12073 while (str && subtype && subtype->name && !found_subtype
12074 && res == AARCH64_PARSE_OK)
12075 {
12076 if (strcmp (str, subtype->name) == 0)
12077 {
12078 found_subtype = true;
12079 res = subtype->handler (str, next_str);
12080 str = next_str;
12081 next_str = strtok_r (NULL, "+", &token_save);
12082 }
12083 else
12084 subtype++;
12085 }
12086 }
12087 }
12088 else if (!found)
12089 res = AARCH64_PARSE_INVALID_ARG;
12090 }
12091 }
12092 /* Copy the last processed token into the argument to pass it back.
12093 Used by option and attribute validation to print the offending token. */
12094 if (last_str)
12095 {
12096 if (str) strcpy (*last_str, str);
12097 else *last_str = NULL;
12098 }
12099 if (res == AARCH64_PARSE_OK)
12100 {
12101 /* If needed, alloc the accepted string then copy in const_str.
12102 Used by override_option_after_change_1. */
12103 if (!accepted_branch_protection_string)
12104 accepted_branch_protection_string = (char *) xmalloc (
12105 BRANCH_PROTECT_STR_MAX
12106 + 1);
12107 strncpy (accepted_branch_protection_string, const_str,
12108 BRANCH_PROTECT_STR_MAX + 1);
12109 /* Forcibly null-terminate. */
12110 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12111 }
12112 return res;
12113 }
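
/* The string accepted above is a '+'-separated list in which each token is
   either a top-level type from aarch64_branch_protect_types or a subtype of
   the type that immediately precedes it.  For example, assuming the usual
   entries in that table, "pac-ret+leaf" runs the "pac-ret" handler and then
   its "leaf" subtype handler, whereas an unrecognised token makes the parse
   return AARCH64_PARSE_INVALID_ARG with the token reported via LAST_STR.  */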
12114
12115 static bool
12116 aarch64_validate_mbranch_protection (const char *const_str)
12117 {
12118 char *str = (char *) xmalloc (strlen (const_str) + 1);
12119 enum aarch64_parse_opt_result res =
12120 aarch64_parse_branch_protection (const_str, &str);
12121 if (res == AARCH64_PARSE_INVALID_ARG)
12122 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
12123 else if (res == AARCH64_PARSE_MISSING_ARG)
12124 error ("missing arg for %<-mbranch-protection=%>");
12125 free (str);
12126 return res == AARCH64_PARSE_OK;
12127 }
12128
12129 /* Validate a command-line -march option. Parse the arch and extensions
12130 (if any) specified in STR and throw errors if appropriate. Put the
12131 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12132 option is valid. */
12133
12134 static bool
12135 aarch64_validate_march (const char *str, const struct processor **res,
12136 unsigned long *isa_flags)
12137 {
12138 std::string invalid_extension;
12139 enum aarch64_parse_opt_result parse_res
12140 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12141
12142 if (parse_res == AARCH64_PARSE_OK)
12143 return true;
12144
12145 switch (parse_res)
12146 {
12147 case AARCH64_PARSE_MISSING_ARG:
12148 error ("missing arch name in %<-march=%s%>", str);
12149 break;
12150 case AARCH64_PARSE_INVALID_ARG:
12151 error ("unknown value %qs for %<-march%>", str);
12152 aarch64_print_hint_for_arch (str);
12153 break;
12154 case AARCH64_PARSE_INVALID_FEATURE:
12155 error ("invalid feature modifier %qs in %<-march=%s%>",
12156 invalid_extension.c_str (), str);
12157 aarch64_print_hint_for_extensions (invalid_extension);
12158 break;
12159 default:
12160 gcc_unreachable ();
12161 }
12162
12163 return false;
12164 }
12165
12166 /* Validate a command-line -mtune option. Parse the cpu
12167 specified in STR and throw errors if appropriate. Put the
12168 result, if it is valid, in RES. Return whether the option is
12169 valid. */
12170
12171 static bool
12172 aarch64_validate_mtune (const char *str, const struct processor **res)
12173 {
12174 enum aarch64_parse_opt_result parse_res
12175 = aarch64_parse_tune (str, res);
12176
12177 if (parse_res == AARCH64_PARSE_OK)
12178 return true;
12179
12180 switch (parse_res)
12181 {
12182 case AARCH64_PARSE_MISSING_ARG:
12183 error ("missing cpu name in %<-mtune=%s%>", str);
12184 break;
12185 case AARCH64_PARSE_INVALID_ARG:
12186 error ("unknown value %qs for %<-mtune%>", str);
12187 aarch64_print_hint_for_core (str);
12188 break;
12189 default:
12190 gcc_unreachable ();
12191 }
12192 return false;
12193 }
12194
12195 /* Return the CPU corresponding to the enum CPU.
12196 If it doesn't specify a cpu, return the default. */
12197
12198 static const struct processor *
12199 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12200 {
12201 if (cpu != aarch64_none)
12202 return &all_cores[cpu];
12203
12204 /* The & 0x3f is to extract the bottom 6 bits that encode the
12205 default cpu as selected by the --with-cpu GCC configure option
12206 in config.gcc.
12207 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12208 flags mechanism should be reworked to make it more sane. */
12209 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12210 }
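
/* A sketch of the TARGET_CPU_DEFAULT encoding relied on above and in
   aarch64_override_options: the bottom 6 bits carry the configure-time CPU
   identifier and the remaining bits carry that CPU's default ISA flags:

     default cpu       = TARGET_CPU_DEFAULT & 0x3f
     default isa flags = TARGET_CPU_DEFAULT >> 6

   (compare the "TARGET_CPU_DEFAULT >> 6" use when neither -mcpu nor -march
   is given).  */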
12211
12212 /* Return the architecture corresponding to the enum ARCH.
12213 If it doesn't specify a valid architecture, return the default. */
12214
12215 static const struct processor *
12216 aarch64_get_arch (enum aarch64_arch arch)
12217 {
12218 if (arch != aarch64_no_arch)
12219 return &all_architectures[arch];
12220
12221 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12222
12223 return &all_architectures[cpu->arch];
12224 }
12225
12226 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12227
12228 static poly_uint16
12229 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12230 {
12231 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12232 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12233 deciding which .md file patterns to use and when deciding whether
12234 something is a legitimate address or constant. */
12235 if (value == SVE_SCALABLE || value == SVE_128)
12236 return poly_uint16 (2, 2);
12237 else
12238 return (int) value / 64;
12239 }
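
/* Examples of the conversion above, where VG is the number of 64-bit
   granules in an SVE vector and the enumerators carry the selected bit
   count as their integer value (as the division implies):

     -msve-vector-bits=scalable -> poly_uint16 (2, 2)  (length-agnostic)
     -msve-vector-bits=128      -> poly_uint16 (2, 2)  (length-agnostic, see above)
     -msve-vector-bits=256      -> 4
     -msve-vector-bits=512      -> 8  */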
12240
12241 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12242 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12243 tuning structs. In particular it must set selected_tune and
12244 aarch64_isa_flags that define the available ISA features and tuning
12245 decisions. It must also set selected_arch as this will be used to
12246 output the .arch asm tags for each function. */
12247
12248 static void
12249 aarch64_override_options (void)
12250 {
12251 unsigned long cpu_isa = 0;
12252 unsigned long arch_isa = 0;
12253 aarch64_isa_flags = 0;
12254
12255 bool valid_cpu = true;
12256 bool valid_tune = true;
12257 bool valid_arch = true;
12258
12259 selected_cpu = NULL;
12260 selected_arch = NULL;
12261 selected_tune = NULL;
12262
12263 if (aarch64_harden_sls_string)
12264 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
12265
12266 if (aarch64_branch_protection_string)
12267 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12268
12269 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12270 If either of -march or -mtune is given, they override their
12271 respective component of -mcpu. */
12272 if (aarch64_cpu_string)
12273 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12274 &cpu_isa);
12275
12276 if (aarch64_arch_string)
12277 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12278 &arch_isa);
12279
12280 if (aarch64_tune_string)
12281 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12282
12283 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12284 SUBTARGET_OVERRIDE_OPTIONS;
12285 #endif
12286
12287 /* If the user did not specify a processor, choose the default
12288 one for them. This will be the CPU set during configuration using
12289 --with-cpu, otherwise it is "generic". */
12290 if (!selected_cpu)
12291 {
12292 if (selected_arch)
12293 {
12294 selected_cpu = &all_cores[selected_arch->ident];
12295 aarch64_isa_flags = arch_isa;
12296 explicit_arch = selected_arch->arch;
12297 }
12298 else
12299 {
12300 /* Get default configure-time CPU. */
12301 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12302 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12303 }
12304
12305 if (selected_tune)
12306 explicit_tune_core = selected_tune->ident;
12307 }
12308 /* If both -mcpu and -march are specified check that they are architecturally
12309 compatible, warn if they're not and prefer the -march ISA flags. */
12310 else if (selected_arch)
12311 {
12312 if (selected_arch->arch != selected_cpu->arch)
12313 {
12314 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12315 all_architectures[selected_cpu->arch].name,
12316 selected_arch->name);
12317 }
12318 aarch64_isa_flags = arch_isa;
12319 explicit_arch = selected_arch->arch;
12320 explicit_tune_core = selected_tune ? selected_tune->ident
12321 : selected_cpu->ident;
12322 }
12323 else
12324 {
12325 /* -mcpu but no -march. */
12326 aarch64_isa_flags = cpu_isa;
12327 explicit_tune_core = selected_tune ? selected_tune->ident
12328 : selected_cpu->ident;
12329 gcc_assert (selected_cpu);
12330 selected_arch = &all_architectures[selected_cpu->arch];
12331 explicit_arch = selected_arch->arch;
12332 }
12333
12334 /* Set the arch as well, as we will need it when outputting
12335 the .arch directive in assembly. */
12336 if (!selected_arch)
12337 {
12338 gcc_assert (selected_cpu);
12339 selected_arch = &all_architectures[selected_cpu->arch];
12340 }
12341
12342 if (!selected_tune)
12343 selected_tune = selected_cpu;
12344
12345 if (aarch64_enable_bti == 2)
12346 {
12347 #ifdef TARGET_ENABLE_BTI
12348 aarch64_enable_bti = 1;
12349 #else
12350 aarch64_enable_bti = 0;
12351 #endif
12352 }
12353
12354 /* Return address signing is currently not supported for ILP32 targets. For
12355 LP64 targets use the configured option in the absence of a command-line
12356 option for -mbranch-protection. */
12357 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12358 {
12359 #ifdef TARGET_ENABLE_PAC_RET
12360 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12361 #else
12362 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12363 #endif
12364 }
12365
12366 #ifndef HAVE_AS_MABI_OPTION
12367 /* The compiler may have been configured with 2.23.* binutils, which does
12368 not have support for ILP32. */
12369 if (TARGET_ILP32)
12370 error ("assembler does not support %<-mabi=ilp32%>");
12371 #endif
12372
12373 /* Convert -msve-vector-bits to a VG count. */
12374 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12375
12376 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12377 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12378
12379 /* Make sure we properly set up the explicit options. */
12380 if ((aarch64_cpu_string && valid_cpu)
12381 || (aarch64_tune_string && valid_tune))
12382 gcc_assert (explicit_tune_core != aarch64_none);
12383
12384 if ((aarch64_cpu_string && valid_cpu)
12385 || (aarch64_arch_string && valid_arch))
12386 gcc_assert (explicit_arch != aarch64_no_arch);
12387
12388 /* The pass to insert speculation tracking runs before
12389 shrink-wrapping and the latter does not know how to update the
12390 tracking status. So disable it in this case. */
12391 if (aarch64_track_speculation)
12392 flag_shrink_wrap = 0;
12393
12394 aarch64_override_options_internal (&global_options);
12395
12396 /* Save these options as the default ones in case we push and pop them later
12397 while processing functions with potential target attributes. */
12398 target_option_default_node = target_option_current_node
12399 = build_target_option_node (&global_options);
12400 }
12401
12402 /* Implement targetm.override_options_after_change. */
12403
12404 static void
12405 aarch64_override_options_after_change (void)
12406 {
12407 aarch64_override_options_after_change_1 (&global_options);
12408 }
12409
12410 static struct machine_function *
12411 aarch64_init_machine_status (void)
12412 {
12413 struct machine_function *machine;
12414 machine = ggc_cleared_alloc<machine_function> ();
12415 return machine;
12416 }
12417
12418 void
12419 aarch64_init_expanders (void)
12420 {
12421 init_machine_status = aarch64_init_machine_status;
12422 }
12423
12424 /* Resolve the code model to use from OPTS, diagnosing combinations of
code model and PIC options that are not supported. */
12425 static void
12426 initialize_aarch64_code_model (struct gcc_options *opts)
12427 {
12428 if (opts->x_flag_pic)
12429 {
12430 switch (opts->x_aarch64_cmodel_var)
12431 {
12432 case AARCH64_CMODEL_TINY:
12433 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12434 break;
12435 case AARCH64_CMODEL_SMALL:
12436 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12437 aarch64_cmodel = (flag_pic == 2
12438 ? AARCH64_CMODEL_SMALL_PIC
12439 : AARCH64_CMODEL_SMALL_SPIC);
12440 #else
12441 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12442 #endif
12443 break;
12444 case AARCH64_CMODEL_LARGE:
12445 sorry ("code model %qs with %<-f%s%>", "large",
12446 opts->x_flag_pic > 1 ? "PIC" : "pic");
12447 break;
12448 default:
12449 gcc_unreachable ();
12450 }
12451 }
12452 else
12453 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12454 }
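
/* For example, with the small code model the PIC variants selected above are:

     -fpic (flag_pic == 1) -> AARCH64_CMODEL_SMALL_SPIC (small GOT)
     -fPIC (flag_pic == 2) -> AARCH64_CMODEL_SMALL_PIC

   when the assembler supports the small PIC relocations
   (HAVE_AS_SMALL_PIC_RELOCS); otherwise both map to
   AARCH64_CMODEL_SMALL_PIC.  */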
12455
12456 /* Implement TARGET_OPTION_SAVE. */
12457
12458 static void
12459 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12460 {
12461 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12462 ptr->x_aarch64_branch_protection_string
12463 = opts->x_aarch64_branch_protection_string;
12464 }
12465
12466 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12467 using the information saved in PTR. */
12468
12469 static void
12470 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12471 {
12472 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12473 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12474 opts->x_explicit_arch = ptr->x_explicit_arch;
12475 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12476 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12477 opts->x_aarch64_branch_protection_string
12478 = ptr->x_aarch64_branch_protection_string;
12479 if (opts->x_aarch64_branch_protection_string)
12480 {
12481 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12482 NULL);
12483 }
12484
12485 aarch64_override_options_internal (opts);
12486 }
12487
12488 /* Implement TARGET_OPTION_PRINT. */
12489
12490 static void
12491 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12492 {
12493 const struct processor *cpu
12494 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12495 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12496 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12497 std::string extension
12498 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12499
12500 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12501 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12502 arch->name, extension.c_str ());
12503 }
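
/* Hypothetical output from the hook above (the exact arch and extension
   strings depend on the saved options):

     selected tune = cortex-a57
     selected arch = armv8-a+crc

   where the extension suffix is whatever
   aarch64_get_extension_string_for_isa_flags returns for the saved ISA
   flags.  */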
12504
12505 static GTY(()) tree aarch64_previous_fndecl;
12506
12507 void
12508 aarch64_reset_previous_fndecl (void)
12509 {
12510 aarch64_previous_fndecl = NULL;
12511 }
12512
12513 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12514 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12515 make sure optab availability predicates are recomputed when necessary. */
12516
12517 void
12518 aarch64_save_restore_target_globals (tree new_tree)
12519 {
12520 if (TREE_TARGET_GLOBALS (new_tree))
12521 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12522 else if (new_tree == target_option_default_node)
12523 restore_target_globals (&default_target_globals);
12524 else
12525 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12526 }
12527
12528 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12529 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12530 of the function, if such exists. This function may be called multiple
12531 times on a single function so use aarch64_previous_fndecl to avoid
12532 setting up identical state. */
12533
12534 static void
12535 aarch64_set_current_function (tree fndecl)
12536 {
12537 if (!fndecl || fndecl == aarch64_previous_fndecl)
12538 return;
12539
12540 tree old_tree = (aarch64_previous_fndecl
12541 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12542 : NULL_TREE);
12543
12544 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12545
12546 /* If current function has no attributes but the previous one did,
12547 use the default node. */
12548 if (!new_tree && old_tree)
12549 new_tree = target_option_default_node;
12550
12551 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12552 the default have been handled by aarch64_save_restore_target_globals from
12553 aarch64_pragma_target_parse. */
12554 if (old_tree == new_tree)
12555 return;
12556
12557 aarch64_previous_fndecl = fndecl;
12558
12559 /* First set the target options. */
12560 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12561
12562 aarch64_save_restore_target_globals (new_tree);
12563 }
12564
12565 /* Enum describing the various ways we can handle attributes.
12566 In many cases we can reuse the generic option handling machinery. */
12567
12568 enum aarch64_attr_opt_type
12569 {
12570 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12571 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12572 aarch64_attr_enum, /* Attribute sets an enum variable. */
12573 aarch64_attr_custom /* Attribute requires a custom handling function. */
12574 };
12575
12576 /* All the information needed to handle a target attribute.
12577 NAME is the name of the attribute.
12578 ATTR_TYPE specifies the type of behavior of the attribute as described
12579 in the definition of enum aarch64_attr_opt_type.
12580 ALLOW_NEG is true if the attribute supports a "no-" form.
12581 HANDLER is the function that takes the attribute string as an argument.
12582 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12583 OPT_NUM is the enum specifying the option that the attribute modifies.
12584 This is needed for attributes that mirror the behavior of a command-line
12585 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12586 aarch64_attr_enum. */
12587
12588 struct aarch64_attribute_info
12589 {
12590 const char *name;
12591 enum aarch64_attr_opt_type attr_type;
12592 bool allow_neg;
12593 bool (*handler) (const char *);
12594 enum opt_code opt_num;
12595 };
12596
12597 /* Handle the ARCH_STR argument to the arch= target attribute. */
12598
12599 static bool
12600 aarch64_handle_attr_arch (const char *str)
12601 {
12602 const struct processor *tmp_arch = NULL;
12603 std::string invalid_extension;
12604 enum aarch64_parse_opt_result parse_res
12605 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12606
12607 if (parse_res == AARCH64_PARSE_OK)
12608 {
12609 gcc_assert (tmp_arch);
12610 selected_arch = tmp_arch;
12611 explicit_arch = selected_arch->arch;
12612 return true;
12613 }
12614
12615 switch (parse_res)
12616 {
12617 case AARCH64_PARSE_MISSING_ARG:
12618 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12619 break;
12620 case AARCH64_PARSE_INVALID_ARG:
12621 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12622 aarch64_print_hint_for_arch (str);
12623 break;
12624 case AARCH64_PARSE_INVALID_FEATURE:
12625 error ("invalid feature modifier %s of value (\"%s\") in "
12626 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12627 aarch64_print_hint_for_extensions (invalid_extension);
12628 break;
12629 default:
12630 gcc_unreachable ();
12631 }
12632
12633 return false;
12634 }
12635
12636 /* Handle the argument CPU_STR to the cpu= target attribute. */
12637
12638 static bool
12639 aarch64_handle_attr_cpu (const char *str)
12640 {
12641 const struct processor *tmp_cpu = NULL;
12642 std::string invalid_extension;
12643 enum aarch64_parse_opt_result parse_res
12644 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12645
12646 if (parse_res == AARCH64_PARSE_OK)
12647 {
12648 gcc_assert (tmp_cpu);
12649 selected_tune = tmp_cpu;
12650 explicit_tune_core = selected_tune->ident;
12651
12652 selected_arch = &all_architectures[tmp_cpu->arch];
12653 explicit_arch = selected_arch->arch;
12654 return true;
12655 }
12656
12657 switch (parse_res)
12658 {
12659 case AARCH64_PARSE_MISSING_ARG:
12660 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12661 break;
12662 case AARCH64_PARSE_INVALID_ARG:
12663 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12664 aarch64_print_hint_for_core (str);
12665 break;
12666 case AARCH64_PARSE_INVALID_FEATURE:
12667 error ("invalid feature modifier %s of value (\"%s\") in "
12668 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12669 aarch64_print_hint_for_extensions (invalid_extension);
12670 break;
12671 default:
12672 gcc_unreachable ();
12673 }
12674
12675 return false;
12676 }
12677
12678 /* Handle the argument STR to the branch-protection= attribute. */
12679
12680 static bool
12681 aarch64_handle_attr_branch_protection (const char* str)
12682 {
12683 char *err_str = (char *) xmalloc (strlen (str) + 1);
12684 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12685 &err_str);
12686 bool success = false;
12687 switch (res)
12688 {
12689 case AARCH64_PARSE_MISSING_ARG:
12690 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12691 " attribute");
12692 break;
12693 case AARCH64_PARSE_INVALID_ARG:
12694 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12695 "=\")%> pragma or attribute", err_str);
12696 break;
12697 case AARCH64_PARSE_OK:
12698 success = true;
12699 /* Fall through. */
12700 case AARCH64_PARSE_INVALID_FEATURE:
12701 break;
12702 default:
12703 gcc_unreachable ();
12704 }
12705 free (err_str);
12706 return success;
12707 }
12708
12709 /* Handle the argument STR to the tune= target attribute. */
12710
12711 static bool
12712 aarch64_handle_attr_tune (const char *str)
12713 {
12714 const struct processor *tmp_tune = NULL;
12715 enum aarch64_parse_opt_result parse_res
12716 = aarch64_parse_tune (str, &tmp_tune);
12717
12718 if (parse_res == AARCH64_PARSE_OK)
12719 {
12720 gcc_assert (tmp_tune);
12721 selected_tune = tmp_tune;
12722 explicit_tune_core = selected_tune->ident;
12723 return true;
12724 }
12725
12726 switch (parse_res)
12727 {
12728 case AARCH64_PARSE_INVALID_ARG:
12729 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12730 aarch64_print_hint_for_core (str);
12731 break;
12732 default:
12733 gcc_unreachable ();
12734 }
12735
12736 return false;
12737 }
12738
12739 /* Parse an architecture extensions target attribute string specified in STR.
12740 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12741 if successful. Update aarch64_isa_flags to reflect the ISA features
12742 modified. */
12743
12744 static bool
12745 aarch64_handle_attr_isa_flags (char *str)
12746 {
12747 enum aarch64_parse_opt_result parse_res;
12748 unsigned long isa_flags = aarch64_isa_flags;
12749
12750 /* We allow "+nothing" in the beginning to clear out all architectural
12751 features if the user wants to handpick specific features. */
12752 if (strncmp ("+nothing", str, 8) == 0)
12753 {
12754 isa_flags = 0;
12755 str += 8;
12756 }
12757
12758 std::string invalid_extension;
12759 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12760
12761 if (parse_res == AARCH64_PARSE_OK)
12762 {
12763 aarch64_isa_flags = isa_flags;
12764 return true;
12765 }
12766
12767 switch (parse_res)
12768 {
12769 case AARCH64_PARSE_MISSING_ARG:
12770 error ("missing value in %<target()%> pragma or attribute");
12771 break;
12772
12773 case AARCH64_PARSE_INVALID_FEATURE:
12774 error ("invalid feature modifier %s of value (\"%s\") in "
12775 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12776 break;
12777
12778 default:
12779 gcc_unreachable ();
12780 }
12781
12782 return false;
12783 }
12784
12785 /* The target attributes that we support. On top of these we also support just
12786 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12787 handled explicitly in aarch64_process_one_target_attr. */
12788
12789 static const struct aarch64_attribute_info aarch64_attributes[] =
12790 {
12791 { "general-regs-only", aarch64_attr_mask, false, NULL,
12792 OPT_mgeneral_regs_only },
12793 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12794 OPT_mfix_cortex_a53_835769 },
12795 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12796 OPT_mfix_cortex_a53_843419 },
12797 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12798 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12799 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12800 OPT_momit_leaf_frame_pointer },
12801 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12802 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12803 OPT_march_ },
12804 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12805 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12806 OPT_mtune_ },
12807 { "branch-protection", aarch64_attr_custom, false,
12808 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12809 { "sign-return-address", aarch64_attr_enum, false, NULL,
12810 OPT_msign_return_address_ },
12811 { "outline-atomics", aarch64_attr_bool, true, NULL,
12812 OPT_moutline_atomics},
12813 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12814 };
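
/* Illustrative attribute strings accepted via the table above (the arch
   name is just an example):

     __attribute__ ((target ("arch=armv8.1-a")))  -> aarch64_handle_attr_arch
     __attribute__ ((target ("no-strict-align"))) -> mask option, negated
     __attribute__ ((target ("cmodel=large")))    -> enum via OPT_mcmodel_
     __attribute__ ((target ("+crc")))            -> bare ISA flags, handled
                                                     in aarch64_process_one_target_attr.  */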
12815
12816 /* Parse ARG_STR which contains the definition of one target attribute.
12817 Show appropriate errors if any or return true if the attribute is valid. */
12818
12819 static bool
12820 aarch64_process_one_target_attr (char *arg_str)
12821 {
12822 bool invert = false;
12823
12824 size_t len = strlen (arg_str);
12825
12826 if (len == 0)
12827 {
12828 error ("malformed %<target()%> pragma or attribute");
12829 return false;
12830 }
12831
12832 char *str_to_check = (char *) alloca (len + 1);
12833 strcpy (str_to_check, arg_str);
12834
12835 /* Skip leading whitespace. */
12836 while (*str_to_check == ' ' || *str_to_check == '\t')
12837 str_to_check++;
12838
12839 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12840 It is easier to detect and handle it explicitly here rather than going
12841 through the machinery for the rest of the target attributes in this
12842 function. */
12843 if (*str_to_check == '+')
12844 return aarch64_handle_attr_isa_flags (str_to_check);
12845
12846 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12847 {
12848 invert = true;
12849 str_to_check += 3;
12850 }
12851 char *arg = strchr (str_to_check, '=');
12852
12853 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12854 and point ARG to "foo". */
12855 if (arg)
12856 {
12857 *arg = '\0';
12858 arg++;
12859 }
12860 const struct aarch64_attribute_info *p_attr;
12861 bool found = false;
12862 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12863 {
12864 /* If the names don't match up, or the user has given an argument
12865 to an attribute that doesn't accept one, or didn't give an argument
12866 to an attribute that expects one, fail to match. */
12867 if (strcmp (str_to_check, p_attr->name) != 0)
12868 continue;
12869
12870 found = true;
12871 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12872 || p_attr->attr_type == aarch64_attr_enum;
12873
12874 if (attr_need_arg_p ^ (arg != NULL))
12875 {
12876 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12877 return false;
12878 }
12879
12880 /* If the name matches but the attribute does not allow "no-" versions
12881 then we can't match. */
12882 if (invert && !p_attr->allow_neg)
12883 {
12884 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12885 return false;
12886 }
12887
12888 switch (p_attr->attr_type)
12889 {
12890 /* Has a custom handler registered.
12891 For example, cpu=, arch=, tune=. */
12892 case aarch64_attr_custom:
12893 gcc_assert (p_attr->handler);
12894 if (!p_attr->handler (arg))
12895 return false;
12896 break;
12897
12898 /* Either set or unset a boolean option. */
12899 case aarch64_attr_bool:
12900 {
12901 struct cl_decoded_option decoded;
12902
12903 generate_option (p_attr->opt_num, NULL, !invert,
12904 CL_TARGET, &decoded);
12905 aarch64_handle_option (&global_options, &global_options_set,
12906 &decoded, input_location);
12907 break;
12908 }
12909 /* Set or unset a bit in the target_flags. aarch64_handle_option
12910 should know what mask to apply given the option number. */
12911 case aarch64_attr_mask:
12912 {
12913 struct cl_decoded_option decoded;
12914 /* We only need to specify the option number.
12915 aarch64_handle_option will know which mask to apply. */
12916 decoded.opt_index = p_attr->opt_num;
12917 decoded.value = !invert;
12918 aarch64_handle_option (&global_options, &global_options_set,
12919 &decoded, input_location);
12920 break;
12921 }
12922 /* Use the option setting machinery to set an option to an enum. */
12923 case aarch64_attr_enum:
12924 {
12925 gcc_assert (arg);
12926 bool valid;
12927 int value;
12928 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12929 &value, CL_TARGET);
12930 if (valid)
12931 {
12932 set_option (&global_options, NULL, p_attr->opt_num, value,
12933 NULL, DK_UNSPECIFIED, input_location,
12934 global_dc);
12935 }
12936 else
12937 {
12938 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12939 }
12940 break;
12941 }
12942 default:
12943 gcc_unreachable ();
12944 }
12945 }
12946
12947 /* If we reached here we either have found an attribute and validated
12948 it or didn't match any. If we matched an attribute but its arguments
12949 were malformed we will have returned false already. */
12950 return found;
12951 }
12952
12953 /* Count how many times the character C appears in
12954 NULL-terminated string STR. */
12955
12956 static unsigned int
12957 num_occurences_in_str (char c, char *str)
12958 {
12959 unsigned int res = 0;
12960 while (*str != '\0')
12961 {
12962 if (*str == c)
12963 res++;
12964
12965 str++;
12966 }
12967
12968 return res;
12969 }
12970
12971 /* Parse the tree in ARGS that contains the target attribute information
12972 and update the global target options space. */
12973
12974 bool
12975 aarch64_process_target_attr (tree args)
12976 {
12977 if (TREE_CODE (args) == TREE_LIST)
12978 {
12979 do
12980 {
12981 tree head = TREE_VALUE (args);
12982 if (head)
12983 {
12984 if (!aarch64_process_target_attr (head))
12985 return false;
12986 }
12987 args = TREE_CHAIN (args);
12988 } while (args);
12989
12990 return true;
12991 }
12992
12993 if (TREE_CODE (args) != STRING_CST)
12994 {
12995 error ("attribute %<target%> argument not a string");
12996 return false;
12997 }
12998
12999 size_t len = strlen (TREE_STRING_POINTER (args));
13000 char *str_to_check = (char *) alloca (len + 1);
13001 strcpy (str_to_check, TREE_STRING_POINTER (args));
13002
13003 if (len == 0)
13004 {
13005 error ("malformed %<target()%> pragma or attribute");
13006 return false;
13007 }
13008
13009 /* Used to catch empty strings between commas, i.e.
13010 attribute ((target ("attr1,,attr2"))). */
13011 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13012
13013 /* Handle multiple target attributes separated by ','. */
13014 char *token = strtok_r (str_to_check, ",", &str_to_check);
13015
13016 unsigned int num_attrs = 0;
13017 while (token)
13018 {
13019 num_attrs++;
13020 if (!aarch64_process_one_target_attr (token))
13021 {
13022 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13023 return false;
13024 }
13025
13026 token = strtok_r (NULL, ",", &str_to_check);
13027 }
13028
13029 if (num_attrs != num_commas + 1)
13030 {
13031 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13032 return false;
13033 }
13034
13035 return true;
13036 }
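
/* Illustrative multi-attribute string handled by the function above:

     __attribute__ ((target ("tune=cortex-a53,no-omit-leaf-frame-pointer")))

   is split on ',' into two tokens, each passed to
   aarch64_process_one_target_attr, while a string such as "attr1,,attr2" is
   rejected because the token count does not match num_commas + 1.  */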
13037
13038 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13039 process attribute ((target ("..."))). */
13040
13041 static bool
13042 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13043 {
13044 struct cl_target_option cur_target;
13045 bool ret;
13046 tree old_optimize;
13047 tree new_target, new_optimize;
13048 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13049
13050 /* If what we're processing is the current pragma string then the
13051 target option node is already stored in target_option_current_node
13052 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13053 having to re-parse the string. This is especially useful to keep
13054 arm_neon.h compile times down since that header contains a lot
13055 of intrinsics enclosed in pragmas. */
13056 if (!existing_target && args == current_target_pragma)
13057 {
13058 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13059 return true;
13060 }
13061 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13062
13063 old_optimize = build_optimization_node (&global_options);
13064 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13065
13066 /* If the function changed the optimization levels as well as setting
13067 target options, start with the optimizations specified. */
13068 if (func_optimize && func_optimize != old_optimize)
13069 cl_optimization_restore (&global_options,
13070 TREE_OPTIMIZATION (func_optimize));
13071
13072 /* Save the current target options to restore at the end. */
13073 cl_target_option_save (&cur_target, &global_options);
13074
13075 /* If fndecl already has some target attributes applied to it, unpack
13076 them so that we add this attribute on top of them, rather than
13077 overwriting them. */
13078 if (existing_target)
13079 {
13080 struct cl_target_option *existing_options
13081 = TREE_TARGET_OPTION (existing_target);
13082
13083 if (existing_options)
13084 cl_target_option_restore (&global_options, existing_options);
13085 }
13086 else
13087 cl_target_option_restore (&global_options,
13088 TREE_TARGET_OPTION (target_option_current_node));
13089
13090 ret = aarch64_process_target_attr (args);
13091
13092 /* Set up any additional state. */
13093 if (ret)
13094 {
13095 aarch64_override_options_internal (&global_options);
13096 /* Initialize SIMD builtins if we haven't already.
13097 Set current_target_pragma to NULL for the duration so that
13098 the builtin initialization code doesn't try to tag the functions
13099 being built with the attributes specified by any current pragma, thus
13100 going into an infinite recursion. */
13101 if (TARGET_SIMD)
13102 {
13103 tree saved_current_target_pragma = current_target_pragma;
13104 current_target_pragma = NULL;
13105 aarch64_init_simd_builtins ();
13106 current_target_pragma = saved_current_target_pragma;
13107 }
13108 new_target = build_target_option_node (&global_options);
13109 }
13110 else
13111 new_target = NULL;
13112
13113 new_optimize = build_optimization_node (&global_options);
13114
13115 if (fndecl && ret)
13116 {
13117 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13118
13119 if (old_optimize != new_optimize)
13120 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13121 }
13122
13123 cl_target_option_restore (&global_options, &cur_target);
13124
13125 if (old_optimize != new_optimize)
13126 cl_optimization_restore (&global_options,
13127 TREE_OPTIMIZATION (old_optimize));
13128 return ret;
13129 }
13130
13131 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13132 tri-bool options (yes, no, don't care) and the default value is
13133 DEF, determine whether to reject inlining. */
13134
13135 static bool
13136 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13137 int dont_care, int def)
13138 {
13139 /* If the callee doesn't care, always allow inlining. */
13140 if (callee == dont_care)
13141 return true;
13142
13143 /* If the caller doesn't care, always allow inlining. */
13144 if (caller == dont_care)
13145 return true;
13146
13147 /* Otherwise, allow inlining if either the callee and caller values
13148 agree, or if the callee is using the default value. */
13149 return (callee == caller || callee == def);
13150 }
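
/* For example, with DONT_CARE == 2 and DEF == 1 (the values used for the
   -momit-leaf-frame-pointer check below):

     caller == 2 or callee == 2  -> allow (one side doesn't care)
     caller == 0, callee == 1    -> allow (callee uses the default)
     caller == 1, callee == 1    -> allow (values agree)
     caller == 1, callee == 0    -> reject (explicit mismatch)  */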
13151
13152 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13153 to inline CALLEE into CALLER based on target-specific info.
13154 Make sure that the caller and callee have compatible architectural
13155 features. Then go through the other possible target attributes
13156 and see if they can block inlining. Try not to reject always_inline
13157 callees unless they are incompatible architecturally. */
13158
13159 static bool
13160 aarch64_can_inline_p (tree caller, tree callee)
13161 {
13162 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13163 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13164
13165 struct cl_target_option *caller_opts
13166 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13167 : target_option_default_node);
13168
13169 struct cl_target_option *callee_opts
13170 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13171 : target_option_default_node);
13172
13173 /* Callee's ISA flags should be a subset of the caller's. */
13174 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13175 != callee_opts->x_aarch64_isa_flags)
13176 return false;
13177
13178 /* Allow inlining of non-strict-aligned functions into strict-aligned
13179 ones. */
13180 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13181 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13182 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13183 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13184 return false;
13185
13186 bool always_inline = lookup_attribute ("always_inline",
13187 DECL_ATTRIBUTES (callee));
13188
13189 /* If the architectural features match up and the callee is always_inline
13190 then the other attributes don't matter. */
13191 if (always_inline)
13192 return true;
13193
13194 if (caller_opts->x_aarch64_cmodel_var
13195 != callee_opts->x_aarch64_cmodel_var)
13196 return false;
13197
13198 if (caller_opts->x_aarch64_tls_dialect
13199 != callee_opts->x_aarch64_tls_dialect)
13200 return false;
13201
13202 /* Honour explicit requests to workaround errata. */
13203 if (!aarch64_tribools_ok_for_inlining_p (
13204 caller_opts->x_aarch64_fix_a53_err835769,
13205 callee_opts->x_aarch64_fix_a53_err835769,
13206 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13207 return false;
13208
13209 if (!aarch64_tribools_ok_for_inlining_p (
13210 caller_opts->x_aarch64_fix_a53_err843419,
13211 callee_opts->x_aarch64_fix_a53_err843419,
13212 2, TARGET_FIX_ERR_A53_843419))
13213 return false;
13214
13215 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13216 caller and callee and they don't match up, reject inlining. */
13217 if (!aarch64_tribools_ok_for_inlining_p (
13218 caller_opts->x_flag_omit_leaf_frame_pointer,
13219 callee_opts->x_flag_omit_leaf_frame_pointer,
13220 2, 1))
13221 return false;
13222
13223 /* If the callee has specific tuning overrides, respect them. */
13224 if (callee_opts->x_aarch64_override_tune_string != NULL
13225 && caller_opts->x_aarch64_override_tune_string == NULL)
13226 return false;
13227
13228 /* If the user specified tuning override strings for the
13229 caller and callee and they don't match up, reject inlining.
13230 We just do a string compare here, we don't analyze the meaning
13231 of the string, as it would be too costly for little gain. */
13232 if (callee_opts->x_aarch64_override_tune_string
13233 && caller_opts->x_aarch64_override_tune_string
13234 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13235 caller_opts->x_aarch64_override_tune_string) != 0))
13236 return false;
13237
13238 return true;
13239 }
13240
13241 /* Return true if SYMBOL_REF X binds locally. */
13242
13243 static bool
13244 aarch64_symbol_binds_local_p (const_rtx x)
13245 {
13246 return (SYMBOL_REF_DECL (x)
13247 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13248 : SYMBOL_REF_LOCAL_P (x));
13249 }
13250
13251 /* Return true if SYMBOL_REF X is thread local. */
13252 static bool
13253 aarch64_tls_symbol_p (rtx x)
13254 {
13255 if (! TARGET_HAVE_TLS)
13256 return false;
13257
13258 if (GET_CODE (x) != SYMBOL_REF)
13259 return false;
13260
13261 return SYMBOL_REF_TLS_MODEL (x) != 0;
13262 }
13263
13264 /* Classify a TLS symbol into one of the TLS kinds. */
13265 enum aarch64_symbol_type
13266 aarch64_classify_tls_symbol (rtx x)
13267 {
13268 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13269
13270 switch (tls_kind)
13271 {
13272 case TLS_MODEL_GLOBAL_DYNAMIC:
13273 case TLS_MODEL_LOCAL_DYNAMIC:
13274 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13275
13276 case TLS_MODEL_INITIAL_EXEC:
13277 switch (aarch64_cmodel)
13278 {
13279 case AARCH64_CMODEL_TINY:
13280 case AARCH64_CMODEL_TINY_PIC:
13281 return SYMBOL_TINY_TLSIE;
13282 default:
13283 return SYMBOL_SMALL_TLSIE;
13284 }
13285
13286 case TLS_MODEL_LOCAL_EXEC:
13287 if (aarch64_tls_size == 12)
13288 return SYMBOL_TLSLE12;
13289 else if (aarch64_tls_size == 24)
13290 return SYMBOL_TLSLE24;
13291 else if (aarch64_tls_size == 32)
13292 return SYMBOL_TLSLE32;
13293 else if (aarch64_tls_size == 48)
13294 return SYMBOL_TLSLE48;
13295 else
13296 gcc_unreachable ();
13297
13298 case TLS_MODEL_EMULATED:
13299 case TLS_MODEL_NONE:
13300 return SYMBOL_FORCE_TO_MEM;
13301
13302 default:
13303 gcc_unreachable ();
13304 }
13305 }
13306
13307 /* Return the correct method for accessing X + OFFSET, where X is either
13308 a SYMBOL_REF or LABEL_REF. */
13309
13310 enum aarch64_symbol_type
13311 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13312 {
13313 if (GET_CODE (x) == LABEL_REF)
13314 {
13315 switch (aarch64_cmodel)
13316 {
13317 case AARCH64_CMODEL_LARGE:
13318 return SYMBOL_FORCE_TO_MEM;
13319
13320 case AARCH64_CMODEL_TINY_PIC:
13321 case AARCH64_CMODEL_TINY:
13322 return SYMBOL_TINY_ABSOLUTE;
13323
13324 case AARCH64_CMODEL_SMALL_SPIC:
13325 case AARCH64_CMODEL_SMALL_PIC:
13326 case AARCH64_CMODEL_SMALL:
13327 return SYMBOL_SMALL_ABSOLUTE;
13328
13329 default:
13330 gcc_unreachable ();
13331 }
13332 }
13333
13334 if (GET_CODE (x) == SYMBOL_REF)
13335 {
13336 if (aarch64_tls_symbol_p (x))
13337 return aarch64_classify_tls_symbol (x);
13338
13339 switch (aarch64_cmodel)
13340 {
13341 case AARCH64_CMODEL_TINY:
13342 /* When we retrieve symbol + offset address, we have to make sure
13343 the offset does not cause overflow of the final address. But
13344 we have no way of knowing the address of symbol at compile time
13345 so we can't accurately say if the distance between the PC and
13346 symbol + offset is outside the addressable range of +/-1MB in the
13347 TINY code model. So we limit the maximum offset to +/-64KB and
13348 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
13349 If offset_within_block_p is true we allow larger offsets.
13350 Furthermore force to memory if the symbol is a weak reference to
13351 something that doesn't resolve to a symbol in this module. */
13352
13353 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
13354 return SYMBOL_FORCE_TO_MEM;
13355 if (!(IN_RANGE (offset, -0x10000, 0x10000)
13356 || offset_within_block_p (x, offset)))
13357 return SYMBOL_FORCE_TO_MEM;
13358
13359 return SYMBOL_TINY_ABSOLUTE;
13360
13361 case AARCH64_CMODEL_SMALL:
13362 /* Same reasoning as the tiny code model, but the offset cap here is
13363 1MB, allowing +/-3.9GB for the offset to the symbol. */
13364
13365 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
13366 return SYMBOL_FORCE_TO_MEM;
13367 if (!(IN_RANGE (offset, -0x100000, 0x100000)
13368 || offset_within_block_p (x, offset)))
13369 return SYMBOL_FORCE_TO_MEM;
13370
13371 return SYMBOL_SMALL_ABSOLUTE;
13372
13373 case AARCH64_CMODEL_TINY_PIC:
13374 if (!aarch64_symbol_binds_local_p (x))
13375 return SYMBOL_TINY_GOT;
13376 return SYMBOL_TINY_ABSOLUTE;
13377
13378 case AARCH64_CMODEL_SMALL_SPIC:
13379 case AARCH64_CMODEL_SMALL_PIC:
13380 if (!aarch64_symbol_binds_local_p (x))
13381 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13382 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13383 return SYMBOL_SMALL_ABSOLUTE;
13384
13385 case AARCH64_CMODEL_LARGE:
13386 /* This is alright even in PIC code as the constant
13387 pool reference is always PC relative and within
13388 the same translation unit. */
13389 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13390 return SYMBOL_SMALL_ABSOLUTE;
13391 else
13392 return SYMBOL_FORCE_TO_MEM;
13393
13394 default:
13395 gcc_unreachable ();
13396 }
13397 }
13398
13399 /* By default push everything into the constant pool. */
13400 return SYMBOL_FORCE_TO_MEM;
13401 }
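
/* Illustrative classifications under the small code model, given the
   +/-1MB offset cap coded above:

     sym            -> SYMBOL_SMALL_ABSOLUTE
     sym + 0x800    -> SYMBOL_SMALL_ABSOLUTE (offset within +/-0x100000)
     sym + 0x200000 -> SYMBOL_FORCE_TO_MEM, unless offset_within_block_p
     weak sym that might resolve outside this module -> SYMBOL_FORCE_TO_MEM  */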
13402
13403 bool
13404 aarch64_constant_address_p (rtx x)
13405 {
13406 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13407 }
13408
13409 bool
13410 aarch64_legitimate_pic_operand_p (rtx x)
13411 {
13412 if (GET_CODE (x) == SYMBOL_REF
13413 || (GET_CODE (x) == CONST
13414 && GET_CODE (XEXP (x, 0)) == PLUS
13415 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13416 return false;
13417
13418 return true;
13419 }
13420
13421 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13422 that should be rematerialized rather than spilled. */
13423
13424 static bool
13425 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13426 {
13427 /* Support CSE and rematerialization of common constants. */
13428 if (CONST_INT_P (x)
13429 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13430 || GET_CODE (x) == CONST_VECTOR)
13431 return true;
13432
13433 /* Do not allow vector struct mode constants for Advanced SIMD.
13434 We could support 0 and -1 easily, but they need support in
13435 aarch64-simd.md. */
13436 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13437 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13438 return false;
13439
13440 /* Only accept variable-length vector constants if they can be
13441 handled directly.
13442
13443 ??? It would be possible to handle rematerialization of other
13444 constants via secondary reloads. */
13445 if (vec_flags & VEC_ANY_SVE)
13446 return aarch64_simd_valid_immediate (x, NULL);
13447
13448 if (GET_CODE (x) == HIGH)
13449 x = XEXP (x, 0);
13450
13451 /* Accept polynomial constants that can be calculated by using the
13452 destination of a move as the sole temporary. Constants that
13453 require a second temporary cannot be rematerialized (they can't be
13454 forced to memory and also aren't legitimate constants). */
13455 poly_int64 offset;
13456 if (poly_int_rtx_p (x, &offset))
13457 return aarch64_offset_temporaries (false, offset) <= 1;
13458
13459 /* If an offset is being added to something else, we need to allow the
13460 base to be moved into the destination register, meaning that there
13461 are no free temporaries for the offset. */
13462 x = strip_offset (x, &offset);
13463 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13464 return false;
13465
13466 /* Do not allow const (plus (anchor_symbol, const_int)). */
13467 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13468 return false;
13469
13470 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13471 so spilling them is better than rematerialization. */
13472 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13473 return true;
13474
13475 /* Label references are always constant. */
13476 if (GET_CODE (x) == LABEL_REF)
13477 return true;
13478
13479 return false;
13480 }
13481
13482 rtx
13483 aarch64_load_tp (rtx target)
13484 {
13485 if (!target
13486 || GET_MODE (target) != Pmode
13487 || !register_operand (target, Pmode))
13488 target = gen_reg_rtx (Pmode);
13489
13490 /* Can return in any reg. */
13491 emit_insn (gen_aarch64_load_tp_hard (target));
13492 return target;
13493 }
13494
13495 /* On AAPCS systems, this is the "struct __va_list". */
13496 static GTY(()) tree va_list_type;
13497
13498 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13499 Return the type to use as __builtin_va_list.
13500
13501 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13502
13503 struct __va_list
13504 {
13505 void *__stack;
13506 void *__gr_top;
13507 void *__vr_top;
13508 int __gr_offs;
13509 int __vr_offs;
13510 }; */
13511
13512 static tree
13513 aarch64_build_builtin_va_list (void)
13514 {
13515 tree va_list_name;
13516 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13517
13518 /* Create the type. */
13519 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13520 /* Give it the required name. */
13521 va_list_name = build_decl (BUILTINS_LOCATION,
13522 TYPE_DECL,
13523 get_identifier ("__va_list"),
13524 va_list_type);
13525 DECL_ARTIFICIAL (va_list_name) = 1;
13526 TYPE_NAME (va_list_type) = va_list_name;
13527 TYPE_STUB_DECL (va_list_type) = va_list_name;
13528
13529 /* Create the fields. */
13530 f_stack = build_decl (BUILTINS_LOCATION,
13531 FIELD_DECL, get_identifier ("__stack"),
13532 ptr_type_node);
13533 f_grtop = build_decl (BUILTINS_LOCATION,
13534 FIELD_DECL, get_identifier ("__gr_top"),
13535 ptr_type_node);
13536 f_vrtop = build_decl (BUILTINS_LOCATION,
13537 FIELD_DECL, get_identifier ("__vr_top"),
13538 ptr_type_node);
13539 f_groff = build_decl (BUILTINS_LOCATION,
13540 FIELD_DECL, get_identifier ("__gr_offs"),
13541 integer_type_node);
13542 f_vroff = build_decl (BUILTINS_LOCATION,
13543 FIELD_DECL, get_identifier ("__vr_offs"),
13544 integer_type_node);
13545
13546 /* Tell tree-stdarg pass about our internal offset fields.
13547 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13548 purposes, to identify whether the code is updating the va_list internal
13549 offset fields in an irregular way. */
13550 va_list_gpr_counter_field = f_groff;
13551 va_list_fpr_counter_field = f_vroff;
13552
13553 DECL_ARTIFICIAL (f_stack) = 1;
13554 DECL_ARTIFICIAL (f_grtop) = 1;
13555 DECL_ARTIFICIAL (f_vrtop) = 1;
13556 DECL_ARTIFICIAL (f_groff) = 1;
13557 DECL_ARTIFICIAL (f_vroff) = 1;
13558
13559 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13560 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13561 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13562 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13563 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13564
13565 TYPE_FIELDS (va_list_type) = f_stack;
13566 DECL_CHAIN (f_stack) = f_grtop;
13567 DECL_CHAIN (f_grtop) = f_vrtop;
13568 DECL_CHAIN (f_vrtop) = f_groff;
13569 DECL_CHAIN (f_groff) = f_vroff;
13570
13571 /* Compute its layout. */
13572 layout_type (va_list_type);
13573
13574 return va_list_type;
13575 }
13576
13577 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13578 static void
13579 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13580 {
13581 const CUMULATIVE_ARGS *cum;
13582 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13583 tree stack, grtop, vrtop, groff, vroff;
13584 tree t;
13585 int gr_save_area_size = cfun->va_list_gpr_size;
13586 int vr_save_area_size = cfun->va_list_fpr_size;
13587 int vr_offset;
13588
13589 cum = &crtl->args.info;
13590 if (cfun->va_list_gpr_size)
13591 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13592 cfun->va_list_gpr_size);
13593 if (cfun->va_list_fpr_size)
13594 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13595 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13596
13597 if (!TARGET_FLOAT)
13598 {
13599 gcc_assert (cum->aapcs_nvrn == 0);
13600 vr_save_area_size = 0;
13601 }
13602
13603 f_stack = TYPE_FIELDS (va_list_type_node);
13604 f_grtop = DECL_CHAIN (f_stack);
13605 f_vrtop = DECL_CHAIN (f_grtop);
13606 f_groff = DECL_CHAIN (f_vrtop);
13607 f_vroff = DECL_CHAIN (f_groff);
13608
13609 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13610 NULL_TREE);
13611 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13612 NULL_TREE);
13613 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13614 NULL_TREE);
13615 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13616 NULL_TREE);
13617 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13618 NULL_TREE);
13619
13620 /* Emit code to initialize STACK, which points to the next varargs stack
13621 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13622 by named arguments. STACK is 8-byte aligned. */
13623 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13624 if (cum->aapcs_stack_size > 0)
13625 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13626 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13627 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13628
13629 /* Emit code to initialize GRTOP, the top of the GR save area.
13630 virtual_incoming_args_rtx should have been 16 byte aligned. */
13631 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13632 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13633 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13634
13635 /* Emit code to initialize VRTOP, the top of the VR save area.
13636 This address is gr_save_area_bytes below GRTOP, rounded
13637 down to the next 16-byte boundary. */
13638 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13639 vr_offset = ROUND_UP (gr_save_area_size,
13640 STACK_BOUNDARY / BITS_PER_UNIT);
13641
13642 if (vr_offset)
13643 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13644 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13645 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13646
13647 /* Emit code to initialize GROFF, the offset from GRTOP of the
13648 next GPR argument. */
13649 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13650 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13651 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13652
13653 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13654 of the next VR argument. */
13655 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13656 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13657 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13658 }
13659
13660 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13661
13662 static tree
13663 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13664 gimple_seq *post_p ATTRIBUTE_UNUSED)
13665 {
13666 tree addr;
13667 bool indirect_p;
13668 bool is_ha; /* is HFA or HVA. */
13669 bool dw_align; /* double-word align. */
13670 machine_mode ag_mode = VOIDmode;
13671 int nregs;
13672 machine_mode mode;
13673
13674 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13675 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13676 HOST_WIDE_INT size, rsize, adjust, align;
13677 tree t, u, cond1, cond2;
13678
13679 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13680 if (indirect_p)
13681 type = build_pointer_type (type);
13682
13683 mode = TYPE_MODE (type);
13684
13685 f_stack = TYPE_FIELDS (va_list_type_node);
13686 f_grtop = DECL_CHAIN (f_stack);
13687 f_vrtop = DECL_CHAIN (f_grtop);
13688 f_groff = DECL_CHAIN (f_vrtop);
13689 f_vroff = DECL_CHAIN (f_groff);
13690
13691 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13692 f_stack, NULL_TREE);
13693 size = int_size_in_bytes (type);
13694
13695 bool abi_break;
13696 align
13697 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13698
13699 dw_align = false;
13700 adjust = 0;
13701 if (aarch64_vfp_is_call_or_return_candidate (mode,
13702 type,
13703 &ag_mode,
13704 &nregs,
13705 &is_ha))
13706 {
13707 /* No frontends can create types with variable-sized modes, so we
13708 shouldn't be asked to pass or return them. */
13709 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13710
13711 /* TYPE passed in fp/simd registers. */
13712 if (!TARGET_FLOAT)
13713 aarch64_err_no_fpadvsimd (mode);
13714
13715 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13716 unshare_expr (valist), f_vrtop, NULL_TREE);
13717 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13718 unshare_expr (valist), f_vroff, NULL_TREE);
13719
13720 rsize = nregs * UNITS_PER_VREG;
13721
13722 if (is_ha)
13723 {
13724 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13725 adjust = UNITS_PER_VREG - ag_size;
13726 }
13727 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13728 && size < UNITS_PER_VREG)
13729 {
13730 adjust = UNITS_PER_VREG - size;
13731 }
13732 }
13733 else
13734 {
13735 /* TYPE passed in general registers. */
13736 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13737 unshare_expr (valist), f_grtop, NULL_TREE);
13738 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13739 unshare_expr (valist), f_groff, NULL_TREE);
13740 rsize = ROUND_UP (size, UNITS_PER_WORD);
13741 nregs = rsize / UNITS_PER_WORD;
13742
13743 if (align > 8)
13744 {
13745 if (abi_break && warn_psabi)
13746 inform (input_location, "parameter passing for argument of type "
13747 "%qT changed in GCC 9.1", type);
13748 dw_align = true;
13749 }
13750
13751 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13752 && size < UNITS_PER_WORD)
13753 {
13754 adjust = UNITS_PER_WORD - size;
13755 }
13756 }
13757
13758 /* Get a local temporary for the field value. */
13759 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13760
13761 /* Emit code to branch if off >= 0. */
13762 t = build2 (GE_EXPR, boolean_type_node, off,
13763 build_int_cst (TREE_TYPE (off), 0));
13764 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13765
13766 if (dw_align)
13767 {
13768 /* Emit: offs = (offs + 15) & -16. */
13769 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13770 build_int_cst (TREE_TYPE (off), 15));
13771 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13772 build_int_cst (TREE_TYPE (off), -16));
13773 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13774 }
13775 else
13776 roundup = NULL;
13777
13778 /* Update ap.__[g|v]r_offs */
13779 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13780 build_int_cst (TREE_TYPE (off), rsize));
13781 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13782
13783 /* String up. */
13784 if (roundup)
13785 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13786
13787 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13788 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13789 build_int_cst (TREE_TYPE (f_off), 0));
13790 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13791
13792 /* String up: make sure the assignment happens before the use. */
13793 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13794 COND_EXPR_ELSE (cond1) = t;
13795
13796   /* Prepare the trees handling the argument that is passed on the stack;
13797      the top-level node will be stored in ON_STACK.  */
13798 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13799 if (align > 8)
13800 {
13801 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13802 t = fold_build_pointer_plus_hwi (arg, 15);
13803 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13804 build_int_cst (TREE_TYPE (t), -16));
13805 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13806 }
13807 else
13808 roundup = NULL;
13809 /* Advance ap.__stack */
13810 t = fold_build_pointer_plus_hwi (arg, size + 7);
13811 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13812 build_int_cst (TREE_TYPE (t), -8));
13813 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13814 /* String up roundup and advance. */
13815 if (roundup)
13816 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13817 /* String up with arg */
13818 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13819 /* Big-endianness related address adjustment. */
13820 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13821 && size < UNITS_PER_WORD)
13822 {
13823 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13824 size_int (UNITS_PER_WORD - size));
13825 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13826 }
13827
13828 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13829 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13830
13831 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13832 t = off;
13833 if (adjust)
13834 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13835 build_int_cst (TREE_TYPE (off), adjust));
13836
13837 t = fold_convert (sizetype, t);
13838 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13839
13840 if (is_ha)
13841 {
13842 /* type ha; // treat as "struct {ftype field[n];}"
13843 ... [computing offs]
13844 for (i = 0; i <nregs; ++i, offs += 16)
13845 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13846 return ha; */
13847 int i;
13848 tree tmp_ha, field_t, field_ptr_t;
13849
13850 /* Declare a local variable. */
13851 tmp_ha = create_tmp_var_raw (type, "ha");
13852 gimple_add_tmp_var (tmp_ha);
13853
13854 /* Establish the base type. */
13855 switch (ag_mode)
13856 {
13857 case E_SFmode:
13858 field_t = float_type_node;
13859 field_ptr_t = float_ptr_type_node;
13860 break;
13861 case E_DFmode:
13862 field_t = double_type_node;
13863 field_ptr_t = double_ptr_type_node;
13864 break;
13865 case E_TFmode:
13866 field_t = long_double_type_node;
13867 field_ptr_t = long_double_ptr_type_node;
13868 break;
13869 case E_HFmode:
13870 field_t = aarch64_fp16_type_node;
13871 field_ptr_t = aarch64_fp16_ptr_type_node;
13872 break;
13873 case E_V2SImode:
13874 case E_V4SImode:
13875 {
13876 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13877 field_t = build_vector_type_for_mode (innertype, ag_mode);
13878 field_ptr_t = build_pointer_type (field_t);
13879 }
13880 break;
13881 default:
13882 gcc_assert (0);
13883 }
13884
13885       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
13886 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13887 addr = t;
13888 t = fold_convert (field_ptr_t, addr);
13889 t = build2 (MODIFY_EXPR, field_t,
13890 build1 (INDIRECT_REF, field_t, tmp_ha),
13891 build1 (INDIRECT_REF, field_t, t));
13892
13893 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13894 for (i = 1; i < nregs; ++i)
13895 {
13896 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13897 u = fold_convert (field_ptr_t, addr);
13898 u = build2 (MODIFY_EXPR, field_t,
13899 build2 (MEM_REF, field_t, tmp_ha,
13900 build_int_cst (field_ptr_t,
13901 (i *
13902 int_size_in_bytes (field_t)))),
13903 build1 (INDIRECT_REF, field_t, u));
13904 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13905 }
13906
13907 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13908 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13909 }
13910
13911 COND_EXPR_ELSE (cond2) = t;
13912 addr = fold_convert (build_pointer_type (type), cond1);
13913 addr = build_va_arg_indirect_ref (addr);
13914
13915 if (indirect_p)
13916 addr = build_va_arg_indirect_ref (addr);
13917
13918 return addr;
13919 }
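/* For illustration only, ignoring the HFA, alignment and big-endian
   adjustments handled above: for a simple integer TYPE the expression
   built by this function behaves roughly like

	off = ap.__gr_offs;
	if (off >= 0)
	  goto on_stack;
	ap.__gr_offs = off + rsize;
	if (ap.__gr_offs > 0)
	  goto on_stack;
	addr = ap.__gr_top + off;           - register save area
	goto done;
     on_stack:
	addr = ap.__stack;
	ap.__stack = (ap.__stack + size + 7) & -8;
     done:
	result = *(TYPE *) addr;

   with the same shape (using __vr_top/__vr_offs) for floating-point and
   short-vector arguments.  */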
13920
13921 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13922
13923 static void
13924 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13925 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13926 int no_rtl)
13927 {
13928 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13929 CUMULATIVE_ARGS local_cum;
13930 int gr_saved = cfun->va_list_gpr_size;
13931 int vr_saved = cfun->va_list_fpr_size;
13932
13933 /* The caller has advanced CUM up to, but not beyond, the last named
13934 argument. Advance a local copy of CUM past the last "real" named
13935 argument, to find out how many registers are left over. */
13936 local_cum = *cum;
13937 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
13938
13939   /* Find out how many registers we need to save.
13940      Honor the tree-stdarg analysis results.  */
13941 if (cfun->va_list_gpr_size)
13942 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13943 cfun->va_list_gpr_size / UNITS_PER_WORD);
13944 if (cfun->va_list_fpr_size)
13945 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13946 cfun->va_list_fpr_size / UNITS_PER_VREG);
13947
13948 if (!TARGET_FLOAT)
13949 {
13950 gcc_assert (local_cum.aapcs_nvrn == 0);
13951 vr_saved = 0;
13952 }
13953
13954 if (!no_rtl)
13955 {
13956 if (gr_saved > 0)
13957 {
13958 rtx ptr, mem;
13959
13960 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13961 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13962 - gr_saved * UNITS_PER_WORD);
13963 mem = gen_frame_mem (BLKmode, ptr);
13964 set_mem_alias_set (mem, get_varargs_alias_set ());
13965
13966 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13967 mem, gr_saved);
13968 }
13969 if (vr_saved > 0)
13970 {
13971 /* We can't use move_block_from_reg, because it will use
13972 the wrong mode, storing D regs only. */
13973 machine_mode mode = TImode;
13974 int off, i, vr_start;
13975
13976 /* Set OFF to the offset from virtual_incoming_args_rtx of
13977 the first vector register. The VR save area lies below
13978 the GR one, and is aligned to 16 bytes. */
13979 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13980 STACK_BOUNDARY / BITS_PER_UNIT);
13981 off -= vr_saved * UNITS_PER_VREG;
13982
13983 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13984 for (i = 0; i < vr_saved; ++i)
13985 {
13986 rtx ptr, mem;
13987
13988 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13989 mem = gen_frame_mem (mode, ptr);
13990 set_mem_alias_set (mem, get_varargs_alias_set ());
13991 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13992 off += UNITS_PER_VREG;
13993 }
13994 }
13995 }
13996
13997 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13998 any complication of having crtl->args.pretend_args_size changed. */
13999 cfun->machine->frame.saved_varargs_size
14000 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14001 STACK_BOUNDARY / BITS_PER_UNIT)
14002 + vr_saved * UNITS_PER_VREG);
14003 }
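/* For illustration only (assuming TARGET_FLOAT and the default --param
   values, so that the full save areas are kept): for a function such as
   f (int n, ...), local_cum.aapcs_ncrn == 1 and local_cum.aapcs_nvrn == 0,
   so the code above saves x1-x7 and q0-q7 and the incoming frame looks
   roughly like this (higher addresses first):

	caller's stacked arguments     <-- virtual_incoming_args_rtx
	x1 ... x7                      56 bytes of GP save area
	8 bytes of padding             keeps the VR area 16-byte aligned
	q0 ... q7                      128 bytes of FP/SIMD save area

   saved_varargs_size is then ROUND_UP (56, 16) + 128 == 192 bytes.  */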
14004
14005 static void
14006 aarch64_conditional_register_usage (void)
14007 {
14008 int i;
14009 if (!TARGET_FLOAT)
14010 {
14011 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14012 {
14013 fixed_regs[i] = 1;
14014 call_used_regs[i] = 1;
14015 }
14016 }
14017 if (!TARGET_SVE)
14018 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14019 {
14020 fixed_regs[i] = 1;
14021 call_used_regs[i] = 1;
14022 }
14023
14024 /* When tracking speculation, we need a couple of call-clobbered registers
14025 to track the speculation state. It would be nice to just use
14026 IP0 and IP1, but currently there are numerous places that just
14027    assume these registers are free for other uses (e.g. pointer
14028 authentication). */
14029 if (aarch64_track_speculation)
14030 {
14031 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14032 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14033 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14034 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14035 }
14036 }
14037
14038 /* Walk down the type tree of TYPE counting consecutive base elements.
14039 If *MODEP is VOIDmode, then set it to the first valid floating point
14040 type. If a non-floating point type is found, or if a floating point
14041 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14042 otherwise return the count in the sub-tree. */
14043 static int
14044 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14045 {
14046 machine_mode mode;
14047 HOST_WIDE_INT size;
14048
14049 switch (TREE_CODE (type))
14050 {
14051 case REAL_TYPE:
14052 mode = TYPE_MODE (type);
14053 if (mode != DFmode && mode != SFmode
14054 && mode != TFmode && mode != HFmode)
14055 return -1;
14056
14057 if (*modep == VOIDmode)
14058 *modep = mode;
14059
14060 if (*modep == mode)
14061 return 1;
14062
14063 break;
14064
14065 case COMPLEX_TYPE:
14066 mode = TYPE_MODE (TREE_TYPE (type));
14067 if (mode != DFmode && mode != SFmode
14068 && mode != TFmode && mode != HFmode)
14069 return -1;
14070
14071 if (*modep == VOIDmode)
14072 *modep = mode;
14073
14074 if (*modep == mode)
14075 return 2;
14076
14077 break;
14078
14079 case VECTOR_TYPE:
14080 /* Use V2SImode and V4SImode as representatives of all 64-bit
14081 and 128-bit vector types. */
14082 size = int_size_in_bytes (type);
14083 switch (size)
14084 {
14085 case 8:
14086 mode = V2SImode;
14087 break;
14088 case 16:
14089 mode = V4SImode;
14090 break;
14091 default:
14092 return -1;
14093 }
14094
14095 if (*modep == VOIDmode)
14096 *modep = mode;
14097
14098 /* Vector modes are considered to be opaque: two vectors are
14099 equivalent for the purposes of being homogeneous aggregates
14100 if they are the same size. */
14101 if (*modep == mode)
14102 return 1;
14103
14104 break;
14105
14106 case ARRAY_TYPE:
14107 {
14108 int count;
14109 tree index = TYPE_DOMAIN (type);
14110
14111 /* Can't handle incomplete types nor sizes that are not
14112 fixed. */
14113 if (!COMPLETE_TYPE_P (type)
14114 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14115 return -1;
14116
14117 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14118 if (count == -1
14119 || !index
14120 || !TYPE_MAX_VALUE (index)
14121 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14122 || !TYPE_MIN_VALUE (index)
14123 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14124 || count < 0)
14125 return -1;
14126
14127 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14128 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14129
14130 /* There must be no padding. */
14131 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14132 count * GET_MODE_BITSIZE (*modep)))
14133 return -1;
14134
14135 return count;
14136 }
14137
14138 case RECORD_TYPE:
14139 {
14140 int count = 0;
14141 int sub_count;
14142 tree field;
14143
14144 /* Can't handle incomplete types nor sizes that are not
14145 fixed. */
14146 if (!COMPLETE_TYPE_P (type)
14147 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14148 return -1;
14149
14150 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14151 {
14152 if (TREE_CODE (field) != FIELD_DECL)
14153 continue;
14154
14155 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14156 if (sub_count < 0)
14157 return -1;
14158 count += sub_count;
14159 }
14160
14161 /* There must be no padding. */
14162 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14163 count * GET_MODE_BITSIZE (*modep)))
14164 return -1;
14165
14166 return count;
14167 }
14168
14169 case UNION_TYPE:
14170 case QUAL_UNION_TYPE:
14171 {
14172 /* These aren't very interesting except in a degenerate case. */
14173 int count = 0;
14174 int sub_count;
14175 tree field;
14176
14177 /* Can't handle incomplete types nor sizes that are not
14178 fixed. */
14179 if (!COMPLETE_TYPE_P (type)
14180 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14181 return -1;
14182
14183 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14184 {
14185 if (TREE_CODE (field) != FIELD_DECL)
14186 continue;
14187
14188 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14189 if (sub_count < 0)
14190 return -1;
14191 count = count > sub_count ? count : sub_count;
14192 }
14193
14194 /* There must be no padding. */
14195 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14196 count * GET_MODE_BITSIZE (*modep)))
14197 return -1;
14198
14199 return count;
14200 }
14201
14202 default:
14203 break;
14204 }
14205
14206 return -1;
14207 }
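/* Some examples of the walk above (in the absence of over-alignment or
   other attribute complications):

	struct s1 { float a, b, c; };      returns 3 with *modep == SFmode
	struct s2 { float32x4_t a, b; };   returns 2 with *modep == V4SImode
	struct s3 { float a; double b; };  returns -1 (element modes differ)
	float a4[5];                       returns 5; the caller then rejects
					   it because 5 > HA_MAX_NUM_FLDS

   The 4-element HFA/HVA limit itself is enforced by the caller,
   aarch64_vfp_is_call_or_return_candidate, not here.  */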
14208
14209 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14210 type as described in AAPCS64 \S 4.1.2.
14211
14212 See the comment above aarch64_composite_type_p for the notes on MODE. */
14213
14214 static bool
14215 aarch64_short_vector_p (const_tree type,
14216 machine_mode mode)
14217 {
14218 poly_int64 size = -1;
14219
14220 if (type && TREE_CODE (type) == VECTOR_TYPE)
14221 size = int_size_in_bytes (type);
14222 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14223 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14224 size = GET_MODE_SIZE (mode);
14225
14226 return known_eq (size, 8) || known_eq (size, 16);
14227 }
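/* For example, int32x2_t (8 bytes) and float64x2_t (16 bytes) are short
   vectors in the AAPCS64 sense, while larger GNU vectors are not.  */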
14228
14229 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14230 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14231 array types. The C99 floating-point complex types are also considered
14232 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14233 types, which are GCC extensions and out of the scope of AAPCS64, are
14234 treated as composite types here as well.
14235
14236 Note that MODE itself is not sufficient in determining whether a type
14237 is such a composite type or not. This is because
14238 stor-layout.c:compute_record_mode may have already changed the MODE
14239 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14240 structure with only one field may have its MODE set to the mode of the
14241 field. Also an integer mode whose size matches the size of the
14242 RECORD_TYPE type may be used to substitute the original mode
14243 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14244 solely relied on. */
14245
14246 static bool
14247 aarch64_composite_type_p (const_tree type,
14248 machine_mode mode)
14249 {
14250 if (aarch64_short_vector_p (type, mode))
14251 return false;
14252
14253 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14254 return true;
14255
14256 if (mode == BLKmode
14257 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14258 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14259 return true;
14260
14261 return false;
14262 }
14263
14264 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14265 shall be passed or returned in simd/fp register(s) (providing these
14266 parameter passing registers are available).
14267
14268 Upon successful return, *COUNT returns the number of needed registers,
14269    Upon successful return, *COUNT returns the number of needed registers,
14270    *BASE_MODE returns the mode of the individual register and, when IS_HA
14271 floating-point aggregate or a homogeneous short-vector aggregate. */
14272
14273 static bool
14274 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14275 const_tree type,
14276 machine_mode *base_mode,
14277 int *count,
14278 bool *is_ha)
14279 {
14280 machine_mode new_mode = VOIDmode;
14281 bool composite_p = aarch64_composite_type_p (type, mode);
14282
14283 if (is_ha != NULL) *is_ha = false;
14284
14285 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14286 || aarch64_short_vector_p (type, mode))
14287 {
14288 *count = 1;
14289 new_mode = mode;
14290 }
14291 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14292 {
14293 if (is_ha != NULL) *is_ha = true;
14294 *count = 2;
14295 new_mode = GET_MODE_INNER (mode);
14296 }
14297 else if (type && composite_p)
14298 {
14299 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14300
14301 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14302 {
14303 if (is_ha != NULL) *is_ha = true;
14304 *count = ag_count;
14305 }
14306 else
14307 return false;
14308 }
14309 else
14310 return false;
14311
14312 *base_mode = new_mode;
14313 return true;
14314 }
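/* For example: a plain double yields *count == 1, *base_mode == DFmode and
   *is_ha == false; struct { double x, y; } yields *count == 2,
   *base_mode == DFmode and *is_ha == true; _Complex float yields
   *count == 2, *base_mode == SFmode and *is_ha == true.  */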
14315
14316 /* Implement TARGET_STRUCT_VALUE_RTX. */
14317
14318 static rtx
14319 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14320 int incoming ATTRIBUTE_UNUSED)
14321 {
14322 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14323 }
14324
14325 /* Implements target hook vector_mode_supported_p. */
14326 static bool
14327 aarch64_vector_mode_supported_p (machine_mode mode)
14328 {
14329 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14330 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14331 }
14332
14333 /* Return appropriate SIMD container
14334 for MODE within a vector of WIDTH bits. */
14335 static machine_mode
14336 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14337 {
14338 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14339 switch (mode)
14340 {
14341 case E_DFmode:
14342 return VNx2DFmode;
14343 case E_SFmode:
14344 return VNx4SFmode;
14345 case E_HFmode:
14346 return VNx8HFmode;
14347 case E_DImode:
14348 return VNx2DImode;
14349 case E_SImode:
14350 return VNx4SImode;
14351 case E_HImode:
14352 return VNx8HImode;
14353 case E_QImode:
14354 return VNx16QImode;
14355 default:
14356 return word_mode;
14357 }
14358
14359 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14360 if (TARGET_SIMD)
14361 {
14362 if (known_eq (width, 128))
14363 switch (mode)
14364 {
14365 case E_DFmode:
14366 return V2DFmode;
14367 case E_SFmode:
14368 return V4SFmode;
14369 case E_HFmode:
14370 return V8HFmode;
14371 case E_SImode:
14372 return V4SImode;
14373 case E_HImode:
14374 return V8HImode;
14375 case E_QImode:
14376 return V16QImode;
14377 case E_DImode:
14378 return V2DImode;
14379 default:
14380 break;
14381 }
14382 else
14383 switch (mode)
14384 {
14385 case E_SFmode:
14386 return V2SFmode;
14387 case E_HFmode:
14388 return V4HFmode;
14389 case E_SImode:
14390 return V2SImode;
14391 case E_HImode:
14392 return V4HImode;
14393 case E_QImode:
14394 return V8QImode;
14395 default:
14396 break;
14397 }
14398 }
14399 return word_mode;
14400 }
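/* For example: SImode with WIDTH == 64 gives V2SImode, with WIDTH == 128
   gives V4SImode, and with TARGET_SVE and WIDTH == BITS_PER_SVE_VECTOR
   gives VNx4SImode.  Requests with no container of the requested width
   (e.g. DImode at 64 bits) fall back to word_mode.  */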
14401
14402 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14403 static machine_mode
14404 aarch64_preferred_simd_mode (scalar_mode mode)
14405 {
14406 /* If current tuning prefers Advanced SIMD, bypass SVE. */
14407 bool use_sve
14408 = TARGET_SVE
14409 && !(aarch64_tune_params.extra_tuning_flags
14410 & AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC);
14411 poly_int64 bits = use_sve ? BITS_PER_SVE_VECTOR : 128;
14412 return aarch64_simd_container_mode (mode, bits);
14413 }
14414
14415 /* Return a list of possible vector sizes for the vectorizer
14416 to iterate over. */
14417 static void
14418 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14419 {
14420 bool use_sve
14421 = TARGET_SVE
14422 && !(aarch64_tune_params.extra_tuning_flags
14423 & AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC);
14424 if (use_sve)
14425 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14426 sizes->safe_push (16);
14427 sizes->safe_push (8);
14428 }
14429
14430 /* Implement TARGET_MANGLE_TYPE. */
14431
14432 static const char *
14433 aarch64_mangle_type (const_tree type)
14434 {
14435 /* The AArch64 ABI documents say that "__va_list" has to be
14436 mangled as if it is in the "std" namespace. */
14437 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14438 return "St9__va_list";
14439
14440 /* Half-precision float. */
14441 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14442 return "Dh";
14443
14444 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14445 builtin types. */
14446 if (TYPE_NAME (type) != NULL)
14447 return aarch64_mangle_builtin_type (type);
14448
14449 /* Use the default mangling. */
14450 return NULL;
14451 }
14452
14453 /* Find the first rtx_insn before insn that will generate an assembly
14454 instruction. */
14455
14456 static rtx_insn *
14457 aarch64_prev_real_insn (rtx_insn *insn)
14458 {
14459 if (!insn)
14460 return NULL;
14461
14462 do
14463 {
14464 insn = prev_real_insn (insn);
14465 }
14466 while (insn && recog_memoized (insn) < 0);
14467
14468 return insn;
14469 }
14470
14471 static bool
14472 is_madd_op (enum attr_type t1)
14473 {
14474 unsigned int i;
14475 /* A number of these may be AArch32 only. */
14476 enum attr_type mlatypes[] = {
14477 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14478 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14479     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14480 };
14481
14482 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14483 {
14484 if (t1 == mlatypes[i])
14485 return true;
14486 }
14487
14488 return false;
14489 }
14490
14491 /* Check if there is a register dependency between a load and the insn
14492 for which we hold recog_data. */
14493
14494 static bool
14495 dep_between_memop_and_curr (rtx memop)
14496 {
14497 rtx load_reg;
14498 int opno;
14499
14500 gcc_assert (GET_CODE (memop) == SET);
14501
14502 if (!REG_P (SET_DEST (memop)))
14503 return false;
14504
14505 load_reg = SET_DEST (memop);
14506 for (opno = 1; opno < recog_data.n_operands; opno++)
14507 {
14508 rtx operand = recog_data.operand[opno];
14509 if (REG_P (operand)
14510 && reg_overlap_mentioned_p (load_reg, operand))
14511 return true;
14512
14513 }
14514 return false;
14515 }
14516
14517
14518 /* When working around the Cortex-A53 erratum 835769,
14519 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14520 instruction and has a preceding memory instruction such that a NOP
14521 should be inserted between them. */
14522
14523 bool
14524 aarch64_madd_needs_nop (rtx_insn* insn)
14525 {
14526 enum attr_type attr_type;
14527 rtx_insn *prev;
14528 rtx body;
14529
14530 if (!TARGET_FIX_ERR_A53_835769)
14531 return false;
14532
14533 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14534 return false;
14535
14536 attr_type = get_attr_type (insn);
14537 if (!is_madd_op (attr_type))
14538 return false;
14539
14540 prev = aarch64_prev_real_insn (insn);
14541 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14542 Restore recog state to INSN to avoid state corruption. */
14543 extract_constrain_insn_cached (insn);
14544
14545 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14546 return false;
14547
14548 body = single_set (prev);
14549
14550 /* If the previous insn is a memory op and there is no dependency between
14551 it and the DImode madd, emit a NOP between them. If body is NULL then we
14552 have a complex memory operation, probably a load/store pair.
14553 Be conservative for now and emit a NOP. */
14554 if (GET_MODE (recog_data.operand[0]) == DImode
14555 && (!body || !dep_between_memop_and_curr (body)))
14556 return true;
14557
14558 return false;
14559
14560 }
14561
14562
14563 /* Implement FINAL_PRESCAN_INSN. */
14564
14565 void
14566 aarch64_final_prescan_insn (rtx_insn *insn)
14567 {
14568 if (aarch64_madd_needs_nop (insn))
14569 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14570 }
14571
14572
14573 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14574 instruction. */
14575
14576 bool
14577 aarch64_sve_index_immediate_p (rtx base_or_step)
14578 {
14579 return (CONST_INT_P (base_or_step)
14580 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14581 }
14582
14583 /* Return true if X is a valid immediate for the SVE ADD and SUB
14584 instructions. Negate X first if NEGATE_P is true. */
14585
14586 bool
14587 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14588 {
14589 rtx elt;
14590
14591 if (!const_vec_duplicate_p (x, &elt)
14592 || !CONST_INT_P (elt))
14593 return false;
14594
14595 HOST_WIDE_INT val = INTVAL (elt);
14596 if (negate_p)
14597 val = -val;
14598 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14599
14600 if (val & 0xff)
14601 return IN_RANGE (val, 0, 0xff);
14602 return IN_RANGE (val, 0, 0xff00);
14603 }
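/* In other words, the immediate (interpreted as an unsigned value in the
   element mode, after any requested negation) must be either 0-255 or a
   multiple of 256 in the range 256-65280, matching the "#imm8" and
   "#imm8, LSL #8" forms of the SVE ADD/SUB instructions.  For example,
   7 and 0x3f00 are accepted while 0x101 is not.  */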
14604
14605 /* Return true if X is a valid immediate operand for an SVE logical
14606 instruction such as AND. */
14607
14608 bool
14609 aarch64_sve_bitmask_immediate_p (rtx x)
14610 {
14611 rtx elt;
14612
14613 return (const_vec_duplicate_p (x, &elt)
14614 && CONST_INT_P (elt)
14615 && aarch64_bitmask_imm (INTVAL (elt),
14616 GET_MODE_INNER (GET_MODE (x))));
14617 }
14618
14619 /* Return true if X is a valid immediate for the SVE DUP and CPY
14620 instructions. */
14621
14622 bool
14623 aarch64_sve_dup_immediate_p (rtx x)
14624 {
14625 rtx elt;
14626
14627 if (!const_vec_duplicate_p (x, &elt)
14628 || !CONST_INT_P (elt))
14629 return false;
14630
14631 HOST_WIDE_INT val = INTVAL (elt);
14632 if (val & 0xff)
14633 return IN_RANGE (val, -0x80, 0x7f);
14634 return IN_RANGE (val, -0x8000, 0x7f00);
14635 }
14636
14637 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14638 SIGNED_P says whether the operand is signed rather than unsigned. */
14639
14640 bool
14641 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14642 {
14643 rtx elt;
14644
14645 return (const_vec_duplicate_p (x, &elt)
14646 && CONST_INT_P (elt)
14647 && (signed_p
14648 ? IN_RANGE (INTVAL (elt), -16, 15)
14649 : IN_RANGE (INTVAL (elt), 0, 127)));
14650 }
14651
14652 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14653 instruction. Negate X first if NEGATE_P is true. */
14654
14655 bool
14656 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14657 {
14658 rtx elt;
14659 REAL_VALUE_TYPE r;
14660
14661 if (!const_vec_duplicate_p (x, &elt)
14662 || GET_CODE (elt) != CONST_DOUBLE)
14663 return false;
14664
14665 r = *CONST_DOUBLE_REAL_VALUE (elt);
14666
14667 if (negate_p)
14668 r = real_value_negate (&r);
14669
14670 if (real_equal (&r, &dconst1))
14671 return true;
14672 if (real_equal (&r, &dconsthalf))
14673 return true;
14674 return false;
14675 }
14676
14677 /* Return true if X is a valid immediate operand for an SVE FMUL
14678 instruction. */
14679
14680 bool
14681 aarch64_sve_float_mul_immediate_p (rtx x)
14682 {
14683 rtx elt;
14684
14685 /* GCC will never generate a multiply with an immediate of 2, so there is no
14686 point testing for it (even though it is a valid constant). */
14687 return (const_vec_duplicate_p (x, &elt)
14688 && GET_CODE (elt) == CONST_DOUBLE
14689 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14690 }
14691
14692 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14693 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14694 is nonnull, use it to describe valid immediates. */
14695 static bool
14696 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14697 simd_immediate_info *info,
14698 enum simd_immediate_check which,
14699 simd_immediate_info::insn_type insn)
14700 {
14701 /* Try a 4-byte immediate with LSL. */
14702 for (unsigned int shift = 0; shift < 32; shift += 8)
14703 if ((val32 & (0xff << shift)) == val32)
14704 {
14705 if (info)
14706 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14707 simd_immediate_info::LSL, shift);
14708 return true;
14709 }
14710
14711 /* Try a 2-byte immediate with LSL. */
14712 unsigned int imm16 = val32 & 0xffff;
14713 if (imm16 == (val32 >> 16))
14714 for (unsigned int shift = 0; shift < 16; shift += 8)
14715 if ((imm16 & (0xff << shift)) == imm16)
14716 {
14717 if (info)
14718 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14719 simd_immediate_info::LSL, shift);
14720 return true;
14721 }
14722
14723 /* Try a 4-byte immediate with MSL, except for cases that MVN
14724 can handle. */
14725 if (which == AARCH64_CHECK_MOV)
14726 for (unsigned int shift = 8; shift < 24; shift += 8)
14727 {
14728 unsigned int low = (1 << shift) - 1;
14729 if (((val32 & (0xff << shift)) | low) == val32)
14730 {
14731 if (info)
14732 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14733 simd_immediate_info::MSL, shift);
14734 return true;
14735 }
14736 }
14737
14738 return false;
14739 }
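/* Worked examples (a sketch, taking the MOV case):

	val32 = 0x00ab0000  ->  (SImode, 0xab, MOV, LSL, 16)
	val32 = 0x00cd00cd  ->  (HImode, 0xcd, MOV, LSL, 0)
	val32 = 0x0000abff  ->  (SImode, 0xab, MOV, MSL, 8)

   corresponding roughly to "movi vD.4s, #0xab, lsl #16",
   "movi vD.8h, #0xcd" and "movi vD.4s, #0xab, msl #8".  */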
14740
14741 /* Return true if replicating VAL64 is a valid immediate for the
14742 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14743 use it to describe valid immediates. */
14744 static bool
14745 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14746 simd_immediate_info *info,
14747 enum simd_immediate_check which)
14748 {
14749 unsigned int val32 = val64 & 0xffffffff;
14750 unsigned int val16 = val64 & 0xffff;
14751 unsigned int val8 = val64 & 0xff;
14752
14753 if (val32 == (val64 >> 32))
14754 {
14755 if ((which & AARCH64_CHECK_ORR) != 0
14756 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14757 simd_immediate_info::MOV))
14758 return true;
14759
14760 if ((which & AARCH64_CHECK_BIC) != 0
14761 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14762 simd_immediate_info::MVN))
14763 return true;
14764
14765 /* Try using a replicated byte. */
14766 if (which == AARCH64_CHECK_MOV
14767 && val16 == (val32 >> 16)
14768 && val8 == (val16 >> 8))
14769 {
14770 if (info)
14771 *info = simd_immediate_info (QImode, val8);
14772 return true;
14773 }
14774 }
14775
14776 /* Try using a bit-to-bytemask. */
14777 if (which == AARCH64_CHECK_MOV)
14778 {
14779 unsigned int i;
14780 for (i = 0; i < 64; i += 8)
14781 {
14782 unsigned char byte = (val64 >> i) & 0xff;
14783 if (byte != 0 && byte != 0xff)
14784 break;
14785 }
14786 if (i == 64)
14787 {
14788 if (info)
14789 *info = simd_immediate_info (DImode, val64);
14790 return true;
14791 }
14792 }
14793 return false;
14794 }
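/* For example, with WHICH == AARCH64_CHECK_MOV, 0x00ff00ffff0000ff passes
   the bit-to-bytemask test above (every byte is either 0x00 or 0xff) and
   can be encoded as a single 64-bit MOVI, whereas 0x0102030401020304 is
   rejected even though its two 32-bit halves are equal, because 0x01020304
   matches none of the byte, halfword or shifted-word forms tried above.  */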
14795
14796 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14797 instruction. If INFO is nonnull, use it to describe valid immediates. */
14798
14799 static bool
14800 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14801 simd_immediate_info *info)
14802 {
14803 scalar_int_mode mode = DImode;
14804 unsigned int val32 = val64 & 0xffffffff;
14805 if (val32 == (val64 >> 32))
14806 {
14807 mode = SImode;
14808 unsigned int val16 = val32 & 0xffff;
14809 if (val16 == (val32 >> 16))
14810 {
14811 mode = HImode;
14812 unsigned int val8 = val16 & 0xff;
14813 if (val8 == (val16 >> 8))
14814 mode = QImode;
14815 }
14816 }
14817 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14818 if (IN_RANGE (val, -0x80, 0x7f))
14819 {
14820 /* DUP with no shift. */
14821 if (info)
14822 *info = simd_immediate_info (mode, val);
14823 return true;
14824 }
14825 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14826 {
14827 /* DUP with LSL #8. */
14828 if (info)
14829 *info = simd_immediate_info (mode, val);
14830 return true;
14831 }
14832 if (aarch64_bitmask_imm (val64, mode))
14833 {
14834 /* DUPM. */
14835 if (info)
14836 *info = simd_immediate_info (mode, val);
14837 return true;
14838 }
14839 return false;
14840 }
14841
14842 /* Return true if OP is a valid SIMD immediate for the operation
14843 described by WHICH. If INFO is nonnull, use it to describe valid
14844 immediates. */
14845 bool
14846 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14847 enum simd_immediate_check which)
14848 {
14849 machine_mode mode = GET_MODE (op);
14850 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14851 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14852 return false;
14853
14854 scalar_mode elt_mode = GET_MODE_INNER (mode);
14855 rtx base, step;
14856 unsigned int n_elts;
14857 if (GET_CODE (op) == CONST_VECTOR
14858 && CONST_VECTOR_DUPLICATE_P (op))
14859 n_elts = CONST_VECTOR_NPATTERNS (op);
14860 else if ((vec_flags & VEC_SVE_DATA)
14861 && const_vec_series_p (op, &base, &step))
14862 {
14863 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14864 if (!aarch64_sve_index_immediate_p (base)
14865 || !aarch64_sve_index_immediate_p (step))
14866 return false;
14867
14868 if (info)
14869 *info = simd_immediate_info (elt_mode, base, step);
14870 return true;
14871 }
14872 else if (GET_CODE (op) == CONST_VECTOR
14873 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14874 /* N_ELTS set above. */;
14875 else
14876 return false;
14877
14878 /* Handle PFALSE and PTRUE. */
14879 if (vec_flags & VEC_SVE_PRED)
14880 return (op == CONST0_RTX (mode)
14881 || op == CONSTM1_RTX (mode));
14882
14883 scalar_float_mode elt_float_mode;
14884 if (n_elts == 1
14885 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14886 {
14887 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14888 if (aarch64_float_const_zero_rtx_p (elt)
14889 || aarch64_float_const_representable_p (elt))
14890 {
14891 if (info)
14892 *info = simd_immediate_info (elt_float_mode, elt);
14893 return true;
14894 }
14895 }
14896
14897 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14898 if (elt_size > 8)
14899 return false;
14900
14901 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14902
14903 /* Expand the vector constant out into a byte vector, with the least
14904 significant byte of the register first. */
14905 auto_vec<unsigned char, 16> bytes;
14906 bytes.reserve (n_elts * elt_size);
14907 for (unsigned int i = 0; i < n_elts; i++)
14908 {
14909 /* The vector is provided in gcc endian-neutral fashion.
14910 For aarch64_be Advanced SIMD, it must be laid out in the vector
14911 register in reverse order. */
14912 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14913 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14914
14915 if (elt_mode != elt_int_mode)
14916 elt = gen_lowpart (elt_int_mode, elt);
14917
14918 if (!CONST_INT_P (elt))
14919 return false;
14920
14921 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14922 for (unsigned int byte = 0; byte < elt_size; byte++)
14923 {
14924 bytes.quick_push (elt_val & 0xff);
14925 elt_val >>= BITS_PER_UNIT;
14926 }
14927 }
14928
14929 /* The immediate must repeat every eight bytes. */
14930 unsigned int nbytes = bytes.length ();
14931 for (unsigned i = 8; i < nbytes; ++i)
14932 if (bytes[i] != bytes[i - 8])
14933 return false;
14934
14935 /* Get the repeating 8-byte value as an integer. No endian correction
14936 is needed here because bytes is already in lsb-first order. */
14937 unsigned HOST_WIDE_INT val64 = 0;
14938 for (unsigned int i = 0; i < 8; i++)
14939 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14940 << (i * BITS_PER_UNIT));
14941
14942 if (vec_flags & VEC_SVE_DATA)
14943 return aarch64_sve_valid_immediate (val64, info);
14944 else
14945 return aarch64_advsimd_valid_immediate (val64, info, which);
14946 }
14947
14948 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14949    has a step in the range of an SVE INDEX instruction.  Return the step if so,
14950 otherwise return null. */
14951 rtx
14952 aarch64_check_zero_based_sve_index_immediate (rtx x)
14953 {
14954 rtx base, step;
14955 if (const_vec_series_p (x, &base, &step)
14956 && base == const0_rtx
14957 && aarch64_sve_index_immediate_p (step))
14958 return step;
14959 return NULL_RTX;
14960 }
14961
14962 /* Check whether immediate shift constants are within range.  */
14963 bool
14964 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14965 {
14966 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14967 if (left)
14968 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14969 else
14970 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14971 }
14972
14973 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14974 operation of width WIDTH at bit position POS. */
14975
14976 rtx
14977 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14978 {
14979 gcc_assert (CONST_INT_P (width));
14980 gcc_assert (CONST_INT_P (pos));
14981
14982 unsigned HOST_WIDE_INT mask
14983 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14984 return GEN_INT (mask << UINTVAL (pos));
14985 }
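/* For example, WIDTH == 8 and POS == 16 give the mask 0x00ff0000, i.e.
   (((unsigned HOST_WIDE_INT) 1 << 8) - 1) << 16, selecting bits 16-23.  */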
14986
14987 bool
14988 aarch64_mov_operand_p (rtx x, machine_mode mode)
14989 {
14990 if (GET_CODE (x) == HIGH
14991 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14992 return true;
14993
14994 if (CONST_INT_P (x))
14995 return true;
14996
14997 if (VECTOR_MODE_P (GET_MODE (x)))
14998 return aarch64_simd_valid_immediate (x, NULL);
14999
15000 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15001 return true;
15002
15003 if (aarch64_sve_cnt_immediate_p (x))
15004 return true;
15005
15006 return aarch64_classify_symbolic_expression (x)
15007 == SYMBOL_TINY_ABSOLUTE;
15008 }
15009
15010 /* Return a const_int vector of VAL. */
15011 rtx
15012 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15013 {
15014 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15015 return gen_const_vec_duplicate (mode, c);
15016 }
15017
15018 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15019
15020 bool
15021 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15022 {
15023 machine_mode vmode;
15024
15025 vmode = aarch64_simd_container_mode (mode, 64);
15026 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15027 return aarch64_simd_valid_immediate (op_v, NULL);
15028 }
15029
15030 /* Construct and return a PARALLEL RTX vector with elements numbering the
15031 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15032 the vector - from the perspective of the architecture. This does not
15033 line up with GCC's perspective on lane numbers, so we end up with
15034 different masks depending on our target endian-ness. The diagram
15035 below may help. We must draw the distinction when building masks
15036 which select one half of the vector. An instruction selecting
15037 architectural low-lanes for a big-endian target, must be described using
15038 a mask selecting GCC high-lanes.
15039
15040 Big-Endian Little-Endian
15041
15042 GCC 0 1 2 3 3 2 1 0
15043 | x | x | x | x | | x | x | x | x |
15044 Architecture 3 2 1 0 3 2 1 0
15045
15046 Low Mask: { 2, 3 } { 0, 1 }
15047 High Mask: { 0, 1 } { 2, 3 }
15048
15049 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15050
15051 rtx
15052 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15053 {
15054 rtvec v = rtvec_alloc (nunits / 2);
15055 int high_base = nunits / 2;
15056 int low_base = 0;
15057 int base;
15058 rtx t1;
15059 int i;
15060
15061 if (BYTES_BIG_ENDIAN)
15062 base = high ? low_base : high_base;
15063 else
15064 base = high ? high_base : low_base;
15065
15066 for (i = 0; i < nunits / 2; i++)
15067 RTVEC_ELT (v, i) = GEN_INT (base + i);
15068
15069 t1 = gen_rtx_PARALLEL (mode, v);
15070 return t1;
15071 }
15072
15073 /* Check OP for validity as a PARALLEL RTX vector with elements
15074 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15075 from the perspective of the architecture. See the diagram above
15076 aarch64_simd_vect_par_cnst_half for more details. */
15077
15078 bool
15079 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15080 bool high)
15081 {
15082 int nelts;
15083 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15084 return false;
15085
15086 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15087 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15088 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15089 int i = 0;
15090
15091 if (count_op != count_ideal)
15092 return false;
15093
15094 for (i = 0; i < count_ideal; i++)
15095 {
15096 rtx elt_op = XVECEXP (op, 0, i);
15097 rtx elt_ideal = XVECEXP (ideal, 0, i);
15098
15099 if (!CONST_INT_P (elt_op)
15100 || INTVAL (elt_ideal) != INTVAL (elt_op))
15101 return false;
15102 }
15103 return true;
15104 }
15105
15106 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15107 HIGH (exclusive). */
15108 void
15109 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15110 const_tree exp)
15111 {
15112 HOST_WIDE_INT lane;
15113 gcc_assert (CONST_INT_P (operand));
15114 lane = INTVAL (operand);
15115
15116 if (lane < low || lane >= high)
15117 {
15118 if (exp)
15119 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15120 else
15121 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15122 }
15123 }
15124
15125 /* Perform endian correction on lane number N, which indexes a vector
15126 of mode MODE, and return the result as an SImode rtx. */
15127
15128 rtx
15129 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15130 {
15131 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15132 }
15133
15134 /* Return TRUE if OP is a valid vector addressing mode. */
15135
15136 bool
15137 aarch64_simd_mem_operand_p (rtx op)
15138 {
15139 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15140 || REG_P (XEXP (op, 0)));
15141 }
15142
15143 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15144
15145 bool
15146 aarch64_sve_ld1r_operand_p (rtx op)
15147 {
15148 struct aarch64_address_info addr;
15149 scalar_mode mode;
15150
15151 return (MEM_P (op)
15152 && is_a <scalar_mode> (GET_MODE (op), &mode)
15153 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15154 && addr.type == ADDRESS_REG_IMM
15155 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15156 }
15157
15158 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15159 The conditions for STR are the same. */
15160 bool
15161 aarch64_sve_ldr_operand_p (rtx op)
15162 {
15163 struct aarch64_address_info addr;
15164
15165 return (MEM_P (op)
15166 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15167 false, ADDR_QUERY_ANY)
15168 && addr.type == ADDRESS_REG_IMM);
15169 }
15170
15171 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15172 We need to be able to access the individual pieces, so the range
15173 is different from LD[234] and ST[234]. */
15174 bool
15175 aarch64_sve_struct_memory_operand_p (rtx op)
15176 {
15177 if (!MEM_P (op))
15178 return false;
15179
15180 machine_mode mode = GET_MODE (op);
15181 struct aarch64_address_info addr;
15182 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15183 ADDR_QUERY_ANY)
15184 || addr.type != ADDRESS_REG_IMM)
15185 return false;
15186
15187 poly_int64 first = addr.const_offset;
15188 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15189 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15190 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15191 }
15192
15193 /* Emit a register copy from operand to operand, taking care not to
15194 early-clobber source registers in the process.
15195
15196 COUNT is the number of components into which the copy needs to be
15197 decomposed. */
15198 void
15199 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15200 unsigned int count)
15201 {
15202 unsigned int i;
15203 int rdest = REGNO (operands[0]);
15204 int rsrc = REGNO (operands[1]);
15205
15206 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15207 || rdest < rsrc)
15208 for (i = 0; i < count; i++)
15209 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15210 gen_rtx_REG (mode, rsrc + i));
15211 else
15212 for (i = 0; i < count; i++)
15213 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15214 gen_rtx_REG (mode, rsrc + count - i - 1));
15215 }
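/* For example, copying an OImode value (COUNT == 2) from {V1, V2} to
   {V2, V3} overlaps with RDEST > RSRC, so the loop above copies V3 <- V2
   first and only then V2 <- V1; copying in the forward order would
   clobber V2 before it had been read.  */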
15216
15217 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15218 one of VSTRUCT modes: OI, CI, or XI. */
15219 int
15220 aarch64_simd_attr_length_rglist (machine_mode mode)
15221 {
15222 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15223 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15224 }
15225
15226 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15227 alignment of a vector to 128 bits. SVE predicates have an alignment of
15228 16 bits. */
15229 static HOST_WIDE_INT
15230 aarch64_simd_vector_alignment (const_tree type)
15231 {
15232 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15233 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15234 be set for non-predicate vectors of booleans. Modes are the most
15235 direct way we have of identifying real SVE predicate types. */
15236 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15237 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15238 }
15239
15240 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15241 static poly_uint64
15242 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15243 {
15244 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15245 {
15246 /* If the length of the vector is fixed, try to align to that length,
15247 otherwise don't try to align at all. */
15248 HOST_WIDE_INT result;
15249 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15250 result = TYPE_ALIGN (TREE_TYPE (type));
15251 return result;
15252 }
15253 return TYPE_ALIGN (type);
15254 }
15255
15256 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15257 static bool
15258 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15259 {
15260 if (is_packed)
15261 return false;
15262
15263 /* For fixed-length vectors, check that the vectorizer will aim for
15264 full-vector alignment. This isn't true for generic GCC vectors
15265 that are wider than the ABI maximum of 128 bits. */
15266 poly_uint64 preferred_alignment =
15267 aarch64_vectorize_preferred_vector_alignment (type);
15268 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15269 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15270 preferred_alignment))
15271 return false;
15272
15273 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15274 return true;
15275 }
15276
15277 /* Return true if the vector misalignment factor is supported by the
15278 target. */
15279 static bool
15280 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15281 const_tree type, int misalignment,
15282 bool is_packed)
15283 {
15284 if (TARGET_SIMD && STRICT_ALIGNMENT)
15285 {
15286 /* Return if movmisalign pattern is not supported for this mode. */
15287 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15288 return false;
15289
15290 /* Misalignment factor is unknown at compile time. */
15291 if (misalignment == -1)
15292 return false;
15293 }
15294 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15295 is_packed);
15296 }
15297
15298 /* If VALS is a vector constant that can be loaded into a register
15299 using DUP, generate instructions to do so and return an RTX to
15300 assign to the register. Otherwise return NULL_RTX. */
15301 static rtx
15302 aarch64_simd_dup_constant (rtx vals)
15303 {
15304 machine_mode mode = GET_MODE (vals);
15305 machine_mode inner_mode = GET_MODE_INNER (mode);
15306 rtx x;
15307
15308 if (!const_vec_duplicate_p (vals, &x))
15309 return NULL_RTX;
15310
15311 /* We can load this constant by using DUP and a constant in a
15312 single ARM register. This will be cheaper than a vector
15313 load. */
15314 x = copy_to_mode_reg (inner_mode, x);
15315 return gen_vec_duplicate (mode, x);
15316 }
15317
15318
15319 /* Generate code to load VALS, which is a PARALLEL containing only
15320 constants (for vec_init) or CONST_VECTOR, efficiently into a
15321 register. Returns an RTX to copy into the register, or NULL_RTX
15322 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15323 static rtx
15324 aarch64_simd_make_constant (rtx vals)
15325 {
15326 machine_mode mode = GET_MODE (vals);
15327 rtx const_dup;
15328 rtx const_vec = NULL_RTX;
15329 int n_const = 0;
15330 int i;
15331
15332 if (GET_CODE (vals) == CONST_VECTOR)
15333 const_vec = vals;
15334 else if (GET_CODE (vals) == PARALLEL)
15335 {
15336 /* A CONST_VECTOR must contain only CONST_INTs and
15337 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15338 Only store valid constants in a CONST_VECTOR. */
15339 int n_elts = XVECLEN (vals, 0);
15340 for (i = 0; i < n_elts; ++i)
15341 {
15342 rtx x = XVECEXP (vals, 0, i);
15343 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15344 n_const++;
15345 }
15346 if (n_const == n_elts)
15347 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15348 }
15349 else
15350 gcc_unreachable ();
15351
15352 if (const_vec != NULL_RTX
15353 && aarch64_simd_valid_immediate (const_vec, NULL))
15354 /* Load using MOVI/MVNI. */
15355 return const_vec;
15356 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15357 /* Loaded using DUP. */
15358 return const_dup;
15359 else if (const_vec != NULL_RTX)
15360 /* Load from constant pool. We cannot take advantage of single-cycle
15361 LD1 because we need a PC-relative addressing mode. */
15362 return const_vec;
15363 else
15364 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15365 We cannot construct an initializer. */
15366 return NULL_RTX;
15367 }
15368
15369 /* Expand a vector initialisation sequence, such that TARGET is
15370 initialised to contain VALS. */
15371
15372 void
15373 aarch64_expand_vector_init (rtx target, rtx vals)
15374 {
15375 machine_mode mode = GET_MODE (target);
15376 scalar_mode inner_mode = GET_MODE_INNER (mode);
15377 /* The number of vector elements. */
15378 int n_elts = XVECLEN (vals, 0);
15379 /* The number of vector elements which are not constant. */
15380 int n_var = 0;
15381 rtx any_const = NULL_RTX;
15382 /* The first element of vals. */
15383 rtx v0 = XVECEXP (vals, 0, 0);
15384 bool all_same = true;
15385
15386 /* Count the number of variable elements to initialise. */
15387 for (int i = 0; i < n_elts; ++i)
15388 {
15389 rtx x = XVECEXP (vals, 0, i);
15390 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15391 ++n_var;
15392 else
15393 any_const = x;
15394
15395 all_same &= rtx_equal_p (x, v0);
15396 }
15397
15398 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15399 how best to handle this. */
15400 if (n_var == 0)
15401 {
15402 rtx constant = aarch64_simd_make_constant (vals);
15403 if (constant != NULL_RTX)
15404 {
15405 emit_move_insn (target, constant);
15406 return;
15407 }
15408 }
15409
15410 /* Splat a single non-constant element if we can. */
15411 if (all_same)
15412 {
15413 rtx x = copy_to_mode_reg (inner_mode, v0);
15414 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15415 return;
15416 }
15417
15418 enum insn_code icode = optab_handler (vec_set_optab, mode);
15419 gcc_assert (icode != CODE_FOR_nothing);
15420
15421 /* If there are only variable elements, try to optimize
15422 the insertion using dup for the most common element
15423 followed by insertions. */
15424
15425 /* The algorithm will fill matches[*][0] with the earliest matching element,
15426 and matches[X][1] with the count of duplicate elements (if X is the
15427 earliest element which has duplicates). */
15428
15429 if (n_var == n_elts && n_elts <= 16)
15430 {
15431 int matches[16][2] = {0};
15432 for (int i = 0; i < n_elts; i++)
15433 {
15434 for (int j = 0; j <= i; j++)
15435 {
15436 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15437 {
15438 matches[i][0] = j;
15439 matches[j][1]++;
15440 break;
15441 }
15442 }
15443 }
15444 int maxelement = 0;
15445 int maxv = 0;
15446 for (int i = 0; i < n_elts; i++)
15447 if (matches[i][1] > maxv)
15448 {
15449 maxelement = i;
15450 maxv = matches[i][1];
15451 }
15452
15453 /* Create a duplicate of the most common element, unless all elements
15454 are equally useless to us, in which case just immediately set the
15455 vector register using the first element. */
15456
15457 if (maxv == 1)
15458 {
15459 /* For vectors of two 64-bit elements, we can do even better. */
15460 if (n_elts == 2
15461 && (inner_mode == E_DImode
15462 || inner_mode == E_DFmode))
15463
15464 {
15465 rtx x0 = XVECEXP (vals, 0, 0);
15466 rtx x1 = XVECEXP (vals, 0, 1);
15467 /* Combine can pick up this case, but handling it directly
15468 here leaves clearer RTL.
15469
15470 This is load_pair_lanes<mode>, and also gives us a clean-up
15471 for store_pair_lanes<mode>. */
15472 if (memory_operand (x0, inner_mode)
15473 && memory_operand (x1, inner_mode)
15474 && !STRICT_ALIGNMENT
15475 && rtx_equal_p (XEXP (x1, 0),
15476 plus_constant (Pmode,
15477 XEXP (x0, 0),
15478 GET_MODE_SIZE (inner_mode))))
15479 {
15480 rtx t;
15481 if (inner_mode == DFmode)
15482 t = gen_load_pair_lanesdf (target, x0, x1);
15483 else
15484 t = gen_load_pair_lanesdi (target, x0, x1);
15485 emit_insn (t);
15486 return;
15487 }
15488 }
15489 /* The subreg-move sequence below will move into lane zero of the
15490 vector register. For big-endian we want that position to hold
15491 the last element of VALS. */
15492 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15493 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15494 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15495 }
15496 else
15497 {
15498 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15499 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15500 }
15501
15502 /* Insert the rest. */
15503 for (int i = 0; i < n_elts; i++)
15504 {
15505 rtx x = XVECEXP (vals, 0, i);
15506 if (matches[i][0] == maxelement)
15507 continue;
15508 x = copy_to_mode_reg (inner_mode, x);
15509 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15510 }
15511 return;
15512 }
15513
15514 /* Initialise a vector which is part-variable. We want to first try
15515 to build those lanes which are constant in the most efficient way we
15516 can. */
15517 if (n_var != n_elts)
15518 {
15519 rtx copy = copy_rtx (vals);
15520
15521 /* Load constant part of vector. We really don't care what goes into the
15522 parts we will overwrite, but we're more likely to be able to load the
15523 constant efficiently if it has fewer, larger, repeating parts
15524 (see aarch64_simd_valid_immediate). */
15525 for (int i = 0; i < n_elts; i++)
15526 {
15527 rtx x = XVECEXP (vals, 0, i);
15528 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15529 continue;
15530 rtx subst = any_const;
15531 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15532 {
15533 /* Look in the copied vector, as more elements are const. */
15534 rtx test = XVECEXP (copy, 0, i ^ bit);
15535 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15536 {
15537 subst = test;
15538 break;
15539 }
15540 }
15541 XVECEXP (copy, 0, i) = subst;
15542 }
15543 aarch64_expand_vector_init (target, copy);
15544 }
15545
15546 /* Insert the variable lanes directly. */
15547 for (int i = 0; i < n_elts; i++)
15548 {
15549 rtx x = XVECEXP (vals, 0, i);
15550 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15551 continue;
15552 x = copy_to_mode_reg (inner_mode, x);
15553 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15554 }
15555 }
15556
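/* Target hook for SHIFT_TRUNCATION_MASK.  Scalar shift counts are truncated
   to the operand width, so e.g. DImode yields a mask of 63; for vector modes
   (and when SHIFT_COUNT_TRUNCATED is not in force) return 0, meaning no
   truncation may be assumed.  */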
15557 static unsigned HOST_WIDE_INT
15558 aarch64_shift_truncation_mask (machine_mode mode)
15559 {
15560 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15561 return 0;
15562 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15563 }
15564
15565 /* Select a format to encode pointers in exception handling data. */
15566 int
15567 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15568 {
15569 int type;
15570 switch (aarch64_cmodel)
15571 {
15572 case AARCH64_CMODEL_TINY:
15573 case AARCH64_CMODEL_TINY_PIC:
15574 case AARCH64_CMODEL_SMALL:
15575 case AARCH64_CMODEL_SMALL_PIC:
15576 case AARCH64_CMODEL_SMALL_SPIC:
15577 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15578 for everything. */
15579 type = DW_EH_PE_sdata4;
15580 break;
15581 default:
15582 /* No assumptions here. 8-byte relocs required. */
15583 type = DW_EH_PE_sdata8;
15584 break;
15585 }
15586 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15587 }
15588
15589 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
15590
15591 static void
15592 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15593 {
15594 if (aarch64_simd_decl_p (decl))
15595 {
15596 fprintf (stream, "\t.variant_pcs\t");
15597 assemble_name (stream, name);
15598 fprintf (stream, "\n");
15599 }
15600 }
15601
15602 /* The last .arch and .tune assembly strings that we printed. */
15603 static std::string aarch64_last_printed_arch_string;
15604 static std::string aarch64_last_printed_tune_string;
15605
15606 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15607 by the function fndecl. */
15608
15609 void
15610 aarch64_declare_function_name (FILE *stream, const char* name,
15611 tree fndecl)
15612 {
15613 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15614
15615 struct cl_target_option *targ_options;
15616 if (target_parts)
15617 targ_options = TREE_TARGET_OPTION (target_parts);
15618 else
15619 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15620 gcc_assert (targ_options);
15621
15622 const struct processor *this_arch
15623 = aarch64_get_arch (targ_options->x_explicit_arch);
15624
15625 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15626 std::string extension
15627 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15628 this_arch->flags);
15629 /* Only update the assembler .arch string if it is distinct from the last
15630 such string we printed. */
15631 std::string to_print = this_arch->name + extension;
15632 if (to_print != aarch64_last_printed_arch_string)
15633 {
15634 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15635 aarch64_last_printed_arch_string = to_print;
15636 }
15637
15638 /* Print the cpu name we're tuning for in the comments; it might be
15639 useful to readers of the generated asm.  Do it only when it changes
15640 from function to function and verbose assembly is requested. */
15641 const struct processor *this_tune
15642 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15643
15644 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15645 {
15646 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15647 this_tune->name);
15648 aarch64_last_printed_tune_string = this_tune->name;
15649 }
15650
15651 aarch64_asm_output_variant_pcs (stream, fndecl, name);
15652
15653 /* Don't forget the type directive for ELF. */
15654 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15655 ASM_OUTPUT_LABEL (stream, name);
15656
15657 cfun->machine->label_is_assembled = true;
15658 }
15659
15660 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
15661 the function label and emit a BTI if necessary. */
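/* For example, with -fpatchable-function-entry and branch protection
   enabled, this moves the "bti c" landing pad so that it precedes the
   NOP patch area instead of following it.  */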
15662
15663 void
15664 aarch64_print_patchable_function_entry (FILE *file,
15665 unsigned HOST_WIDE_INT patch_area_size,
15666 bool record_p)
15667 {
15668 if (cfun->machine->label_is_assembled
15669 && aarch64_bti_enabled ()
15670 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
15671 {
15672 /* Remove the BTI that follows the patch area and insert a new BTI
15673 before the patch area right after the function label. */
15674 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
15675 if (insn
15676 && INSN_P (insn)
15677 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
15678 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
15679 delete_insn (insn);
15680 asm_fprintf (file, "\thint\t34 // bti c\n");
15681 }
15682
15683 default_print_patchable_function_entry (file, patch_area_size, record_p);
15684 }
15685
15686 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
15687
15688 void
15689 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15690 {
15691 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15692 const char *value = IDENTIFIER_POINTER (target);
15693 aarch64_asm_output_variant_pcs (stream, decl, name);
15694 ASM_OUTPUT_DEF (stream, name, value);
15695 }
15696
15697 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
15698 function symbol references. */
15699
15700 void
15701 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15702 {
15703 default_elf_asm_output_external (stream, decl, name);
15704 aarch64_asm_output_variant_pcs (stream, decl, name);
15705 }
15706
15707 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15708
15709 static void
15710 aarch64_start_file (void)
15711 {
15712 struct cl_target_option *default_options
15713 = TREE_TARGET_OPTION (target_option_default_node);
15714
15715 const struct processor *default_arch
15716 = aarch64_get_arch (default_options->x_explicit_arch);
15717 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15718 std::string extension
15719 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15720 default_arch->flags);
15721
15722 aarch64_last_printed_arch_string = default_arch->name + extension;
15723 aarch64_last_printed_tune_string = "";
15724 asm_fprintf (asm_out_file, "\t.arch %s\n",
15725 aarch64_last_printed_arch_string.c_str ());
15726
15727 default_file_start ();
15728 }
15729
15730 /* Emit load exclusive. */
15731
15732 static void
15733 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15734 rtx mem, rtx model_rtx)
15735 {
15736 if (mode == TImode)
15737 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
15738 gen_highpart (DImode, rval),
15739 mem, model_rtx));
15740 else
15741 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15742 }
15743
15744 /* Emit store exclusive. */
15745
15746 static void
15747 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15748 rtx mem, rtx rval, rtx model_rtx)
15749 {
15750 if (mode == TImode)
15751 emit_insn (gen_aarch64_store_exclusive_pair
15752 (bval, mem, operand_subword (rval, 0, 0, TImode),
15753 operand_subword (rval, 1, 0, TImode), model_rtx));
15754 else
15755 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
15756 }
15757
15758 /* Mark the previous jump instruction as unlikely. */
15759
15760 static void
15761 aarch64_emit_unlikely_jump (rtx insn)
15762 {
15763 rtx_insn *jump = emit_jump_insn (insn);
15764 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15765 }
15766
15767 /* We store the names of the various atomic helpers in a 5x4 array.
15768 Return the libcall function given MODE, MODEL and NAMES. */
15769
15770 rtx
15771 aarch64_atomic_ool_func (machine_mode mode, rtx model_rtx,
15772 const atomic_ool_names *names)
15773 {
15774 memmodel model = memmodel_base (INTVAL (model_rtx));
15775 int mode_idx, model_idx;
15776
15777 switch (mode)
15778 {
15779 case E_QImode:
15780 mode_idx = 0;
15781 break;
15782 case E_HImode:
15783 mode_idx = 1;
15784 break;
15785 case E_SImode:
15786 mode_idx = 2;
15787 break;
15788 case E_DImode:
15789 mode_idx = 3;
15790 break;
15791 case E_TImode:
15792 mode_idx = 4;
15793 break;
15794 default:
15795 gcc_unreachable ();
15796 }
15797
15798 switch (model)
15799 {
15800 case MEMMODEL_RELAXED:
15801 model_idx = 0;
15802 break;
15803 case MEMMODEL_CONSUME:
15804 case MEMMODEL_ACQUIRE:
15805 model_idx = 1;
15806 break;
15807 case MEMMODEL_RELEASE:
15808 model_idx = 2;
15809 break;
15810 case MEMMODEL_ACQ_REL:
15811 case MEMMODEL_SEQ_CST:
15812 model_idx = 3;
15813 break;
15814 default:
15815 gcc_unreachable ();
15816 }
15817
15818 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
15819 VISIBILITY_HIDDEN);
15820 }
15821
15822 #define DEF0(B, N) \
15823 { "__aarch64_" #B #N "_relax", \
15824 "__aarch64_" #B #N "_acq", \
15825 "__aarch64_" #B #N "_rel", \
15826 "__aarch64_" #B #N "_acq_rel" }
15827
15828 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
15829 { NULL, NULL, NULL, NULL }
15830 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
15831
15832 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
15833 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
15834 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
15835 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
15836 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
15837 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
15838
15839 #undef DEF0
15840 #undef DEF4
15841 #undef DEF5
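/* For illustration: with the DEF* macros above, aarch64_ool_cas_names.str[2][1]
   is "__aarch64_cas4_acq", i.e. the out-of-line helper for a 4-byte (SImode)
   compare-and-swap with acquire semantics, matching the mode_idx and model_idx
   values computed in aarch64_atomic_ool_func.  */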
15842
15843 /* Expand a compare and swap pattern. */
15844
15845 void
15846 aarch64_expand_compare_and_swap (rtx operands[])
15847 {
15848 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15849 machine_mode mode, r_mode;
15850
15851 bval = operands[0];
15852 rval = operands[1];
15853 mem = operands[2];
15854 oldval = operands[3];
15855 newval = operands[4];
15856 is_weak = operands[5];
15857 mod_s = operands[6];
15858 mod_f = operands[7];
15859 mode = GET_MODE (mem);
15860
15861 /* Normally the succ memory model must be stronger than fail, but in the
15862 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15863 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15864 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15865 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15866 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15867
15868 r_mode = mode;
15869 if (mode == QImode || mode == HImode)
15870 {
15871 r_mode = SImode;
15872 rval = gen_reg_rtx (r_mode);
15873 }
15874
15875 if (TARGET_LSE)
15876 {
15877 /* The CAS insn requires oldval and rval overlap, but we need to
15878 have a copy of oldval saved across the operation to tell if
15879 the operation is successful. */
15880 if (reg_overlap_mentioned_p (rval, oldval))
15881 rval = copy_to_mode_reg (r_mode, oldval);
15882 else
15883 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15884
15885 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15886 newval, mod_s));
15887 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15888 }
15889 else if (TARGET_OUTLINE_ATOMICS)
15890 {
15891 /* Oldval must satisfy compare afterward. */
15892 if (!aarch64_plus_operand (oldval, mode))
15893 oldval = force_reg (mode, oldval);
15894 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
15895 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
15896 oldval, mode, newval, mode,
15897 XEXP (mem, 0), Pmode);
15898 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15899 }
15900 else
15901 {
15902 /* The oldval predicate varies by mode. Test it and force to reg. */
15903 insn_code code = code_for_aarch64_compare_and_swap (mode);
15904 if (!insn_data[code].operand[2].predicate (oldval, mode))
15905 oldval = force_reg (mode, oldval);
15906
15907 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15908 is_weak, mod_s, mod_f));
15909 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15910 }
15911
15912 if (r_mode != mode)
15913 rval = gen_lowpart (mode, rval);
15914 emit_move_insn (operands[1], rval);
15915
15916 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15917 emit_insn (gen_rtx_SET (bval, x));
15918 }
15919
15920 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15921 sequence implementing an atomic operation. */
15922
15923 static void
15924 aarch64_emit_post_barrier (enum memmodel model)
15925 {
15926 const enum memmodel base_model = memmodel_base (model);
15927
15928 if (is_mm_sync (model)
15929 && (base_model == MEMMODEL_ACQUIRE
15930 || base_model == MEMMODEL_ACQ_REL
15931 || base_model == MEMMODEL_SEQ_CST))
15932 {
15933 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15934 }
15935 }
15936
15937 /* Split a compare and swap pattern. */
15938
15939 void
15940 aarch64_split_compare_and_swap (rtx operands[])
15941 {
15942 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
15943 gcc_assert (epilogue_completed);
15944
15945 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
15946 machine_mode mode;
15947 bool is_weak;
15948 rtx_code_label *label1, *label2;
15949 enum memmodel model;
15950
15951 rval = operands[0];
15952 mem = operands[1];
15953 oldval = operands[2];
15954 newval = operands[3];
15955 is_weak = (operands[4] != const0_rtx);
15956 model_rtx = operands[5];
15957 scratch = operands[7];
15958 mode = GET_MODE (mem);
15959 model = memmodel_from_int (INTVAL (model_rtx));
15960
15961 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15962 loop:
15963 .label1:
15964 LD[A]XR rval, [mem]
15965 CBNZ rval, .label2
15966 ST[L]XR scratch, newval, [mem]
15967 CBNZ scratch, .label1
15968 .label2:
15969 CMP rval, 0. */
15970 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
15971 && oldval == const0_rtx && mode != TImode);
15972
15973 label1 = NULL;
15974 if (!is_weak)
15975 {
15976 label1 = gen_label_rtx ();
15977 emit_label (label1);
15978 }
15979 label2 = gen_label_rtx ();
15980
15981 /* The initial load can be relaxed for a __sync operation since a final
15982 barrier will be emitted to stop code hoisting. */
15983 if (is_mm_sync (model))
15984 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
15985 else
15986 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15987
15988 if (strong_zero_p)
15989 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15990 else
15991 {
15992 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15993 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
15994 }
15995 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15996 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15997 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15998
15999 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16000
16001 if (!is_weak)
16002 {
16003 if (aarch64_track_speculation)
16004 {
16005 /* Emit an explicit compare instruction, so that we can correctly
16006 track the condition codes. */
16007 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16008 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16009 }
16010 else
16011 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16012
16013 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16014 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16015 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16016 }
16017 else
16018 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16019
16020 emit_label (label2);
16021
16022 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16023 to set the condition flags. If this is not used it will be removed by
16024 later passes. */
16025 if (strong_zero_p)
16026 aarch64_gen_compare_reg (NE, rval, const0_rtx);
16027
16028 /* Emit any final barrier needed for a __sync operation. */
16029 if (is_mm_sync (model))
16030 aarch64_emit_post_barrier (model);
16031 }
16032
16033 /* Split an atomic operation. */
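/* Roughly, for an atomic add with a relaxed memory model the sequence
   emitted below has the shape:

    .label:
	ldxr	old, [mem]
	add	new, old, value
	stxr	cond, new, [mem]
	cbnz	cond, .label

   with LDAXR/STLXR used instead when the model requires acquire or
   release semantics.  */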
16034
16035 void
16036 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16037 rtx value, rtx model_rtx, rtx cond)
16038 {
16039 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
16040 gcc_assert (epilogue_completed);
16041
16042 machine_mode mode = GET_MODE (mem);
16043 machine_mode wmode = (mode == DImode ? DImode : SImode);
16044 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16045 const bool is_sync = is_mm_sync (model);
16046 rtx_code_label *label;
16047 rtx x;
16048
16049 /* Split the atomic operation into a sequence. */
16050 label = gen_label_rtx ();
16051 emit_label (label);
16052
16053 if (new_out)
16054 new_out = gen_lowpart (wmode, new_out);
16055 if (old_out)
16056 old_out = gen_lowpart (wmode, old_out);
16057 else
16058 old_out = new_out;
16059 value = simplify_gen_subreg (wmode, value, mode, 0);
16060
16061 /* The initial load can be relaxed for a __sync operation since a final
16062 barrier will be emitted to stop code hoisting. */
16063 if (is_sync)
16064 aarch64_emit_load_exclusive (mode, old_out, mem,
16065 GEN_INT (MEMMODEL_RELAXED));
16066 else
16067 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16068
16069 switch (code)
16070 {
16071 case SET:
16072 new_out = value;
16073 break;
16074
16075 case NOT:
16076 x = gen_rtx_AND (wmode, old_out, value);
16077 emit_insn (gen_rtx_SET (new_out, x));
16078 x = gen_rtx_NOT (wmode, new_out);
16079 emit_insn (gen_rtx_SET (new_out, x));
16080 break;
16081
16082 case MINUS:
16083 if (CONST_INT_P (value))
16084 {
16085 value = GEN_INT (-INTVAL (value));
16086 code = PLUS;
16087 }
16088 /* Fall through. */
16089
16090 default:
16091 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16092 emit_insn (gen_rtx_SET (new_out, x));
16093 break;
16094 }
16095
16096 aarch64_emit_store_exclusive (mode, cond, mem,
16097 gen_lowpart (mode, new_out), model_rtx);
16098
16099 if (aarch64_track_speculation)
16100 {
16101 /* Emit an explicit compare instruction, so that we can correctly
16102 track the condition codes. */
16103 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16104 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16105 }
16106 else
16107 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16108
16109 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16110 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16111 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16112
16113 /* Emit any final barrier needed for a __sync operation. */
16114 if (is_sync)
16115 aarch64_emit_post_barrier (model);
16116 }
16117
16118 static void
16119 aarch64_init_libfuncs (void)
16120 {
16121 /* Half-precision float operations. The compiler handles all operations
16122 with NULL libfuncs by converting to SFmode. */
16123
16124 /* Conversions. */
16125 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16126 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16127
16128 /* Arithmetic. */
16129 set_optab_libfunc (add_optab, HFmode, NULL);
16130 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16131 set_optab_libfunc (smul_optab, HFmode, NULL);
16132 set_optab_libfunc (neg_optab, HFmode, NULL);
16133 set_optab_libfunc (sub_optab, HFmode, NULL);
16134
16135 /* Comparisons. */
16136 set_optab_libfunc (eq_optab, HFmode, NULL);
16137 set_optab_libfunc (ne_optab, HFmode, NULL);
16138 set_optab_libfunc (lt_optab, HFmode, NULL);
16139 set_optab_libfunc (le_optab, HFmode, NULL);
16140 set_optab_libfunc (ge_optab, HFmode, NULL);
16141 set_optab_libfunc (gt_optab, HFmode, NULL);
16142 set_optab_libfunc (unord_optab, HFmode, NULL);
16143 }
16144
16145 /* Target hook for c_mode_for_suffix. */
16146 static machine_mode
16147 aarch64_c_mode_for_suffix (char suffix)
16148 {
16149 if (suffix == 'q')
16150 return TFmode;
16151
16152 return VOIDmode;
16153 }
16154
16155 /* We can only represent floating point constants which will fit in
16156 "quarter-precision" values. These values are characterised by
16157 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
16158 by:
16159
16160 (-1)^s * (n/16) * 2^r
16161
16162 Where:
16163 's' is the sign bit.
16164 'n' is an integer in the range 16 <= n <= 31.
16165 'r' is an integer in the range -3 <= r <= 4. */
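/* For example, 1.0 is representable (s = 0, n = 16, r = 0), as are the
   largest value 31.0 (n = 31, r = 4) and the smallest positive value
   0.125 (n = 16, r = -3).  0.0 is not representable in this form.  */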
16166
16167 /* Return true iff X can be represented by a quarter-precision
16168 floating point immediate operand.  Note, we cannot represent 0.0. */
16169 bool
16170 aarch64_float_const_representable_p (rtx x)
16171 {
16172 /* This represents our current view of how many bits
16173 make up the mantissa. */
16174 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16175 int exponent;
16176 unsigned HOST_WIDE_INT mantissa, mask;
16177 REAL_VALUE_TYPE r, m;
16178 bool fail;
16179
16180 if (!CONST_DOUBLE_P (x))
16181 return false;
16182
16183 if (GET_MODE (x) == VOIDmode
16184 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16185 return false;
16186
16187 r = *CONST_DOUBLE_REAL_VALUE (x);
16188
16189 /* We cannot represent infinities, NaNs or +/-zero. We won't
16190 know if we have +zero until we analyse the mantissa, but we
16191 can reject the other invalid values. */
16192 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16193 || REAL_VALUE_MINUS_ZERO (r))
16194 return false;
16195
16196 /* Extract exponent. */
16197 r = real_value_abs (&r);
16198 exponent = REAL_EXP (&r);
16199
16200 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16201 highest (sign) bit, with a fixed binary point at bit point_pos.
16202 The low half of W holds the low part of the mantissa, the high half the high part.
16203 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16204 bits for the mantissa, this can fail (low bits will be lost). */
16205 real_ldexp (&m, &r, point_pos - exponent);
16206 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16207
16208 /* If the low part of the mantissa has bits set we cannot represent
16209 the value. */
16210 if (w.ulow () != 0)
16211 return false;
16212 /* We have rejected the lower HOST_WIDE_INT, so update our
16213 understanding of how many bits lie in the mantissa and
16214 look only at the high HOST_WIDE_INT. */
16215 mantissa = w.elt (1);
16216 point_pos -= HOST_BITS_PER_WIDE_INT;
16217
16218 /* We can only represent values with a mantissa of the form 1.xxxx. */
16219 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16220 if ((mantissa & mask) != 0)
16221 return false;
16222
16223 /* Having filtered unrepresentable values, we may now remove all
16224 but the highest 5 bits. */
16225 mantissa >>= point_pos - 5;
16226
16227 /* We cannot represent the value 0.0, so reject it. This is handled
16228 elsewhere. */
16229 if (mantissa == 0)
16230 return false;
16231
16232 /* Then, as bit 4 is always set, we can mask it off, leaving
16233 the mantissa in the range [0, 15]. */
16234 mantissa &= ~(1 << 4);
16235 gcc_assert (mantissa <= 15);
16236
16237 /* GCC internally does not use IEEE754-like encoding (where normalized
16238 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
16239 Our mantissa values are shifted 4 places to the left relative to
16240 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16241 by 5 places to correct for GCC's representation. */
16242 exponent = 5 - exponent;
16243
16244 return (exponent >= 0 && exponent <= 7);
16245 }
16246
16247 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
16248 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
16249 output MOVI/MVNI, ORR or BIC immediate. */
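/* For example (illustrative values), a V4SImode splat of 0x45 may be
   output as "movi v0.4s, 0x45" and a splat of 0x4500 as
   "movi v0.4s, 0x45, lsl 8".  */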
16250 char*
16251 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16252 enum simd_immediate_check which)
16253 {
16254 bool is_valid;
16255 static char templ[40];
16256 const char *mnemonic;
16257 const char *shift_op;
16258 unsigned int lane_count = 0;
16259 char element_char;
16260
16261 struct simd_immediate_info info;
16262
16263 /* This will return true to show const_vector is legal for use as either
16264 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16265 It will also update INFO to show how the immediate should be generated.
16266 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16267 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16268 gcc_assert (is_valid);
16269
16270 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16271 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16272
16273 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16274 {
16275 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
16276 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16277 move immediate path. */
16278 if (aarch64_float_const_zero_rtx_p (info.value))
16279 info.value = GEN_INT (0);
16280 else
16281 {
16282 const unsigned int buf_size = 20;
16283 char float_buf[buf_size] = {'\0'};
16284 real_to_decimal_for_mode (float_buf,
16285 CONST_DOUBLE_REAL_VALUE (info.value),
16286 buf_size, buf_size, 1, info.elt_mode);
16287
16288 if (lane_count == 1)
16289 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16290 else
16291 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16292 lane_count, element_char, float_buf);
16293 return templ;
16294 }
16295 }
16296
16297 gcc_assert (CONST_INT_P (info.value));
16298
16299 if (which == AARCH64_CHECK_MOV)
16300 {
16301 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16302 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
16303 if (lane_count == 1)
16304 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16305 mnemonic, UINTVAL (info.value));
16306 else if (info.shift)
16307 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16308 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16309 element_char, UINTVAL (info.value), shift_op, info.shift);
16310 else
16311 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16312 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16313 element_char, UINTVAL (info.value));
16314 }
16315 else
16316 {
16317 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16318 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16319 if (info.shift)
16320 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16321 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16322 element_char, UINTVAL (info.value), "lsl", info.shift);
16323 else
16324 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16325 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16326 element_char, UINTVAL (info.value));
16327 }
16328 return templ;
16329 }
16330
16331 char*
16332 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16333 {
16334
16335 /* If a floating point number was passed and we desire to use it in an
16336 integer mode do the conversion to integer. */
16337 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16338 {
16339 unsigned HOST_WIDE_INT ival;
16340 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16341 gcc_unreachable ();
16342 immediate = gen_int_mode (ival, mode);
16343 }
16344
16345 machine_mode vmode;
16346 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
16347 a 128-bit vector mode. */
16348 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16349
16350 vmode = aarch64_simd_container_mode (mode, width);
16351 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16352 return aarch64_output_simd_mov_immediate (v_op, width);
16353 }
16354
16355 /* Return the output string to use for moving immediate CONST_VECTOR
16356 into an SVE register. */
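/* For example (illustrative values), the SImode series {0, 1, 2, ...} is
   output as "index z0.s, #0, #1" and an integer splat of -1 as
   "mov z0.s, #-1".  */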
16357
16358 char *
16359 aarch64_output_sve_mov_immediate (rtx const_vector)
16360 {
16361 static char templ[40];
16362 struct simd_immediate_info info;
16363 char element_char;
16364
16365 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16366 gcc_assert (is_valid);
16367
16368 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16369
16370 if (info.step)
16371 {
16372 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16373 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16374 element_char, INTVAL (info.value), INTVAL (info.step));
16375 return templ;
16376 }
16377
16378 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16379 {
16380 if (aarch64_float_const_zero_rtx_p (info.value))
16381 info.value = GEN_INT (0);
16382 else
16383 {
16384 const int buf_size = 20;
16385 char float_buf[buf_size] = {};
16386 real_to_decimal_for_mode (float_buf,
16387 CONST_DOUBLE_REAL_VALUE (info.value),
16388 buf_size, buf_size, 1, info.elt_mode);
16389
16390 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16391 element_char, float_buf);
16392 return templ;
16393 }
16394 }
16395
16396 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16397 element_char, INTVAL (info.value));
16398 return templ;
16399 }
16400
16401 /* Return the asm format for a PTRUE instruction whose destination has
16402 mode MODE. SUFFIX is the element size suffix. */
16403
16404 char *
16405 aarch64_output_ptrue (machine_mode mode, char suffix)
16406 {
16407 unsigned int nunits;
16408 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16409 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16410 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16411 else
16412 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16413 return buf;
16414 }
16415
16416 /* Split operands into moves from op[1] + op[2] into op[0]. */
16417
16418 void
16419 aarch64_split_combinev16qi (rtx operands[3])
16420 {
16421 unsigned int dest = REGNO (operands[0]);
16422 unsigned int src1 = REGNO (operands[1]);
16423 unsigned int src2 = REGNO (operands[2]);
16424 machine_mode halfmode = GET_MODE (operands[1]);
16425 unsigned int halfregs = REG_NREGS (operands[1]);
16426 rtx destlo, desthi;
16427
16428 gcc_assert (halfmode == V16QImode);
16429
16430 if (src1 == dest && src2 == dest + halfregs)
16431 {
16432 /* No-op move. Can't split to nothing; emit something. */
16433 emit_note (NOTE_INSN_DELETED);
16434 return;
16435 }
16436
16437 /* Preserve register attributes for variable tracking. */
16438 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16439 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16440 GET_MODE_SIZE (halfmode));
16441
16442 /* Special case of reversed high/low parts. */
16443 if (reg_overlap_mentioned_p (operands[2], destlo)
16444 && reg_overlap_mentioned_p (operands[1], desthi))
16445 {
16446 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16447 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16448 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16449 }
16450 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16451 {
16452 /* Try to avoid unnecessary moves if part of the result
16453 is in the right place already. */
16454 if (src1 != dest)
16455 emit_move_insn (destlo, operands[1]);
16456 if (src2 != dest + halfregs)
16457 emit_move_insn (desthi, operands[2]);
16458 }
16459 else
16460 {
16461 if (src2 != dest + halfregs)
16462 emit_move_insn (desthi, operands[2]);
16463 if (src1 != dest)
16464 emit_move_insn (destlo, operands[1]);
16465 }
16466 }
16467
16468 /* vec_perm support. */
16469
16470 struct expand_vec_perm_d
16471 {
16472 rtx target, op0, op1;
16473 vec_perm_indices perm;
16474 machine_mode vmode;
16475 unsigned int vec_flags;
16476 bool one_vector_p;
16477 bool testing_p;
16478 };
16479
16480 /* Generate a variable permutation. */
16481
16482 static void
16483 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16484 {
16485 machine_mode vmode = GET_MODE (target);
16486 bool one_vector_p = rtx_equal_p (op0, op1);
16487
16488 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16489 gcc_checking_assert (GET_MODE (op0) == vmode);
16490 gcc_checking_assert (GET_MODE (op1) == vmode);
16491 gcc_checking_assert (GET_MODE (sel) == vmode);
16492 gcc_checking_assert (TARGET_SIMD);
16493
16494 if (one_vector_p)
16495 {
16496 if (vmode == V8QImode)
16497 {
16498 /* Expand the argument to a V16QI mode by duplicating it. */
16499 rtx pair = gen_reg_rtx (V16QImode);
16500 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16501 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16502 }
16503 else
16504 {
16505 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16506 }
16507 }
16508 else
16509 {
16510 rtx pair;
16511
16512 if (vmode == V8QImode)
16513 {
16514 pair = gen_reg_rtx (V16QImode);
16515 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16516 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16517 }
16518 else
16519 {
16520 pair = gen_reg_rtx (OImode);
16521 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16522 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16523 }
16524 }
16525 }
16526
16527 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16528 NELT is the number of elements in the vector. */
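/* For example, with two V16QImode inputs the selector below is ANDed
   with 31, so that indices wrap as vec_perm requires rather than
   producing zero as an out-of-range TBL index would.  */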
16529
16530 void
16531 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16532 unsigned int nelt)
16533 {
16534 machine_mode vmode = GET_MODE (target);
16535 bool one_vector_p = rtx_equal_p (op0, op1);
16536 rtx mask;
16537
16538 /* The TBL instruction does not use a modulo index, so we must take care
16539 of that ourselves. */
16540 mask = aarch64_simd_gen_const_vector_dup (vmode,
16541 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16542 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16543
16544 /* For big-endian, we also need to reverse the index within the vector
16545 (but not which vector). */
16546 if (BYTES_BIG_ENDIAN)
16547 {
16548 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16549 if (!one_vector_p)
16550 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16551 sel = expand_simple_binop (vmode, XOR, sel, mask,
16552 NULL, 0, OPTAB_LIB_WIDEN);
16553 }
16554 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16555 }
16556
16557 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16558
16559 static void
16560 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16561 {
16562 emit_insn (gen_rtx_SET (target,
16563 gen_rtx_UNSPEC (GET_MODE (target),
16564 gen_rtvec (2, op0, op1), code)));
16565 }
16566
16567 /* Expand an SVE vec_perm with the given operands. */
16568
16569 void
16570 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16571 {
16572 machine_mode data_mode = GET_MODE (target);
16573 machine_mode sel_mode = GET_MODE (sel);
16574 /* Enforced by the pattern condition. */
16575 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16576
16577 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16578 size of the two value vectors, i.e. the upper bits of the indices
16579 are effectively ignored. SVE TBL instead produces 0 for any
16580 out-of-range indices, so we need to modulo all the vec_perm indices
16581 to ensure they are all in range. */
16582 rtx sel_reg = force_reg (sel_mode, sel);
16583
16584 /* Check if the sel only references the first values vector. */
16585 if (GET_CODE (sel) == CONST_VECTOR
16586 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16587 {
16588 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16589 return;
16590 }
16591
16592 /* Check if the two values vectors are the same. */
16593 if (rtx_equal_p (op0, op1))
16594 {
16595 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16596 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16597 NULL, 0, OPTAB_DIRECT);
16598 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16599 return;
16600 }
16601
16602 /* Run TBL for each value vector and combine the results. */
16603
16604 rtx res0 = gen_reg_rtx (data_mode);
16605 rtx res1 = gen_reg_rtx (data_mode);
16606 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16607 if (GET_CODE (sel) != CONST_VECTOR
16608 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16609 {
16610 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16611 2 * nunits - 1);
16612 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16613 NULL, 0, OPTAB_DIRECT);
16614 }
16615 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16616 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16617 NULL, 0, OPTAB_DIRECT);
16618 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16619 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16620 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16621 else
16622 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16623 }
16624
16625 /* Recognize patterns suitable for the TRN instructions. */
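/* For example, for two V4SImode inputs the selectors {0, 4, 2, 6} and
   {1, 5, 3, 7} match TRN1 and TRN2 respectively.  */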
16626 static bool
16627 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16628 {
16629 HOST_WIDE_INT odd;
16630 poly_uint64 nelt = d->perm.length ();
16631 rtx out, in0, in1, x;
16632 machine_mode vmode = d->vmode;
16633
16634 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16635 return false;
16636
16637 /* Note that these are little-endian tests.
16638 We correct for big-endian later. */
16639 if (!d->perm[0].is_constant (&odd)
16640 || (odd != 0 && odd != 1)
16641 || !d->perm.series_p (0, 2, odd, 2)
16642 || !d->perm.series_p (1, 2, nelt + odd, 2))
16643 return false;
16644
16645 /* Success! */
16646 if (d->testing_p)
16647 return true;
16648
16649 in0 = d->op0;
16650 in1 = d->op1;
16651 /* We don't need a big-endian lane correction for SVE; see the comment
16652 at the head of aarch64-sve.md for details. */
16653 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16654 {
16655 x = in0, in0 = in1, in1 = x;
16656 odd = !odd;
16657 }
16658 out = d->target;
16659
16660 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16661 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16662 return true;
16663 }
16664
16665 /* Recognize patterns suitable for the UZP instructions. */
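/* For example, for two V4SImode inputs the selectors {0, 2, 4, 6} and
   {1, 3, 5, 7} match UZP1 and UZP2 respectively.  */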
16666 static bool
16667 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16668 {
16669 HOST_WIDE_INT odd;
16670 rtx out, in0, in1, x;
16671 machine_mode vmode = d->vmode;
16672
16673 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16674 return false;
16675
16676 /* Note that these are little-endian tests.
16677 We correct for big-endian later. */
16678 if (!d->perm[0].is_constant (&odd)
16679 || (odd != 0 && odd != 1)
16680 || !d->perm.series_p (0, 1, odd, 2))
16681 return false;
16682
16683 /* Success! */
16684 if (d->testing_p)
16685 return true;
16686
16687 in0 = d->op0;
16688 in1 = d->op1;
16689 /* We don't need a big-endian lane correction for SVE; see the comment
16690 at the head of aarch64-sve.md for details. */
16691 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16692 {
16693 x = in0, in0 = in1, in1 = x;
16694 odd = !odd;
16695 }
16696 out = d->target;
16697
16698 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16699 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16700 return true;
16701 }
16702
16703 /* Recognize patterns suitable for the ZIP instructions. */
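/* For example, for two V4SImode inputs the selectors {0, 4, 1, 5} and
   {2, 6, 3, 7} match ZIP1 and ZIP2 respectively.  */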
16704 static bool
16705 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16706 {
16707 unsigned int high;
16708 poly_uint64 nelt = d->perm.length ();
16709 rtx out, in0, in1, x;
16710 machine_mode vmode = d->vmode;
16711
16712 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16713 return false;
16714
16715 /* Note that these are little-endian tests.
16716 We correct for big-endian later. */
16717 poly_uint64 first = d->perm[0];
16718 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16719 || !d->perm.series_p (0, 2, first, 1)
16720 || !d->perm.series_p (1, 2, first + nelt, 1))
16721 return false;
16722 high = maybe_ne (first, 0U);
16723
16724 /* Success! */
16725 if (d->testing_p)
16726 return true;
16727
16728 in0 = d->op0;
16729 in1 = d->op1;
16730 /* We don't need a big-endian lane correction for SVE; see the comment
16731 at the head of aarch64-sve.md for details. */
16732 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16733 {
16734 x = in0, in0 = in1, in1 = x;
16735 high = !high;
16736 }
16737 out = d->target;
16738
16739 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16740 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16741 return true;
16742 }
16743
16744 /* Recognize patterns for the EXT insn. */
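/* For example, for two V4SImode inputs the selector {1, 2, 3, 4} takes a
   contiguous window starting at element 1 of the concatenated inputs,
   which EXT implements with the corresponding byte offset.  */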
16745
16746 static bool
16747 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16748 {
16749 HOST_WIDE_INT location;
16750 rtx offset;
16751
16752 /* The first element always refers to the first vector.
16753 Check if the extracted indices are increasing by one. */
16754 if (d->vec_flags == VEC_SVE_PRED
16755 || !d->perm[0].is_constant (&location)
16756 || !d->perm.series_p (0, 1, location, 1))
16757 return false;
16758
16759 /* Success! */
16760 if (d->testing_p)
16761 return true;
16762
16763 /* The case where (location == 0) is a no-op for both big- and little-endian,
16764 and is removed by the mid-end at optimization levels -O1 and higher.
16765
16766 We don't need a big-endian lane correction for SVE; see the comment
16767 at the head of aarch64-sve.md for details. */
16768 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16769 {
16770 /* After setup, we want the high elements of the first vector (stored
16771 at the LSB end of the register), and the low elements of the second
16772 vector (stored at the MSB end of the register). So swap. */
16773 std::swap (d->op0, d->op1);
16774 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16775 to_constant () is safe since this is restricted to Advanced SIMD
16776 vectors. */
16777 location = d->perm.length ().to_constant () - location;
16778 }
16779
16780 offset = GEN_INT (location);
16781 emit_set_insn (d->target,
16782 gen_rtx_UNSPEC (d->vmode,
16783 gen_rtvec (3, d->op0, d->op1, offset),
16784 UNSPEC_EXT));
16785 return true;
16786 }
16787
16788 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16789 within each 64-bit, 32-bit or 16-bit granule. */
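/* For example, the V8HImode selector {1, 0, 3, 2, 5, 4, 7, 6} reverses
   the 16-bit elements within each 32-bit granule and matches REV32.  */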
16790
16791 static bool
16792 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16793 {
16794 HOST_WIDE_INT diff;
16795 unsigned int i, size, unspec;
16796 machine_mode pred_mode;
16797
16798 if (d->vec_flags == VEC_SVE_PRED
16799 || !d->one_vector_p
16800 || !d->perm[0].is_constant (&diff)
16801 || !diff)
16802 return false;
16803
16804 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16805 if (size == 8)
16806 {
16807 unspec = UNSPEC_REV64;
16808 pred_mode = VNx2BImode;
16809 }
16810 else if (size == 4)
16811 {
16812 unspec = UNSPEC_REV32;
16813 pred_mode = VNx4BImode;
16814 }
16815 else if (size == 2)
16816 {
16817 unspec = UNSPEC_REV16;
16818 pred_mode = VNx8BImode;
16819 }
16820 else
16821 return false;
16822
16823 unsigned int step = diff + 1;
16824 for (i = 0; i < step; ++i)
16825 if (!d->perm.series_p (i, step, diff - i, step))
16826 return false;
16827
16828 /* Success! */
16829 if (d->testing_p)
16830 return true;
16831
16832 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16833 if (d->vec_flags == VEC_SVE_DATA)
16834 {
16835 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16836 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16837 UNSPEC_MERGE_PTRUE);
16838 }
16839 emit_set_insn (d->target, src);
16840 return true;
16841 }
16842
16843 /* Recognize patterns for the REV insn, which reverses elements within
16844 a full vector. */
16845
16846 static bool
16847 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16848 {
16849 poly_uint64 nelt = d->perm.length ();
16850
16851 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16852 return false;
16853
16854 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16855 return false;
16856
16857 /* Success! */
16858 if (d->testing_p)
16859 return true;
16860
16861 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16862 emit_set_insn (d->target, src);
16863 return true;
16864 }
16865
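/* Recognize permutes that broadcast a single element, which can be
   implemented with a DUP (element) instruction.  */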
16866 static bool
16867 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16868 {
16869 rtx out = d->target;
16870 rtx in0;
16871 HOST_WIDE_INT elt;
16872 machine_mode vmode = d->vmode;
16873 rtx lane;
16874
16875 if (d->vec_flags == VEC_SVE_PRED
16876 || d->perm.encoding ().encoded_nelts () != 1
16877 || !d->perm[0].is_constant (&elt))
16878 return false;
16879
16880 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16881 return false;
16882
16883 /* Success! */
16884 if (d->testing_p)
16885 return true;
16886
16887 /* The generic preparation in aarch64_expand_vec_perm_const_1
16888 swaps the operand order and the permute indices if it finds
16889 d->perm[0] to be in the second operand. Thus, we can always
16890 use d->op0 and need not do any extra arithmetic to get the
16891 correct lane number. */
16892 in0 = d->op0;
16893 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16894
16895 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16896 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16897 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16898 return true;
16899 }
16900
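/* Fall back to a general byte-wise permute using the Advanced SIMD TBL
   instruction, with the (constant) selector materialised as a V8QI or
   V16QI constant vector.  */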
16901 static bool
16902 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16903 {
16904 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16905 machine_mode vmode = d->vmode;
16906
16907 /* Make sure that the indices are constant. */
16908 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16909 for (unsigned int i = 0; i < encoded_nelts; ++i)
16910 if (!d->perm[i].is_constant ())
16911 return false;
16912
16913 if (d->testing_p)
16914 return true;
16915
16916 /* Generic code will try constant permutation twice: once with the
16917 original mode and again with the elements lowered to QImode.
16918 So wait and don't do the selector expansion ourselves. */
16919 if (vmode != V8QImode && vmode != V16QImode)
16920 return false;
16921
16922 /* to_constant is safe since this routine is specific to Advanced SIMD
16923 vectors. */
16924 unsigned int nelt = d->perm.length ().to_constant ();
16925 for (unsigned int i = 0; i < nelt; ++i)
16926 /* If big-endian and two vectors we end up with a weird mixed-endian
16927 mode on NEON. Reverse the index within each word but not the word
16928 itself. to_constant is safe because we checked is_constant above. */
16929 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16930 ? d->perm[i].to_constant () ^ (nelt - 1)
16931 : d->perm[i].to_constant ());
16932
16933 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16934 sel = force_reg (vmode, sel);
16935
16936 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16937 return true;
16938 }
16939
16940 /* Try to implement D using an SVE TBL instruction. */
16941
16942 static bool
16943 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16944 {
16945 unsigned HOST_WIDE_INT nelt;
16946
16947 /* Permuting two variable-length vectors could overflow the
16948 index range. */
16949 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16950 return false;
16951
16952 if (d->testing_p)
16953 return true;
16954
16955 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16956 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16957 if (d->one_vector_p)
16958 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16959 else
16960 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16961 return true;
16962 }
16963
16964 static bool
16965 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16966 {
16967 /* The pattern matching functions above are written to look for a small
16968 number to begin the sequence (0, 1, N/2). If we begin with an index
16969 from the second operand, we can swap the operands. */
16970 poly_int64 nelt = d->perm.length ();
16971 if (known_ge (d->perm[0], nelt))
16972 {
16973 d->perm.rotate_inputs (1);
16974 std::swap (d->op0, d->op1);
16975 }
16976
16977 if ((d->vec_flags == VEC_ADVSIMD
16978 || d->vec_flags == VEC_SVE_DATA
16979 || d->vec_flags == VEC_SVE_PRED)
16980 && known_gt (nelt, 1))
16981 {
16982 if (aarch64_evpc_rev_local (d))
16983 return true;
16984 else if (aarch64_evpc_rev_global (d))
16985 return true;
16986 else if (aarch64_evpc_ext (d))
16987 return true;
16988 else if (aarch64_evpc_dup (d))
16989 return true;
16990 else if (aarch64_evpc_zip (d))
16991 return true;
16992 else if (aarch64_evpc_uzp (d))
16993 return true;
16994 else if (aarch64_evpc_trn (d))
16995 return true;
16996 if (d->vec_flags == VEC_SVE_DATA)
16997 return aarch64_evpc_sve_tbl (d);
16998 else if (d->vec_flags == VEC_ADVSIMD)
16999 return aarch64_evpc_tbl (d);
17000 }
17001 return false;
17002 }
17003
17004 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17005
17006 static bool
17007 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17008 rtx op1, const vec_perm_indices &sel)
17009 {
17010 struct expand_vec_perm_d d;
17011
17012 /* Check whether the mask can be applied to a single vector. */
17013 if (sel.ninputs () == 1
17014 || (op0 && rtx_equal_p (op0, op1)))
17015 d.one_vector_p = true;
17016 else if (sel.all_from_input_p (0))
17017 {
17018 d.one_vector_p = true;
17019 op1 = op0;
17020 }
17021 else if (sel.all_from_input_p (1))
17022 {
17023 d.one_vector_p = true;
17024 op0 = op1;
17025 }
17026 else
17027 d.one_vector_p = false;
17028
17029 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17030 sel.nelts_per_input ());
17031 d.vmode = vmode;
17032 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17033 d.target = target;
17034 d.op0 = op0;
17035 d.op1 = op1;
17036 d.testing_p = !target;
17037
17038 if (!d.testing_p)
17039 return aarch64_expand_vec_perm_const_1 (&d);
17040
17041 rtx_insn *last = get_last_insn ();
17042 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17043 gcc_assert (last == get_last_insn ());
17044
17045 return ret;
17046 }
17047
17048 /* Generate a byte permute mask for a register of mode MODE,
17049 which has NUNITS units. */
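/* For example, for V8HImode (eight 2-byte units) the mask built below is
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, i.e. the bytes
   within each element are reversed while the elements themselves stay in
   place.  */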
17050
17051 rtx
17052 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17053 {
17054 /* We have to reverse each vector because we don't have
17055 a permuted load that can reverse-load according to ABI rules. */
17056 rtx mask;
17057 rtvec v = rtvec_alloc (16);
17058 unsigned int i, j;
17059 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17060
17061 gcc_assert (BYTES_BIG_ENDIAN);
17062 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17063
17064 for (i = 0; i < nunits; i++)
17065 for (j = 0; j < usize; j++)
17066 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17067 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17068 return force_reg (V16QImode, mask);
17069 }
17070
17071 /* Return true if X is a valid second operand for the SVE instruction
17072 that implements integer comparison OP_CODE. */
17073
17074 static bool
17075 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17076 {
17077 if (register_operand (x, VOIDmode))
17078 return true;
17079
17080 switch (op_code)
17081 {
17082 case LTU:
17083 case LEU:
17084 case GEU:
17085 case GTU:
17086 return aarch64_sve_cmp_immediate_p (x, false);
17087 case LT:
17088 case LE:
17089 case GE:
17090 case GT:
17091 case NE:
17092 case EQ:
17093 return aarch64_sve_cmp_immediate_p (x, true);
17094 default:
17095 gcc_unreachable ();
17096 }
17097 }
17098
17099 /* Use predicated SVE instructions to implement the equivalent of:
17100
17101 (set TARGET OP)
17102
17103 given that PTRUE is an all-true predicate of the appropriate mode. */
17104
17105 static void
17106 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17107 {
17108 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17109 gen_rtvec (2, ptrue, op),
17110 UNSPEC_MERGE_PTRUE);
17111 rtx_insn *insn = emit_set_insn (target, unspec);
17112 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17113 }
17114
17115 /* Likewise, but also clobber the condition codes. */
17116
17117 static void
17118 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17119 {
17120 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17121 gen_rtvec (2, ptrue, op),
17122 UNSPEC_MERGE_PTRUE);
17123 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
17124 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17125 }
17126
17127 /* Return the UNSPEC_COND_* code for comparison CODE. */
17128
17129 static unsigned int
17130 aarch64_unspec_cond_code (rtx_code code)
17131 {
17132 switch (code)
17133 {
17134 case NE:
17135 return UNSPEC_COND_NE;
17136 case EQ:
17137 return UNSPEC_COND_EQ;
17138 case LT:
17139 return UNSPEC_COND_LT;
17140 case GT:
17141 return UNSPEC_COND_GT;
17142 case LE:
17143 return UNSPEC_COND_LE;
17144 case GE:
17145 return UNSPEC_COND_GE;
17146 default:
17147 gcc_unreachable ();
17148 }
17149 }
17150
17151 /* Emit:
17152
17153 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17154
17155 where <X> is the operation associated with comparison CODE. This form
17156 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17157 semantics, such as when PRED might not be all-true and when comparing
17158 inactive lanes could have side effects. */
17159
17160 static void
17161 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17162 rtx pred, rtx op0, rtx op1)
17163 {
17164 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17165 gen_rtvec (3, pred, op0, op1),
17166 aarch64_unspec_cond_code (code));
17167 emit_set_insn (target, unspec);
17168 }
17169
17170 /* Expand an SVE integer comparison using the SVE equivalent of:
17171
17172 (set TARGET (CODE OP0 OP1)). */
17173
17174 void
17175 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17176 {
17177 machine_mode pred_mode = GET_MODE (target);
17178 machine_mode data_mode = GET_MODE (op0);
17179
17180 if (!aarch64_sve_cmp_operand_p (code, op1))
17181 op1 = force_reg (data_mode, op1);
17182
17183 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
17184 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17185 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17186 }
17187
17188 /* Emit the SVE equivalent of:
17189
17190 (set TMP1 (CODE1 OP0 OP1))
17191 (set TMP2 (CODE2 OP0 OP1))
17192 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17193
17194 PTRUE is an all-true predicate with the same mode as TARGET. */
17195
17196 static void
17197 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17198 rtx ptrue, rtx op0, rtx op1)
17199 {
17200 machine_mode pred_mode = GET_MODE (ptrue);
17201 rtx tmp1 = gen_reg_rtx (pred_mode);
17202 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17203 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17204 rtx tmp2 = gen_reg_rtx (pred_mode);
17205 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17206 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17207 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17208 }
17209
17210 /* Emit the SVE equivalent of:
17211
17212 (set TMP (CODE OP0 OP1))
17213 (set TARGET (not TMP))
17214
17215 PTRUE is an all-true predicate with the same mode as TARGET. */
17216
17217 static void
17218 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17219 rtx op0, rtx op1)
17220 {
17221 machine_mode pred_mode = GET_MODE (ptrue);
17222 rtx tmp = gen_reg_rtx (pred_mode);
17223 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17224 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17225 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17226 }
17227
17228 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17229
17230 (set TARGET (CODE OP0 OP1))
17231
17232 If CAN_INVERT_P is true, the caller can also handle inverted results;
17233 return true if the result is in fact inverted. */
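/* As a rough sketch of the trapping-math path below, an UNGE comparison is
   expanded as:
     ordered = ! (op0 unordered op1)
     tmp     = (op0 < op1), predicated on the ORDERED lanes only
     result  = ! tmp
   so the potentially trapping LT comparison never touches unordered (NaN)
   lanes, and the final result is "unordered or op0 >= op1".  */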
17234
17235 bool
17236 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17237 rtx op0, rtx op1, bool can_invert_p)
17238 {
17239 machine_mode pred_mode = GET_MODE (target);
17240 machine_mode data_mode = GET_MODE (op0);
17241
17242 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
17243 switch (code)
17244 {
17245 case UNORDERED:
17246 /* UNORDERED has no immediate form. */
17247 op1 = force_reg (data_mode, op1);
17248 /* fall through */
17249 case LT:
17250 case LE:
17251 case GT:
17252 case GE:
17253 case EQ:
17254 case NE:
17255 {
17256 /* There is native support for the comparison. */
17257 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17258 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17259 return false;
17260 }
17261
17262 case LTGT:
17263 /* This is a trapping operation (LT or GT). */
17264 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17265 return false;
17266
17267 case UNEQ:
17268 if (!flag_trapping_math)
17269 {
17270 /* This would trap for signaling NaNs. */
17271 op1 = force_reg (data_mode, op1);
17272 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17273 return false;
17274 }
17275 /* fall through */
17276 case UNLT:
17277 case UNLE:
17278 case UNGT:
17279 case UNGE:
17280 if (flag_trapping_math)
17281 {
17282 /* Work out which elements are ordered. */
17283 rtx ordered = gen_reg_rtx (pred_mode);
17284 op1 = force_reg (data_mode, op1);
17285 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17286
17287 /* Test the opposite condition for the ordered elements,
17288 then invert the result. */
17289 if (code == UNEQ)
17290 code = NE;
17291 else
17292 code = reverse_condition_maybe_unordered (code);
17293 if (can_invert_p)
17294 {
17295 aarch64_emit_sve_predicated_cond (target, code,
17296 ordered, op0, op1);
17297 return true;
17298 }
17299 rtx tmp = gen_reg_rtx (pred_mode);
17300 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17301 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17302 return false;
17303 }
17304 break;
17305
17306 case ORDERED:
17307 /* ORDERED has no immediate form. */
17308 op1 = force_reg (data_mode, op1);
17309 break;
17310
17311 default:
17312 gcc_unreachable ();
17313 }
17314
17315 /* There is native support for the inverse comparison. */
17316 code = reverse_condition_maybe_unordered (code);
17317 if (can_invert_p)
17318 {
17319 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17320 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17321 return true;
17322 }
17323 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17324 return false;
17325 }
17326
17327 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17328 of the data being selected and CMP_MODE is the mode of the values being
17329 compared. */
17330
17331 void
17332 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17333 rtx *ops)
17334 {
17335 machine_mode pred_mode
17336 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17337 GET_MODE_SIZE (cmp_mode)).require ();
17338 rtx pred = gen_reg_rtx (pred_mode);
17339 if (FLOAT_MODE_P (cmp_mode))
17340 {
17341 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17342 ops[4], ops[5], true))
17343 std::swap (ops[1], ops[2]);
17344 }
17345 else
17346 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17347
17348 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17349 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17350 }
17351
17352 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17353 true. However, due to issues with register allocation it is preferable
17354 to avoid tying integer scalar and FP scalar modes. Executing integer
17355 operations in general registers is better than treating them as scalar
17356 vector operations. This reduces latency and avoids redundant int<->FP
17357 moves. So tie modes if they are either the same class, or vector modes
17358 with other vector modes, vector structs or any scalar mode. */
17359
17360 static bool
17361 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17362 {
17363 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17364 return true;
17365
17366 /* We specifically want to allow elements of "structure" modes to
17367 be tieable to the structure. This more general condition allows
17368 other rarer situations too. The reason we don't extend this to
17369 predicate modes is that there are no predicate structure modes
17370 nor any specific instructions for extracting part of a predicate
17371 register. */
17372 if (aarch64_vector_data_mode_p (mode1)
17373 && aarch64_vector_data_mode_p (mode2))
17374 return true;
17375
17376 /* Also allow any scalar modes with vectors. */
17377 if (aarch64_vector_mode_supported_p (mode1)
17378 || aarch64_vector_mode_supported_p (mode2))
17379 return true;
17380
17381 return false;
17382 }
17383
17384 /* Return a new RTX holding the result of moving POINTER forward by
17385 AMOUNT bytes. */
17386
17387 static rtx
17388 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17389 {
17390 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17391
17392 return adjust_automodify_address (pointer, GET_MODE (pointer),
17393 next, amount);
17394 }
17395
17396 /* Return a new RTX holding the result of moving POINTER forward by the
17397 size of the mode it points to. */
17398
17399 static rtx
17400 aarch64_progress_pointer (rtx pointer)
17401 {
17402 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17403 }
17404
17405 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17406 MODE bytes. */
17407
17408 static void
17409 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17410 machine_mode mode)
17411 {
17412 rtx reg = gen_reg_rtx (mode);
17413
17414 /* "Cast" the pointers to the correct mode. */
17415 *src = adjust_address (*src, mode, 0);
17416 *dst = adjust_address (*dst, mode, 0);
17417 /* Emit the memcpy. */
17418 emit_move_insn (reg, *src);
17419 emit_move_insn (*dst, reg);
17420 /* Move the pointers forward. */
17421 *src = aarch64_progress_pointer (*src);
17422 *dst = aarch64_progress_pointer (*dst);
17423 }
17424
17425 /* Expand movmem, as if from a __builtin_memcpy. Return true if
17426 we succeed, otherwise return false. */
17427
17428 bool
17429 aarch64_expand_movmem (rtx *operands)
17430 {
17431 /* These need to be signed, as we perform signed arithmetic on n. */
17433 int n, mode_bits;
17434 rtx dst = operands[0];
17435 rtx src = operands[1];
17436 rtx base;
17437 machine_mode cur_mode = BLKmode, next_mode;
17438 bool speed_p = !optimize_function_for_size_p (cfun);
17439
17440 /* When optimizing for size, give a better estimate of the length of a
17441 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17442 will always require an even number of instructions, and each operation
17443 requires both a load and a store, so divide the max number by 2. */
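/* A rough sketch of what this allows: when optimizing for speed the limit
   is 16 / 2 = 8 load/store pairs, i.e. inline copies of up to about 128
   bytes before we give up and let the generic code emit a memcpy call.  */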
17444 unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17445
17446 /* We can't do anything smart if the amount to copy is not constant. */
17447 if (!CONST_INT_P (operands[2]))
17448 return false;
17449
17450 unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]);
17451
17452 /* Try to keep the number of instructions low. For all cases we will do at
17453 most two moves for the residual amount, since we'll always overlap the
17454 remainder. */
17455 if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves)
17456 return false;
17457
17458 /* At this point tmp is known to fit inside an int. */
17459 n = tmp;
17460
17461 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17462 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17463
17464 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17465 src = adjust_automodify_address (src, VOIDmode, base, 0);
17466
17467 /* Convert n to bits to make the rest of the code simpler. */
17468 n = n * BITS_PER_UNIT;
17469
17470 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17471 larger than TImode, but we should not use them for loads/stores here. */
17472 const int copy_limit = GET_MODE_BITSIZE (TImode);
17473
17474 while (n > 0)
17475 {
17476 /* Find the largest mode in which to do the copy without over-reading
17477 or over-writing. */
17478 opt_scalar_int_mode mode_iter;
17479 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17480 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17481 cur_mode = mode_iter.require ();
17482
17483 gcc_assert (cur_mode != BLKmode);
17484
17485 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17486 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17487
17488 n -= mode_bits;
17489
17490 /* Do certain trailing copies as overlapping if it's going to be
17491 cheaper, i.e. fewer instructions. For instance, for a 15 byte copy
17492 it's more efficient to do two overlapping 8 byte copies than
17493 8 + 6 + 1. */
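/* For example, the 15 byte case above ends up as one 8 byte (DImode)
   copy of bytes 0-7, after which the pointers are moved back one byte
   and a second 8 byte copy covers bytes 7-14.  */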
17494 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17495 {
17496 next_mode = smallest_mode_for_size (n, MODE_INT);
17497 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17498 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17499 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17500 n = n_bits;
17501 }
17502 }
17503
17504 return true;
17505 }
17506
17507 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17508 SImode stores. Handle the case when the constant has identical
17509 bottom and top halves. This is beneficial when the two stores can be
17510 merged into an STP and we avoid synthesising potentially expensive
17511 immediates twice. Return true if such a split is possible. */
17512
17513 bool
17514 aarch64_split_dimode_const_store (rtx dst, rtx src)
17515 {
17516 rtx lo = gen_lowpart (SImode, src);
17517 rtx hi = gen_highpart_mode (SImode, DImode, src);
17518
17519 bool size_p = optimize_function_for_size_p (cfun);
17520
17521 if (!rtx_equal_p (lo, hi))
17522 return false;
17523
17524 unsigned int orig_cost
17525 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17526 unsigned int lo_cost
17527 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17528
17529 /* We want to transform:
17530 MOV x1, 49370
17531 MOVK x1, 0x140, lsl 16
17532 MOVK x1, 0xc0da, lsl 32
17533 MOVK x1, 0x140, lsl 48
17534 STR x1, [x0]
17535 into:
17536 MOV w1, 49370
17537 MOVK w1, 0x140, lsl 16
17538 STP w1, w1, [x0]
17539 So we want to perform this only when we save two instructions
17540 or more. When optimizing for size, however, accept any code size
17541 savings we can. */
17542 if (size_p && orig_cost <= lo_cost)
17543 return false;
17544
17545 if (!size_p
17546 && (orig_cost <= lo_cost + 1))
17547 return false;
17548
17549 rtx mem_lo = adjust_address (dst, SImode, 0);
17550 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17551 return false;
17552
17553 rtx tmp_reg = gen_reg_rtx (SImode);
17554 aarch64_expand_mov_immediate (tmp_reg, lo);
17555 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17556 /* Don't emit an explicit store pair as this may not always be profitable.
17557 Let the sched-fusion logic decide whether to merge them. */
17558 emit_move_insn (mem_lo, tmp_reg);
17559 emit_move_insn (mem_hi, tmp_reg);
17560
17561 return true;
17562 }
17563
17564 /* Generate RTL for a conditional branch with rtx comparison CODE in
17565 mode CC_MODE. The destination of the unlikely conditional branch
17566 is LABEL_REF. */
17567
17568 void
17569 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17570 rtx label_ref)
17571 {
17572 rtx x;
17573 x = gen_rtx_fmt_ee (code, VOIDmode,
17574 gen_rtx_REG (cc_mode, CC_REGNUM),
17575 const0_rtx);
17576
17577 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17578 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17579 pc_rtx);
17580 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17581 }
17582
17583 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17584
17585 OP1 represents the TImode destination operand 1
17586 OP2 represents the TImode destination operand 2
17587 LOW_DEST represents the low half (DImode) of TImode operand 0
17588 LOW_IN1 represents the low half (DImode) of TImode operand 1
17589 LOW_IN2 represents the low half (DImode) of TImode operand 2
17590 HIGH_DEST represents the high half (DImode) of TImode operand 0
17591 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17592 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17593
17594 void
17595 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17596 rtx *low_in1, rtx *low_in2,
17597 rtx *high_dest, rtx *high_in1,
17598 rtx *high_in2)
17599 {
17600 *low_dest = gen_reg_rtx (DImode);
17601 *low_in1 = gen_lowpart (DImode, op1);
17602 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17603 subreg_lowpart_offset (DImode, TImode));
17604 *high_dest = gen_reg_rtx (DImode);
17605 *high_in1 = gen_highpart (DImode, op1);
17606 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17607 subreg_highpart_offset (DImode, TImode));
17608 }
17609
17610 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17611
17612 This function differs from 'aarch64_addti_scratch_regs' in that
17613 OP1 can be an immediate constant (zero). We must call
17614 subreg_highpart_offset with DImode and TImode arguments, otherwise
17615 VOIDmode will be used for the const_int, which generates an internal
17616 error from subreg_size_highpart_offset, which does not expect a size of zero.
17617
17618 OP1 represents the TImode destination operand 1
17619 OP2 represents the TImode destination operand 2
17620 LOW_DEST represents the low half (DImode) of TImode operand 0
17621 LOW_IN1 represents the low half (DImode) of TImode operand 1
17622 LOW_IN2 represents the low half (DImode) of TImode operand 2
17623 HIGH_DEST represents the high half (DImode) of TImode operand 0
17624 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17625 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17626
17627
17628 void
17629 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17630 rtx *low_in1, rtx *low_in2,
17631 rtx *high_dest, rtx *high_in1,
17632 rtx *high_in2)
17633 {
17634 *low_dest = gen_reg_rtx (DImode);
17635 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17636 subreg_lowpart_offset (DImode, TImode));
17637
17638 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17639 subreg_lowpart_offset (DImode, TImode));
17640 *high_dest = gen_reg_rtx (DImode);
17641
17642 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17643 subreg_highpart_offset (DImode, TImode));
17644 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17645 subreg_highpart_offset (DImode, TImode));
17646 }
17647
17648 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17649
17650 OP0 represents the TImode destination operand 0
17651 LOW_DEST represents the low half (DImode) of TImode operand 0
17652 LOW_IN1 represents the low half (DImode) of TImode operand 1
17653 LOW_IN2 represents the low half (DImode) of TImode operand 2
17654 HIGH_DEST represents the high half (DImode) of TImode operand 0
17655 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17656 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17657 UNSIGNED_P is true if the operation is being performed on unsigned
17658 values. */
17659 void
17660 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17661 rtx low_in2, rtx high_dest, rtx high_in1,
17662 rtx high_in2, bool unsigned_p)
17663 {
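  /* If the low half of the second operand is zero, the low half of the
     result is just LOW_IN1 and only the high halves need to be subtracted,
     with the flags set by that subtraction.  */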
17664 if (low_in2 == const0_rtx)
17665 {
17666 low_dest = low_in1;
17667 high_in2 = force_reg (DImode, high_in2);
17668 if (unsigned_p)
17669 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17670 else
17671 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17672 }
17673 else
17674 {
17675 if (aarch64_plus_immediate (low_in2, DImode))
17676 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17677 GEN_INT (-INTVAL (low_in2))));
17678 else
17679 {
17680 low_in2 = force_reg (DImode, low_in2);
17681 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17682 }
17683 high_in2 = force_reg (DImode, high_in2);
17684
17685 if (unsigned_p)
17686 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17687 else
17688 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17689 }
17690
17691 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17692 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17693
17694 }
17695
17696 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
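/* AddressSanitizer forms shadow addresses as (roughly)
   (address >> 3) + TARGET_ASAN_SHADOW_OFFSET; 1 << 36 is the offset
   used for AArch64.  */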
17697
17698 static unsigned HOST_WIDE_INT
17699 aarch64_asan_shadow_offset (void)
17700 {
17701 return (HOST_WIDE_INT_1 << 36);
17702 }
17703
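/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare chain: store the instructions that prepare the
   operands in *PREP_SEQ and the compare itself in *GEN_SEQ, and return
   a comparison of the CC register with zero, or NULL_RTX if the
   comparison cannot be handled.  */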
17704 static rtx
17705 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17706 int code, tree treeop0, tree treeop1)
17707 {
17708 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17709 rtx op0, op1;
17710 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17711 insn_code icode;
17712 struct expand_operand ops[4];
17713
17714 start_sequence ();
17715 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17716
17717 op_mode = GET_MODE (op0);
17718 if (op_mode == VOIDmode)
17719 op_mode = GET_MODE (op1);
17720
17721 switch (op_mode)
17722 {
17723 case E_QImode:
17724 case E_HImode:
17725 case E_SImode:
17726 cmp_mode = SImode;
17727 icode = CODE_FOR_cmpsi;
17728 break;
17729
17730 case E_DImode:
17731 cmp_mode = DImode;
17732 icode = CODE_FOR_cmpdi;
17733 break;
17734
17735 case E_SFmode:
17736 cmp_mode = SFmode;
17737 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17738 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17739 break;
17740
17741 case E_DFmode:
17742 cmp_mode = DFmode;
17743 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17744 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17745 break;
17746
17747 default:
17748 end_sequence ();
17749 return NULL_RTX;
17750 }
17751
17752 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17753 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17754 if (!op0 || !op1)
17755 {
17756 end_sequence ();
17757 return NULL_RTX;
17758 }
17759 *prep_seq = get_insns ();
17760 end_sequence ();
17761
17762 create_fixed_operand (&ops[0], op0);
17763 create_fixed_operand (&ops[1], op1);
17764
17765 start_sequence ();
17766 if (!maybe_expand_insn (icode, 2, ops))
17767 {
17768 end_sequence ();
17769 return NULL_RTX;
17770 }
17771 *gen_seq = get_insns ();
17772 end_sequence ();
17773
17774 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17775 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17776 }
17777
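/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent comparison of a
   conditional-compare chain as a CCMP/FCCMP predicated on PREV, the
   result of the previous comparison.  If BIT_CODE is not AND, the
   previous condition is reversed and the AArch64 condition code is
   inverted.  */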
17778 static rtx
17779 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17780 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17781 {
17782 rtx op0, op1, target;
17783 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17784 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17785 insn_code icode;
17786 struct expand_operand ops[6];
17787 int aarch64_cond;
17788
17789 push_to_sequence (*prep_seq);
17790 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17791
17792 op_mode = GET_MODE (op0);
17793 if (op_mode == VOIDmode)
17794 op_mode = GET_MODE (op1);
17795
17796 switch (op_mode)
17797 {
17798 case E_QImode:
17799 case E_HImode:
17800 case E_SImode:
17801 cmp_mode = SImode;
17802 icode = CODE_FOR_ccmpsi;
17803 break;
17804
17805 case E_DImode:
17806 cmp_mode = DImode;
17807 icode = CODE_FOR_ccmpdi;
17808 break;
17809
17810 case E_SFmode:
17811 cmp_mode = SFmode;
17812 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17813 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17814 break;
17815
17816 case E_DFmode:
17817 cmp_mode = DFmode;
17818 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17819 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17820 break;
17821
17822 default:
17823 end_sequence ();
17824 return NULL_RTX;
17825 }
17826
17827 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17828 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17829 if (!op0 || !op1)
17830 {
17831 end_sequence ();
17832 return NULL_RTX;
17833 }
17834 *prep_seq = get_insns ();
17835 end_sequence ();
17836
17837 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17838 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17839
17840 if (bit_code != AND)
17841 {
17842 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17843 GET_MODE (XEXP (prev, 0))),
17844 VOIDmode, XEXP (prev, 0), const0_rtx);
17845 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17846 }
17847
17848 create_fixed_operand (&ops[0], XEXP (prev, 0));
17849 create_fixed_operand (&ops[1], target);
17850 create_fixed_operand (&ops[2], op0);
17851 create_fixed_operand (&ops[3], op1);
17852 create_fixed_operand (&ops[4], prev);
17853 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17854
17855 push_to_sequence (*gen_seq);
17856 if (!maybe_expand_insn (icode, 6, ops))
17857 {
17858 end_sequence ();
17859 return NULL_RTX;
17860 }
17861
17862 *gen_seq = get_insns ();
17863 end_sequence ();
17864
17865 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17866 }
17867
17868 #undef TARGET_GEN_CCMP_FIRST
17869 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17870
17871 #undef TARGET_GEN_CCMP_NEXT
17872 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17873
17874 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
17875 instruction fusion of some sort. */
17876
17877 static bool
17878 aarch64_macro_fusion_p (void)
17879 {
17880 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17881 }
17882
17883
17884 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17885 should be kept together during scheduling. */
17886
17887 static bool
17888 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17889 {
17890 rtx set_dest;
17891 rtx prev_set = single_set (prev);
17892 rtx curr_set = single_set (curr);
17893 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
17894 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17895
17896 if (!aarch64_macro_fusion_p ())
17897 return false;
17898
17899 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17900 {
17901 /* We are trying to match:
17902 prev (mov) == (set (reg r0) (const_int imm16))
17903 curr (movk) == (set (zero_extract (reg r0)
17904 (const_int 16)
17905 (const_int 16))
17906 (const_int imm16_1)) */
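      /* In assembly terms this is roughly:
	   mov  w0, #imm16
	   movk w0, #imm16_1, lsl #16  */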
17907
17908 set_dest = SET_DEST (curr_set);
17909
17910 if (GET_CODE (set_dest) == ZERO_EXTRACT
17911 && CONST_INT_P (SET_SRC (curr_set))
17912 && CONST_INT_P (SET_SRC (prev_set))
17913 && CONST_INT_P (XEXP (set_dest, 2))
17914 && INTVAL (XEXP (set_dest, 2)) == 16
17915 && REG_P (XEXP (set_dest, 0))
17916 && REG_P (SET_DEST (prev_set))
17917 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17918 {
17919 return true;
17920 }
17921 }
17922
17923 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17924 {
17925
17926 /* We're trying to match:
17927 prev (adrp) == (set (reg r1)
17928 (high (symbol_ref ("SYM"))))
17929 curr (add) == (set (reg r0)
17930 (lo_sum (reg r1)
17931 (symbol_ref ("SYM"))))
17932 Note that r0 need not necessarily be the same as r1, especially
17933 during pre-regalloc scheduling. */
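      /* In assembly terms this is roughly:
	   adrp x1, SYM
	   add  x0, x1, :lo12:SYM  */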
17934
17935 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17936 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17937 {
17938 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17939 && REG_P (XEXP (SET_SRC (curr_set), 0))
17940 && REGNO (XEXP (SET_SRC (curr_set), 0))
17941 == REGNO (SET_DEST (prev_set))
17942 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17943 XEXP (SET_SRC (curr_set), 1)))
17944 return true;
17945 }
17946 }
17947
17948 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17949 {
17950
17951 /* We're trying to match:
17952 prev (movk) == (set (zero_extract (reg r0)
17953 (const_int 16)
17954 (const_int 32))
17955 (const_int imm16_1))
17956 curr (movk) == (set (zero_extract (reg r0)
17957 (const_int 16)
17958 (const_int 48))
17959 (const_int imm16_2)) */
17960
17961 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17962 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17963 && REG_P (XEXP (SET_DEST (prev_set), 0))
17964 && REG_P (XEXP (SET_DEST (curr_set), 0))
17965 && REGNO (XEXP (SET_DEST (prev_set), 0))
17966 == REGNO (XEXP (SET_DEST (curr_set), 0))
17967 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17968 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17969 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17970 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17971 && CONST_INT_P (SET_SRC (prev_set))
17972 && CONST_INT_P (SET_SRC (curr_set)))
17973 return true;
17974
17975 }
17976 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17977 {
17978 /* We're trying to match:
17979 prev (adrp) == (set (reg r0)
17980 (high (symbol_ref ("SYM"))))
17981 curr (ldr) == (set (reg r1)
17982 (mem (lo_sum (reg r0)
17983 (symbol_ref ("SYM")))))
17984 or
17985 curr (ldr) == (set (reg r1)
17986 (zero_extend (mem
17987 (lo_sum (reg r0)
17988 (symbol_ref ("SYM")))))) */
17989 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17990 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17991 {
17992 rtx curr_src = SET_SRC (curr_set);
17993
17994 if (GET_CODE (curr_src) == ZERO_EXTEND)
17995 curr_src = XEXP (curr_src, 0);
17996
17997 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17998 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17999 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18000 == REGNO (SET_DEST (prev_set))
18001 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18002 XEXP (SET_SRC (prev_set), 0)))
18003 return true;
18004 }
18005 }
18006
18007 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
18008 && aarch_crypto_can_dual_issue (prev, curr))
18009 return true;
18010
18011 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18012 && any_condjump_p (curr))
18013 {
18014 unsigned int condreg1, condreg2;
18015 rtx cc_reg_1;
18016 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18017 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18018
18019 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18020 && prev
18021 && modified_in_p (cc_reg_1, prev))
18022 {
18023 enum attr_type prev_type = get_attr_type (prev);
18024
18025 /* FIXME: this misses some instructions which are considered simple
18026 arithmetic for ThunderX. Simple shifts are missed here. */
18027 if (prev_type == TYPE_ALUS_SREG
18028 || prev_type == TYPE_ALUS_IMM
18029 || prev_type == TYPE_LOGICS_REG
18030 || prev_type == TYPE_LOGICS_IMM)
18031 return true;
18032 }
18033 }
18034
18035 if (prev_set
18036 && curr_set
18037 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18038 && any_condjump_p (curr))
18039 {
18040 /* We're trying to match:
18041 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18042 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18043 (const_int 0))
18044 (label_ref ("SYM"))
18045 (pc)) */
18046 if (SET_DEST (curr_set) == (pc_rtx)
18047 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18048 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18049 && REG_P (SET_DEST (prev_set))
18050 && REGNO (SET_DEST (prev_set))
18051 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18052 {
18053 /* Fuse ALU operations followed by conditional branch instruction. */
18054 switch (get_attr_type (prev))
18055 {
18056 case TYPE_ALU_IMM:
18057 case TYPE_ALU_SREG:
18058 case TYPE_ADC_REG:
18059 case TYPE_ADC_IMM:
18060 case TYPE_ADCS_REG:
18061 case TYPE_ADCS_IMM:
18062 case TYPE_LOGIC_REG:
18063 case TYPE_LOGIC_IMM:
18064 case TYPE_CSEL:
18065 case TYPE_ADR:
18066 case TYPE_MOV_IMM:
18067 case TYPE_SHIFT_REG:
18068 case TYPE_SHIFT_IMM:
18069 case TYPE_BFM:
18070 case TYPE_RBIT:
18071 case TYPE_REV:
18072 case TYPE_EXTEND:
18073 return true;
18074
18075 default:;
18076 }
18077 }
18078 }
18079
18080 return false;
18081 }
18082
18083 /* Return true iff the instruction fusion described by OP is enabled. */
18084
18085 bool
18086 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18087 {
18088 return (aarch64_tune_params.fusible_ops & op) != 0;
18089 }
18090
18091 /* If MEM is in the form of [base+offset], extract the two parts
18092 of address and set to BASE and OFFSET, otherwise return false
18093 after clearing BASE and OFFSET. */
18094
18095 bool
18096 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18097 {
18098 rtx addr;
18099
18100 gcc_assert (MEM_P (mem));
18101
18102 addr = XEXP (mem, 0);
18103
18104 if (REG_P (addr))
18105 {
18106 *base = addr;
18107 *offset = const0_rtx;
18108 return true;
18109 }
18110
18111 if (GET_CODE (addr) == PLUS
18112 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18113 {
18114 *base = XEXP (addr, 0);
18115 *offset = XEXP (addr, 1);
18116 return true;
18117 }
18118
18119 *base = NULL_RTX;
18120 *offset = NULL_RTX;
18121
18122 return false;
18123 }
18124
18125 /* Types for scheduling fusion. */
18126 enum sched_fusion_type
18127 {
18128 SCHED_FUSION_NONE = 0,
18129 SCHED_FUSION_LD_SIGN_EXTEND,
18130 SCHED_FUSION_LD_ZERO_EXTEND,
18131 SCHED_FUSION_LD,
18132 SCHED_FUSION_ST,
18133 SCHED_FUSION_NUM
18134 };
18135
18136 /* If INSN is a load or store of address in the form of [base+offset],
18137 extract the two parts and set to BASE and OFFSET. Return scheduling
18138 fusion type this INSN is. */
18139
18140 static enum sched_fusion_type
18141 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18142 {
18143 rtx x, dest, src;
18144 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18145
18146 gcc_assert (INSN_P (insn));
18147 x = PATTERN (insn);
18148 if (GET_CODE (x) != SET)
18149 return SCHED_FUSION_NONE;
18150
18151 src = SET_SRC (x);
18152 dest = SET_DEST (x);
18153
18154 machine_mode dest_mode = GET_MODE (dest);
18155
18156 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18157 return SCHED_FUSION_NONE;
18158
18159 if (GET_CODE (src) == SIGN_EXTEND)
18160 {
18161 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18162 src = XEXP (src, 0);
18163 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18164 return SCHED_FUSION_NONE;
18165 }
18166 else if (GET_CODE (src) == ZERO_EXTEND)
18167 {
18168 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18169 src = XEXP (src, 0);
18170 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18171 return SCHED_FUSION_NONE;
18172 }
18173
18174 if (GET_CODE (src) == MEM && REG_P (dest))
18175 extract_base_offset_in_addr (src, base, offset);
18176 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18177 {
18178 fusion = SCHED_FUSION_ST;
18179 extract_base_offset_in_addr (dest, base, offset);
18180 }
18181 else
18182 return SCHED_FUSION_NONE;
18183
18184 if (*base == NULL_RTX || *offset == NULL_RTX)
18185 fusion = SCHED_FUSION_NONE;
18186
18187 return fusion;
18188 }
18189
18190 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18191
18192 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18193 and PRI are only calculated for these instructions. For other instructions,
18194 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18195 kinds of instruction fusion can be added by returning different priorities.
18196
18197 It's important that irrelevant instructions get the largest FUSION_PRI. */
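   Roughly speaking, loads/stores of the same kind that use the same base
   register get the same FUSION_PRI, and within such a group the access
   with the smaller offset gets the higher PRI, so that pairable accesses
   end up adjacent and in offset order in the ready list.  */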
18198
18199 static void
18200 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18201 int *fusion_pri, int *pri)
18202 {
18203 int tmp, off_val;
18204 rtx base, offset;
18205 enum sched_fusion_type fusion;
18206
18207 gcc_assert (INSN_P (insn));
18208
18209 tmp = max_pri - 1;
18210 fusion = fusion_load_store (insn, &base, &offset);
18211 if (fusion == SCHED_FUSION_NONE)
18212 {
18213 *pri = tmp;
18214 *fusion_pri = tmp;
18215 return;
18216 }
18217
18218 /* Set FUSION_PRI according to fusion type and base register. */
18219 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18220
18221 /* Calculate PRI. */
18222 tmp /= 2;
18223
18224 /* INSN with smaller offset goes first. */
18225 off_val = (int)(INTVAL (offset));
18226 if (off_val >= 0)
18227 tmp -= (off_val & 0xfffff);
18228 else
18229 tmp += ((- off_val) & 0xfffff);
18230
18231 *pri = tmp;
18232 return;
18233 }
18234
18235 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18236 Adjust priority of sha1h instructions so they are scheduled before
18237 other SHA1 instructions. */
18238
18239 static int
18240 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18241 {
18242 rtx x = PATTERN (insn);
18243
18244 if (GET_CODE (x) == SET)
18245 {
18246 x = SET_SRC (x);
18247
18248 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18249 return priority + 10;
18250 }
18251
18252 return priority;
18253 }
18254
18255 /* Given OPERANDS of consecutive load/store, check if we can merge
18256 them into ldp/stp. LOAD is true if they are load instructions.
18257 MODE is the mode of memory operands. */
18258
18259 bool
18260 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18261 machine_mode mode)
18262 {
18263 HOST_WIDE_INT offval_1, offval_2, msize;
18264 enum reg_class rclass_1, rclass_2;
18265 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18266
18267 if (load)
18268 {
18269 mem_1 = operands[1];
18270 mem_2 = operands[3];
18271 reg_1 = operands[0];
18272 reg_2 = operands[2];
18273 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18274 if (REGNO (reg_1) == REGNO (reg_2))
18275 return false;
18276 }
18277 else
18278 {
18279 mem_1 = operands[0];
18280 mem_2 = operands[2];
18281 reg_1 = operands[1];
18282 reg_2 = operands[3];
18283 }
18284
18285 /* The mems cannot be volatile. */
18286 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18287 return false;
18288
18289 /* If we have SImode and slow unaligned ldp,
18290 check that the alignment is at least 8 bytes. */
18291 if (mode == SImode
18292 && (aarch64_tune_params.extra_tuning_flags
18293 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18294 && !optimize_size
18295 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18296 return false;
18297
18298 /* Check if the addresses are in the form of [base+offset]. */
18299 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18300 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18301 return false;
18302 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18303 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18304 return false;
18305
18306 /* Check if the bases are the same. */
18307 if (!rtx_equal_p (base_1, base_2))
18308 return false;
18309
18310 /* The operands must be of the same size. */
18311 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18312 GET_MODE_SIZE (GET_MODE (mem_2))));
18313
18314 offval_1 = INTVAL (offset_1);
18315 offval_2 = INTVAL (offset_2);
18316 /* We should only be trying this for fixed-sized modes. There is no
18317 SVE LDP/STP instruction. */
18318 msize = GET_MODE_SIZE (mode).to_constant ();
18319 /* Check if the offsets are consecutive. */
18320 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18321 return false;
18322
18323 /* Check if the addresses are clobbered by load. */
18324 if (load)
18325 {
18326 if (reg_mentioned_p (reg_1, mem_1))
18327 return false;
18328
18329 /* In increasing order, the last load can clobber the address. */
18330 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18331 return false;
18332 }
18333
18334 /* One of the memory accesses must be a mempair operand.
18335 If it is not the first one, they need to be swapped by the
18336 peephole. */
18337 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18338 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18339 return false;
18340
18341 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18342 rclass_1 = FP_REGS;
18343 else
18344 rclass_1 = GENERAL_REGS;
18345
18346 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18347 rclass_2 = FP_REGS;
18348 else
18349 rclass_2 = GENERAL_REGS;
18350
18351 /* Check if the registers are of the same class. */
18352 if (rclass_1 != rclass_2)
18353 return false;
18354
18355 return true;
18356 }
18357
18358 /* Given OPERANDS of consecutive load/store that can be merged,
18359 swap them if they are not in ascending order. */
18360 void
18361 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18362 {
18363 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18364 HOST_WIDE_INT offval_1, offval_2;
18365
18366 if (load)
18367 {
18368 mem_1 = operands[1];
18369 mem_2 = operands[3];
18370 }
18371 else
18372 {
18373 mem_1 = operands[0];
18374 mem_2 = operands[2];
18375 }
18376
18377 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18378 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18379
18380 offval_1 = INTVAL (offset_1);
18381 offval_2 = INTVAL (offset_2);
18382
18383 if (offval_1 > offval_2)
18384 {
18385 /* Irrespective of whether this is a load or a store,
18386 we do the same swap. */
18387 std::swap (operands[0], operands[2]);
18388 std::swap (operands[1], operands[3]);
18389 }
18390 }
18391
18392 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18393 comparison between the two. */
18394 int
18395 aarch64_host_wide_int_compare (const void *x, const void *y)
18396 {
18397 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18398 * ((const HOST_WIDE_INT *) y));
18399 }
18400
18401 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18402 other pointing to a REG rtx containing an offset, compare the offsets
18403 of the two pairs.
18404
18405 Return:
18406
18407 1 iff offset (X) > offset (Y)
18408 0 iff offset (X) == offset (Y)
18409 -1 iff offset (X) < offset (Y) */
18410 int
18411 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18412 {
18413 const rtx * operands_1 = (const rtx *) x;
18414 const rtx * operands_2 = (const rtx *) y;
18415 rtx mem_1, mem_2, base, offset_1, offset_2;
18416
18417 if (MEM_P (operands_1[0]))
18418 mem_1 = operands_1[0];
18419 else
18420 mem_1 = operands_1[1];
18421
18422 if (MEM_P (operands_2[0]))
18423 mem_2 = operands_2[0];
18424 else
18425 mem_2 = operands_2[1];
18426
18427 /* Extract the offsets. */
18428 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18429 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18430
18431 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18432
18433 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18434 }
18435
18436 /* Given OPERANDS of consecutive load/store, check if we can merge
18437 them into ldp/stp by adjusting the offset. LOAD is true if they
18438 are load instructions. MODE is the mode of memory operands.
18439
18440 Given below consecutive stores:
18441
18442 str w1, [xb, 0x100]
18443 str w1, [xb, 0x104]
18444 str w1, [xb, 0x108]
18445 str w1, [xb, 0x10c]
18446
18447 Though the offsets are out of the range supported by stp, we can
18448 still pair them after adjusting the offset, like:
18449
18450 add scratch, xb, 0x100
18451 stp w1, w1, [scratch]
18452 stp w1, w1, [scratch, 0x8]
18453
18454 The peephole patterns detecting this opportunity should guarantee
18455 the scratch register is available. */
18456
18457 bool
18458 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18459 scalar_mode mode)
18460 {
18461 const int num_insns = 4;
18462 enum reg_class rclass;
18463 HOST_WIDE_INT offvals[num_insns], msize;
18464 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18465
18466 if (load)
18467 {
18468 for (int i = 0; i < num_insns; i++)
18469 {
18470 reg[i] = operands[2 * i];
18471 mem[i] = operands[2 * i + 1];
18472
18473 gcc_assert (REG_P (reg[i]));
18474 }
18475
18476 /* Do not attempt to merge the loads if the loads clobber each other. */
18477 for (int i = 0; i < 8; i += 2)
18478 for (int j = i + 2; j < 8; j += 2)
18479 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18480 return false;
18481 }
18482 else
18483 for (int i = 0; i < num_insns; i++)
18484 {
18485 mem[i] = operands[2 * i];
18486 reg[i] = operands[2 * i + 1];
18487 }
18488
18489 /* Skip if memory operand is by itself valid for ldp/stp. */
18490 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18491 return false;
18492
18493 for (int i = 0; i < num_insns; i++)
18494 {
18495 /* The mems cannot be volatile. */
18496 if (MEM_VOLATILE_P (mem[i]))
18497 return false;
18498
18499 /* Check if the addresses are in the form of [base+offset]. */
18500 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18501 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18502 return false;
18503 }
18504
18505 /* Check if the registers are of the same class. */
18506 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18507 ? FP_REGS : GENERAL_REGS;
18508
18509 for (int i = 1; i < num_insns; i++)
18510 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18511 {
18512 if (rclass != FP_REGS)
18513 return false;
18514 }
18515 else
18516 {
18517 if (rclass != GENERAL_REGS)
18518 return false;
18519 }
18520
18521 /* Only the last register in the order in which they occur
18522 may be clobbered by the load. */
18523 if (rclass == GENERAL_REGS && load)
18524 for (int i = 0; i < num_insns - 1; i++)
18525 if (reg_mentioned_p (reg[i], mem[i]))
18526 return false;
18527
18528 /* Check if the bases are the same. */
18529 for (int i = 0; i < num_insns - 1; i++)
18530 if (!rtx_equal_p (base[i], base[i + 1]))
18531 return false;
18532
18533 for (int i = 0; i < num_insns; i++)
18534 offvals[i] = INTVAL (offset[i]);
18535
18536 msize = GET_MODE_SIZE (mode);
18537
18538 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18539 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18540 aarch64_host_wide_int_compare);
18541
18542 if (!(offvals[1] == offvals[0] + msize
18543 && offvals[3] == offvals[2] + msize))
18544 return false;
18545
18546 /* Check that offsets are within range of each other. The ldp/stp
18547 instructions have 7 bit immediate offsets, so use 0x80. */
18548 if (offvals[2] - offvals[0] >= msize * 0x80)
18549 return false;
18550
18551 /* The offsets must be aligned with respect to each other. */
18552 if (offvals[0] % msize != offvals[2] % msize)
18553 return false;
18554
18555 /* If we have SImode and slow unaligned ldp,
18556 check that the alignment is at least 8 bytes. */
18557 if (mode == SImode
18558 && (aarch64_tune_params.extra_tuning_flags
18559 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18560 && !optimize_size
18561 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18562 return false;
18563
18564 return true;
18565 }
18566
18567 /* Given OPERANDS of consecutive load/store, this function pairs them
18568 into LDP/STP after adjusting the offset. It depends on the fact
18569 that the operands can be sorted so the offsets are correct for STP.
18570 MODE is the mode of memory operands. CODE is the rtl operator
18571 which should be applied to all memory operands, it's SIGN_EXTEND,
18572 ZERO_EXTEND or UNKNOWN. */
18573
18574 bool
18575 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18576 scalar_mode mode, RTX_CODE code)
18577 {
18578 rtx base, offset_1, offset_3, t1, t2;
18579 rtx mem_1, mem_2, mem_3, mem_4;
18580 rtx temp_operands[8];
18581 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18582 stp_off_upper_limit, stp_off_lower_limit, msize;
18583
18584 /* We make changes on a copy as we may still bail out. */
18585 for (int i = 0; i < 8; i ++)
18586 temp_operands[i] = operands[i];
18587
18588 /* Sort the operands. */
18589 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18590
18591 /* Copy the memory operands so that if we have to bail for some
18592 reason the original addresses are unchanged. */
18593 if (load)
18594 {
18595 mem_1 = copy_rtx (temp_operands[1]);
18596 mem_2 = copy_rtx (temp_operands[3]);
18597 mem_3 = copy_rtx (temp_operands[5]);
18598 mem_4 = copy_rtx (temp_operands[7]);
18599 }
18600 else
18601 {
18602 mem_1 = copy_rtx (temp_operands[0]);
18603 mem_2 = copy_rtx (temp_operands[2]);
18604 mem_3 = copy_rtx (temp_operands[4]);
18605 mem_4 = copy_rtx (temp_operands[6]);
18606 gcc_assert (code == UNKNOWN);
18607 }
18608
18609 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18610 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18611 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18612 && offset_3 != NULL_RTX);
18613
18614 /* Adjust offset so it can fit in LDP/STP instruction. */
18615 msize = GET_MODE_SIZE (mode);
18616 stp_off_upper_limit = msize * (0x40 - 1);
18617 stp_off_lower_limit = - msize * 0x40;
18618
18619 off_val_1 = INTVAL (offset_1);
18620 off_val_3 = INTVAL (offset_3);
18621
18622 /* The base offset is optimally half way between the two STP/LDP offsets. */
18623 if (msize <= 4)
18624 base_off = (off_val_1 + off_val_3) / 2;
18625 else
18626 /* However, due to issues with negative LDP/STP offset generation for
18627 larger modes (DF, DI and vector modes), we must not use negative
18628 addresses smaller than 9 signed unadjusted bits can store. This
18629 provides the most range in this case. */
18630 base_off = off_val_1;
18631
18632 /* Adjust the base so that it is aligned with the addresses but still
18633 optimal. */
18634 if (base_off % msize != off_val_1 % msize)
18635 /* Fix the offset, bearing in mind we want to make it bigger not
18636 smaller. */
18637 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18638 else if (msize <= 4)
18639 /* The negative range of LDP/STP is one larger than the positive range. */
18640 base_off += msize;
18641
18642 /* Check if base offset is too big or too small. We can attempt to resolve
18643 this issue by setting it to the maximum value and seeing if the offsets
18644 still fit. */
18645 if (base_off >= 0x1000)
18646 {
18647 base_off = 0x1000 - 1;
18648 /* We must still make sure that the base offset is aligned with respect
18649 to the address. But it may not be made any bigger. */
18650 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18651 }
18652
18653 /* Likewise for the case where the base is too small. */
18654 if (base_off <= -0x1000)
18655 {
18656 base_off = -0x1000 + 1;
18657 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18658 }
18659
18660 /* Offset of the first STP/LDP. */
18661 new_off_1 = off_val_1 - base_off;
18662
18663 /* Offset of the second STP/LDP. */
18664 new_off_3 = off_val_3 - base_off;
18665
18666 /* The offsets must be within the range of the LDP/STP instructions. */
18667 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18668 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18669 return false;
18670
18671 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18672 new_off_1), true);
18673 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18674 new_off_1 + msize), true);
18675 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18676 new_off_3), true);
18677 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18678 new_off_3 + msize), true);
18679
18680 if (!aarch64_mem_pair_operand (mem_1, mode)
18681 || !aarch64_mem_pair_operand (mem_3, mode))
18682 return false;
18683
18684 if (code == ZERO_EXTEND)
18685 {
18686 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18687 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18688 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18689 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18690 }
18691 else if (code == SIGN_EXTEND)
18692 {
18693 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18694 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18695 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18696 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18697 }
18698
18699 if (load)
18700 {
18701 operands[0] = temp_operands[0];
18702 operands[1] = mem_1;
18703 operands[2] = temp_operands[2];
18704 operands[3] = mem_2;
18705 operands[4] = temp_operands[4];
18706 operands[5] = mem_3;
18707 operands[6] = temp_operands[6];
18708 operands[7] = mem_4;
18709 }
18710 else
18711 {
18712 operands[0] = mem_1;
18713 operands[1] = temp_operands[1];
18714 operands[2] = mem_2;
18715 operands[3] = temp_operands[3];
18716 operands[4] = mem_3;
18717 operands[5] = temp_operands[5];
18718 operands[6] = mem_4;
18719 operands[7] = temp_operands[7];
18720 }
18721
18722 /* Emit adjusting instruction. */
18723 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18724 /* Emit ldp/stp instructions. */
18725 t1 = gen_rtx_SET (operands[0], operands[1]);
18726 t2 = gen_rtx_SET (operands[2], operands[3]);
18727 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18728 t1 = gen_rtx_SET (operands[4], operands[5]);
18729 t2 = gen_rtx_SET (operands[6], operands[7]);
18730 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18731 return true;
18732 }
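
/* Worked example for the adjustment above (illustrative only; the
 registers, offsets and DImode operands are hypothetical). Suppose the
 four accesses load DImode values from [base, 512], [base, 520],
 [base, 528] and [base, 536]. Then msize is 8, the LDP range is
 [-512, 504], off_val_1 is 512 and off_val_3 is 528. Since msize > 4,
 base_off becomes 512, giving new_off_1 = 0 and new_off_3 = 16, both in
 range, so we emit roughly:

 add x9, xB, #512 // operands[8] = base + 512
 ldp x0, x1, [x9] // base+512 and base+520
 ldp x2, x3, [x9, #16] // base+528 and base+536 */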
18733
18734 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18735 it isn't worth branching around empty masked ops (including masked
18736 stores). */
18737
18738 static bool
18739 aarch64_empty_mask_is_expensive (unsigned)
18740 {
18741 return false;
18742 }
18743
18744 /* Return 1 if pseudo register should be created and used to hold
18745 GOT address for PIC code. */
18746
18747 bool
18748 aarch64_use_pseudo_pic_reg (void)
18749 {
18750 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18751 }
18752
18753 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18754
18755 static int
18756 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18757 {
18758 switch (XINT (x, 1))
18759 {
18760 case UNSPEC_GOTSMALLPIC:
18761 case UNSPEC_GOTSMALLPIC28K:
18762 case UNSPEC_GOTTINYPIC:
18763 return 0;
18764 default:
18765 break;
18766 }
18767
18768 return default_unspec_may_trap_p (x, flags);
18769 }
18770
18771
18772 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18773 return the log2 of that value. Otherwise return -1. */
18774
18775 int
18776 aarch64_fpconst_pow_of_2 (rtx x)
18777 {
18778 const REAL_VALUE_TYPE *r;
18779
18780 if (!CONST_DOUBLE_P (x))
18781 return -1;
18782
18783 r = CONST_DOUBLE_REAL_VALUE (x);
18784
18785 if (REAL_VALUE_NEGATIVE (*r)
18786 || REAL_VALUE_ISNAN (*r)
18787 || REAL_VALUE_ISINF (*r)
18788 || !real_isinteger (r, DFmode))
18789 return -1;
18790
18791 return exact_log2 (real_to_integer (r));
18792 }
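
/* Illustrative examples of the mapping above:
 (const_double 8.0) -> 3 (8 == 2^3)
 (const_double 1.0) -> 0
 (const_double 6.0) -> -1 (an integer, but not a power of 2)
 (const_double 0.5) -> -1 (not an integer value)
 (const_double -4.0) -> -1 (negative) */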
18793
18794 /* If X is a vector of equal CONST_DOUBLE values and that value is
18795 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18796
18797 int
18798 aarch64_vec_fpconst_pow_of_2 (rtx x)
18799 {
18800 int nelts;
18801 if (GET_CODE (x) != CONST_VECTOR
18802 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18803 return -1;
18804
18805 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18806 return -1;
18807
18808 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18809 if (firstval <= 0)
18810 return -1;
18811
18812 for (int i = 1; i < nelts; i++)
18813 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18814 return -1;
18815
18816 return firstval;
18817 }
18818
18819 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18820 to float.
18821
18822 __fp16 always promotes through this hook.
18823 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18824 through the generic excess precision logic rather than here. */
18825
18826 static tree
18827 aarch64_promoted_type (const_tree t)
18828 {
18829 if (SCALAR_FLOAT_TYPE_P (t)
18830 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18831 return float_type_node;
18832
18833 return NULL_TREE;
18834 }
18835
18836 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18837
18838 static bool
18839 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18840 optimization_type opt_type)
18841 {
18842 switch (op)
18843 {
18844 case rsqrt_optab:
18845 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18846
18847 default:
18848 return true;
18849 }
18850 }
18851
18852 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18853
18854 static unsigned int
18855 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18856 int *offset)
18857 {
18858 /* Polynomial invariant 1 == (VG / 2) - 1. */
18859 gcc_assert (i == 1);
18860 *factor = 2;
18861 *offset = 1;
18862 return AARCH64_DWARF_VG;
18863 }
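
/* For example (illustrative): on a 256-bit SVE implementation the runtime
 value of the VG register is 4 (four 64-bit granules), so indeterminate 1
 evaluates to 4 / 2 - 1 = 1, and a poly_int such as 16 + 16x (the size in
 bytes of a full SVE data vector) comes out as 32. */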
18864
18865 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18866 if MODE is HFmode, and punt to the generic implementation otherwise. */
18867
18868 static bool
18869 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18870 {
18871 return (mode == HFmode
18872 ? true
18873 : default_libgcc_floating_mode_supported_p (mode));
18874 }
18875
18876 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18877 if MODE is HFmode, and punt to the generic implementation otherwise. */
18878
18879 static bool
18880 aarch64_scalar_mode_supported_p (scalar_mode mode)
18881 {
18882 return (mode == HFmode
18883 ? true
18884 : default_scalar_mode_supported_p (mode));
18885 }
18886
18887 /* Set the value of FLT_EVAL_METHOD.
18888 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18889
18890 0: evaluate all operations and constants, whose semantic type has at
18891 most the range and precision of type float, to the range and
18892 precision of float; evaluate all other operations and constants to
18893 the range and precision of the semantic type;
18894
18895 N, where _FloatN is a supported interchange floating type:
18896 evaluate all operations and constants, whose semantic type has at
18897 most the range and precision of _FloatN type, to the range and
18898 precision of the _FloatN type; evaluate all other operations and
18899 constants to the range and precision of the semantic type;
18900
18901 If we have the ARMv8.2-A extensions then we support _Float16 in native
18902 precision, so we should set this to 16. Otherwise, we support the type,
18903 but want to evaluate expressions in float precision, so set this to
18904 0. */
18905
18906 static enum flt_eval_method
18907 aarch64_excess_precision (enum excess_precision_type type)
18908 {
18909 switch (type)
18910 {
18911 case EXCESS_PRECISION_TYPE_FAST:
18912 case EXCESS_PRECISION_TYPE_STANDARD:
18913 /* We can calculate either in 16-bit range and precision or
18914 32-bit range and precision. Make that decision based on whether
18915 we have native support for the ARMv8.2-A 16-bit floating-point
18916 instructions or not. */
18917 return (TARGET_FP_F16INST
18918 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18919 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18920 case EXCESS_PRECISION_TYPE_IMPLICIT:
18921 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18922 default:
18923 gcc_unreachable ();
18924 }
18925 return FLT_EVAL_METHOD_UNPREDICTABLE;
18926 }
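
/* Illustrative example (hypothetical code, not from the original source):
 given

 _Float16 a, b, c;
 c = a + b;

 the addition is performed directly in HFmode when TARGET_FP_F16INST is
 available (FLT_EVAL_METHOD is 16), and is promoted to and evaluated in
 float otherwise (FLT_EVAL_METHOD is 0). */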
18927
18928 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18929 scheduled for speculative execution. Reject the long-running division
18930 and square-root instructions. */
18931
18932 static bool
18933 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18934 {
18935 switch (get_attr_type (insn))
18936 {
18937 case TYPE_SDIV:
18938 case TYPE_UDIV:
18939 case TYPE_FDIVS:
18940 case TYPE_FDIVD:
18941 case TYPE_FSQRTS:
18942 case TYPE_FSQRTD:
18943 case TYPE_NEON_FP_SQRT_S:
18944 case TYPE_NEON_FP_SQRT_D:
18945 case TYPE_NEON_FP_SQRT_S_Q:
18946 case TYPE_NEON_FP_SQRT_D_Q:
18947 case TYPE_NEON_FP_DIV_S:
18948 case TYPE_NEON_FP_DIV_D:
18949 case TYPE_NEON_FP_DIV_S_Q:
18950 case TYPE_NEON_FP_DIV_D_Q:
18951 return false;
18952 default:
18953 return true;
18954 }
18955 }
18956
18957 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18958
18959 static int
18960 aarch64_compute_pressure_classes (reg_class *classes)
18961 {
18962 int i = 0;
18963 classes[i++] = GENERAL_REGS;
18964 classes[i++] = FP_REGS;
18965 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18966 registers need to go in PR_LO_REGS at some point during their
18967 lifetime. Splitting it into two halves has the effect of making
18968 all predicates count against PR_LO_REGS, so that we try whenever
18969 possible to restrict the number of live predicates to 8. This
18970 greatly reduces the amount of spilling in certain loops. */
18971 classes[i++] = PR_LO_REGS;
18972 classes[i++] = PR_HI_REGS;
18973 return i;
18974 }
18975
18976 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18977
18978 static bool
18979 aarch64_can_change_mode_class (machine_mode from,
18980 machine_mode to, reg_class_t)
18981 {
18982 if (BYTES_BIG_ENDIAN)
18983 {
18984 bool from_sve_p = aarch64_sve_data_mode_p (from);
18985 bool to_sve_p = aarch64_sve_data_mode_p (to);
18986
18987 /* Don't allow changes between SVE data modes and non-SVE modes.
18988 See the comment at the head of aarch64-sve.md for details. */
18989 if (from_sve_p != to_sve_p)
18990 return false;
18991
18992 /* Don't allow changes in element size: lane 0 of the new vector
18993 would not then be lane 0 of the old vector. See the comment
18994 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18995 description.
18996
18997 In the worst case, this forces a register to be spilled in
18998 one mode and reloaded in the other, which handles the
18999 endianness correctly. */
19000 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19001 return false;
19002 }
19003 return true;
19004 }
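
/* Illustrative examples of the big-endian restrictions above (hypothetical
 mode pairs):
 VNx4SI <-> V4SI : rejected, SVE data mode vs. Advanced SIMD mode.
 VNx4SI <-> VNx8HI : rejected, 4-byte vs. 2-byte elements.
 VNx4SI <-> VNx4SF : allowed, both use 4-byte elements. */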
19005
19006 /* Implement TARGET_EARLY_REMAT_MODES. */
19007
19008 static void
19009 aarch64_select_early_remat_modes (sbitmap modes)
19010 {
19011 /* SVE values are not normally live across a call, so it should be
19012 worth doing early rematerialization even in VL-specific mode. */
19013 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19014 {
19015 machine_mode mode = (machine_mode) i;
19016 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19017 if (vec_flags & VEC_ANY_SVE)
19018 bitmap_set_bit (modes, i);
19019 }
19020 }
19021
19022 /* Override the default target speculation_safe_value. */
19023 static rtx
19024 aarch64_speculation_safe_value (machine_mode mode,
19025 rtx result, rtx val, rtx failval)
19026 {
19027 /* Maybe we should warn if falling back to hard barriers. They are
19028 likely to be noticeably more expensive than the alternative below. */
19029 if (!aarch64_track_speculation)
19030 return default_speculation_safe_value (mode, result, val, failval);
19031
19032 if (!REG_P (val))
19033 val = copy_to_mode_reg (mode, val);
19034
19035 if (!aarch64_reg_or_zero (failval, mode))
19036 failval = copy_to_mode_reg (mode, failval);
19037
19038 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19039 return result;
19040 }
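
/* Illustrative use (hypothetical source code, not from the original file):

 int
 load_element (int *array, unsigned long n, unsigned long idx)
 {
 if (idx < n)
 return array[__builtin_speculation_safe_value (idx)];
 return 0;
 }

 FAILVAL defaults to zero when omitted. With -mtrack-speculation the
 expansion above selects between VAL and FAILVAL under control of the
 speculation-tracking register (roughly a conditional select followed by
 a CSDB), avoiding a full barrier; otherwise the generic
 default_speculation_safe_value sequence is used. */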
19041
19042 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19043 Look into the tuning structure for an estimate.
19044 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19045 Advanced SIMD 128 bits. */
19046
19047 static HOST_WIDE_INT
19048 aarch64_estimated_poly_value (poly_int64 val)
19049 {
19050 enum aarch64_sve_vector_bits_enum width_source
19051 = aarch64_tune_params.sve_width;
19052
19053 /* If we still don't have an estimate, use the default. */
19054 if (width_source == SVE_SCALABLE)
19055 return default_estimated_poly_value (val);
19056
19057 HOST_WIDE_INT over_128 = width_source - 128;
19058 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19059 }
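
/* Worked example (illustrative): if the selected tuning sets sve_width to
 SVE_256, then over_128 is 128 and a poly_int64 of {4, 4} (for instance
 the number of 32-bit elements in an SVE vector) is estimated as
 4 + 4 * 128 / 128 = 8. For SVE_SCALABLE tunings the generic
 default_estimated_poly_value estimate is used instead. */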
19060
19061
19062 /* Return true for types that could be supported as SIMD return or
19063 argument types. */
19064
19065 static bool
19066 supported_simd_type (tree t)
19067 {
19068 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19069 {
19070 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19071 return s == 1 || s == 2 || s == 4 || s == 8;
19072 }
19073 return false;
19074 }
19075
19076 /* Return true for types that currently are supported as SIMD return
19077 or argument types. */
19078
19079 static bool
19080 currently_supported_simd_type (tree t, tree b)
19081 {
19082 if (COMPLEX_FLOAT_TYPE_P (t))
19083 return false;
19084
19085 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19086 return false;
19087
19088 return supported_simd_type (t);
19089 }
19090
19091 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19092
19093 static int
19094 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19095 struct cgraph_simd_clone *clonei,
19096 tree base_type, int num)
19097 {
19098 tree t, ret_type, arg_type;
19099 unsigned int elt_bits, vec_bits, count;
19100
19101 if (!TARGET_SIMD)
19102 return 0;
19103
19104 if (clonei->simdlen
19105 && (clonei->simdlen < 2
19106 || clonei->simdlen > 1024
19107 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19108 {
19109 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19110 "unsupported simdlen %d", clonei->simdlen);
19111 return 0;
19112 }
19113
19114 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19115 if (TREE_CODE (ret_type) != VOID_TYPE
19116 && !currently_supported_simd_type (ret_type, base_type))
19117 {
19118 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19119 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19120 "GCC does not currently support mixed size types "
19121 "for %<simd%> functions");
19122 else if (supported_simd_type (ret_type))
19123 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19124 "GCC does not currently support return type %qT "
19125 "for %<simd%> functions", ret_type);
19126 else
19127 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19128 "unsupported return type %qT for %<simd%> functions",
19129 ret_type);
19130 return 0;
19131 }
19132
19133 int i;
19134 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
19135 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
19136
19137 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
19138 t && t != void_list_node; t = TREE_CHAIN (t), i++)
19139 {
19140 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
19141
19142 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
19143 && !currently_supported_simd_type (arg_type, base_type))
19144 {
19145 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19146 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19147 "GCC does not currently support mixed size types "
19148 "for %<simd%> functions");
19149 else
19150 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19151 "GCC does not currently support argument type %qT "
19152 "for %<simd%> functions", arg_type);
19153 return 0;
19154 }
19155 }
19156
19157 clonei->vecsize_mangle = 'n';
19158 clonei->mask_mode = VOIDmode;
19159 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19160 if (clonei->simdlen == 0)
19161 {
19162 count = 2;
19163 vec_bits = (num == 0 ? 64 : 128);
19164 clonei->simdlen = vec_bits / elt_bits;
19165 }
19166 else
19167 {
19168 count = 1;
19169 vec_bits = clonei->simdlen * elt_bits;
19170 if (vec_bits != 64 && vec_bits != 128)
19171 {
19172 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19173 "GCC does not currently support simdlen %d for type %qT",
19174 clonei->simdlen, base_type);
19175 return 0;
19176 }
19177 }
19178 clonei->vecsize_int = vec_bits;
19179 clonei->vecsize_float = vec_bits;
19180 return count;
19181 }
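
/* Illustrative example (hypothetical declaration, not from the original
 source):

 #pragma omp declare simd
 float square (float x);

 Here base_type is float (elt_bits == 32) and no simdlen clause is given,
 so two Advanced SIMD clones are produced: one for 64-bit vectors with
 simdlen 2 and one for 128-bit vectors with simdlen 4 (mangled along the
 lines of _ZGVnN2v_square and _ZGVnN4v_square). An explicit simdlen that
 does not yield a 64-bit or 128-bit vector, e.g. simdlen(16) with float,
 is warned about and rejected. */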
19182
19183 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19184
19185 static void
19186 aarch64_simd_clone_adjust (struct cgraph_node *node)
19187 {
19188 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19189 use the correct ABI. */
19190
19191 tree t = TREE_TYPE (node->decl);
19192 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19193 TYPE_ATTRIBUTES (t));
19194 }
19195
19196 /* Implement TARGET_SIMD_CLONE_USABLE. */
19197
19198 static int
19199 aarch64_simd_clone_usable (struct cgraph_node *node)
19200 {
19201 switch (node->simdclone->vecsize_mangle)
19202 {
19203 case 'n':
19204 if (!TARGET_SIMD)
19205 return -1;
19206 return 0;
19207 default:
19208 gcc_unreachable ();
19209 }
19210 }
19211
19212 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19213
19214 static int
19215 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19216 {
19217 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19218 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19219 return 0;
19220 return 1;
19221 }
19222
19223 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19224
19225 static const char *
19226 aarch64_get_multilib_abi_name (void)
19227 {
19228 if (TARGET_BIG_END)
19229 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19230 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19231 }
19232
19233 /* Implement TARGET_STACK_PROTECT_GUARD. For a global-variable-based
19234 guard, use the default implementation; otherwise return a null
19235 tree. */
19236 static tree
19237 aarch64_stack_protect_guard (void)
19238 {
19239 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19240 return default_stack_protect_guard ();
19241
19242 return NULL_TREE;
19243 }
19244
19245 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19246 section at the end if needed. */
19247 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19248 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19249 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19250 void
19251 aarch64_file_end_indicate_exec_stack ()
19252 {
19253 file_end_indicate_exec_stack ();
19254
19255 unsigned feature_1_and = 0;
19256 if (aarch64_bti_enabled ())
19257 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19258
19259 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19260 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19261
19262 if (feature_1_and)
19263 {
19264 /* Generate .note.gnu.property section. */
19265 switch_to_section (get_section (".note.gnu.property",
19266 SECTION_NOTYPE, NULL));
19267
19268 /* PT_NOTE header: namesz, descsz, type.
19269 namesz = 4 ("GNU\0")
19270 descsz = 16 (Size of the program property array)
19271 [(12 + padding) * Number of array elements]
19272 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19273 assemble_align (POINTER_SIZE);
19274 assemble_integer (GEN_INT (4), 4, 32, 1);
19275 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19276 assemble_integer (GEN_INT (5), 4, 32, 1);
19277
19278 /* PT_NOTE name. */
19279 assemble_string ("GNU", 4);
19280
19281 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19282 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19283 datasz = 4
19284 data = feature_1_and. */
19285 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19286 assemble_integer (GEN_INT (4), 4, 32, 1);
19287 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19288
19289 /* Pad the size of the note to the required alignment. */
19290 assemble_align (POINTER_SIZE);
19291 }
19292 }
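
/* For reference, the note emitted above looks roughly like the following
 on an LP64 target with both BTI and PAC enabled (illustrative assembly,
 not part of the original source):

 .section .note.gnu.property
 .align 3
 .word 4 // namesz ("GNU\0")
 .word 16 // descsz = ROUND_UP (12, 8)
 .word 5 // NT_GNU_PROPERTY_TYPE_0
 .string "GNU"
 .word 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
 .word 4 // datasz
 .word 3 // FEATURE_1_BTI | FEATURE_1_PAC
 .align 3 */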
19293 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19294 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19295 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
19296
19297 /* Helper function for straight line speculation.
19298 Return what barrier should be emitted for straight line speculation
19299 mitigation.
19300 When not mitigating against straight line speculation this function returns
19301 an empty string.
19302 When mitigating against straight line speculation, use:
19303 * SB when the v8.5-A SB extension is enabled.
19304 * DSB+ISB otherwise. */
19305 const char *
19306 aarch64_sls_barrier (int mitigation_required)
19307 {
19308 return mitigation_required
19309 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
19310 : "";
19311 }
19312
19313 static GTY (()) tree aarch64_sls_shared_thunks[30];
19314 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
19315 const char *indirect_symbol_names[30] = {
19316 "__call_indirect_x0",
19317 "__call_indirect_x1",
19318 "__call_indirect_x2",
19319 "__call_indirect_x3",
19320 "__call_indirect_x4",
19321 "__call_indirect_x5",
19322 "__call_indirect_x6",
19323 "__call_indirect_x7",
19324 "__call_indirect_x8",
19325 "__call_indirect_x9",
19326 "__call_indirect_x10",
19327 "__call_indirect_x11",
19328 "__call_indirect_x12",
19329 "__call_indirect_x13",
19330 "__call_indirect_x14",
19331 "__call_indirect_x15",
19332 "", /* "__call_indirect_x16", */
19333 "", /* "__call_indirect_x17", */
19334 "__call_indirect_x18",
19335 "__call_indirect_x19",
19336 "__call_indirect_x20",
19337 "__call_indirect_x21",
19338 "__call_indirect_x22",
19339 "__call_indirect_x23",
19340 "__call_indirect_x24",
19341 "__call_indirect_x25",
19342 "__call_indirect_x26",
19343 "__call_indirect_x27",
19344 "__call_indirect_x28",
19345 "__call_indirect_x29",
19346 };
19347
19348 /* Function to create a BLR thunk. This thunk is used to mitigate straight
19349 line speculation. Instead of a simple BLR that can be speculated past,
19350 we emit a BL to this thunk, and this thunk contains a BR to the relevant
19351 register. These thunks have the relevant speculation barriers put after
19352 their indirect branch so that speculation is blocked.
19353
19354 We use such a thunk so the speculation barriers are kept off the
19355 architecturally executed path in order to reduce the performance overhead.
19356
19357 When optimizing for size we use stubs shared by the linked object.
19358 When optimizing for performance we emit stubs for each function in the hope
19359 that the branch predictor can better train on jumps specific for a given
19360 function. */
19361 rtx
19362 aarch64_sls_create_blr_label (int regnum)
19363 {
19364 gcc_assert (STUB_REGNUM_P (regnum));
19365 if (optimize_function_for_size_p (cfun))
19366 {
19367 /* For the thunks shared between different functions in this compilation
19368 unit we use a named symbol -- this is just for users to more easily
19369 understand the generated assembly. */
19370 aarch64_sls_shared_thunks_needed = true;
19371 const char *thunk_name = indirect_symbol_names[regnum];
19372 if (aarch64_sls_shared_thunks[regnum] == NULL)
19373 {
19374 /* Build a decl representing this function stub and record it for
19375 later. We build a decl here so we can use the GCC machinery for
19376 handling sections automatically (through `get_named_section` and
19377 `make_decl_one_only`). That saves us a lot of trouble handling
19378 the specifics of different output file formats. */
19379 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
19380 get_identifier (thunk_name),
19381 build_function_type_list (void_type_node,
19382 NULL_TREE));
19383 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
19384 NULL_TREE, void_type_node);
19385 TREE_PUBLIC (decl) = 1;
19386 TREE_STATIC (decl) = 1;
19387 DECL_IGNORED_P (decl) = 1;
19388 DECL_ARTIFICIAL (decl) = 1;
19389 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
19390 resolve_unique_section (decl, 0, false);
19391 aarch64_sls_shared_thunks[regnum] = decl;
19392 }
19393
19394 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
19395 }
19396
19397 if (cfun->machine->call_via[regnum] == NULL)
19398 cfun->machine->call_via[regnum]
19399 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
19400 return cfun->machine->call_via[regnum];
19401 }
19402
19403 /* Helper function for aarch64_sls_emit_blr_function_thunks and
19404 aarch64_sls_emit_shared_blr_thunks below. */
19405 static void
19406 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
19407 {
19408 /* Save in x16 and branch to that function so this transformation does
19409 not prevent jumping to `BTI c` instructions. */
19410 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
19411 asm_fprintf (out_file, "\tbr\tx16\n");
19412 }
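
/* Illustrative result (hypothetical register choice): with
 -mharden-sls=blr, an indirect call that would have been

 blr x1

 is instead emitted as "bl __call_indirect_x1" (or a BL to a local label
 when optimizing for speed), where the stub body produced above is

 __call_indirect_x1:
 mov x16, x1
 br x16
 dsb sy
 isb

 with SB replacing the DSB/ISB pair for per-function stubs when the SB
 extension is available. */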
19413
19414 /* Emit all BLR stubs for this particular function.
19415 Here we emit all the BLR stubs needed for the current function. Since we
19416 emit these stubs in a consecutive block we know there will be no speculation
19417 gadgets between each stub, and hence we only emit a speculation barrier at
19418 the end of the stub sequences.
19419
19420 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
19421 void
19422 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
19423 {
19424 if (! aarch64_harden_sls_blr_p ())
19425 return;
19426
19427 bool any_functions_emitted = false;
19428 /* We must save and restore the current function section since this assembly
19429 is emitted at the end of the function. This means it can be emitted *just
19430 after* the cold section of a function. That cold part would be emitted in
19431 a different section. That switch would trigger a `.cfi_endproc` directive
19432 to be emitted in the original section and a `.cfi_startproc` directive to
19433 be emitted in the new section. Switching to the original section without
19434 restoring would mean that the `.cfi_endproc` emitted as a function ends
19435 would happen in a different section -- leaving an unmatched
19436 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
19437 in the standard text section. */
19438 section *save_text_section = in_section;
19439 switch_to_section (function_section (current_function_decl));
19440 for (int regnum = 0; regnum < 30; ++regnum)
19441 {
19442 rtx specu_label = cfun->machine->call_via[regnum];
19443 if (specu_label == NULL)
19444 continue;
19445
19446 targetm.asm_out.print_operand (out_file, specu_label, 0);
19447 asm_fprintf (out_file, ":\n");
19448 aarch64_sls_emit_function_stub (out_file, regnum);
19449 any_functions_emitted = true;
19450 }
19451 if (any_functions_emitted)
19452 /* Can use the SB if need be here, since this stub will only be used
19453 by the current function, and hence for the current target. */
19454 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
19455 switch_to_section (save_text_section);
19456 }
19457
19458 /* Emit shared BLR stubs for the current compilation unit.
19459 Over the course of compiling this unit we may have converted some BLR
19460 instructions to a BL to a shared stub function. This is where we emit those
19461 stub functions.
19462 This function is for the stubs shared between different functions in this
19463 compilation unit. We share when optimizing for size instead of speed.
19464
19465 This function is called through the TARGET_ASM_FILE_END hook. */
19466 void
19467 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
19468 {
19469 if (! aarch64_sls_shared_thunks_needed)
19470 return;
19471
19472 for (int regnum = 0; regnum < 30; ++regnum)
19473 {
19474 tree decl = aarch64_sls_shared_thunks[regnum];
19475 if (!decl)
19476 continue;
19477
19478 const char *name = indirect_symbol_names[regnum];
19479 switch_to_section (get_named_section (decl, NULL, 0));
19480 ASM_OUTPUT_ALIGN (out_file, 2);
19481 targetm.asm_out.globalize_label (out_file, name);
19482 /* Only emits if the compiler is configured for an assembler that can
19483 handle visibility directives. */
19484 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
19485 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
19486 ASM_OUTPUT_LABEL (out_file, name);
19487 aarch64_sls_emit_function_stub (out_file, regnum);
19488 /* Use the most conservative target to ensure it can always be used by any
19489 function in the translation unit. */
19490 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
19491 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
19492 }
19493 }
19494
19495 /* Implement TARGET_ASM_FILE_END. */
19496 void
19497 aarch64_asm_file_end ()
19498 {
19499 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
19500 /* Since this function will be called for the ASM_FILE_END hook, we ensure
19501 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
19502 for FreeBSD) still gets called. */
19503 #ifdef TARGET_ASM_FILE_END
19504 TARGET_ASM_FILE_END ();
19505 #endif
19506 }
19507
19508 const char *
19509 aarch64_indirect_call_asm (rtx addr)
19510 {
19511 gcc_assert (REG_P (addr));
19512 if (aarch64_harden_sls_blr_p ())
19513 {
19514 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
19515 output_asm_insn ("bl\t%0", &stub_label);
19516 }
19517 else
19518 output_asm_insn ("blr\t%0", &addr);
19519 return "";
19520 }
19521
19522 /* Target-specific selftests. */
19523
19524 #if CHECKING_P
19525
19526 namespace selftest {
19527
19528 /* Selftest for the RTL loader.
19529 Verify that the RTL loader copes with a dump from
19530 print_rtx_function. This is essentially just a test that class
19531 function_reader can handle a real dump, but it also verifies
19532 that lookup_reg_by_dump_name correctly handles hard regs.
19533 The presence of hard reg names in the dump means that the test is
19534 target-specific, hence it is in this file. */
19535
19536 static void
19537 aarch64_test_loading_full_dump ()
19538 {
19539 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19540
19541 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19542
19543 rtx_insn *insn_1 = get_insn_by_uid (1);
19544 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19545
19546 rtx_insn *insn_15 = get_insn_by_uid (15);
19547 ASSERT_EQ (INSN, GET_CODE (insn_15));
19548 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19549
19550 /* Verify crtl->return_rtx. */
19551 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19552 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19553 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19554 }
19555
19556 /* Run all target-specific selftests. */
19557
19558 static void
19559 aarch64_run_selftests (void)
19560 {
19561 aarch64_test_loading_full_dump ();
19562 }
19563
19564 } // namespace selftest
19565
19566 #endif /* #if CHECKING_P */
19567
19568 #undef TARGET_STACK_PROTECT_GUARD
19569 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19570
19571 #undef TARGET_ADDRESS_COST
19572 #define TARGET_ADDRESS_COST aarch64_address_cost
19573
19574 /* This hook determines whether unnamed bitfields affect the alignment
19575 of the containing structure. The hook returns true if the structure
19576 should inherit the alignment requirements of an unnamed bitfield's
19577 type. */
19578 #undef TARGET_ALIGN_ANON_BITFIELD
19579 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19580
19581 #undef TARGET_ASM_ALIGNED_DI_OP
19582 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19583
19584 #undef TARGET_ASM_ALIGNED_HI_OP
19585 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19586
19587 #undef TARGET_ASM_ALIGNED_SI_OP
19588 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19589
19590 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19591 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19592 hook_bool_const_tree_hwi_hwi_const_tree_true
19593
19594 #undef TARGET_ASM_FILE_START
19595 #define TARGET_ASM_FILE_START aarch64_start_file
19596
19597 #undef TARGET_ASM_OUTPUT_MI_THUNK
19598 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19599
19600 #undef TARGET_ASM_SELECT_RTX_SECTION
19601 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19602
19603 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19604 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19605
19606 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
19607 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
19608
19609 #undef TARGET_BUILD_BUILTIN_VA_LIST
19610 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19611
19612 #undef TARGET_CALLEE_COPIES
19613 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19614
19615 #undef TARGET_CAN_ELIMINATE
19616 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19617
19618 #undef TARGET_CAN_INLINE_P
19619 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19620
19621 #undef TARGET_CANNOT_FORCE_CONST_MEM
19622 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19623
19624 #undef TARGET_CASE_VALUES_THRESHOLD
19625 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19626
19627 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19628 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19629
19630 /* Only the least significant bit is used for initialization guard
19631 variables. */
19632 #undef TARGET_CXX_GUARD_MASK_BIT
19633 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19634
19635 #undef TARGET_C_MODE_FOR_SUFFIX
19636 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19637
19638 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19639 #undef TARGET_DEFAULT_TARGET_FLAGS
19640 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19641 #endif
19642
19643 #undef TARGET_CLASS_MAX_NREGS
19644 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19645
19646 #undef TARGET_BUILTIN_DECL
19647 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19648
19649 #undef TARGET_BUILTIN_RECIPROCAL
19650 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19651
19652 #undef TARGET_C_EXCESS_PRECISION
19653 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19654
19655 #undef TARGET_EXPAND_BUILTIN
19656 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19657
19658 #undef TARGET_EXPAND_BUILTIN_VA_START
19659 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19660
19661 #undef TARGET_FOLD_BUILTIN
19662 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19663
19664 #undef TARGET_FUNCTION_ARG
19665 #define TARGET_FUNCTION_ARG aarch64_function_arg
19666
19667 #undef TARGET_FUNCTION_ARG_ADVANCE
19668 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19669
19670 #undef TARGET_FUNCTION_ARG_BOUNDARY
19671 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19672
19673 #undef TARGET_FUNCTION_ARG_PADDING
19674 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19675
19676 #undef TARGET_GET_RAW_RESULT_MODE
19677 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19678 #undef TARGET_GET_RAW_ARG_MODE
19679 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19680
19681 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19682 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19683
19684 #undef TARGET_FUNCTION_VALUE
19685 #define TARGET_FUNCTION_VALUE aarch64_function_value
19686
19687 #undef TARGET_FUNCTION_VALUE_REGNO_P
19688 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19689
19690 #undef TARGET_GIMPLE_FOLD_BUILTIN
19691 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19692
19693 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19694 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19695
19696 #undef TARGET_INIT_BUILTINS
19697 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19698
19699 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19700 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19701 aarch64_ira_change_pseudo_allocno_class
19702
19703 #undef TARGET_LEGITIMATE_ADDRESS_P
19704 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19705
19706 #undef TARGET_LEGITIMATE_CONSTANT_P
19707 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19708
19709 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19710 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19711 aarch64_legitimize_address_displacement
19712
19713 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19714 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19715
19716 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19717 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19718 aarch64_libgcc_floating_mode_supported_p
19719
19720 #undef TARGET_MANGLE_TYPE
19721 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19722
19723 #undef TARGET_MEMORY_MOVE_COST
19724 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19725
19726 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19727 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19728
19729 #undef TARGET_MUST_PASS_IN_STACK
19730 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19731
19732 /* This target hook should return true if accesses to volatile bitfields
19733 should use the narrowest mode possible. It should return false if these
19734 accesses should use the bitfield container type. */
19735 #undef TARGET_NARROW_VOLATILE_BITFIELD
19736 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19737
19738 #undef TARGET_OPTION_OVERRIDE
19739 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19740
19741 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19742 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19743 aarch64_override_options_after_change
19744
19745 #undef TARGET_OPTION_SAVE
19746 #define TARGET_OPTION_SAVE aarch64_option_save
19747
19748 #undef TARGET_OPTION_RESTORE
19749 #define TARGET_OPTION_RESTORE aarch64_option_restore
19750
19751 #undef TARGET_OPTION_PRINT
19752 #define TARGET_OPTION_PRINT aarch64_option_print
19753
19754 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19755 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19756
19757 #undef TARGET_SET_CURRENT_FUNCTION
19758 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19759
19760 #undef TARGET_PASS_BY_REFERENCE
19761 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19762
19763 #undef TARGET_PREFERRED_RELOAD_CLASS
19764 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19765
19766 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19767 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19768
19769 #undef TARGET_PROMOTED_TYPE
19770 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19771
19772 #undef TARGET_SECONDARY_RELOAD
19773 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19774
19775 #undef TARGET_SHIFT_TRUNCATION_MASK
19776 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19777
19778 #undef TARGET_SETUP_INCOMING_VARARGS
19779 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19780
19781 #undef TARGET_STRUCT_VALUE_RTX
19782 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19783
19784 #undef TARGET_REGISTER_MOVE_COST
19785 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19786
19787 #undef TARGET_RETURN_IN_MEMORY
19788 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19789
19790 #undef TARGET_RETURN_IN_MSB
19791 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19792
19793 #undef TARGET_RTX_COSTS
19794 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19795
19796 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19797 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19798
19799 #undef TARGET_SCHED_ISSUE_RATE
19800 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19801
19802 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19803 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19804 aarch64_sched_first_cycle_multipass_dfa_lookahead
19805
19806 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19807 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19808 aarch64_first_cycle_multipass_dfa_lookahead_guard
19809
19810 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19811 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19812 aarch64_get_separate_components
19813
19814 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19815 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19816 aarch64_components_for_bb
19817
19818 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19819 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19820 aarch64_disqualify_components
19821
19822 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19823 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19824 aarch64_emit_prologue_components
19825
19826 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19827 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19828 aarch64_emit_epilogue_components
19829
19830 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19831 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19832 aarch64_set_handled_components
19833
19834 #undef TARGET_TRAMPOLINE_INIT
19835 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19836
19837 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19838 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19839
19840 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19841 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19842
19843 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19844 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19845 aarch64_builtin_support_vector_misalignment
19846
19847 #undef TARGET_ARRAY_MODE
19848 #define TARGET_ARRAY_MODE aarch64_array_mode
19849
19850 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19851 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19852
19853 #undef TARGET_VECTORIZE_ADD_STMT_COST
19854 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19855
19856 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19857 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19858 aarch64_builtin_vectorization_cost
19859
19860 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19861 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19862
19863 #undef TARGET_VECTORIZE_BUILTINS
19864 #define TARGET_VECTORIZE_BUILTINS
19865
19866 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19867 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19868 aarch64_builtin_vectorized_function
19869
19870 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19871 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19872 aarch64_autovectorize_vector_sizes
19873
19874 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19875 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19876 aarch64_atomic_assign_expand_fenv
19877
19878 /* Section anchor support. */
19879
19880 #undef TARGET_MIN_ANCHOR_OFFSET
19881 #define TARGET_MIN_ANCHOR_OFFSET -256
19882
19883 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19884 byte offset; we can do much more for larger data types, but have no way
19885 to determine the size of the access. We assume accesses are aligned. */
19886 #undef TARGET_MAX_ANCHOR_OFFSET
19887 #define TARGET_MAX_ANCHOR_OFFSET 4095
19888
19889 #undef TARGET_VECTOR_ALIGNMENT
19890 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19891
19892 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19893 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19894 aarch64_vectorize_preferred_vector_alignment
19895 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19896 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19897 aarch64_simd_vector_alignment_reachable
19898
19899 /* vec_perm support. */
19900
19901 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19902 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19903 aarch64_vectorize_vec_perm_const
19904
19905 #undef TARGET_VECTORIZE_GET_MASK_MODE
19906 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19907 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19908 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19909 aarch64_empty_mask_is_expensive
19910 #undef TARGET_PREFERRED_ELSE_VALUE
19911 #define TARGET_PREFERRED_ELSE_VALUE \
19912 aarch64_preferred_else_value
19913
19914 #undef TARGET_INIT_LIBFUNCS
19915 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19916
19917 #undef TARGET_FIXED_CONDITION_CODE_REGS
19918 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19919
19920 #undef TARGET_FLAGS_REGNUM
19921 #define TARGET_FLAGS_REGNUM CC_REGNUM
19922
19923 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19924 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19925
19926 #undef TARGET_ASAN_SHADOW_OFFSET
19927 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19928
19929 #undef TARGET_LEGITIMIZE_ADDRESS
19930 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19931
19932 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19933 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19934
19935 #undef TARGET_CAN_USE_DOLOOP_P
19936 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19937
19938 #undef TARGET_SCHED_ADJUST_PRIORITY
19939 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19940
19941 #undef TARGET_SCHED_MACRO_FUSION_P
19942 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19943
19944 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19945 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19946
19947 #undef TARGET_SCHED_FUSION_PRIORITY
19948 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19949
19950 #undef TARGET_UNSPEC_MAY_TRAP_P
19951 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19952
19953 #undef TARGET_USE_PSEUDO_PIC_REG
19954 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19955
19956 #undef TARGET_PRINT_OPERAND
19957 #define TARGET_PRINT_OPERAND aarch64_print_operand
19958
19959 #undef TARGET_PRINT_OPERAND_ADDRESS
19960 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19961
19962 #undef TARGET_OPTAB_SUPPORTED_P
19963 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19964
19965 #undef TARGET_OMIT_STRUCT_RETURN_REG
19966 #define TARGET_OMIT_STRUCT_RETURN_REG true
19967
19968 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19969 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19970 aarch64_dwarf_poly_indeterminate_value
19971
19972 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
19973 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19974 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19975
19976 #undef TARGET_HARD_REGNO_NREGS
19977 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19978 #undef TARGET_HARD_REGNO_MODE_OK
19979 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19980
19981 #undef TARGET_MODES_TIEABLE_P
19982 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19983
19984 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19985 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19986 aarch64_hard_regno_call_part_clobbered
19987
19988 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19989 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19990 aarch64_remove_extra_call_preserved_regs
19991
19992 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19993 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19994 aarch64_return_call_with_max_clobbers
19995
19996 #undef TARGET_CONSTANT_ALIGNMENT
19997 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19998
19999 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20000 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20001 aarch64_stack_clash_protection_alloca_probe_range
20002
20003 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20004 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20005
20006 #undef TARGET_CAN_CHANGE_MODE_CLASS
20007 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20008
20009 #undef TARGET_SELECT_EARLY_REMAT_MODES
20010 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20011
20012 #undef TARGET_SPECULATION_SAFE_VALUE
20013 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20014
20015 #undef TARGET_ESTIMATED_POLY_VALUE
20016 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20017
20018 #undef TARGET_ATTRIBUTE_TABLE
20019 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20020
20021 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20022 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20023 aarch64_simd_clone_compute_vecsize_and_simdlen
20024
20025 #undef TARGET_SIMD_CLONE_ADJUST
20026 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20027
20028 #undef TARGET_SIMD_CLONE_USABLE
20029 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20030
20031 #undef TARGET_COMP_TYPE_ATTRIBUTES
20032 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20033
20034 #undef TARGET_GET_MULTILIB_ABI_NAME
20035 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20036
20037 #if CHECKING_P
20038 #undef TARGET_RUN_TARGET_SELFTESTS
20039 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20040 #endif /* #if CHECKING_P */
20041
20042 #undef TARGET_ASM_FILE_END
20043 #define TARGET_ASM_FILE_END aarch64_asm_file_end
20044
20045 #undef TARGET_ASM_FUNCTION_EPILOGUE
20046 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
20047
20048 struct gcc_target targetm = TARGET_INITIALIZER;
20049
20050 #include "gt-aarch64.h"
20051