1 /* Machine description for AArch64 architecture.
2    Copyright (C) 2009-2019 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 
77 /* This file should be included last.  */
78 #include "target-def.h"
79 
80 /* Defined for convenience.  */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82 
83 /* Information about a legitimate vector immediate operand.  */
84 struct simd_immediate_info
85 {
86   enum insn_type { MOV, MVN };
87   enum modifier_type { LSL, MSL };
88 
89   simd_immediate_info () {}
90   simd_immediate_info (scalar_float_mode, rtx);
91   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 		       insn_type = MOV, modifier_type = LSL,
93 		       unsigned int = 0);
94   simd_immediate_info (scalar_mode, rtx, rtx);
95 
96   /* The mode of the elements.  */
97   scalar_mode elt_mode;
98 
99   /* The value of each element if all elements are the same, or the
100      first value if the constant is a series.  */
101   rtx value;
102 
103   /* The value of the step if the constant is a series, null otherwise.  */
104   rtx step;
105 
106   /* The instruction to use to move the immediate into a vector.  */
107   insn_type insn;
108 
109   /* The kind of shift modifier to use, and the number of bits to shift.
110      This is (LSL, 0) if no shift is needed.  */
111   modifier_type modifier;
112   unsigned int shift;
113 };
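/* For example, an Advanced SIMD constant whose QImode elements all equal
   0x20 is described as (elt_mode = QImode, value = 0x20, insn = MOV,
   modifier = LSL, shift = 0), i.e. a MOVI with no shift modifier.  */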
114 
115 /* Construct a floating-point immediate in which each element has mode
116    ELT_MODE_IN and value VALUE_IN.  */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119   : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120     modifier (LSL), shift (0)
121 {}
122 
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124    and value VALUE_IN.  The other parameters are as for the structure
125    fields.  */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 		       unsigned HOST_WIDE_INT value_in,
129 		       insn_type insn_in, modifier_type modifier_in,
130 		       unsigned int shift_in)
131   : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132     step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134 
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136    and where element I is equal to VALUE_IN + I * STEP_IN.  */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139   : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140     modifier (LSL), shift (0)
141 {}
142 
143 /* The current code model.  */
144 enum aarch64_code_model aarch64_cmodel;
145 
146 /* The number of 64-bit elements in an SVE vector.  */
147 poly_uint16 aarch64_sve_vg;
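/* For example, VG is 2 when the SVE vector length is 128 bits and 8 when
   it is 512 bits; if the vector length is not fixed at compile time this
   stays a genuinely variable poly_uint16.  */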
148 
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153 
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 						     const_tree,
157 						     machine_mode *, int *,
158 						     bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 							 const_tree type,
166 							 int misalignment,
167 							 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 					    aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172 
173 /* Major revision number of the ARM Architecture implemented by the target.  */
174 unsigned aarch64_architecture_version;
175 
176 /* The processor for which instructions should be scheduled.  */
177 enum aarch64_processor aarch64_tune = cortexa53;
178 
179 /* Mask to specify which instruction scheduling options should be used.  */
180 unsigned long aarch64_tune_flags = 0;
181 
182 /* Global flag for PC relative loads.  */
183 bool aarch64_pcrelative_literal_loads;
184 
185 /* Global flag for whether frame pointer is enabled.  */
186 bool aarch64_use_frame_pointer;
187 
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190 
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193 
194 /* Support for command line parsing of boolean flags in the tuning
195    structures.  */
196 struct aarch64_flag_desc
197 {
198   const char* name;
199   unsigned int flag;
200 };
201 
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203   { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206   { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208   { "all", AARCH64_FUSE_ALL },
209   { NULL, AARCH64_FUSE_NOTHING }
210 };
211 
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213   { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216   { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218   { "all", AARCH64_EXTRA_TUNE_ALL },
219   { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221 
222 /* Tuning parameters.  */
223 
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226     {
227       1, /* hi  */
228       0, /* si  */
229       0, /* di  */
230       1, /* ti  */
231     },
232   0, /* pre_modify  */
233   0, /* post_modify  */
234   0, /* register_offset  */
235   0, /* register_sextend  */
236   0, /* register_zextend  */
237   0 /* imm_offset  */
238 };
239 
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242     {
243       0, /* hi  */
244       0, /* si  */
245       0, /* di  */
246       2, /* ti  */
247     },
248   0, /* pre_modify  */
249   0, /* post_modify  */
250   1, /* register_offset  */
251   1, /* register_sextend  */
252   2, /* register_zextend  */
253   0, /* imm_offset  */
254 };
255 
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258     {
259       1, /* hi  */
260       0, /* si  */
261       0, /* di  */
262       1, /* ti  */
263     },
264   1, /* pre_modify  */
265   1, /* post_modify  */
266   0, /* register_offset  */
267   1, /* register_sextend  */
268   1, /* register_zextend  */
269   0, /* imm_offset  */
270 };
271 
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274     {
275       1, /* hi  */
276       1, /* si  */
277       1, /* di  */
278       2, /* ti  */
279     },
280   0, /* pre_modify  */
281   0, /* post_modify  */
282   2, /* register_offset  */
283   3, /* register_sextend  */
284   3, /* register_zextend  */
285   0, /* imm_offset  */
286 };
287 
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290     {
291       1, /* hi  */
292       0, /* si  */
293       0, /* di  */
294       1, /* ti  */
295     },
296   0, /* pre_modify  */
297   0, /* post_modify  */
298   0, /* register_offset  */
299   1, /* register_sextend  */
300   1, /* register_zextend  */
301   0, /* imm_offset  */
302 };
303 
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306     {
307       1, /* hi  */
308       1, /* si  */
309       1, /* di  */
310       2, /* ti  */
311     },
312   1, /* pre_modify  */
313   1, /* post_modify  */
314   3, /* register_offset  */
315   3, /* register_sextend  */
316   3, /* register_zextend  */
317   2, /* imm_offset  */
318 };
319 
320 static const struct cpu_addrcost_table a64fx_addrcost_table =
321 {
322     {
323       1, /* hi  */
324       1, /* si  */
325       1, /* di  */
326       2, /* ti  */
327     },
328   0, /* pre_modify  */
329   0, /* post_modify  */
330   2, /* register_offset  */
331   3, /* register_sextend  */
332   3, /* register_zextend  */
333   0, /* imm_offset  */
334 };
335 
336 static const struct cpu_regmove_cost generic_regmove_cost =
337 {
338   1, /* GP2GP  */
339   /* Avoid the use of slow int<->fp moves for spilling by setting
340      their cost higher than memmov_cost.  */
341   5, /* GP2FP  */
342   5, /* FP2GP  */
343   2 /* FP2FP  */
344 };
345 
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
347 {
348   1, /* GP2GP  */
349   /* Avoid the use of slow int<->fp moves for spilling by setting
350      their cost higher than memmov_cost.  */
351   5, /* GP2FP  */
352   5, /* FP2GP  */
353   2 /* FP2FP  */
354 };
355 
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
357 {
358   1, /* GP2GP  */
359   /* Avoid the use of slow int<->fp moves for spilling by setting
360      their cost higher than memmov_cost.  */
361   5, /* GP2FP  */
362   5, /* FP2GP  */
363   2 /* FP2FP  */
364 };
365 
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
367 {
368   1, /* GP2GP  */
369   /* Avoid the use of slow int<->fp moves for spilling by setting
370      their cost higher than memmov_cost (actual, 4 and 9).  */
371   9, /* GP2FP  */
372   9, /* FP2GP  */
373   1 /* FP2FP  */
374 };
375 
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
377 {
378   2, /* GP2GP  */
379   2, /* GP2FP  */
380   6, /* FP2GP  */
381   4 /* FP2FP  */
382 };
383 
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
385 {
386   1, /* GP2GP  */
387   /* Avoid the use of slow int<->fp moves for spilling by setting
388      their cost higher than memmov_cost.  */
389   8, /* GP2FP  */
390   8, /* FP2GP  */
391   2 /* FP2FP  */
392 };
393 
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
395 {
396   2, /* GP2GP  */
397   /* Avoid the use of int<->fp moves for spilling.  */
398   6, /* GP2FP  */
399   6, /* FP2GP  */
400   4 /* FP2FP  */
401 };
402 
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
404 {
405   1, /* GP2GP  */
406   /* Avoid the use of int<->fp moves for spilling.  */
407   8, /* GP2FP  */
408   8, /* FP2GP  */
409   4  /* FP2FP  */
410 };
411 
412 static const struct cpu_regmove_cost tsv110_regmove_cost =
413 {
414   1, /* GP2GP  */
415   /* Avoid the use of slow int<->fp moves for spilling by setting
416      their cost higher than memmov_cost.  */
417   2, /* GP2FP  */
418   3, /* FP2GP  */
419   2  /* FP2FP  */
420 };
421 
422 static const struct cpu_regmove_cost a64fx_regmove_cost =
423 {
424   1, /* GP2GP  */
425   /* Avoid the use of slow int<->fp moves for spilling by setting
426      their cost higher than memmov_cost.  */
427   5, /* GP2FP  */
428   7, /* FP2GP  */
429   2 /* FP2FP  */
430 };
431 
432 /* Generic costs for vector insn classes.  */
433 static const struct cpu_vector_cost generic_vector_cost =
434 {
435   1, /* scalar_int_stmt_cost  */
436   1, /* scalar_fp_stmt_cost  */
437   1, /* scalar_load_cost  */
438   1, /* scalar_store_cost  */
439   1, /* vec_int_stmt_cost  */
440   1, /* vec_fp_stmt_cost  */
441   2, /* vec_permute_cost  */
442   1, /* vec_to_scalar_cost  */
443   1, /* scalar_to_vec_cost  */
444   1, /* vec_align_load_cost  */
445   1, /* vec_unalign_load_cost  */
446   1, /* vec_unalign_store_cost  */
447   1, /* vec_store_cost  */
448   3, /* cond_taken_branch_cost  */
449   1 /* cond_not_taken_branch_cost  */
450 };
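/* The per-operation costs in these vector cost tables feed the
   vectorizer's cost model through the builtin_vectorization_cost target
   hook, weighing scalar code against its vectorized equivalent.  */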
451 
452 /* QDF24XX costs for vector insn classes.  */
453 static const struct cpu_vector_cost qdf24xx_vector_cost =
454 {
455   1, /* scalar_int_stmt_cost  */
456   1, /* scalar_fp_stmt_cost  */
457   1, /* scalar_load_cost  */
458   1, /* scalar_store_cost  */
459   1, /* vec_int_stmt_cost  */
460   3, /* vec_fp_stmt_cost  */
461   2, /* vec_permute_cost  */
462   1, /* vec_to_scalar_cost  */
463   1, /* scalar_to_vec_cost  */
464   1, /* vec_align_load_cost  */
465   1, /* vec_unalign_load_cost  */
466   1, /* vec_unalign_store_cost  */
467   1, /* vec_store_cost  */
468   3, /* cond_taken_branch_cost  */
469   1 /* cond_not_taken_branch_cost  */
470 };
471 
472 /* ThunderX costs for vector insn classes.  */
473 static const struct cpu_vector_cost thunderx_vector_cost =
474 {
475   1, /* scalar_int_stmt_cost  */
476   1, /* scalar_fp_stmt_cost  */
477   3, /* scalar_load_cost  */
478   1, /* scalar_store_cost  */
479   4, /* vec_int_stmt_cost  */
480   1, /* vec_fp_stmt_cost  */
481   4, /* vec_permute_cost  */
482   2, /* vec_to_scalar_cost  */
483   2, /* scalar_to_vec_cost  */
484   3, /* vec_align_load_cost  */
485   5, /* vec_unalign_load_cost  */
486   5, /* vec_unalign_store_cost  */
487   1, /* vec_store_cost  */
488   3, /* cond_taken_branch_cost  */
489   3 /* cond_not_taken_branch_cost  */
490 };
491 
492 static const struct cpu_vector_cost tsv110_vector_cost =
493 {
494   1, /* scalar_int_stmt_cost  */
495   1, /* scalar_fp_stmt_cost  */
496   5, /* scalar_load_cost  */
497   1, /* scalar_store_cost  */
498   2, /* vec_int_stmt_cost  */
499   2, /* vec_fp_stmt_cost  */
500   2, /* vec_permute_cost  */
501   3, /* vec_to_scalar_cost  */
502   2, /* scalar_to_vec_cost  */
503   5, /* vec_align_load_cost  */
504   5, /* vec_unalign_load_cost  */
505   1, /* vec_unalign_store_cost  */
506   1, /* vec_store_cost  */
507   1, /* cond_taken_branch_cost  */
508   1 /* cond_not_taken_branch_cost  */
509 };
510 
511 /* Cortex-A57 costs for vector insn classes.  */
512 static const struct cpu_vector_cost cortexa57_vector_cost =
513 {
514   1, /* scalar_int_stmt_cost  */
515   1, /* scalar_fp_stmt_cost  */
516   4, /* scalar_load_cost  */
517   1, /* scalar_store_cost  */
518   2, /* vec_int_stmt_cost  */
519   2, /* vec_fp_stmt_cost  */
520   3, /* vec_permute_cost  */
521   8, /* vec_to_scalar_cost  */
522   8, /* scalar_to_vec_cost  */
523   4, /* vec_align_load_cost  */
524   4, /* vec_unalign_load_cost  */
525   1, /* vec_unalign_store_cost  */
526   1, /* vec_store_cost  */
527   1, /* cond_taken_branch_cost  */
528   1 /* cond_not_taken_branch_cost  */
529 };
530 
531 static const struct cpu_vector_cost exynosm1_vector_cost =
532 {
533   1, /* scalar_int_stmt_cost  */
534   1, /* scalar_fp_stmt_cost  */
535   5, /* scalar_load_cost  */
536   1, /* scalar_store_cost  */
537   3, /* vec_int_stmt_cost  */
538   3, /* vec_fp_stmt_cost  */
539   3, /* vec_permute_cost  */
540   3, /* vec_to_scalar_cost  */
541   3, /* scalar_to_vec_cost  */
542   5, /* vec_align_load_cost  */
543   5, /* vec_unalign_load_cost  */
544   1, /* vec_unalign_store_cost  */
545   1, /* vec_store_cost  */
546   1, /* cond_taken_branch_cost  */
547   1 /* cond_not_taken_branch_cost  */
548 };
549 
550 /* Costs for vector insn classes for X-Gene 1.  */
551 static const struct cpu_vector_cost xgene1_vector_cost =
552 {
553   1, /* scalar_int_stmt_cost  */
554   1, /* scalar_fp_stmt_cost  */
555   5, /* scalar_load_cost  */
556   1, /* scalar_store_cost  */
557   2, /* vec_int_stmt_cost  */
558   2, /* vec_fp_stmt_cost  */
559   2, /* vec_permute_cost  */
560   4, /* vec_to_scalar_cost  */
561   4, /* scalar_to_vec_cost  */
562   10, /* vec_align_load_cost  */
563   10, /* vec_unalign_load_cost  */
564   2, /* vec_unalign_store_cost  */
565   2, /* vec_store_cost  */
566   2, /* cond_taken_branch_cost  */
567   1 /* cond_not_taken_branch_cost  */
568 };
569 
570 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan).  */
571 static const struct cpu_vector_cost thunderx2t99_vector_cost =
572 {
573   1, /* scalar_int_stmt_cost  */
574   6, /* scalar_fp_stmt_cost  */
575   4, /* scalar_load_cost  */
576   1, /* scalar_store_cost  */
577   5, /* vec_int_stmt_cost  */
578   6, /* vec_fp_stmt_cost  */
579   3, /* vec_permute_cost  */
580   6, /* vec_to_scalar_cost  */
581   5, /* scalar_to_vec_cost  */
582   8, /* vec_align_load_cost  */
583   8, /* vec_unalign_load_cost  */
584   4, /* vec_unalign_store_cost  */
585   4, /* vec_store_cost  */
586   2, /* cond_taken_branch_cost  */
587   1  /* cond_not_taken_branch_cost  */
588 };
589 
590 static const struct cpu_vector_cost a64fx_vector_cost =
591 {
592   1, /* scalar_int_stmt_cost  */
593   5, /* scalar_fp_stmt_cost  */
594   4, /* scalar_load_cost  */
595   1, /* scalar_store_cost  */
596   2, /* vec_int_stmt_cost  */
597   5, /* vec_fp_stmt_cost  */
598   3, /* vec_permute_cost  */
599   13, /* vec_to_scalar_cost  */
600   4, /* scalar_to_vec_cost  */
601   6, /* vec_align_load_cost  */
602   6, /* vec_unalign_load_cost  */
603   1, /* vec_unalign_store_cost  */
604   1, /* vec_store_cost  */
605   3, /* cond_taken_branch_cost  */
606   1 /* cond_not_taken_branch_cost  */
607 };
608 
609 /* Generic costs for branch instructions.  */
610 static const struct cpu_branch_cost generic_branch_cost =
611 {
612   1,  /* Predictable.  */
613   3   /* Unpredictable.  */
614 };
615 
616 /* Generic approximation modes.  */
617 static const cpu_approx_modes generic_approx_modes =
618 {
619   AARCH64_APPROX_NONE,	/* division  */
620   AARCH64_APPROX_NONE,	/* sqrt  */
621   AARCH64_APPROX_NONE	/* recip_sqrt  */
622 };
623 
624 /* Approximation modes for Exynos M1.  */
625 static const cpu_approx_modes exynosm1_approx_modes =
626 {
627   AARCH64_APPROX_NONE,	/* division  */
628   AARCH64_APPROX_ALL,	/* sqrt  */
629   AARCH64_APPROX_ALL	/* recip_sqrt  */
630 };
631 
632 /* Approximation modes for X-Gene 1.  */
633 static const cpu_approx_modes xgene1_approx_modes =
634 {
635   AARCH64_APPROX_NONE,	/* division  */
636   AARCH64_APPROX_NONE,	/* sqrt  */
637   AARCH64_APPROX_ALL	/* recip_sqrt  */
638 };
639 
640 /* Generic prefetch settings (which disable prefetch).  */
641 static const cpu_prefetch_tune generic_prefetch_tune =
642 {
643   0,			/* num_slots  */
644   -1,			/* l1_cache_size  */
645   -1,			/* l1_cache_line_size  */
646   -1,			/* l2_cache_size  */
647   true,			/* prefetch_dynamic_strides */
648   -1,			/* minimum_stride */
649   -1			/* default_opt_level  */
650 };
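/* In these prefetch tuning structures a value of -1 means "no preference":
   the corresponding --param keeps its default, and a default_opt_level of
   -1 means software prefetching is not enabled by default at any
   optimization level.  */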
651 
652 static const cpu_prefetch_tune exynosm1_prefetch_tune =
653 {
654   0,			/* num_slots  */
655   -1,			/* l1_cache_size  */
656   64,			/* l1_cache_line_size  */
657   -1,			/* l2_cache_size  */
658   true,			/* prefetch_dynamic_strides */
659   -1,			/* minimum_stride */
660   -1			/* default_opt_level  */
661 };
662 
663 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
664 {
665   4,			/* num_slots  */
666   32,			/* l1_cache_size  */
667   64,			/* l1_cache_line_size  */
668   512,			/* l2_cache_size  */
669   false,		/* prefetch_dynamic_strides */
670   2048,			/* minimum_stride */
671   3			/* default_opt_level  */
672 };
673 
674 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
675 {
676   8,			/* num_slots  */
677   32,			/* l1_cache_size  */
678   128,			/* l1_cache_line_size  */
679   16*1024,		/* l2_cache_size  */
680   true,			/* prefetch_dynamic_strides */
681   -1,			/* minimum_stride */
682   3			/* default_opt_level  */
683 };
684 
685 static const cpu_prefetch_tune thunderx_prefetch_tune =
686 {
687   8,			/* num_slots  */
688   32,			/* l1_cache_size  */
689   128,			/* l1_cache_line_size  */
690   -1,			/* l2_cache_size  */
691   true,			/* prefetch_dynamic_strides */
692   -1,			/* minimum_stride */
693   -1			/* default_opt_level  */
694 };
695 
696 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
697 {
698   8,			/* num_slots  */
699   32,			/* l1_cache_size  */
700   64,			/* l1_cache_line_size  */
701   256,			/* l2_cache_size  */
702   true,			/* prefetch_dynamic_strides */
703   -1,			/* minimum_stride */
704   -1			/* default_opt_level  */
705 };
706 
707 static const cpu_prefetch_tune tsv110_prefetch_tune =
708 {
709   0,                    /* num_slots  */
710   64,                   /* l1_cache_size  */
711   64,                   /* l1_cache_line_size  */
712   512,                  /* l2_cache_size  */
713   true,                 /* prefetch_dynamic_strides */
714   -1,                   /* minimum_stride */
715   -1                    /* default_opt_level  */
716 };
717 
718 static const cpu_prefetch_tune xgene1_prefetch_tune =
719 {
720   8,			/* num_slots  */
721   32,			/* l1_cache_size  */
722   64,			/* l1_cache_line_size  */
723   256,			/* l2_cache_size  */
724   true,                 /* prefetch_dynamic_strides */
725   -1,                   /* minimum_stride */
726   -1			/* default_opt_level  */
727 };
728 
729 static const cpu_prefetch_tune a64fx_prefetch_tune =
730 {
731   8,			/* num_slots  */
732   64,			/* l1_cache_size  */
733   256,			/* l1_cache_line_size  */
734   32768,		/* l2_cache_size  */
735   true,			/* prefetch_dynamic_strides */
736   -1,			/* minimum_stride */
737   -1			/* default_opt_level  */
738 };
739 
740 static const struct tune_params generic_tunings =
741 {
742   &cortexa57_extra_costs,
743   &generic_addrcost_table,
744   &generic_regmove_cost,
745   &generic_vector_cost,
746   &generic_branch_cost,
747   &generic_approx_modes,
748   SVE_NOT_IMPLEMENTED, /* sve_width  */
749   4, /* memmov_cost  */
750   2, /* issue_rate  */
751   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
752   "8",	/* function_align.  */
753   "4",	/* jump_align.  */
754   "8",	/* loop_align.  */
755   2,	/* int_reassoc_width.  */
756   4,	/* fp_reassoc_width.  */
757   1,	/* vec_reassoc_width.  */
758   2,	/* min_div_recip_mul_sf.  */
759   2,	/* min_div_recip_mul_df.  */
760   0,	/* max_case_values.  */
761   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
762   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
763   &generic_prefetch_tune
764 };
765 
766 static const struct tune_params cortexa35_tunings =
767 {
768   &cortexa53_extra_costs,
769   &generic_addrcost_table,
770   &cortexa53_regmove_cost,
771   &generic_vector_cost,
772   &generic_branch_cost,
773   &generic_approx_modes,
774   SVE_NOT_IMPLEMENTED, /* sve_width  */
775   4, /* memmov_cost  */
776   1, /* issue_rate  */
777   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
778    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
779   "16",	/* function_align.  */
780   "4",	/* jump_align.  */
781   "8",	/* loop_align.  */
782   2,	/* int_reassoc_width.  */
783   4,	/* fp_reassoc_width.  */
784   1,	/* vec_reassoc_width.  */
785   2,	/* min_div_recip_mul_sf.  */
786   2,	/* min_div_recip_mul_df.  */
787   0,	/* max_case_values.  */
788   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
789   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
790   &generic_prefetch_tune
791 };
792 
793 static const struct tune_params cortexa53_tunings =
794 {
795   &cortexa53_extra_costs,
796   &generic_addrcost_table,
797   &cortexa53_regmove_cost,
798   &generic_vector_cost,
799   &generic_branch_cost,
800   &generic_approx_modes,
801   SVE_NOT_IMPLEMENTED, /* sve_width  */
802   4, /* memmov_cost  */
803   2, /* issue_rate  */
804   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
805    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
806   "16",	/* function_align.  */
807   "4",	/* jump_align.  */
808   "8",	/* loop_align.  */
809   2,	/* int_reassoc_width.  */
810   4,	/* fp_reassoc_width.  */
811   1,	/* vec_reassoc_width.  */
812   2,	/* min_div_recip_mul_sf.  */
813   2,	/* min_div_recip_mul_df.  */
814   0,	/* max_case_values.  */
815   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
816   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
817   &generic_prefetch_tune
818 };
819 
820 static const struct tune_params cortexa57_tunings =
821 {
822   &cortexa57_extra_costs,
823   &generic_addrcost_table,
824   &cortexa57_regmove_cost,
825   &cortexa57_vector_cost,
826   &generic_branch_cost,
827   &generic_approx_modes,
828   SVE_NOT_IMPLEMENTED, /* sve_width  */
829   4, /* memmov_cost  */
830   3, /* issue_rate  */
831   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
832    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
833   "16",	/* function_align.  */
834   "4",	/* jump_align.  */
835   "8",	/* loop_align.  */
836   2,	/* int_reassoc_width.  */
837   4,	/* fp_reassoc_width.  */
838   1,	/* vec_reassoc_width.  */
839   2,	/* min_div_recip_mul_sf.  */
840   2,	/* min_div_recip_mul_df.  */
841   0,	/* max_case_values.  */
842   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
843   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
844   &generic_prefetch_tune
845 };
846 
847 static const struct tune_params cortexa72_tunings =
848 {
849   &cortexa57_extra_costs,
850   &generic_addrcost_table,
851   &cortexa57_regmove_cost,
852   &cortexa57_vector_cost,
853   &generic_branch_cost,
854   &generic_approx_modes,
855   SVE_NOT_IMPLEMENTED, /* sve_width  */
856   4, /* memmov_cost  */
857   3, /* issue_rate  */
858   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
859    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
860   "16",	/* function_align.  */
861   "4",	/* jump_align.  */
862   "8",	/* loop_align.  */
863   2,	/* int_reassoc_width.  */
864   4,	/* fp_reassoc_width.  */
865   1,	/* vec_reassoc_width.  */
866   2,	/* min_div_recip_mul_sf.  */
867   2,	/* min_div_recip_mul_df.  */
868   0,	/* max_case_values.  */
869   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
870   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
871   &generic_prefetch_tune
872 };
873 
874 static const struct tune_params cortexa73_tunings =
875 {
876   &cortexa57_extra_costs,
877   &generic_addrcost_table,
878   &cortexa57_regmove_cost,
879   &cortexa57_vector_cost,
880   &generic_branch_cost,
881   &generic_approx_modes,
882   SVE_NOT_IMPLEMENTED, /* sve_width  */
883   4, /* memmov_cost.  */
884   2, /* issue_rate.  */
885   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
886    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
887   "16",	/* function_align.  */
888   "4",	/* jump_align.  */
889   "8",	/* loop_align.  */
890   2,	/* int_reassoc_width.  */
891   4,	/* fp_reassoc_width.  */
892   1,	/* vec_reassoc_width.  */
893   2,	/* min_div_recip_mul_sf.  */
894   2,	/* min_div_recip_mul_df.  */
895   0,	/* max_case_values.  */
896   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
897   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
898   &generic_prefetch_tune
899 };
900 
901 
902 
903 static const struct tune_params exynosm1_tunings =
904 {
905   &exynosm1_extra_costs,
906   &exynosm1_addrcost_table,
907   &exynosm1_regmove_cost,
908   &exynosm1_vector_cost,
909   &generic_branch_cost,
910   &exynosm1_approx_modes,
911   SVE_NOT_IMPLEMENTED, /* sve_width  */
912   4,	/* memmov_cost  */
913   3,	/* issue_rate  */
914   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
915   "4",	/* function_align.  */
916   "4",	/* jump_align.  */
917   "4",	/* loop_align.  */
918   2,	/* int_reassoc_width.  */
919   4,	/* fp_reassoc_width.  */
920   1,	/* vec_reassoc_width.  */
921   2,	/* min_div_recip_mul_sf.  */
922   2,	/* min_div_recip_mul_df.  */
923   48,	/* max_case_values.  */
924   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
925   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
926   &exynosm1_prefetch_tune
927 };
928 
929 static const struct tune_params thunderxt88_tunings =
930 {
931   &thunderx_extra_costs,
932   &generic_addrcost_table,
933   &thunderx_regmove_cost,
934   &thunderx_vector_cost,
935   &generic_branch_cost,
936   &generic_approx_modes,
937   SVE_NOT_IMPLEMENTED, /* sve_width  */
938   6, /* memmov_cost  */
939   2, /* issue_rate  */
940   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
941   "8",	/* function_align.  */
942   "8",	/* jump_align.  */
943   "8",	/* loop_align.  */
944   2,	/* int_reassoc_width.  */
945   4,	/* fp_reassoc_width.  */
946   1,	/* vec_reassoc_width.  */
947   2,	/* min_div_recip_mul_sf.  */
948   2,	/* min_div_recip_mul_df.  */
949   0,	/* max_case_values.  */
950   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
951   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
952   &thunderxt88_prefetch_tune
953 };
954 
955 static const struct tune_params thunderx_tunings =
956 {
957   &thunderx_extra_costs,
958   &generic_addrcost_table,
959   &thunderx_regmove_cost,
960   &thunderx_vector_cost,
961   &generic_branch_cost,
962   &generic_approx_modes,
963   SVE_NOT_IMPLEMENTED, /* sve_width  */
964   6, /* memmov_cost  */
965   2, /* issue_rate  */
966   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
967   "8",	/* function_align.  */
968   "8",	/* jump_align.  */
969   "8",	/* loop_align.  */
970   2,	/* int_reassoc_width.  */
971   4,	/* fp_reassoc_width.  */
972   1,	/* vec_reassoc_width.  */
973   2,	/* min_div_recip_mul_sf.  */
974   2,	/* min_div_recip_mul_df.  */
975   0,	/* max_case_values.  */
976   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
977   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
978    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
979   &thunderx_prefetch_tune
980 };
981 
982 static const struct tune_params tsv110_tunings =
983 {
984   &tsv110_extra_costs,
985   &tsv110_addrcost_table,
986   &tsv110_regmove_cost,
987   &tsv110_vector_cost,
988   &generic_branch_cost,
989   &generic_approx_modes,
990   SVE_NOT_IMPLEMENTED, /* sve_width  */
991   4,    /* memmov_cost  */
992   4,    /* issue_rate  */
993   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
994    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
995   "16", /* function_align.  */
996   "4",  /* jump_align.  */
997   "8",  /* loop_align.  */
998   2,    /* int_reassoc_width.  */
999   4,    /* fp_reassoc_width.  */
1000   1,    /* vec_reassoc_width.  */
1001   2,    /* min_div_recip_mul_sf.  */
1002   2,    /* min_div_recip_mul_df.  */
1003   0,    /* max_case_values.  */
1004   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1005   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
1006   &tsv110_prefetch_tune
1007 };
1008 
1009 static const struct tune_params xgene1_tunings =
1010 {
1011   &xgene1_extra_costs,
1012   &xgene1_addrcost_table,
1013   &xgene1_regmove_cost,
1014   &xgene1_vector_cost,
1015   &generic_branch_cost,
1016   &xgene1_approx_modes,
1017   SVE_NOT_IMPLEMENTED, /* sve_width  */
1018   6, /* memmov_cost  */
1019   4, /* issue_rate  */
1020   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1021   "16",	/* function_align.  */
1022   "16",	/* jump_align.  */
1023   "16",	/* loop_align.  */
1024   2,	/* int_reassoc_width.  */
1025   4,	/* fp_reassoc_width.  */
1026   1,	/* vec_reassoc_width.  */
1027   2,	/* min_div_recip_mul_sf.  */
1028   2,	/* min_div_recip_mul_df.  */
1029   17,	/* max_case_values.  */
1030   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1031   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1032   &xgene1_prefetch_tune
1033 };
1034 
1035 static const struct tune_params emag_tunings =
1036 {
1037   &xgene1_extra_costs,
1038   &xgene1_addrcost_table,
1039   &xgene1_regmove_cost,
1040   &xgene1_vector_cost,
1041   &generic_branch_cost,
1042   &xgene1_approx_modes,
1043   SVE_NOT_IMPLEMENTED, /* sve_width  */
1044   6, /* memmov_cost  */
1045   4, /* issue_rate  */
1046   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1047   "16",	/* function_align.  */
1048   "16",	/* jump_align.  */
1049   "16",	/* loop_align.  */
1050   2,	/* int_reassoc_width.  */
1051   4,	/* fp_reassoc_width.  */
1052   1,	/* vec_reassoc_width.  */
1053   2,	/* min_div_recip_mul_sf.  */
1054   2,	/* min_div_recip_mul_df.  */
1055   17,	/* max_case_values.  */
1056   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1057   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1058   &xgene1_prefetch_tune
1059 };
1060 
1061 static const struct tune_params qdf24xx_tunings =
1062 {
1063   &qdf24xx_extra_costs,
1064   &qdf24xx_addrcost_table,
1065   &qdf24xx_regmove_cost,
1066   &qdf24xx_vector_cost,
1067   &generic_branch_cost,
1068   &generic_approx_modes,
1069   SVE_NOT_IMPLEMENTED, /* sve_width  */
1070   4, /* memmov_cost  */
1071   4, /* issue_rate  */
1072   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1073    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1074   "16",	/* function_align.  */
1075   "8",	/* jump_align.  */
1076   "16",	/* loop_align.  */
1077   2,	/* int_reassoc_width.  */
1078   4,	/* fp_reassoc_width.  */
1079   1,	/* vec_reassoc_width.  */
1080   2,	/* min_div_recip_mul_sf.  */
1081   2,	/* min_div_recip_mul_df.  */
1082   0,	/* max_case_values.  */
1083   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1084   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1085   &qdf24xx_prefetch_tune
1086 };
1087 
1088 /* Tuning structure for the Qualcomm Saphira core.  Default to generic values
1089    for now.  */
1090 static const struct tune_params saphira_tunings =
1091 {
1092   &generic_extra_costs,
1093   &generic_addrcost_table,
1094   &generic_regmove_cost,
1095   &generic_vector_cost,
1096   &generic_branch_cost,
1097   &generic_approx_modes,
1098   SVE_NOT_IMPLEMENTED, /* sve_width  */
1099   4, /* memmov_cost  */
1100   4, /* issue_rate  */
1101   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1102    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1103   "16",	/* function_align.  */
1104   "8",	/* jump_align.  */
1105   "16",	/* loop_align.  */
1106   2,	/* int_reassoc_width.  */
1107   4,	/* fp_reassoc_width.  */
1108   1,	/* vec_reassoc_width.  */
1109   2,	/* min_div_recip_mul_sf.  */
1110   2,	/* min_div_recip_mul_df.  */
1111   0,	/* max_case_values.  */
1112   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1113   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
1114   &generic_prefetch_tune
1115 };
1116 
1117 static const struct tune_params thunderx2t99_tunings =
1118 {
1119   &thunderx2t99_extra_costs,
1120   &thunderx2t99_addrcost_table,
1121   &thunderx2t99_regmove_cost,
1122   &thunderx2t99_vector_cost,
1123   &generic_branch_cost,
1124   &generic_approx_modes,
1125   SVE_NOT_IMPLEMENTED, /* sve_width  */
1126   4, /* memmov_cost.  */
1127   4, /* issue_rate.  */
1128   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1129    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
1130   "16",	/* function_align.  */
1131   "8",	/* jump_align.  */
1132   "16",	/* loop_align.  */
1133   3,	/* int_reassoc_width.  */
1134   2,	/* fp_reassoc_width.  */
1135   2,	/* vec_reassoc_width.  */
1136   2,	/* min_div_recip_mul_sf.  */
1137   2,	/* min_div_recip_mul_df.  */
1138   0,	/* max_case_values.  */
1139   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1140   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1141   &thunderx2t99_prefetch_tune
1142 };
1143 
1144 static const struct tune_params neoversen1_tunings =
1145 {
1146   &cortexa57_extra_costs,
1147   &generic_addrcost_table,
1148   &generic_regmove_cost,
1149   &cortexa57_vector_cost,
1150   &generic_branch_cost,
1151   &generic_approx_modes,
1152   SVE_NOT_IMPLEMENTED, /* sve_width  */
1153   4, /* memmov_cost  */
1154   3, /* issue_rate  */
1155   AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
1156   "32:16",	/* function_align.  */
1157   "32:16",	/* jump_align.  */
1158   "32:16",	/* loop_align.  */
1159   2,	/* int_reassoc_width.  */
1160   4,	/* fp_reassoc_width.  */
1161   2,	/* vec_reassoc_width.  */
1162   2,	/* min_div_recip_mul_sf.  */
1163   2,	/* min_div_recip_mul_df.  */
1164   0,	/* max_case_values.  */
1165   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1166   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1167   &generic_prefetch_tune
1168 };
1169 
1170 static const struct tune_params neoversev1_tunings =
1171 {
1172   &cortexa57_extra_costs,
1173   &generic_addrcost_table,
1174   &generic_regmove_cost,
1175   &cortexa57_vector_cost,
1176   &generic_branch_cost,
1177   &generic_approx_modes,
1178   SVE_256, /* sve_width  */
1179   4, /* memmov_cost  */
1180   3, /* issue_rate  */
1181   AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
1182   "32:16",      /* function_align.  */
1183   "32:16",      /* jump_align.  */
1184   "32:16",      /* loop_align.  */
1185   2,    /* int_reassoc_width.  */
1186   4,    /* fp_reassoc_width.  */
1187   2,    /* vec_reassoc_width.  */
1188   2,    /* min_div_recip_mul_sf.  */
1189   2,    /* min_div_recip_mul_df.  */
1190   0,    /* max_case_values.  */
1191   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1192   (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC),    /* tune_flags.  */
1193   &generic_prefetch_tune
1194 };
1195 
1196 static const struct tune_params neoversen2_tunings =
1197 {
1198   &cortexa57_extra_costs,
1199   &generic_addrcost_table,
1200   &generic_regmove_cost,
1201   &cortexa57_vector_cost,
1202   &generic_branch_cost,
1203   &generic_approx_modes,
1204   SVE_128, /* sve_width  */
1205   4, /* memmov_cost  */
1206   3, /* issue_rate  */
1207   AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
1208   "32:16",      /* function_align.  */
1209   "32:16",      /* jump_align.  */
1210   "32:16",      /* loop_align.  */
1211   2,    /* int_reassoc_width.  */
1212   4,    /* fp_reassoc_width.  */
1213   2,    /* vec_reassoc_width.  */
1214   2,    /* min_div_recip_mul_sf.  */
1215   2,    /* min_div_recip_mul_df.  */
1216   0,    /* max_case_values.  */
1217   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1218   (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC),    /* tune_flags.  */
1219   &generic_prefetch_tune
1220 };
1221 
1222 static const struct tune_params a64fx_tunings =
1223 {
1224   &a64fx_extra_costs,
1225   &a64fx_addrcost_table,
1226   &a64fx_regmove_cost,
1227   &a64fx_vector_cost,
1228   &generic_branch_cost,
1229   &generic_approx_modes,
1230   SVE_512, /* sve_width  */
1231   4, /* memmov_cost  */
1232   7, /* issue_rate  */
1233   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1234   "32",	/* function_align.  */
1235   "16",	/* jump_align.  */
1236   "32",	/* loop_align.  */
1237   4,	/* int_reassoc_width.  */
1238   2,	/* fp_reassoc_width.  */
1239   2,	/* vec_reassoc_width.  */
1240   2,	/* min_div_recip_mul_sf.  */
1241   2,	/* min_div_recip_mul_df.  */
1242   0,	/* max_case_values.  */
1243   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1244   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1245   &a64fx_prefetch_tune
1246 };
1247 
1248 /* Support for fine-grained override of the tuning structures.  */
1249 struct aarch64_tuning_override_function
1250 {
1251   const char* name;
1252   void (*parse_override)(const char*, struct tune_params*);
1253 };
1254 
1255 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1256 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1257 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1258 
1259 static const struct aarch64_tuning_override_function
1260 aarch64_tuning_override_functions[] =
1261 {
1262   { "fuse", aarch64_parse_fuse_string },
1263   { "tune", aarch64_parse_tune_string },
1264   { "sve_width", aarch64_parse_sve_width_string },
1265   { NULL, NULL }
1266 };
1267 
1268 /* A processor implementing AArch64.  */
1269 struct processor
1270 {
1271   const char *const name;
1272   enum aarch64_processor ident;
1273   enum aarch64_processor sched_core;
1274   enum aarch64_arch arch;
1275   unsigned architecture_version;
1276   const unsigned long flags;
1277   const struct tune_params *const tune;
1278 };
1279 
1280 /* Architectures implementing AArch64.  */
1281 static const struct processor all_architectures[] =
1282 {
1283 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1284   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1285 #include "aarch64-arches.def"
1286   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1287 };
1288 
1289 /* Processor cores implementing AArch64.  */
1290 static const struct processor all_cores[] =
1291 {
1292 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1293   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
1294   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
1295   FLAGS, &COSTS##_tunings},
1296 #include "aarch64-cores.def"
1297   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1298     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1299   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1300 };
1301 
1302 
1303 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1304    handling code or by target attributes.  */
1305 static const struct processor *selected_arch;
1306 static const struct processor *selected_cpu;
1307 static const struct processor *selected_tune;
1308 
1309 /* The current tuning set.  */
1310 struct tune_params aarch64_tune_params = generic_tunings;
1311 
1312 /* Table of machine attributes.  */
1313 static const struct attribute_spec aarch64_attribute_table[] =
1314 {
1315   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1316        affects_type_identity, handler, exclude } */
1317   { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,  NULL, NULL },
1318   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
1319 };
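/* The "aarch64_vector_pcs" attribute selects the vector variant of the
   AAPCS64 procedure call standard, under which the callee preserves
   q8-q23 in full rather than only the low 64 bits of v8-v15.  */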
1320 
1321 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1322 
1323 /* An ISA extension in the co-processor and main instruction set space.  */
1324 struct aarch64_option_extension
1325 {
1326   const char *const name;
1327   const unsigned long flags_on;
1328   const unsigned long flags_off;
1329 };
1330 
1331 typedef enum aarch64_cond_code
1332 {
1333   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1334   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1335   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1336 }
1337 aarch64_cc;
1338 
1339 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
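/* This works because the enumeration above lists each condition code next
   to its inverse (EQ/NE, CS/CC, MI/PL, ...), so flipping the low bit of
   the encoding gives the inverse condition.  */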
1340 
1341 struct aarch64_branch_protect_type
1342 {
1343   /* The type's name that the user passes to the branch-protection option
1344     string.  */
1345   const char* name;
1346   /* Function to handle the protection type and set global variables.
1347     First argument is the string token corresponding with this type and the
1348     second argument is the next token in the option string.
1349     Return values:
1350     * AARCH64_PARSE_OK: Handling was successful.
1351     * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1352       caller should print an error.
1353     * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1354       prints its own error.  */
1355   enum aarch64_parse_opt_result (*handler)(char*, char*);
1356   /* A list of types that can follow this type in the option string.  */
1357   const aarch64_branch_protect_type* subtypes;
1358   unsigned int num_subtypes;
1359 };
1360 
1361 static enum aarch64_parse_opt_result
1362 aarch64_handle_no_branch_protection (char* str, char* rest)
1363 {
1364   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1365   aarch64_enable_bti = 0;
1366   if (rest)
1367     {
1368       error ("unexpected %<%s%> after %<%s%>", rest, str);
1369       return AARCH64_PARSE_INVALID_FEATURE;
1370     }
1371   return AARCH64_PARSE_OK;
1372 }
1373 
1374 static enum aarch64_parse_opt_result
1375 aarch64_handle_standard_branch_protection (char* str, char* rest)
1376 {
1377   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1378   aarch64_enable_bti = 1;
1379   if (rest)
1380     {
1381       error ("unexpected %<%s%> after %<%s%>", rest, str);
1382       return AARCH64_PARSE_INVALID_FEATURE;
1383     }
1384   return AARCH64_PARSE_OK;
1385 }
1386 
1387 static enum aarch64_parse_opt_result
1388 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1389 				    char* rest ATTRIBUTE_UNUSED)
1390 {
1391   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1392   return AARCH64_PARSE_OK;
1393 }
1394 
1395 static enum aarch64_parse_opt_result
1396 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1397 			      char* rest ATTRIBUTE_UNUSED)
1398 {
1399   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1400   return AARCH64_PARSE_OK;
1401 }
1402 
1403 static enum aarch64_parse_opt_result
1404 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1405 				    char* rest ATTRIBUTE_UNUSED)
1406 {
1407   aarch64_enable_bti = 1;
1408   return AARCH64_PARSE_OK;
1409 }
1410 
1411 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1412   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1413   { NULL, NULL, NULL, 0 }
1414 };
1415 
1416 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1417   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1418   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1419   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1420     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1421   { "bti", aarch64_handle_bti_protection, NULL, 0 },
1422   { NULL, NULL, NULL, 0 }
1423 };
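/* As the handlers above show, "standard" currently behaves like
   "pac-ret+bti": return-address signing for non-leaf functions plus BTI
   landing pads.  */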
1424 
1425 /* The condition codes of the processor, and the inverse function.  */
1426 static const char * const aarch64_condition_codes[] =
1427 {
1428   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1429   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1430 };
1431 
1432 /* Generate code to enable conditional branches in functions over 1 MiB.  */
1433 const char *
1434 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1435 			const char * branch_format)
1436 {
1437     rtx_code_label * tmp_label = gen_label_rtx ();
1438     char label_buf[256];
1439     char buffer[128];
1440     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1441 				 CODE_LABEL_NUMBER (tmp_label));
1442     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1443     rtx dest_label = operands[pos_label];
1444     operands[pos_label] = tmp_label;
1445 
1446     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1447     output_asm_insn (buffer, operands);
1448 
1449     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1450     operands[pos_label] = dest_label;
1451     output_asm_insn (buffer, operands);
1452     return "";
1453 }
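/* The sequence emitted above is a short-range branch (BRANCH_FORMAT, with
   the sense of the condition inverted by the caller) over an unconditional
   "b" to the original destination, followed by the local label.  */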
1454 
1455 void
1456 aarch64_err_no_fpadvsimd (machine_mode mode)
1457 {
1458   if (TARGET_GENERAL_REGS_ONLY)
1459     if (FLOAT_MODE_P (mode))
1460       error ("%qs is incompatible with the use of floating-point types",
1461 	     "-mgeneral-regs-only");
1462     else
1463       error ("%qs is incompatible with the use of vector types",
1464 	     "-mgeneral-regs-only");
1465   else
1466     if (FLOAT_MODE_P (mode))
1467       error ("%qs feature modifier is incompatible with the use of"
1468 	     " floating-point types", "+nofp");
1469     else
1470       error ("%qs feature modifier is incompatible with the use of"
1471 	     " vector types", "+nofp");
1472 }
1473 
1474 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1475    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1476    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1477    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1478    and GENERAL_REGS is lower than the memory cost (in this case the best class
1479    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1480    cost results in bad allocations with many redundant int<->FP moves, which
1481    are expensive on various cores.
1482    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1483    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
1484    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
1485    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
1486    The result of this is that it is no longer inefficient to have a higher
1487    memory move cost than the register move cost.
1488 */
1489 
1490 static reg_class_t
1491 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1492 					 reg_class_t best_class)
1493 {
1494   machine_mode mode;
1495 
1496   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1497       || !reg_class_subset_p (FP_REGS, allocno_class))
1498     return allocno_class;
1499 
1500   if (!reg_class_subset_p (GENERAL_REGS, best_class)
1501       || !reg_class_subset_p (FP_REGS, best_class))
1502     return best_class;
1503 
1504   mode = PSEUDO_REGNO_MODE (regno);
1505   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1506 }
1507 
1508 static unsigned int
1509 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1510 {
1511   if (GET_MODE_UNIT_SIZE (mode) == 4)
1512     return aarch64_tune_params.min_div_recip_mul_sf;
1513   return aarch64_tune_params.min_div_recip_mul_df;
1514 }
1515 
1516 /* Return the reassociation width of treeop OPC with mode MODE.  */
1517 static int
1518 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1519 {
1520   if (VECTOR_MODE_P (mode))
1521     return aarch64_tune_params.vec_reassoc_width;
1522   if (INTEGRAL_MODE_P (mode))
1523     return aarch64_tune_params.int_reassoc_width;
1524   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1525   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1526     return aarch64_tune_params.fp_reassoc_width;
1527   return 1;
1528 }
1529 
1530 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1531 unsigned
1532 aarch64_dbx_register_number (unsigned regno)
1533 {
1534    if (GP_REGNUM_P (regno))
1535      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1536    else if (regno == SP_REGNUM)
1537      return AARCH64_DWARF_SP;
1538    else if (FP_REGNUM_P (regno))
1539      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1540    else if (PR_REGNUM_P (regno))
1541      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1542    else if (regno == VG_REGNUM)
1543      return AARCH64_DWARF_VG;
1544 
1545    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1546       equivalent DWARF register.  */
1547    return DWARF_FRAME_REGISTERS;
1548 }
1549 
1550 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1551 static bool
1552 aarch64_advsimd_struct_mode_p (machine_mode mode)
1553 {
1554   return (TARGET_SIMD
1555 	  && (mode == OImode || mode == CImode || mode == XImode));
1556 }
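/* OImode, CImode and XImode are the 256-, 384- and 512-bit opaque integer
   modes used for the register tuples of the Advanced SIMD structure
   loads and stores (LD2/ST2, LD3/ST3 and LD4/ST4).  */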
1557 
1558 /* Return true if MODE is an SVE predicate mode.  */
1559 static bool
1560 aarch64_sve_pred_mode_p (machine_mode mode)
1561 {
1562   return (TARGET_SVE
1563 	  && (mode == VNx16BImode
1564 	      || mode == VNx8BImode
1565 	      || mode == VNx4BImode
1566 	      || mode == VNx2BImode));
1567 }
1568 
1569 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1570 const unsigned int VEC_ADVSIMD  = 1;
1571 const unsigned int VEC_SVE_DATA = 2;
1572 const unsigned int VEC_SVE_PRED = 4;
1573 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1574    a structure of 2, 3 or 4 vectors.  */
1575 const unsigned int VEC_STRUCT   = 8;
1576 /* Useful combinations of the above.  */
1577 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1578 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
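/* For example, on a target with the relevant features enabled, V4SImode
   classifies as VEC_ADVSIMD, VNx4SImode as VEC_SVE_DATA, VNx4BImode as
   VEC_SVE_PRED and OImode (an LD2/ST2 tuple) as VEC_ADVSIMD | VEC_STRUCT.  */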
1579 
1580 /* Return a set of flags describing the vector properties of mode MODE.
1581    Ignore modes that are not supported by the current target.  */
1582 static unsigned int
1583 aarch64_classify_vector_mode (machine_mode mode)
1584 {
1585   if (aarch64_advsimd_struct_mode_p (mode))
1586     return VEC_ADVSIMD | VEC_STRUCT;
1587 
1588   if (aarch64_sve_pred_mode_p (mode))
1589     return VEC_SVE_PRED;
1590 
1591   scalar_mode inner = GET_MODE_INNER (mode);
1592   if (VECTOR_MODE_P (mode)
1593       && (inner == QImode
1594 	  || inner == HImode
1595 	  || inner == HFmode
1596 	  || inner == SImode
1597 	  || inner == SFmode
1598 	  || inner == DImode
1599 	  || inner == DFmode))
1600     {
1601       if (TARGET_SVE)
1602 	{
1603 	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1604 	    return VEC_SVE_DATA;
1605 	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1606 	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1607 	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1608 	    return VEC_SVE_DATA | VEC_STRUCT;
1609 	}
1610 
1611       /* This includes V1DF but not V1DI (which doesn't exist).  */
1612       if (TARGET_SIMD
1613 	  && (known_eq (GET_MODE_BITSIZE (mode), 64)
1614 	      || known_eq (GET_MODE_BITSIZE (mode), 128)))
1615 	return VEC_ADVSIMD;
1616     }
1617 
1618   return 0;
1619 }
1620 
1621 /* Return true if MODE is any of the data vector modes, including
1622    structure modes.  */
1623 static bool
1624 aarch64_vector_data_mode_p (machine_mode mode)
1625 {
1626   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1627 }
1628 
1629 /* Return true if MODE is an SVE data vector mode; either a single vector
1630    or a structure of vectors.  */
1631 static bool
1632 aarch64_sve_data_mode_p (machine_mode mode)
1633 {
1634   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1635 }
1636 
1637 /* Implement target hook TARGET_ARRAY_MODE.  */
1638 static opt_machine_mode
1639 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1640 {
1641   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1642       && IN_RANGE (nelems, 2, 4))
1643     return mode_for_vector (GET_MODE_INNER (mode),
1644 			    GET_MODE_NUNITS (mode) * nelems);
1645 
1646   return opt_machine_mode ();
1647 }
1648 
1649 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1650 static bool
1651 aarch64_array_mode_supported_p (machine_mode mode,
1652 				unsigned HOST_WIDE_INT nelems)
1653 {
1654   if (TARGET_SIMD
1655       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1656 	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
1657       && (nelems >= 2 && nelems <= 4))
1658     return true;
1659 
1660   return false;
1661 }
1662 
1663 /* Return the SVE predicate mode to use for elements that have
1664    ELEM_NBYTES bytes, if such a mode exists.  */
1665 
1666 opt_machine_mode
1667 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1668 {
1669   if (TARGET_SVE)
1670     {
1671       if (elem_nbytes == 1)
1672 	return VNx16BImode;
1673       if (elem_nbytes == 2)
1674 	return VNx8BImode;
1675       if (elem_nbytes == 4)
1676 	return VNx4BImode;
1677       if (elem_nbytes == 8)
1678 	return VNx2BImode;
1679     }
1680   return opt_machine_mode ();
1681 }
1682 
1683 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1684 
1685 static opt_machine_mode
1686 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1687 {
1688   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1689     {
1690       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1691       machine_mode pred_mode;
1692       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1693 	return pred_mode;
1694     }
1695 
1696   return default_get_mask_mode (nunits, nbytes);
1697 }
1698 
1699 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
1700    prefer to use the first arithmetic operand as the else value if
1701    the else value doesn't matter, since that exactly matches the SVE
1702    destructive merging form.  For ternary operations we could either
1703    pick the first operand and use FMAD-like instructions or the last
1704    operand and use FMLA-like instructions; the latter seems more
1705    natural.  */
1706 
1707 static tree
1708 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1709 {
1710   return nops == 3 ? ops[2] : ops[0];
1711 }
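/* For example, for a fused multiply-add A * B + C with OPS == { A, B, C },
   returning C matches SVE FMLA, whose inactive lanes keep the accumulator
   value, so no extra select or prefix move should be needed.  */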
1712 
1713 /* Implement TARGET_HARD_REGNO_NREGS.  */
1714 
1715 static unsigned int
1716 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1717 {
1718   /* ??? Logically we should only need to provide a value when
1719      HARD_REGNO_MODE_OK says that the combination is valid,
1720      but at the moment we need to handle all modes.  Just ignore
1721      any runtime parts for registers that can't store them.  */
1722   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1723   switch (aarch64_regno_regclass (regno))
1724     {
1725     case FP_REGS:
1726     case FP_LO_REGS:
1727       if (aarch64_sve_data_mode_p (mode))
1728 	return exact_div (GET_MODE_SIZE (mode),
1729 			  BYTES_PER_SVE_VECTOR).to_constant ();
1730       return CEIL (lowest_size, UNITS_PER_VREG);
1731     case PR_REGS:
1732     case PR_LO_REGS:
1733     case PR_HI_REGS:
1734       return 1;
1735     default:
1736       return CEIL (lowest_size, UNITS_PER_WORD);
1737     }
1738   gcc_unreachable ();
1739 }
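/* For example: TImode needs two GP registers (CEIL (16, 8)); V4SImode and
   a single SVE data vector such as VNx4SImode each occupy one FP/vector
   register; any predicate mode occupies a single predicate register.  */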
1740 
1741 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1742 
1743 static bool
1744 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1745 {
1746   if (GET_MODE_CLASS (mode) == MODE_CC)
1747     return regno == CC_REGNUM;
1748 
1749   if (regno == VG_REGNUM)
1750     /* This must have the same size as _Unwind_Word.  */
1751     return mode == DImode;
1752 
1753   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1754   if (vec_flags & VEC_SVE_PRED)
1755     return PR_REGNUM_P (regno);
1756 
1757   if (PR_REGNUM_P (regno))
1758     return 0;
1759 
1760   if (regno == SP_REGNUM)
1761     /* The purpose of comparing with ptr_mode is to support the
1762        global register variable associated with the stack pointer
1763        register via the syntax of asm ("wsp") in ILP32.  */
1764     return mode == Pmode || mode == ptr_mode;
1765 
1766   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1767     return mode == Pmode;
1768 
1769   if (GP_REGNUM_P (regno))
1770     {
1771       if (known_le (GET_MODE_SIZE (mode), 8))
1772 	return true;
1773       else if (known_le (GET_MODE_SIZE (mode), 16))
1774 	return (regno & 1) == 0;
1775     }
1776   else if (FP_REGNUM_P (regno))
1777     {
1778       if (vec_flags & VEC_STRUCT)
1779 	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1780       else
1781 	return !VECTOR_MODE_P (mode) || vec_flags != 0;
1782     }
1783 
1784   return false;
1785 }
1786 
1787 /* Return true if this is a definition of a vectorized simd function.  */
1788 
1789 static bool
1790 aarch64_simd_decl_p (tree fndecl)
1791 {
1792   tree fntype;
1793 
1794   if (fndecl == NULL)
1795     return false;
1796   fntype = TREE_TYPE (fndecl);
1797   if (fntype == NULL)
1798     return false;
1799 
1800   /* Functions with the aarch64_vector_pcs attribute use the simd ABI.  */
1801   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1802     return true;
1803 
1804   return false;
1805 }
1806 
1807 /* Return the mode a register save/restore should use.  DImode for integer
1808    registers, DFmode for FP registers in non-SIMD functions (they only save
1809    the bottom half of a 128-bit register), or TFmode for FP registers in
1810    SIMD functions.  */
1811 
1812 static machine_mode
1813 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1814 {
1815   return GP_REGNUM_P (regno)
1816 	   ? E_DImode
1817 	   : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1818 }
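/* For example, a callee-saved V10 is saved and restored as TFmode (the
   full 128 bits) in a function using the aarch64_vector_pcs ABI, but only
   as DFmode (the low 64 bits) in a function using the base PCS.  */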
1819 
1820 /* Return true if the instruction is a call to a SIMD function, false
1821    if it is not a SIMD function or if we do not know anything about
1822    the function.  */
1823 
1824 static bool
1825 aarch64_simd_call_p (rtx_insn *insn)
1826 {
1827   rtx symbol;
1828   rtx call;
1829   tree fndecl;
1830 
1831   gcc_assert (CALL_P (insn));
1832   call = get_call_rtx_from (insn);
1833   symbol = XEXP (XEXP (call, 0), 0);
1834   if (GET_CODE (symbol) != SYMBOL_REF)
1835     return false;
1836   fndecl = SYMBOL_REF_DECL (symbol);
1837   if (!fndecl)
1838     return false;
1839 
1840   return aarch64_simd_decl_p (fndecl);
1841 }
1842 
1843 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS.  If INSN calls
1844    a function that uses the SIMD ABI, take advantage of the extra
1845    call-preserved registers that the ABI provides.  */
1846 
1847 void
1848 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1849 					  HARD_REG_SET *return_set)
1850 {
1851   if (aarch64_simd_call_p (insn))
1852     {
1853       for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1854 	if (FP_SIMD_SAVED_REGNUM_P (regno))
1855 	  CLEAR_HARD_REG_BIT (*return_set, regno);
1856     }
1857 }
1858 
1859 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1860    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1861    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1862 
1863 static bool
1864 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1865 					machine_mode mode)
1866 {
1867   bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1868   return FP_REGNUM_P (regno)
1869 	 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1870 }
1871 
1872 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS.  */
1873 
1874 rtx_insn *
1875 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1876 {
1877   gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1878 
1879   if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1880     return call_1;
1881   else
1882     return call_2;
1883 }
1884 
1885 /* Implement REGMODE_NATURAL_SIZE.  */
1886 poly_uint64
1887 aarch64_regmode_natural_size (machine_mode mode)
1888 {
1889   /* The natural size for SVE data modes is one SVE data vector,
1890      and similarly for predicates.  We can't independently modify
1891      anything smaller than that.  */
1892   /* ??? For now, only do this for variable-width SVE registers.
1893      Doing it for constant-sized registers breaks lower-subreg.c.  */
1894   /* ??? And once that's fixed, we should probably have similar
1895      code for Advanced SIMD.  */
1896   if (!aarch64_sve_vg.is_constant ())
1897     {
1898       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1899       if (vec_flags & VEC_SVE_PRED)
1900 	return BYTES_PER_SVE_PRED;
1901       if (vec_flags & VEC_SVE_DATA)
1902 	return BYTES_PER_SVE_VECTOR;
1903     }
1904   return UNITS_PER_WORD;
1905 }
1906 
1907 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1908 machine_mode
1909 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1910 				     machine_mode mode)
1911 {
1912   /* The predicate mode determines which bits are significant and
1913      which are "don't care".  Decreasing the number of lanes would
1914      lose data while increasing the number of lanes would make bits
1915      unnecessarily significant.  */
1916   if (PR_REGNUM_P (regno))
1917     return mode;
1918   if (known_ge (GET_MODE_SIZE (mode), 4))
1919     return mode;
1920   else
1921     return SImode;
1922 }
1923 
1924 /* Return true if I's bits are consecutive ones from the MSB.  */
1925 bool
1926 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1927 {
1928   return exact_log2 (-i) != HOST_WIDE_INT_M1;
1929 }
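/* Worked examples: i = 0xffffffffffff0000 gives -i = 0x10000, a power of
   two, so the function returns true; i = 0xffff0000ffff0000 gives a -i
   that is not a power of two, so it returns false; i = 0 returns false
   because exact_log2 (0) is -1.  */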
1930 
1931 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1932    that strcpy from constants will be faster.  */
1933 
1934 static HOST_WIDE_INT
1935 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1936 {
1937   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1938     return MAX (align, BITS_PER_WORD);
1939   return align;
1940 }
1941 
1942 /* Return true if calls to DECL should be treated as
1943    long-calls (i.e. called via a register).  */
1944 static bool
1945 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1946 {
1947   return false;
1948 }
1949 
1950 /* Return true if calls to symbol-ref SYM should be treated as
1951    long-calls (i.e. called via a register).  */
1952 bool
1953 aarch64_is_long_call_p (rtx sym)
1954 {
1955   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1956 }
1957 
1958 /* Return true if calls to symbol-ref SYM should not go through
1959    plt stubs.  */
1960 
1961 bool
1962 aarch64_is_noplt_call_p (rtx sym)
1963 {
1964   const_tree decl = SYMBOL_REF_DECL (sym);
1965 
1966   if (flag_pic
1967       && decl
1968       && (!flag_plt
1969 	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1970       && !targetm.binds_local_p (decl))
1971     return true;
1972 
1973   return false;
1974 }
1975 
1976 /* Return true if the offsets to a zero/sign-extract operation
1977    represent an expression that matches an extend operation.  The
1978    operands represent the parameters from
1979 
1980    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1981 bool
1982 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1983 				rtx extract_imm)
1984 {
1985   HOST_WIDE_INT mult_val, extract_val;
1986 
1987   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1988     return false;
1989 
1990   mult_val = INTVAL (mult_imm);
1991   extract_val = INTVAL (extract_imm);
1992 
1993   if (extract_val > 8
1994       && extract_val < GET_MODE_BITSIZE (mode)
1995       && exact_log2 (extract_val & ~7) > 0
1996       && (extract_val & 7) <= 4
1997       && mult_val == (1 << (extract_val & 7)))
1998     return true;
1999 
2000   return false;
2001 }
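/* Illustrative case: with MODE == DImode, EXTRACT_IMM == 34 and
   MULT_IMM == 4, the checks above all pass -- the extract of the low
   34 bits of (reg * 4) is roughly (zero_extend:DI (reg:SI)) << 2, i.e.
   an extend combined with LSL #2 -- so the function returns true.  */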
2002 
2003 /* Emit an insn that's a simple single-set.  Both the operands must be
2004    known to be valid.  */
2005 inline static rtx_insn *
2006 emit_set_insn (rtx x, rtx y)
2007 {
2008   return emit_insn (gen_rtx_SET (x, y));
2009 }
2010 
2011 /* X and Y are two things to compare using CODE.  Emit the compare insn and
2012    return the rtx for register 0 in the proper mode.  */
2013 rtx
2014 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2015 {
2016   machine_mode cmp_mode = GET_MODE (x);
2017   machine_mode cc_mode;
2018   rtx cc_reg;
2019 
2020   if (cmp_mode == TImode)
2021     {
2022       gcc_assert (code == NE);
2023 
2024       cc_mode = CCmode;
2025       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2026 
2027       rtx x_lo = operand_subword (x, 0, 0, TImode);
2028       rtx y_lo = operand_subword (y, 0, 0, TImode);
2029       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2030 
2031       rtx x_hi = operand_subword (x, 1, 0, TImode);
2032       rtx y_hi = operand_subword (y, 1, 0, TImode);
2033       emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2034 			     gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2035 			     GEN_INT (AARCH64_EQ)));
2036     }
2037   else
2038     {
2039       cc_mode = SELECT_CC_MODE (code, x, y);
2040       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2041       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2042     }
2043   return cc_reg;
2044 }
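/* For the TImode case above, the generated code is conceptually
     cmp   x_lo, y_lo
     ccmp  x_hi, y_hi, <nzcv>, eq
   so the resulting CC register reflects NE of the full 128-bit values.  */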
2045 
2046 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
2047 
2048 static rtx
2049 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2050                                   machine_mode y_mode)
2051 {
2052   if (y_mode == E_QImode || y_mode == E_HImode)
2053     {
2054       if (CONST_INT_P (y))
2055 	{
2056 	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2057 	  y_mode = SImode;
2058 	}
2059       else
2060 	{
2061 	  rtx t, cc_reg;
2062 	  machine_mode cc_mode;
2063 
2064 	  t = gen_rtx_ZERO_EXTEND (SImode, y);
2065 	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2066 	  cc_mode = CC_SWPmode;
2067 	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2068 	  emit_set_insn (cc_reg, t);
2069 	  return cc_reg;
2070 	}
2071     }
2072 
2073   if (!aarch64_plus_operand (y, y_mode))
2074     y = force_reg (y_mode, y);
2075 
2076   return aarch64_gen_compare_reg (code, x, y);
2077 }
2078 
2079 /* Build the SYMBOL_REF for __tls_get_addr.  */
2080 
2081 static GTY(()) rtx tls_get_addr_libfunc;
2082 
2083 rtx
2084 aarch64_tls_get_addr (void)
2085 {
2086   if (!tls_get_addr_libfunc)
2087     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2088   return tls_get_addr_libfunc;
2089 }
2090 
2091 /* Return the TLS model to use for ADDR.  */
2092 
2093 static enum tls_model
2094 tls_symbolic_operand_type (rtx addr)
2095 {
2096   enum tls_model tls_kind = TLS_MODEL_NONE;
2097   if (GET_CODE (addr) == CONST)
2098     {
2099       poly_int64 addend;
2100       rtx sym = strip_offset (addr, &addend);
2101       if (GET_CODE (sym) == SYMBOL_REF)
2102 	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2103     }
2104   else if (GET_CODE (addr) == SYMBOL_REF)
2105     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2106 
2107   return tls_kind;
2108 }
2109 
2110 /* We'll allow LO_SUMs in addresses in our legitimate addresses
2111    so that combine can take care of combining addresses where
2112    necessary, but for generation purposes we'll generate the address
2113    as:
2114    RTL                               Absolute
2115    tmp = hi (symbol_ref);            adrp  x1, foo
2116    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
2117                                      nop
2118 
2119    PIC                               TLS
2120    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
2121    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
2122                                      bl   __tls_get_addr
2123                                      nop
2124 
2125    Load TLS symbol, depending on TLS mechanism and TLS access model.
2126 
2127    Global Dynamic - Traditional TLS:
2128    adrp tmp, :tlsgd:imm
2129    add  dest, tmp, #:tlsgd_lo12:imm
2130    bl   __tls_get_addr
2131 
2132    Global Dynamic - TLS Descriptors:
2133    adrp dest, :tlsdesc:imm
2134    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
2135    add  dest, dest, #:tlsdesc_lo12:imm
2136    blr  tmp
2137    mrs  tp, tpidr_el0
2138    add  dest, dest, tp
2139 
2140    Initial Exec:
2141    mrs  tp, tpidr_el0
2142    adrp tmp, :gottprel:imm
2143    ldr  dest, [tmp, #:gottprel_lo12:imm]
2144    add  dest, dest, tp
2145 
2146    Local Exec:
2147    mrs  tp, tpidr_el0
2148    add  t0, tp, #:tprel_hi12:imm, lsl #12
2149    add  t0, t0, #:tprel_lo12_nc:imm
2150 */
2151 
2152 static void
2153 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2154 				   enum aarch64_symbol_type type)
2155 {
2156   switch (type)
2157     {
2158     case SYMBOL_SMALL_ABSOLUTE:
2159       {
2160 	/* In ILP32, the mode of dest can be either SImode or DImode.  */
2161 	rtx tmp_reg = dest;
2162 	machine_mode mode = GET_MODE (dest);
2163 
2164 	gcc_assert (mode == Pmode || mode == ptr_mode);
2165 
2166 	if (can_create_pseudo_p ())
2167 	  tmp_reg = gen_reg_rtx (mode);
2168 
2169 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2170 	emit_insn (gen_add_losym (dest, tmp_reg, imm));
2171 	return;
2172       }
2173 
2174     case SYMBOL_TINY_ABSOLUTE:
2175       emit_insn (gen_rtx_SET (dest, imm));
2176       return;
2177 
2178     case SYMBOL_SMALL_GOT_28K:
2179       {
2180 	machine_mode mode = GET_MODE (dest);
2181 	rtx gp_rtx = pic_offset_table_rtx;
2182 	rtx insn;
2183 	rtx mem;
2184 
2185 	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2186 	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
2187 	   decide rtx costs, in which case pic_offset_table_rtx is not
2188 	   initialized.  For that case no need to generate the first adrp
2189 	   instruction as the final cost for global variable access is
2190 	   one instruction.  */
2191 	if (gp_rtx != NULL)
2192 	  {
2193 	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
2194 	       we use the page base as the GOT base, the first page may be
2195 	       wasted; in the worst case only 28K of GOT space remains).
2196 
2197 	       The generated instruction sequence for accessing a global
2198 	       variable is:
2199 
2200 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2201 
2202 	       Only one instruction is needed, but we must initialize
2203 	       pic_offset_table_rtx properly.  We emit the initialization insn
2204 	       for every global access and let CSE remove the redundant ones.
2205 
2206 	       The final instruction sequence for multiple global variable
2207 	       accesses will look like the following.
2208 
2209 		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2210 
2211 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2212 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2213 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2214 		 ...  */
2215 
2216 	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2217 	    crtl->uses_pic_offset_table = 1;
2218 	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2219 
2220 	    if (mode != GET_MODE (gp_rtx))
2221 	      gp_rtx = gen_lowpart (mode, gp_rtx);
2222 
2223 	  }
2224 
2225 	if (mode == ptr_mode)
2226 	  {
2227 	    if (mode == DImode)
2228 	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2229 	    else
2230 	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2231 
2232 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
2233 	  }
2234 	else
2235 	  {
2236 	    gcc_assert (mode == Pmode);
2237 
2238 	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2239 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2240 	  }
2241 
2242 	/* The operand is expected to be a MEM.  Whenever the related insn
2243 	   pattern changes, the code above that calculates MEM should be
2244 	   updated.  */
2245 	gcc_assert (GET_CODE (mem) == MEM);
2246 	MEM_READONLY_P (mem) = 1;
2247 	MEM_NOTRAP_P (mem) = 1;
2248 	emit_insn (insn);
2249 	return;
2250       }
2251 
2252     case SYMBOL_SMALL_GOT_4G:
2253       {
2254 	/* In ILP32, the mode of dest can be either SImode or DImode,
2255 	   while the got entry is always of SImode size.  The mode of
2256 	   dest depends on how dest is used: if dest is assigned to a
2257 	   pointer (e.g. stored in memory), it has SImode; it may have
2258 	   DImode if dest is dereferenced to access the memory.
2259 	   This is why we have to handle three different ldr_got_small
2260 	   patterns here (two patterns for ILP32).  */
2261 
2262 	rtx insn;
2263 	rtx mem;
2264 	rtx tmp_reg = dest;
2265 	machine_mode mode = GET_MODE (dest);
2266 
2267 	if (can_create_pseudo_p ())
2268 	  tmp_reg = gen_reg_rtx (mode);
2269 
2270 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2271 	if (mode == ptr_mode)
2272 	  {
2273 	    if (mode == DImode)
2274 	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2275 	    else
2276 	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2277 
2278 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
2279 	  }
2280 	else
2281 	  {
2282 	    gcc_assert (mode == Pmode);
2283 
2284 	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2285 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2286 	  }
2287 
2288 	gcc_assert (GET_CODE (mem) == MEM);
2289 	MEM_READONLY_P (mem) = 1;
2290 	MEM_NOTRAP_P (mem) = 1;
2291 	emit_insn (insn);
2292 	return;
2293       }
2294 
2295     case SYMBOL_SMALL_TLSGD:
2296       {
2297 	rtx_insn *insns;
2298 	machine_mode mode = GET_MODE (dest);
2299 	rtx result = gen_rtx_REG (mode, R0_REGNUM);
2300 
2301 	start_sequence ();
2302 	if (TARGET_ILP32)
2303 	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2304 	else
2305 	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2306 	insns = get_insns ();
2307 	end_sequence ();
2308 
2309 	RTL_CONST_CALL_P (insns) = 1;
2310 	emit_libcall_block (insns, dest, result, imm);
2311 	return;
2312       }
2313 
2314     case SYMBOL_SMALL_TLSDESC:
2315       {
2316 	machine_mode mode = GET_MODE (dest);
2317 	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2318 	rtx tp;
2319 
2320 	gcc_assert (mode == Pmode || mode == ptr_mode);
2321 
2322 	/* In ILP32, the got entry is always of SImode size.  Unlike
2323 	   small GOT, the dest is fixed at reg 0.  */
2324 	if (TARGET_ILP32)
2325 	  emit_insn (gen_tlsdesc_small_si (imm));
2326 	else
2327 	  emit_insn (gen_tlsdesc_small_di (imm));
2328 	tp = aarch64_load_tp (NULL);
2329 
2330 	if (mode != Pmode)
2331 	  tp = gen_lowpart (mode, tp);
2332 
2333 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2334 	if (REG_P (dest))
2335 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2336 	return;
2337       }
2338 
2339     case SYMBOL_SMALL_TLSIE:
2340       {
2341 	/* In ILP32, the mode of dest can be either SImode or DImode,
2342 	   while the got entry is always of SImode size.  The mode of
2343 	   dest depends on how dest is used: if dest is assigned to a
2344 	   pointer (e.g. stored in memory), it has SImode; it may have
2345 	   DImode if dest is dereferenced to access the memory.
2346 	   This is why we have to handle three different tlsie_small
2347 	   patterns here (two patterns for ILP32).  */
2348 	machine_mode mode = GET_MODE (dest);
2349 	rtx tmp_reg = gen_reg_rtx (mode);
2350 	rtx tp = aarch64_load_tp (NULL);
2351 
2352 	if (mode == ptr_mode)
2353 	  {
2354 	    if (mode == DImode)
2355 	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2356 	    else
2357 	      {
2358 		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2359 		tp = gen_lowpart (mode, tp);
2360 	      }
2361 	  }
2362 	else
2363 	  {
2364 	    gcc_assert (mode == Pmode);
2365 	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2366 	  }
2367 
2368 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2369 	if (REG_P (dest))
2370 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2371 	return;
2372       }
2373 
2374     case SYMBOL_TLSLE12:
2375     case SYMBOL_TLSLE24:
2376     case SYMBOL_TLSLE32:
2377     case SYMBOL_TLSLE48:
2378       {
2379 	machine_mode mode = GET_MODE (dest);
2380 	rtx tp = aarch64_load_tp (NULL);
2381 
2382 	if (mode != Pmode)
2383 	  tp = gen_lowpart (mode, tp);
2384 
2385 	switch (type)
2386 	  {
2387 	  case SYMBOL_TLSLE12:
2388 	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2389 			(dest, tp, imm));
2390 	    break;
2391 	  case SYMBOL_TLSLE24:
2392 	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2393 			(dest, tp, imm));
2394 	    break;
2395 	  case SYMBOL_TLSLE32:
2396 	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2397 			(dest, imm));
2398 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2399 			(dest, dest, tp));
2400 	    break;
2401 	  case SYMBOL_TLSLE48:
2402 	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2403 			(dest, imm));
2404 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2405 			(dest, dest, tp));
2406 	    break;
2407 	  default:
2408 	    gcc_unreachable ();
2409 	  }
2410 
2411 	if (REG_P (dest))
2412 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2413 	return;
2414       }
2415 
2416     case SYMBOL_TINY_GOT:
2417       emit_insn (gen_ldr_got_tiny (dest, imm));
2418       return;
2419 
2420     case SYMBOL_TINY_TLSIE:
2421       {
2422 	machine_mode mode = GET_MODE (dest);
2423 	rtx tp = aarch64_load_tp (NULL);
2424 
2425 	if (mode == ptr_mode)
2426 	  {
2427 	    if (mode == DImode)
2428 	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2429 	    else
2430 	      {
2431 		tp = gen_lowpart (mode, tp);
2432 		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2433 	      }
2434 	  }
2435 	else
2436 	  {
2437 	    gcc_assert (mode == Pmode);
2438 	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2439 	  }
2440 
2441 	if (REG_P (dest))
2442 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2443 	return;
2444       }
2445 
2446     default:
2447       gcc_unreachable ();
2448     }
2449 }
2450 
2451 /* Emit a move from SRC to DEST.  Assume that the move expanders can
2452    handle all moves if !can_create_pseudo_p ().  The distinction is
2453    important because, unlike emit_move_insn, the move expanders know
2454    how to force Pmode objects into the constant pool even when the
2455    constant pool address is not itself legitimate.  */
2456 static rtx
2457 aarch64_emit_move (rtx dest, rtx src)
2458 {
2459   return (can_create_pseudo_p ()
2460 	  ? emit_move_insn (dest, src)
2461 	  : emit_move_insn_1 (dest, src));
2462 }
2463 
2464 /* Apply UNOPTAB to OP and store the result in DEST.  */
2465 
2466 static void
2467 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2468 {
2469   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2470   if (dest != tmp)
2471     emit_move_insn (dest, tmp);
2472 }
2473 
2474 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
2475 
2476 static void
2477 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2478 {
2479   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2480 			  OPTAB_DIRECT);
2481   if (dest != tmp)
2482     emit_move_insn (dest, tmp);
2483 }
2484 
2485 /* Split a 128-bit move operation into two 64-bit move operations,
2486    taking care to handle partial overlap of register to register
2487    copies.  Special cases are needed when moving between GP regs and
2488    FP regs.  SRC can be a register, constant or memory; DST a register
2489    or memory.  If either operand is memory it must not have any side
2490    effects.  */
2491 void
2492 aarch64_split_128bit_move (rtx dst, rtx src)
2493 {
2494   rtx dst_lo, dst_hi;
2495   rtx src_lo, src_hi;
2496 
2497   machine_mode mode = GET_MODE (dst);
2498 
2499   gcc_assert (mode == TImode || mode == TFmode);
2500   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2501   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2502 
2503   if (REG_P (dst) && REG_P (src))
2504     {
2505       int src_regno = REGNO (src);
2506       int dst_regno = REGNO (dst);
2507 
2508       /* Handle FP <-> GP regs.  */
2509       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2510 	{
2511 	  src_lo = gen_lowpart (word_mode, src);
2512 	  src_hi = gen_highpart (word_mode, src);
2513 
2514 	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2515 	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2516 	  return;
2517 	}
2518       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2519 	{
2520 	  dst_lo = gen_lowpart (word_mode, dst);
2521 	  dst_hi = gen_highpart (word_mode, dst);
2522 
2523 	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2524 	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2525 	  return;
2526 	}
2527     }
2528 
2529   dst_lo = gen_lowpart (word_mode, dst);
2530   dst_hi = gen_highpart (word_mode, dst);
2531   src_lo = gen_lowpart (word_mode, src);
2532   src_hi = gen_highpart_mode (word_mode, mode, src);
2533 
2534   /* At most one pairing may overlap.  */
2535   if (reg_overlap_mentioned_p (dst_lo, src_hi))
2536     {
2537       aarch64_emit_move (dst_hi, src_hi);
2538       aarch64_emit_move (dst_lo, src_lo);
2539     }
2540   else
2541     {
2542       aarch64_emit_move (dst_lo, src_lo);
2543       aarch64_emit_move (dst_hi, src_hi);
2544     }
2545 }
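/* For example, moving a TImode value from a GP register pair into an
   FP/SIMD register uses the movlow_di/movhigh_di patterns above (two
   64-bit transfers), while a plain register-pair copy whose destination
   low half overlaps the source high half is emitted high part first to
   avoid clobbering the overlapping word.  */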
2546 
2547 bool
2548 aarch64_split_128bit_move_p (rtx dst, rtx src)
2549 {
2550   return (! REG_P (src)
2551 	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2552 }
2553 
2554 /* Split a complex SIMD combine.  */
2555 
2556 void
2557 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2558 {
2559   machine_mode src_mode = GET_MODE (src1);
2560   machine_mode dst_mode = GET_MODE (dst);
2561 
2562   gcc_assert (VECTOR_MODE_P (dst_mode));
2563   gcc_assert (register_operand (dst, dst_mode)
2564 	      && register_operand (src1, src_mode)
2565 	      && register_operand (src2, src_mode));
2566 
2567   emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2568   return;
2569 }
2570 
2571 /* Split a complex SIMD move.  */
2572 
2573 void
2574 aarch64_split_simd_move (rtx dst, rtx src)
2575 {
2576   machine_mode src_mode = GET_MODE (src);
2577   machine_mode dst_mode = GET_MODE (dst);
2578 
2579   gcc_assert (VECTOR_MODE_P (dst_mode));
2580 
2581   if (REG_P (dst) && REG_P (src))
2582     {
2583       gcc_assert (VECTOR_MODE_P (src_mode));
2584       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2585     }
2586 }
2587 
2588 bool
2589 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2590 			      machine_mode ymode, rtx y)
2591 {
2592   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2593   gcc_assert (r != NULL);
2594   return rtx_equal_p (x, r);
2595 }
2596 
2597 
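/* Return a register holding VALUE in MODE: a fresh pseudo if we can still
   create pseudos, otherwise the scratch register X, into which VALUE is
   moved first.  */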
2598 static rtx
2599 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2600 {
2601   if (can_create_pseudo_p ())
2602     return force_reg (mode, value);
2603   else
2604     {
2605       gcc_assert (x);
2606       aarch64_emit_move (x, value);
2607       return x;
2608     }
2609 }
2610 
2611 /* Return true if we can move VALUE into a register using a single
2612    CNT[BHWD] instruction.  */
2613 
2614 static bool
2615 aarch64_sve_cnt_immediate_p (poly_int64 value)
2616 {
2617   HOST_WIDE_INT factor = value.coeffs[0];
2618   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
2619   return (value.coeffs[1] == factor
2620 	  && IN_RANGE (factor, 2, 16 * 16)
2621 	  && (factor & 1) == 0
2622 	  && factor <= 16 * (factor & -factor));
2623 }
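/* For example, poly_int64 (16, 16) (the number of bytes in an SVE vector)
   satisfies this test and corresponds to CNTB; poly_int64 (4, 4) (the
   number of 32-bit elements) corresponds to CNTW; poly_int64 (2, 2)
   corresponds to CNTD.  */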
2624 
2625 /* Likewise for rtx X.  */
2626 
2627 bool
2628 aarch64_sve_cnt_immediate_p (rtx x)
2629 {
2630   poly_int64 value;
2631   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2632 }
2633 
2634 /* Return the asm string for an instruction with a CNT-like vector size
2635    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2636    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2637    first part of the operands template (the part that comes before the
2638    vector size itself).  FACTOR is the number of quadwords.
2639    NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2640    If it is zero, we can use any element size.  */
2641 
2642 static char *
2643 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2644 				  unsigned int factor,
2645 				  unsigned int nelts_per_vq)
2646 {
2647   static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2648 
2649   if (nelts_per_vq == 0)
2650     /* There is some overlap in the ranges of the four CNT instructions.
2651        Here we always use the smallest possible element size, so that the
2652        multiplier is 1 wherever possible.  */
2653     nelts_per_vq = factor & -factor;
2654   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2655   gcc_assert (IN_RANGE (shift, 1, 4));
2656   char suffix = "dwhb"[shift - 1];
2657 
2658   factor >>= shift;
2659   unsigned int written;
2660   if (factor == 1)
2661     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2662 			prefix, suffix, operands);
2663   else
2664     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2665 			prefix, suffix, operands, factor);
2666   gcc_assert (written < sizeof (buffer));
2667   return buffer;
2668 }
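/* Illustrative outputs, with PREFIX == "cnt" and OPERANDS == "%x0":
     FACTOR == 2,  NELTS_PER_VQ == 0 -> "cntd\t%x0"
     FACTOR == 32, NELTS_PER_VQ == 0 -> "cntb\t%x0, all, mul #2"
     FACTOR == 12, NELTS_PER_VQ == 4 -> "cntw\t%x0, all, mul #3".  */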
2669 
2670 /* Return the asm string for an instruction with a CNT-like vector size
2671    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2672    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2673    first part of the operands template (the part that comes before the
2674    vector size itself).  X is the value of the vector size operand,
2675    as a polynomial integer rtx.  */
2676 
2677 char *
2678 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2679 				  rtx x)
2680 {
2681   poly_int64 value = rtx_to_poly_int64 (x);
2682   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2683   return aarch64_output_sve_cnt_immediate (prefix, operands,
2684 					   value.coeffs[1], 0);
2685 }
2686 
2687 /* Return true if we can add VALUE to a register using a single ADDVL
2688    or ADDPL instruction.  */
2689 
2690 static bool
2691 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2692 {
2693   HOST_WIDE_INT factor = value.coeffs[0];
2694   if (factor == 0 || value.coeffs[1] != factor)
2695     return false;
2696   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2697      and a value of 16 is one vector width.  */
2698   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2699 	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2700 }
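/* For example, poly_int64 (16, 16) (one vector's worth of bytes) can be
   added with ADDVL #1 and poly_int64 (2, 2) (one predicate's worth) with
   ADDPL #1, whereas poly_int64 (3, 3) fits neither form because the
   factor is odd.  */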
2701 
2702 /* Likewise for rtx X.  */
2703 
2704 bool
2705 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2706 {
2707   poly_int64 value;
2708   return (poly_int_rtx_p (x, &value)
2709 	  && aarch64_sve_addvl_addpl_immediate_p (value));
2710 }
2711 
2712 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2713    and storing the result in operand 0.  */
2714 
2715 char *
2716 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2717 {
2718   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2719   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2720   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2721 
2722   /* Use INC or DEC if possible.  */
2723   if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2724     {
2725       if (aarch64_sve_cnt_immediate_p (offset_value))
2726 	return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2727 						 offset_value.coeffs[1], 0);
2728       if (aarch64_sve_cnt_immediate_p (-offset_value))
2729 	return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2730 						 -offset_value.coeffs[1], 0);
2731     }
2732 
2733   int factor = offset_value.coeffs[1];
2734   if ((factor & 15) == 0)
2735     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2736   else
2737     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2738   return buffer;
2739 }
2740 
2741 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2742    instruction.  If it is, store the number of elements in each vector
2743    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2744    factor in *FACTOR_OUT (if nonnull).  */
2745 
2746 bool
2747 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2748 				 unsigned int *nelts_per_vq_out)
2749 {
2750   rtx elt;
2751   poly_int64 value;
2752 
2753   if (!const_vec_duplicate_p (x, &elt)
2754       || !poly_int_rtx_p (elt, &value))
2755     return false;
2756 
2757   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2758   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2759     /* There's no vector INCB.  */
2760     return false;
2761 
2762   HOST_WIDE_INT factor = value.coeffs[0];
2763   if (value.coeffs[1] != factor)
2764     return false;
2765 
2766   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
2767   if ((factor % nelts_per_vq) != 0
2768       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2769     return false;
2770 
2771   if (factor_out)
2772     *factor_out = factor;
2773   if (nelts_per_vq_out)
2774     *nelts_per_vq_out = nelts_per_vq;
2775   return true;
2776 }
2777 
2778 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2779    instruction.  */
2780 
2781 bool
2782 aarch64_sve_inc_dec_immediate_p (rtx x)
2783 {
2784   return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2785 }
2786 
2787 /* Return the asm template for an SVE vector INC or DEC instruction.
2788    OPERANDS gives the operands before the vector count and X is the
2789    value of the vector count operand itself.  */
2790 
2791 char *
2792 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2793 {
2794   int factor;
2795   unsigned int nelts_per_vq;
2796   if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2797     gcc_unreachable ();
2798   if (factor < 0)
2799     return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2800 					     nelts_per_vq);
2801   else
2802     return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2803 					     nelts_per_vq);
2804 }
2805 
2806 static int
2807 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2808 				scalar_int_mode mode)
2809 {
2810   int i;
2811   unsigned HOST_WIDE_INT val, val2, mask;
2812   int one_match, zero_match;
2813   int num_insns;
2814 
2815   val = INTVAL (imm);
2816 
2817   if (aarch64_move_imm (val, mode))
2818     {
2819       if (generate)
2820 	emit_insn (gen_rtx_SET (dest, imm));
2821       return 1;
2822     }
2823 
2824   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2825      (with XXXX non-zero). In that case check to see if the move can be done in
2826      a smaller mode.  */
2827   val2 = val & 0xffffffff;
2828   if (mode == DImode
2829       && aarch64_move_imm (val2, SImode)
2830       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2831     {
2832       if (generate)
2833 	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2834 
2835       /* Check if we have to emit a second instruction by checking to see
2836          if any of the upper 32 bits of the original DImode value are set.  */
2837       if (val == val2)
2838 	return 1;
2839 
2840       i = (val >> 48) ? 48 : 32;
2841 
2842       if (generate)
2843 	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2844 				    GEN_INT ((val >> i) & 0xffff)));
2845 
2846       return 2;
2847     }
2848 
2849   if ((val >> 32) == 0 || mode == SImode)
2850     {
2851       if (generate)
2852 	{
2853 	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2854 	  if (mode == SImode)
2855 	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2856 				       GEN_INT ((val >> 16) & 0xffff)));
2857 	  else
2858 	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2859 				       GEN_INT ((val >> 16) & 0xffff)));
2860 	}
2861       return 2;
2862     }
2863 
2864   /* Remaining cases are all for DImode.  */
2865 
2866   mask = 0xffff;
2867   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2868     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2869   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2870     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2871 
2872   if (zero_match != 2 && one_match != 2)
2873     {
2874       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2875 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
2876 	 zeroes creates a valid bitmask.  To check any repeated bitmask,
2877 	 try using 16 bits from the other 32-bit half of val.  */
2878 
2879       for (i = 0; i < 64; i += 16, mask <<= 16)
2880 	{
2881 	  val2 = val & ~mask;
2882 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2883 	    break;
2884 	  val2 = val | mask;
2885 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2886 	    break;
2887 	  val2 = val2 & ~mask;
2888 	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2889 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2890 	    break;
2891 	}
2892       if (i != 64)
2893 	{
2894 	  if (generate)
2895 	    {
2896 	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2897 	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2898 					 GEN_INT ((val >> i) & 0xffff)));
2899 	    }
2900 	  return 2;
2901 	}
2902     }
2903 
2904   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2905      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
2906      otherwise skip zero bits.  */
2907 
2908   num_insns = 1;
2909   mask = 0xffff;
2910   val2 = one_match > zero_match ? ~val : val;
2911   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2912 
2913   if (generate)
2914     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2915 					   ? (val | ~(mask << i))
2916 					   : (val & (mask << i)))));
2917   for (i += 16; i < 64; i += 16)
2918     {
2919       if ((val2 & (mask << i)) == 0)
2920 	continue;
2921       if (generate)
2922 	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2923 				   GEN_INT ((val >> i) & 0xffff)));
2924       num_insns ++;
2925     }
2926 
2927   return num_insns;
2928 }
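/* Rough examples of the cases above: 0x1234 is a single MOV (MOVZ);
   0xffffffffffff1234 is a single MOVN-class immediate accepted by
   aarch64_move_imm; 0x0000123400005678 takes a MOV of the low part plus
   one MOVK (two instructions); and a constant such as 0x123456789abcdef0,
   with four distinct nonzero 16-bit chunks, needs MOVZ plus three MOVKs
   (four instructions).  */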
2929 
2930 /* Return whether imm is a 128-bit immediate which is simple enough to
2931    expand inline.  */
2932 bool
2933 aarch64_mov128_immediate (rtx imm)
2934 {
2935   if (GET_CODE (imm) == CONST_INT)
2936     return true;
2937 
2938   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2939 
2940   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2941   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2942 
2943   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2944 	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2945 }
2946 
2947 
2948 /* Return the number of temporary registers that aarch64_add_offset_1
2949    would need to add OFFSET to a register.  */
2950 
2951 static unsigned int
2952 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2953 {
2954   return absu_hwi (offset) < 0x1000000 ? 0 : 1;
2955 }
2956 
2957 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
2958    a non-polynomial OFFSET.  MODE is the mode of the addition.
2959    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2960    be set and CFA adjustments added to the generated instructions.
2961 
2962    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2963    temporary if register allocation is already complete.  This temporary
2964    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
2965    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2966    the immediate again.
2967 
2968    Since this function may be used to adjust the stack pointer, we must
2969    ensure that it cannot cause transient stack deallocation (for example
2970    by first incrementing SP and then decrementing when adjusting by a
2971    large immediate).  */
2972 
2973 static void
2974 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2975 		      rtx src, HOST_WIDE_INT offset, rtx temp1,
2976 		      bool frame_related_p, bool emit_move_imm)
2977 {
2978   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2979   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2980 
2981   unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
2982   rtx_insn *insn;
2983 
2984   if (!moffset)
2985     {
2986       if (!rtx_equal_p (dest, src))
2987 	{
2988 	  insn = emit_insn (gen_rtx_SET (dest, src));
2989 	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
2990 	}
2991       return;
2992     }
2993 
2994   /* Single instruction adjustment.  */
2995   if (aarch64_uimm12_shift (moffset))
2996     {
2997       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2998       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2999       return;
3000     }
3001 
3002   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3003      and either:
3004 
3005      a) the offset cannot be loaded by a 16-bit move or
3006      b) there is no spare register into which we can move it.  */
3007   if (moffset < 0x1000000
3008       && ((!temp1 && !can_create_pseudo_p ())
3009 	  || !aarch64_move_imm (moffset, mode)))
3010     {
3011       HOST_WIDE_INT low_off = moffset & 0xfff;
3012 
3013       low_off = offset < 0 ? -low_off : low_off;
3014       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3015       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3016       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3017       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3018       return;
3019     }
3020 
3021   /* Emit a move immediate if required and an addition/subtraction.  */
3022   if (emit_move_imm)
3023     {
3024       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3025       temp1 = aarch64_force_temporary (mode, temp1,
3026 				       gen_int_mode (moffset, mode));
3027     }
3028   insn = emit_insn (offset < 0
3029 		    ? gen_sub3_insn (dest, src, temp1)
3030 		    : gen_add3_insn (dest, src, temp1));
3031   if (frame_related_p)
3032     {
3033       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3034       rtx adj = plus_constant (mode, src, offset);
3035       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3036     }
3037 }
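/* Worked example: an adjustment of 0x123456 is not a shifted 12-bit
   immediate and is not a valid move immediate either, so the code above
   splits it into two additions:
     add dest, src, #0x456
     add dest, dest, #0x123000
   Both steps move in the same direction, so the stack pointer never
   transiently deallocates when this is used for SP adjustments.  */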
3038 
3039 /* Return the number of temporary registers that aarch64_add_offset
3040    would need to move OFFSET into a register or add OFFSET to a register;
3041    ADD_P is true if we want the latter rather than the former.  */
3042 
3043 static unsigned int
3044 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3045 {
3046   /* This follows the same structure as aarch64_add_offset.  */
3047   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3048     return 0;
3049 
3050   unsigned int count = 0;
3051   HOST_WIDE_INT factor = offset.coeffs[1];
3052   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3053   poly_int64 poly_offset (factor, factor);
3054   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3055     /* Need one register for the ADDVL/ADDPL result.  */
3056     count += 1;
3057   else if (factor != 0)
3058     {
3059       factor = abs (factor);
3060       if (factor > 16 * (factor & -factor))
3061 	/* Need one register for the CNT result and one for the multiplication
3062 	   factor.  If necessary, the second temporary can be reused for the
3063 	   constant part of the offset.  */
3064 	return 2;
3065       /* Need one register for the CNT result (which might then
3066 	 be shifted).  */
3067       count += 1;
3068     }
3069   return count + aarch64_add_offset_1_temporaries (constant);
3070 }
3071 
3072 /* If X can be represented as a poly_int64, return the number
3073    of temporaries that are required to add it to a register.
3074    Return -1 otherwise.  */
3075 
3076 int
3077 aarch64_add_offset_temporaries (rtx x)
3078 {
3079   poly_int64 offset;
3080   if (!poly_int_rtx_p (x, &offset))
3081     return -1;
3082   return aarch64_offset_temporaries (true, offset);
3083 }
3084 
3085 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
3086    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3087    be set and CFA adjustments added to the generated instructions.
3088 
3089    TEMP1, if nonnull, is a register of mode MODE that can be used as a
3090    temporary if register allocation is already complete.  This temporary
3091    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3092    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3093    false to avoid emitting the immediate again.
3094 
3095    TEMP2, if nonnull, is a second temporary register that doesn't
3096    overlap either DEST or SRC.
3097 
3098    Since this function may be used to adjust the stack pointer, we must
3099    ensure that it cannot cause transient stack deallocation (for example
3100    by first incrementing SP and then decrementing when adjusting by a
3101    large immediate).  */
3102 
3103 static void
3104 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3105 		    poly_int64 offset, rtx temp1, rtx temp2,
3106 		    bool frame_related_p, bool emit_move_imm = true)
3107 {
3108   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3109   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3110   gcc_assert (temp1 == NULL_RTX
3111 	      || !frame_related_p
3112 	      || !reg_overlap_mentioned_p (temp1, dest));
3113   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3114 
3115   /* Try using ADDVL or ADDPL to add the whole value.  */
3116   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3117     {
3118       rtx offset_rtx = gen_int_mode (offset, mode);
3119       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3120       RTX_FRAME_RELATED_P (insn) = frame_related_p;
3121       return;
3122     }
3123 
3124   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3125      SVE vector register, over and above the minimum size of 128 bits.
3126      This is equivalent to half the value returned by CNTD with a
3127      vector shape of ALL.  */
3128   HOST_WIDE_INT factor = offset.coeffs[1];
3129   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3130 
3131   /* Try using ADDVL or ADDPL to add the VG-based part.  */
3132   poly_int64 poly_offset (factor, factor);
3133   if (src != const0_rtx
3134       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3135     {
3136       rtx offset_rtx = gen_int_mode (poly_offset, mode);
3137       if (frame_related_p)
3138 	{
3139 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3140 	  RTX_FRAME_RELATED_P (insn) = true;
3141 	  src = dest;
3142 	}
3143       else
3144 	{
3145 	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3146 	  src = aarch64_force_temporary (mode, temp1, addr);
3147 	  temp1 = temp2;
3148 	  temp2 = NULL_RTX;
3149 	}
3150     }
3151   /* Otherwise use a CNT-based sequence.  */
3152   else if (factor != 0)
3153     {
3154       /* Use a subtraction if we have a negative factor.  */
3155       rtx_code code = PLUS;
3156       if (factor < 0)
3157 	{
3158 	  factor = -factor;
3159 	  code = MINUS;
3160 	}
3161 
3162       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
3163 	 into the multiplication.  */
3164       rtx val;
3165       int shift = 0;
3166       if (factor & 1)
3167 	/* Use a right shift by 1.  */
3168 	shift = -1;
3169       else
3170 	factor /= 2;
3171       HOST_WIDE_INT low_bit = factor & -factor;
3172       if (factor <= 16 * low_bit)
3173 	{
3174 	  if (factor > 16 * 8)
3175 	    {
3176 	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3177 		 the value with the minimum multiplier and shift it into
3178 		 position.  */
3179 	      int extra_shift = exact_log2 (low_bit);
3180 	      shift += extra_shift;
3181 	      factor >>= extra_shift;
3182 	    }
3183 	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3184 	}
3185       else
3186 	{
3187 	  /* Use CNTD, then multiply it by FACTOR.  */
3188 	  val = gen_int_mode (poly_int64 (2, 2), mode);
3189 	  val = aarch64_force_temporary (mode, temp1, val);
3190 
3191 	  /* Go back to using a negative multiplication factor if we have
3192 	     no register from which to subtract.  */
3193 	  if (code == MINUS && src == const0_rtx)
3194 	    {
3195 	      factor = -factor;
3196 	      code = PLUS;
3197 	    }
3198 	  rtx coeff1 = gen_int_mode (factor, mode);
3199 	  coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3200 	  val = gen_rtx_MULT (mode, val, coeff1);
3201 	}
3202 
3203       if (shift > 0)
3204 	{
3205 	  /* Multiply by 1 << SHIFT.  */
3206 	  val = aarch64_force_temporary (mode, temp1, val);
3207 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3208 	}
3209       else if (shift == -1)
3210 	{
3211 	  /* Divide by 2.  */
3212 	  val = aarch64_force_temporary (mode, temp1, val);
3213 	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3214 	}
3215 
3216       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
3217       if (src != const0_rtx)
3218 	{
3219 	  val = aarch64_force_temporary (mode, temp1, val);
3220 	  val = gen_rtx_fmt_ee (code, mode, src, val);
3221 	}
3222       else if (code == MINUS)
3223 	{
3224 	  val = aarch64_force_temporary (mode, temp1, val);
3225 	  val = gen_rtx_NEG (mode, val);
3226 	}
3227 
3228       if (constant == 0 || frame_related_p)
3229 	{
3230 	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3231 	  if (frame_related_p)
3232 	    {
3233 	      RTX_FRAME_RELATED_P (insn) = true;
3234 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
3235 			    gen_rtx_SET (dest, plus_constant (Pmode, src,
3236 							      poly_offset)));
3237 	    }
3238 	  src = dest;
3239 	  if (constant == 0)
3240 	    return;
3241 	}
3242       else
3243 	{
3244 	  src = aarch64_force_temporary (mode, temp1, val);
3245 	  temp1 = temp2;
3246 	  temp2 = NULL_RTX;
3247 	}
3248 
3249       emit_move_imm = true;
3250     }
3251 
3252   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3253 			frame_related_p, emit_move_imm);
3254 }
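
/* An illustrative sketch of the splitting above (not taken from any
   particular test case): adding the byte size of one full SVE vector,
   i.e. the poly_int64 (16, 16), gives FACTOR == 16 and CONSTANT == 0,
   so when the ADDVL/ADDPL immediate test accepts it the whole
   adjustment is a single "addvl dest, src, #1".  An offset such as
   (20, 16) splits into that same ADDVL plus a trailing
   "add dest, dest, #4" emitted via aarch64_add_offset_1.  */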
3255 
3256 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3257    than a poly_int64.  */
3258 
3259 void
3260 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3261 			  rtx offset_rtx, rtx temp1, rtx temp2)
3262 {
3263   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3264 		      temp1, temp2, false);
3265 }
3266 
3267 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3268    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
3269    if TEMP1 already contains abs (DELTA).  */
3270 
3271 static inline void
3272 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3273 {
3274   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3275 		      temp1, temp2, true, emit_move_imm);
3276 }
3277 
3278 /* Subtract DELTA from the stack pointer, marking the instructions
3279    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
3280    if nonnull.  */
3281 
3282 static inline void
3283 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3284 		bool emit_move_imm = true)
3285 {
3286   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3287 		      temp1, temp2, frame_related_p, emit_move_imm);
3288 }
3289 
3290 /* Set DEST to (vec_series BASE STEP).  */
3291 
3292 static void
3293 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3294 {
3295   machine_mode mode = GET_MODE (dest);
3296   scalar_mode inner = GET_MODE_INNER (mode);
3297 
3298   /* Each operand can be a register or an immediate in the range [-16, 15].  */
3299   if (!aarch64_sve_index_immediate_p (base))
3300     base = force_reg (inner, base);
3301   if (!aarch64_sve_index_immediate_p (step))
3302     step = force_reg (inner, step);
3303 
3304   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3305 }
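
/* A minimal example, assuming a VNx4SI destination: expanding
   (vec_series 0 1) feeds the SVE INDEX patterns, so the eventual
   output is typically "index z0.s, #0, #1".  Bases or steps outside
   [-16, 15] are first forced into scalar registers, giving e.g.
   "index z0.s, w1, w2".  */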
3306 
3307 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3308    integer of mode INT_MODE.  Return true on success.  */
3309 
3310 static bool
3311 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3312 				      rtx src)
3313 {
3314   /* If the constant is smaller than 128 bits, we can do the move
3315      using a vector of SRC_MODEs.  */
3316   if (src_mode != TImode)
3317     {
3318       poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3319 				     GET_MODE_SIZE (src_mode));
3320       machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3321       emit_move_insn (gen_lowpart (dup_mode, dest),
3322 		      gen_const_vec_duplicate (dup_mode, src));
3323       return true;
3324     }
3325 
3326   /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
3327   src = force_const_mem (src_mode, src);
3328   if (!src)
3329     return false;
3330 
3331   /* Make sure that the address is legitimate.  */
3332   if (!aarch64_sve_ld1r_operand_p (src))
3333     {
3334       rtx addr = force_reg (Pmode, XEXP (src, 0));
3335       src = replace_equiv_address (src, addr);
3336     }
3337 
3338   machine_mode mode = GET_MODE (dest);
3339   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3340   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3341   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3342   src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3343   emit_insn (gen_rtx_SET (dest, src));
3344   return true;
3345 }
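
/* A sketch of the two paths above, assuming a VNx16QI destination:
   a 32-bit SRC is handled by rewriting the move as a VNx4SI duplicate
   of that SImode value (first branch), whereas a full 128-bit TImode
   SRC is spilled to the constant pool and loaded with something like
   "ld1rqb { z0.b }, p0/z, [x0]" or a sibling LD1RQ form.  */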
3346 
3347 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3348    isn't a simple duplicate or series.  */
3349 
3350 static void
3351 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3352 {
3353   machine_mode mode = GET_MODE (src);
3354   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3355   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3356   gcc_assert (npatterns > 1);
3357 
3358   if (nelts_per_pattern == 1)
3359     {
3360       /* The constant is a repeating sequence of at least two elements,
3361 	 where the repeating elements occupy no more than 128 bits.
3362 	 Get an integer representation of the replicated value.  */
3363       scalar_int_mode int_mode;
3364       if (BYTES_BIG_ENDIAN)
3365 	/* For now, always use LD1RQ to load the value on big-endian
3366 	   targets, since the handling of smaller integers includes a
3367 	   subreg that is semantically an element reverse.  */
3368 	int_mode = TImode;
3369       else
3370 	{
3371 	  unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3372 	  gcc_assert (int_bits <= 128);
3373 	  int_mode = int_mode_for_size (int_bits, 0).require ();
3374 	}
3375       rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3376       if (int_value
3377 	  && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3378 	return;
3379     }
3380 
3381   /* Expand each pattern individually.  */
3382   rtx_vector_builder builder;
3383   auto_vec<rtx, 16> vectors (npatterns);
3384   for (unsigned int i = 0; i < npatterns; ++i)
3385     {
3386       builder.new_vector (mode, 1, nelts_per_pattern);
3387       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3388 	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3389       vectors.quick_push (force_reg (mode, builder.build ()));
3390     }
3391 
3392   /* Use permutes to interleave the separate vectors.  */
3393   while (npatterns > 1)
3394     {
3395       npatterns /= 2;
3396       for (unsigned int i = 0; i < npatterns; ++i)
3397 	{
3398 	  rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3399 	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3400 	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3401 	  vectors[i] = tmp;
3402 	}
3403     }
3404   gcc_assert (vectors[0] == dest);
3405 }
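
/* A worked example of the interleaving loop, purely for illustration:
   with NPATTERNS == 4, the four single-pattern vectors A, B, C and D
   are first combined as ZIP1 (A, C) and ZIP1 (B, D); a second round of
   ZIP1 on those two results then recreates the required element order
   A0 B0 C0 D0 A1 B1 C1 D1 ... in DEST.  */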
3406 
3407 /* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
3408    is a pattern that can be used to set DEST to a replicated scalar
3409    element.  */
3410 
3411 void
3412 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3413 			      rtx (*gen_vec_duplicate) (rtx, rtx))
3414 {
3415   machine_mode mode = GET_MODE (dest);
3416 
3417   /* Check on what type of symbol it is.  */
3418   scalar_int_mode int_mode;
3419   if ((GET_CODE (imm) == SYMBOL_REF
3420        || GET_CODE (imm) == LABEL_REF
3421        || GET_CODE (imm) == CONST
3422        || GET_CODE (imm) == CONST_POLY_INT)
3423       && is_a <scalar_int_mode> (mode, &int_mode))
3424     {
3425       rtx mem;
3426       poly_int64 offset;
3427       HOST_WIDE_INT const_offset;
3428       enum aarch64_symbol_type sty;
3429 
3430       /* If we have (const (plus symbol offset)), separate out the offset
3431 	 before we start classifying the symbol.  */
3432       rtx base = strip_offset (imm, &offset);
3433 
3434       /* We must always add an offset involving VL separately, rather than
3435 	 folding it into the relocation.  */
3436       if (!offset.is_constant (&const_offset))
3437 	{
3438 	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3439 	    emit_insn (gen_rtx_SET (dest, imm));
3440 	  else
3441 	    {
3442 	      /* Do arithmetic on 32-bit values if the result is smaller
3443 		 than that.  */
3444 	      if (partial_subreg_p (int_mode, SImode))
3445 		{
3446 		  /* It is invalid to do symbol calculations in modes
3447 		     narrower than SImode.  */
3448 		  gcc_assert (base == const0_rtx);
3449 		  dest = gen_lowpart (SImode, dest);
3450 		  int_mode = SImode;
3451 		}
3452 	      if (base != const0_rtx)
3453 		{
3454 		  base = aarch64_force_temporary (int_mode, dest, base);
3455 		  aarch64_add_offset (int_mode, dest, base, offset,
3456 				      NULL_RTX, NULL_RTX, false);
3457 		}
3458 	      else
3459 		aarch64_add_offset (int_mode, dest, base, offset,
3460 				    dest, NULL_RTX, false);
3461 	    }
3462 	  return;
3463 	}
3464 
3465       sty = aarch64_classify_symbol (base, const_offset);
3466       switch (sty)
3467 	{
3468 	case SYMBOL_FORCE_TO_MEM:
3469 	  if (const_offset != 0
3470 	      && targetm.cannot_force_const_mem (int_mode, imm))
3471 	    {
3472 	      gcc_assert (can_create_pseudo_p ());
3473 	      base = aarch64_force_temporary (int_mode, dest, base);
3474 	      aarch64_add_offset (int_mode, dest, base, const_offset,
3475 				  NULL_RTX, NULL_RTX, false);
3476 	      return;
3477 	    }
3478 
3479 	  mem = force_const_mem (ptr_mode, imm);
3480 	  gcc_assert (mem);
3481 
3482 	  /* If we aren't generating PC relative literals, then
3483 	     we need to expand the literal pool access carefully.
3484 	     This is something that needs to be done in a number
3485 	     of places, so could well live as a separate function.  */
3486 	  if (!aarch64_pcrelative_literal_loads)
3487 	    {
3488 	      gcc_assert (can_create_pseudo_p ());
3489 	      base = gen_reg_rtx (ptr_mode);
3490 	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3491 	      if (ptr_mode != Pmode)
3492 		base = convert_memory_address (Pmode, base);
3493 	      mem = gen_rtx_MEM (ptr_mode, base);
3494 	    }
3495 
3496 	  if (int_mode != ptr_mode)
3497 	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3498 
3499 	  emit_insn (gen_rtx_SET (dest, mem));
3500 
3501 	  return;
3502 
3503         case SYMBOL_SMALL_TLSGD:
3504         case SYMBOL_SMALL_TLSDESC:
3505 	case SYMBOL_SMALL_TLSIE:
3506 	case SYMBOL_SMALL_GOT_28K:
3507 	case SYMBOL_SMALL_GOT_4G:
3508 	case SYMBOL_TINY_GOT:
3509 	case SYMBOL_TINY_TLSIE:
3510 	  if (const_offset != 0)
3511 	    {
3512 	      gcc_assert (can_create_pseudo_p ());
3513 	      base = aarch64_force_temporary (int_mode, dest, base);
3514 	      aarch64_add_offset (int_mode, dest, base, const_offset,
3515 				  NULL_RTX, NULL_RTX, false);
3516 	      return;
3517 	    }
3518 	  /* FALLTHRU */
3519 
3520 	case SYMBOL_SMALL_ABSOLUTE:
3521 	case SYMBOL_TINY_ABSOLUTE:
3522 	case SYMBOL_TLSLE12:
3523 	case SYMBOL_TLSLE24:
3524 	case SYMBOL_TLSLE32:
3525 	case SYMBOL_TLSLE48:
3526 	  aarch64_load_symref_appropriately (dest, imm, sty);
3527 	  return;
3528 
3529 	default:
3530 	  gcc_unreachable ();
3531 	}
3532     }
3533 
3534   if (!CONST_INT_P (imm))
3535     {
3536       rtx base, step, value;
3537       if (GET_CODE (imm) == HIGH
3538 	  || aarch64_simd_valid_immediate (imm, NULL))
3539 	emit_insn (gen_rtx_SET (dest, imm));
3540       else if (const_vec_series_p (imm, &base, &step))
3541 	aarch64_expand_vec_series (dest, base, step);
3542       else if (const_vec_duplicate_p (imm, &value))
3543 	{
3544 	  /* If the constant is out of range of an SVE vector move,
3545 	     load it from memory if we can, otherwise move it into
3546 	     a register and use a DUP.  */
3547 	  scalar_mode inner_mode = GET_MODE_INNER (mode);
3548 	  rtx op = force_const_mem (inner_mode, value);
3549 	  if (!op)
3550 	    op = force_reg (inner_mode, value);
3551 	  else if (!aarch64_sve_ld1r_operand_p (op))
3552 	    {
3553 	      rtx addr = force_reg (Pmode, XEXP (op, 0));
3554 	      op = replace_equiv_address (op, addr);
3555 	    }
3556 	  emit_insn (gen_vec_duplicate (dest, op));
3557 	}
3558       else if (GET_CODE (imm) == CONST_VECTOR
3559 	       && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3560 	aarch64_expand_sve_const_vector (dest, imm);
3561       else
3562 	{
3563 	  rtx mem = force_const_mem (mode, imm);
3564 	  gcc_assert (mem);
3565 	  emit_move_insn (dest, mem);
3566 	}
3567 
3568       return;
3569     }
3570 
3571   aarch64_internal_mov_immediate (dest, imm, true,
3572 				  as_a <scalar_int_mode> (mode));
3573 }
3574 
3575 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
3576    that is known to contain PTRUE.  */
3577 
3578 void
3579 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3580 {
3581   expand_operand ops[3];
3582   machine_mode mode = GET_MODE (dest);
3583   create_output_operand (&ops[0], dest, mode);
3584   create_input_operand (&ops[1], pred, GET_MODE (pred));
3585   create_input_operand (&ops[2], src, mode);
3586   expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3587 }
3588 
3589 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3590    operand is in memory.  In this case we need to use the predicated LD1
3591    and ST1 instead of LDR and STR, both for correctness on big-endian
3592    targets and because LD1 and ST1 support a wider range of addressing modes.
3593    PRED_MODE is the mode of the predicate.
3594 
3595    See the comment at the head of aarch64-sve.md for details about the
3596    big-endian handling.  */
3597 
3598 void
3599 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3600 {
3601   machine_mode mode = GET_MODE (dest);
3602   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3603   if (!register_operand (src, mode)
3604       && !register_operand (dest, mode))
3605     {
3606       rtx tmp = gen_reg_rtx (mode);
3607       if (MEM_P (src))
3608 	aarch64_emit_sve_pred_move (tmp, ptrue, src);
3609       else
3610 	emit_move_insn (tmp, src);
3611       src = tmp;
3612     }
3613   aarch64_emit_sve_pred_move (dest, ptrue, src);
3614 }
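
/* Illustrative only: a memory-to-memory VNx16QI copy becomes a
   predicated load into a fresh temporary followed by a predicated
   store, roughly

       ptrue   p0.b
       ld1b    { z0.b }, p0/z, [x0]
       st1b    { z0.b }, p0, [x1]

   rather than an LDR/STR pair, which would not be correct on
   big-endian targets and supports fewer addressing modes.  */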
3615 
3616 /* Called only on big-endian targets.  See whether an SVE vector move
3617    from SRC to DEST is effectively a REV[BHW] instruction, because at
3618    least one operand is a subreg of an SVE vector that has wider or
3619    narrower elements.  Return true and emit the instruction if so.
3620 
3621    For example:
3622 
3623      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3624 
3625    represents a VIEW_CONVERT between the following vectors, viewed
3626    in memory order:
3627 
3628      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
3629      R1: { [0],      [1],      [2],      [3],     ... }
3630 
3631    The high part of lane X in R2 should therefore correspond to lane X*2
3632    of R1, but the register representations are:
3633 
3634          msb                                      lsb
3635      R2: ...... [1].high  [1].low   [0].high  [0].low
3636      R1: ...... [3]       [2]       [1]       [0]
3637 
3638    where the low part of lane X in R2 corresponds to lane X*2 in R1.
3639    We therefore need a reverse operation to swap the high and low values
3640    around.
3641 
3642    This is purely an optimization.  Without it we would spill the
3643    subreg operand to the stack in one mode and reload it in the
3644    other mode, which has the same effect as the REV.  */
3645 
3646 bool
3647 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3648 {
3649   gcc_assert (BYTES_BIG_ENDIAN);
3650   if (GET_CODE (dest) == SUBREG)
3651     dest = SUBREG_REG (dest);
3652   if (GET_CODE (src) == SUBREG)
3653     src = SUBREG_REG (src);
3654 
3655   /* The optimization handles moves between two plain SVE REGs with
3656      different element sizes.  */
3657   if (!REG_P (dest)
3658       || !REG_P (src)
3659       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3660       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3661       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3662 	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3663     return false;
3664 
3665   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
3666   rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3667   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3668 			       UNSPEC_REV_SUBREG);
3669   emit_insn (gen_rtx_SET (dest, unspec));
3670   return true;
3671 }
3672 
3673 /* Return a copy of X with mode MODE, without changing its other
3674    attributes.  Unlike gen_lowpart, this doesn't care whether the
3675    mode change is valid.  */
3676 
3677 static rtx
3678 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3679 {
3680   if (GET_MODE (x) == mode)
3681     return x;
3682 
3683   x = shallow_copy_rtx (x);
3684   set_mode_and_regno (x, mode, REGNO (x));
3685   return x;
3686 }
3687 
3688 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3689    operands.  */
3690 
3691 void
3692 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3693 {
3694   /* Decide which REV operation we need.  The mode with narrower elements
3695      determines the mode of the operands and the mode with the wider
3696      elements determines the reverse width.  */
3697   machine_mode mode_with_wider_elts = GET_MODE (dest);
3698   machine_mode mode_with_narrower_elts = GET_MODE (src);
3699   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3700       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3701     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3702 
3703   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3704   unsigned int unspec;
3705   if (wider_bytes == 8)
3706     unspec = UNSPEC_REV64;
3707   else if (wider_bytes == 4)
3708     unspec = UNSPEC_REV32;
3709   else if (wider_bytes == 2)
3710     unspec = UNSPEC_REV16;
3711   else
3712     gcc_unreachable ();
3713   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3714 
3715   /* Emit:
3716 
3717        (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3718 			 UNSPEC_MERGE_PTRUE))
3719 
3720      with the appropriate modes.  */
3721   ptrue = gen_lowpart (pred_mode, ptrue);
3722   dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3723   src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3724   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3725   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3726 			UNSPEC_MERGE_PTRUE);
3727   emit_insn (gen_rtx_SET (dest, src));
3728 }
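
/* A concrete but purely illustrative instance: for

     (set (reg:VNx8HI z0) (subreg:VNx8HI (reg:VNx16QI z1) 0))

   the wider element size is 2 bytes, so the split uses UNSPEC_REV16
   with VNx16QI operands and the eventual output is a byte reverse
   within each halfword, e.g. "revb z0.h, p0/m, z1.h".  */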
3729 
3730 static bool
3731 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3732 				 tree exp ATTRIBUTE_UNUSED)
3733 {
3734   if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3735     return false;
3736 
3737   return true;
3738 }
3739 
3740 /* Implement TARGET_PASS_BY_REFERENCE.  */
3741 
3742 static bool
3743 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3744 			   machine_mode mode,
3745 			   const_tree type,
3746 			   bool named ATTRIBUTE_UNUSED)
3747 {
3748   HOST_WIDE_INT size;
3749   machine_mode dummymode;
3750   int nregs;
3751 
3752   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
3753   if (mode == BLKmode && type)
3754     size = int_size_in_bytes (type);
3755   else
3756     /* No frontends can create types with variable-sized modes, so we
3757        shouldn't be asked to pass or return them.  */
3758     size = GET_MODE_SIZE (mode).to_constant ();
3759 
3760   /* Aggregates are passed by reference based on their size.  */
3761   if (type && AGGREGATE_TYPE_P (type))
3762     {
3763       size = int_size_in_bytes (type);
3764     }
3765 
3766   /* Variable sized arguments are always returned by reference.  */
3767   if (size < 0)
3768     return true;
3769 
3770   /* Can this be a candidate to be passed in fp/simd register(s)?  */
3771   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3772 					       &dummymode, &nregs,
3773 					       NULL))
3774     return false;
3775 
3776   /* Arguments which are variable sized or larger than 2 registers are
3777      passed by reference unless they are a homogeneous floating-point
3778      aggregate.  */
3779   return size > 2 * UNITS_PER_WORD;
3780 }
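
/* Two hedged examples of the rule above: a plain 24-byte structure
   occupies three registers' worth of data and is therefore passed by
   reference, whereas a 32-byte homogeneous aggregate of four doubles
   is a SIMD/FP candidate and so is still passed by value.  */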
3781 
3782 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
3783 static bool
3784 aarch64_return_in_msb (const_tree valtype)
3785 {
3786   machine_mode dummy_mode;
3787   int dummy_int;
3788 
3789   /* Never happens in little-endian mode.  */
3790   if (!BYTES_BIG_ENDIAN)
3791     return false;
3792 
3793   /* Only composite types smaller than or equal to 16 bytes can
3794      be potentially returned in registers.  */
3795   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3796       || int_size_in_bytes (valtype) <= 0
3797       || int_size_in_bytes (valtype) > 16)
3798     return false;
3799 
3800   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3801      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3802      is always passed/returned in the least significant bits of fp/simd
3803      register(s).  */
3804   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3805 					       &dummy_mode, &dummy_int, NULL))
3806     return false;
3807 
3808   return true;
3809 }
3810 
3811 /* Implement TARGET_FUNCTION_VALUE.
3812    Define how to find the value returned by a function.  */
3813 
3814 static rtx
3815 aarch64_function_value (const_tree type, const_tree func,
3816 			bool outgoing ATTRIBUTE_UNUSED)
3817 {
3818   machine_mode mode;
3819   int unsignedp;
3820   int count;
3821   machine_mode ag_mode;
3822 
3823   mode = TYPE_MODE (type);
3824   if (INTEGRAL_TYPE_P (type))
3825     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3826 
3827   if (aarch64_return_in_msb (type))
3828     {
3829       HOST_WIDE_INT size = int_size_in_bytes (type);
3830 
3831       if (size % UNITS_PER_WORD != 0)
3832 	{
3833 	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3834 	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3835 	}
3836     }
3837 
3838   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3839 					       &ag_mode, &count, NULL))
3840     {
3841       if (!aarch64_composite_type_p (type, mode))
3842 	{
3843 	  gcc_assert (count == 1 && mode == ag_mode);
3844 	  return gen_rtx_REG (mode, V0_REGNUM);
3845 	}
3846       else
3847 	{
3848 	  int i;
3849 	  rtx par;
3850 
3851 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3852 	  for (i = 0; i < count; i++)
3853 	    {
3854 	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3855 	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3856 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3857 	      XVECEXP (par, 0, i) = tmp;
3858 	    }
3859 	  return par;
3860 	}
3861     }
3862   else
3863     return gen_rtx_REG (mode, R0_REGNUM);
3864 }
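
/* For illustration, assuming TARGET_FLOAT: a homogeneous aggregate of
   four floats comes back as a PARALLEL over V0..V3 with the pieces
   4 bytes apart, a lone double comes back directly in V0, and a
   16-byte integer structure falls through to the final case and is
   returned in X0/X1.  */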
3865 
3866 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3867    Return true if REGNO is the number of a hard register in which the value
3868    of a called function may come back.  */
3869 
3870 static bool
3871 aarch64_function_value_regno_p (const unsigned int regno)
3872 {
3873   /* Maximum of 16 bytes can be returned in the general registers.  Examples
3874      of 16-byte return values are: 128-bit integers and 16-byte small
3875      structures (excluding homogeneous floating-point aggregates).  */
3876   if (regno == R0_REGNUM || regno == R1_REGNUM)
3877     return true;
3878 
3879   /* Up to four fp/simd registers can return a function value, e.g. a
3880      homogeneous floating-point aggregate having four members.  */
3881   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3882     return TARGET_FLOAT;
3883 
3884   return false;
3885 }
3886 
3887 /* Implement TARGET_RETURN_IN_MEMORY.
3888 
3889    If the type T of the result of a function is such that
3890      void func (T arg)
3891    would require that arg be passed as a value in a register (or set of
3892    registers) according to the parameter passing rules, then the result
3893    is returned in the same registers as would be used for such an
3894    argument.  */
3895 
3896 static bool
3897 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3898 {
3899   HOST_WIDE_INT size;
3900   machine_mode ag_mode;
3901   int count;
3902 
3903   if (!AGGREGATE_TYPE_P (type)
3904       && TREE_CODE (type) != COMPLEX_TYPE
3905       && TREE_CODE (type) != VECTOR_TYPE)
3906     /* Simple scalar types are always returned in registers.  */
3907     return false;
3908 
3909   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3910 					       type,
3911 					       &ag_mode,
3912 					       &count,
3913 					       NULL))
3914     return false;
3915 
3916   /* Types larger than 2 registers are returned in memory.  */
3917   size = int_size_in_bytes (type);
3918   return (size < 0 || size > 2 * UNITS_PER_WORD);
3919 }
3920 
3921 static bool
3922 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3923 			       const_tree type, int *nregs)
3924 {
3925   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3926   return aarch64_vfp_is_call_or_return_candidate (mode,
3927 						  type,
3928 						  &pcum->aapcs_vfp_rmode,
3929 						  nregs,
3930 						  NULL);
3931 }
3932 
3933 /* Given MODE and TYPE of a function argument, return the alignment in
3934    bits.  The idea is to suppress any stronger alignment requested by
3935    the user and opt for the natural alignment (specified in AAPCS64 \S
3936    4.1).  ABI_BREAK is set to true if the alignment was incorrectly
3937    calculated in versions of GCC prior to GCC-9.  This is a helper
3938    function for local use only.  */
3939 
3940 static unsigned int
3941 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3942 				bool *abi_break)
3943 {
3944   *abi_break = false;
3945   if (!type)
3946     return GET_MODE_ALIGNMENT (mode);
3947 
3948   if (integer_zerop (TYPE_SIZE (type)))
3949     return 0;
3950 
3951   gcc_assert (TYPE_MODE (type) == mode);
3952 
3953   if (!AGGREGATE_TYPE_P (type))
3954     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3955 
3956   if (TREE_CODE (type) == ARRAY_TYPE)
3957     return TYPE_ALIGN (TREE_TYPE (type));
3958 
3959   unsigned int alignment = 0;
3960   unsigned int bitfield_alignment = 0;
3961   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3962     if (TREE_CODE (field) == FIELD_DECL)
3963       {
3964 	alignment = std::max (alignment, DECL_ALIGN (field));
3965 	if (DECL_BIT_FIELD_TYPE (field))
3966 	  bitfield_alignment
3967 	    = std::max (bitfield_alignment,
3968 			TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3969       }
3970 
3971   if (bitfield_alignment > alignment)
3972     {
3973       *abi_break = true;
3974       return bitfield_alignment;
3975     }
3976 
3977   return alignment;
3978 }
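
/* Loosely illustrative of the ABI_BREAK case above: for an aggregate
   whose most-aligned member is a bit-field declared with a
   16-byte-aligned type (a hypothetical "struct s { __int128 x : 63; };",
   say), the alignment is now taken from the declared type of the
   bit-field, whereas releases before GCC 9.1 considered only the
   DECL_ALIGN of the fields and could compute a smaller value; the
   bitfield_alignment bookkeeping above is what detects that
   difference.  */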
3979 
3980 /* Layout a function argument according to the AAPCS64 rules.  The rule
3981    numbers refer to the rule numbers in the AAPCS64.  */
3982 
3983 static void
3984 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3985 		    const_tree type,
3986 		    bool named ATTRIBUTE_UNUSED)
3987 {
3988   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3989   int ncrn, nvrn, nregs;
3990   bool allocate_ncrn, allocate_nvrn;
3991   HOST_WIDE_INT size;
3992   bool abi_break;
3993 
3994   /* We need to do this once per argument.  */
3995   if (pcum->aapcs_arg_processed)
3996     return;
3997 
3998   pcum->aapcs_arg_processed = true;
3999 
4000   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
4001   if (type)
4002     size = int_size_in_bytes (type);
4003   else
4004     /* No frontends can create types with variable-sized modes, so we
4005        shouldn't be asked to pass or return them.  */
4006     size = GET_MODE_SIZE (mode).to_constant ();
4007   size = ROUND_UP (size, UNITS_PER_WORD);
4008 
4009   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4010   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4011 						 mode,
4012 						 type,
4013 						 &nregs);
4014 
4015   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4016      The following code thus handles passing by SIMD/FP registers first.  */
4017 
4018   nvrn = pcum->aapcs_nvrn;
4019 
4020   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4021      and homogeneous short-vector aggregates (HVA).  */
4022   if (allocate_nvrn)
4023     {
4024       if (!TARGET_FLOAT)
4025 	aarch64_err_no_fpadvsimd (mode);
4026 
4027       if (nvrn + nregs <= NUM_FP_ARG_REGS)
4028 	{
4029 	  pcum->aapcs_nextnvrn = nvrn + nregs;
4030 	  if (!aarch64_composite_type_p (type, mode))
4031 	    {
4032 	      gcc_assert (nregs == 1);
4033 	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4034 	    }
4035 	  else
4036 	    {
4037 	      rtx par;
4038 	      int i;
4039 	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4040 	      for (i = 0; i < nregs; i++)
4041 		{
4042 		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4043 					 V0_REGNUM + nvrn + i);
4044 		  rtx offset = gen_int_mode
4045 		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4046 		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4047 		  XVECEXP (par, 0, i) = tmp;
4048 		}
4049 	      pcum->aapcs_reg = par;
4050 	    }
4051 	  return;
4052 	}
4053       else
4054 	{
4055 	  /* C.3 NSRN is set to 8.  */
4056 	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4057 	  goto on_stack;
4058 	}
4059     }
4060 
4061   ncrn = pcum->aapcs_ncrn;
4062   nregs = size / UNITS_PER_WORD;
4063 
4064   /* C6 - C9, though the sign and zero extension semantics are
4065      handled elsewhere.  This is the case where the argument fits
4066      entirely in general registers.  */
4067   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4068     {
4069       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4070 
4071       /* C.8 if the argument has an alignment of 16 then the NGRN is
4072 	 rounded up to the next even number.  */
4073       if (nregs == 2
4074 	  && ncrn % 2
4075 	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4076 	     comparison is there because for > 16 * BITS_PER_UNIT
4077 	     alignment nregs should be > 2 and therefore it should be
4078 	     passed by reference rather than value.  */
4079 	  && (aarch64_function_arg_alignment (mode, type, &abi_break)
4080 	      == 16 * BITS_PER_UNIT))
4081 	{
4082 	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4083 	    inform (input_location, "parameter passing for argument of type "
4084 		    "%qT changed in GCC 9.1", type);
4085 	  ++ncrn;
4086 	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4087 	}
4088 
4089       /* NREGS can be 0 when e.g. an empty structure is to be passed.
4090 	 A reg is still generated for it, but the caller should be smart
4091 	 enough not to use it.  */
4092       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4093 	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4094       else
4095 	{
4096 	  rtx par;
4097 	  int i;
4098 
4099 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4100 	  for (i = 0; i < nregs; i++)
4101 	    {
4102 	      rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4103 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4104 				       GEN_INT (i * UNITS_PER_WORD));
4105 	      XVECEXP (par, 0, i) = tmp;
4106 	    }
4107 	  pcum->aapcs_reg = par;
4108 	}
4109 
4110       pcum->aapcs_nextncrn = ncrn + nregs;
4111       return;
4112     }
4113 
4114   /* C.11  */
4115   pcum->aapcs_nextncrn = NUM_ARG_REGS;
4116 
4117   /* The argument is passed on stack; record the needed number of words for
4118      this argument and align the total size if necessary.  */
4119 on_stack:
4120   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4121 
4122   if (aarch64_function_arg_alignment (mode, type, &abi_break)
4123       == 16 * BITS_PER_UNIT)
4124     {
4125       int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4126       if (pcum->aapcs_stack_size != new_size)
4127 	{
4128 	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4129 	    inform (input_location, "parameter passing for argument of type "
4130 		    "%qT changed in GCC 9.1", type);
4131 	  pcum->aapcs_stack_size = new_size;
4132 	}
4133     }
4134   return;
4135 }
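
/* Two hedged walk-throughs of the layout above: an HFA of four doubles
   arriving with NSRN == 0 is allocated V0..V3 and advances NSRN to 4
   (the C1 - C5 block), while a 16-byte-aligned __int128 arriving when
   NGRN is odd first has NGRN rounded up to the next even number (C.8)
   and is then passed in the resulting even/odd X-register pair.  */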
4136 
4137 /* Implement TARGET_FUNCTION_ARG.  */
4138 
4139 static rtx
4140 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4141 		      const_tree type, bool named)
4142 {
4143   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4144   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4145 
4146   if (mode == VOIDmode)
4147     return NULL_RTX;
4148 
4149   aarch64_layout_arg (pcum_v, mode, type, named);
4150   return pcum->aapcs_reg;
4151 }
4152 
4153 void
4154 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4155 			   const_tree fntype ATTRIBUTE_UNUSED,
4156 			   rtx libname ATTRIBUTE_UNUSED,
4157 			   const_tree fndecl ATTRIBUTE_UNUSED,
4158 			   unsigned n_named ATTRIBUTE_UNUSED)
4159 {
4160   pcum->aapcs_ncrn = 0;
4161   pcum->aapcs_nvrn = 0;
4162   pcum->aapcs_nextncrn = 0;
4163   pcum->aapcs_nextnvrn = 0;
4164   pcum->pcs_variant = ARM_PCS_AAPCS64;
4165   pcum->aapcs_reg = NULL_RTX;
4166   pcum->aapcs_arg_processed = false;
4167   pcum->aapcs_stack_words = 0;
4168   pcum->aapcs_stack_size = 0;
4169 
4170   if (!TARGET_FLOAT
4171       && fndecl && TREE_PUBLIC (fndecl)
4172       && fntype && fntype != error_mark_node)
4173     {
4174       const_tree type = TREE_TYPE (fntype);
4175       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
4176       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
4177       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4178 						   &mode, &nregs, NULL))
4179 	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4180     }
4181   return;
4182 }
4183 
4184 static void
4185 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4186 			      machine_mode mode,
4187 			      const_tree type,
4188 			      bool named)
4189 {
4190   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4191   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4192     {
4193       aarch64_layout_arg (pcum_v, mode, type, named);
4194       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4195 		  != (pcum->aapcs_stack_words != 0));
4196       pcum->aapcs_arg_processed = false;
4197       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4198       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4199       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4200       pcum->aapcs_stack_words = 0;
4201       pcum->aapcs_reg = NULL_RTX;
4202     }
4203 }
4204 
4205 bool
4206 aarch64_function_arg_regno_p (unsigned regno)
4207 {
4208   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4209 	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4210 }
4211 
4212 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
4213    PARM_BOUNDARY bits of alignment, but will be given anything up
4214    to STACK_BOUNDARY bits if the type requires it.  This makes sure
4215    that both before and after the layout of each argument, the Next
4216    Stacked Argument Address (NSAA) will have a minimum alignment of
4217    8 bytes.  */
4218 
4219 static unsigned int
4220 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4221 {
4222   bool abi_break;
4223   unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4224 							   &abi_break);
4225   if (abi_break && warn_psabi)
4226     inform (input_location, "parameter passing for argument of type "
4227 	    "%qT changed in GCC 9.1", type);
4228 
4229   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4230 }
4231 
4232 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
4233 
4234 static fixed_size_mode
4235 aarch64_get_reg_raw_mode (int regno)
4236 {
4237   if (TARGET_SVE && FP_REGNUM_P (regno))
4238     /* Don't use the SVE part of the register for __builtin_apply and
4239        __builtin_return.  The SVE registers aren't used by the normal PCS,
4240        so using them there would be a waste of time.  The PCS extensions
4241        for SVE types are fundamentally incompatible with the
4242        __builtin_return/__builtin_apply interface.  */
4243     return as_a <fixed_size_mode> (V16QImode);
4244   return default_get_reg_raw_mode (regno);
4245 }
4246 
4247 /* Implement TARGET_FUNCTION_ARG_PADDING.
4248 
4249    Small aggregate types are placed in the lowest memory address.
4250 
4251    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
4252 
4253 static pad_direction
4254 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4255 {
4256   /* On little-endian targets, the least significant byte of every stack
4257      argument is passed at the lowest byte address of the stack slot.  */
4258   if (!BYTES_BIG_ENDIAN)
4259     return PAD_UPWARD;
4260 
4261   /* Otherwise, integral, floating-point and pointer types are padded downward:
4262      the least significant byte of a stack argument is passed at the highest
4263      byte address of the stack slot.  */
4264   if (type
4265       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4266 	 || POINTER_TYPE_P (type))
4267       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4268     return PAD_DOWNWARD;
4269 
4270   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
4271   return PAD_UPWARD;
4272 }
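
/* For example (illustrative only): on a big-endian target a single
   "char" argument that ends up on the stack is padded downward, so its
   byte sits at the highest address of its 8-byte slot, whereas on
   little-endian it always sits at the lowest address.  */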
4273 
4274 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4275 
4276    It specifies padding for the last (may also be the only)
4277    element of a block move between registers and memory.  Assuming
4278    the block is in memory, padding upward means that the last
4279    element is padded after its most significant byte, while with
4280    downward padding the last element is padded at its least
4281    significant byte side.
4282 
4283    Small aggregates and small complex types are always padded
4284    upwards.
4285 
4286    We don't need to worry about homogeneous floating-point or
4287    short-vector aggregates; their move is not affected by the
4288    padding direction determined here.  Regardless of endianness,
4289    each element of such an aggregate is put in the least
4290    significant bits of a fp/simd register.
4291 
4292    Return !BYTES_BIG_ENDIAN if the least significant byte of the
4293    register has useful data, and return the opposite if the most
4294    significant byte does.  */
4295 
4296 bool
4297 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4298 		     bool first ATTRIBUTE_UNUSED)
4299 {
4300 
4301   /* Small composite types are always padded upward.  */
4302   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4303     {
4304       HOST_WIDE_INT size;
4305       if (type)
4306 	size = int_size_in_bytes (type);
4307       else
4308 	/* No frontends can create types with variable-sized modes, so we
4309 	   shouldn't be asked to pass or return them.  */
4310 	size = GET_MODE_SIZE (mode).to_constant ();
4311       if (size < 2 * UNITS_PER_WORD)
4312 	return true;
4313     }
4314 
4315   /* Otherwise, use the default padding.  */
4316   return !BYTES_BIG_ENDIAN;
4317 }
4318 
4319 static scalar_int_mode
4320 aarch64_libgcc_cmp_return_mode (void)
4321 {
4322   return SImode;
4323 }
4324 
4325 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4326 
4327 /* We use the 12-bit shifted immediate arithmetic instructions so values
4328    must be multiple of (1 << 12), i.e. 4096.  */
4329 #define ARITH_FACTOR 4096
4330 
4331 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4332 #error Cannot use simple address calculation for stack probing
4333 #endif
4334 
4335 /* The pair of scratch registers used for stack probing.  */
4336 #define PROBE_STACK_FIRST_REG  R9_REGNUM
4337 #define PROBE_STACK_SECOND_REG R10_REGNUM
4338 
4339 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4340    inclusive.  These are offsets from the current stack pointer.  */
4341 
4342 static void
4343 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4344 {
4345   HOST_WIDE_INT size;
4346   if (!poly_size.is_constant (&size))
4347     {
4348       sorry ("stack probes for SVE frames");
4349       return;
4350     }
4351 
4352   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4353 
4354   /* See the same assertion on PROBE_INTERVAL above.  */
4355   gcc_assert ((first % ARITH_FACTOR) == 0);
4356 
4357   /* See if we have a constant small number of probes to generate.  If so,
4358      that's the easy case.  */
4359   if (size <= PROBE_INTERVAL)
4360     {
4361       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4362 
4363       emit_set_insn (reg1,
4364 		     plus_constant (Pmode,
4365 				    stack_pointer_rtx, -(first + base)));
4366       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4367     }
4368 
4369   /* The run-time loop is made up of 8 insns in the generic case while the
4370      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
4371   else if (size <= 4 * PROBE_INTERVAL)
4372     {
4373       HOST_WIDE_INT i, rem;
4374 
4375       emit_set_insn (reg1,
4376 		     plus_constant (Pmode,
4377 				    stack_pointer_rtx,
4378 				    -(first + PROBE_INTERVAL)));
4379       emit_stack_probe (reg1);
4380 
4381       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4382 	 it exceeds SIZE.  If only two probes are needed, this will not
4383 	 generate any code.  Then probe at FIRST + SIZE.  */
4384       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4385 	{
4386 	  emit_set_insn (reg1,
4387 			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4388 	  emit_stack_probe (reg1);
4389 	}
4390 
4391       rem = size - (i - PROBE_INTERVAL);
4392       if (rem > 256)
4393 	{
4394 	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4395 
4396 	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4397 	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4398 	}
4399       else
4400 	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4401     }
4402 
4403   /* Otherwise, do the same as above, but in a loop.  Note that we must be
4404      extra careful with variables wrapping around because we might be at
4405      the very top (or the very bottom) of the address space and we have
4406      to be able to handle this case properly; in particular, we use an
4407      equality test for the loop condition.  */
4408   else
4409     {
4410       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4411 
4412       /* Step 1: round SIZE to the previous multiple of the interval.  */
4413 
4414       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4415 
4416 
4417       /* Step 2: compute initial and final value of the loop counter.  */
4418 
4419       /* TEST_ADDR = SP + FIRST.  */
4420       emit_set_insn (reg1,
4421 		     plus_constant (Pmode, stack_pointer_rtx, -first));
4422 
4423       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
4424       HOST_WIDE_INT adjustment = - (first + rounded_size);
4425       if (! aarch64_uimm12_shift (adjustment))
4426 	{
4427 	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4428 					  true, Pmode);
4429 	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4430 	}
4431       else
4432 	emit_set_insn (reg2,
4433 		       plus_constant (Pmode, stack_pointer_rtx, adjustment));
4434 
4435       /* Step 3: the loop
4436 
4437 	 do
4438 	   {
4439 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4440 	     probe at TEST_ADDR
4441 	   }
4442 	 while (TEST_ADDR != LAST_ADDR)
4443 
4444 	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4445 	 until it is equal to ROUNDED_SIZE.  */
4446 
4447       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4448 
4449 
4450       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4451 	 that SIZE is equal to ROUNDED_SIZE.  */
4452 
4453       if (size != rounded_size)
4454 	{
4455 	  HOST_WIDE_INT rem = size - rounded_size;
4456 
4457 	  if (rem > 256)
4458 	    {
4459 	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4460 
4461 	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4462 	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4463 	    }
4464 	  else
4465 	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4466 	}
4467     }
4468 
4469   /* Make sure nothing is scheduled before we are done.  */
4470   emit_insn (gen_blockage ());
4471 }
4472 
4473 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
4474    absolute addresses.  */
4475 
4476 const char *
4477 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4478 {
4479   static int labelno = 0;
4480   char loop_lab[32];
4481   rtx xops[2];
4482 
4483   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4484 
4485   /* Loop.  */
4486   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4487 
4488   HOST_WIDE_INT stack_clash_probe_interval
4489     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4490 
4491   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
4492   xops[0] = reg1;
4493   HOST_WIDE_INT interval;
4494   if (flag_stack_clash_protection)
4495     interval = stack_clash_probe_interval;
4496   else
4497     interval = PROBE_INTERVAL;
4498 
4499   gcc_assert (aarch64_uimm12_shift (interval));
4500   xops[1] = GEN_INT (interval);
4501 
4502   output_asm_insn ("sub\t%0, %0, %1", xops);
4503 
4504   /* If doing stack clash protection then we probe up by the ABI specified
4505      amount.  We do this because we're dropping full pages at a time in the
4506      loop.  But if we're not doing stack clash probing, probe at offset 0.  */
4507   if (flag_stack_clash_protection)
4508     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4509   else
4510     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4511 
4512   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
4513      by this amount for each iteration.  */
4514   output_asm_insn ("str\txzr, [%0, %1]", xops);
4515 
4516   /* Test if TEST_ADDR == LAST_ADDR.  */
4517   xops[1] = reg2;
4518   output_asm_insn ("cmp\t%0, %1", xops);
4519 
4520   /* Branch.  */
4521   fputs ("\tb.ne\t", asm_out_file);
4522   assemble_name_raw (asm_out_file, loop_lab);
4523   fputc ('\n', asm_out_file);
4524 
4525   return "";
4526 }
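
/* Roughly the loop printed above when stack clash protection is off
   and PROBE_INTERVAL is 4096, with illustrative register numbers:

       .LPSRL0:
	       sub     x9, x9, #4096
	       str     xzr, [x9, 0]
	       cmp     x9, x10
	       b.ne    .LPSRL0  */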
4527 
4528 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4529    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4530    of GUARD_SIZE.  When a probe is emitted it is done at most
4531    MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4532    at most MIN_PROBE_THRESHOLD.  By the end of this function
4533    BASE = BASE - ADJUSTMENT.  */
4534 
4535 const char *
4536 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4537 				      rtx min_probe_threshold, rtx guard_size)
4538 {
4539   /* This function is not allowed to use any instruction generation function
4540      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
4541      so instead emit the code you want using output_asm_insn.  */
4542   gcc_assert (flag_stack_clash_protection);
4543   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4544   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4545 
4546   /* The minimum required allocation before the residual requires probing.  */
4547   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4548 
4549   /* Clamp the value down to the nearest value that can be used with a cmp.  */
4550   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4551   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4552 
4553   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4554   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4555 
4556   static int labelno = 0;
4557   char loop_start_lab[32];
4558   char loop_end_lab[32];
4559   rtx xops[2];
4560 
4561   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4562   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4563 
4564   /* Emit loop start label.  */
4565   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4566 
4567   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
4568   xops[0] = adjustment;
4569   xops[1] = probe_offset_value_rtx;
4570   output_asm_insn ("cmp\t%0, %1", xops);
4571 
4572   /* Branch to end if not enough adjustment to probe.  */
4573   fputs ("\tb.lt\t", asm_out_file);
4574   assemble_name_raw (asm_out_file, loop_end_lab);
4575   fputc ('\n', asm_out_file);
4576 
4577   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
4578   xops[0] = base;
4579   xops[1] = probe_offset_value_rtx;
4580   output_asm_insn ("sub\t%0, %0, %1", xops);
4581 
4582   /* Probe at BASE.  */
4583   xops[1] = const0_rtx;
4584   output_asm_insn ("str\txzr, [%0, %1]", xops);
4585 
4586   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
4587   xops[0] = adjustment;
4588   xops[1] = probe_offset_value_rtx;
4589   output_asm_insn ("sub\t%0, %0, %1", xops);
4590 
4591   /* Branch to start if still more bytes to allocate.  */
4592   fputs ("\tb\t", asm_out_file);
4593   assemble_name_raw (asm_out_file, loop_start_lab);
4594   fputc ('\n', asm_out_file);
4595 
4596   /* Remaining adjustment is below the probe threshold; no probe needed.  */
4597   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4598 
4599   /* BASE = BASE - ADJUSTMENT.  */
4600   xops[0] = base;
4601   xops[1] = adjustment;
4602   output_asm_insn ("sub\t%0, %0, %1", xops);
4603   return "";
4604 }
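
/* A sketch of the printed sequence, with illustrative operands and a
   4 KiB residual probe guard:

       .SVLPSPL0:
	       cmp     x11, #4096
	       b.lt    .SVLPEND0
	       sub     sp, sp, #4096
	       str     xzr, [sp, 0]
	       sub     x11, x11, #4096
	       b       .SVLPSPL0
       .SVLPEND0:
	       sub     sp, sp, x11  */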
4605 
4606 /* Determine whether a frame chain needs to be generated.  */
4607 static bool
4608 aarch64_needs_frame_chain (void)
4609 {
4610   /* Force a frame chain for EH returns so the return address is at FP+8.  */
4611   if (frame_pointer_needed || crtl->calls_eh_return)
4612     return true;
4613 
4614   /* A leaf function cannot have calls or write LR.  */
4615   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4616 
4617   /* Don't use a frame chain in leaf functions if leaf frame pointers
4618      are disabled.  */
4619   if (flag_omit_leaf_frame_pointer && is_leaf)
4620     return false;
4621 
4622   return aarch64_use_frame_pointer;
4623 }
4624 
4625 /* Mark the registers that need to be saved by the callee and calculate
4626    the size of the callee-saved registers area and frame record (both FP
4627    and LR may be omitted).  */
4628 static void
4629 aarch64_layout_frame (void)
4630 {
4631   HOST_WIDE_INT offset = 0;
4632   int regno, last_fp_reg = INVALID_REGNUM;
4633   bool simd_function = aarch64_simd_decl_p (cfun->decl);
4634 
4635   cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4636 
4637   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
4638      the mid-end is doing.  */
4639   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4640 
4641 #define SLOT_NOT_REQUIRED (-2)
4642 #define SLOT_REQUIRED     (-1)
4643 
4644   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4645   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4646 
4647   /* If this is a non-leaf simd function with calls we assume that
4648      at least one of those calls is to a non-simd function and thus
4649      we must save V8 to V23 in the prologue.  */
4650 
4651   if (simd_function && !crtl->is_leaf)
4652     {
4653       for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4654 	if (FP_SIMD_SAVED_REGNUM_P (regno))
4655  	  df_set_regs_ever_live (regno, true);
4656     }
4657 
4658   /* First mark all the registers that really need to be saved...  */
4659   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4660     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4661 
4662   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4663     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4664 
4665   /* ... that includes the eh data registers (if needed)...  */
4666   if (crtl->calls_eh_return)
4667     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4668       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4669 	= SLOT_REQUIRED;
4670 
4671   /* ... and any callee saved register that dataflow says is live.  */
4672   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4673     if (df_regs_ever_live_p (regno)
4674 	&& (regno == R30_REGNUM
4675 	    || !call_used_regs[regno]))
4676       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4677 
4678   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4679     if (df_regs_ever_live_p (regno)
4680 	&& (!call_used_regs[regno]
4681 	    || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4682       {
4683 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4684 	last_fp_reg = regno;
4685       }
4686 
4687   if (cfun->machine->frame.emit_frame_chain)
4688     {
4689       /* FP and LR are placed in the linkage record.  */
4690       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4691       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4692       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4693       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4694       offset = 2 * UNITS_PER_WORD;
4695     }
4696 
4697   /* With stack-clash, LR must be saved in non-leaf functions.  */
4698   gcc_assert (crtl->is_leaf
4699 	      || (cfun->machine->frame.reg_offset[R30_REGNUM]
4700 		  != SLOT_NOT_REQUIRED));
4701 
4702   /* Now assign stack slots for them.  */
4703   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4704     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4705       {
4706 	cfun->machine->frame.reg_offset[regno] = offset;
4707 	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4708 	  cfun->machine->frame.wb_candidate1 = regno;
4709 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4710 	  cfun->machine->frame.wb_candidate2 = regno;
4711 	offset += UNITS_PER_WORD;
4712       }
4713 
4714   HOST_WIDE_INT max_int_offset = offset;
4715   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4716   bool has_align_gap = offset != max_int_offset;
4717 
4718   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4719     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4720       {
4721 	/* If there is an alignment gap between integer and fp callee-saves,
4722 	   allocate the last fp register to it if possible.  */
4723 	if (regno == last_fp_reg
4724 	    && has_align_gap
4725 	    && !simd_function
4726 	    && (offset & 8) == 0)
4727 	  {
4728 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
4729 	    break;
4730 	  }
4731 
4732 	cfun->machine->frame.reg_offset[regno] = offset;
4733 	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4734 	  cfun->machine->frame.wb_candidate1 = regno;
4735 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4736 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4737 	  cfun->machine->frame.wb_candidate2 = regno;
4738 	offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4739       }
4740 
4741   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4742 
4743   cfun->machine->frame.saved_regs_size = offset;
4744 
4745   HOST_WIDE_INT varargs_and_saved_regs_size
4746     = offset + cfun->machine->frame.saved_varargs_size;
4747 
4748   cfun->machine->frame.hard_fp_offset
4749     = aligned_upper_bound (varargs_and_saved_regs_size
4750 			   + get_frame_size (),
4751 			   STACK_BOUNDARY / BITS_PER_UNIT);
4752 
4753   /* Both these values are already aligned.  */
4754   gcc_assert (multiple_p (crtl->outgoing_args_size,
4755 			  STACK_BOUNDARY / BITS_PER_UNIT));
4756   cfun->machine->frame.frame_size
4757     = (cfun->machine->frame.hard_fp_offset
4758        + crtl->outgoing_args_size);
4759 
4760   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4761 
4762   cfun->machine->frame.initial_adjust = 0;
4763   cfun->machine->frame.final_adjust = 0;
4764   cfun->machine->frame.callee_adjust = 0;
4765   cfun->machine->frame.callee_offset = 0;
4766 
4767   HOST_WIDE_INT max_push_offset = 0;
4768   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4769     max_push_offset = 512;
4770   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4771     max_push_offset = 256;
4772 
4773   HOST_WIDE_INT const_size, const_fp_offset;
4774   if (cfun->machine->frame.frame_size.is_constant (&const_size)
4775       && const_size < max_push_offset
4776       && known_eq (crtl->outgoing_args_size, 0))
4777     {
4778       /* Simple, small frame with no outgoing arguments:
4779 	 stp reg1, reg2, [sp, -frame_size]!
4780 	 stp reg3, reg4, [sp, 16]  */
4781       cfun->machine->frame.callee_adjust = const_size;
4782     }
4783   else if (known_lt (crtl->outgoing_args_size
4784 		     + cfun->machine->frame.saved_regs_size, 512)
4785 	   && !(cfun->calls_alloca
4786 		&& known_lt (cfun->machine->frame.hard_fp_offset,
4787 			     max_push_offset)))
4788     {
4789       /* Frame with small outgoing arguments:
4790 	 sub sp, sp, frame_size
4791 	 stp reg1, reg2, [sp, outgoing_args_size]
4792 	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
4793       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4794       cfun->machine->frame.callee_offset
4795 	= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4796     }
4797   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4798 	   && const_fp_offset < max_push_offset)
4799     {
4800       /* Frame with large outgoing arguments but a small local area:
4801 	 stp reg1, reg2, [sp, -hard_fp_offset]!
4802 	 stp reg3, reg4, [sp, 16]
4803 	 sub sp, sp, outgoing_args_size  */
4804       cfun->machine->frame.callee_adjust = const_fp_offset;
4805       cfun->machine->frame.final_adjust
4806 	= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4807     }
4808   else
4809     {
4810       /* Frame with large local area and outgoing arguments using frame pointer:
4811 	 sub sp, sp, hard_fp_offset
4812 	 stp x29, x30, [sp, 0]
4813 	 add x29, sp, 0
4814 	 stp reg3, reg4, [sp, 16]
4815 	 sub sp, sp, outgoing_args_size  */
4816       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4817       cfun->machine->frame.final_adjust
4818 	= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4819     }
4820 
4821   cfun->machine->frame.laid_out = true;
4822 }
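
/* Illustrative example (hypothetical function, not from the original source):
   for a function that needs a frame chain, saves x19, x20 and d8, and has no
   locals, varargs or outgoing arguments, the layout above assigns
   reg_offset[R29] = 0, reg_offset[R30] = 8, reg_offset[R19] = 16,
   reg_offset[R20] = 24 and reg_offset[V8] = 32, giving saved_regs_size = 48
   after rounding up to STACK_BOUNDARY.  Since 48 < 512 and there are no
   outgoing arguments, the whole frame is allocated by the write-back store
   pair, i.e. callee_adjust = 48.  */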
4823 
4824 /* Return true if the register REGNO is saved on entry to
4825    the current function.  */
4826 
4827 static bool
4828 aarch64_register_saved_on_entry (int regno)
4829 {
4830   return cfun->machine->frame.reg_offset[regno] >= 0;
4831 }
4832 
4833 /* Return the next register up from REGNO up to LIMIT for the callee
4834    to save.  */
4835 
4836 static unsigned
4837 aarch64_next_callee_save (unsigned regno, unsigned limit)
4838 {
4839   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4840     regno ++;
4841   return regno;
4842 }
4843 
4844 /* Push the register number REGNO of mode MODE to the stack with write-back
4845    adjusting the stack by ADJUSTMENT.  */
4846 
4847 static void
4848 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4849 			   HOST_WIDE_INT adjustment)
4850  {
4851   rtx base_rtx = stack_pointer_rtx;
4852   rtx insn, reg, mem;
4853 
4854   reg = gen_rtx_REG (mode, regno);
4855   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4856 			    plus_constant (Pmode, base_rtx, -adjustment));
4857   mem = gen_frame_mem (mode, mem);
4858 
4859   insn = emit_move_insn (mem, reg);
4860   RTX_FRAME_RELATED_P (insn) = 1;
4861 }
4862 
4863 /* Generate and return an instruction to store the pair of registers
4864    REG and REG2 of mode MODE to location BASE with write-back adjusting
4865    the stack location BASE by ADJUSTMENT.  */
4866 
4867 static rtx
4868 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4869 			  HOST_WIDE_INT adjustment)
4870 {
4871   switch (mode)
4872     {
4873     case E_DImode:
4874       return gen_storewb_pairdi_di (base, base, reg, reg2,
4875 				    GEN_INT (-adjustment),
4876 				    GEN_INT (UNITS_PER_WORD - adjustment));
4877     case E_DFmode:
4878       return gen_storewb_pairdf_di (base, base, reg, reg2,
4879 				    GEN_INT (-adjustment),
4880 				    GEN_INT (UNITS_PER_WORD - adjustment));
4881     case E_TFmode:
4882       return gen_storewb_pairtf_di (base, base, reg, reg2,
4883 				    GEN_INT (-adjustment),
4884 				    GEN_INT (UNITS_PER_VREG - adjustment));
4885     default:
4886       gcc_unreachable ();
4887     }
4888 }
4889 
4890 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4891    stack pointer by ADJUSTMENT.  */
4892 
4893 static void
4894 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4895 {
4896   rtx_insn *insn;
4897   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4898 
4899   if (regno2 == INVALID_REGNUM)
4900     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4901 
4902   rtx reg1 = gen_rtx_REG (mode, regno1);
4903   rtx reg2 = gen_rtx_REG (mode, regno2);
4904 
4905   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4906 					      reg2, adjustment));
4907   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4908   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4909   RTX_FRAME_RELATED_P (insn) = 1;
4910 }
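
/* Illustrative sketch (assumed operands, not from the original source):
   aarch64_push_regs (R29_REGNUM, R30_REGNUM, 16) uses DImode for the general
   registers and emits the write-back store pair

     stp x29, x30, [sp, -16]!

   with both saves and the insn itself marked frame-related, so that the
   unwinder sees the stack adjustment and the save slots.  */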
4911 
4912 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4913    adjusting it by ADJUSTMENT afterwards.  */
4914 
4915 static rtx
4916 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4917 			 HOST_WIDE_INT adjustment)
4918 {
4919   switch (mode)
4920     {
4921     case E_DImode:
4922       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4923 				   GEN_INT (UNITS_PER_WORD));
4924     case E_DFmode:
4925       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4926 				   GEN_INT (UNITS_PER_WORD));
4927     case E_TFmode:
4928       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4929 				   GEN_INT (UNITS_PER_VREG));
4930     default:
4931       gcc_unreachable ();
4932     }
4933 }
4934 
4935 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4936    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4937    into CFI_OPS.  */
4938 
4939 static void
4940 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4941 		  rtx *cfi_ops)
4942 {
4943   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4944   rtx reg1 = gen_rtx_REG (mode, regno1);
4945 
4946   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4947 
4948   if (regno2 == INVALID_REGNUM)
4949     {
4950       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4951       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4952       emit_move_insn (reg1, gen_frame_mem (mode, mem));
4953     }
4954   else
4955     {
4956       rtx reg2 = gen_rtx_REG (mode, regno2);
4957       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4958       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4959 					  reg2, adjustment));
4960     }
4961 }
4962 
4963 /* Generate and return a store pair instruction of mode MODE to store
4964    register REG1 to MEM1 and register REG2 to MEM2.  */
4965 
4966 static rtx
4967 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4968 			rtx reg2)
4969 {
4970   switch (mode)
4971     {
4972     case E_DImode:
4973       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4974 
4975     case E_DFmode:
4976       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4977 
4978     case E_TFmode:
4979       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4980 
4981     default:
4982       gcc_unreachable ();
4983     }
4984 }
4985 
4986 /* Generate and return a load pair instruction of mode MODE to load register
4987    REG1 from MEM1 and register REG2 from MEM2.  */
4988 
4989 static rtx
4990 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4991 		       rtx mem2)
4992 {
4993   switch (mode)
4994     {
4995     case E_DImode:
4996       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4997 
4998     case E_DFmode:
4999       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5000 
5001     case E_TFmode:
5002       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5003 
5004     default:
5005       gcc_unreachable ();
5006     }
5007 }
5008 
5009 /* Return TRUE if return address signing should be enabled for the current
5010    function, otherwise return FALSE.  */
5011 
5012 bool
5013 aarch64_return_address_signing_enabled (void)
5014 {
5015   /* This function should only be called after the frame is laid out.  */
5016   gcc_assert (cfun->machine->frame.laid_out);
5017 
5018   /* Turn return address signing off in any function that uses
5019      __builtin_eh_return.  The address passed to __builtin_eh_return
5020      is not signed so either it has to be signed (with original sp)
5021      or the code path that uses it has to avoid authenticating it.
5022      Currently eh return introduces a return-to-anywhere gadget, no
5023      matter what we do here, since it uses ret with a user-provided
5024      address.  An ideal fix for that is to use an indirect branch, which
5025      can be protected with BTI j (to some extent).  */
5026   if (crtl->calls_eh_return)
5027     return false;
5028 
5029   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5030      if its LR is pushed onto the stack.  */
5031   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5032 	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5033 	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5034 }
5035 
5036 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
5037 bool
5038 aarch64_bti_enabled (void)
5039 {
5040   return (aarch64_enable_bti == 1);
5041 }
5042 
5043 /* Emit code to save the callee-saved registers from register number START
5044    to LIMIT to the stack at the location starting at offset START_OFFSET,
5045    skipping any write-back candidates if SKIP_WB is true.  */
5046 
5047 static void
5048 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5049 			   unsigned start, unsigned limit, bool skip_wb)
5050 {
5051   rtx_insn *insn;
5052   unsigned regno;
5053   unsigned regno2;
5054 
5055   for (regno = aarch64_next_callee_save (start, limit);
5056        regno <= limit;
5057        regno = aarch64_next_callee_save (regno + 1, limit))
5058     {
5059       rtx reg, mem;
5060       poly_int64 offset;
5061       int offset_diff;
5062 
5063       if (skip_wb
5064 	  && (regno == cfun->machine->frame.wb_candidate1
5065 	      || regno == cfun->machine->frame.wb_candidate2))
5066 	continue;
5067 
5068       if (cfun->machine->reg_is_wrapped_separately[regno])
5069        continue;
5070 
5071       reg = gen_rtx_REG (mode, regno);
5072       offset = start_offset + cfun->machine->frame.reg_offset[regno];
5073       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5074 						offset));
5075 
5076       regno2 = aarch64_next_callee_save (regno + 1, limit);
5077       offset_diff = cfun->machine->frame.reg_offset[regno2]
5078 		    - cfun->machine->frame.reg_offset[regno];
5079 
5080       if (regno2 <= limit
5081 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
5082 	  && known_eq (GET_MODE_SIZE (mode), offset_diff))
5083 	{
5084 	  rtx reg2 = gen_rtx_REG (mode, regno2);
5085 	  rtx mem2;
5086 
5087 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5088 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5089 						     offset));
5090 	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5091 						    reg2));
5092 
5093 	  /* The first part of a frame-related parallel insn is
5094 	     always assumed to be relevant to the frame
5095 	     calculations; subsequent parts are only
5096 	     frame-related if explicitly marked.  */
5097 	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5098 	  regno = regno2;
5099 	}
5100       else
5101 	insn = emit_move_insn (mem, reg);
5102 
5103       RTX_FRAME_RELATED_P (insn) = 1;
5104     }
5105 }
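
/* Illustrative example (hypothetical offsets): if x19 is assigned offset 16
   and x20 offset 24, the 8-byte difference equals GET_MODE_SIZE (DImode), so
   a single

     stp x19, x20, [sp, 16]

   is emitted for the pair; registers without a neighbour at an adjacent
   offset fall back to individual str instructions.  */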
5106 
5107 /* Emit code to restore the callee registers of mode MODE from register
5108    number START up to and including LIMIT.  Restore from the stack offset
5109    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5110    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
5111 
5112 static void
5113 aarch64_restore_callee_saves (machine_mode mode,
5114 			      poly_int64 start_offset, unsigned start,
5115 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
5116 {
5117   rtx base_rtx = stack_pointer_rtx;
5118   unsigned regno;
5119   unsigned regno2;
5120   poly_int64 offset;
5121 
5122   for (regno = aarch64_next_callee_save (start, limit);
5123        regno <= limit;
5124        regno = aarch64_next_callee_save (regno + 1, limit))
5125     {
5126       if (cfun->machine->reg_is_wrapped_separately[regno])
5127        continue;
5128 
5129       rtx reg, mem;
5130       int offset_diff;
5131 
5132       if (skip_wb
5133 	  && (regno == cfun->machine->frame.wb_candidate1
5134 	      || regno == cfun->machine->frame.wb_candidate2))
5135 	continue;
5136 
5137       reg = gen_rtx_REG (mode, regno);
5138       offset = start_offset + cfun->machine->frame.reg_offset[regno];
5139       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5140 
5141       regno2 = aarch64_next_callee_save (regno + 1, limit);
5142       offset_diff = cfun->machine->frame.reg_offset[regno2]
5143 		    - cfun->machine->frame.reg_offset[regno];
5144 
5145       if (regno2 <= limit
5146 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
5147 	  && known_eq (GET_MODE_SIZE (mode), offset_diff))
5148 	{
5149 	  rtx reg2 = gen_rtx_REG (mode, regno2);
5150 	  rtx mem2;
5151 
5152 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5153 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5154 	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5155 
5156 	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5157 	  regno = regno2;
5158 	}
5159       else
5160 	emit_move_insn (reg, mem);
5161       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5162     }
5163 }
5164 
5165 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5166    of MODE.  */
5167 
5168 static inline bool
5169 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5170 {
5171   HOST_WIDE_INT multiple;
5172   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5173 	  && IN_RANGE (multiple, -8, 7));
5174 }
5175 
5176 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5177    of MODE.  */
5178 
5179 static inline bool
5180 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5181 {
5182   HOST_WIDE_INT multiple;
5183   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5184 	  && IN_RANGE (multiple, 0, 63));
5185 }
5186 
5187 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5188    of MODE.  */
5189 
5190 bool
5191 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5192 {
5193   HOST_WIDE_INT multiple;
5194   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5195 	  && IN_RANGE (multiple, -64, 63));
5196 }
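
/* For example (illustrative): with DImode the scale is 8 bytes, so this
   predicate accepts multiples of 8 in [-512, 504]; an offset of 505 fails the
   multiple test, and 512 falls outside the signed 7-bit range.  */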
5197 
5198 /* Return true if OFFSET is a signed 9-bit value.  */
5199 
5200 bool
5201 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5202 				       poly_int64 offset)
5203 {
5204   HOST_WIDE_INT const_offset;
5205   return (offset.is_constant (&const_offset)
5206 	  && IN_RANGE (const_offset, -256, 255));
5207 }
5208 
5209 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5210    of MODE.  */
5211 
5212 static inline bool
5213 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5214 {
5215   HOST_WIDE_INT multiple;
5216   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5217 	  && IN_RANGE (multiple, -256, 255));
5218 }
5219 
5220 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5221    of MODE.  */
5222 
5223 static inline bool
5224 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5225 {
5226   HOST_WIDE_INT multiple;
5227   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5228 	  && IN_RANGE (multiple, 0, 4095));
5229 }
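
/* For example (illustrative): with DImode this predicate accepts multiples of
   8 in [0, 32760] (i.e. 4095 * 8); aarch64_get_separate_components below uses
   it to check that a save slot is reachable with one direct load.  */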
5230 
5231 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
5232 
5233 static sbitmap
5234 aarch64_get_separate_components (void)
5235 {
5236   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5237   bitmap_clear (components);
5238 
5239   /* The registers we need saved to the frame.  */
5240   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5241     if (aarch64_register_saved_on_entry (regno))
5242       {
5243 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5244 	if (!frame_pointer_needed)
5245 	  offset += cfun->machine->frame.frame_size
5246 		    - cfun->machine->frame.hard_fp_offset;
5247 	/* Check that we can access the stack slot of the register with one
5248 	   direct load with no adjustments needed.  */
5249 	if (offset_12bit_unsigned_scaled_p (DImode, offset))
5250 	  bitmap_set_bit (components, regno);
5251       }
5252 
5253   /* Don't mess with the hard frame pointer.  */
5254   if (frame_pointer_needed)
5255     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5256 
5257   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5258   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5259   /* If registers have been chosen to be stored/restored with
5260      writeback, don't interfere with them to avoid having to output explicit
5261      stack adjustment instructions.  */
5262   if (reg2 != INVALID_REGNUM)
5263     bitmap_clear_bit (components, reg2);
5264   if (reg1 != INVALID_REGNUM)
5265     bitmap_clear_bit (components, reg1);
5266 
5267   bitmap_clear_bit (components, LR_REGNUM);
5268   bitmap_clear_bit (components, SP_REGNUM);
5269 
5270   return components;
5271 }
5272 
5273 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
5274 
5275 static sbitmap
5276 aarch64_components_for_bb (basic_block bb)
5277 {
5278   bitmap in = DF_LIVE_IN (bb);
5279   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5280   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5281   bool simd_function = aarch64_simd_decl_p (cfun->decl);
5282 
5283   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5284   bitmap_clear (components);
5285 
5286   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
5287   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5288     if ((!call_used_regs[regno]
5289 	|| (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5290        && (bitmap_bit_p (in, regno)
5291 	   || bitmap_bit_p (gen, regno)
5292 	   || bitmap_bit_p (kill, regno)))
5293       {
5294 	unsigned regno2, offset, offset2;
5295 	bitmap_set_bit (components, regno);
5296 
5297 	/* If there is a callee-save at an adjacent offset, add it too,
5298 	   to increase the use of LDP/STP.  */
5299 	offset = cfun->machine->frame.reg_offset[regno];
5300 	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5301 
5302 	if (regno2 <= LAST_SAVED_REGNUM)
5303 	  {
5304 	    offset2 = cfun->machine->frame.reg_offset[regno2];
5305 	    if ((offset & ~8) == (offset2 & ~8))
5306 	      bitmap_set_bit (components, regno2);
5307 	  }
5308       }
5309 
5310   return components;
5311 }
5312 
5313 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5314    Nothing to do for aarch64.  */
5315 
5316 static void
5317 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5318 {
5319 }
5320 
5321 /* Return the next set bit in BMP from START onwards.  Return the total number
5322    of bits in BMP if no set bit is found at or after START.  */
5323 
5324 static unsigned int
5325 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5326 {
5327   unsigned int nbits = SBITMAP_SIZE (bmp);
5328   if (start == nbits)
5329     return start;
5330 
5331   gcc_assert (start < nbits);
5332   for (unsigned int i = start; i < nbits; i++)
5333     if (bitmap_bit_p (bmp, i))
5334       return i;
5335 
5336   return nbits;
5337 }
5338 
5339 /* Do the work for aarch64_emit_prologue_components and
5340    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
5341    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5342    for these components or the epilogue sequence.  That is, it determines
5343    whether we should emit stores or loads and what kind of CFA notes to attach
5344    to the insns.  Otherwise the logic for the two sequences is very
5345    similar.  */
5346 
5347 static void
5348 aarch64_process_components (sbitmap components, bool prologue_p)
5349 {
5350   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5351 			     ? HARD_FRAME_POINTER_REGNUM
5352 			     : STACK_POINTER_REGNUM);
5353 
5354   unsigned last_regno = SBITMAP_SIZE (components);
5355   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5356   rtx_insn *insn = NULL;
5357 
5358   while (regno != last_regno)
5359     {
5360       /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5361 	 so DFmode for the vector registers is enough.  For simd functions
5362 	 we want to save the low 128 bits.  */
5363       machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5364 
5365       rtx reg = gen_rtx_REG (mode, regno);
5366       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5367       if (!frame_pointer_needed)
5368 	offset += cfun->machine->frame.frame_size
5369 		  - cfun->machine->frame.hard_fp_offset;
5370       rtx addr = plus_constant (Pmode, ptr_reg, offset);
5371       rtx mem = gen_frame_mem (mode, addr);
5372 
5373       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5374       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5375       /* No more registers to handle after REGNO.
5376 	 Emit a single save/restore and exit.  */
5377       if (regno2 == last_regno)
5378 	{
5379 	  insn = emit_insn (set);
5380 	  RTX_FRAME_RELATED_P (insn) = 1;
5381 	  if (prologue_p)
5382 	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5383 	  else
5384 	    add_reg_note (insn, REG_CFA_RESTORE, reg);
5385 	  break;
5386 	}
5387 
5388       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5389       /* The next register is not of the same class or its offset is not
5390 	 mergeable with the current one into a pair.  */
5391       if (!satisfies_constraint_Ump (mem)
5392 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5393 	  || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5394 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5395 		       GET_MODE_SIZE (mode)))
5396 	{
5397 	  insn = emit_insn (set);
5398 	  RTX_FRAME_RELATED_P (insn) = 1;
5399 	  if (prologue_p)
5400 	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5401 	  else
5402 	    add_reg_note (insn, REG_CFA_RESTORE, reg);
5403 
5404 	  regno = regno2;
5405 	  continue;
5406 	}
5407 
5408       /* REGNO2 can be saved/restored in a pair with REGNO.  */
5409       rtx reg2 = gen_rtx_REG (mode, regno2);
5410       if (!frame_pointer_needed)
5411 	offset2 += cfun->machine->frame.frame_size
5412 		  - cfun->machine->frame.hard_fp_offset;
5413       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5414       rtx mem2 = gen_frame_mem (mode, addr2);
5415       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5416 			     : gen_rtx_SET (reg2, mem2);
5417 
5418       if (prologue_p)
5419 	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5420       else
5421 	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5422 
5423       RTX_FRAME_RELATED_P (insn) = 1;
5424       if (prologue_p)
5425 	{
5426 	  add_reg_note (insn, REG_CFA_OFFSET, set);
5427 	  add_reg_note (insn, REG_CFA_OFFSET, set2);
5428 	}
5429       else
5430 	{
5431 	  add_reg_note (insn, REG_CFA_RESTORE, reg);
5432 	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
5433 	}
5434 
5435       regno = aarch64_get_next_set_bit (components, regno2 + 1);
5436     }
5437 }
5438 
5439 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
5440 
5441 static void
5442 aarch64_emit_prologue_components (sbitmap components)
5443 {
5444   aarch64_process_components (components, true);
5445 }
5446 
5447 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
5448 
5449 static void
5450 aarch64_emit_epilogue_components (sbitmap components)
5451 {
5452   aarch64_process_components (components, false);
5453 }
5454 
5455 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
5456 
5457 static void
5458 aarch64_set_handled_components (sbitmap components)
5459 {
5460   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5461     if (bitmap_bit_p (components, regno))
5462       cfun->machine->reg_is_wrapped_separately[regno] = true;
5463 }
5464 
5465 /* On AArch64 we have an ABI-defined safe buffer.  This constant is used to
5466    determine the probe offset for alloca.  */
5467 
5468 static HOST_WIDE_INT
5469 aarch64_stack_clash_protection_alloca_probe_range (void)
5470 {
5471   return STACK_CLASH_CALLER_GUARD;
5472 }
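
/* Note (restating the frame-layout comments later in this file):
   STACK_CLASH_CALLER_GUARD is the ABI-defined 1KB buffer that a caller keeps
   probed for its callee, which is why it is also a safe probing offset for
   alloca.  */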
5473 
5474 
5475 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5476    registers.  If POLY_SIZE is not large enough to require a probe this function
5477    will only adjust the stack.  When allocating the stack space
5478    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5479    FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5480    arguments.  If we are, then we ensure that any allocation larger than the
5481    ABI-defined buffer needs a probe, so that the invariant of having a 1KB
5482    buffer is maintained.
5483 
5484    We emit barriers after each stack adjustment to prevent optimizations from
5485    breaking the invariant that we never drop the stack more than a page.  This
5486    invariant is needed to make it easier to correctly handle asynchronous
5487    events, e.g. if we were to allow the stack to be dropped by more than a page
5488    and then have multiple probes up, and we took a signal somewhere in between,
5489    then the signal handler wouldn't know the state of the stack and could make
5490    no assumptions about which pages have been probed.  */
5491 
5492 static void
5493 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5494 					poly_int64 poly_size,
5495 					bool frame_related_p,
5496 					bool final_adjustment_p)
5497 {
5498   HOST_WIDE_INT guard_size
5499     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5500   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5501   /* When doing the final adjustment for the outgoing argument size we can't
5502      assume that LR was saved at position 0.  So subtract its offset from the
5503      ABI safe buffer so that we don't accidentally allow an adjustment that
5504      would result in an allocation larger than the ABI buffer without
5505      probing.  */
5506   HOST_WIDE_INT min_probe_threshold
5507     = final_adjustment_p
5508       ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5509       : guard_size - guard_used_by_caller;
5510 
5511   poly_int64 frame_size = cfun->machine->frame.frame_size;
5512 
5513   /* We should always have a positive probe threshold.  */
5514   gcc_assert (min_probe_threshold > 0);
5515 
5516   if (flag_stack_clash_protection && !final_adjustment_p)
5517     {
5518       poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5519       poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5520 
5521       if (known_eq (frame_size, 0))
5522 	{
5523 	  dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5524 	}
5525       else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5526 	       && known_lt (final_adjust, guard_used_by_caller))
5527 	{
5528 	  dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5529 	}
5530     }
5531 
5532   /* If SIZE is not large enough to require probing, just adjust the stack and
5533      exit.  */
5534   if (known_lt (poly_size, min_probe_threshold)
5535       || !flag_stack_clash_protection)
5536     {
5537       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5538       return;
5539     }
5540 
5541   HOST_WIDE_INT size;
5542   /* Handle the SVE non-constant case first.  */
5543   if (!poly_size.is_constant (&size))
5544     {
5545      if (dump_file)
5546       {
5547 	fprintf (dump_file, "Stack clash SVE prologue: ");
5548 	print_dec (poly_size, dump_file);
5549 	fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5550       }
5551 
5552       /* First calculate the amount of bytes we're actually spilling.  */
5553       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5554 			  poly_size, temp1, temp2, false, true);
5555 
5556       rtx_insn *insn = get_last_insn ();
5557 
5558       if (frame_related_p)
5559 	{
5560 	  /* This is done to provide unwinding information for the stack
5561 	     adjustments we're about to do, however to prevent the optimizers
5562 	     from removing the R11 move and leaving the CFA note (which would be
5563 	     very wrong) we tie the old and new stack pointer together.
5564 	     The tie will expand to nothing but the optimizers will not touch
5565 	     the instruction.  */
5566 	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5567 	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5568 	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5569 
5570 	  /* We want the CFA independent of the stack pointer for the
5571 	     duration of the loop.  */
5572 	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5573 	  RTX_FRAME_RELATED_P (insn) = 1;
5574 	}
5575 
5576       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5577       rtx guard_const = gen_int_mode (guard_size, Pmode);
5578 
5579       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5580 						   stack_pointer_rtx, temp1,
5581 						   probe_const, guard_const));
5582 
5583       /* Now reset the CFA register if needed.  */
5584       if (frame_related_p)
5585 	{
5586 	  add_reg_note (insn, REG_CFA_DEF_CFA,
5587 			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5588 				      gen_int_mode (poly_size, Pmode)));
5589 	  RTX_FRAME_RELATED_P (insn) = 1;
5590 	}
5591 
5592       return;
5593     }
5594 
5595   if (dump_file)
5596     fprintf (dump_file,
5597 	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5598 	     " bytes, probing will be required.\n", size);
5599 
5600   /* Round size to the nearest multiple of guard_size, and calculate the
5601      residual as the difference between the original size and the rounded
5602      size.  */
5603   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5604   HOST_WIDE_INT residual = size - rounded_size;
5605 
5606   /* We can handle a small number of allocations/probes inline.  Otherwise
5607      punt to a loop.  */
5608   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5609     {
5610       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5611 	{
5612 	  aarch64_sub_sp (NULL, temp2, guard_size, true);
5613 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5614 					   guard_used_by_caller));
5615 	  emit_insn (gen_blockage ());
5616 	}
5617       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5618     }
5619   else
5620     {
5621       /* Compute the ending address.  */
5622       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5623 			  temp1, NULL, false, true);
5624       rtx_insn *insn = get_last_insn ();
5625 
5626       /* For the initial allocation, we don't have a frame pointer
5627 	 set up, so we always need CFI notes.  If we're doing the
5628 	 final allocation, then we may have a frame pointer, in which
5629 	 case it is the CFA, otherwise we need CFI notes.
5630 
5631 	 We can determine which allocation we are doing by looking at
5632 	 the value of FRAME_RELATED_P since the final allocations are not
5633 	 frame related.  */
5634       if (frame_related_p)
5635 	{
5636 	  /* We want the CFA independent of the stack pointer for the
5637 	     duration of the loop.  */
5638 	  add_reg_note (insn, REG_CFA_DEF_CFA,
5639 			plus_constant (Pmode, temp1, rounded_size));
5640 	  RTX_FRAME_RELATED_P (insn) = 1;
5641 	}
5642 
5643       /* This allocates and probes the stack.  Note that this re-uses some of
5644 	 the existing Ada stack protection code.  However we are guaranteed not
5645 	 to enter the non loop or residual branches of that code.
5646 
5647 	 The non-loop part won't be entered because if our allocation amount
5648 	 doesn't require a loop, the case above would handle it.
5649 
5650 	 The residual amount won't be entered because TEMP1 is a multiple of
5651 	 the allocation size.  The residual will always be 0.  As such, the only
5652 	 part we are actually using from that code is the loop setup.  The
5653 	 actual probing is done in aarch64_output_probe_stack_range.  */
5654       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5655 					       stack_pointer_rtx, temp1));
5656 
5657       /* Now reset the CFA register if needed.  */
5658       if (frame_related_p)
5659 	{
5660 	  add_reg_note (insn, REG_CFA_DEF_CFA,
5661 			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5662 	  RTX_FRAME_RELATED_P (insn) = 1;
5663 	}
5664 
5665       emit_insn (gen_blockage ());
5666       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5667     }
5668 
5669   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
5670      be probed.  This maintains the requirement that each page is probed at
5671      least once.  For initial probing we probe only if the allocation is
5672      more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5673      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
5674      GUARD_SIZE.  This works because, for any allocation large enough to
5675      trigger a probe here, we'll have at least one; and for allocations that
5676      are not large enough for this code to emit anything, the page would have
5677      been probed by the saving of FP/LR, either by this function or any
5678      callees.  If we don't have any callees then we won't have more stack
5679      adjustments and so are still safe.  */
5680   if (residual)
5681     {
5682       HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5683       /* If we're doing final adjustments, and we've done any full page
5684 	 allocations then any residual needs to be probed.  */
5685       if (final_adjustment_p && rounded_size != 0)
5686 	min_probe_threshold = 0;
5687       /* If doing a small final adjustment, we always probe at offset 0.
5688 	 This is done to avoid issues when LR is not at position 0 or when
5689 	 the final adjustment is smaller than the probing offset.  */
5690       else if (final_adjustment_p && rounded_size == 0)
5691 	residual_probe_offset = 0;
5692 
5693       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5694       if (residual >= min_probe_threshold)
5695 	{
5696 	  if (dump_file)
5697 	    fprintf (dump_file,
5698 		     "Stack clash AArch64 prologue residuals: "
5699 		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5700 		     "\n", residual);
5701 
5702 	    emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5703 					     residual_probe_offset));
5704 	  emit_insn (gen_blockage ());
5705 	}
5706     }
5707 }
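
/* Illustrative walk-through (hypothetical numbers, assuming the default 64KB
   guard): for a constant initial adjustment of 150000 bytes,
   min_probe_threshold is 65536 - 1024 = 64512, so probing is required.
   rounded_size is ROUND_DOWN (150000, 65536) = 131072 and the residual is
   18928.  Each whole 64KB page is allocated and then probed at offset 1024
   (the caller's guard), while the 18928-byte residual is allocated with a
   plain stack adjustment and needs no extra probe because it is below
   min_probe_threshold.  */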
5708 
5709 /* Return 1 if the register is used by the epilogue.  We need to say the
5710    return register is used, but only after epilogue generation is complete.
5711    Note that in the case of sibcalls, the values "used by the epilogue" are
5712    considered live at the start of the called function.
5713 
5714    For SIMD functions we need to return 1 for FP registers that are saved and
5715    restored by a function but are not zero in call_used_regs.  If we do not do
5716    this, optimizations may remove the restore of the register.  */
5717 
5718 int
5719 aarch64_epilogue_uses (int regno)
5720 {
5721   if (epilogue_completed)
5722     {
5723       if (regno == LR_REGNUM)
5724 	return 1;
5725       if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5726 	return 1;
5727     }
5728   return 0;
5729 }
5730 
5731 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5732    is saved at BASE + OFFSET.  */
5733 
5734 static void
5735 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5736 			    rtx base, poly_int64 offset)
5737 {
5738   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5739   add_reg_note (insn, REG_CFA_EXPRESSION,
5740 		gen_rtx_SET (mem, regno_reg_rtx[reg]));
5741 }
5742 
5743 /* AArch64 stack frames generated by this compiler look like:
5744 
5745 	+-------------------------------+
5746 	|                               |
5747 	|  incoming stack arguments     |
5748 	|                               |
5749 	+-------------------------------+
5750 	|                               | <-- incoming stack pointer (aligned)
5751 	|  callee-allocated save area   |
5752 	|  for register varargs         |
5753 	|                               |
5754 	+-------------------------------+
5755 	|  local variables              | <-- frame_pointer_rtx
5756 	|                               |
5757 	+-------------------------------+
5758 	|  padding                      | \
5759 	+-------------------------------+  |
5760 	|  callee-saved registers       |  | frame.saved_regs_size
5761 	+-------------------------------+  |
5762 	|  LR'                          |  |
5763 	+-------------------------------+  |
5764 	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
5765         +-------------------------------+
5766 	|  dynamic allocation           |
5767 	+-------------------------------+
5768 	|  padding                      |
5769 	+-------------------------------+
5770 	|  outgoing stack arguments     | <-- arg_pointer
5771         |                               |
5772 	+-------------------------------+
5773 	|                               | <-- stack_pointer_rtx (aligned)
5774 
5775    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5776    but leave frame_pointer_rtx and hard_frame_pointer_rtx
5777    unchanged.
5778 
5779    By default for stack-clash we assume the guard is at least 64KB, but this
5780    value is configurable to either 4KB or 64KB.  We also force the guard size to
5781    be the same as the probing interval and both values are kept in sync.
5782 
5783    With those assumptions the callee can allocate up to 63KB (or 3KB depending
5784    on the guard size) of stack space without probing.
5785 
5786    When probing is needed, we emit a probe at the start of the prologue
5787    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5788 
5789    We have to track how much space has been allocated and the only stores
5790    to the stack we track as implicit probes are the FP/LR stores.
5791 
5792    For outgoing arguments we probe if the size is larger than 1KB, such that
5793    the ABI specified buffer is maintained for the next callee.
5794 
5795    The following registers are reserved during frame layout and should not be
5796    used for any other purpose:
5797 
5798    - r11: Used by stack clash protection when SVE is enabled.
5799    - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5800    - r14 and r15: Used for speculation tracking.
5801    - r16(IP0), r17(IP1): Used by indirect tailcalls.
5802    - r30(LR), r29(FP): Used by standard frame layout.
5803 
5804    These registers must be avoided in frame layout related code unless the
5805    explicit intention is to interact with one of the features listed above.  */
5806 
5807 /* Generate the prologue instructions for entry into a function.
5808    Establish the stack frame by decreasing the stack pointer with a
5809    properly calculated size and, if necessary, create a frame record
5810    filled with the values of LR and previous frame pointer.  The
5811    current FP is also set up if it is in use.  */
5812 
5813 void
5814 aarch64_expand_prologue (void)
5815 {
5816   poly_int64 frame_size = cfun->machine->frame.frame_size;
5817   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5818   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5819   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5820   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5821   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5822   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5823   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5824   rtx_insn *insn;
5825 
5826   /* Sign return address for functions.  */
5827   if (aarch64_return_address_signing_enabled ())
5828     {
5829       insn = emit_insn (gen_pacisp ());
5830       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5831       RTX_FRAME_RELATED_P (insn) = 1;
5832     }
5833 
5834   if (flag_stack_usage_info)
5835     current_function_static_stack_size = constant_lower_bound (frame_size);
5836 
5837   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5838     {
5839       if (crtl->is_leaf && !cfun->calls_alloca)
5840 	{
5841 	  if (maybe_gt (frame_size, PROBE_INTERVAL)
5842 	      && maybe_gt (frame_size, get_stack_check_protect ()))
5843 	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
5844 					    (frame_size
5845 					     - get_stack_check_protect ()));
5846 	}
5847       else if (maybe_gt (frame_size, 0))
5848 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5849     }
5850 
5851   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5852   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5853 
5854   /* In theory we should never have both an initial adjustment
5855      and a callee save adjustment.  Verify that is the case since the
5856      code below does not handle it for -fstack-clash-protection.  */
5857   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5858 
5859   /* Will only probe if the initial adjustment is larger than the guard
5860      less the amount of the guard reserved for use by the caller's
5861      outgoing args.  */
5862   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5863 					  true, false);
5864 
5865   if (callee_adjust != 0)
5866     aarch64_push_regs (reg1, reg2, callee_adjust);
5867 
5868   if (emit_frame_chain)
5869     {
5870       poly_int64 reg_offset = callee_adjust;
5871       if (callee_adjust == 0)
5872 	{
5873 	  reg1 = R29_REGNUM;
5874 	  reg2 = R30_REGNUM;
5875 	  reg_offset = callee_offset;
5876 	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5877 	}
5878       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5879 			  stack_pointer_rtx, callee_offset,
5880 			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5881       if (frame_pointer_needed && !frame_size.is_constant ())
5882 	{
5883 	  /* Variable-sized frames need to describe the save slot
5884 	     address using DW_CFA_expression rather than DW_CFA_offset.
5885 	     This means that, without taking further action, the
5886 	     locations of the registers that we've already saved would
5887 	     remain based on the stack pointer even after we redefine
5888 	     the CFA based on the frame pointer.  We therefore need new
5889 	     DW_CFA_expressions to re-express the save slots with addresses
5890 	     based on the frame pointer.  */
5891 	  rtx_insn *insn = get_last_insn ();
5892 	  gcc_assert (RTX_FRAME_RELATED_P (insn));
5893 
5894 	  /* Add an explicit CFA definition if this was previously
5895 	     implicit.  */
5896 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5897 	    {
5898 	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
5899 				       callee_offset);
5900 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
5901 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
5902 	    }
5903 
5904 	  /* Change the save slot expressions for the registers that
5905 	     we've already saved.  */
5906 	  reg_offset -= callee_offset;
5907 	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5908 				      reg_offset + UNITS_PER_WORD);
5909 	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5910 				      reg_offset);
5911 	}
5912       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5913     }
5914 
5915   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5916 			     callee_adjust != 0 || emit_frame_chain);
5917   if (aarch64_simd_decl_p (cfun->decl))
5918     aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5919 			       callee_adjust != 0 || emit_frame_chain);
5920   else
5921     aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5922 			       callee_adjust != 0 || emit_frame_chain);
5923 
5924   /* We may need to probe the final adjustment if it is larger than the guard
5925      that is assumed by the callee.  */
5926   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5927 					  !frame_pointer_needed, true);
5928 }
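
/* Illustrative prologue (hypothetical frame, continuing the example given
   after aarch64_layout_frame above, where callee_adjust == frame_size == 48
   and x19, x20 and d8 are saved): the emitted sequence is roughly

     stp x29, x30, [sp, -48]!
     mov x29, sp
     stp x19, x20, [sp, 16]
     str d8, [sp, 32]

   with no separate initial or final stack adjustment, since the write-back
   store pair allocates the whole frame.  */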
5929 
5930 /* Return TRUE if we can use a simple_return insn.
5931 
5932    This function checks whether the callee saved stack is empty, which
5933    means no restore actions are needed.  The pro_and_epilogue pass will use
5934    this to check whether the shrink-wrapping optimization is feasible.  */
5935 
5936 bool
5937 aarch64_use_return_insn_p (void)
5938 {
5939   if (!reload_completed)
5940     return false;
5941 
5942   if (crtl->profile)
5943     return false;
5944 
5945   return known_eq (cfun->machine->frame.frame_size, 0);
5946 }
5947 
5948 /* Return false for non-leaf SIMD functions in order to avoid
5949    shrink-wrapping them.  Doing this will lose the necessary
5950    save/restore of FP registers.  */
5951 
5952 bool
5953 aarch64_use_simple_return_insn_p (void)
5954 {
5955   if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5956     return false;
5957 
5958   return true;
5959 }
5960 
5961 /* Generate the epilogue instructions for returning from a function.
5962    This is almost exactly the reverse of the prologue sequence, except
5963    that we need to insert barriers to avoid scheduling loads that read
5964    from a deallocated stack, and we optimize the unwind records by
5965    emitting them all together if possible.  */
5966 void
5967 aarch64_expand_epilogue (bool for_sibcall)
5968 {
5969   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5970   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5971   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5972   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5973   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5974   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5975   rtx cfi_ops = NULL;
5976   rtx_insn *insn;
5977   /* A stack clash protection prologue may not have left EP0_REGNUM or
5978      EP1_REGNUM in a usable state.  The same is true for allocations
5979      with an SVE component, since we then need both temporary registers
5980      for each allocation.  For stack clash we are in a usable state if
5981      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
5982   HOST_WIDE_INT guard_size
5983     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5984   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5985 
5986   /* We can re-use the registers when the allocation amount is smaller than
5987      guard_size - guard_used_by_caller because we won't be doing any probes
5988      then.  In such situations the register should remain live with the correct
5989      value.  */
5990   bool can_inherit_p = (initial_adjust.is_constant ()
5991 			&& final_adjust.is_constant ())
5992 			&& (!flag_stack_clash_protection
5993 			    || known_lt (initial_adjust,
5994 					 guard_size - guard_used_by_caller));
5995 
5996   /* We need to add a memory barrier to prevent reads from the deallocated
     stack.  */
5997   bool need_barrier_p
5998     = maybe_ne (get_frame_size ()
5999 		+ cfun->machine->frame.saved_varargs_size, 0);
6000 
6001   /* Emit a barrier to prevent loads from a deallocated stack.  */
6002   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6003       || cfun->calls_alloca
6004       || crtl->calls_eh_return)
6005     {
6006       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6007       need_barrier_p = false;
6008     }
6009 
6010   /* Restore the stack pointer from the frame pointer if it may not
6011      be the same as the stack pointer.  */
6012   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6013   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6014   if (frame_pointer_needed
6015       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6016     /* If writeback is used when restoring callee-saves, the CFA
6017        is restored on the instruction doing the writeback.  */
6018     aarch64_add_offset (Pmode, stack_pointer_rtx,
6019 			hard_frame_pointer_rtx, -callee_offset,
6020 			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6021   else
6022      /* The case where we need to re-use the register here is very rare, so
6023 	avoid the complicated condition and just always emit a move if the
6024 	immediate doesn't fit.  */
6025      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6026 
6027   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6028 				callee_adjust != 0, &cfi_ops);
6029   if (aarch64_simd_decl_p (cfun->decl))
6030     aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6031 				  callee_adjust != 0, &cfi_ops);
6032   else
6033     aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6034 				  callee_adjust != 0, &cfi_ops);
6035 
6036   if (need_barrier_p)
6037     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6038 
6039   if (callee_adjust != 0)
6040     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6041 
6042   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6043     {
6044       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
6045       insn = get_last_insn ();
6046       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6047       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6048       RTX_FRAME_RELATED_P (insn) = 1;
6049       cfi_ops = NULL;
6050     }
6051 
6052   /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6053      restrict the emit_move optimization to leaf functions.  */
6054   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6055 		  (!can_inherit_p || !crtl->is_leaf
6056 		   || df_regs_ever_live_p (EP0_REGNUM)));
6057 
6058   if (cfi_ops)
6059     {
6060       /* Emit delayed restores and reset the CFA to be SP.  */
6061       insn = get_last_insn ();
6062       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6063       REG_NOTES (insn) = cfi_ops;
6064       RTX_FRAME_RELATED_P (insn) = 1;
6065     }
6066 
6067   /* We prefer to emit the combined return/authenticate instruction RETAA,
6068      however there are three cases in which we must instead emit an explicit
6069      authentication instruction.
6070 
6071 	1) Sibcalls don't return in a normal way, so if we're about to call one
6072 	   we must authenticate.
6073 
6074 	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6075 	   generating code for !TARGET_ARMV8_3 we can't use it and must
6076 	   explicitly authenticate.
6077 
6078 	3) On an eh_return path we make extra stack adjustments to update the
6079 	   canonical frame address to be the exception handler's CFA.  We want
6080 	   to authenticate using the CFA of the function which calls eh_return.
6081     */
6082   if (aarch64_return_address_signing_enabled ()
6083       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6084     {
6085       insn = emit_insn (gen_autisp ());
6086       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6087       RTX_FRAME_RELATED_P (insn) = 1;
6088     }
6089 
6090   /* Stack adjustment for exception handler.  */
6091   if (crtl->calls_eh_return)
6092     {
6093       /* We need to unwind the stack by the offset computed by
6094 	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
6095 	 to be SP; letting the CFA move during this adjustment
6096 	 is just as correct as retaining the CFA from the body
6097 	 of the function.  Therefore, do nothing special.  */
6098       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6099     }
6100 
6101   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6102   if (!for_sibcall)
6103     emit_jump_insn (ret_rtx);
6104 }
6105 
6106 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
6107    normally or return to a previous frame after unwinding.
6108 
6109    An EH return uses a single shared return sequence.  The epilogue is
6110    exactly like a normal epilogue except that it has an extra input
6111    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6112    that must be applied after the frame has been destroyed.  An extra label
6113    is inserted before the epilogue which initializes this register to zero,
6114    and this is the entry point for a normal return.
6115 
6116    An actual EH return updates the return address, initializes the stack
6117    adjustment and jumps directly into the epilogue (bypassing the zeroing
6118    of the adjustment).  Since the return address is typically saved on the
6119    stack when a function makes a call, the saved LR must be updated outside
6120    the epilogue.
6121 
6122    This poses problems as the store is generated well before the epilogue,
6123    so the offset of LR is not known yet.  Also optimizations will remove the
6124    store as it appears dead, even after the epilogue is generated (as the
6125    base or offset for loading LR is different in many cases).
6126 
6127    To avoid these problems this implementation forces the frame pointer
6128    in eh_return functions so that the location of LR is fixed and known early.
6129    It also marks the store volatile, so no optimization is permitted to
6130    remove the store.  */
6131 rtx
6132 aarch64_eh_return_handler_rtx (void)
6133 {
6134   rtx tmp = gen_frame_mem (Pmode,
6135     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6136 
6137   /* Mark the store volatile, so no optimization is permitted to remove it.  */
6138   MEM_VOLATILE_P (tmp) = true;
6139   return tmp;
6140 }
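/* Concretely, the rtx returned above is a volatile memory reference at
   [hard frame pointer + UNITS_PER_WORD], i.e. the saved-LR slot of the
   frame record that the forced frame pointer guarantees is present.  */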
6141 
6142 /* Output code to add DELTA to the first argument, and then jump
6143    to FUNCTION.  Used for C++ multiple inheritance.  */
6144 static void
6145 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6146 			 HOST_WIDE_INT delta,
6147 			 HOST_WIDE_INT vcall_offset,
6148 			 tree function)
6149 {
6150   /* The this pointer is always in x0.  Note that this differs from
6151      Arm where the this pointer may be bumped to r1 if r0 is required
6152      to return a pointer to an aggregate.  On AArch64 a result value
6153      pointer will be in x8.  */
6154   int this_regno = R0_REGNUM;
6155   rtx this_rtx, temp0, temp1, addr, funexp;
6156   rtx_insn *insn;
6157 
6158   if (aarch64_bti_enabled ())
6159     emit_insn (gen_bti_c ());
6160 
6161   reload_completed = 1;
6162   emit_note (NOTE_INSN_PROLOGUE_END);
6163 
6164   this_rtx = gen_rtx_REG (Pmode, this_regno);
6165   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6166   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6167 
6168   if (vcall_offset == 0)
6169     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6170   else
6171     {
6172       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6173 
6174       addr = this_rtx;
6175       if (delta != 0)
6176 	{
6177 	  if (delta >= -256 && delta < 256)
6178 	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6179 				       plus_constant (Pmode, this_rtx, delta));
6180 	  else
6181 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6182 				temp1, temp0, false);
6183 	}
6184 
6185       if (Pmode == ptr_mode)
6186 	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6187       else
6188 	aarch64_emit_move (temp0,
6189 			   gen_rtx_ZERO_EXTEND (Pmode,
6190 						gen_rtx_MEM (ptr_mode, addr)));
6191 
6192       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6193 	  addr = plus_constant (Pmode, temp0, vcall_offset);
6194       else
6195 	{
6196 	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6197 					  Pmode);
6198 	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6199 	}
6200 
6201       if (Pmode == ptr_mode)
6202 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6203       else
6204 	aarch64_emit_move (temp1,
6205 			   gen_rtx_SIGN_EXTEND (Pmode,
6206 						gen_rtx_MEM (ptr_mode, addr)));
6207 
6208       emit_insn (gen_add2_insn (this_rtx, temp1));
6209     }
6210 
6211   /* Generate a tail call to the target function.  */
6212   if (!TREE_USED (function))
6213     {
6214       assemble_external (function);
6215       TREE_USED (function) = 1;
6216     }
6217   funexp = XEXP (DECL_RTL (function), 0);
6218   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6219   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6220   SIBLING_CALL_P (insn) = 1;
6221 
6222   insn = get_insns ();
6223   shorten_branches (insn);
6224   final_start_function (insn, file, 1);
6225   final (insn, file, 1);
6226   final_end_function ();
6227 
6228   /* Stop pretending to be a post-reload pass.  */
6229   reload_completed = 0;
6230 }
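/* As a rough sketch (with "target" standing in for FUNCTION), a thunk with
   a small DELTA and no VCALL_OFFSET expands to something like:

	bti	c		// only when BTI is enabled
	add	x0, x0, #delta
	b	target
*/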
6231 
6232 static bool
6233 aarch64_tls_referenced_p (rtx x)
6234 {
6235   if (!TARGET_HAVE_TLS)
6236     return false;
6237   subrtx_iterator::array_type array;
6238   FOR_EACH_SUBRTX (iter, array, x, ALL)
6239     {
6240       const_rtx x = *iter;
6241       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6242 	return true;
6243       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6244 	 TLS offsets, not real symbol references.  */
6245       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6246 	iter.skip_subrtxes ();
6247     }
6248   return false;
6249 }
6250 
6251 
6252 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6253    a left shift of 0 or 12 bits.  */
6254 bool
6255 aarch64_uimm12_shift (HOST_WIDE_INT val)
6256 {
6257   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6258 	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6259 	  );
6260 }
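/* For example, 0xabc is accepted (shift of 0) and 0xabc000 is accepted
   (shift of 12), but 0xabc001 has significant bits in both halves and is
   rejected.  */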
6261 
6262 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6263    that can be created with a left shift of 0 or 12.  */
6264 static HOST_WIDE_INT
6265 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6266 {
6267   /* Check to see if the value fits in 24 bits, as that is the maximum we can
6268      handle correctly.  */
6269   gcc_assert ((val & 0xffffff) == val);
6270 
6271   if (((val & 0xfff) << 0) == val)
6272     return val;
6273 
6274   return val & (0xfff << 12);
6275 }
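/* For example, 0xabc is returned unchanged, while 0x123456 becomes 0x123000:
   the bits below the shifted 12-bit field are dropped.  */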
6276 
6277 /* Return true if val is an immediate that can be loaded into a
6278    register by a MOVZ instruction.  */
6279 static bool
6280 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6281 {
6282   if (GET_MODE_SIZE (mode) > 4)
6283     {
6284       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6285 	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6286 	return 1;
6287     }
6288   else
6289     {
6290       /* Ignore sign extension.  */
6291       val &= (HOST_WIDE_INT) 0xffffffff;
6292     }
6293   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6294 	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6295 }
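/* For example, 0x1234 and 0x12340000 are single MOVZ immediates (16
   significant bits at a 16-bit aligned position), 0x0000123400000000 is
   accepted for 64-bit modes, but 0x12345678 needs two 16-bit chunks and is
   rejected.  */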
6296 
6297 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
6298    64-bit (DImode) integer.  */
6299 
6300 static unsigned HOST_WIDE_INT
6301 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6302 {
6303   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6304   while (size < 64)
6305     {
6306       val &= (HOST_WIDE_INT_1U << size) - 1;
6307       val |= val << size;
6308       size *= 2;
6309     }
6310   return val;
6311 }
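/* For example, replicating the QImode value 0xf0 gives 0xf0f0f0f0f0f0f0f0,
   and replicating the HImode value 0x00ff gives 0x00ff00ff00ff00ff.  */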
6312 
6313 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
6314 
6315 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6316   {
6317     0x0000000100000001ull,
6318     0x0001000100010001ull,
6319     0x0101010101010101ull,
6320     0x1111111111111111ull,
6321     0x5555555555555555ull,
6322   };
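/* Multiplying the smallest repeating unit of a bitmask by the matching
   constant above replicates it across all 64 bits; for example
   0x3 * 0x1111111111111111 == 0x3333333333333333, the two-bit run 0b11
   repeated at a width of 4.  */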
6323 
6324 
6325 /* Return true if val is a valid bitmask immediate.  */
6326 
6327 bool
6328 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6329 {
6330   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6331   int bits;
6332 
6333   /* Check for a single sequence of one bits and return quickly if so.
6334      The special cases of all ones and all zeroes return false.  */
6335   val = aarch64_replicate_bitmask_imm (val_in, mode);
6336   tmp = val + (val & -val);
6337 
6338   if (tmp == (tmp & -tmp))
6339     return (val + 1) > 1;
6340 
6341   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
6342   if (mode == SImode)
6343     val = (val << 32) | (val & 0xffffffff);
6344 
6345   /* Invert if the immediate doesn't start with a zero bit - this means we
6346      only need to search for sequences of one bits.  */
6347   if (val & 1)
6348     val = ~val;
6349 
6350   /* Find the first set bit and set tmp to val with the first sequence of one
6351      bits removed.  Return success if there is a single sequence of ones.  */
6352   first_one = val & -val;
6353   tmp = val & (val + first_one);
6354 
6355   if (tmp == 0)
6356     return true;
6357 
6358   /* Find the next set bit and compute the difference in bit position.  */
6359   next_one = tmp & -tmp;
6360   bits = clz_hwi (first_one) - clz_hwi (next_one);
6361   mask = val ^ tmp;
6362 
6363   /* Check the bit position difference is a power of 2, and that the first
6364      sequence of one bits fits within 'bits' bits.  */
6365   if ((mask >> bits) != 0 || bits != (bits & -bits))
6366     return false;
6367 
6368   /* Check the sequence of one bits is repeated 64/bits times.  */
6369   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6370 }
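/* For example, 0x00ff00ff00ff00ff (a run of eight ones repeated every
   16 bits) is a valid bitmask immediate, whereas 0x0000000000abcdef
   contains several runs of different lengths and is rejected.  */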
6371 
6372 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6373    Assumed precondition: VAL_IN is not zero.  */
6374 
6375 unsigned HOST_WIDE_INT
6376 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6377 {
6378   int lowest_bit_set = ctz_hwi (val_in);
6379   int highest_bit_set = floor_log2 (val_in);
6380   gcc_assert (val_in != 0);
6381 
6382   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6383 	  (HOST_WIDE_INT_1U << lowest_bit_set));
6384 }
6385 
6386 /* Create a constant in which all bits outside the range from the lowest
6387    set bit to the highest set bit of VAL_IN are set to 1.  */
6388 
6389 unsigned HOST_WIDE_INT
6390 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6391 {
6392   return val_in | ~aarch64_and_split_imm1 (val_in);
6393 }
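/* For example, for VAL_IN == 0x330, aarch64_and_split_imm1 returns 0x3f0
   (a single run covering bits 4 to 9) and aarch64_and_split_imm2 returns
   0xffffffffffffff3f; ANDing with these two immediates in turn is
   equivalent to ANDing with 0x330, and each is a valid bitmask
   immediate.  */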
6394 
6395 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
6396 
6397 bool
6398 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6399 {
6400   scalar_int_mode int_mode;
6401   if (!is_a <scalar_int_mode> (mode, &int_mode))
6402     return false;
6403 
6404   if (aarch64_bitmask_imm (val_in, int_mode))
6405     return false;
6406 
6407   if (aarch64_move_imm (val_in, int_mode))
6408     return false;
6409 
6410   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6411 
6412   return aarch64_bitmask_imm (imm2, int_mode);
6413 }
6414 
6415 /* Return true if val is an immediate that can be loaded into a
6416    register in a single instruction.  */
6417 bool
6418 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6419 {
6420   scalar_int_mode int_mode;
6421   if (!is_a <scalar_int_mode> (mode, &int_mode))
6422     return false;
6423 
6424   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6425     return 1;
6426   return aarch64_bitmask_imm (val, int_mode);
6427 }
6428 
6429 static bool
6430 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6431 {
6432   rtx base, offset;
6433 
6434   if (GET_CODE (x) == HIGH)
6435     return true;
6436 
6437   /* There's no way to calculate VL-based values using relocations.  */
6438   subrtx_iterator::array_type array;
6439   FOR_EACH_SUBRTX (iter, array, x, ALL)
6440     if (GET_CODE (*iter) == CONST_POLY_INT)
6441       return true;
6442 
6443   split_const (x, &base, &offset);
6444   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6445     {
6446       if (aarch64_classify_symbol (base, INTVAL (offset))
6447 	  != SYMBOL_FORCE_TO_MEM)
6448 	return true;
6449       else
6450 	/* Avoid generating a 64-bit relocation in ILP32; leave
6451 	   to aarch64_expand_mov_immediate to handle it properly.  */
6452 	return mode != ptr_mode;
6453     }
6454 
6455   return aarch64_tls_referenced_p (x);
6456 }
6457 
6458 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6459    The expansion for a table switch is quite expensive due to the number
6460    of instructions, the table lookup and the hard-to-predict indirect jump.
6461    When optimizing for speed with -O3 enabled, use the per-core tuning if
6462    set, otherwise use tables for > 16 cases as a tradeoff between size and
6463    performance.  When optimizing for size, use the default setting.  */
6464 
6465 static unsigned int
6466 aarch64_case_values_threshold (void)
6467 {
6468   /* Use the specified limit for the number of cases before using jump
6469      tables at higher optimization levels.  */
6470   if (optimize > 2
6471       && selected_cpu->tune->max_case_values != 0)
6472     return selected_cpu->tune->max_case_values;
6473   else
6474     return optimize_size ? default_case_values_threshold () : 17;
6475 }
6476 
6477 /* Return true if register REGNO is a valid index register.
6478    STRICT_P is true if REG_OK_STRICT is in effect.  */
6479 
6480 bool
6481 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6482 {
6483   if (!HARD_REGISTER_NUM_P (regno))
6484     {
6485       if (!strict_p)
6486 	return true;
6487 
6488       if (!reg_renumber)
6489 	return false;
6490 
6491       regno = reg_renumber[regno];
6492     }
6493   return GP_REGNUM_P (regno);
6494 }
6495 
6496 /* Return true if register REGNO is a valid base register.
6497    STRICT_P is true if REG_OK_STRICT is in effect.  */
6498 
6499 bool
6500 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6501 {
6502   if (!HARD_REGISTER_NUM_P (regno))
6503     {
6504       if (!strict_p)
6505 	return true;
6506 
6507       if (!reg_renumber)
6508 	return false;
6509 
6510       regno = reg_renumber[regno];
6511     }
6512 
6513   /* The fake registers will be eliminated to either the stack or
6514      hard frame pointer, both of which are usually valid base registers.
6515      Reload deals with the cases where the eliminated form isn't valid.  */
6516   return (GP_REGNUM_P (regno)
6517 	  || regno == SP_REGNUM
6518 	  || regno == FRAME_POINTER_REGNUM
6519 	  || regno == ARG_POINTER_REGNUM);
6520 }
6521 
6522 /* Return true if X is a valid base register.
6523    STRICT_P is true if REG_OK_STRICT is in effect.  */
6524 
6525 static bool
6526 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6527 {
6528   if (!strict_p
6529       && GET_CODE (x) == SUBREG
6530       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6531     x = SUBREG_REG (x);
6532 
6533   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6534 }
6535 
6536 /* Return true if the address offset X is a valid index.  If it is, fill in INFO
6537    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
6538 
6539 static bool
6540 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6541 			machine_mode mode, bool strict_p)
6542 {
6543   enum aarch64_address_type type;
6544   rtx index;
6545   int shift;
6546 
6547   /* (reg:P) */
6548   if ((REG_P (x) || GET_CODE (x) == SUBREG)
6549       && GET_MODE (x) == Pmode)
6550     {
6551       type = ADDRESS_REG_REG;
6552       index = x;
6553       shift = 0;
6554     }
6555   /* (sign_extend:DI (reg:SI)) */
6556   else if ((GET_CODE (x) == SIGN_EXTEND
6557 	    || GET_CODE (x) == ZERO_EXTEND)
6558 	   && GET_MODE (x) == DImode
6559 	   && GET_MODE (XEXP (x, 0)) == SImode)
6560     {
6561       type = (GET_CODE (x) == SIGN_EXTEND)
6562 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6563       index = XEXP (x, 0);
6564       shift = 0;
6565     }
6566   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6567   else if (GET_CODE (x) == MULT
6568 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6569 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6570 	   && GET_MODE (XEXP (x, 0)) == DImode
6571 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6572 	   && CONST_INT_P (XEXP (x, 1)))
6573     {
6574       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6575 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6576       index = XEXP (XEXP (x, 0), 0);
6577       shift = exact_log2 (INTVAL (XEXP (x, 1)));
6578     }
6579   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6580   else if (GET_CODE (x) == ASHIFT
6581 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6582 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6583 	   && GET_MODE (XEXP (x, 0)) == DImode
6584 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6585 	   && CONST_INT_P (XEXP (x, 1)))
6586     {
6587       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6588 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6589       index = XEXP (XEXP (x, 0), 0);
6590       shift = INTVAL (XEXP (x, 1));
6591     }
6592   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6593   else if ((GET_CODE (x) == SIGN_EXTRACT
6594 	    || GET_CODE (x) == ZERO_EXTRACT)
6595 	   && GET_MODE (x) == DImode
6596 	   && GET_CODE (XEXP (x, 0)) == MULT
6597 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6598 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6599     {
6600       type = (GET_CODE (x) == SIGN_EXTRACT)
6601 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6602       index = XEXP (XEXP (x, 0), 0);
6603       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6604       if (INTVAL (XEXP (x, 1)) != 32 + shift
6605 	  || INTVAL (XEXP (x, 2)) != 0)
6606 	shift = -1;
6607     }
6608   /* (and:DI (mult:DI (reg:DI) (const_int scale))
6609      (const_int 0xffffffff<<shift)) */
6610   else if (GET_CODE (x) == AND
6611 	   && GET_MODE (x) == DImode
6612 	   && GET_CODE (XEXP (x, 0)) == MULT
6613 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6614 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6615 	   && CONST_INT_P (XEXP (x, 1)))
6616     {
6617       type = ADDRESS_REG_UXTW;
6618       index = XEXP (XEXP (x, 0), 0);
6619       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6620       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6621 	shift = -1;
6622     }
6623   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6624   else if ((GET_CODE (x) == SIGN_EXTRACT
6625 	    || GET_CODE (x) == ZERO_EXTRACT)
6626 	   && GET_MODE (x) == DImode
6627 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
6628 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6629 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6630     {
6631       type = (GET_CODE (x) == SIGN_EXTRACT)
6632 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6633       index = XEXP (XEXP (x, 0), 0);
6634       shift = INTVAL (XEXP (XEXP (x, 0), 1));
6635       if (INTVAL (XEXP (x, 1)) != 32 + shift
6636 	  || INTVAL (XEXP (x, 2)) != 0)
6637 	shift = -1;
6638     }
6639   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6640      (const_int 0xffffffff<<shift)) */
6641   else if (GET_CODE (x) == AND
6642 	   && GET_MODE (x) == DImode
6643 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
6644 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6645 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6646 	   && CONST_INT_P (XEXP (x, 1)))
6647     {
6648       type = ADDRESS_REG_UXTW;
6649       index = XEXP (XEXP (x, 0), 0);
6650       shift = INTVAL (XEXP (XEXP (x, 0), 1));
6651       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6652 	shift = -1;
6653     }
6654   /* (mult:P (reg:P) (const_int scale)) */
6655   else if (GET_CODE (x) == MULT
6656 	   && GET_MODE (x) == Pmode
6657 	   && GET_MODE (XEXP (x, 0)) == Pmode
6658 	   && CONST_INT_P (XEXP (x, 1)))
6659     {
6660       type = ADDRESS_REG_REG;
6661       index = XEXP (x, 0);
6662       shift = exact_log2 (INTVAL (XEXP (x, 1)));
6663     }
6664   /* (ashift:P (reg:P) (const_int shift)) */
6665   else if (GET_CODE (x) == ASHIFT
6666 	   && GET_MODE (x) == Pmode
6667 	   && GET_MODE (XEXP (x, 0)) == Pmode
6668 	   && CONST_INT_P (XEXP (x, 1)))
6669     {
6670       type = ADDRESS_REG_REG;
6671       index = XEXP (x, 0);
6672       shift = INTVAL (XEXP (x, 1));
6673     }
6674   else
6675     return false;
6676 
6677   if (!strict_p
6678       && GET_CODE (index) == SUBREG
6679       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6680     index = SUBREG_REG (index);
6681 
6682   if (aarch64_sve_data_mode_p (mode))
6683     {
6684       if (type != ADDRESS_REG_REG
6685 	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6686 	return false;
6687     }
6688   else
6689     {
6690       if (shift != 0
6691 	  && !(IN_RANGE (shift, 1, 3)
6692 	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6693 	return false;
6694     }
6695 
6696   if (REG_P (index)
6697       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6698     {
6699       info->type = type;
6700       info->offset = index;
6701       info->shift = shift;
6702       return true;
6703     }
6704 
6705   return false;
6706 }
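/* As an illustration, the index expression
   (mult:DI (sign_extend:DI (reg:SI)) (const_int 4)) is classified as
   ADDRESS_REG_SXTW with a shift of 2 for a 4-byte access, corresponding to
   an address operand such as [x0, w1, sxtw #2].  */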
6707 
6708 /* Return true if MODE is one of the modes for which we
6709    support LDP/STP operations.  */
6710 
6711 static bool
6712 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6713 {
6714   return mode == SImode || mode == DImode
6715 	 || mode == SFmode || mode == DFmode
6716 	 || (aarch64_vector_mode_supported_p (mode)
6717 	     && (known_eq (GET_MODE_SIZE (mode), 8)
6718 		 || (known_eq (GET_MODE_SIZE (mode), 16)
6719 		    && (aarch64_tune_params.extra_tuning_flags
6720 			& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6721 }
6722 
6723 /* Return true if REGNO is a virtual pointer register, or an eliminable
6724    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
6725    include stack_pointer or hard_frame_pointer.  */
6726 static bool
6727 virt_or_elim_regno_p (unsigned regno)
6728 {
6729   return ((regno >= FIRST_VIRTUAL_REGISTER
6730 	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6731 	  || regno == FRAME_POINTER_REGNUM
6732 	  || regno == ARG_POINTER_REGNUM);
6733 }
6734 
6735 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6736    If it is, fill in INFO appropriately.  STRICT_P is true if
6737    REG_OK_STRICT is in effect.  */
6738 
6739 bool
6740 aarch64_classify_address (struct aarch64_address_info *info,
6741 			  rtx x, machine_mode mode, bool strict_p,
6742 			  aarch64_addr_query_type type)
6743 {
6744   enum rtx_code code = GET_CODE (x);
6745   rtx op0, op1;
6746   poly_int64 offset;
6747 
6748   HOST_WIDE_INT const_size;
6749 
6750   /* On BE, we use load/store pair for all large int mode load/stores.
6751      TI/TFmode may also use a load/store pair.  */
6752   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6753   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6754   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6755 			    || type == ADDR_QUERY_LDP_STP_N
6756 			    || mode == TImode
6757 			    || mode == TFmode
6758 			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6759 
6760   /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
6761      corresponds to the actual size of the memory being loaded/stored and
6762      the mode used for address checking is half of that size.  */
6763   if (type == ADDR_QUERY_LDP_STP_N
6764       && known_eq (GET_MODE_SIZE (mode), 16))
6765     mode = DFmode;
6766 
6767   bool allow_reg_index_p = (!load_store_pair_p
6768 			    && (known_lt (GET_MODE_SIZE (mode), 16)
6769 				|| vec_flags == VEC_ADVSIMD
6770 				|| vec_flags == VEC_SVE_DATA));
6771 
6772   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6773      [Rn, #offset, MUL VL].  */
6774   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6775       && (code != REG && code != PLUS))
6776     return false;
6777 
6778   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6779      REG addressing.  */
6780   if (advsimd_struct_p
6781       && !BYTES_BIG_ENDIAN
6782       && (code != POST_INC && code != REG))
6783     return false;
6784 
6785   gcc_checking_assert (GET_MODE (x) == VOIDmode
6786 		       || SCALAR_INT_MODE_P (GET_MODE (x)));
6787 
6788   switch (code)
6789     {
6790     case REG:
6791     case SUBREG:
6792       info->type = ADDRESS_REG_IMM;
6793       info->base = x;
6794       info->offset = const0_rtx;
6795       info->const_offset = 0;
6796       return aarch64_base_register_rtx_p (x, strict_p);
6797 
6798     case PLUS:
6799       op0 = XEXP (x, 0);
6800       op1 = XEXP (x, 1);
6801 
6802       if (! strict_p
6803 	  && REG_P (op0)
6804 	  && virt_or_elim_regno_p (REGNO (op0))
6805 	  && poly_int_rtx_p (op1, &offset))
6806 	{
6807 	  info->type = ADDRESS_REG_IMM;
6808 	  info->base = op0;
6809 	  info->offset = op1;
6810 	  info->const_offset = offset;
6811 
6812 	  return true;
6813 	}
6814 
6815       if (maybe_ne (GET_MODE_SIZE (mode), 0)
6816 	  && aarch64_base_register_rtx_p (op0, strict_p)
6817 	  && poly_int_rtx_p (op1, &offset))
6818 	{
6819 	  info->type = ADDRESS_REG_IMM;
6820 	  info->base = op0;
6821 	  info->offset = op1;
6822 	  info->const_offset = offset;
6823 
6824 	  /* TImode and TFmode values are allowed in both pairs of X
6825 	     registers and individual Q registers.  The available
6826 	     address modes are:
6827 	     X,X: 7-bit signed scaled offset
6828 	     Q:   9-bit signed offset
6829 	     We conservatively require an offset representable in either mode.
6830 	     When performing the check for pairs of X registers i.e.  LDP/STP
6831 	     pass down DImode since that is the natural size of the LDP/STP
6832 	     instruction memory accesses.  */
6833 	  if (mode == TImode || mode == TFmode)
6834 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6835 		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6836 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
6837 
6838 	  /* A 7-bit offset check because OImode will emit an ldp/stp
6839 	     instruction (only big endian will get here).
6840 	     For ldp/stp instructions, the offset is scaled for the size of a
6841 	     single element of the pair.  */
6842 	  if (mode == OImode)
6843 	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6844 
6845 	  /* Three 9/12-bit offset checks because CImode will emit three
6846 	     ldr/str instructions (only big endian will get here).  */
6847 	  if (mode == CImode)
6848 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6849 		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6850 							       offset + 32)
6851 			|| offset_12bit_unsigned_scaled_p (V16QImode,
6852 							   offset + 32)));
6853 
6854 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
6855 	     instructions (only big endian will get here).  */
6856 	  if (mode == XImode)
6857 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6858 		    && aarch64_offset_7bit_signed_scaled_p (TImode,
6859 							    offset + 32));
6860 
6861 	  /* Make "m" use the LD1 offset range for SVE data modes, so
6862 	     that pre-RTL optimizers like ivopts will work to that
6863 	     instead of the wider LDR/STR range.  */
6864 	  if (vec_flags == VEC_SVE_DATA)
6865 	    return (type == ADDR_QUERY_M
6866 		    ? offset_4bit_signed_scaled_p (mode, offset)
6867 		    : offset_9bit_signed_scaled_p (mode, offset));
6868 
6869 	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6870 	    {
6871 	      poly_int64 end_offset = (offset
6872 				       + GET_MODE_SIZE (mode)
6873 				       - BYTES_PER_SVE_VECTOR);
6874 	      return (type == ADDR_QUERY_M
6875 		      ? offset_4bit_signed_scaled_p (mode, offset)
6876 		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6877 			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6878 							 end_offset)));
6879 	    }
6880 
6881 	  if (vec_flags == VEC_SVE_PRED)
6882 	    return offset_9bit_signed_scaled_p (mode, offset);
6883 
6884 	  if (load_store_pair_p)
6885 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
6886 		     || known_eq (GET_MODE_SIZE (mode), 8)
6887 		     || known_eq (GET_MODE_SIZE (mode), 16))
6888 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6889 	  else
6890 	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6891 		    || offset_12bit_unsigned_scaled_p (mode, offset));
6892 	}
6893 
6894       if (allow_reg_index_p)
6895 	{
6896 	  /* Look for base + (scaled/extended) index register.  */
6897 	  if (aarch64_base_register_rtx_p (op0, strict_p)
6898 	      && aarch64_classify_index (info, op1, mode, strict_p))
6899 	    {
6900 	      info->base = op0;
6901 	      return true;
6902 	    }
6903 	  if (aarch64_base_register_rtx_p (op1, strict_p)
6904 	      && aarch64_classify_index (info, op0, mode, strict_p))
6905 	    {
6906 	      info->base = op1;
6907 	      return true;
6908 	    }
6909 	}
6910 
6911       return false;
6912 
6913     case POST_INC:
6914     case POST_DEC:
6915     case PRE_INC:
6916     case PRE_DEC:
6917       info->type = ADDRESS_REG_WB;
6918       info->base = XEXP (x, 0);
6919       info->offset = NULL_RTX;
6920       return aarch64_base_register_rtx_p (info->base, strict_p);
6921 
6922     case POST_MODIFY:
6923     case PRE_MODIFY:
6924       info->type = ADDRESS_REG_WB;
6925       info->base = XEXP (x, 0);
6926       if (GET_CODE (XEXP (x, 1)) == PLUS
6927 	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6928 	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6929 	  && aarch64_base_register_rtx_p (info->base, strict_p))
6930 	{
6931 	  info->offset = XEXP (XEXP (x, 1), 1);
6932 	  info->const_offset = offset;
6933 
6934 	  /* TImode and TFmode values are allowed in both pairs of X
6935 	     registers and individual Q registers.  The available
6936 	     address modes are:
6937 	     X,X: 7-bit signed scaled offset
6938 	     Q:   9-bit signed offset
6939 	     We conservatively require an offset representable in either mode.
6940 	   */
6941 	  if (mode == TImode || mode == TFmode)
6942 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6943 		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6944 
6945 	  if (load_store_pair_p)
6946 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
6947 		     || known_eq (GET_MODE_SIZE (mode), 8)
6948 		     || known_eq (GET_MODE_SIZE (mode), 16))
6949 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6950 	  else
6951 	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6952 	}
6953       return false;
6954 
6955     case CONST:
6956     case SYMBOL_REF:
6957     case LABEL_REF:
6958       /* load literal: pc-relative constant pool entry.  Only supported
6959          for SI mode or larger.  */
6960       info->type = ADDRESS_SYMBOLIC;
6961 
6962       if (!load_store_pair_p
6963 	  && GET_MODE_SIZE (mode).is_constant (&const_size)
6964 	  && const_size >= 4)
6965 	{
6966 	  rtx sym, addend;
6967 
6968 	  split_const (x, &sym, &addend);
6969 	  return ((GET_CODE (sym) == LABEL_REF
6970 		   || (GET_CODE (sym) == SYMBOL_REF
6971 		       && CONSTANT_POOL_ADDRESS_P (sym)
6972 		       && aarch64_pcrelative_literal_loads)));
6973 	}
6974       return false;
6975 
6976     case LO_SUM:
6977       info->type = ADDRESS_LO_SUM;
6978       info->base = XEXP (x, 0);
6979       info->offset = XEXP (x, 1);
6980       if (allow_reg_index_p
6981 	  && aarch64_base_register_rtx_p (info->base, strict_p))
6982 	{
6983 	  rtx sym, offs;
6984 	  split_const (info->offset, &sym, &offs);
6985 	  if (GET_CODE (sym) == SYMBOL_REF
6986 	      && (aarch64_classify_symbol (sym, INTVAL (offs))
6987 		  == SYMBOL_SMALL_ABSOLUTE))
6988 	    {
6989 	      /* The symbol and offset must be aligned to the access size.  */
6990 	      unsigned int align;
6991 
6992 	      if (CONSTANT_POOL_ADDRESS_P (sym))
6993 		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6994 	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6995 		{
6996 		  tree exp = SYMBOL_REF_DECL (sym);
6997 		  align = TYPE_ALIGN (TREE_TYPE (exp));
6998 		  align = aarch64_constant_alignment (exp, align);
6999 		}
7000 	      else if (SYMBOL_REF_DECL (sym))
7001 		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7002 	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7003 		       && SYMBOL_REF_BLOCK (sym) != NULL)
7004 		align = SYMBOL_REF_BLOCK (sym)->alignment;
7005 	      else
7006 		align = BITS_PER_UNIT;
7007 
7008 	      poly_int64 ref_size = GET_MODE_SIZE (mode);
7009 	      if (known_eq (ref_size, 0))
7010 		ref_size = GET_MODE_SIZE (DImode);
7011 
7012 	      return (multiple_p (INTVAL (offs), ref_size)
7013 		      && multiple_p (align / BITS_PER_UNIT, ref_size));
7014 	    }
7015 	}
7016       return false;
7017 
7018     default:
7019       return false;
7020     }
7021 }
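/* As a simple example, (plus:DI (reg:DI sp) (const_int 16)) for a DImode
   access is classified as ADDRESS_REG_IMM with a constant offset of 16,
   while (post_inc:DI (reg:DI x0)) is classified as ADDRESS_REG_WB.  */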
7022 
7023 /* Return true if the address X is valid for a PRFM instruction.
7024    STRICT_P is true if we should do strict checking with
7025    aarch64_classify_address.  */
7026 
7027 bool
7028 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7029 {
7030   struct aarch64_address_info addr;
7031 
7032   /* PRFM accepts the same addresses as DImode...  */
7033   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7034   if (!res)
7035     return false;
7036 
7037   /* ... except writeback forms.  */
7038   return addr.type != ADDRESS_REG_WB;
7039 }
7040 
7041 bool
7042 aarch64_symbolic_address_p (rtx x)
7043 {
7044   rtx offset;
7045 
7046   split_const (x, &x, &offset);
7047   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7048 }
7049 
7050 /* Classify the base of symbolic expression X.  */
7051 
7052 enum aarch64_symbol_type
7053 aarch64_classify_symbolic_expression (rtx x)
7054 {
7055   rtx offset;
7056 
7057   split_const (x, &x, &offset);
7058   return aarch64_classify_symbol (x, INTVAL (offset));
7059 }
7060 
7061 
7062 /* Return TRUE if X is a legitimate address for accessing memory in
7063    mode MODE.  */
7064 static bool
7065 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7066 {
7067   struct aarch64_address_info addr;
7068 
7069   return aarch64_classify_address (&addr, x, mode, strict_p);
7070 }
7071 
7072 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7073    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
7074 bool
7075 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7076 			      aarch64_addr_query_type type)
7077 {
7078   struct aarch64_address_info addr;
7079 
7080   return aarch64_classify_address (&addr, x, mode, strict_p, type);
7081 }
7082 
7083 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
7084 
7085 static bool
7086 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7087 					 poly_int64 orig_offset,
7088 					 machine_mode mode)
7089 {
7090   HOST_WIDE_INT size;
7091   if (GET_MODE_SIZE (mode).is_constant (&size))
7092     {
7093       HOST_WIDE_INT const_offset, second_offset;
7094 
7095       /* A general SVE offset is A * VQ + B.  Remove the A component from
7096 	 coefficient 0 in order to get the constant B.  */
7097       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7098 
7099       /* Split an out-of-range address displacement into a base and
7100 	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
7101 	 range otherwise to increase opportunities for sharing the base
7102 	 address of different sizes.  Unaligned accesses use the signed
7103 	 9-bit range, TImode/TFmode use the intersection of signed
7104 	 scaled 7-bit and signed 9-bit offset.  */
7105       if (mode == TImode || mode == TFmode)
7106 	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7107       else if ((const_offset & (size - 1)) != 0)
7108 	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7109       else
7110 	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7111 
7112       if (second_offset == 0 || known_eq (orig_offset, second_offset))
7113 	return false;
7114 
7115       /* Split the offset into second_offset and the rest.  */
7116       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7117       *offset2 = gen_int_mode (second_offset, Pmode);
7118       return true;
7119     }
7120   else
7121     {
7122       /* Get the mode we should use as the basis of the range.  For structure
7123 	 modes this is the mode of one vector.  */
7124       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7125       machine_mode step_mode
7126 	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7127 
7128       /* Get the "mul vl" multiplier we'd like to use.  */
7129       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7130       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7131       if (vec_flags & VEC_SVE_DATA)
7132 	/* LDR supports a 9-bit range, but the move patterns for
7133 	   structure modes require all vectors to be in range of the
7134 	   same base.  The simplest way of accommodating that while still
7135 	   promoting reuse of anchor points between different modes is
7136 	   to use an 8-bit range unconditionally.  */
7137 	vnum = ((vnum + 128) & 255) - 128;
7138       else
7139 	/* Predicates are only handled singly, so we might as well use
7140 	   the full range.  */
7141 	vnum = ((vnum + 256) & 511) - 256;
7142       if (vnum == 0)
7143 	return false;
7144 
7145       /* Convert the "mul vl" multiplier into a byte offset.  */
7146       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7147       if (known_eq (second_offset, orig_offset))
7148 	return false;
7149 
7150       /* Split the offset into second_offset and the rest.  */
7151       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7152       *offset2 = gen_int_mode (second_offset, Pmode);
7153       return true;
7154     }
7155 }
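/* For example, a DImode access at constant offset 0x10008 is split into an
   anchor offset of 0x10000 (*OFFSET1) plus a residual offset of 8
   (*OFFSET2); the residual fits the scaled 12-bit LDR/STR range and the
   0x10000 anchor can be shared with neighbouring accesses.  */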
7156 
7157 /* Return the binary representation of floating point constant VALUE in INTVAL.
7158    If the value cannot be converted, return false without setting INTVAL.
7159    The conversion is done in the mode of VALUE.  */
7160 bool
7161 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7162 {
7163 
7164   /* We make a general exception for 0.  */
7165   if (aarch64_float_const_zero_rtx_p (value))
7166     {
7167       *intval = 0;
7168       return true;
7169     }
7170 
7171   scalar_float_mode mode;
7172   if (GET_CODE (value) != CONST_DOUBLE
7173       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7174       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7175       /* Only support up to DF mode.  */
7176       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7177     return false;
7178 
7179   unsigned HOST_WIDE_INT ival = 0;
7180 
7181   long res[2];
7182   real_to_target (res,
7183 		  CONST_DOUBLE_REAL_VALUE (value),
7184 		  REAL_MODE_FORMAT (mode));
7185 
7186   if (mode == DFmode)
7187     {
7188       int order = BYTES_BIG_ENDIAN ? 1 : 0;
7189       ival = zext_hwi (res[order], 32);
7190       ival |= (zext_hwi (res[1 - order], 32) << 32);
7191     }
7192   else
7193       ival = zext_hwi (res[0], 32);
7194 
7195   *intval = ival;
7196   return true;
7197 }
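/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0 as 0x3f800000.  */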
7198 
7199 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7200    single MOV(+MOVK) followed by an FMOV.  */
7201 bool
7202 aarch64_float_const_rtx_p (rtx x)
7203 {
7204   machine_mode mode = GET_MODE (x);
7205   if (mode == VOIDmode)
7206     return false;
7207 
7208   /* Determine whether it's cheaper to write float constants as
7209      mov/movk pairs rather than ldr/adrp pairs.  */
7210   unsigned HOST_WIDE_INT ival;
7211 
7212   if (GET_CODE (x) == CONST_DOUBLE
7213       && SCALAR_FLOAT_MODE_P (mode)
7214       && aarch64_reinterpret_float_as_int (x, &ival))
7215     {
7216       scalar_int_mode imode = (mode == HFmode
7217 			       ? SImode
7218 			       : int_mode_for_mode (mode).require ());
7219       int num_instr = aarch64_internal_mov_immediate
7220 			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
7221       return num_instr < 3;
7222     }
7223 
7224   return false;
7225 }
7226 
7227 /* Return TRUE if rtx X is the immediate constant 0.0.  */
7228 bool
7229 aarch64_float_const_zero_rtx_p (rtx x)
7230 {
7231   if (GET_MODE (x) == VOIDmode)
7232     return false;
7233 
7234   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7235     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7236   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7237 }
7238 
7239 /* Return TRUE if rtx X is an immediate constant that fits in a single
7240    MOVI immediate operation.  */
7241 bool
7242 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7243 {
7244   if (!TARGET_SIMD)
7245      return false;
7246 
7247   machine_mode vmode;
7248   scalar_int_mode imode;
7249   unsigned HOST_WIDE_INT ival;
7250 
7251   if (GET_CODE (x) == CONST_DOUBLE
7252       && SCALAR_FLOAT_MODE_P (mode))
7253     {
7254       if (!aarch64_reinterpret_float_as_int (x, &ival))
7255 	return false;
7256 
7257       /* We make a general exception for 0.  */
7258       if (aarch64_float_const_zero_rtx_p (x))
7259 	return true;
7260 
7261       imode = int_mode_for_mode (mode).require ();
7262     }
7263   else if (GET_CODE (x) == CONST_INT
7264 	   && is_a <scalar_int_mode> (mode, &imode))
7265     ival = INTVAL (x);
7266   else
7267     return false;
7268 
7269    /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7270      a 128-bit vector mode.  */
7271   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7272 
7273   vmode = aarch64_simd_container_mode (imode, width);
7274   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7275 
7276   return aarch64_simd_valid_immediate (v_op, NULL);
7277 }
7278 
7279 
7280 /* Return the fixed registers used for condition codes.  */
7281 
7282 static bool
7283 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7284 {
7285   *p1 = CC_REGNUM;
7286   *p2 = INVALID_REGNUM;
7287   return true;
7288 }
7289 
7290 /* This function is used by the call expanders of the machine description.
7291    RESULT is the register in which the result is returned.  It's NULL for
7292    "call" and "sibcall".
7293    MEM is the location of the function call.
7294    SIBCALL indicates whether this function call is a normal call or a
7295    sibling call.  It will generate a different pattern accordingly.  */
7296 
7297 void
7298 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7299 {
7300   rtx call, callee, tmp;
7301   rtvec vec;
7302   machine_mode mode;
7303 
7304   gcc_assert (MEM_P (mem));
7305   callee = XEXP (mem, 0);
7306   mode = GET_MODE (callee);
7307   gcc_assert (mode == Pmode);
7308 
7309   /* Decide if we should generate indirect calls by loading the
7310      address of the callee into a register before performing
7311      the branch-and-link.  */
7312   if (SYMBOL_REF_P (callee)
7313       ? (aarch64_is_long_call_p (callee)
7314 	 || aarch64_is_noplt_call_p (callee))
7315       : !REG_P (callee))
7316     XEXP (mem, 0) = force_reg (mode, callee);
7317 
7318   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7319 
7320   if (result != NULL_RTX)
7321     call = gen_rtx_SET (result, call);
7322 
7323   if (sibcall)
7324     tmp = ret_rtx;
7325   else
7326     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7327 
7328   vec = gen_rtvec (2, call, tmp);
7329   call = gen_rtx_PARALLEL (VOIDmode, vec);
7330 
7331   aarch64_emit_call_insn (call);
7332 }
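/* In outline, a normal call that returns a value emits
     (parallel [(set result (call (mem addr) (const_int 0)))
		(clobber (reg:DI LR_REGNUM))])
   while a sibcall uses (return) in place of the clobber of the link
   register.  */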
7333 
7334 /* Emit call insn with PAT and do aarch64-specific handling.  */
7335 
7336 void
7337 aarch64_emit_call_insn (rtx pat)
7338 {
7339   rtx insn = emit_call_insn (pat);
7340 
7341   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7342   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7343   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7344 }
7345 
7346 machine_mode
7347 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7348 {
7349   machine_mode mode_x = GET_MODE (x);
7350   rtx_code code_x = GET_CODE (x);
7351 
7352   /* All floating point compares return CCFP for equality and unordered
7353      comparisons, and CCFPE otherwise.  */
7354   if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7355     {
7356       switch (code)
7357 	{
7358 	case EQ:
7359 	case NE:
7360 	case UNORDERED:
7361 	case ORDERED:
7362 	case UNLT:
7363 	case UNLE:
7364 	case UNGT:
7365 	case UNGE:
7366 	case UNEQ:
7367 	  return CCFPmode;
7368 
7369 	case LT:
7370 	case LE:
7371 	case GT:
7372 	case GE:
7373 	case LTGT:
7374 	  return CCFPEmode;
7375 
7376 	default:
7377 	  gcc_unreachable ();
7378 	}
7379     }
7380 
7381   /* Equality comparisons of short modes against zero can be performed
7382      using the TST instruction with the appropriate bitmask.  */
7383   if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7384       && (code == EQ || code == NE)
7385       && (mode_x == HImode || mode_x == QImode))
7386     return CC_NZmode;
7387 
7388   /* Similarly, comparisons of zero_extends from shorter modes can
7389      be performed using an ANDS with an immediate mask.  */
7390   if (y == const0_rtx && code_x == ZERO_EXTEND
7391       && (mode_x == SImode || mode_x == DImode)
7392       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7393       && (code == EQ || code == NE))
7394     return CC_NZmode;
7395 
7396   if ((mode_x == SImode || mode_x == DImode)
7397       && y == const0_rtx
7398       && (code == EQ || code == NE || code == LT || code == GE)
7399       && (code_x == PLUS || code_x == MINUS || code_x == AND
7400 	  || code_x == NEG
7401 	  || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7402 	      && CONST_INT_P (XEXP (x, 2)))))
7403     return CC_NZmode;
7404 
7405   /* A compare with a shifted operand.  Because of canonicalization,
7406      the comparison will have to be swapped when we emit the assembly
7407      code.  */
7408   if ((mode_x == SImode || mode_x == DImode)
7409       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7410       && (code_x == ASHIFT || code_x == ASHIFTRT
7411 	  || code_x == LSHIFTRT
7412 	  || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7413     return CC_SWPmode;
7414 
7415   /* Similarly for a negated operand, but we can only do this for
7416      equalities.  */
7417   if ((mode_x == SImode || mode_x == DImode)
7418       && (REG_P (y) || GET_CODE (y) == SUBREG)
7419       && (code == EQ || code == NE)
7420       && code_x == NEG)
7421     return CC_Zmode;
7422 
7423   /* A test for unsigned overflow from an addition.  */
7424   if ((mode_x == DImode || mode_x == TImode)
7425       && (code == LTU || code == GEU)
7426       && code_x == PLUS
7427       && rtx_equal_p (XEXP (x, 0), y))
7428     return CC_Cmode;
7429 
7430   /* A test for unsigned overflow from an add with carry.  */
7431   if ((mode_x == DImode || mode_x == TImode)
7432       && (code == LTU || code == GEU)
7433       && code_x == PLUS
7434       && CONST_SCALAR_INT_P (y)
7435       && (rtx_mode_t (y, mode_x)
7436 	  == (wi::shwi (1, mode_x)
7437 	      << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7438     return CC_ADCmode;
7439 
7440   /* A test for signed overflow.  */
7441   if ((mode_x == DImode || mode_x == TImode)
7442       && code == NE
7443       && code_x == PLUS
7444       && GET_CODE (y) == SIGN_EXTEND)
7445     return CC_Vmode;
7446 
7447   /* For everything else, return CCmode.  */
7448   return CCmode;
7449 }
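/* For example, an EQ comparison of (plus:DI x y) against zero selects
   CC_NZmode, which lets the addition and the comparison be combined into a
   single ADDS instruction, while a comparison whose first operand is a
   shift selects CC_SWPmode because the operands will be swapped when the
   assembly is emitted.  */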
7450 
7451 static int
7452 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7453 
7454 int
7455 aarch64_get_condition_code (rtx x)
7456 {
7457   machine_mode mode = GET_MODE (XEXP (x, 0));
7458   enum rtx_code comp_code = GET_CODE (x);
7459 
7460   if (GET_MODE_CLASS (mode) != MODE_CC)
7461     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7462   return aarch64_get_condition_code_1 (mode, comp_code);
7463 }
7464 
7465 static int
7466 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7467 {
7468   switch (mode)
7469     {
7470     case E_CCFPmode:
7471     case E_CCFPEmode:
7472       switch (comp_code)
7473 	{
7474 	case GE: return AARCH64_GE;
7475 	case GT: return AARCH64_GT;
7476 	case LE: return AARCH64_LS;
7477 	case LT: return AARCH64_MI;
7478 	case NE: return AARCH64_NE;
7479 	case EQ: return AARCH64_EQ;
7480 	case ORDERED: return AARCH64_VC;
7481 	case UNORDERED: return AARCH64_VS;
7482 	case UNLT: return AARCH64_LT;
7483 	case UNLE: return AARCH64_LE;
7484 	case UNGT: return AARCH64_HI;
7485 	case UNGE: return AARCH64_PL;
7486 	default: return -1;
7487 	}
7488       break;
7489 
7490     case E_CCmode:
7491       switch (comp_code)
7492 	{
7493 	case NE: return AARCH64_NE;
7494 	case EQ: return AARCH64_EQ;
7495 	case GE: return AARCH64_GE;
7496 	case GT: return AARCH64_GT;
7497 	case LE: return AARCH64_LE;
7498 	case LT: return AARCH64_LT;
7499 	case GEU: return AARCH64_CS;
7500 	case GTU: return AARCH64_HI;
7501 	case LEU: return AARCH64_LS;
7502 	case LTU: return AARCH64_CC;
7503 	default: return -1;
7504 	}
7505       break;
7506 
7507     case E_CC_SWPmode:
7508       switch (comp_code)
7509 	{
7510 	case NE: return AARCH64_NE;
7511 	case EQ: return AARCH64_EQ;
7512 	case GE: return AARCH64_LE;
7513 	case GT: return AARCH64_LT;
7514 	case LE: return AARCH64_GE;
7515 	case LT: return AARCH64_GT;
7516 	case GEU: return AARCH64_LS;
7517 	case GTU: return AARCH64_CC;
7518 	case LEU: return AARCH64_CS;
7519 	case LTU: return AARCH64_HI;
7520 	default: return -1;
7521 	}
7522       break;
7523 
7524     case E_CC_NZmode:
7525       switch (comp_code)
7526 	{
7527 	case NE: return AARCH64_NE;
7528 	case EQ: return AARCH64_EQ;
7529 	case GE: return AARCH64_PL;
7530 	case LT: return AARCH64_MI;
7531 	default: return -1;
7532 	}
7533       break;
7534 
7535     case E_CC_Zmode:
7536       switch (comp_code)
7537 	{
7538 	case NE: return AARCH64_NE;
7539 	case EQ: return AARCH64_EQ;
7540 	default: return -1;
7541 	}
7542       break;
7543 
7544     case E_CC_Cmode:
7545       switch (comp_code)
7546 	{
7547 	case LTU: return AARCH64_CS;
7548 	case GEU: return AARCH64_CC;
7549 	default: return -1;
7550 	}
7551       break;
7552 
7553     case E_CC_ADCmode:
7554       switch (comp_code)
7555 	{
7556 	case GEU: return AARCH64_CS;
7557 	case LTU: return AARCH64_CC;
7558 	default: return -1;
7559 	}
7560       break;
7561 
7562     case E_CC_Vmode:
7563       switch (comp_code)
7564 	{
7565 	case NE: return AARCH64_VS;
7566 	case EQ: return AARCH64_VC;
7567 	default: return -1;
7568 	}
7569       break;
7570 
7571     default:
7572       return -1;
7573     }
7574 
7575   return -1;
7576 }
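/* Note, for example, that in CC_SWPmode GE maps to AARCH64_LE: the operands
   of the original comparison were swapped when the flags were set, so the
   condition tested on the flags must be the swapped form.  */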
7577 
7578 bool
7579 aarch64_const_vec_all_same_in_range_p (rtx x,
7580 				       HOST_WIDE_INT minval,
7581 				       HOST_WIDE_INT maxval)
7582 {
7583   rtx elt;
7584   return (const_vec_duplicate_p (x, &elt)
7585 	  && CONST_INT_P (elt)
7586 	  && IN_RANGE (INTVAL (elt), minval, maxval));
7587 }
7588 
7589 bool
7590 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7591 {
7592   return aarch64_const_vec_all_same_in_range_p (x, val, val);
7593 }
7594 
7595 /* Return true if VEC is a constant in which every element is in the range
7596    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
7597 
7598 static bool
7599 aarch64_const_vec_all_in_range_p (rtx vec,
7600 				  HOST_WIDE_INT minval,
7601 				  HOST_WIDE_INT maxval)
7602 {
7603   if (GET_CODE (vec) != CONST_VECTOR
7604       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7605     return false;
7606 
7607   int nunits;
7608   if (!CONST_VECTOR_STEPPED_P (vec))
7609     nunits = const_vector_encoded_nelts (vec);
7610   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7611     return false;
7612 
7613   for (int i = 0; i < nunits; i++)
7614     {
7615       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7616       if (!CONST_INT_P (vec_elem)
7617 	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7618 	return false;
7619     }
7620   return true;
7621 }
7622 
7623 /* N Z C V.  */
7624 #define AARCH64_CC_V 1
7625 #define AARCH64_CC_C (1 << 1)
7626 #define AARCH64_CC_Z (1 << 2)
7627 #define AARCH64_CC_N (1 << 3)
7628 
7629 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
7630 static const int aarch64_nzcv_codes[] =
7631 {
7632   0,		/* EQ, Z == 1.  */
7633   AARCH64_CC_Z,	/* NE, Z == 0.  */
7634   0,		/* CS, C == 1.  */
7635   AARCH64_CC_C,	/* CC, C == 0.  */
7636   0,		/* MI, N == 1.  */
7637   AARCH64_CC_N, /* PL, N == 0.  */
7638   0,		/* VS, V == 1.  */
7639   AARCH64_CC_V, /* VC, V == 0.  */
7640   0,		/* HI, C == 1 && Z == 0.  */
7641   AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
7642   AARCH64_CC_V,	/* GE, N == V.  */
7643   0,		/* LT, N != V.  */
7644   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
7645   0,		/* LE, !(Z == 0 && N == V).  */
7646   0,		/* AL, Any.  */
7647   0		/* NV, Any.  */
7648 };
7649 
7650 /* Print floating-point vector immediate operand X to F, negating it
7651    first if NEGATE is true.  Return true on success, false if it isn't
7652    a constant we can handle.  */
7653 
7654 static bool
7655 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7656 {
7657   rtx elt;
7658 
7659   if (!const_vec_duplicate_p (x, &elt))
7660     return false;
7661 
7662   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7663   if (negate)
7664     r = real_value_negate (&r);
7665 
7666   /* We only handle the SVE single-bit immediates here.  */
7667   if (real_equal (&r, &dconst0))
7668     asm_fprintf (f, "0.0");
7669   else if (real_equal (&r, &dconst1))
7670     asm_fprintf (f, "1.0");
7671   else if (real_equal (&r, &dconsthalf))
7672     asm_fprintf (f, "0.5");
7673   else
7674     return false;
7675 
7676   return true;
7677 }
7678 
7679 /* Return the equivalent letter for size.  */
7680 static char
7681 sizetochar (int size)
7682 {
7683   switch (size)
7684     {
7685     case 64: return 'd';
7686     case 32: return 's';
7687     case 16: return 'h';
7688     case 8 : return 'b';
7689     default: gcc_unreachable ();
7690     }
7691 }
7692 
7693 /* Print operand X to file F in a target specific manner according to CODE.
7694    The acceptable formatting commands given by CODE are:
7695      'c':		An integer or symbol address without a preceding #
7696 			sign.
7697      'C':		Take the duplicated element in a vector constant
7698 			and print it in hex.
7699      'D':		Take the duplicated element in a vector constant
7700 			and print it as an unsigned integer, in decimal.
7701      'e':		Print the sign/zero-extend size as a character 8->b,
7702 			16->h, 32->w.
7703      'p':		Prints N such that 2^N == X (X must be power of 2 and
7704 			const int).
7705      'P':		Print the number of non-zero bits in X (a const_int).
7706      'H':		Print the higher numbered register of a pair (TImode)
7707 			of regs.
7708      'm':		Print a condition (eq, ne, etc).
7709      'M':		Same as 'm', but invert condition.
7710      'N':		Take the duplicated element in a vector constant
7711 			and print the negative of it in decimal.
7712      'b/h/s/d/q':	Print a scalar FP/SIMD register name.
7713      'S/T/U/V':		Print a FP/SIMD register name for a register list.
7714 			The register printed is the FP/SIMD register name
7715 			of X + 0/1/2/3 for S/T/U/V.
7716      'R':		Print a scalar Integer/FP/SIMD register name + 1.
7717      'X':		Print bottom 16 bits of integer constant in hex.
7718      'w/x':		Print a general register name or the zero register
7719 			(32-bit or 64-bit).
7720      '0':		Print a normal operand; if it's a general register,
7721 			we assume DImode.
7722      'k':		Print NZCV for conditional compare instructions.
7723      'A':		Output address constant representing the first
7724 			argument of X, specifying a relocation offset
7725 			if appropriate.
7726      'L':		Output constant address specified by X
7727 			with a relocation offset if appropriate.
7728      'G':		Prints address of X, specifying a PC relative
7729 			relocation mode if appropriate.
7730      'y':		Output address of LDP or STP - this is used for
7731 			some LDP/STPs which don't use a PARALLEL in their
7732 			pattern (so the mode needs to be adjusted).
7733      'z':		Output address of a typical LDP or STP.  */
7734 
7735 static void
7736 aarch64_print_operand (FILE *f, rtx x, int code)
7737 {
7738   rtx elt;
7739   switch (code)
7740     {
7741     case 'c':
7742       switch (GET_CODE (x))
7743 	{
7744 	case CONST_INT:
7745 	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7746 	  break;
7747 
7748 	case SYMBOL_REF:
7749 	  output_addr_const (f, x);
7750 	  break;
7751 
7752 	case CONST:
7753 	  if (GET_CODE (XEXP (x, 0)) == PLUS
7754 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7755 	    {
7756 	      output_addr_const (f, x);
7757 	      break;
7758 	    }
7759 	  /* Fall through.  */
7760 
7761 	default:
7762 	  output_operand_lossage ("unsupported operand for code '%c'", code);
7763 	}
7764       break;
7765 
7766     case 'e':
7767       {
7768 	int n;
7769 
7770 	if (!CONST_INT_P (x)
7771 	    || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7772 	  {
7773 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7774 	    return;
7775 	  }
7776 
7777 	switch (n)
7778 	  {
7779 	  case 3:
7780 	    fputc ('b', f);
7781 	    break;
7782 	  case 4:
7783 	    fputc ('h', f);
7784 	    break;
7785 	  case 5:
7786 	    fputc ('w', f);
7787 	    break;
7788 	  default:
7789 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7790 	    return;
7791 	  }
7792       }
7793       break;
7794 
7795     case 'p':
7796       {
7797 	int n;
7798 
7799 	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7800 	  {
7801 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7802 	    return;
7803 	  }
7804 
7805 	asm_fprintf (f, "%d", n);
7806       }
7807       break;
7808 
7809     case 'P':
7810       if (!CONST_INT_P (x))
7811 	{
7812 	  output_operand_lossage ("invalid operand for '%%%c'", code);
7813 	  return;
7814 	}
7815 
7816       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7817       break;
7818 
7819     case 'H':
7820       if (x == const0_rtx)
7821 	{
7822 	  asm_fprintf (f, "xzr");
7823 	  break;
7824 	}
7825 
7826       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7827 	{
7828 	  output_operand_lossage ("invalid operand for '%%%c'", code);
7829 	  return;
7830 	}
7831 
7832       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7833       break;
7834 
7835     case 'M':
7836     case 'm':
7837       {
7838         int cond_code;
7839 	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
7840 	if (x == const_true_rtx)
7841 	  {
7842 	    if (code == 'M')
7843 	      fputs ("nv", f);
7844 	    return;
7845 	  }
7846 
7847         if (!COMPARISON_P (x))
7848 	  {
7849 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7850 	    return;
7851 	  }
7852 
7853         cond_code = aarch64_get_condition_code (x);
7854         gcc_assert (cond_code >= 0);
7855 	if (code == 'M')
7856 	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7857 	fputs (aarch64_condition_codes[cond_code], f);
7858       }
7859       break;
7860 
7861     case 'N':
7862       if (!const_vec_duplicate_p (x, &elt))
7863 	{
7864 	  output_operand_lossage ("invalid vector constant");
7865 	  return;
7866 	}
7867 
7868       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7869 	asm_fprintf (f, "%wd", -INTVAL (elt));
7870       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7871 	       && aarch64_print_vector_float_operand (f, x, true))
7872 	;
7873       else
7874 	{
7875 	  output_operand_lossage ("invalid vector constant");
7876 	  return;
7877 	}
7878       break;
7879 
7880     case 'b':
7881     case 'h':
7882     case 's':
7883     case 'd':
7884     case 'q':
7885       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7886 	{
7887 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7888 	  return;
7889 	}
7890       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7891       break;
7892 
7893     case 'S':
7894     case 'T':
7895     case 'U':
7896     case 'V':
7897       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7898 	{
7899 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7900 	  return;
7901 	}
7902       asm_fprintf (f, "%c%d",
7903 		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7904 		   REGNO (x) - V0_REGNUM + (code - 'S'));
7905       break;
7906 
7907     case 'R':
7908       if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
7909 	asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7910       else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7911 	asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
7912       else
7913 	output_operand_lossage ("incompatible register operand for '%%%c'",
7914 				code);
7915       break;
7916 
7917     case 'X':
7918       if (!CONST_INT_P (x))
7919 	{
7920 	  output_operand_lossage ("invalid operand for '%%%c'", code);
7921 	  return;
7922 	}
7923       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7924       break;
7925 
7926     case 'C':
7927       {
7928 	/* Print a replicated constant in hex.  */
7929 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7930 	  {
7931 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7932 	    return;
7933 	  }
7934 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7935 	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7936       }
7937       break;
7938 
7939     case 'D':
7940       {
7941 	/* Print a replicated constant in decimal, treating it as
7942 	   unsigned.  */
7943 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7944 	  {
7945 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7946 	    return;
7947 	  }
7948 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7949 	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7950       }
7951       break;
7952 
7953     case 'w':
7954     case 'x':
7955       if (x == const0_rtx
7956 	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7957 	{
7958 	  asm_fprintf (f, "%czr", code);
7959 	  break;
7960 	}
7961 
7962       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7963 	{
7964 	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7965 	  break;
7966 	}
7967 
7968       if (REG_P (x) && REGNO (x) == SP_REGNUM)
7969 	{
7970 	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7971 	  break;
7972 	}
7973 
7974       /* Fall through */
7975 
7976     case 0:
7977       if (x == NULL)
7978 	{
7979 	  output_operand_lossage ("missing operand");
7980 	  return;
7981 	}
7982 
7983       switch (GET_CODE (x))
7984 	{
7985 	case REG:
7986 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
7987 	    {
7988 	      if (REG_NREGS (x) == 1)
7989 		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7990 	      else
7991 		{
7992 		  char suffix
7993 		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7994 		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
7995 			       REGNO (x) - V0_REGNUM, suffix,
7996 			       END_REGNO (x) - V0_REGNUM - 1, suffix);
7997 		}
7998 	    }
7999 	  else
8000 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8001 	  break;
8002 
8003 	case MEM:
8004 	  output_address (GET_MODE (x), XEXP (x, 0));
8005 	  break;
8006 
8007 	case LABEL_REF:
8008 	case SYMBOL_REF:
8009 	  output_addr_const (asm_out_file, x);
8010 	  break;
8011 
8012 	case CONST_INT:
8013 	  asm_fprintf (f, "%wd", INTVAL (x));
8014 	  break;
8015 
8016 	case CONST:
8017 	  if (!VECTOR_MODE_P (GET_MODE (x)))
8018 	    {
8019 	      output_addr_const (asm_out_file, x);
8020 	      break;
8021 	    }
8022 	  /* fall through */
8023 
8024 	case CONST_VECTOR:
8025 	  if (!const_vec_duplicate_p (x, &elt))
8026 	    {
8027 	      output_operand_lossage ("invalid vector constant");
8028 	      return;
8029 	    }
8030 
8031 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8032 	    asm_fprintf (f, "%wd", INTVAL (elt));
8033 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8034 		   && aarch64_print_vector_float_operand (f, x, false))
8035 	    ;
8036 	  else
8037 	    {
8038 	      output_operand_lossage ("invalid vector constant");
8039 	      return;
8040 	    }
8041 	  break;
8042 
8043 	case CONST_DOUBLE:
8044 	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8045 	     be getting CONST_DOUBLEs holding integers.  */
8046 	  gcc_assert (GET_MODE (x) != VOIDmode);
8047 	  if (aarch64_float_const_zero_rtx_p (x))
8048 	    {
8049 	      fputc ('0', f);
8050 	      break;
8051 	    }
8052 	  else if (aarch64_float_const_representable_p (x))
8053 	    {
8054 #define buf_size 20
8055 	      char float_buf[buf_size] = {'\0'};
8056 	      real_to_decimal_for_mode (float_buf,
8057 					CONST_DOUBLE_REAL_VALUE (x),
8058 					buf_size, buf_size,
8059 					1, GET_MODE (x));
8060 	      asm_fprintf (asm_out_file, "%s", float_buf);
8061 	      break;
8062 #undef buf_size
8063 	    }
8064 	  output_operand_lossage ("invalid constant");
8065 	  return;
8066 	default:
8067 	  output_operand_lossage ("invalid operand");
8068 	  return;
8069 	}
8070       break;
8071 
8072     case 'A':
8073       if (GET_CODE (x) == HIGH)
8074 	x = XEXP (x, 0);
8075 
8076       switch (aarch64_classify_symbolic_expression (x))
8077 	{
8078 	case SYMBOL_SMALL_GOT_4G:
8079 	  asm_fprintf (asm_out_file, ":got:");
8080 	  break;
8081 
8082 	case SYMBOL_SMALL_TLSGD:
8083 	  asm_fprintf (asm_out_file, ":tlsgd:");
8084 	  break;
8085 
8086 	case SYMBOL_SMALL_TLSDESC:
8087 	  asm_fprintf (asm_out_file, ":tlsdesc:");
8088 	  break;
8089 
8090 	case SYMBOL_SMALL_TLSIE:
8091 	  asm_fprintf (asm_out_file, ":gottprel:");
8092 	  break;
8093 
8094 	case SYMBOL_TLSLE24:
8095 	  asm_fprintf (asm_out_file, ":tprel:");
8096 	  break;
8097 
8098 	case SYMBOL_TINY_GOT:
8099 	  gcc_unreachable ();
8100 	  break;
8101 
8102 	default:
8103 	  break;
8104 	}
8105       output_addr_const (asm_out_file, x);
8106       break;
8107 
8108     case 'L':
8109       switch (aarch64_classify_symbolic_expression (x))
8110 	{
8111 	case SYMBOL_SMALL_GOT_4G:
8112 	  asm_fprintf (asm_out_file, ":lo12:");
8113 	  break;
8114 
8115 	case SYMBOL_SMALL_TLSGD:
8116 	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8117 	  break;
8118 
8119 	case SYMBOL_SMALL_TLSDESC:
8120 	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8121 	  break;
8122 
8123 	case SYMBOL_SMALL_TLSIE:
8124 	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
8125 	  break;
8126 
8127 	case SYMBOL_TLSLE12:
8128 	  asm_fprintf (asm_out_file, ":tprel_lo12:");
8129 	  break;
8130 
8131 	case SYMBOL_TLSLE24:
8132 	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8133 	  break;
8134 
8135 	case SYMBOL_TINY_GOT:
8136 	  asm_fprintf (asm_out_file, ":got:");
8137 	  break;
8138 
8139 	case SYMBOL_TINY_TLSIE:
8140 	  asm_fprintf (asm_out_file, ":gottprel:");
8141 	  break;
8142 
8143 	default:
8144 	  break;
8145 	}
8146       output_addr_const (asm_out_file, x);
8147       break;
8148 
8149     case 'G':
8150       switch (aarch64_classify_symbolic_expression (x))
8151 	{
8152 	case SYMBOL_TLSLE24:
8153 	  asm_fprintf (asm_out_file, ":tprel_hi12:");
8154 	  break;
8155 	default:
8156 	  break;
8157 	}
8158       output_addr_const (asm_out_file, x);
8159       break;
8160 
8161     case 'k':
8162       {
8163 	HOST_WIDE_INT cond_code;
8164 
8165 	if (!CONST_INT_P (x))
8166 	  {
8167 	    output_operand_lossage ("invalid operand for '%%%c'", code);
8168 	    return;
8169 	  }
8170 
8171 	cond_code = INTVAL (x);
8172 	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8173 	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8174       }
8175       break;
8176 
8177     case 'y':
8178     case 'z':
8179       {
8180 	machine_mode mode = GET_MODE (x);
8181 
8182 	if (GET_CODE (x) != MEM
8183 	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8184 	  {
8185 	    output_operand_lossage ("invalid operand for '%%%c'", code);
8186 	    return;
8187 	  }
8188 
8189 	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8190 					    code == 'y'
8191 					    ? ADDR_QUERY_LDP_STP_N
8192 					    : ADDR_QUERY_LDP_STP))
8193 	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
8194       }
8195       break;
8196 
8197     default:
8198       output_operand_lossage ("invalid operand prefix '%%%c'", code);
8199       return;
8200     }
8201 }
8202 
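/* For illustration (the template here is hypothetical, not taken from
   aarch64.md): with an output template "add\t%x0, %x1, %2" and operands
   { (reg:DI x0), (reg:DI x1), (const_int 16) }, the code above emits
   "add	x0, x1, 16"; "%w0" on the same register prints "w0", and "%m" /
   "%M" applied to (eq (reg:CC cc) (const_int 0)) print "eq" / "ne".  */
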
8203 /* Print address 'x' of a memory access with mode 'mode'.
8204    'type' is the aarch64_addr_query_type context required by
8205    aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access.  */
8206 static bool
8207 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8208 				aarch64_addr_query_type type)
8209 {
8210   struct aarch64_address_info addr;
8211   unsigned int size;
8212 
8213   /* Check all addresses are Pmode - including ILP32.  */
8214   if (GET_MODE (x) != Pmode
8215       && (!CONST_INT_P (x)
8216 	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8217     {
8218       output_operand_lossage ("invalid address mode");
8219       return false;
8220     }
8221 
8222   if (aarch64_classify_address (&addr, x, mode, true, type))
8223     switch (addr.type)
8224       {
8225       case ADDRESS_REG_IMM:
8226 	if (known_eq (addr.const_offset, 0))
8227 	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8228 	else if (aarch64_sve_data_mode_p (mode))
8229 	  {
8230 	    HOST_WIDE_INT vnum
8231 	      = exact_div (addr.const_offset,
8232 			   BYTES_PER_SVE_VECTOR).to_constant ();
8233 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
8234 			 reg_names[REGNO (addr.base)], vnum);
8235 	  }
8236 	else if (aarch64_sve_pred_mode_p (mode))
8237 	  {
8238 	    HOST_WIDE_INT vnum
8239 	      = exact_div (addr.const_offset,
8240 			   BYTES_PER_SVE_PRED).to_constant ();
8241 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
8242 			 reg_names[REGNO (addr.base)], vnum);
8243 	  }
8244 	else
8245 	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8246 		       INTVAL (addr.offset));
8247 	return true;
8248 
8249       case ADDRESS_REG_REG:
8250 	if (addr.shift == 0)
8251 	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8252 		       reg_names [REGNO (addr.offset)]);
8253 	else
8254 	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8255 		       reg_names [REGNO (addr.offset)], addr.shift);
8256 	return true;
8257 
8258       case ADDRESS_REG_UXTW:
8259 	if (addr.shift == 0)
8260 	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8261 		       REGNO (addr.offset) - R0_REGNUM);
8262 	else
8263 	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8264 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
8265 	return true;
8266 
8267       case ADDRESS_REG_SXTW:
8268 	if (addr.shift == 0)
8269 	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8270 		       REGNO (addr.offset) - R0_REGNUM);
8271 	else
8272 	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8273 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
8274 	return true;
8275 
8276       case ADDRESS_REG_WB:
8277 	/* Writeback is only supported for fixed-width modes.  */
8278 	size = GET_MODE_SIZE (mode).to_constant ();
8279 	switch (GET_CODE (x))
8280 	  {
8281 	  case PRE_INC:
8282 	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8283 	    return true;
8284 	  case POST_INC:
8285 	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8286 	    return true;
8287 	  case PRE_DEC:
8288 	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8289 	    return true;
8290 	  case POST_DEC:
8291 	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8292 	    return true;
8293 	  case PRE_MODIFY:
8294 	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8295 			 INTVAL (addr.offset));
8296 	    return true;
8297 	  case POST_MODIFY:
8298 	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8299 			 INTVAL (addr.offset));
8300 	    return true;
8301 	  default:
8302 	    break;
8303 	  }
8304 	break;
8305 
8306       case ADDRESS_LO_SUM:
8307 	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8308 	output_addr_const (f, addr.offset);
8309 	asm_fprintf (f, "]");
8310 	return true;
8311 
8312       case ADDRESS_SYMBOLIC:
8313 	output_addr_const (f, x);
8314 	return true;
8315       }
8316 
8317   return false;
8318 }
8319 
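/* For illustration (register numbers chosen arbitrarily), the cases above
   produce address operands such as "[x0]", "[x0, 16]", "[x0, #2, mul vl]"
   for SVE data modes, "[x0, x1, lsl 3]", "[x0, w1, sxtw 2]", "[x0, 32]!"
   for PRE_MODIFY, "[x0], 16" for POST_INC and "[x0, #:lo12:sym]" for
   LO_SUM addresses.  */
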
8320 /* Print address 'x' of a memory access with mode 'mode'.  */
8321 static void
8322 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8323 {
8324   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8325     output_addr_const (f, x);
8326 }
8327 
8328 bool
8329 aarch64_label_mentioned_p (rtx x)
8330 {
8331   const char *fmt;
8332   int i;
8333 
8334   if (GET_CODE (x) == LABEL_REF)
8335     return true;
8336 
8337   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8338      referencing instruction, but they are constant offsets, not
8339      symbols.  */
8340   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8341     return false;
8342 
8343   fmt = GET_RTX_FORMAT (GET_CODE (x));
8344   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8345     {
8346       if (fmt[i] == 'E')
8347 	{
8348 	  int j;
8349 
8350 	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8351 	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8352 	      return 1;
8353 	}
8354       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8355 	return 1;
8356     }
8357 
8358   return 0;
8359 }
8360 
8361 /* Implement REGNO_REG_CLASS.  */
8362 
8363 enum reg_class
8364 aarch64_regno_regclass (unsigned regno)
8365 {
8366   if (STUB_REGNUM_P (regno))
8367     return STUB_REGS;
8368 
8369   if (GP_REGNUM_P (regno))
8370     return GENERAL_REGS;
8371 
8372   if (regno == SP_REGNUM)
8373     return STACK_REG;
8374 
8375   if (regno == FRAME_POINTER_REGNUM
8376       || regno == ARG_POINTER_REGNUM)
8377     return POINTER_REGS;
8378 
8379   if (FP_REGNUM_P (regno))
8380     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
8381 
8382   if (PR_REGNUM_P (regno))
8383     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8384 
8385   return NO_REGS;
8386 }
8387 
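/* For example (illustrative only): x5 maps to GENERAL_REGS, sp to
   STACK_REG, v8 to FP_LO_REGS, v20 to FP_REGS, p3 to PR_LO_REGS and
   p12 to PR_HI_REGS under the tests above.  */
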
8388 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8389    If OFFSET is out of range, return an offset of an anchor point
8390    that is in range.  Return 0 otherwise.  */
8391 
8392 static HOST_WIDE_INT
8393 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8394 		       machine_mode mode)
8395 {
8396   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
8397   if (size > 16)
8398     return (offset + 0x400) & ~0x7f0;
8399 
8400   /* For offsets that aren't a multiple of the access size, the limit is
8401      -256...255.  */
8402   if (offset & (size - 1))
8403     {
8404       /* BLKmode typically uses LDP of X-registers.  */
8405       if (mode == BLKmode)
8406 	return (offset + 512) & ~0x3ff;
8407       return (offset + 0x100) & ~0x1ff;
8408     }
8409 
8410   /* Small negative offsets are supported.  */
8411   if (IN_RANGE (offset, -256, 0))
8412     return 0;
8413 
8414   if (mode == TImode || mode == TFmode)
8415     return (offset + 0x100) & ~0x1ff;
8416 
8417   /* Use a 12-bit offset scaled by the access size.  */
8418   return offset & (~0xfff * size);
8419 }
8420 
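/* A worked example of the arithmetic above (illustrative only): for an
   aligned DImode access (SIZE == 8) at OFFSET == 0x12340, the final case
   yields 0x12340 & (~0xfff * 8) == 0x10000, leaving a residual offset of
   0x2340 that fits the scaled unsigned 12-bit immediate of an 8-byte
   LDR/STR.  */
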
8421 static rtx
8422 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
8423 {
8424   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8425      where mask is selected by alignment and size of the offset.
8426      We try to pick as large a range for the offset as possible to
8427      maximize the chance of a CSE.  However, for aligned addresses
8428      we limit the range to 4k so that structures with different sized
8429      elements are likely to use the same base.  We need to be careful
8430      not to split a CONST for some forms of address expression, otherwise
8431      it will generate sub-optimal code.  */
8432 
8433   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8434     {
8435       rtx base = XEXP (x, 0);
8436       rtx offset_rtx = XEXP (x, 1);
8437       HOST_WIDE_INT offset = INTVAL (offset_rtx);
8438 
8439       if (GET_CODE (base) == PLUS)
8440 	{
8441 	  rtx op0 = XEXP (base, 0);
8442 	  rtx op1 = XEXP (base, 1);
8443 
8444 	  /* Force any scaling into a temp for CSE.  */
8445 	  op0 = force_reg (Pmode, op0);
8446 	  op1 = force_reg (Pmode, op1);
8447 
8448 	  /* Let the pointer register be in op0.  */
8449 	  if (REG_POINTER (op1))
8450 	    std::swap (op0, op1);
8451 
8452 	  /* If the pointer is virtual or frame related, then we know that
8453 	     virtual register instantiation or register elimination is going
8454 	     to apply a second constant.  We want the two constants folded
8455 	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
8456 	  if (virt_or_elim_regno_p (REGNO (op0)))
8457 	    {
8458 	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8459 				   NULL_RTX, true, OPTAB_DIRECT);
8460 	      return gen_rtx_PLUS (Pmode, base, op1);
8461 	    }
8462 
8463 	  /* Otherwise, in order to encourage CSE (and thence loop strength
8464 	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
8465 	  base = expand_binop (Pmode, add_optab, op0, op1,
8466 			       NULL_RTX, true, OPTAB_DIRECT);
8467 	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8468 	}
8469 
8470       HOST_WIDE_INT size;
8471       if (GET_MODE_SIZE (mode).is_constant (&size))
8472 	{
8473 	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8474 							     mode);
8475 	  if (base_offset != 0)
8476 	    {
8477 	      base = plus_constant (Pmode, base, base_offset);
8478 	      base = force_operand (base, NULL_RTX);
8479 	      return plus_constant (Pmode, base, offset - base_offset);
8480 	    }
8481 	}
8482     }
8483 
8484   return x;
8485 }
8486 
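/* Continuing the example given after aarch64_anchor_offset (illustrative
   only): for (plus (reg:DI R) (const_int 0x12340)) with a DImode access,
   aarch64_legitimize_address above splits the address into
   TMP = R + 0x10000 followed by TMP + 0x2340, so that neighbouring
   accesses anchored at the same 0x10000 base can CSE TMP.  */
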
8487 static reg_class_t
8488 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8489 			  reg_class_t rclass,
8490 			  machine_mode mode,
8491 			  secondary_reload_info *sri)
8492 {
8493   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8494      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
8495      comment at the head of aarch64-sve.md for more details about the
8496      big-endian handling.  */
8497   if (BYTES_BIG_ENDIAN
8498       && reg_class_subset_p (rclass, FP_REGS)
8499       && !((REG_P (x) && HARD_REGISTER_P (x))
8500 	   || aarch64_simd_valid_immediate (x, NULL))
8501       && aarch64_sve_data_mode_p (mode))
8502     {
8503       sri->icode = CODE_FOR_aarch64_sve_reload_be;
8504       return NO_REGS;
8505     }
8506 
8507   /* If we have to disable direct literal pool loads and stores because the
8508      function is too big, then we need a scratch register.  */
8509   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8510       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8511 	  || targetm.vector_mode_supported_p (GET_MODE (x)))
8512       && !aarch64_pcrelative_literal_loads)
8513     {
8514       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8515       return NO_REGS;
8516     }
8517 
8518   /* Without the TARGET_SIMD instructions we cannot move a Q register
8519      to a Q register directly.  We need a scratch.  */
8520   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8521       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8522       && reg_class_subset_p (rclass, FP_REGS))
8523     {
8524       sri->icode = code_for_aarch64_reload_mov (mode);
8525       return NO_REGS;
8526     }
8527 
8528   /* A TFmode or TImode memory access should be handled via FP_REGS
8529      because AArch64 has richer addressing modes for LDR/STR instructions
8530      than LDP/STP instructions.  */
8531   if (TARGET_FLOAT && rclass == GENERAL_REGS
8532       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8533     return FP_REGS;
8534 
8535   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8536       return GENERAL_REGS;
8537 
8538   return NO_REGS;
8539 }
8540 
8541 static bool
8542 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8543 {
8544   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8545 
8546   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8547      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
8548   if (frame_pointer_needed)
8549     return to == HARD_FRAME_POINTER_REGNUM;
8550   return true;
8551 }
8552 
8553 poly_int64
8554 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8555 {
8556   if (to == HARD_FRAME_POINTER_REGNUM)
8557     {
8558       if (from == ARG_POINTER_REGNUM)
8559 	return cfun->machine->frame.hard_fp_offset;
8560 
8561       if (from == FRAME_POINTER_REGNUM)
8562 	return cfun->machine->frame.hard_fp_offset
8563 	       - cfun->machine->frame.locals_offset;
8564     }
8565 
8566   if (to == STACK_POINTER_REGNUM)
8567     {
8568       if (from == FRAME_POINTER_REGNUM)
8569 	  return cfun->machine->frame.frame_size
8570 		 - cfun->machine->frame.locals_offset;
8571     }
8572 
8573   return cfun->machine->frame.frame_size;
8574 }
8575 
8576 
8577 /* Get return address without mangling.  */
8578 
8579 rtx
8580 aarch64_return_addr_rtx (void)
8581 {
8582   rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
8583   /* Note: aarch64_return_address_signing_enabled only
8584      works after cfun->machine->frame.laid_out is set,
8585      so here we don't know if the return address will
8586      be signed or not.  */
8587   rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
8588   emit_move_insn (lr, val);
8589   emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
8590   return lr;
8591 }
8592 
8593 
8594 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
8595    previous frame.  */
8596 
8597 rtx
8598 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8599 {
8600   if (count != 0)
8601     return const0_rtx;
8602   return aarch64_return_addr_rtx ();
8603 }
8604 
8605 
8606 static void
8607 aarch64_asm_trampoline_template (FILE *f)
8608 {
8609   int offset1 = 24;
8610   int offset2 = 28;
8611 
8612   if (aarch64_bti_enabled ())
8613     {
8614       asm_fprintf (f, "\thint\t34 // bti c\n");
8615       offset1 -= 4;
8616       offset2 -= 4;
8617     }
8618 
8619   if (TARGET_ILP32)
8620     {
8621       asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8622       asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8623 		   offset1);
8624     }
8625   else
8626     {
8627       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8628       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8629 		   offset2);
8630     }
8631   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8632 
8633   /* We always emit a speculation barrier.
8634      This is because the same trampoline template is used for every nested
8635      function.  Since nested functions are not particularly common or
8636      performance-critical, we don't worry too much about the extra
8637      instructions to copy around.
8638      This is not yet a problem, since we have not yet implemented function
8639      specific attributes to choose between hardening against straight line
8640      speculation or not, but such function specific attributes are likely to
8641      happen in the future.  */
8642   asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
8643 
8644   /* The trampoline needs an extra padding instruction.  If BTI is enabled,
8645      the padding instruction is replaced by the BTI instruction at the
8646      beginning.  */
8647   if (!aarch64_bti_enabled ())
8648     assemble_aligned_integer (4, const0_rtx);
8649 
8650   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8651   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8652 }
8653 
8654 static void
8655 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8656 {
8657   rtx fnaddr, mem, a_tramp;
8658   const int tramp_code_sz = 24;
8659 
8660   /* Don't need to copy the trailing D-words, we fill those in below.  */
8661   /* We create our own memory address in Pmode so that `emit_block_move` can
8662      use parts of the backend which expect Pmode addresses.  */
8663   rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
8664   emit_block_move (gen_rtx_MEM (BLKmode, temp),
8665 		   assemble_trampoline_template (),
8666 		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8667   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8668   fnaddr = XEXP (DECL_RTL (fndecl), 0);
8669   if (GET_MODE (fnaddr) != ptr_mode)
8670     fnaddr = convert_memory_address (ptr_mode, fnaddr);
8671   emit_move_insn (mem, fnaddr);
8672 
8673   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8674   emit_move_insn (mem, chain_value);
8675 
8676   /* XXX We should really define a "clear_cache" pattern and use
8677      gen_clear_cache().  */
8678   a_tramp = XEXP (m_tramp, 0);
8679   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8680 		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8681 		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8682 		     ptr_mode);
8683 }
8684 
8685 static unsigned char
8686 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8687 {
8688   /* ??? Logically we should only need to provide a value when
8689      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8690      can hold MODE, but at the moment we need to handle all modes.
8691      Just ignore any runtime parts for registers that can't store them.  */
8692   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8693   unsigned int nregs;
8694   switch (regclass)
8695     {
8696     case STUB_REGS:
8697     case TAILCALL_ADDR_REGS:
8698     case POINTER_REGS:
8699     case GENERAL_REGS:
8700     case ALL_REGS:
8701     case POINTER_AND_FP_REGS:
8702     case FP_REGS:
8703     case FP_LO_REGS:
8704       if (aarch64_sve_data_mode_p (mode)
8705 	  && constant_multiple_p (GET_MODE_SIZE (mode),
8706 				  BYTES_PER_SVE_VECTOR, &nregs))
8707 	return nregs;
8708       return (aarch64_vector_data_mode_p (mode)
8709 	      ? CEIL (lowest_size, UNITS_PER_VREG)
8710 	      : CEIL (lowest_size, UNITS_PER_WORD));
8711     case STACK_REG:
8712     case PR_REGS:
8713     case PR_LO_REGS:
8714     case PR_HI_REGS:
8715       return 1;
8716 
8717     case NO_REGS:
8718       return 0;
8719 
8720     default:
8721       break;
8722     }
8723   gcc_unreachable ();
8724 }
8725 
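/* For example (illustrative only, assuming 128-bit Advanced SIMD registers
   and 64-bit words): V4SImode in FP_REGS needs CEIL (16, 16) == 1 register,
   TImode in GENERAL_REGS needs CEIL (16, 8) == 2 registers, and an SVE data
   mode such as VNx4SImode needs exactly one Z register regardless of the
   runtime vector length.  */
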
8726 static reg_class_t
8727 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8728 {
8729   if (regclass == POINTER_REGS)
8730     return GENERAL_REGS;
8731 
8732   if (regclass == STACK_REG)
8733     {
8734       if (REG_P(x)
8735 	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8736 	  return regclass;
8737 
8738       return NO_REGS;
8739     }
8740 
8741   /* Register elimination can result in a request for
8742      SP+constant->FP_REGS.  We cannot support such operations, which
8743      use SP as source and an FP_REG as destination, so reject them
8744      right now.  */
8745   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8746     {
8747       rtx lhs = XEXP (x, 0);
8748 
8749       /* Look through a possible SUBREG introduced by ILP32.  */
8750       if (GET_CODE (lhs) == SUBREG)
8751 	lhs = SUBREG_REG (lhs);
8752 
8753       gcc_assert (REG_P (lhs));
8754       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8755 				      POINTER_REGS));
8756       return NO_REGS;
8757     }
8758 
8759   return regclass;
8760 }
8761 
8762 void
8763 aarch64_asm_output_labelref (FILE* f, const char *name)
8764 {
8765   asm_fprintf (f, "%U%s", name);
8766 }
8767 
8768 static void
8769 aarch64_elf_asm_constructor (rtx symbol, int priority)
8770 {
8771   if (priority == DEFAULT_INIT_PRIORITY)
8772     default_ctor_section_asm_out_constructor (symbol, priority);
8773   else
8774     {
8775       section *s;
8776       /* While priority is known to be in range [0, 65535], so 18 bytes
8777          would be enough, the compiler might not know that.  To avoid
8778          -Wformat-truncation false positive, use a larger size.  */
8779       char buf[23];
8780       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8781       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8782       switch_to_section (s);
8783       assemble_align (POINTER_SIZE);
8784       assemble_aligned_integer (POINTER_BYTES, symbol);
8785     }
8786 }
8787 
8788 static void
8789 aarch64_elf_asm_destructor (rtx symbol, int priority)
8790 {
8791   if (priority == DEFAULT_INIT_PRIORITY)
8792     default_dtor_section_asm_out_destructor (symbol, priority);
8793   else
8794     {
8795       section *s;
8796       /* While priority is known to be in range [0, 65535], so 18 bytes
8797          would be enough, the compiler might not know that.  To avoid
8798          -Wformat-truncation false positive, use a larger size.  */
8799       char buf[23];
8800       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8801       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8802       switch_to_section (s);
8803       assemble_align (POINTER_SIZE);
8804       assemble_aligned_integer (POINTER_BYTES, symbol);
8805     }
8806 }
8807 
8808 const char*
8809 aarch64_output_casesi (rtx *operands)
8810 {
8811   char buf[100];
8812   char label[100];
8813   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8814   int index;
8815   static const char *const patterns[4][2] =
8816   {
8817     {
8818       "ldrb\t%w3, [%0,%w1,uxtw]",
8819       "add\t%3, %4, %w3, sxtb #2"
8820     },
8821     {
8822       "ldrh\t%w3, [%0,%w1,uxtw #1]",
8823       "add\t%3, %4, %w3, sxth #2"
8824     },
8825     {
8826       "ldr\t%w3, [%0,%w1,uxtw #2]",
8827       "add\t%3, %4, %w3, sxtw #2"
8828     },
8829     /* We assume that DImode is only generated when not optimizing and
8830        that we don't really need 64-bit address offsets.  That would
8831        imply an object file with 8GB of code in a single function!  */
8832     {
8833       "ldr\t%w3, [%0,%w1,uxtw #2]",
8834       "add\t%3, %4, %w3, sxtw #2"
8835     }
8836   };
8837 
8838   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8839 
8840   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8841   index = exact_log2 (GET_MODE_SIZE (mode));
8842 
8843   gcc_assert (index >= 0 && index <= 3);
8844 
8845   /* Need to implement table size reduction, by changing the code below.  */
8846   output_asm_insn (patterns[index][0], operands);
8847   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8848   snprintf (buf, sizeof (buf),
8849 	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
8850   output_asm_insn (buf, operands);
8851   output_asm_insn (patterns[index][1], operands);
8852   output_asm_insn ("br\t%3", operands);
8853   output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
8854 		   operands);
8855   assemble_label (asm_out_file, label);
8856   return "";
8857 }
8858 
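/* As an illustration (register allocation is arbitrary): for a byte-sized
   dispatch table with the table address in x0, the index in w1 and x3/x4
   as scratch registers, the routine above emits roughly

	ldrb	w3, [x0, w1, uxtw]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxtb #2
	br	x3

   followed by the SLS speculation barrier (when that hardening is enabled)
   and then the .Lrtx<N> label itself.  */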
8859 
8860 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8861    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8862    operator.  */
8863 
8864 int
8865 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8866 {
8867   if (shift >= 0 && shift <= 3)
8868     {
8869       int size;
8870       for (size = 8; size <= 32; size *= 2)
8871 	{
8872 	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8873 	  if (mask == bits << shift)
8874 	    return size;
8875 	}
8876     }
8877   return 0;
8878 }
8879 
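/* Worked examples (illustrative only): aarch64_uxt_size (1, 0x1fe) == 8,
   since 0xff << 1 == 0x1fe (a UXTB operand scaled by 2), and
   aarch64_uxt_size (2, 0x3fffc) == 16, since 0xffff << 2 == 0x3fffc;
   any other mask shape returns 0.  */
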
8880 /* Constant pools are per-function only when PC-relative
8881    literal loads are enabled or we are in the large memory
8882    model.  */
8883 
8884 static inline bool
8885 aarch64_can_use_per_function_literal_pools_p (void)
8886 {
8887   return (aarch64_pcrelative_literal_loads
8888 	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8889 }
8890 
8891 static bool
8892 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8893 {
8894   /* We can't use blocks for constants when we're using a per-function
8895      constant pool.  */
8896   return !aarch64_can_use_per_function_literal_pools_p ();
8897 }
8898 
8899 /* Select appropriate section for constants depending
8900    on where we place literal pools.  */
8901 
8902 static section *
8903 aarch64_select_rtx_section (machine_mode mode,
8904 			    rtx x,
8905 			    unsigned HOST_WIDE_INT align)
8906 {
8907   if (aarch64_can_use_per_function_literal_pools_p ())
8908     return function_section (current_function_decl);
8909 
8910   return default_elf_select_rtx_section (mode, x, align);
8911 }
8912 
8913 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
8914 void
8915 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8916 				  HOST_WIDE_INT offset)
8917 {
8918   /* When using per-function literal pools, we must ensure that any code
8919      section is aligned to the minimal instruction length, lest we get
8920      errors from the assembler re "unaligned instructions".  */
8921   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8922     ASM_OUTPUT_ALIGN (f, 2);
8923 }
8924 
8925 /* Costs.  */
8926 
8927 /* Helper function for rtx cost calculation.  Strip a shift expression
8928    from X.  Returns the inner operand if successful, or the original
8929    expression on failure.  */
8930 static rtx
8931 aarch64_strip_shift (rtx x)
8932 {
8933   rtx op = x;
8934 
8935   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8936      we can convert both to ROR during final output.  */
8937   if ((GET_CODE (op) == ASHIFT
8938        || GET_CODE (op) == ASHIFTRT
8939        || GET_CODE (op) == LSHIFTRT
8940        || GET_CODE (op) == ROTATERT
8941        || GET_CODE (op) == ROTATE)
8942       && CONST_INT_P (XEXP (op, 1)))
8943     return XEXP (op, 0);
8944 
8945   if (GET_CODE (op) == MULT
8946       && CONST_INT_P (XEXP (op, 1))
8947       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8948     return XEXP (op, 0);
8949 
8950   return x;
8951 }
8952 
8953 /* Helper function for rtx cost calculation.  Strip an extend
8954    expression from X.  Returns the inner operand if successful, or the
8955    original expression on failure.  We deal with a number of possible
8956    canonicalization variations here. If STRIP_SHIFT is true, then
8957    we can strip off a shift also.  */
8958 static rtx
8959 aarch64_strip_extend (rtx x, bool strip_shift)
8960 {
8961   scalar_int_mode mode;
8962   rtx op = x;
8963 
8964   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8965     return op;
8966 
8967   /* Zero and sign extraction of a widened value.  */
8968   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8969       && XEXP (op, 2) == const0_rtx
8970       && GET_CODE (XEXP (op, 0)) == MULT
8971       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8972 					 XEXP (op, 1)))
8973     return XEXP (XEXP (op, 0), 0);
8974 
8975   /* It can also be represented (for zero-extend) as an AND with an
8976      immediate.  */
8977   if (GET_CODE (op) == AND
8978       && GET_CODE (XEXP (op, 0)) == MULT
8979       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8980       && CONST_INT_P (XEXP (op, 1))
8981       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8982 			   INTVAL (XEXP (op, 1))) != 0)
8983     return XEXP (XEXP (op, 0), 0);
8984 
8985   /* Now handle extended register, as this may also have an optional
8986      left shift by 1..4.  */
8987   if (strip_shift
8988       && GET_CODE (op) == ASHIFT
8989       && CONST_INT_P (XEXP (op, 1))
8990       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8991     op = XEXP (op, 0);
8992 
8993   if (GET_CODE (op) == ZERO_EXTEND
8994       || GET_CODE (op) == SIGN_EXTEND)
8995     op = XEXP (op, 0);
8996 
8997   if (op != x)
8998     return op;
8999 
9000   return x;
9001 }
9002 
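/* For example (illustrative only): aarch64_strip_shift maps
   (ashift:DI (reg:DI R) (const_int 3)) to (reg:DI R), and
   aarch64_strip_extend with STRIP_SHIFT set maps
   (ashift:DI (zero_extend:DI (reg:SI R)) (const_int 2)) to (reg:SI R),
   mirroring the extended-register "LSL #1..4" operand forms.  */
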
9003 /* Return true iff CODE is a shift supported in combination
9004    with arithmetic instructions.  */
9005 
9006 static bool
9007 aarch64_shift_p (enum rtx_code code)
9008 {
9009   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9010 }
9011 
9012 
9013 /* Return true iff X is a cheap shift without a sign extend. */
9014 
9015 static bool
9016 aarch64_cheap_mult_shift_p (rtx x)
9017 {
9018   rtx op0, op1;
9019 
9020   op0 = XEXP (x, 0);
9021   op1 = XEXP (x, 1);
9022 
9023   if (!(aarch64_tune_params.extra_tuning_flags
9024                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9025     return false;
9026 
9027   if (GET_CODE (op0) == SIGN_EXTEND)
9028     return false;
9029 
9030   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9031       && UINTVAL (op1) <= 4)
9032     return true;
9033 
9034   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9035     return false;
9036 
9037   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9038 
9039   if (l2 > 0 && l2 <= 4)
9040     return true;
9041 
9042   return false;
9043 }
9044 
9045 /* Helper function for rtx cost calculation.  Calculate the cost of
9046    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9047    Return the calculated cost of the expression, recursing manually in to
9048    operands where needed.  */
9049 
9050 static int
9051 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9052 {
9053   rtx op0, op1;
9054   const struct cpu_cost_table *extra_cost
9055     = aarch64_tune_params.insn_extra_cost;
9056   int cost = 0;
9057   bool compound_p = (outer == PLUS || outer == MINUS);
9058   machine_mode mode = GET_MODE (x);
9059 
9060   gcc_checking_assert (code == MULT);
9061 
9062   op0 = XEXP (x, 0);
9063   op1 = XEXP (x, 1);
9064 
9065   if (VECTOR_MODE_P (mode))
9066     mode = GET_MODE_INNER (mode);
9067 
9068   /* Integer multiply/fma.  */
9069   if (GET_MODE_CLASS (mode) == MODE_INT)
9070     {
9071       /* The multiply will be canonicalized as a shift, cost it as such.  */
9072       if (aarch64_shift_p (GET_CODE (x))
9073 	  || (CONST_INT_P (op1)
9074 	      && exact_log2 (INTVAL (op1)) > 0))
9075 	{
9076 	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9077 	                   || GET_CODE (op0) == SIGN_EXTEND;
9078 	  if (speed)
9079 	    {
9080 	      if (compound_p)
9081 	        {
9082 		  /* If the shift is considered cheap,
9083 		     then don't add any cost. */
9084 		  if (aarch64_cheap_mult_shift_p (x))
9085 		    ;
9086 	          else if (REG_P (op1))
9087 		    /* ARITH + shift-by-register.  */
9088 		    cost += extra_cost->alu.arith_shift_reg;
9089 		  else if (is_extend)
9090 		    /* ARITH + extended register.  We don't have a cost field
9091 		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
9092 		    cost += extra_cost->alu.extend_arith;
9093 		  else
9094 		    /* ARITH + shift-by-immediate.  */
9095 		    cost += extra_cost->alu.arith_shift;
9096 		}
9097 	      else
9098 		/* LSL (immediate).  */
9099 	        cost += extra_cost->alu.shift;
9100 
9101 	    }
9102 	  /* Strip extends as we will have costed them in the case above.  */
9103 	  if (is_extend)
9104 	    op0 = aarch64_strip_extend (op0, true);
9105 
9106 	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9107 
9108 	  return cost;
9109 	}
9110 
9111       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
9112 	 compound and let the below cases handle it.  After all, MNEG is a
9113 	 special-case alias of MSUB.  */
9114       if (GET_CODE (op0) == NEG)
9115 	{
9116 	  op0 = XEXP (op0, 0);
9117 	  compound_p = true;
9118 	}
9119 
9120       /* Integer multiplies or FMAs have zero/sign extending variants.  */
9121       if ((GET_CODE (op0) == ZERO_EXTEND
9122 	   && GET_CODE (op1) == ZERO_EXTEND)
9123 	  || (GET_CODE (op0) == SIGN_EXTEND
9124 	      && GET_CODE (op1) == SIGN_EXTEND))
9125 	{
9126 	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9127 	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9128 
9129 	  if (speed)
9130 	    {
9131 	      if (compound_p)
9132 		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
9133 		cost += extra_cost->mult[0].extend_add;
9134 	      else
9135 		/* MUL/SMULL/UMULL.  */
9136 		cost += extra_cost->mult[0].extend;
9137 	    }
9138 
9139 	  return cost;
9140 	}
9141 
9142       /* This is either an integer multiply or a MADD.  In both cases
9143 	 we want to recurse and cost the operands.  */
9144       cost += rtx_cost (op0, mode, MULT, 0, speed);
9145       cost += rtx_cost (op1, mode, MULT, 1, speed);
9146 
9147       if (speed)
9148 	{
9149 	  if (compound_p)
9150 	    /* MADD/MSUB.  */
9151 	    cost += extra_cost->mult[mode == DImode].add;
9152 	  else
9153 	    /* MUL.  */
9154 	    cost += extra_cost->mult[mode == DImode].simple;
9155 	}
9156 
9157       return cost;
9158     }
9159   else
9160     {
9161       if (speed)
9162 	{
9163 	  /* Floating-point FMA/FMUL can also support negations of the
9164 	     operands, unless the rounding mode is upward or downward in
9165 	     which case FNMUL is different from FMUL with operand negation.  */
9166 	  bool neg0 = GET_CODE (op0) == NEG;
9167 	  bool neg1 = GET_CODE (op1) == NEG;
9168 	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
9169 	    {
9170 	      if (neg0)
9171 		op0 = XEXP (op0, 0);
9172 	      if (neg1)
9173 		op1 = XEXP (op1, 0);
9174 	    }
9175 
9176 	  if (compound_p)
9177 	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
9178 	    cost += extra_cost->fp[mode == DFmode].fma;
9179 	  else
9180 	    /* FMUL/FNMUL.  */
9181 	    cost += extra_cost->fp[mode == DFmode].mult;
9182 	}
9183 
9184       cost += rtx_cost (op0, mode, MULT, 0, speed);
9185       cost += rtx_cost (op1, mode, MULT, 1, speed);
9186       return cost;
9187     }
9188 }
9189 
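/* For example (illustrative only): (mult:DI (reg) (const_int 8)) inside a
   PLUS is costed as an arithmetic instruction with a shift-by-immediate
   operand (alu.arith_shift, unless AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
   makes it free), whereas a plain (mult:SI (reg) (reg)) is costed as
   mult[0].simple plus the cost of its operands.  */
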
9190 static int
9191 aarch64_address_cost (rtx x,
9192 		      machine_mode mode,
9193 		      addr_space_t as ATTRIBUTE_UNUSED,
9194 		      bool speed)
9195 {
9196   enum rtx_code c = GET_CODE (x);
9197   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9198   struct aarch64_address_info info;
9199   int cost = 0;
9200   info.shift = 0;
9201 
9202   if (!aarch64_classify_address (&info, x, mode, false))
9203     {
9204       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9205 	{
9206 	  /* This is a CONST or SYMBOL ref which will be split
9207 	     in a different way depending on the code model in use.
9208 	     Cost it through the generic infrastructure.  */
9209 	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9210 	  /* Divide through by the cost of one instruction to
9211 	     bring it to the same units as the address costs.  */
9212 	  cost_symbol_ref /= COSTS_N_INSNS (1);
9213 	  /* The cost is then the cost of preparing the address,
9214 	     followed by an immediate (possibly 0) offset.  */
9215 	  return cost_symbol_ref + addr_cost->imm_offset;
9216 	}
9217       else
9218 	{
9219 	  /* This is most likely a jump table from a case
9220 	     statement.  */
9221 	  return addr_cost->register_offset;
9222 	}
9223     }
9224 
9225   switch (info.type)
9226     {
9227       case ADDRESS_LO_SUM:
9228       case ADDRESS_SYMBOLIC:
9229       case ADDRESS_REG_IMM:
9230 	cost += addr_cost->imm_offset;
9231 	break;
9232 
9233       case ADDRESS_REG_WB:
9234 	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9235 	  cost += addr_cost->pre_modify;
9236 	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9237 	  cost += addr_cost->post_modify;
9238 	else
9239 	  gcc_unreachable ();
9240 
9241 	break;
9242 
9243       case ADDRESS_REG_REG:
9244 	cost += addr_cost->register_offset;
9245 	break;
9246 
9247       case ADDRESS_REG_SXTW:
9248 	cost += addr_cost->register_sextend;
9249 	break;
9250 
9251       case ADDRESS_REG_UXTW:
9252 	cost += addr_cost->register_zextend;
9253 	break;
9254 
9255       default:
9256 	gcc_unreachable ();
9257     }
9258 
9259 
9260   if (info.shift > 0)
9261     {
9262       /* For the sake of calculating the cost of the shifted register
9263 	 component, we can treat same sized modes in the same way.  */
9264       if (known_eq (GET_MODE_BITSIZE (mode), 16))
9265 	cost += addr_cost->addr_scale_costs.hi;
9266       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9267 	cost += addr_cost->addr_scale_costs.si;
9268       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9269 	cost += addr_cost->addr_scale_costs.di;
9270       else
9271 	/* We can't tell, or this is a 128-bit vector.  */
9272 	cost += addr_cost->addr_scale_costs.ti;
9273     }
9274 
9275   return cost;
9276 }
9277 
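/* For example (illustrative only): an SImode access through
   (plus (reg) (ashift (reg) (const_int 2))) classifies as ADDRESS_REG_REG
   with a shift of 2, so it costs register_offset plus addr_scale_costs.si
   from the active tuning's address-cost table.  */
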
9278 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
9279    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
9280    to be taken.  */
9281 
9282 int
9283 aarch64_branch_cost (bool speed_p, bool predictable_p)
9284 {
9285   /* When optimizing for speed, use the cost of unpredictable branches.  */
9286   const struct cpu_branch_cost *branch_costs =
9287     aarch64_tune_params.branch_costs;
9288 
9289   if (!speed_p || predictable_p)
9290     return branch_costs->predictable;
9291   else
9292     return branch_costs->unpredictable;
9293 }
9294 
9295 /* Return true if the RTX X in mode MODE is a zero or sign extract
9296    usable in an ADD or SUB (extended register) instruction.  */
9297 static bool
9298 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9299 {
9300   /* Catch add with a sign extract.
9301      This is add_<optab><mode>_multp2.  */
9302   if (GET_CODE (x) == SIGN_EXTRACT
9303       || GET_CODE (x) == ZERO_EXTRACT)
9304     {
9305       rtx op0 = XEXP (x, 0);
9306       rtx op1 = XEXP (x, 1);
9307       rtx op2 = XEXP (x, 2);
9308 
9309       if (GET_CODE (op0) == MULT
9310 	  && CONST_INT_P (op1)
9311 	  && op2 == const0_rtx
9312 	  && CONST_INT_P (XEXP (op0, 1))
9313 	  && aarch64_is_extend_from_extract (mode,
9314 					     XEXP (op0, 1),
9315 					     op1))
9316 	{
9317 	  return true;
9318 	}
9319     }
9320   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9321      No shift.  */
9322   else if (GET_CODE (x) == SIGN_EXTEND
9323 	   || GET_CODE (x) == ZERO_EXTEND)
9324     return REG_P (XEXP (x, 0));
9325 
9326   return false;
9327 }
9328 
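/* For example (illustrative only): (zero_extend:DI (reg:SI R)) satisfies
   the ZERO_EXTEND case above and corresponds to the extended-register
   form ADD Xd, Xn, Wm, UXTW.  */
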
9329 static bool
9330 aarch64_frint_unspec_p (unsigned int u)
9331 {
9332   switch (u)
9333     {
9334       case UNSPEC_FRINTZ:
9335       case UNSPEC_FRINTP:
9336       case UNSPEC_FRINTM:
9337       case UNSPEC_FRINTA:
9338       case UNSPEC_FRINTN:
9339       case UNSPEC_FRINTX:
9340       case UNSPEC_FRINTI:
9341         return true;
9342 
9343       default:
9344         return false;
9345     }
9346 }
9347 
9348 /* Return true iff X is an rtx that will match an extr instruction
9349    i.e. as described in the *extr<mode>5_insn family of patterns.
9350    OP0 and OP1 will be set to the operands of the shifts involved
9351    on success and will be NULL_RTX otherwise.  */
9352 
9353 static bool
9354 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9355 {
9356   rtx op0, op1;
9357   scalar_int_mode mode;
9358   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9359     return false;
9360 
9361   *res_op0 = NULL_RTX;
9362   *res_op1 = NULL_RTX;
9363 
9364   if (GET_CODE (x) != IOR)
9365     return false;
9366 
9367   op0 = XEXP (x, 0);
9368   op1 = XEXP (x, 1);
9369 
9370   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9371       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9372     {
9373      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
9374       if (GET_CODE (op1) == ASHIFT)
9375         std::swap (op0, op1);
9376 
9377       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9378         return false;
9379 
9380       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9381       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9382 
9383       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9384           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9385         {
9386           *res_op0 = XEXP (op0, 0);
9387           *res_op1 = XEXP (op1, 0);
9388           return true;
9389         }
9390     }
9391 
9392   return false;
9393 }
9394 
9395 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9396    storing it in *COST.  Result is true if the total cost of the operation
9397    has now been calculated.  */
9398 static bool
9399 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9400 {
9401   rtx inner;
9402   rtx comparator;
9403   enum rtx_code cmpcode;
9404 
9405   if (COMPARISON_P (op0))
9406     {
9407       inner = XEXP (op0, 0);
9408       comparator = XEXP (op0, 1);
9409       cmpcode = GET_CODE (op0);
9410     }
9411   else
9412     {
9413       inner = op0;
9414       comparator = const0_rtx;
9415       cmpcode = NE;
9416     }
9417 
9418   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9419     {
9420       /* Conditional branch.  */
9421       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9422 	return true;
9423       else
9424 	{
9425 	  if (cmpcode == NE || cmpcode == EQ)
9426 	    {
9427 	      if (comparator == const0_rtx)
9428 		{
9429 		  /* TBZ/TBNZ/CBZ/CBNZ.  */
9430 		  if (GET_CODE (inner) == ZERO_EXTRACT)
9431 		    /* TBZ/TBNZ.  */
9432 		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9433 				       ZERO_EXTRACT, 0, speed);
9434 		  else
9435 		    /* CBZ/CBNZ.  */
9436 		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9437 
9438 		  return true;
9439 		}
9440 	    }
9441 	  else if (cmpcode == LT || cmpcode == GE)
9442 	    {
9443 	      /* TBZ/TBNZ.  */
9444 	      if (comparator == const0_rtx)
9445 		return true;
9446 	    }
9447 	}
9448     }
9449   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9450     {
9451       /* CCMP.  */
9452       if (GET_CODE (op1) == COMPARE)
9453 	{
9454 	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
9455 	  if (XEXP (op1, 1) == const0_rtx)
9456 	    *cost += 1;
9457 	  if (speed)
9458 	    {
9459 	      machine_mode mode = GET_MODE (XEXP (op1, 0));
9460 	      const struct cpu_cost_table *extra_cost
9461 		= aarch64_tune_params.insn_extra_cost;
9462 
9463 	      if (GET_MODE_CLASS (mode) == MODE_INT)
9464 		*cost += extra_cost->alu.arith;
9465 	      else
9466 		*cost += extra_cost->fp[mode == DFmode].compare;
9467 	    }
9468 	  return true;
9469 	}
9470 
9471       /* It's a conditional operation based on the status flags,
9472 	 so it must be some flavor of CSEL.  */
9473 
9474       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
9475       if (GET_CODE (op1) == NEG
9476           || GET_CODE (op1) == NOT
9477           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9478 	op1 = XEXP (op1, 0);
9479       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9480 	{
9481 	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
9482 	  op1 = XEXP (op1, 0);
9483 	  op2 = XEXP (op2, 0);
9484 	}
9485 
9486       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9487       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9488       return true;
9489     }
9490 
9491   /* We don't know what this is, cost all operands.  */
9492   return false;
9493 }
9494 
9495 /* Check whether X is a bitfield operation of the form shift + extend that
9496    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
9497    operand to which the bitfield operation is applied.  Otherwise return
9498    NULL_RTX.  */
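/* Illustrative examples (assuming the usual bitfield md patterns):
     (zero_extend:SI (lshiftrt:HI (reg:HI) (const_int 3)))  ->  UBFX
     (sign_extend:SI (ashiftrt:QI (reg:QI) (const_int 2)))  ->  SBFX
     (zero_extend:DI (ashift:HI (reg:HI) (const_int 4)))    ->  UBFIZ
   In each case the inner register operand is returned.  */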
9499 
9500 static rtx
9501 aarch64_extend_bitfield_pattern_p (rtx x)
9502 {
9503   rtx_code outer_code = GET_CODE (x);
9504   machine_mode outer_mode = GET_MODE (x);
9505 
9506   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9507       && outer_mode != SImode && outer_mode != DImode)
9508     return NULL_RTX;
9509 
9510   rtx inner = XEXP (x, 0);
9511   rtx_code inner_code = GET_CODE (inner);
9512   machine_mode inner_mode = GET_MODE (inner);
9513   rtx op = NULL_RTX;
9514 
9515   switch (inner_code)
9516     {
9517       case ASHIFT:
9518 	if (CONST_INT_P (XEXP (inner, 1))
9519 	    && (inner_mode == QImode || inner_mode == HImode))
9520 	  op = XEXP (inner, 0);
9521 	break;
9522       case LSHIFTRT:
9523 	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9524 	    && (inner_mode == QImode || inner_mode == HImode))
9525 	  op = XEXP (inner, 0);
9526 	break;
9527       case ASHIFTRT:
9528 	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9529 	    && (inner_mode == QImode || inner_mode == HImode))
9530 	  op = XEXP (inner, 0);
9531 	break;
9532       default:
9533 	break;
9534     }
9535 
9536   return op;
9537 }
9538 
9539 /* Return true if the mask and a shift amount from an RTX of the form
9540    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9541    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
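/* For example (illustration only): in SImode, with SHFT_AMNT = 8 and
   MASK = 0x00ffff00, MASK >> 8 is 0xffff (so adding 1 gives a power of two)
   and no mask bits lie below the shift amount, so (x << 8) & 0x00ffff00
   can be implemented as UBFIZ w0, w1, #8, #16.  */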
9542 
9543 bool
9544 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9545 				    rtx shft_amnt)
9546 {
9547   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9548 	 && INTVAL (mask) > 0
9549 	 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9550 	 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
9551 	 && (UINTVAL (mask)
9552 	     & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
9553 }
9554 
9555 /* Return true if the masks and a shift amount from an RTX of the form
9556    ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9557    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
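/* For example (illustration only): in SImode, MASK2 = 0x0000ff00 with
   SHFT_AMNT = 8 gives MASK2 + (1 << 8) = 0x10000, a power of two, so the
   contiguity test below accepts it, and MASK1 must be ~MASK2 = 0xffff00ff.
   This corresponds to a BFI inserting an 8-bit field at bit position 8.  */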
9558 
9559 bool
9560 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9561 				   unsigned HOST_WIDE_INT mask1,
9562 				   unsigned HOST_WIDE_INT shft_amnt,
9563 				   unsigned HOST_WIDE_INT mask2)
9564 {
9565   unsigned HOST_WIDE_INT t;
9566 
9567   /* Verify that mask1 is the exact bitwise complement of mask2.  */
9568   if (mask1 != ~mask2)
9569     return false;
9570 
9571   /* Verify that mask2 is not all zeros or ones.  */
9572   if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9573     return false;
9574 
9575   /* The shift amount should always be less than the mode size.  */
9576   gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9577 
9578   /* Verify that the mask being shifted is contiguous and would be in the
9579      least significant bits after shifting by shft_amnt.  */
9580   t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9581   return (t == (t & -t));
9582 }
9583 
9584 /* Calculate the cost of calculating X, storing it in *COST.  Result
9585    is true if the total cost of the operation has now been calculated.  */
9586 static bool
9587 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9588 		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9589 {
9590   rtx op0, op1, op2;
9591   const struct cpu_cost_table *extra_cost
9592     = aarch64_tune_params.insn_extra_cost;
9593   int code = GET_CODE (x);
9594   scalar_int_mode int_mode;
9595 
9596   /* By default, assume that everything has equivalent cost to the
9597      cheapest instruction.  Any additional costs are applied as a delta
9598      above this default.  */
9599   *cost = COSTS_N_INSNS (1);
9600 
9601   switch (code)
9602     {
9603     case SET:
9604       /* The cost depends entirely on the operands to SET.  */
9605       *cost = 0;
9606       op0 = SET_DEST (x);
9607       op1 = SET_SRC (x);
9608 
9609       switch (GET_CODE (op0))
9610 	{
9611 	case MEM:
9612 	  if (speed)
9613 	    {
9614 	      rtx address = XEXP (op0, 0);
9615 	      if (VECTOR_MODE_P (mode))
9616 		*cost += extra_cost->ldst.storev;
9617 	      else if (GET_MODE_CLASS (mode) == MODE_INT)
9618 		*cost += extra_cost->ldst.store;
9619 	      else if (mode == SFmode)
9620 		*cost += extra_cost->ldst.storef;
9621 	      else if (mode == DFmode)
9622 		*cost += extra_cost->ldst.stored;
9623 
9624 	      *cost +=
9625 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
9626 						     0, speed));
9627 	    }
9628 
9629 	  *cost += rtx_cost (op1, mode, SET, 1, speed);
9630 	  return true;
9631 
9632 	case SUBREG:
9633 	  if (! REG_P (SUBREG_REG (op0)))
9634 	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9635 
9636 	  /* Fall through.  */
9637 	case REG:
9638 	  /* The cost is one per vector-register copied.  */
9639 	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9640 	    {
9641 	      int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9642 	      *cost = COSTS_N_INSNS (nregs);
9643 	    }
9644 	  /* const0_rtx is in general free, but we will use an
9645 	     instruction to set a register to 0.  */
9646 	  else if (REG_P (op1) || op1 == const0_rtx)
9647 	    {
9648 	      /* The cost is 1 per register copied.  */
9649 	      int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9650 	      *cost = COSTS_N_INSNS (nregs);
9651 	    }
9652           else
9653 	    /* Cost is just the cost of the RHS of the set.  */
9654 	    *cost += rtx_cost (op1, mode, SET, 1, speed);
9655 	  return true;
9656 
9657 	case ZERO_EXTRACT:
9658 	case SIGN_EXTRACT:
9659 	  /* Bit-field insertion.  Strip any redundant widening of
9660 	     the RHS to meet the width of the target.  */
9661 	  if (GET_CODE (op1) == SUBREG)
9662 	    op1 = SUBREG_REG (op1);
9663 	  if ((GET_CODE (op1) == ZERO_EXTEND
9664 	       || GET_CODE (op1) == SIGN_EXTEND)
9665 	      && CONST_INT_P (XEXP (op0, 1))
9666 	      && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9667 	      && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9668 	    op1 = XEXP (op1, 0);
9669 
9670           if (CONST_INT_P (op1))
9671             {
9672               /* MOV immediate is assumed to always be cheap.  */
9673               *cost = COSTS_N_INSNS (1);
9674             }
9675           else
9676             {
9677               /* BFM.  */
9678 	      if (speed)
9679 		*cost += extra_cost->alu.bfi;
9680 	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9681             }
9682 
9683 	  return true;
9684 
9685 	default:
9686 	  /* We can't make sense of this, assume default cost.  */
9687           *cost = COSTS_N_INSNS (1);
9688 	  return false;
9689 	}
9690       return false;
9691 
9692     case CONST_INT:
9693       /* If an instruction can incorporate a constant within the
9694 	 instruction, the instruction's expression avoids calling
9695 	 rtx_cost() on the constant.  If rtx_cost() is called on a
9696 	 constant, then it is usually because the constant must be
9697 	 moved into a register by one or more instructions.
9698 
9699 	 The exception is constant 0, which can be expressed
9700 	 as XZR/WZR and is therefore free.  The exception to this is
9701 	 if we have (set (reg) (const0_rtx)) in which case we must cost
9702 	 the move.  However, we can catch that when we cost the SET, so
9703 	 we don't need to consider that here.  */
9704       if (x == const0_rtx)
9705 	*cost = 0;
9706       else
9707 	{
9708 	  /* To an approximation, building any other constant is
9709 	     proportionally expensive to the number of instructions
9710 	     required to build that constant.  This is true whether we
9711 	     are compiling for SPEED or otherwise.  */
9712 	  if (!is_a <scalar_int_mode> (mode, &int_mode))
9713 	    int_mode = word_mode;
9714 	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9715 				 (NULL_RTX, x, false, int_mode));
9716 	}
9717       return true;
9718 
9719     case CONST_DOUBLE:
9720 
9721       /* First determine number of instructions to do the move
9722 	  as an integer constant.  */
9723       if (!aarch64_float_const_representable_p (x)
9724 	   && !aarch64_can_const_movi_rtx_p (x, mode)
9725 	   && aarch64_float_const_rtx_p (x))
9726 	{
9727 	  unsigned HOST_WIDE_INT ival;
9728 	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9729 	  gcc_assert (succeed);
9730 
9731 	  scalar_int_mode imode = (mode == HFmode
9732 				   ? SImode
9733 				   : int_mode_for_mode (mode).require ());
9734 	  int ncost = aarch64_internal_mov_immediate
9735 		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
9736 	  *cost += COSTS_N_INSNS (ncost);
9737 	  return true;
9738 	}
9739 
9740       if (speed)
9741 	{
9742 	  /* mov[df,sf]_aarch64.  */
9743 	  if (aarch64_float_const_representable_p (x))
9744 	    /* FMOV (scalar immediate).  */
9745 	    *cost += extra_cost->fp[mode == DFmode].fpconst;
9746 	  else if (!aarch64_float_const_zero_rtx_p (x))
9747 	    {
9748 	      /* This will be a load from memory.  */
9749 	      if (mode == DFmode)
9750 		*cost += extra_cost->ldst.loadd;
9751 	      else
9752 		*cost += extra_cost->ldst.loadf;
9753 	    }
9754 	  else
9755 	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
9756 	       or MOV v0.s[0], wzr - neither of which are modeled by the
9757 	       cost tables.  Just use the default cost.  */
9758 	    {
9759 	    }
9760 	}
9761 
9762       return true;
9763 
9764     case MEM:
9765       if (speed)
9766 	{
9767 	  /* For loads we want the base cost of a load, plus an
9768 	     approximation for the additional cost of the addressing
9769 	     mode.  */
9770 	  rtx address = XEXP (x, 0);
9771 	  if (VECTOR_MODE_P (mode))
9772 	    *cost += extra_cost->ldst.loadv;
9773 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
9774 	    *cost += extra_cost->ldst.load;
9775 	  else if (mode == SFmode)
9776 	    *cost += extra_cost->ldst.loadf;
9777 	  else if (mode == DFmode)
9778 	    *cost += extra_cost->ldst.loadd;
9779 
9780 	  *cost +=
9781 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
9782 						     0, speed));
9783 	}
9784 
9785       return true;
9786 
9787     case NEG:
9788       op0 = XEXP (x, 0);
9789 
9790       if (VECTOR_MODE_P (mode))
9791 	{
9792 	  if (speed)
9793 	    {
9794 	      /* FNEG.  */
9795 	      *cost += extra_cost->vect.alu;
9796 	    }
9797 	  return false;
9798 	}
9799 
9800       if (GET_MODE_CLASS (mode) == MODE_INT)
9801 	{
9802           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9803               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9804             {
9805               /* CSETM.  */
9806 	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9807               return true;
9808             }
9809 
9810 	  /* Cost this as SUB wzr, X.  */
9811           op0 = CONST0_RTX (mode);
9812           op1 = XEXP (x, 0);
9813           goto cost_minus;
9814         }
9815 
9816       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9817         {
9818           /* Support (neg(fma...)) as a single instruction only if
9819              sign of zeros is unimportant.  This matches the decision
9820              making in aarch64.md.  */
9821           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9822             {
9823 	      /* FNMADD.  */
9824 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
9825               return true;
9826             }
9827 	  if (GET_CODE (op0) == MULT)
9828 	    {
9829 	      /* FNMUL.  */
9830 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
9831 	      return true;
9832 	    }
9833 	  if (speed)
9834 	    /* FNEG.  */
9835 	    *cost += extra_cost->fp[mode == DFmode].neg;
9836           return false;
9837         }
9838 
9839       return false;
9840 
9841     case CLRSB:
9842     case CLZ:
9843       if (speed)
9844 	{
9845 	  if (VECTOR_MODE_P (mode))
9846 	    *cost += extra_cost->vect.alu;
9847 	  else
9848 	    *cost += extra_cost->alu.clz;
9849 	}
9850 
9851       return false;
9852 
9853     case COMPARE:
9854       op0 = XEXP (x, 0);
9855       op1 = XEXP (x, 1);
9856 
9857       if (op1 == const0_rtx
9858 	  && GET_CODE (op0) == AND)
9859 	{
9860 	  x = op0;
9861 	  mode = GET_MODE (op0);
9862 	  goto cost_logic;
9863 	}
9864 
9865       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9866         {
9867           /* TODO: A write to the CC flags possibly costs extra, this
9868 	     needs encoding in the cost tables.  */
9869 
9870 	  mode = GET_MODE (op0);
9871           /* ANDS.  */
9872           if (GET_CODE (op0) == AND)
9873             {
9874               x = op0;
9875               goto cost_logic;
9876             }
9877 
9878           if (GET_CODE (op0) == PLUS)
9879             {
9880 	      /* ADDS (and CMN alias).  */
9881               x = op0;
9882               goto cost_plus;
9883             }
9884 
9885           if (GET_CODE (op0) == MINUS)
9886             {
9887 	      /* SUBS.  */
9888               x = op0;
9889               goto cost_minus;
9890             }
9891 
9892 	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9893 	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9894 	      && CONST_INT_P (XEXP (op0, 2)))
9895 	    {
9896 	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9897 		 Handle it here directly rather than going to cost_logic
9898 		 since we know the immediate generated for the TST is valid
9899 		 so we can avoid creating an intermediate rtx for it only
9900 		 for costing purposes.  */
9901 	      if (speed)
9902 		*cost += extra_cost->alu.logical;
9903 
9904 	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9905 				 ZERO_EXTRACT, 0, speed);
9906 	      return true;
9907 	    }
9908 
9909           if (GET_CODE (op1) == NEG)
9910             {
9911 	      /* CMN.  */
9912 	      if (speed)
9913 		*cost += extra_cost->alu.arith;
9914 
9915 	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9916 	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9917               return true;
9918             }
9919 
9920           /* CMP.
9921 
9922 	     Compare can freely swap the order of operands, and
9923              canonicalization puts the more complex operation first.
9924              But the integer MINUS logic expects the shift/extend
9925              operation in op1.  */
9926           if (! (REG_P (op0)
9927                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9928           {
9929             op0 = XEXP (x, 1);
9930             op1 = XEXP (x, 0);
9931           }
9932           goto cost_minus;
9933         }
9934 
9935       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9936         {
9937 	  /* FCMP.  */
9938 	  if (speed)
9939 	    *cost += extra_cost->fp[mode == DFmode].compare;
9940 
9941           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9942             {
9943 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9944               /* FCMP supports constant 0.0 for no extra cost. */
9945               return true;
9946             }
9947           return false;
9948         }
9949 
9950       if (VECTOR_MODE_P (mode))
9951 	{
9952 	  /* Vector compare.  */
9953 	  if (speed)
9954 	    *cost += extra_cost->vect.alu;
9955 
9956 	  if (aarch64_float_const_zero_rtx_p (op1))
9957 	    {
9958 	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9959 		 cost.  */
9960 	      return true;
9961 	    }
9962 	  return false;
9963 	}
9964       return false;
9965 
9966     case MINUS:
9967       {
9968 	op0 = XEXP (x, 0);
9969 	op1 = XEXP (x, 1);
9970 
9971 cost_minus:
9972 	*cost += rtx_cost (op0, mode, MINUS, 0, speed);
9973 
9974 	/* Detect valid immediates.  */
9975 	if ((GET_MODE_CLASS (mode) == MODE_INT
9976 	     || (GET_MODE_CLASS (mode) == MODE_CC
9977 		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9978 	    && CONST_INT_P (op1)
9979 	    && aarch64_uimm12_shift (INTVAL (op1)))
9980 	  {
9981 	    if (speed)
9982 	      /* SUB(S) (immediate).  */
9983 	      *cost += extra_cost->alu.arith;
9984 	    return true;
9985 	  }
9986 
9987 	/* Look for SUB (extended register).  */
9988 	if (is_a <scalar_int_mode> (mode, &int_mode)
9989 	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9990 	  {
9991 	    if (speed)
9992 	      *cost += extra_cost->alu.extend_arith;
9993 
9994 	    op1 = aarch64_strip_extend (op1, true);
9995 	    *cost += rtx_cost (op1, VOIDmode,
9996 			       (enum rtx_code) GET_CODE (op1), 0, speed);
9997 	    return true;
9998 	  }
9999 
10000 	rtx new_op1 = aarch64_strip_extend (op1, false);
10001 
10002 	/* Cost this as an FMA-alike operation.  */
10003 	if ((GET_CODE (new_op1) == MULT
10004 	     || aarch64_shift_p (GET_CODE (new_op1)))
10005 	    && code != COMPARE)
10006 	  {
10007 	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10008 					    (enum rtx_code) code,
10009 					    speed);
10010 	    return true;
10011 	  }
10012 
10013 	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10014 
10015 	if (speed)
10016 	  {
10017 	    if (VECTOR_MODE_P (mode))
10018 	      {
10019 		/* Vector SUB.  */
10020 		*cost += extra_cost->vect.alu;
10021 	      }
10022 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
10023 	      {
10024 		/* SUB(S).  */
10025 		*cost += extra_cost->alu.arith;
10026 	      }
10027 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10028 	      {
10029 		/* FSUB.  */
10030 		*cost += extra_cost->fp[mode == DFmode].addsub;
10031 	      }
10032 	  }
10033 	return true;
10034       }
10035 
10036     case PLUS:
10037       {
10038 	rtx new_op0;
10039 
10040 	op0 = XEXP (x, 0);
10041 	op1 = XEXP (x, 1);
10042 
10043 cost_plus:
10044 	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10045 	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10046 	  {
10047 	    /* CSINC.  */
10048 	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10049 	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10050 	    return true;
10051 	  }
10052 
10053 	if (GET_MODE_CLASS (mode) == MODE_INT
10054 	    && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10055 		|| aarch64_sve_addvl_addpl_immediate (op1, mode)))
10056 	  {
10057 	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10058 
10059 	    if (speed)
10060 	      /* ADD (immediate).  */
10061 	      *cost += extra_cost->alu.arith;
10062 	    return true;
10063 	  }
10064 
10065 	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
10066 
10067 	/* Look for ADD (extended register).  */
10068 	if (is_a <scalar_int_mode> (mode, &int_mode)
10069 	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10070 	  {
10071 	    if (speed)
10072 	      *cost += extra_cost->alu.extend_arith;
10073 
10074 	    op0 = aarch64_strip_extend (op0, true);
10075 	    *cost += rtx_cost (op0, VOIDmode,
10076 			       (enum rtx_code) GET_CODE (op0), 0, speed);
10077 	    return true;
10078 	  }
10079 
10080 	/* Strip any extend, leave shifts behind as we will
10081 	   cost them through mult_cost.  */
10082 	new_op0 = aarch64_strip_extend (op0, false);
10083 
10084 	if (GET_CODE (new_op0) == MULT
10085 	    || aarch64_shift_p (GET_CODE (new_op0)))
10086 	  {
10087 	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10088 					    speed);
10089 	    return true;
10090 	  }
10091 
10092 	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10093 
10094 	if (speed)
10095 	  {
10096 	    if (VECTOR_MODE_P (mode))
10097 	      {
10098 		/* Vector ADD.  */
10099 		*cost += extra_cost->vect.alu;
10100 	      }
10101 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
10102 	      {
10103 		/* ADD.  */
10104 		*cost += extra_cost->alu.arith;
10105 	      }
10106 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10107 	      {
10108 		/* FADD.  */
10109 		*cost += extra_cost->fp[mode == DFmode].addsub;
10110 	      }
10111 	  }
10112 	return true;
10113       }
10114 
10115     case BSWAP:
10116       *cost = COSTS_N_INSNS (1);
10117 
10118       if (speed)
10119 	{
10120 	  if (VECTOR_MODE_P (mode))
10121 	    *cost += extra_cost->vect.alu;
10122 	  else
10123 	    *cost += extra_cost->alu.rev;
10124 	}
10125       return false;
10126 
10127     case IOR:
10128       if (aarch_rev16_p (x))
10129         {
10130           *cost = COSTS_N_INSNS (1);
10131 
10132 	  if (speed)
10133 	    {
10134 	      if (VECTOR_MODE_P (mode))
10135 		*cost += extra_cost->vect.alu;
10136 	      else
10137 		*cost += extra_cost->alu.rev;
10138 	    }
10139 	  return true;
10140         }
10141 
10142       if (aarch64_extr_rtx_p (x, &op0, &op1))
10143         {
10144 	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
10145 	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
10146           if (speed)
10147             *cost += extra_cost->alu.shift;
10148 
10149           return true;
10150         }
10151     /* Fall through.  */
10152     case XOR:
10153     case AND:
10154     cost_logic:
10155       op0 = XEXP (x, 0);
10156       op1 = XEXP (x, 1);
10157 
10158       if (VECTOR_MODE_P (mode))
10159 	{
10160 	  if (speed)
10161 	    *cost += extra_cost->vect.alu;
10162 	  return true;
10163 	}
10164 
10165       if (code == AND
10166           && GET_CODE (op0) == MULT
10167           && CONST_INT_P (XEXP (op0, 1))
10168           && CONST_INT_P (op1)
10169           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10170                                INTVAL (op1)) != 0)
10171         {
10172           /* This is a UBFM/SBFM.  */
10173 	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10174 	  if (speed)
10175 	    *cost += extra_cost->alu.bfx;
10176           return true;
10177         }
10178 
10179       if (is_int_mode (mode, &int_mode))
10180 	{
10181 	  if (CONST_INT_P (op1))
10182 	    {
10183 	      /* We have a mask + shift version of a UBFIZ
10184 		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
10185 	      if (GET_CODE (op0) == ASHIFT
10186 		  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10187 							 XEXP (op0, 1)))
10188 		{
10189 		  *cost += rtx_cost (XEXP (op0, 0), int_mode,
10190 				     (enum rtx_code) code, 0, speed);
10191 		  if (speed)
10192 		    *cost += extra_cost->alu.bfx;
10193 
10194 		  return true;
10195 		}
10196 	      else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10197 		{
10198 		/* We may get the immediate for free; this is not
10199 		   modelled.  */
10200 		  *cost += rtx_cost (op0, int_mode,
10201 				     (enum rtx_code) code, 0, speed);
10202 		  if (speed)
10203 		    *cost += extra_cost->alu.logical;
10204 
10205 		  return true;
10206 		}
10207 	    }
10208 	  else
10209 	    {
10210 	      rtx new_op0 = op0;
10211 
10212 	      /* Handle ORN, EON, or BIC.  */
10213 	      if (GET_CODE (op0) == NOT)
10214 		op0 = XEXP (op0, 0);
10215 
10216 	      new_op0 = aarch64_strip_shift (op0);
10217 
10218 	      /* If we had a shift on op0 then this is a logical-shift-
10219 		 by-register/immediate operation.  Otherwise, this is just
10220 		 a logical operation.  */
10221 	      if (speed)
10222 		{
10223 		  if (new_op0 != op0)
10224 		    {
10225 		      /* Shift by immediate.  */
10226 		      if (CONST_INT_P (XEXP (op0, 1)))
10227 			*cost += extra_cost->alu.log_shift;
10228 		      else
10229 			*cost += extra_cost->alu.log_shift_reg;
10230 		    }
10231 		  else
10232 		    *cost += extra_cost->alu.logical;
10233 		}
10234 
10235 	      /* In both cases we want to cost both operands.  */
10236 	      *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10237 				 0, speed);
10238 	      *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10239 				 1, speed);
10240 
10241 	      return true;
10242 	    }
10243 	}
10244       return false;
10245 
10246     case NOT:
10247       x = XEXP (x, 0);
10248       op0 = aarch64_strip_shift (x);
10249 
10250       if (VECTOR_MODE_P (mode))
10251 	{
10252 	  /* Vector NOT.  */
10253 	  *cost += extra_cost->vect.alu;
10254 	  return false;
10255 	}
10256 
10257       /* MVN-shifted-reg.  */
10258       if (op0 != x)
10259         {
10260 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10261 
10262           if (speed)
10263             *cost += extra_cost->alu.log_shift;
10264 
10265           return true;
10266         }
10267       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10268          Handle the second form here taking care that 'a' in the above can
10269          be a shift.  */
10270       else if (GET_CODE (op0) == XOR)
10271         {
10272           rtx newop0 = XEXP (op0, 0);
10273           rtx newop1 = XEXP (op0, 1);
10274           rtx op0_stripped = aarch64_strip_shift (newop0);
10275 
10276 	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10277 	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10278 
10279           if (speed)
10280             {
10281               if (op0_stripped != newop0)
10282                 *cost += extra_cost->alu.log_shift;
10283               else
10284                 *cost += extra_cost->alu.logical;
10285             }
10286 
10287           return true;
10288         }
10289       /* MVN.  */
10290       if (speed)
10291 	*cost += extra_cost->alu.logical;
10292 
10293       return false;
10294 
10295     case ZERO_EXTEND:
10296 
10297       op0 = XEXP (x, 0);
10298       /* If a value is written in SI mode, then zero extended to DI
10299 	 mode, the operation will in general be free as a write to
10300 	 a 'w' register implicitly zeroes the upper bits of an 'x'
10301 	 register.  However, if this is
10302 
10303 	   (set (reg) (zero_extend (reg)))
10304 
10305 	 we must cost the explicit register move.  */
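      /* For example, a 32-bit ADD already clears bits 63:32 of the
	 destination X register, so (zero_extend:DI (plus:SI ...)) normally
	 needs no extra instruction.  */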
10306       if (mode == DImode
10307 	  && GET_MODE (op0) == SImode
10308 	  && outer == SET)
10309 	{
10310 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10311 
10312 	/* If OP_COST is non-zero, then the cost of the zero extend
10313 	   is effectively the cost of the inner operation.  Otherwise
10314 	   we have a MOV instruction and we take the cost from the MOV
10315 	   itself.  This is true independently of whether we are
10316 	   optimizing for space or time.  */
10317 	  if (op_cost)
10318 	    *cost = op_cost;
10319 
10320 	  return true;
10321 	}
10322       else if (MEM_P (op0))
10323 	{
10324 	  /* All loads can zero extend to any size for free.  */
10325 	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10326 	  return true;
10327 	}
10328 
10329       op0 = aarch64_extend_bitfield_pattern_p (x);
10330       if (op0)
10331 	{
10332 	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10333 	  if (speed)
10334 	    *cost += extra_cost->alu.bfx;
10335 	  return true;
10336 	}
10337 
10338       if (speed)
10339 	{
10340 	  if (VECTOR_MODE_P (mode))
10341 	    {
10342 	      /* UMOV.  */
10343 	      *cost += extra_cost->vect.alu;
10344 	    }
10345 	  else
10346 	    {
10347 	      /* We generate an AND instead of UXTB/UXTH.  */
10348 	      *cost += extra_cost->alu.logical;
10349 	    }
10350 	}
10351       return false;
10352 
10353     case SIGN_EXTEND:
10354       if (MEM_P (XEXP (x, 0)))
10355 	{
10356 	  /* LDRSH.  */
10357 	  if (speed)
10358 	    {
10359 	      rtx address = XEXP (XEXP (x, 0), 0);
10360 	      *cost += extra_cost->ldst.load_sign_extend;
10361 
10362 	      *cost +=
10363 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
10364 						     0, speed));
10365 	    }
10366 	  return true;
10367 	}
10368 
10369       op0 = aarch64_extend_bitfield_pattern_p (x);
10370       if (op0)
10371 	{
10372 	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10373 	  if (speed)
10374 	    *cost += extra_cost->alu.bfx;
10375 	  return true;
10376 	}
10377 
10378       if (speed)
10379 	{
10380 	  if (VECTOR_MODE_P (mode))
10381 	    *cost += extra_cost->vect.alu;
10382 	  else
10383 	    *cost += extra_cost->alu.extend;
10384 	}
10385       return false;
10386 
10387     case ASHIFT:
10388       op0 = XEXP (x, 0);
10389       op1 = XEXP (x, 1);
10390 
10391       if (CONST_INT_P (op1))
10392         {
10393 	  if (speed)
10394 	    {
10395 	      if (VECTOR_MODE_P (mode))
10396 		{
10397 		  /* Vector shift (immediate).  */
10398 		  *cost += extra_cost->vect.alu;
10399 		}
10400 	      else
10401 		{
10402 		  /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
10403 		     aliases.  */
10404 		  *cost += extra_cost->alu.shift;
10405 		}
10406 	    }
10407 
10408           /* We can incorporate zero/sign extend for free.  */
10409           if (GET_CODE (op0) == ZERO_EXTEND
10410               || GET_CODE (op0) == SIGN_EXTEND)
10411             op0 = XEXP (op0, 0);
10412 
10413 	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10414           return true;
10415         }
10416       else
10417         {
10418 	  if (VECTOR_MODE_P (mode))
10419 	    {
10420 	      if (speed)
10421 		/* Vector shift (register).  */
10422 		*cost += extra_cost->vect.alu;
10423 	    }
10424 	  else
10425 	    {
10426 	      if (speed)
10427 		/* LSLV.  */
10428 		*cost += extra_cost->alu.shift_reg;
10429 
10430 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10431 		  && CONST_INT_P (XEXP (op1, 1))
10432 		  && known_eq (INTVAL (XEXP (op1, 1)),
10433 			       GET_MODE_BITSIZE (mode) - 1))
10434 		{
10435 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10436 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
10437 		     don't recurse into it.  */
10438 		  return true;
10439 		}
10440 	    }
10441 	  return false;  /* All arguments need to be in registers.  */
10442         }
10443 
10444     case ROTATE:
10445     case ROTATERT:
10446     case LSHIFTRT:
10447     case ASHIFTRT:
10448       op0 = XEXP (x, 0);
10449       op1 = XEXP (x, 1);
10450 
10451       if (CONST_INT_P (op1))
10452 	{
10453 	  /* ASR (immediate) and friends.  */
10454 	  if (speed)
10455 	    {
10456 	      if (VECTOR_MODE_P (mode))
10457 		*cost += extra_cost->vect.alu;
10458 	      else
10459 		*cost += extra_cost->alu.shift;
10460 	    }
10461 
10462 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10463 	  return true;
10464 	}
10465       else
10466 	{
10467 	  if (VECTOR_MODE_P (mode))
10468 	    {
10469 	      if (speed)
10470 		/* Vector shift (register).  */
10471 		*cost += extra_cost->vect.alu;
10472 	    }
10473 	  else
10474 	    {
10475 	      if (speed)
10476 		/* ASR (register) and friends.  */
10477 		*cost += extra_cost->alu.shift_reg;
10478 
10479 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10480 		  && CONST_INT_P (XEXP (op1, 1))
10481 		  && known_eq (INTVAL (XEXP (op1, 1)),
10482 			       GET_MODE_BITSIZE (mode) - 1))
10483 		{
10484 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10485 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
10486 		     don't recurse into it.  */
10487 		  return true;
10488 		}
10489 	    }
10490 	  return false;  /* All arguments need to be in registers.  */
10491 	}
10492 
10493     case SYMBOL_REF:
10494 
10495       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10496 	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10497 	{
10498 	  /* LDR.  */
10499 	  if (speed)
10500 	    *cost += extra_cost->ldst.load;
10501 	}
10502       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10503 	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10504 	{
10505 	  /* ADRP, followed by ADD.  */
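	  /* A typical small-code-model expansion (illustrative):
	       adrp x0, sym
	       add  x0, x0, :lo12:sym  */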
10506 	  *cost += COSTS_N_INSNS (1);
10507 	  if (speed)
10508 	    *cost += 2 * extra_cost->alu.arith;
10509 	}
10510       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10511 	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10512 	{
10513 	  /* ADR.  */
10514 	  if (speed)
10515 	    *cost += extra_cost->alu.arith;
10516 	}
10517 
10518       if (flag_pic)
10519 	{
10520 	  /* One extra load instruction, after accessing the GOT.  */
10521 	  *cost += COSTS_N_INSNS (1);
10522 	  if (speed)
10523 	    *cost += extra_cost->ldst.load;
10524 	}
10525       return true;
10526 
10527     case HIGH:
10528     case LO_SUM:
10529       /* ADRP/ADD (immediate).  */
10530       if (speed)
10531 	*cost += extra_cost->alu.arith;
10532       return true;
10533 
10534     case ZERO_EXTRACT:
10535     case SIGN_EXTRACT:
10536       /* UBFX/SBFX.  */
10537       if (speed)
10538 	{
10539 	  if (VECTOR_MODE_P (mode))
10540 	    *cost += extra_cost->vect.alu;
10541 	  else
10542 	    *cost += extra_cost->alu.bfx;
10543 	}
10544 
10545       /* We can trust that the immediates used will be correct (there
10546 	 are no by-register forms), so we need only cost op0.  */
10547       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10548       return true;
10549 
10550     case MULT:
10551       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10552       /* aarch64_rtx_mult_cost always handles recursion to its
10553 	 operands.  */
10554       return true;
10555 
10556     case MOD:
10557     /* We can expand signed mod by a power of 2 using a NEGS, two parallel
10558        ANDs and a CSNEG.  Assume here that the cost of a CSNEG is the same
10559        as that of an unconditional negate.  This case should only ever be
10560        reached through the set_smod_pow2_cheap check in expmed.c.  */
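    /* As an illustration, for x % 4 in SImode the expansion is roughly:
	 negs	w1, w0
	 and	w0, w0, 3
	 and	w1, w1, 3
	 csneg	w0, w0, w1, mi
       which is where the four-instruction baseline below comes from.  */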
10561       if (CONST_INT_P (XEXP (x, 1))
10562 	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10563 	  && (mode == SImode || mode == DImode))
10564 	{
10565 	  /* We expand to 4 instructions.  Reset the baseline.  */
10566 	  *cost = COSTS_N_INSNS (4);
10567 
10568 	  if (speed)
10569 	    *cost += 2 * extra_cost->alu.logical
10570 		     + 2 * extra_cost->alu.arith;
10571 
10572 	  return true;
10573 	}
10574 
10575     /* Fall through.  */
10576     case UMOD:
10577       if (speed)
10578 	{
10579 	  /* Slightly prefer UMOD over SMOD.  */
10580 	  if (VECTOR_MODE_P (mode))
10581 	    *cost += extra_cost->vect.alu;
10582 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
10583 	    *cost += (extra_cost->mult[mode == DImode].add
10584 		      + extra_cost->mult[mode == DImode].idiv
10585 		      + (code == MOD ? 1 : 0));
10586 	}
10587       return false;  /* All arguments need to be in registers.  */
10588 
10589     case DIV:
10590     case UDIV:
10591     case SQRT:
10592       if (speed)
10593 	{
10594 	  if (VECTOR_MODE_P (mode))
10595 	    *cost += extra_cost->vect.alu;
10596 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
10597 	    /* There is no integer SQRT, so only DIV and UDIV can get
10598 	       here.  */
10599 	    *cost += (extra_cost->mult[mode == DImode].idiv
10600 		     /* Slightly prefer UDIV over SDIV.  */
10601 		     + (code == DIV ? 1 : 0));
10602 	  else
10603 	    *cost += extra_cost->fp[mode == DFmode].div;
10604 	}
10605       return false;  /* All arguments need to be in registers.  */
10606 
10607     case IF_THEN_ELSE:
10608       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10609 					 XEXP (x, 2), cost, speed);
10610 
10611     case EQ:
10612     case NE:
10613     case GT:
10614     case GTU:
10615     case LT:
10616     case LTU:
10617     case GE:
10618     case GEU:
10619     case LE:
10620     case LEU:
10621 
10622       return false; /* All arguments must be in registers.  */
10623 
10624     case FMA:
10625       op0 = XEXP (x, 0);
10626       op1 = XEXP (x, 1);
10627       op2 = XEXP (x, 2);
10628 
10629       if (speed)
10630 	{
10631 	  if (VECTOR_MODE_P (mode))
10632 	    *cost += extra_cost->vect.alu;
10633 	  else
10634 	    *cost += extra_cost->fp[mode == DFmode].fma;
10635 	}
10636 
10637       /* FMSUB, FNMADD, and FNMSUB are free.  */
10638       if (GET_CODE (op0) == NEG)
10639         op0 = XEXP (op0, 0);
10640 
10641       if (GET_CODE (op2) == NEG)
10642         op2 = XEXP (op2, 0);
10643 
10644       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10645 	 and the by-element operand as operand 0.  */
10646       if (GET_CODE (op1) == NEG)
10647         op1 = XEXP (op1, 0);
10648 
10649       /* Catch vector-by-element operations.  The by-element operand can
10650 	 either be (vec_duplicate (vec_select (x))) or just
10651 	 (vec_select (x)), depending on whether we are multiplying by
10652 	 a vector or a scalar.
10653 
10654 	 Canonicalization is not very good in these cases, FMA4 will put the
10655 	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
10656       if (GET_CODE (op0) == VEC_DUPLICATE)
10657 	op0 = XEXP (op0, 0);
10658       else if (GET_CODE (op1) == VEC_DUPLICATE)
10659 	op1 = XEXP (op1, 0);
10660 
10661       if (GET_CODE (op0) == VEC_SELECT)
10662 	op0 = XEXP (op0, 0);
10663       else if (GET_CODE (op1) == VEC_SELECT)
10664 	op1 = XEXP (op1, 0);
10665 
10666       /* If the remaining parameters are not registers,
10667          get the cost to put them into registers.  */
10668       *cost += rtx_cost (op0, mode, FMA, 0, speed);
10669       *cost += rtx_cost (op1, mode, FMA, 1, speed);
10670       *cost += rtx_cost (op2, mode, FMA, 2, speed);
10671       return true;
10672 
10673     case FLOAT:
10674     case UNSIGNED_FLOAT:
10675       if (speed)
10676 	*cost += extra_cost->fp[mode == DFmode].fromint;
10677       return false;
10678 
10679     case FLOAT_EXTEND:
10680       if (speed)
10681 	{
10682 	  if (VECTOR_MODE_P (mode))
10683 	    {
10684 	      /* Vector widening conversion.  */
10685 	      *cost += extra_cost->vect.alu;
10686 	    }
10687 	  else
10688 	    *cost += extra_cost->fp[mode == DFmode].widen;
10689 	}
10690       return false;
10691 
10692     case FLOAT_TRUNCATE:
10693       if (speed)
10694 	{
10695 	  if (VECTOR_MODE_P (mode))
10696 	    {
10697 	      /* Vector narrowing conversion.  */
10698 	      *cost += extra_cost->vect.alu;
10699 	    }
10700 	  else
10701 	    *cost += extra_cost->fp[mode == DFmode].narrow;
10702 	}
10703       return false;
10704 
10705     case FIX:
10706     case UNSIGNED_FIX:
10707       x = XEXP (x, 0);
10708       /* Strip the rounding part.  They will all be implemented
10709          by the fcvt* family of instructions anyway.  */
10710       if (GET_CODE (x) == UNSPEC)
10711         {
10712           unsigned int uns_code = XINT (x, 1);
10713 
10714           if (uns_code == UNSPEC_FRINTA
10715               || uns_code == UNSPEC_FRINTM
10716               || uns_code == UNSPEC_FRINTN
10717               || uns_code == UNSPEC_FRINTP
10718               || uns_code == UNSPEC_FRINTZ)
10719             x = XVECEXP (x, 0, 0);
10720         }
10721 
10722       if (speed)
10723 	{
10724 	  if (VECTOR_MODE_P (mode))
10725 	    *cost += extra_cost->vect.alu;
10726 	  else
10727 	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10728 	}
10729 
10730       /* We can combine fmul by a power of 2 followed by a fcvt into a single
10731 	 fixed-point fcvt.  */
10732       if (GET_CODE (x) == MULT
10733 	  && ((VECTOR_MODE_P (mode)
10734 	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10735 	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10736 	{
10737 	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10738 			     0, speed);
10739 	  return true;
10740 	}
10741 
10742       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10743       return true;
10744 
10745     case ABS:
10746       if (VECTOR_MODE_P (mode))
10747 	{
10748 	  /* ABS (vector).  */
10749 	  if (speed)
10750 	    *cost += extra_cost->vect.alu;
10751 	}
10752       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10753 	{
10754 	  op0 = XEXP (x, 0);
10755 
10756 	  /* FABD, which is analogous to FADD.  */
10757 	  if (GET_CODE (op0) == MINUS)
10758 	    {
10759 	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10760 	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10761 	      if (speed)
10762 		*cost += extra_cost->fp[mode == DFmode].addsub;
10763 
10764 	      return true;
10765 	    }
10766 	  /* Simple FABS is analogous to FNEG.  */
10767 	  if (speed)
10768 	    *cost += extra_cost->fp[mode == DFmode].neg;
10769 	}
10770       else
10771 	{
10772 	  /* Integer ABS will either be split to
10773 	     two arithmetic instructions, or will be an ABS
10774 	     (scalar), which we don't model.  */
10775 	  *cost = COSTS_N_INSNS (2);
10776 	  if (speed)
10777 	    *cost += 2 * extra_cost->alu.arith;
10778 	}
10779       return false;
10780 
10781     case SMAX:
10782     case SMIN:
10783       if (speed)
10784 	{
10785 	  if (VECTOR_MODE_P (mode))
10786 	    *cost += extra_cost->vect.alu;
10787 	  else
10788 	    {
10789 	      /* FMAXNM/FMINNM/FMAX/FMIN.
10790 	         TODO: This may not be accurate for all implementations, but
10791 	         we do not model this in the cost tables.  */
10792 	      *cost += extra_cost->fp[mode == DFmode].addsub;
10793 	    }
10794 	}
10795       return false;
10796 
10797     case UNSPEC:
10798       /* The floating point round to integer frint* instructions.  */
10799       if (aarch64_frint_unspec_p (XINT (x, 1)))
10800         {
10801           if (speed)
10802             *cost += extra_cost->fp[mode == DFmode].roundint;
10803 
10804           return false;
10805         }
10806 
10807       if (XINT (x, 1) == UNSPEC_RBIT)
10808         {
10809           if (speed)
10810             *cost += extra_cost->alu.rev;
10811 
10812           return false;
10813         }
10814       break;
10815 
10816     case TRUNCATE:
10817 
10818       /* Decompose <su>muldi3_highpart.  */
10819       if (/* (truncate:DI  */
10820 	  mode == DImode
10821 	  /*   (lshiftrt:TI  */
10822           && GET_MODE (XEXP (x, 0)) == TImode
10823           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10824 	  /*      (mult:TI  */
10825           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10826 	  /*        (ANY_EXTEND:TI (reg:DI))
10827 	            (ANY_EXTEND:TI (reg:DI)))  */
10828           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10829                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10830               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10831                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10832           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10833           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10834 	  /*     (const_int 64)  */
10835           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10836           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10837         {
10838           /* UMULH/SMULH.  */
10839 	  if (speed)
10840 	    *cost += extra_cost->mult[mode == DImode].extend;
10841 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10842 			     mode, MULT, 0, speed);
10843 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10844 			     mode, MULT, 1, speed);
10845           return true;
10846         }
10847 
10848       /* Fall through.  */
10849     default:
10850       break;
10851     }
10852 
10853   if (dump_file
10854       && flag_aarch64_verbose_cost)
10855     fprintf (dump_file,
10856       "\nFailed to cost RTX.  Assuming default cost.\n");
10857 
10858   return true;
10859 }
10860 
10861 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
10862    calculated for X.  This cost is stored in *COST.  Returns true
10863    if the total cost of X was calculated.  */
10864 static bool
10865 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10866 		   int param, int *cost, bool speed)
10867 {
10868   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10869 
10870   if (dump_file
10871       && flag_aarch64_verbose_cost)
10872     {
10873       print_rtl_single (dump_file, x);
10874       fprintf (dump_file, "\n%s cost: %d (%s)\n",
10875 	       speed ? "Hot" : "Cold",
10876 	       *cost, result ? "final" : "partial");
10877     }
10878 
10879   return result;
10880 }
10881 
10882 static int
10883 aarch64_register_move_cost (machine_mode mode,
10884 			    reg_class_t from_i, reg_class_t to_i)
10885 {
10886   enum reg_class from = (enum reg_class) from_i;
10887   enum reg_class to = (enum reg_class) to_i;
10888   const struct cpu_regmove_cost *regmove_cost
10889     = aarch64_tune_params.regmove_cost;
10890 
10891   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
10892   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
10893       || to == STUB_REGS)
10894     to = GENERAL_REGS;
10895 
10896   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
10897       || from == STUB_REGS)
10898     from = GENERAL_REGS;
10899 
10900   /* Moving between GPR and stack cost is the same as GP2GP.  */
10901   if ((from == GENERAL_REGS && to == STACK_REG)
10902       || (to == GENERAL_REGS && from == STACK_REG))
10903     return regmove_cost->GP2GP;
10904 
10905   /* To/From the stack register, we move via the gprs.  */
10906   if (to == STACK_REG || from == STACK_REG)
10907     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10908             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10909 
10910   if (known_eq (GET_MODE_SIZE (mode), 16))
10911     {
10912       /* 128-bit operations on general registers require 2 instructions.  */
10913       if (from == GENERAL_REGS && to == GENERAL_REGS)
10914 	return regmove_cost->GP2GP * 2;
10915       else if (from == GENERAL_REGS)
10916 	return regmove_cost->GP2FP * 2;
10917       else if (to == GENERAL_REGS)
10918 	return regmove_cost->FP2GP * 2;
10919 
10920       /* When AdvSIMD instructions are disabled it is not possible to move
10921 	 a 128-bit value directly between Q registers.  This is handled in
10922 	 secondary reload.  A general register is used as a scratch to move
10923 	 the upper DI value and the lower DI value is moved directly,
10924 	 hence the cost is the sum of three moves. */
10925       if (! TARGET_SIMD)
10926 	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10927 
10928       return regmove_cost->FP2FP;
10929     }
10930 
10931   if (from == GENERAL_REGS && to == GENERAL_REGS)
10932     return regmove_cost->GP2GP;
10933   else if (from == GENERAL_REGS)
10934     return regmove_cost->GP2FP;
10935   else if (to == GENERAL_REGS)
10936     return regmove_cost->FP2GP;
10937 
10938   return regmove_cost->FP2FP;
10939 }
10940 
10941 static int
10942 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10943 			  reg_class_t rclass ATTRIBUTE_UNUSED,
10944 			  bool in ATTRIBUTE_UNUSED)
10945 {
10946   return aarch64_tune_params.memmov_cost;
10947 }
10948 
10949 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10950    to optimize 1.0/sqrt.  */
10951 
10952 static bool
10953 use_rsqrt_p (machine_mode mode)
10954 {
10955   return (!flag_trapping_math
10956 	  && flag_unsafe_math_optimizations
10957 	  && ((aarch64_tune_params.approx_modes->recip_sqrt
10958 	       & AARCH64_APPROX_MODE (mode))
10959 	      || flag_mrecip_low_precision_sqrt));
10960 }
10961 
10962 /* Function to decide when to use the approximate reciprocal square root
10963    builtin.  */
10964 
10965 static tree
10966 aarch64_builtin_reciprocal (tree fndecl)
10967 {
10968   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10969 
10970   if (!use_rsqrt_p (mode))
10971     return NULL_TREE;
10972   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10973 }
10974 
10975 /* Emit instruction sequence to compute either the approximate square root
10976    or its approximate reciprocal, depending on the flag RECP, and return
10977    whether the sequence was emitted or not.  */
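/* A sketch of the underlying math: the refinement loop below performs the
   Newton-Raphson iteration for 1/sqrt(a),
     x_{n+1} = x_n * (3 - a * x_n^2) / 2,
   where FRSQRTE supplies the initial estimate x_0 and FRSQRTS computes
   (3 - a * b) / 2 for each step.  */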
10978 
10979 bool
10980 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10981 {
10982   machine_mode mode = GET_MODE (dst);
10983 
10984   if (GET_MODE_INNER (mode) == HFmode)
10985     {
10986       gcc_assert (!recp);
10987       return false;
10988     }
10989 
10990   if (!recp)
10991     {
10992       if (!(flag_mlow_precision_sqrt
10993 	    || (aarch64_tune_params.approx_modes->sqrt
10994 		& AARCH64_APPROX_MODE (mode))))
10995 	return false;
10996 
10997       if (flag_finite_math_only
10998 	  || flag_trapping_math
10999 	  || !flag_unsafe_math_optimizations
11000 	  || optimize_function_for_size_p (cfun))
11001 	return false;
11002     }
11003   else
11004     /* Caller assumes we cannot fail.  */
11005     gcc_assert (use_rsqrt_p (mode));
11006 
11007   machine_mode mmsk = mode_for_int_vector (mode).require ();
11008   rtx xmsk = gen_reg_rtx (mmsk);
11009   if (!recp)
11010     /* When calculating the approximate square root, compare the
11011        argument with 0.0 and create a mask.  */
11012     emit_insn (gen_rtx_SET (xmsk,
11013 			    gen_rtx_NEG (mmsk,
11014 					 gen_rtx_EQ (mmsk, src,
11015 						     CONST0_RTX (mode)))));
11016 
11017   /* Estimate the approximate reciprocal square root.  */
11018   rtx xdst = gen_reg_rtx (mode);
11019   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11020 
11021   /* Iterate over the series twice for SF and thrice for DF.  */
11022   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11023 
11024   /* Optionally iterate over the series once less for faster performance
11025      while sacrificing some accuracy.  */
11026   if ((recp && flag_mrecip_low_precision_sqrt)
11027       || (!recp && flag_mlow_precision_sqrt))
11028     iterations--;
11029 
11030   /* Iterate over the series to calculate the approximate reciprocal square
11031      root.  */
11032   rtx x1 = gen_reg_rtx (mode);
11033   while (iterations--)
11034     {
11035       rtx x2 = gen_reg_rtx (mode);
11036       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11037 
11038       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11039 
11040       if (iterations > 0)
11041 	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11042     }
11043 
11044   if (!recp)
11045     {
11046       /* Qualify the approximate reciprocal square root when the argument is
11047 	 0.0 by squashing the intermediate result to 0.0.  */
11048       rtx xtmp = gen_reg_rtx (mmsk);
11049       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11050 					      gen_rtx_SUBREG (mmsk, xdst, 0)));
11051       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11052 
11053       /* Calculate the approximate square root.  */
11054       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11055     }
11056 
11057   /* Finalize the approximation.  */
11058   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11059 
11060   return true;
11061 }
11062 
11063 /* Emit the instruction sequence to compute the approximation for the division
11064    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
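/* A sketch of the underlying math: the refinement loop below performs the
   Newton-Raphson iteration for 1/DEN,
     x_{n+1} = x_n * (2 - DEN * x_n),
   where FRECPE supplies the initial estimate x_0 and FRECPS computes
   (2 - a * b) for each step.  */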
11065 
11066 bool
11067 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11068 {
11069   machine_mode mode = GET_MODE (quo);
11070 
11071   if (GET_MODE_INNER (mode) == HFmode)
11072     return false;
11073 
11074   bool use_approx_division_p = (flag_mlow_precision_div
11075 			        || (aarch64_tune_params.approx_modes->division
11076 				    & AARCH64_APPROX_MODE (mode)));
11077 
11078   if (!flag_finite_math_only
11079       || flag_trapping_math
11080       || !flag_unsafe_math_optimizations
11081       || optimize_function_for_size_p (cfun)
11082       || !use_approx_division_p)
11083     return false;
11084 
11085   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11086     return false;
11087 
11088   /* Estimate the approximate reciprocal.  */
11089   rtx xrcp = gen_reg_rtx (mode);
11090   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11091 
11092   /* Iterate over the series twice for SF and thrice for DF.  */
11093   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11094 
11095   /* Optionally run one iteration fewer for faster performance,
11096      at the expense of some accuracy.  */
11097   if (flag_mlow_precision_div)
11098     iterations--;
11099 
11100   /* Iterate over the series to calculate the approximate reciprocal.  */
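  /* Each pass is one Newton-Raphson step for 1/DEN: FRECPS computes
     2 - a * b, so the step refines xrcp to xrcp * (2 - den * xrcp), again
     roughly doubling the number of correct bits.  */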
11101   rtx xtmp = gen_reg_rtx (mode);
11102   while (iterations--)
11103     {
11104       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11105 
11106       if (iterations > 0)
11107 	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11108     }
11109 
11110   if (num != CONST1_RTX (mode))
11111     {
11112       /* As the approximate reciprocal of DEN is already calculated, only
11113 	 calculate the approximate division when NUM is not 1.0.  */
11114       rtx xnum = force_reg (mode, num);
11115       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11116     }
11117 
11118   /* Finalize the approximation.  */
11119   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11120   return true;
11121 }
11122 
11123 /* Return the number of instructions that can be issued per cycle.  */
11124 static int
11125 aarch64_sched_issue_rate (void)
11126 {
11127   return aarch64_tune_params.issue_rate;
11128 }
11129 
11130 static int
11131 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11132 {
11133   int issue_rate = aarch64_sched_issue_rate ();
11134 
11135   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11136 }
11137 
11138 
11139 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11140    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
11141    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
11142 
11143 static int
11144 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11145 						    int ready_index)
11146 {
11147   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11148 }
11149 
11150 
11151 /* Vectorizer cost model target hooks.  */
11152 
11153 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
11154 static int
11155 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11156 				    tree vectype,
11157 				    int misalign ATTRIBUTE_UNUSED)
11158 {
11159   unsigned elements;
11160   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11161   bool fp = false;
11162 
11163   if (vectype != NULL)
11164     fp = FLOAT_TYPE_P (vectype);
11165 
11166   switch (type_of_cost)
11167     {
11168       case scalar_stmt:
11169 	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11170 
11171       case scalar_load:
11172 	return costs->scalar_load_cost;
11173 
11174       case scalar_store:
11175 	return costs->scalar_store_cost;
11176 
11177       case vector_stmt:
11178 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11179 
11180       case vector_load:
11181 	return costs->vec_align_load_cost;
11182 
11183       case vector_store:
11184 	return costs->vec_store_cost;
11185 
11186       case vec_to_scalar:
11187 	return costs->vec_to_scalar_cost;
11188 
11189       case scalar_to_vec:
11190 	return costs->scalar_to_vec_cost;
11191 
11192       case unaligned_load:
11193       case vector_gather_load:
11194 	return costs->vec_unalign_load_cost;
11195 
11196       case unaligned_store:
11197       case vector_scatter_store:
11198 	return costs->vec_unalign_store_cost;
11199 
11200       case cond_branch_taken:
11201 	return costs->cond_taken_branch_cost;
11202 
11203       case cond_branch_not_taken:
11204 	return costs->cond_not_taken_branch_cost;
11205 
11206       case vec_perm:
11207 	return costs->vec_permute_cost;
11208 
11209       case vec_promote_demote:
11210 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11211 
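      /* Building a vector from scalars is costed below at roughly one
	 operation per pair of elements plus one, e.g. a 4-element vector
	 costs 4 / 2 + 1 = 3.  */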
11212       case vec_construct:
11213 	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11214 	return elements / 2 + 1;
11215 
11216       default:
11217 	gcc_unreachable ();
11218     }
11219 }
11220 
11221 /* Implement targetm.vectorize.add_stmt_cost.  */
11222 static unsigned
11223 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11224 		       struct _stmt_vec_info *stmt_info, int misalign,
11225 		       enum vect_cost_model_location where)
11226 {
11227   unsigned *cost = (unsigned *) data;
11228   unsigned retval = 0;
11229 
11230   if (flag_vect_cost_model)
11231     {
11232       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11233       int stmt_cost =
11234 	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11235 
11236       /* Statements in an inner loop relative to the loop being
11237 	 vectorized are weighted more heavily.  The value here is
11238 	 arbitrary and could potentially be improved with analysis.  */
11239       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11240 	count *= 50; /*  FIXME  */
11241 
11242       retval = (unsigned) (count * stmt_cost);
11243       cost[where] += retval;
11244     }
11245 
11246   return retval;
11247 }
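/* For example, with the cost model enabled, a single vector_load statement
   that lies in an inner loop relative to the loop being vectorized adds
   50 * vec_align_load_cost to the vect_body bucket because of the weighting
   above.  */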
11248 
11249 static void initialize_aarch64_code_model (struct gcc_options *);
11250 
11251 /* Parse the TO_PARSE string and put the architecture struct that it
11252    selects into RES and the architectural features into ISA_FLAGS.
11253    Return an aarch64_parse_opt_result describing the parse result.
11254    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11255    When the TO_PARSE string contains an invalid extension,
11256    a copy of the string is created and stored to INVALID_EXTENSION.  */
11257 
11258 static enum aarch64_parse_opt_result
11259 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11260 		    unsigned long *isa_flags, std::string *invalid_extension)
11261 {
11262   const char *ext;
11263   const struct processor *arch;
11264   size_t len;
11265 
11266   ext = strchr (to_parse, '+');
11267 
11268   if (ext != NULL)
11269     len = ext - to_parse;
11270   else
11271     len = strlen (to_parse);
11272 
11273   if (len == 0)
11274     return AARCH64_PARSE_MISSING_ARG;
11275 
11276 
11277   /* Loop through the list of supported ARCHes to find a match.  */
11278   for (arch = all_architectures; arch->name != NULL; arch++)
11279     {
11280       if (strlen (arch->name) == len
11281 	  && strncmp (arch->name, to_parse, len) == 0)
11282 	{
11283 	  unsigned long isa_temp = arch->flags;
11284 
11285 	  if (ext != NULL)
11286 	    {
11287 	      /* TO_PARSE string contains at least one extension.  */
11288 	      enum aarch64_parse_opt_result ext_res
11289 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11290 
11291 	      if (ext_res != AARCH64_PARSE_OK)
11292 		return ext_res;
11293 	    }
11294 	  /* Extension parsing was successful.  Confirm the result
11295 	     arch and ISA flags.  */
11296 	  *res = arch;
11297 	  *isa_flags = isa_temp;
11298 	  return AARCH64_PARSE_OK;
11299 	}
11300     }
11301 
11302   /* ARCH name not found in list.  */
11303   return AARCH64_PARSE_INVALID_ARG;
11304 }
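/* For instance, an -march value such as "armv8.2-a+fp16" (names purely
   illustrative here) is split at the first '+': "armv8.2-a" is matched
   against all_architectures and the remaining "+fp16" is handed to
   aarch64_parse_extension.  */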
11305 
11306 /* Parse the TO_PARSE string and put the CPU that it selects into RES and the
11307    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
11308    describing the parse result.  If there is an error parsing, RES and
11309    ISA_FLAGS are left unchanged.
11310    When the TO_PARSE string contains an invalid extension,
11311    a copy of the string is created and stored to INVALID_EXTENSION.  */
11312 
11313 static enum aarch64_parse_opt_result
11314 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11315 		   unsigned long *isa_flags, std::string *invalid_extension)
11316 {
11317   const char *ext;
11318   const struct processor *cpu;
11319   size_t len;
11320 
11321   ext = strchr (to_parse, '+');
11322 
11323   if (ext != NULL)
11324     len = ext - to_parse;
11325   else
11326     len = strlen (to_parse);
11327 
11328   if (len == 0)
11329     return AARCH64_PARSE_MISSING_ARG;
11330 
11331 
11332   /* Loop through the list of supported CPUs to find a match.  */
11333   for (cpu = all_cores; cpu->name != NULL; cpu++)
11334     {
11335       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11336 	{
11337 	  unsigned long isa_temp = cpu->flags;
11338 
11339 
11340 	  if (ext != NULL)
11341 	    {
11342 	      /* TO_PARSE string contains at least one extension.  */
11343 	      enum aarch64_parse_opt_result ext_res
11344 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11345 
11346 	      if (ext_res != AARCH64_PARSE_OK)
11347 		return ext_res;
11348 	    }
11349 	  /* Extension parsing was successful.  Confirm the result
11350 	     cpu and ISA flags.  */
11351 	  *res = cpu;
11352 	  *isa_flags = isa_temp;
11353 	  return AARCH64_PARSE_OK;
11354 	}
11355     }
11356 
11357   /* CPU name not found in list.  */
11358   return AARCH64_PARSE_INVALID_ARG;
11359 }
11360 
11361 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11362    Return an aarch64_parse_opt_result describing the parse result.
11363    If the parsing fails the RES does not change.  */
11364 
11365 static enum aarch64_parse_opt_result
11366 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11367 {
11368   const struct processor *cpu;
11369 
11370   /* Loop through the list of supported CPUs to find a match.  */
11371   for (cpu = all_cores; cpu->name != NULL; cpu++)
11372     {
11373       if (strcmp (cpu->name, to_parse) == 0)
11374 	{
11375 	  *res = cpu;
11376 	  return AARCH64_PARSE_OK;
11377 	}
11378     }
11379 
11380   /* CPU name not found in list.  */
11381   return AARCH64_PARSE_INVALID_ARG;
11382 }
11383 
11384 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11385    described in FLAG.  If it is, return the index bit for that fusion type.
11386    If not, error (printing OPTION_NAME) and return zero.  */
11387 
11388 static unsigned int
11389 aarch64_parse_one_option_token (const char *token,
11390 				size_t length,
11391 				const struct aarch64_flag_desc *flag,
11392 				const char *option_name)
11393 {
11394   for (; flag->name != NULL; flag++)
11395     {
11396       if (length == strlen (flag->name)
11397 	  && !strncmp (flag->name, token, length))
11398 	return flag->flag;
11399     }
11400 
11401   error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11402   return 0;
11403 }
11404 
11405 /* Parse OPTION which is a comma-separated list of flags to enable.
11406    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11407    default state we inherit from the CPU tuning structures.  OPTION_NAME
11408    gives the top-level option we are parsing in the -moverride string,
11409    for use in error messages.  */
11410 
11411 static unsigned int
11412 aarch64_parse_boolean_options (const char *option,
11413 			       const struct aarch64_flag_desc *flags,
11414 			       unsigned int initial_state,
11415 			       const char *option_name)
11416 {
11417   const char separator = '.';
11418   const char* specs = option;
11419   const char* ntoken = option;
11420   unsigned int found_flags = initial_state;
11421 
11422   while ((ntoken = strchr (specs, separator)))
11423     {
11424       size_t token_length = ntoken - specs;
11425       unsigned token_ops = aarch64_parse_one_option_token (specs,
11426 							   token_length,
11427 							   flags,
11428 							   option_name);
11429       /* If we find "none" (or, for simplicity's sake, an error) anywhere
11430 	 in the token stream, reset the supported operations.  So:
11431 
11432 	   adrp+add.cmp+branch.none.adrp+add
11433 
11434 	   would have the result of turning on only adrp+add fusion.  */
11435       if (!token_ops)
11436 	found_flags = 0;
11437 
11438       found_flags |= token_ops;
11439       specs = ++ntoken;
11440     }
11441 
11442   /* The string ended with a trailing separator; complain.  */
11443   if (!(*specs))
11444     {
11445       error ("%s string ill-formed", option_name);
11446       return 0;
11447     }
11448 
11449   /* We still have one more token to parse.  */
11450   size_t token_length = strlen (specs);
11451   unsigned token_ops = aarch64_parse_one_option_token (specs,
11452 						       token_length,
11453 						       flags,
11454 						       option_name);
11455    if (!token_ops)
11456      found_flags = 0;
11457 
11458   found_flags |= token_ops;
11459   return found_flags;
11460 }
11461 
11462 /* Support for overriding instruction fusion.  */
11463 
11464 static void
11465 aarch64_parse_fuse_string (const char *fuse_string,
11466 			    struct tune_params *tune)
11467 {
11468   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11469 						     aarch64_fusible_pairs,
11470 						     tune->fusible_ops,
11471 						     "fuse=");
11472 }
11473 
11474 /* Support for overriding other tuning flags.  */
11475 
11476 static void
11477 aarch64_parse_tune_string (const char *tune_string,
11478 			    struct tune_params *tune)
11479 {
11480   tune->extra_tuning_flags
11481     = aarch64_parse_boolean_options (tune_string,
11482 				     aarch64_tuning_flags,
11483 				     tune->extra_tuning_flags,
11484 				     "tune=");
11485 }
11486 
11487 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11488    Accept the valid SVE vector widths allowed by
11489    aarch64_sve_vector_bits_enum and use it to override sve_width
11490    in TUNE.  */
11491 
11492 static void
11493 aarch64_parse_sve_width_string (const char *tune_string,
11494 				struct tune_params *tune)
11495 {
11496   int width = -1;
11497 
11498   int n = sscanf (tune_string, "%d", &width);
11499   if (n == EOF)
11500     {
11501       error ("invalid format for sve_width");
11502       return;
11503     }
11504   switch (width)
11505     {
11506     case SVE_128:
11507     case SVE_256:
11508     case SVE_512:
11509     case SVE_1024:
11510     case SVE_2048:
11511       break;
11512     default:
11513       error ("invalid sve_width value: %d", width);
11514     }
11515   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11516 }
11517 
11518 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11519    we understand.  If it is, extract the option string and hand it off to
11520    the appropriate function.  */
11521 
11522 void
11523 aarch64_parse_one_override_token (const char* token,
11524 				  size_t length,
11525 				  struct tune_params *tune)
11526 {
11527   const struct aarch64_tuning_override_function *fn
11528     = aarch64_tuning_override_functions;
11529 
11530   const char *option_part = strchr (token, '=');
11531   if (!option_part)
11532     {
11533       error ("tuning string missing in option (%s)", token);
11534       return;
11535     }
11536 
11537   /* Get the length of the option name.  */
11538   length = option_part - token;
11539   /* Skip the '=' to get to the option string.  */
11540   option_part++;
11541 
11542   for (; fn->name != NULL; fn++)
11543     {
11544       if (!strncmp (fn->name, token, length))
11545 	{
11546 	  fn->parse_override (option_part, tune);
11547 	  return;
11548 	}
11549     }
11550 
11551   error ("unknown tuning option (%s)", token);
11552   return;
11553 }
11554 
11555 /* Validate and clamp the TLS size against the code model in OPTS.  */
11556 
11557 static void
11558 initialize_aarch64_tls_size (struct gcc_options *opts)
11559 {
11560   if (aarch64_tls_size == 0)
11561     aarch64_tls_size = 24;
11562 
11563   switch (opts->x_aarch64_cmodel_var)
11564     {
11565     case AARCH64_CMODEL_TINY:
11566       /* Both the default and maximum TLS size allowed under tiny are 1M, which
11567 	 needs two instructions to address, so we clamp the size to 24.  */
11568       if (aarch64_tls_size > 24)
11569 	aarch64_tls_size = 24;
11570       break;
11571     case AARCH64_CMODEL_SMALL:
11572       /* The maximum TLS size allowed under small is 4G.  */
11573       if (aarch64_tls_size > 32)
11574 	aarch64_tls_size = 32;
11575       break;
11576     case AARCH64_CMODEL_LARGE:
11577       /* The maximum TLS size allowed under large is 16E.
11578 	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
11579       if (aarch64_tls_size > 48)
11580 	aarch64_tls_size = 48;
11581       break;
11582     default:
11583       gcc_unreachable ();
11584     }
11585 
11586   return;
11587 }
11588 
11589 /* Parse STRING looking for options in the format:
11590      string	:: option:string
11591      option	:: name=substring
11592      name	:: {a-z}
11593      substring	:: defined by option.  */
11594 
11595 static void
11596 aarch64_parse_override_string (const char* input_string,
11597 			       struct tune_params* tune)
11598 {
11599   const char separator = ':';
11600   size_t string_length = strlen (input_string) + 1;
11601   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11602   char *string = string_root;
11603   strncpy (string, input_string, string_length);
11604   string[string_length - 1] = '\0';
11605 
11606   char* ntoken = string;
11607 
11608   while ((ntoken = strchr (string, separator)))
11609     {
11610       size_t token_length = ntoken - string;
11611       /* Make this substring look like a string.  */
11612       *ntoken = '\0';
11613       aarch64_parse_one_override_token (string, token_length, tune);
11614       string = ++ntoken;
11615     }
11616 
11617   /* One last option to parse.  */
11618   aarch64_parse_one_override_token (string, strlen (string), tune);
11619   free (string_root);
11620 }
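/* For example, an -moverride string such as
   "fuse=adrp+add.cmp+branch:sve_width=256" is split at each ':' into
   "fuse=adrp+add.cmp+branch" and "sve_width=256", each of which is then
   handled by aarch64_parse_one_override_token.  */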
11621 
11622 
11623 static void
11624 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11625 {
11626   if (accepted_branch_protection_string)
11627     {
11628       opts->x_aarch64_branch_protection_string
11629 	= xstrdup (accepted_branch_protection_string);
11630     }
11631 
11632   /* PR 70044: We have to be careful about being called multiple times for the
11633      same function.  This means all changes should be repeatable.  */
11634 
11635   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11636      Disable the frame pointer flag so the mid-end will not use a frame
11637      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11638      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11639      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
11640   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11641   if (opts->x_flag_omit_frame_pointer == 0)
11642     opts->x_flag_omit_frame_pointer = 2;
11643 
11644   /* If not optimizing for size, set the default
11645      alignment to what the target wants.  */
11646   if (!opts->x_optimize_size)
11647     {
11648       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11649 	opts->x_str_align_loops = aarch64_tune_params.loop_align;
11650       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11651 	opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11652       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11653 	opts->x_str_align_functions = aarch64_tune_params.function_align;
11654     }
11655 
11656   /* We default to no pc-relative literal loads.  */
11657 
11658   aarch64_pcrelative_literal_loads = false;
11659 
11660   /* If -mpc-relative-literal-loads is set on the command line, this
11661      implies that the user asked for PC relative literal loads.  */
11662   if (opts->x_pcrelative_literal_loads == 1)
11663     aarch64_pcrelative_literal_loads = true;
11664 
11665   /* In the tiny memory model it makes no sense to disallow PC relative
11666      literal pool loads.  */
11667   if (aarch64_cmodel == AARCH64_CMODEL_TINY
11668       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11669     aarch64_pcrelative_literal_loads = true;
11670 
11671   /* When enabling the lower precision Newton series for the square root, also
11672      enable it for the reciprocal square root, since the latter is an
11673      intermediary step for the former.  */
11674   if (flag_mlow_precision_sqrt)
11675     flag_mrecip_low_precision_sqrt = true;
11676 }
11677 
11678 /* 'Unpack' the internal tuning structs and update the options
11679     in OPTS.  The caller must have set up selected_tune and selected_arch
11680     as all the other target-specific codegen decisions are
11681     derived from them.  */
11682 
11683 void
11684 aarch64_override_options_internal (struct gcc_options *opts)
11685 {
11686   aarch64_tune_flags = selected_tune->flags;
11687   aarch64_tune = selected_tune->sched_core;
11688   /* Make a copy of the tuning parameters attached to the core, which
11689      we may later overwrite.  */
11690   aarch64_tune_params = *(selected_tune->tune);
11691   aarch64_architecture_version = selected_arch->architecture_version;
11692 
11693   if (opts->x_aarch64_override_tune_string)
11694     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11695 				  &aarch64_tune_params);
11696 
11697   /* This target defaults to strict volatile bitfields.  */
11698   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11699     opts->x_flag_strict_volatile_bitfields = 1;
11700 
11701   if (aarch64_stack_protector_guard == SSP_GLOBAL
11702       && opts->x_aarch64_stack_protector_guard_offset_str)
11703     {
11704       error ("incompatible options %<-mstack-protector-guard=global%> and "
11705 	     "%<-mstack-protector-guard-offset=%s%>",
11706 	     aarch64_stack_protector_guard_offset_str);
11707     }
11708 
11709   if (aarch64_stack_protector_guard == SSP_SYSREG
11710       && !(opts->x_aarch64_stack_protector_guard_offset_str
11711 	   && opts->x_aarch64_stack_protector_guard_reg_str))
11712     {
11713       error ("both %<-mstack-protector-guard-offset%> and "
11714 	     "%<-mstack-protector-guard-reg%> must be used "
11715 	     "with %<-mstack-protector-guard=sysreg%>");
11716     }
11717 
11718   if (opts->x_aarch64_stack_protector_guard_reg_str)
11719     {
11720       if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11721 	  error ("specify a system register with a small string length.");
11722     }
11723 
11724   if (opts->x_aarch64_stack_protector_guard_offset_str)
11725     {
11726       char *end;
11727       const char *str = aarch64_stack_protector_guard_offset_str;
11728       errno = 0;
11729       long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11730       if (!*str || *end || errno)
11731 	error ("%qs is not a valid offset in %qs", str,
11732 	       "-mstack-protector-guard-offset=");
11733       aarch64_stack_protector_guard_offset = offs;
11734     }
11735 
11736   initialize_aarch64_code_model (opts);
11737   initialize_aarch64_tls_size (opts);
11738 
11739   int queue_depth = 0;
11740   switch (aarch64_tune_params.autoprefetcher_model)
11741     {
11742       case tune_params::AUTOPREFETCHER_OFF:
11743 	queue_depth = -1;
11744 	break;
11745       case tune_params::AUTOPREFETCHER_WEAK:
11746 	queue_depth = 0;
11747 	break;
11748       case tune_params::AUTOPREFETCHER_STRONG:
11749 	queue_depth = max_insn_queue_index + 1;
11750 	break;
11751       default:
11752 	gcc_unreachable ();
11753     }
11754 
11755   /* We don't mind passing in global_options_set here as we don't use
11756      the *options_set structs anyway.  */
11757   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11758 			 queue_depth,
11759 			 opts->x_param_values,
11760 			 global_options_set.x_param_values);
11761 
11762   /* Set up parameters to be used in prefetching algorithm.  Do not
11763      override the defaults unless we are tuning for a core we have
11764      researched values for.  */
11765   if (aarch64_tune_params.prefetch->num_slots > 0)
11766     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11767 			   aarch64_tune_params.prefetch->num_slots,
11768 			   opts->x_param_values,
11769 			   global_options_set.x_param_values);
11770   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11771     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11772 			   aarch64_tune_params.prefetch->l1_cache_size,
11773 			   opts->x_param_values,
11774 			   global_options_set.x_param_values);
11775   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11776     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11777 			   aarch64_tune_params.prefetch->l1_cache_line_size,
11778 			   opts->x_param_values,
11779 			   global_options_set.x_param_values);
11780   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11781     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11782 			   aarch64_tune_params.prefetch->l2_cache_size,
11783 			   opts->x_param_values,
11784 			   global_options_set.x_param_values);
11785   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11786     maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11787 			   0,
11788 			   opts->x_param_values,
11789 			   global_options_set.x_param_values);
11790   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11791     maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11792 			   aarch64_tune_params.prefetch->minimum_stride,
11793 			   opts->x_param_values,
11794 			   global_options_set.x_param_values);
11795 
11796   /* Use the alternative scheduling-pressure algorithm by default.  */
11797   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11798 			 opts->x_param_values,
11799 			 global_options_set.x_param_values);
11800 
11801   /* If the user hasn't changed it via configure then set the default to 64 KB
11802      for the backend.  */
11803   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11804 			 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11805 			   ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11806 			 opts->x_param_values,
11807 			 global_options_set.x_param_values);
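  /* The guard size parameter is expressed as a power of two in bytes, so the
     value 16 used above corresponds to the 64 KB mentioned in the comment.  */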
11808 
11809   /* Validate the guard size.  */
11810   int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11811 
11812   /* Enforce that interval is the same size as size so the mid-end does the
11813      right thing.  */
11814   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11815 			 guard_size,
11816 			 opts->x_param_values,
11817 			 global_options_set.x_param_values);
11818 
11819   /* The maybe_set calls won't update the value if the user has explicitly set
11820      one, which means we need to validate that the probing interval and guard
11821      size are equal.  */
11822   int probe_interval
11823     = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11824   if (guard_size != probe_interval)
11825     error ("stack clash guard size %<%d%> must be equal to probing interval "
11826 	   "%<%d%>", guard_size, probe_interval);
11827 
11828   /* Enable software prefetching at the specified optimization level for
11829      CPUs that have prefetch tuning parameters.  Lower the optimization level
11830      threshold by 1 when profiling is enabled.  */
11831   if (opts->x_flag_prefetch_loop_arrays < 0
11832       && !opts->x_optimize_size
11833       && aarch64_tune_params.prefetch->default_opt_level >= 0
11834       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11835     opts->x_flag_prefetch_loop_arrays = 1;
11836 
11837   if (opts->x_aarch64_arch_string == NULL)
11838     opts->x_aarch64_arch_string = selected_arch->name;
11839   if (opts->x_aarch64_cpu_string == NULL)
11840     opts->x_aarch64_cpu_string = selected_cpu->name;
11841   if (opts->x_aarch64_tune_string == NULL)
11842     opts->x_aarch64_tune_string = selected_tune->name;
11843 
11844   aarch64_override_options_after_change_1 (opts);
11845 }
11846 
11847 /* Print a hint with a suggestion for a core or architecture name that
11848    most closely resembles what the user passed in STR.  ARCH is true if
11849    the user is asking for an architecture name.  ARCH is false if the user
11850    is asking for a core name.  */
11851 
11852 static void
11853 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11854 {
11855   auto_vec<const char *> candidates;
11856   const struct processor *entry = arch ? all_architectures : all_cores;
11857   for (; entry->name != NULL; entry++)
11858     candidates.safe_push (entry->name);
11859 
11860 #ifdef HAVE_LOCAL_CPU_DETECT
11861   /* Also add "native" as a possible value.  */
11862   if (arch)
11863     candidates.safe_push ("native");
11864 #endif
11865 
11866   char *s;
11867   const char *hint = candidates_list_and_hint (str, s, candidates);
11868   if (hint)
11869     inform (input_location, "valid arguments are: %s;"
11870 			     " did you mean %qs?", s, hint);
11871   else
11872     inform (input_location, "valid arguments are: %s", s);
11873 
11874   XDELETEVEC (s);
11875 }
11876 
11877 /* Print a hint with a suggestion for a core name that most closely resembles
11878    what the user passed in STR.  */
11879 
11880 inline static void
11881 aarch64_print_hint_for_core (const char *str)
11882 {
11883   aarch64_print_hint_for_core_or_arch (str, false);
11884 }
11885 
11886 /* Print a hint with a suggestion for an architecture name that most closely
11887    resembles what the user passed in STR.  */
11888 
11889 inline static void
11890 aarch64_print_hint_for_arch (const char *str)
11891 {
11892   aarch64_print_hint_for_core_or_arch (str, true);
11893 }
11894 
11895 
11896 /* Print a hint with a suggestion for an extension name
11897    that most closely resembles what the user passed in STR.  */
11898 
11899 void
11900 aarch64_print_hint_for_extensions (const std::string &str)
11901 {
11902   auto_vec<const char *> candidates;
11903   aarch64_get_all_extension_candidates (&candidates);
11904   char *s;
11905   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11906   if (hint)
11907     inform (input_location, "valid arguments are: %s;"
11908 			     " did you mean %qs?", s, hint);
11909   else
11910     inform (input_location, "valid arguments are: %s;", s);
11911 
11912   XDELETEVEC (s);
11913 }
11914 
11915 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
11916    specified in STR and throw errors if appropriate.  Put the results, if
11917    they are valid, in RES and ISA_FLAGS.  Return whether the option is
11918    valid.  */
11919 
11920 static bool
11921 aarch64_validate_mcpu (const char *str, const struct processor **res,
11922 		       unsigned long *isa_flags)
11923 {
11924   std::string invalid_extension;
11925   enum aarch64_parse_opt_result parse_res
11926     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11927 
11928   if (parse_res == AARCH64_PARSE_OK)
11929     return true;
11930 
11931   switch (parse_res)
11932     {
11933       case AARCH64_PARSE_MISSING_ARG:
11934 	error ("missing cpu name in %<-mcpu=%s%>", str);
11935 	break;
11936       case AARCH64_PARSE_INVALID_ARG:
11937 	error ("unknown value %qs for %<-mcpu%>", str);
11938 	aarch64_print_hint_for_core (str);
11939 	break;
11940       case AARCH64_PARSE_INVALID_FEATURE:
11941 	error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11942 	       invalid_extension.c_str (), str);
11943 	aarch64_print_hint_for_extensions (invalid_extension);
11944 	break;
11945       default:
11946 	gcc_unreachable ();
11947     }
11948 
11949   return false;
11950 }
11951 
11952 /* Straight line speculation indicators.  */
11953 enum aarch64_sls_hardening_type
11954 {
11955   SLS_NONE = 0,
11956   SLS_RETBR = 1,
11957   SLS_BLR = 2,
11958   SLS_ALL = 3,
11959 };
11960 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
11961 
11962 /* Return whether we should mitigate Straight Line Speculation for the RET
11963    and BR instructions.  */
11964 bool
11965 aarch64_harden_sls_retbr_p (void)
11966 {
11967   return aarch64_sls_hardening & SLS_RETBR;
11968 }
11969 
11970 /* Return whether we should mitigate Straight Line Speculation for the BLR
11971    instruction.  */
11972 bool
11973 aarch64_harden_sls_blr_p (void)
11974 {
11975   return aarch64_sls_hardening & SLS_BLR;
11976 }
11977 
11978 /* For now we only allow setting these options globally; in the future we may
11979    allow setting them per function.  */
11980 static void
11981 aarch64_validate_sls_mitigation (const char *const_str)
11982 {
11983   char *token_save = NULL;
11984   char *str = NULL;
11985 
11986   if (strcmp (const_str, "none") == 0)
11987     {
11988       aarch64_sls_hardening = SLS_NONE;
11989       return;
11990     }
11991   if (strcmp (const_str, "all") == 0)
11992     {
11993       aarch64_sls_hardening = SLS_ALL;
11994       return;
11995     }
11996 
11997   char *str_root = xstrdup (const_str);
11998   str = strtok_r (str_root, ",", &token_save);
11999   if (!str)
12000     error ("invalid argument given to %<-mharden-sls=%>");
12001 
12002   int temp = SLS_NONE;
12003   while (str)
12004     {
12005       if (strcmp (str, "blr") == 0)
12006 	temp |= SLS_BLR;
12007       else if (strcmp (str, "retbr") == 0)
12008 	temp |= SLS_RETBR;
12009       else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
12010 	{
12011 	  error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
12012 	  break;
12013 	}
12014       else
12015 	{
12016 	  error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
12017 	  break;
12018 	}
12019       str = strtok_r (NULL, ",", &token_save);
12020     }
12021   aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
12022   free (str_root);
12023 }
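/* For example, "-mharden-sls=retbr,blr" enables both mitigations and is
   equivalent to "-mharden-sls=all", whereas "none" and "all" are only
   accepted on their own.  */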
12024 
12025 /* Parses CONST_STR for branch protection features specified in
12026    aarch64_branch_protect_types, and set any global variables required.  Returns
12027    aarch64_branch_protect_types, and sets any global variables required.  Returns
12028    CONST_STR so that it can be used for error reporting.  */
12029 
12030 static enum
12031 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12032 							  char** last_str)
12033 {
12034   char *str_root = xstrdup (const_str);
12035   char* token_save = NULL;
12036   char *str = strtok_r (str_root, "+", &token_save);
12037   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12038   if (!str)
12039     res = AARCH64_PARSE_MISSING_ARG;
12040   else
12041     {
12042       char *next_str = strtok_r (NULL, "+", &token_save);
12043       /* Reset the branch protection features to their defaults.  */
12044       aarch64_handle_no_branch_protection (NULL, NULL);
12045 
12046       while (str && res == AARCH64_PARSE_OK)
12047 	{
12048 	  const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12049 	  bool found = false;
12050 	  /* Search for this type.  */
12051 	  while (type && type->name && !found && res == AARCH64_PARSE_OK)
12052 	    {
12053 	      if (strcmp (str, type->name) == 0)
12054 		{
12055 		  found = true;
12056 		  res = type->handler (str, next_str);
12057 		  str = next_str;
12058 		  next_str = strtok_r (NULL, "+", &token_save);
12059 		}
12060 	      else
12061 		type++;
12062 	    }
12063 	  if (found && res == AARCH64_PARSE_OK)
12064 	    {
12065 	      bool found_subtype = true;
12066 	      /* Loop through each token until we find one that isn't a
12067 		 subtype.  */
12068 	      while (found_subtype)
12069 		{
12070 		  found_subtype = false;
12071 		  const aarch64_branch_protect_type *subtype = type->subtypes;
12072 		  /* Search for the subtype.  */
12073 		  while (str && subtype && subtype->name && !found_subtype
12074 			  && res == AARCH64_PARSE_OK)
12075 		    {
12076 		      if (strcmp (str, subtype->name) == 0)
12077 			{
12078 			  found_subtype = true;
12079 			  res = subtype->handler (str, next_str);
12080 			  str = next_str;
12081 			  next_str = strtok_r (NULL, "+", &token_save);
12082 			}
12083 		      else
12084 			subtype++;
12085 		    }
12086 		}
12087 	    }
12088 	  else if (!found)
12089 	    res = AARCH64_PARSE_INVALID_ARG;
12090 	}
12091     }
12092   /* Copy the last processed token into the argument to pass it back.
12093     Used by option and attribute validation to print the offending token.  */
12094   if (last_str)
12095     {
12096       if (str) strcpy (*last_str, str);
12097       else *last_str = NULL;
12098     }
12099   if (res == AARCH64_PARSE_OK)
12100     {
12101       /* If needed, alloc the accepted string then copy in const_str.
12102 	Used by override_option_after_change_1.  */
12103       if (!accepted_branch_protection_string)
12104 	accepted_branch_protection_string = (char *) xmalloc (
12105 						      BRANCH_PROTECT_STR_MAX
12106 							+ 1);
12107       strncpy (accepted_branch_protection_string, const_str,
12108 		BRANCH_PROTECT_STR_MAX + 1);
12109       /* Forcibly null-terminate.  */
12110       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12111     }
12112   return res;
12113 }
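/* As an illustration (assuming the type table contains a "pac-ret" entry with
   a "leaf" subtype and a separate "bti" entry), a string such as
   "pac-ret+leaf+bti" is split at each '+': "pac-ret" is matched as a type,
   "leaf" as one of its subtypes, and "bti" as the next type.  */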
12114 
12115 static bool
12116 aarch64_validate_mbranch_protection (const char *const_str)
12117 {
12118   char *str = (char *) xmalloc (strlen (const_str) + 1);
12119   enum aarch64_parse_opt_result res =
12120     aarch64_parse_branch_protection (const_str, &str);
12121   if (res == AARCH64_PARSE_INVALID_ARG)
12122     error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
12123   else if (res == AARCH64_PARSE_MISSING_ARG)
12124     error ("missing arg for %<-mbranch-protection=%>");
12125   free (str);
12126   return res == AARCH64_PARSE_OK;
12127 }
12128 
12129 /* Validate a command-line -march option.  Parse the arch and extensions
12130    (if any) specified in STR and throw errors if appropriate.  Put the
12131    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
12132    option is valid.  */
12133 
12134 static bool
12135 aarch64_validate_march (const char *str, const struct processor **res,
12136 			 unsigned long *isa_flags)
12137 {
12138   std::string invalid_extension;
12139   enum aarch64_parse_opt_result parse_res
12140     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12141 
12142   if (parse_res == AARCH64_PARSE_OK)
12143     return true;
12144 
12145   switch (parse_res)
12146     {
12147       case AARCH64_PARSE_MISSING_ARG:
12148 	error ("missing arch name in %<-march=%s%>", str);
12149 	break;
12150       case AARCH64_PARSE_INVALID_ARG:
12151 	error ("unknown value %qs for %<-march%>", str);
12152 	aarch64_print_hint_for_arch (str);
12153 	break;
12154       case AARCH64_PARSE_INVALID_FEATURE:
12155 	error ("invalid feature modifier %qs in %<-march=%s%>",
12156 	       invalid_extension.c_str (), str);
12157 	aarch64_print_hint_for_extensions (invalid_extension);
12158 	break;
12159       default:
12160 	gcc_unreachable ();
12161     }
12162 
12163   return false;
12164 }
12165 
12166 /* Validate a command-line -mtune option.  Parse the cpu
12167    specified in STR and throw errors if appropriate.  Put the
12168    result, if it is valid, in RES.  Return whether the option is
12169    valid.  */
12170 
12171 static bool
12172 aarch64_validate_mtune (const char *str, const struct processor **res)
12173 {
12174   enum aarch64_parse_opt_result parse_res
12175     = aarch64_parse_tune (str, res);
12176 
12177   if (parse_res == AARCH64_PARSE_OK)
12178     return true;
12179 
12180   switch (parse_res)
12181     {
12182       case AARCH64_PARSE_MISSING_ARG:
12183 	error ("missing cpu name in %<-mtune=%s%>", str);
12184 	break;
12185       case AARCH64_PARSE_INVALID_ARG:
12186 	error ("unknown value %qs for %<-mtune%>", str);
12187 	aarch64_print_hint_for_core (str);
12188 	break;
12189       default:
12190 	gcc_unreachable ();
12191     }
12192   return false;
12193 }
12194 
12195 /* Return the CPU corresponding to the enum CPU.
12196    If it doesn't specify a cpu, return the default.  */
12197 
12198 static const struct processor *
12199 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12200 {
12201   if (cpu != aarch64_none)
12202     return &all_cores[cpu];
12203 
12204   /* The & 0x3f is to extract the bottom 6 bits that encode the
12205      default cpu as selected by the --with-cpu GCC configure option
12206      in config.gcc.
12207      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12208      flags mechanism should be reworked to make it more sane.  */
12209   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12210 }
12211 
12212 /* Return the architecture corresponding to the enum ARCH.
12213    If it doesn't specify a valid architecture, return the default.  */
12214 
12215 static const struct processor *
12216 aarch64_get_arch (enum aarch64_arch arch)
12217 {
12218   if (arch != aarch64_no_arch)
12219     return &all_architectures[arch];
12220 
12221   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12222 
12223   return &all_architectures[cpu->arch];
12224 }
12225 
12226 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
12227 
12228 static poly_uint16
12229 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12230 {
12231   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12232      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12233      deciding which .md file patterns to use and when deciding whether
12234      something is a legitimate address or constant.  */
12235   if (value == SVE_SCALABLE || value == SVE_128)
12236     return poly_uint16 (2, 2);
12237   else
12238     return (int) value / 64;
12239 }
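/* For fixed widths above 128 the conversion is simply bits / 64, e.g.
   -msve-vector-bits=256 gives VG = 4 and -msve-vector-bits=512 gives VG = 8,
   while SVE_SCALABLE and SVE_128 map to the runtime-variable poly_uint16
   (2, 2), i.e. a minimum of two 64-bit granules.  */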
12240 
12241 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
12242    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12243    tuning structs.  In particular it must set selected_tune and
12244    aarch64_isa_flags that define the available ISA features and tuning
12245    decisions.  It must also set selected_arch as this will be used to
12246    output the .arch asm tags for each function.  */
12247 
12248 static void
12249 aarch64_override_options (void)
12250 {
12251   unsigned long cpu_isa = 0;
12252   unsigned long arch_isa = 0;
12253   aarch64_isa_flags = 0;
12254 
12255   bool valid_cpu = true;
12256   bool valid_tune = true;
12257   bool valid_arch = true;
12258 
12259   selected_cpu = NULL;
12260   selected_arch = NULL;
12261   selected_tune = NULL;
12262 
12263   if (aarch64_harden_sls_string)
12264     aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
12265 
12266   if (aarch64_branch_protection_string)
12267     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12268 
12269   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12270      If either of -march or -mtune is given, they override their
12271      respective component of -mcpu.  */
12272   if (aarch64_cpu_string)
12273     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12274 					&cpu_isa);
12275 
12276   if (aarch64_arch_string)
12277     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12278 					  &arch_isa);
12279 
12280   if (aarch64_tune_string)
12281     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12282 
12283 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12284   SUBTARGET_OVERRIDE_OPTIONS;
12285 #endif
12286 
12287   /* If the user did not specify a processor, choose the default
12288      one for them.  This will be the CPU set during configuration using
12289      --with-cpu, otherwise it is "generic".  */
12290   if (!selected_cpu)
12291     {
12292       if (selected_arch)
12293 	{
12294 	  selected_cpu = &all_cores[selected_arch->ident];
12295 	  aarch64_isa_flags = arch_isa;
12296 	  explicit_arch = selected_arch->arch;
12297 	}
12298       else
12299 	{
12300 	  /* Get default configure-time CPU.  */
12301 	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12302 	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12303 	}
12304 
12305       if (selected_tune)
12306 	explicit_tune_core = selected_tune->ident;
12307     }
12308   /* If both -mcpu and -march are specified check that they are architecturally
12309      compatible, warn if they're not and prefer the -march ISA flags.  */
12310   else if (selected_arch)
12311     {
12312       if (selected_arch->arch != selected_cpu->arch)
12313 	{
12314 	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12315 		       all_architectures[selected_cpu->arch].name,
12316 		       selected_arch->name);
12317 	}
12318       aarch64_isa_flags = arch_isa;
12319       explicit_arch = selected_arch->arch;
12320       explicit_tune_core = selected_tune ? selected_tune->ident
12321 					  : selected_cpu->ident;
12322     }
12323   else
12324     {
12325       /* -mcpu but no -march.  */
12326       aarch64_isa_flags = cpu_isa;
12327       explicit_tune_core = selected_tune ? selected_tune->ident
12328 					  : selected_cpu->ident;
12329       gcc_assert (selected_cpu);
12330       selected_arch = &all_architectures[selected_cpu->arch];
12331       explicit_arch = selected_arch->arch;
12332     }
12333 
12334   /* Set the arch as well, as we will need it when outputting
12335      the .arch directive in assembly.  */
12336   if (!selected_arch)
12337     {
12338       gcc_assert (selected_cpu);
12339       selected_arch = &all_architectures[selected_cpu->arch];
12340     }
12341 
12342   if (!selected_tune)
12343     selected_tune = selected_cpu;
12344 
12345   if (aarch64_enable_bti == 2)
12346     {
12347 #ifdef TARGET_ENABLE_BTI
12348       aarch64_enable_bti = 1;
12349 #else
12350       aarch64_enable_bti = 0;
12351 #endif
12352     }
12353 
12354   /* Return address signing is currently not supported for ILP32 targets.  For
12355      LP64 targets use the configured option in the absence of a command-line
12356      option for -mbranch-protection.  */
12357   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12358     {
12359 #ifdef TARGET_ENABLE_PAC_RET
12360       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12361 #else
12362       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12363 #endif
12364     }
12365 
12366 #ifndef HAVE_AS_MABI_OPTION
12367   /* The compiler may have been configured with 2.23.* binutils, which does
12368      not have support for ILP32.  */
12369   if (TARGET_ILP32)
12370     error ("assembler does not support %<-mabi=ilp32%>");
12371 #endif
12372 
12373   /* Convert -msve-vector-bits to a VG count.  */
12374   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12375 
12376   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12377     sorry ("return address signing is only supported for %<-mabi=lp64%>");
12378 
12379   /* Make sure we properly set up the explicit options.  */
12380   if ((aarch64_cpu_string && valid_cpu)
12381        || (aarch64_tune_string && valid_tune))
12382     gcc_assert (explicit_tune_core != aarch64_none);
12383 
12384   if ((aarch64_cpu_string && valid_cpu)
12385        || (aarch64_arch_string && valid_arch))
12386     gcc_assert (explicit_arch != aarch64_no_arch);
12387 
12388   /* The pass to insert speculation tracking runs before
12389      shrink-wrapping and the latter does not know how to update the
12390      tracking status.  So disable it in this case.  */
12391   if (aarch64_track_speculation)
12392     flag_shrink_wrap = 0;
12393 
12394   aarch64_override_options_internal (&global_options);
12395 
12396   /* Save these options as the default ones in case we push and pop them later
12397      while processing functions with potential target attributes.  */
12398   target_option_default_node = target_option_current_node
12399       = build_target_option_node (&global_options);
12400 }
12401 
12402 /* Implement targetm.override_options_after_change.  */
12403 
12404 static void
12405 aarch64_override_options_after_change (void)
12406 {
12407   aarch64_override_options_after_change_1 (&global_options);
12408 }
12409 
12410 static struct machine_function *
12411 aarch64_init_machine_status (void)
12412 {
12413   struct machine_function *machine;
12414   machine = ggc_cleared_alloc<machine_function> ();
12415   return machine;
12416 }
12417 
12418 void
12419 aarch64_init_expanders (void)
12420 {
12421   init_machine_status = aarch64_init_machine_status;
12422 }
12423 
12424 /* Set aarch64_cmodel from the code model and PIC options in OPTS.  */
12425 static void
12426 initialize_aarch64_code_model (struct gcc_options *opts)
12427 {
12428    if (opts->x_flag_pic)
12429      {
12430        switch (opts->x_aarch64_cmodel_var)
12431 	 {
12432 	 case AARCH64_CMODEL_TINY:
12433 	   aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12434 	   break;
12435 	 case AARCH64_CMODEL_SMALL:
12436 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12437 	   aarch64_cmodel = (flag_pic == 2
12438 			     ? AARCH64_CMODEL_SMALL_PIC
12439 			     : AARCH64_CMODEL_SMALL_SPIC);
12440 #else
12441 	   aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12442 #endif
12443 	   break;
12444 	 case AARCH64_CMODEL_LARGE:
12445 	   sorry ("code model %qs with %<-f%s%>", "large",
12446 		  opts->x_flag_pic > 1 ? "PIC" : "pic");
12447 	   break;
12448 	 default:
12449 	   gcc_unreachable ();
12450 	 }
12451      }
12452    else
12453      aarch64_cmodel = opts->x_aarch64_cmodel_var;
12454 }
12455 
12456 /* Implement TARGET_OPTION_SAVE.  */
12457 
12458 static void
12459 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12460 {
12461   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12462   ptr->x_aarch64_branch_protection_string
12463     = opts->x_aarch64_branch_protection_string;
12464 }
12465 
12466 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
12467    using the information saved in PTR.  */
12468 
12469 static void
12470 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12471 {
12472   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12473   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12474   opts->x_explicit_arch = ptr->x_explicit_arch;
12475   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12476   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12477   opts->x_aarch64_branch_protection_string
12478     = ptr->x_aarch64_branch_protection_string;
12479   if (opts->x_aarch64_branch_protection_string)
12480     {
12481       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12482 					NULL);
12483     }
12484 
12485   aarch64_override_options_internal (opts);
12486 }
12487 
12488 /* Implement TARGET_OPTION_PRINT.  */
12489 
12490 static void
12491 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12492 {
12493   const struct processor *cpu
12494     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12495   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12496   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12497   std::string extension
12498     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12499 
12500   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12501   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12502 	   arch->name, extension.c_str ());
12503 }
12504 
12505 static GTY(()) tree aarch64_previous_fndecl;
12506 
12507 void
12508 aarch64_reset_previous_fndecl (void)
12509 {
12510   aarch64_previous_fndecl = NULL;
12511 }
12512 
12513 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12514    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12515    make sure optab availability predicates are recomputed when necessary.  */
12516 
12517 void
12518 aarch64_save_restore_target_globals (tree new_tree)
12519 {
12520   if (TREE_TARGET_GLOBALS (new_tree))
12521     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12522   else if (new_tree == target_option_default_node)
12523     restore_target_globals (&default_target_globals);
12524   else
12525     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12526 }
12527 
12528 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
12529    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12530    of the function, if such exists.  This function may be called multiple
12531    times on a single function so use aarch64_previous_fndecl to avoid
12532    setting up identical state.  */
12533 
12534 static void
12535 aarch64_set_current_function (tree fndecl)
12536 {
12537   if (!fndecl || fndecl == aarch64_previous_fndecl)
12538     return;
12539 
12540   tree old_tree = (aarch64_previous_fndecl
12541 		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12542 		   : NULL_TREE);
12543 
12544   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12545 
12546   /* If current function has no attributes but the previous one did,
12547      use the default node.  */
12548   if (!new_tree && old_tree)
12549     new_tree = target_option_default_node;
12550 
12551   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
12552      the default have been handled by aarch64_save_restore_target_globals from
12553      aarch64_pragma_target_parse.  */
12554   if (old_tree == new_tree)
12555     return;
12556 
12557   aarch64_previous_fndecl = fndecl;
12558 
12559   /* First set the target options.  */
12560   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12561 
12562   aarch64_save_restore_target_globals (new_tree);
12563 }
12564 
12565 /* Enum describing the various ways we can handle attributes.
12566    In many cases we can reuse the generic option handling machinery.  */
12567 
12568 enum aarch64_attr_opt_type
12569 {
12570   aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
12571   aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
12572   aarch64_attr_enum,	/* Attribute sets an enum variable.  */
12573   aarch64_attr_custom	/* Attribute requires a custom handling function.  */
12574 };
12575 
12576 /* All the information needed to handle a target attribute.
12577    NAME is the name of the attribute.
12578    ATTR_TYPE specifies the type of behavior of the attribute as described
12579    in the definition of enum aarch64_attr_opt_type.
12580    ALLOW_NEG is true if the attribute supports a "no-" form.
12581    HANDLER is the function that takes the attribute string as an argument.
12582    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12583    OPT_NUM is the enum specifying the option that the attribute modifies.
12584    This is needed for attributes that mirror the behavior of a command-line
12585    option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12586    aarch64_attr_enum.  */
12587 
12588 struct aarch64_attribute_info
12589 {
12590   const char *name;
12591   enum aarch64_attr_opt_type attr_type;
12592   bool allow_neg;
12593   bool (*handler) (const char *);
12594   enum opt_code opt_num;
12595 };
12596 
12597 /* Handle the ARCH_STR argument to the arch= target attribute.  */
12598 
12599 static bool
12600 aarch64_handle_attr_arch (const char *str)
12601 {
12602   const struct processor *tmp_arch = NULL;
12603   std::string invalid_extension;
12604   enum aarch64_parse_opt_result parse_res
12605     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12606 
12607   if (parse_res == AARCH64_PARSE_OK)
12608     {
12609       gcc_assert (tmp_arch);
12610       selected_arch = tmp_arch;
12611       explicit_arch = selected_arch->arch;
12612       return true;
12613     }
12614 
12615   switch (parse_res)
12616     {
12617       case AARCH64_PARSE_MISSING_ARG:
12618 	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12619 	break;
12620       case AARCH64_PARSE_INVALID_ARG:
12621 	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12622 	aarch64_print_hint_for_arch (str);
12623 	break;
12624       case AARCH64_PARSE_INVALID_FEATURE:
12625 	error ("invalid feature modifier %s of value (\"%s\") in "
12626 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12627 	aarch64_print_hint_for_extensions (invalid_extension);
12628 	break;
12629       default:
12630 	gcc_unreachable ();
12631     }
12632 
12633   return false;
12634 }
12635 
12636 /* Handle the argument CPU_STR to the cpu= target attribute.  */
12637 
12638 static bool
12639 aarch64_handle_attr_cpu (const char *str)
12640 {
12641   const struct processor *tmp_cpu = NULL;
12642   std::string invalid_extension;
12643   enum aarch64_parse_opt_result parse_res
12644     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12645 
12646   if (parse_res == AARCH64_PARSE_OK)
12647     {
12648       gcc_assert (tmp_cpu);
12649       selected_tune = tmp_cpu;
12650       explicit_tune_core = selected_tune->ident;
12651 
12652       selected_arch = &all_architectures[tmp_cpu->arch];
12653       explicit_arch = selected_arch->arch;
12654       return true;
12655     }
12656 
12657   switch (parse_res)
12658     {
12659       case AARCH64_PARSE_MISSING_ARG:
12660 	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12661 	break;
12662       case AARCH64_PARSE_INVALID_ARG:
12663 	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12664 	aarch64_print_hint_for_core (str);
12665 	break;
12666       case AARCH64_PARSE_INVALID_FEATURE:
12667 	error ("invalid feature modifier %s of value (\"%s\") in "
12668 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12669 	aarch64_print_hint_for_extensions (invalid_extension);
12670 	break;
12671       default:
12672 	gcc_unreachable ();
12673     }
12674 
12675   return false;
12676 }
12677 
12678 /* Handle the argument STR to the branch-protection= attribute.  */
12679 
12680  static bool
12681  aarch64_handle_attr_branch_protection (const char* str)
12682  {
12683   char *err_str = (char *) xmalloc (strlen (str) + 1);
12684   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12685 								      &err_str);
12686   bool success = false;
12687   switch (res)
12688     {
12689      case AARCH64_PARSE_MISSING_ARG:
12690        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12691 	      " attribute");
12692        break;
12693      case AARCH64_PARSE_INVALID_ARG:
12694        error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12695 	      "=\")%> pragma or attribute", err_str);
12696        break;
12697      case AARCH64_PARSE_OK:
12698        success = true;
12699       /* Fall through.  */
12700      case AARCH64_PARSE_INVALID_FEATURE:
12701        break;
12702      default:
12703        gcc_unreachable ();
12704     }
12705   free (err_str);
12706   return success;
12707  }
12708 
12709 /* Handle the argument STR to the tune= target attribute.  */
12710 
12711 static bool
12712 aarch64_handle_attr_tune (const char *str)
12713 {
12714   const struct processor *tmp_tune = NULL;
12715   enum aarch64_parse_opt_result parse_res
12716     = aarch64_parse_tune (str, &tmp_tune);
12717 
12718   if (parse_res == AARCH64_PARSE_OK)
12719     {
12720       gcc_assert (tmp_tune);
12721       selected_tune = tmp_tune;
12722       explicit_tune_core = selected_tune->ident;
12723       return true;
12724     }
12725 
12726   switch (parse_res)
12727     {
12728       case AARCH64_PARSE_INVALID_ARG:
12729 	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12730 	aarch64_print_hint_for_core (str);
12731 	break;
12732       default:
12733 	gcc_unreachable ();
12734     }
12735 
12736   return false;
12737 }
12738 
12739 /* Parse an architecture extensions target attribute string specified in STR.
12740    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
12741    if successful.  Update aarch64_isa_flags to reflect the ISA features
12742    modified.  */
12743 
12744 static bool
12745 aarch64_handle_attr_isa_flags (char *str)
12746 {
12747   enum aarch64_parse_opt_result parse_res;
12748   unsigned long isa_flags = aarch64_isa_flags;
12749 
12750   /* We allow "+nothing" in the beginning to clear out all architectural
12751      features if the user wants to handpick specific features.  */
12752   if (strncmp ("+nothing", str, 8) == 0)
12753     {
12754       isa_flags = 0;
12755       str += 8;
12756     }
12757 
12758   std::string invalid_extension;
12759   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12760 
12761   if (parse_res == AARCH64_PARSE_OK)
12762     {
12763       aarch64_isa_flags = isa_flags;
12764       return true;
12765     }
12766 
12767   switch (parse_res)
12768     {
12769       case AARCH64_PARSE_MISSING_ARG:
12770 	error ("missing value in %<target()%> pragma or attribute");
12771 	break;
12772 
12773       case AARCH64_PARSE_INVALID_FEATURE:
12774 	error ("invalid feature modifier %s of value (\"%s\") in "
12775 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12776 	break;
12777 
12778       default:
12779 	gcc_unreachable ();
12780     }
12781 
12782  return false;
12783 }
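
/* For example (illustrative), under the rules above

     __attribute__ ((target ("+nothing+fp"))) void only_fp (void);

   first clears all ISA feature bits and then re-enables only the FP
   extension (plus whatever the extension table marks as implied by it),
   whereas a plain "+crc" merely adds CRC on top of the flags already in
   effect.  */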
12784 
12785 /* The target attributes that we support.  On top of these we also support just
12786    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
12787    handled explicitly in aarch64_process_one_target_attr.  */
12788 
12789 static const struct aarch64_attribute_info aarch64_attributes[] =
12790 {
12791   { "general-regs-only", aarch64_attr_mask, false, NULL,
12792      OPT_mgeneral_regs_only },
12793   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12794      OPT_mfix_cortex_a53_835769 },
12795   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12796      OPT_mfix_cortex_a53_843419 },
12797   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12798   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12799   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12800      OPT_momit_leaf_frame_pointer },
12801   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12802   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12803      OPT_march_ },
12804   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12805   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12806      OPT_mtune_ },
12807   { "branch-protection", aarch64_attr_custom, false,
12808      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12809   { "sign-return-address", aarch64_attr_enum, false, NULL,
12810      OPT_msign_return_address_ },
12811   { "outline-atomics", aarch64_attr_bool, true, NULL,
12812      OPT_moutline_atomics},
12813   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12814 };
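
/* Illustrative examples of how the table above maps to user-level
   attributes (the function names here are placeholders, not part of GCC):

     __attribute__ ((target ("arch=armv8.2-a+crc")))          /* aarch64_attr_custom */
     void f1 (void);
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  /* aarch64_attr_bool */
     void f2 (void);
     __attribute__ ((target ("cmodel=small")))                /* aarch64_attr_enum */
     void f3 (void);
     __attribute__ ((target ("general-regs-only")))           /* aarch64_attr_mask */
     void f4 (void);

   Each string is tokenized and dispatched by
   aarch64_process_one_target_attr below.  */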
12815 
12816 /* Parse ARG_STR which contains the definition of one target attribute.
12817    Show appropriate errors if any, and return true if the attribute is valid.  */
12818 
12819 static bool
12820 aarch64_process_one_target_attr (char *arg_str)
12821 {
12822   bool invert = false;
12823 
12824   size_t len = strlen (arg_str);
12825 
12826   if (len == 0)
12827     {
12828       error ("malformed %<target()%> pragma or attribute");
12829       return false;
12830     }
12831 
12832   char *str_to_check = (char *) alloca (len + 1);
12833   strcpy (str_to_check, arg_str);
12834 
12835   /* Skip leading whitespace.  */
12836   while (*str_to_check == ' ' || *str_to_check == '\t')
12837     str_to_check++;
12838 
12839   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12840      It is easier to detect and handle it explicitly here rather than going
12841      through the machinery for the rest of the target attributes in this
12842      function.  */
12843   if (*str_to_check == '+')
12844     return aarch64_handle_attr_isa_flags (str_to_check);
12845 
12846   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12847     {
12848       invert = true;
12849       str_to_check += 3;
12850     }
12851   char *arg = strchr (str_to_check, '=');
12852 
12853   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12854      and point ARG to "foo".  */
12855   if (arg)
12856     {
12857       *arg = '\0';
12858       arg++;
12859     }
12860   const struct aarch64_attribute_info *p_attr;
12861   bool found = false;
12862   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12863     {
12864       /* If the names don't match up, or the user has given an argument
12865 	 to an attribute that doesn't accept one, or didn't give an argument
12866 	 to an attribute that expects one, fail to match.  */
12867       if (strcmp (str_to_check, p_attr->name) != 0)
12868 	continue;
12869 
12870       found = true;
12871       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12872 			      || p_attr->attr_type == aarch64_attr_enum;
12873 
12874       if (attr_need_arg_p ^ (arg != NULL))
12875 	{
12876 	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12877 	  return false;
12878 	}
12879 
12880       /* If the name matches but the attribute does not allow "no-" versions
12881 	 then we can't match.  */
12882       if (invert && !p_attr->allow_neg)
12883 	{
12884 	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12885 	  return false;
12886 	}
12887 
12888       switch (p_attr->attr_type)
12889 	{
12890 	/* Has a custom handler registered.
12891 	   For example, cpu=, arch=, tune=.  */
12892 	  case aarch64_attr_custom:
12893 	    gcc_assert (p_attr->handler);
12894 	    if (!p_attr->handler (arg))
12895 	      return false;
12896 	    break;
12897 
12898 	  /* Either set or unset a boolean option.  */
12899 	  case aarch64_attr_bool:
12900 	    {
12901 	      struct cl_decoded_option decoded;
12902 
12903 	      generate_option (p_attr->opt_num, NULL, !invert,
12904 			       CL_TARGET, &decoded);
12905 	      aarch64_handle_option (&global_options, &global_options_set,
12906 				      &decoded, input_location);
12907 	      break;
12908 	    }
12909 	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
12910 	     should know what mask to apply given the option number.  */
12911 	  case aarch64_attr_mask:
12912 	    {
12913 	      struct cl_decoded_option decoded;
12914 	      /* We only need to specify the option number.
12915 		 aarch64_handle_option will know which mask to apply.  */
12916 	      decoded.opt_index = p_attr->opt_num;
12917 	      decoded.value = !invert;
12918 	      aarch64_handle_option (&global_options, &global_options_set,
12919 				      &decoded, input_location);
12920 	      break;
12921 	    }
12922 	  /* Use the option setting machinery to set an option to an enum.  */
12923 	  case aarch64_attr_enum:
12924 	    {
12925 	      gcc_assert (arg);
12926 	      bool valid;
12927 	      int value;
12928 	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12929 					      &value, CL_TARGET);
12930 	      if (valid)
12931 		{
12932 		  set_option (&global_options, NULL, p_attr->opt_num, value,
12933 			      NULL, DK_UNSPECIFIED, input_location,
12934 			      global_dc);
12935 		}
12936 	      else
12937 		{
12938 		  error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12939 		}
12940 	      break;
12941 	    }
12942 	  default:
12943 	    gcc_unreachable ();
12944 	}
12945     }
12946 
12947   /* If we reached here we either have found an attribute and validated
12948      it or didn't match any.  If we matched an attribute but its arguments
12949      were malformed we will have returned false already.  */
12950   return found;
12951 }
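
/* Worked example (illustrative): for the attribute string
   "no-fix-cortex-a53-835769" the code above strips the "no-" prefix
   (INVERT becomes true), finds no '=' so ARG stays NULL, matches the
   "fix-cortex-a53-835769" entry (aarch64_attr_bool, ALLOW_NEG true) and
   passes the decoded option to aarch64_handle_option with value !INVERT,
   i.e. 0.  */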
12952 
12953 /* Count how many times the character C appears in
12954    NULL-terminated string STR.  */
12955 
12956 static unsigned int
12957 num_occurences_in_str (char c, char *str)
12958 {
12959   unsigned int res = 0;
12960   while (*str != '\0')
12961     {
12962       if (*str == c)
12963 	res++;
12964 
12965       str++;
12966     }
12967 
12968   return res;
12969 }
12970 
12971 /* Parse the tree in ARGS that contains the target attribute information
12972    and update the global target options space.  */
12973 
12974 bool
12975 aarch64_process_target_attr (tree args)
12976 {
12977   if (TREE_CODE (args) == TREE_LIST)
12978     {
12979       do
12980 	{
12981 	  tree head = TREE_VALUE (args);
12982 	  if (head)
12983 	    {
12984 	      if (!aarch64_process_target_attr (head))
12985 		return false;
12986 	    }
12987 	  args = TREE_CHAIN (args);
12988 	} while (args);
12989 
12990       return true;
12991     }
12992 
12993   if (TREE_CODE (args) != STRING_CST)
12994     {
12995       error ("attribute %<target%> argument not a string");
12996       return false;
12997     }
12998 
12999   size_t len = strlen (TREE_STRING_POINTER (args));
13000   char *str_to_check = (char *) alloca (len + 1);
13001   strcpy (str_to_check, TREE_STRING_POINTER (args));
13002 
13003   if (len == 0)
13004     {
13005       error ("malformed %<target()%> pragma or attribute");
13006       return false;
13007     }
13008 
13009   /* Used to catch empty entries between commas, i.e.
13010      attribute ((target ("attr1,,attr2"))).  */
13011   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13012 
13013   /* Handle multiple target attributes separated by ','.  */
13014   char *token = strtok_r (str_to_check, ",", &str_to_check);
13015 
13016   unsigned int num_attrs = 0;
13017   while (token)
13018     {
13019       num_attrs++;
13020       if (!aarch64_process_one_target_attr (token))
13021 	{
13022 	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13023 	  return false;
13024 	}
13025 
13026       token = strtok_r (NULL, ",", &str_to_check);
13027     }
13028 
13029   if (num_attrs != num_commas + 1)
13030     {
13031       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13032       return false;
13033     }
13034 
13035   return true;
13036 }
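
/* For example (illustrative), "arch=armv8-a,,strict-align" yields two
   strtok_r tokens but contains two commas, so NUM_ATTRS (2) differs from
   NUM_COMMAS + 1 (3) and the empty entry is diagnosed as a malformed
   attribute rather than being silently ignored.  */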
13037 
13038 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
13039    process attribute ((target ("..."))).  */
13040 
13041 static bool
13042 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13043 {
13044   struct cl_target_option cur_target;
13045   bool ret;
13046   tree old_optimize;
13047   tree new_target, new_optimize;
13048   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13049 
13050   /* If what we're processing is the current pragma string then the
13051      target option node is already stored in target_option_current_node
13052      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
13053      having to re-parse the string.  This is especially useful to keep
13054      arm_neon.h compile times down since that header contains a lot
13055      of intrinsics enclosed in pragmas.  */
13056   if (!existing_target && args == current_target_pragma)
13057     {
13058       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13059       return true;
13060     }
13061   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13062 
13063   old_optimize = build_optimization_node (&global_options);
13064   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13065 
13066   /* If the function changed the optimization levels as well as setting
13067      target options, start with the optimizations specified.  */
13068   if (func_optimize && func_optimize != old_optimize)
13069     cl_optimization_restore (&global_options,
13070 			     TREE_OPTIMIZATION (func_optimize));
13071 
13072   /* Save the current target options to restore at the end.  */
13073   cl_target_option_save (&cur_target, &global_options);
13074 
13075   /* If fndecl already has some target attributes applied to it, unpack
13076      them so that we add this attribute on top of them, rather than
13077      overwriting them.  */
13078   if (existing_target)
13079     {
13080       struct cl_target_option *existing_options
13081 	= TREE_TARGET_OPTION (existing_target);
13082 
13083       if (existing_options)
13084 	cl_target_option_restore (&global_options, existing_options);
13085     }
13086   else
13087     cl_target_option_restore (&global_options,
13088 			TREE_TARGET_OPTION (target_option_current_node));
13089 
13090   ret = aarch64_process_target_attr (args);
13091 
13092   /* Set up any additional state.  */
13093   if (ret)
13094     {
13095       aarch64_override_options_internal (&global_options);
13096       /* Initialize SIMD builtins if we haven't already.
13097 	 Set current_target_pragma to NULL for the duration so that
13098 	 the builtin initialization code doesn't try to tag the functions
13099 	 being built with the attributes specified by any current pragma, thus
13100 	 going into an infinite recursion.  */
13101       if (TARGET_SIMD)
13102 	{
13103 	  tree saved_current_target_pragma = current_target_pragma;
13104 	  current_target_pragma = NULL;
13105 	  aarch64_init_simd_builtins ();
13106 	  current_target_pragma = saved_current_target_pragma;
13107 	}
13108       new_target = build_target_option_node (&global_options);
13109     }
13110   else
13111     new_target = NULL;
13112 
13113   new_optimize = build_optimization_node (&global_options);
13114 
13115   if (fndecl && ret)
13116     {
13117       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13118 
13119       if (old_optimize != new_optimize)
13120 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13121     }
13122 
13123   cl_target_option_restore (&global_options, &cur_target);
13124 
13125   if (old_optimize != new_optimize)
13126     cl_optimization_restore (&global_options,
13127 			     TREE_OPTIMIZATION (old_optimize));
13128   return ret;
13129 }
13130 
13131 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
13132    tri-bool options (yes, no, don't care) and the default value is
13133    DEF, determine whether to reject inlining.  */
13134 
13135 static bool
13136 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13137 				     int dont_care, int def)
13138 {
13139   /* If the callee doesn't care, always allow inlining.  */
13140   if (callee == dont_care)
13141     return true;
13142 
13143   /* If the caller doesn't care, always allow inlining.  */
13144   if (caller == dont_care)
13145     return true;
13146 
13147   /* Otherwise, allow inlining if either the callee and caller values
13148      agree, or if the callee is using the default value.  */
13149   return (callee == caller || callee == def);
13150 }
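
/* For instance (illustrative), with DONT_CARE == 2 and DEF == 1:
     caller 1, callee 2 -> true   (callee doesn't care)
     caller 2, callee 0 -> true   (caller doesn't care)
     caller 0, callee 0 -> true   (values agree)
     caller 0, callee 1 -> true   (callee uses the default)
     caller 1, callee 0 -> false.  */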
13151 
13152 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
13153    to inline CALLEE into CALLER based on target-specific info.
13154    Make sure that the caller and callee have compatible architectural
13155    features.  Then go through the other possible target attributes
13156    and see if they can block inlining.  Try not to reject always_inline
13157    callees unless they are incompatible architecturally.  */
13158 
13159 static bool
13160 aarch64_can_inline_p (tree caller, tree callee)
13161 {
13162   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13163   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13164 
13165   struct cl_target_option *caller_opts
13166 	= TREE_TARGET_OPTION (caller_tree ? caller_tree
13167 					   : target_option_default_node);
13168 
13169   struct cl_target_option *callee_opts
13170 	= TREE_TARGET_OPTION (callee_tree ? callee_tree
13171 					   : target_option_default_node);
13172 
13173   /* Callee's ISA flags should be a subset of the caller's.  */
13174   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13175        != callee_opts->x_aarch64_isa_flags)
13176     return false;
13177 
13178   /* Allow non-strict aligned functions inlining into strict
13179      aligned ones.  */
13180   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13181        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13182       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13183 	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13184     return false;
13185 
13186   bool always_inline = lookup_attribute ("always_inline",
13187 					  DECL_ATTRIBUTES (callee));
13188 
13189   /* If the architectural features match up and the callee is always_inline
13190      then the other attributes don't matter.  */
13191   if (always_inline)
13192     return true;
13193 
13194   if (caller_opts->x_aarch64_cmodel_var
13195       != callee_opts->x_aarch64_cmodel_var)
13196     return false;
13197 
13198   if (caller_opts->x_aarch64_tls_dialect
13199       != callee_opts->x_aarch64_tls_dialect)
13200     return false;
13201 
13202   /* Honour explicit requests to workaround errata.  */
13203   if (!aarch64_tribools_ok_for_inlining_p (
13204 	  caller_opts->x_aarch64_fix_a53_err835769,
13205 	  callee_opts->x_aarch64_fix_a53_err835769,
13206 	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13207     return false;
13208 
13209   if (!aarch64_tribools_ok_for_inlining_p (
13210 	  caller_opts->x_aarch64_fix_a53_err843419,
13211 	  callee_opts->x_aarch64_fix_a53_err843419,
13212 	  2, TARGET_FIX_ERR_A53_843419))
13213     return false;
13214 
13215   /* If the user explicitly specified -momit-leaf-frame-pointer for the
13216      caller and callee and they don't match up, reject inlining.  */
13217   if (!aarch64_tribools_ok_for_inlining_p (
13218 	  caller_opts->x_flag_omit_leaf_frame_pointer,
13219 	  callee_opts->x_flag_omit_leaf_frame_pointer,
13220 	  2, 1))
13221     return false;
13222 
13223   /* If the callee has specific tuning overrides, respect them.  */
13224   if (callee_opts->x_aarch64_override_tune_string != NULL
13225       && caller_opts->x_aarch64_override_tune_string == NULL)
13226     return false;
13227 
13228   /* If the user specified tuning override strings for the
13229      caller and callee and they don't match up, reject inlining.
13230      We just do a string compare here, we don't analyze the meaning
13231      of the string, as it would be too costly for little gain.  */
13232   if (callee_opts->x_aarch64_override_tune_string
13233       && caller_opts->x_aarch64_override_tune_string
13234       && (strcmp (callee_opts->x_aarch64_override_tune_string,
13235 		  caller_opts->x_aarch64_override_tune_string) != 0))
13236     return false;
13237 
13238   return true;
13239 }
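
/* Example of the ISA-subset rule above (illustrative): a callee compiled
   with "+crypto" cannot be inlined into a plain -march=armv8-a caller,
   since the callee's crypto feature bits are not a subset of the caller's
   ISA flags; the reverse direction (caller has crypto, callee does not)
   is allowed.  */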
13240 
13241 /* Return true if SYMBOL_REF X binds locally.  */
13242 
13243 static bool
13244 aarch64_symbol_binds_local_p (const_rtx x)
13245 {
13246   return (SYMBOL_REF_DECL (x)
13247 	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13248 	  : SYMBOL_REF_LOCAL_P (x));
13249 }
13250 
13251 /* Return true if SYMBOL_REF X is thread-local.  */
13252 static bool
13253 aarch64_tls_symbol_p (rtx x)
13254 {
13255   if (! TARGET_HAVE_TLS)
13256     return false;
13257 
13258   if (GET_CODE (x) != SYMBOL_REF)
13259     return false;
13260 
13261   return SYMBOL_REF_TLS_MODEL (x) != 0;
13262 }
13263 
13264 /* Classify a TLS symbol into one of the TLS kinds.  */
13265 enum aarch64_symbol_type
13266 aarch64_classify_tls_symbol (rtx x)
13267 {
13268   enum tls_model tls_kind = tls_symbolic_operand_type (x);
13269 
13270   switch (tls_kind)
13271     {
13272     case TLS_MODEL_GLOBAL_DYNAMIC:
13273     case TLS_MODEL_LOCAL_DYNAMIC:
13274       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13275 
13276     case TLS_MODEL_INITIAL_EXEC:
13277       switch (aarch64_cmodel)
13278 	{
13279 	case AARCH64_CMODEL_TINY:
13280 	case AARCH64_CMODEL_TINY_PIC:
13281 	  return SYMBOL_TINY_TLSIE;
13282 	default:
13283 	  return SYMBOL_SMALL_TLSIE;
13284 	}
13285 
13286     case TLS_MODEL_LOCAL_EXEC:
13287       if (aarch64_tls_size == 12)
13288 	return SYMBOL_TLSLE12;
13289       else if (aarch64_tls_size == 24)
13290 	return SYMBOL_TLSLE24;
13291       else if (aarch64_tls_size == 32)
13292 	return SYMBOL_TLSLE32;
13293       else if (aarch64_tls_size == 48)
13294 	return SYMBOL_TLSLE48;
13295       else
13296 	gcc_unreachable ();
13297 
13298     case TLS_MODEL_EMULATED:
13299     case TLS_MODEL_NONE:
13300       return SYMBOL_FORCE_TO_MEM;
13301 
13302     default:
13303       gcc_unreachable ();
13304     }
13305 }
13306 
13307 /* Return the correct method for accessing X + OFFSET, where X is either
13308    a SYMBOL_REF or LABEL_REF.  */
13309 
13310 enum aarch64_symbol_type
13311 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13312 {
13313   if (GET_CODE (x) == LABEL_REF)
13314     {
13315       switch (aarch64_cmodel)
13316 	{
13317 	case AARCH64_CMODEL_LARGE:
13318 	  return SYMBOL_FORCE_TO_MEM;
13319 
13320 	case AARCH64_CMODEL_TINY_PIC:
13321 	case AARCH64_CMODEL_TINY:
13322 	  return SYMBOL_TINY_ABSOLUTE;
13323 
13324 	case AARCH64_CMODEL_SMALL_SPIC:
13325 	case AARCH64_CMODEL_SMALL_PIC:
13326 	case AARCH64_CMODEL_SMALL:
13327 	  return SYMBOL_SMALL_ABSOLUTE;
13328 
13329 	default:
13330 	  gcc_unreachable ();
13331 	}
13332     }
13333 
13334   if (GET_CODE (x) == SYMBOL_REF)
13335     {
13336       if (aarch64_tls_symbol_p (x))
13337 	return aarch64_classify_tls_symbol (x);
13338 
13339       switch (aarch64_cmodel)
13340 	{
13341 	case AARCH64_CMODEL_TINY:
13342 	  /* When we retrieve symbol + offset address, we have to make sure
13343 	     the offset does not cause overflow of the final address.  But
13344 	     we have no way of knowing the address of symbol at compile time
13345 	     so we can't accurately say if the distance between the PC and
13346 	     symbol + offset is outside the addressable range of +/-1MB in the
13347 	     TINY code model.  So we limit the maximum offset to +/-64KB and
13348 	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
13349 	     If offset_within_block_p is true we allow larger offsets.
13350 	     Furthermore force to memory if the symbol is a weak reference to
13351 	     something that doesn't resolve to a symbol in this module.  */
13352 
13353 	  if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
13354 	    return SYMBOL_FORCE_TO_MEM;
13355 	  if (!(IN_RANGE (offset, -0x10000, 0x10000)
13356 		|| offset_within_block_p (x, offset)))
13357 	    return SYMBOL_FORCE_TO_MEM;
13358 
13359 	  return SYMBOL_TINY_ABSOLUTE;
13360 
13361 	case AARCH64_CMODEL_SMALL:
13362 	  /* Same reasoning as the tiny code model, but the offset cap here is
13363 	     1MB, allowing +/-3.9GB for the offset to the symbol.  */
13364 
13365 	  if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
13366 	    return SYMBOL_FORCE_TO_MEM;
13367 	  if (!(IN_RANGE (offset, -0x100000, 0x100000)
13368 		|| offset_within_block_p (x, offset)))
13369 	    return SYMBOL_FORCE_TO_MEM;
13370 
13371 	  return SYMBOL_SMALL_ABSOLUTE;
13372 
13373 	case AARCH64_CMODEL_TINY_PIC:
13374 	  if (!aarch64_symbol_binds_local_p (x))
13375 	    return SYMBOL_TINY_GOT;
13376 	  return SYMBOL_TINY_ABSOLUTE;
13377 
13378 	case AARCH64_CMODEL_SMALL_SPIC:
13379 	case AARCH64_CMODEL_SMALL_PIC:
13380 	  if (!aarch64_symbol_binds_local_p (x))
13381 	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13382 		    ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13383 	  return SYMBOL_SMALL_ABSOLUTE;
13384 
13385 	case AARCH64_CMODEL_LARGE:
13386 	  /* This is alright even in PIC code as the constant
13387 	     pool reference is always PC relative and within
13388 	     the same translation unit.  */
13389 	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13390 	    return SYMBOL_SMALL_ABSOLUTE;
13391 	  else
13392 	    return SYMBOL_FORCE_TO_MEM;
13393 
13394 	default:
13395 	  gcc_unreachable ();
13396 	}
13397     }
13398 
13399   /* By default push everything into the constant pool.  */
13400   return SYMBOL_FORCE_TO_MEM;
13401 }
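
/* Concrete example of the offset rules above (illustrative): in the small
   code model a SYMBOL_REF with a byte offset of 0x80000 (512KB) is still
   classified as SYMBOL_SMALL_ABSOLUTE because it is within the +/-1MB cap,
   whereas an offset of 0x200000 (2MB) is forced to memory unless
   offset_within_block_p can prove the address stays inside the symbol's
   own block.  */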
13402 
13403 bool
13404 aarch64_constant_address_p (rtx x)
13405 {
13406   return (CONSTANT_P (x) && memory_address_p (DImode, x));
13407 }
13408 
13409 bool
13410 aarch64_legitimate_pic_operand_p (rtx x)
13411 {
13412   if (GET_CODE (x) == SYMBOL_REF
13413       || (GET_CODE (x) == CONST
13414 	  && GET_CODE (XEXP (x, 0)) == PLUS
13415 	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13416      return false;
13417 
13418   return true;
13419 }
13420 
13421 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
13422    that should be rematerialized rather than spilled.  */
13423 
13424 static bool
13425 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13426 {
13427   /* Support CSE and rematerialization of common constants.  */
13428   if (CONST_INT_P (x)
13429       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13430       || GET_CODE (x) == CONST_VECTOR)
13431     return true;
13432 
13433   /* Do not allow vector struct mode constants for Advanced SIMD.
13434      We could support 0 and -1 easily, but they need support in
13435      aarch64-simd.md.  */
13436   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13437   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13438     return false;
13439 
13440   /* Only accept variable-length vector constants if they can be
13441      handled directly.
13442 
13443      ??? It would be possible to handle rematerialization of other
13444      constants via secondary reloads.  */
13445   if (vec_flags & VEC_ANY_SVE)
13446     return aarch64_simd_valid_immediate (x, NULL);
13447 
13448   if (GET_CODE (x) == HIGH)
13449     x = XEXP (x, 0);
13450 
13451   /* Accept polynomial constants that can be calculated by using the
13452      destination of a move as the sole temporary.  Constants that
13453      require a second temporary cannot be rematerialized (they can't be
13454      forced to memory and also aren't legitimate constants).  */
13455   poly_int64 offset;
13456   if (poly_int_rtx_p (x, &offset))
13457     return aarch64_offset_temporaries (false, offset) <= 1;
13458 
13459   /* If an offset is being added to something else, we need to allow the
13460      base to be moved into the destination register, meaning that there
13461      are no free temporaries for the offset.  */
13462   x = strip_offset (x, &offset);
13463   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13464     return false;
13465 
13466   /* Do not allow const (plus (anchor_symbol, const_int)).  */
13467   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13468     return false;
13469 
13470   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
13471      so spilling them is better than rematerialization.  */
13472   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13473     return true;
13474 
13475   /* Label references are always constant.  */
13476   if (GET_CODE (x) == LABEL_REF)
13477     return true;
13478 
13479   return false;
13480 }
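
/* For example (illustrative): (const_int 0x123456789) and a non-TLS
   (symbol_ref "foo") are treated as rematerializable here, while a TLS
   symbol, or a section-anchor symbol with a nonzero offset such as
   (const (plus (symbol_ref anchor) (const_int 8))), is rejected and will
   therefore be spilled.  */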
13481 
13482 rtx
13483 aarch64_load_tp (rtx target)
13484 {
13485   if (!target
13486       || GET_MODE (target) != Pmode
13487       || !register_operand (target, Pmode))
13488     target = gen_reg_rtx (Pmode);
13489 
13490   /* Can return in any reg.  */
13491   emit_insn (gen_aarch64_load_tp_hard (target));
13492   return target;
13493 }
13494 
13495 /* On AAPCS systems, this is the "struct __va_list".  */
13496 static GTY(()) tree va_list_type;
13497 
13498 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13499    Return the type to use as __builtin_va_list.
13500 
13501    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13502 
13503    struct __va_list
13504    {
13505      void *__stack;
13506      void *__gr_top;
13507      void *__vr_top;
13508      int   __gr_offs;
13509      int   __vr_offs;
13510    };  */
13511 
13512 static tree
13513 aarch64_build_builtin_va_list (void)
13514 {
13515   tree va_list_name;
13516   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13517 
13518   /* Create the type.  */
13519   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13520   /* Give it the required name.  */
13521   va_list_name = build_decl (BUILTINS_LOCATION,
13522 			     TYPE_DECL,
13523 			     get_identifier ("__va_list"),
13524 			     va_list_type);
13525   DECL_ARTIFICIAL (va_list_name) = 1;
13526   TYPE_NAME (va_list_type) = va_list_name;
13527   TYPE_STUB_DECL (va_list_type) = va_list_name;
13528 
13529   /* Create the fields.  */
13530   f_stack = build_decl (BUILTINS_LOCATION,
13531 			FIELD_DECL, get_identifier ("__stack"),
13532 			ptr_type_node);
13533   f_grtop = build_decl (BUILTINS_LOCATION,
13534 			FIELD_DECL, get_identifier ("__gr_top"),
13535 			ptr_type_node);
13536   f_vrtop = build_decl (BUILTINS_LOCATION,
13537 			FIELD_DECL, get_identifier ("__vr_top"),
13538 			ptr_type_node);
13539   f_groff = build_decl (BUILTINS_LOCATION,
13540 			FIELD_DECL, get_identifier ("__gr_offs"),
13541 			integer_type_node);
13542   f_vroff = build_decl (BUILTINS_LOCATION,
13543 			FIELD_DECL, get_identifier ("__vr_offs"),
13544 			integer_type_node);
13545 
13546   /* Tell tree-stdarg pass about our internal offset fields.
13547      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13548      purposes, to identify whether the code is updating the va_list internal
13549      offset fields in an irregular way.  */
13550   va_list_gpr_counter_field = f_groff;
13551   va_list_fpr_counter_field = f_vroff;
13552 
13553   DECL_ARTIFICIAL (f_stack) = 1;
13554   DECL_ARTIFICIAL (f_grtop) = 1;
13555   DECL_ARTIFICIAL (f_vrtop) = 1;
13556   DECL_ARTIFICIAL (f_groff) = 1;
13557   DECL_ARTIFICIAL (f_vroff) = 1;
13558 
13559   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13560   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13561   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13562   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13563   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13564 
13565   TYPE_FIELDS (va_list_type) = f_stack;
13566   DECL_CHAIN (f_stack) = f_grtop;
13567   DECL_CHAIN (f_grtop) = f_vrtop;
13568   DECL_CHAIN (f_vrtop) = f_groff;
13569   DECL_CHAIN (f_groff) = f_vroff;
13570 
13571   /* Compute its layout.  */
13572   layout_type (va_list_type);
13573 
13574   return va_list_type;
13575 }
13576 
13577 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
13578 static void
13579 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13580 {
13581   const CUMULATIVE_ARGS *cum;
13582   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13583   tree stack, grtop, vrtop, groff, vroff;
13584   tree t;
13585   int gr_save_area_size = cfun->va_list_gpr_size;
13586   int vr_save_area_size = cfun->va_list_fpr_size;
13587   int vr_offset;
13588 
13589   cum = &crtl->args.info;
13590   if (cfun->va_list_gpr_size)
13591     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13592 			     cfun->va_list_gpr_size);
13593   if (cfun->va_list_fpr_size)
13594     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13595 			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
13596 
13597   if (!TARGET_FLOAT)
13598     {
13599       gcc_assert (cum->aapcs_nvrn == 0);
13600       vr_save_area_size = 0;
13601     }
13602 
13603   f_stack = TYPE_FIELDS (va_list_type_node);
13604   f_grtop = DECL_CHAIN (f_stack);
13605   f_vrtop = DECL_CHAIN (f_grtop);
13606   f_groff = DECL_CHAIN (f_vrtop);
13607   f_vroff = DECL_CHAIN (f_groff);
13608 
13609   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13610 		  NULL_TREE);
13611   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13612 		  NULL_TREE);
13613   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13614 		  NULL_TREE);
13615   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13616 		  NULL_TREE);
13617   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13618 		  NULL_TREE);
13619 
13620   /* Emit code to initialize STACK, which points to the next varargs stack
13621      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
13622      by named arguments.  STACK is 8-byte aligned.  */
13623   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13624   if (cum->aapcs_stack_size > 0)
13625     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13626   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13627   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13628 
13629   /* Emit code to initialize GRTOP, the top of the GR save area.
13630      virtual_incoming_args_rtx should have been 16 byte aligned.  */
13631   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13632   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13633   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13634 
13635   /* Emit code to initialize VRTOP, the top of the VR save area.
13636      This address is gr_save_area_bytes below GRTOP, rounded
13637      down to the next 16-byte boundary.  */
13638   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13639   vr_offset = ROUND_UP (gr_save_area_size,
13640 			STACK_BOUNDARY / BITS_PER_UNIT);
13641 
13642   if (vr_offset)
13643     t = fold_build_pointer_plus_hwi (t, -vr_offset);
13644   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13645   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13646 
13647   /* Emit code to initialize GROFF, the offset from GRTOP of the
13648      next GPR argument.  */
13649   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13650 	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13651   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13652 
13653   /* Likewise emit code to initialize VROFF, the offset from FTOP
13654      of the next VR argument.  */
13655   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13656 	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13657   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13658 }
13659 
13660 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
13661 
13662 static tree
13663 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13664 			      gimple_seq *post_p ATTRIBUTE_UNUSED)
13665 {
13666   tree addr;
13667   bool indirect_p;
13668   bool is_ha;		/* is HFA or HVA.  */
13669   bool dw_align;	/* double-word align.  */
13670   machine_mode ag_mode = VOIDmode;
13671   int nregs;
13672   machine_mode mode;
13673 
13674   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13675   tree stack, f_top, f_off, off, arg, roundup, on_stack;
13676   HOST_WIDE_INT size, rsize, adjust, align;
13677   tree t, u, cond1, cond2;
13678 
13679   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13680   if (indirect_p)
13681     type = build_pointer_type (type);
13682 
13683   mode = TYPE_MODE (type);
13684 
13685   f_stack = TYPE_FIELDS (va_list_type_node);
13686   f_grtop = DECL_CHAIN (f_stack);
13687   f_vrtop = DECL_CHAIN (f_grtop);
13688   f_groff = DECL_CHAIN (f_vrtop);
13689   f_vroff = DECL_CHAIN (f_groff);
13690 
13691   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13692 		  f_stack, NULL_TREE);
13693   size = int_size_in_bytes (type);
13694 
13695   bool abi_break;
13696   align
13697     = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13698 
13699   dw_align = false;
13700   adjust = 0;
13701   if (aarch64_vfp_is_call_or_return_candidate (mode,
13702 					       type,
13703 					       &ag_mode,
13704 					       &nregs,
13705 					       &is_ha))
13706     {
13707       /* No frontends can create types with variable-sized modes, so we
13708 	 shouldn't be asked to pass or return them.  */
13709       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13710 
13711       /* TYPE passed in fp/simd registers.  */
13712       if (!TARGET_FLOAT)
13713 	aarch64_err_no_fpadvsimd (mode);
13714 
13715       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13716 		      unshare_expr (valist), f_vrtop, NULL_TREE);
13717       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13718 		      unshare_expr (valist), f_vroff, NULL_TREE);
13719 
13720       rsize = nregs * UNITS_PER_VREG;
13721 
13722       if (is_ha)
13723 	{
13724 	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13725 	    adjust = UNITS_PER_VREG - ag_size;
13726 	}
13727       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13728 	       && size < UNITS_PER_VREG)
13729 	{
13730 	  adjust = UNITS_PER_VREG - size;
13731 	}
13732     }
13733   else
13734     {
13735       /* TYPE passed in general registers.  */
13736       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13737 		      unshare_expr (valist), f_grtop, NULL_TREE);
13738       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13739 		      unshare_expr (valist), f_groff, NULL_TREE);
13740       rsize = ROUND_UP (size, UNITS_PER_WORD);
13741       nregs = rsize / UNITS_PER_WORD;
13742 
13743       if (align > 8)
13744 	{
13745 	  if (abi_break && warn_psabi)
13746 	    inform (input_location, "parameter passing for argument of type "
13747 		    "%qT changed in GCC 9.1", type);
13748 	  dw_align = true;
13749 	}
13750 
13751       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13752 	  && size < UNITS_PER_WORD)
13753 	{
13754 	  adjust = UNITS_PER_WORD  - size;
13755 	}
13756     }
13757 
13758   /* Get a local temporary for the field value.  */
13759   off = get_initialized_tmp_var (f_off, pre_p, NULL);
13760 
13761   /* Emit code to branch if off >= 0.  */
13762   t = build2 (GE_EXPR, boolean_type_node, off,
13763 	      build_int_cst (TREE_TYPE (off), 0));
13764   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13765 
13766   if (dw_align)
13767     {
13768       /* Emit: offs = (offs + 15) & -16.  */
13769       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13770 		  build_int_cst (TREE_TYPE (off), 15));
13771       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13772 		  build_int_cst (TREE_TYPE (off), -16));
13773       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13774     }
13775   else
13776     roundup = NULL;
13777 
13778   /* Update ap.__[g|v]r_offs  */
13779   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13780 	      build_int_cst (TREE_TYPE (off), rsize));
13781   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13782 
13783   /* String up.  */
13784   if (roundup)
13785     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13786 
13787   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
13788   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13789 	      build_int_cst (TREE_TYPE (f_off), 0));
13790   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13791 
13792   /* String up: make sure the assignment happens before the use.  */
13793   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13794   COND_EXPR_ELSE (cond1) = t;
13795 
13796   /* Prepare the trees handling the argument that is passed on the stack;
13797      the top level node will store in ON_STACK.  */
13798   arg = get_initialized_tmp_var (stack, pre_p, NULL);
13799   if (align > 8)
13800     {
13801       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
13802       t = fold_build_pointer_plus_hwi (arg, 15);
13803       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13804 		  build_int_cst (TREE_TYPE (t), -16));
13805       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13806     }
13807   else
13808     roundup = NULL;
13809   /* Advance ap.__stack  */
13810   t = fold_build_pointer_plus_hwi (arg, size + 7);
13811   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13812 	      build_int_cst (TREE_TYPE (t), -8));
13813   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13814   /* String up roundup and advance.  */
13815   if (roundup)
13816     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13817   /* String up with ARG.  */
13818   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13819   /* Big-endianness related address adjustment.  */
13820   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13821       && size < UNITS_PER_WORD)
13822   {
13823     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13824 		size_int (UNITS_PER_WORD - size));
13825     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13826   }
13827 
13828   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13829   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13830 
13831   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
13832   t = off;
13833   if (adjust)
13834     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13835 		build_int_cst (TREE_TYPE (off), adjust));
13836 
13837   t = fold_convert (sizetype, t);
13838   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13839 
13840   if (is_ha)
13841     {
13842       /* type ha; // treat as "struct {ftype field[n];}"
13843          ... [computing offs]
13844          for (i = 0; i <nregs; ++i, offs += 16)
13845 	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13846 	 return ha;  */
13847       int i;
13848       tree tmp_ha, field_t, field_ptr_t;
13849 
13850       /* Declare a local variable.  */
13851       tmp_ha = create_tmp_var_raw (type, "ha");
13852       gimple_add_tmp_var (tmp_ha);
13853 
13854       /* Establish the base type.  */
13855       switch (ag_mode)
13856 	{
13857 	case E_SFmode:
13858 	  field_t = float_type_node;
13859 	  field_ptr_t = float_ptr_type_node;
13860 	  break;
13861 	case E_DFmode:
13862 	  field_t = double_type_node;
13863 	  field_ptr_t = double_ptr_type_node;
13864 	  break;
13865 	case E_TFmode:
13866 	  field_t = long_double_type_node;
13867 	  field_ptr_t = long_double_ptr_type_node;
13868 	  break;
13869 	case E_HFmode:
13870 	  field_t = aarch64_fp16_type_node;
13871 	  field_ptr_t = aarch64_fp16_ptr_type_node;
13872 	  break;
13873 	case E_V2SImode:
13874 	case E_V4SImode:
13875 	    {
13876 	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13877 	      field_t = build_vector_type_for_mode (innertype, ag_mode);
13878 	      field_ptr_t = build_pointer_type (field_t);
13879 	    }
13880 	  break;
13881 	default:
13882 	  gcc_assert (0);
13883 	}
13884 
13885       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
13886       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13887       addr = t;
13888       t = fold_convert (field_ptr_t, addr);
13889       t = build2 (MODIFY_EXPR, field_t,
13890 		  build1 (INDIRECT_REF, field_t, tmp_ha),
13891 		  build1 (INDIRECT_REF, field_t, t));
13892 
13893       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
13894       for (i = 1; i < nregs; ++i)
13895 	{
13896 	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13897 	  u = fold_convert (field_ptr_t, addr);
13898 	  u = build2 (MODIFY_EXPR, field_t,
13899 		      build2 (MEM_REF, field_t, tmp_ha,
13900 			      build_int_cst (field_ptr_t,
13901 					     (i *
13902 					      int_size_in_bytes (field_t)))),
13903 		      build1 (INDIRECT_REF, field_t, u));
13904 	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13905 	}
13906 
13907       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13908       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13909     }
13910 
13911   COND_EXPR_ELSE (cond2) = t;
13912   addr = fold_convert (build_pointer_type (type), cond1);
13913   addr = build_va_arg_indirect_ref (addr);
13914 
13915   if (indirect_p)
13916     addr = build_va_arg_indirect_ref (addr);
13917 
13918   return addr;
13919 }
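
/* Roughly (an illustrative sketch of the GP-register case on little-endian,
   ignoring the double-word alignment path), the gimple built above
   corresponds to:

     off = ap->__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap->__gr_offs = off + rsize;
     if (ap->__gr_offs > 0)
       goto on_stack;
     addr = ap->__gr_top + off;
     goto done;
   on_stack:
     addr = ap->__stack;
     ap->__stack = (char *) (((uintptr_t) addr + size + 7) & -8);
   done:
     result = *(type *) addr;  */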
13920 
13921 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
13922 
13923 static void
13924 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13925 				tree type, int *pretend_size ATTRIBUTE_UNUSED,
13926 				int no_rtl)
13927 {
13928   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13929   CUMULATIVE_ARGS local_cum;
13930   int gr_saved = cfun->va_list_gpr_size;
13931   int vr_saved = cfun->va_list_fpr_size;
13932 
13933   /* The caller has advanced CUM up to, but not beyond, the last named
13934      argument.  Advance a local copy of CUM past the last "real" named
13935      argument, to find out how many registers are left over.  */
13936   local_cum = *cum;
13937   aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13938 
13939   /* Find out how many registers we need to save.
13940      Honor tree-stdarg analysis results.  */
13941   if (cfun->va_list_gpr_size)
13942     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13943 		    cfun->va_list_gpr_size / UNITS_PER_WORD);
13944   if (cfun->va_list_fpr_size)
13945     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13946 		    cfun->va_list_fpr_size / UNITS_PER_VREG);
13947 
13948   if (!TARGET_FLOAT)
13949     {
13950       gcc_assert (local_cum.aapcs_nvrn == 0);
13951       vr_saved = 0;
13952     }
13953 
13954   if (!no_rtl)
13955     {
13956       if (gr_saved > 0)
13957 	{
13958 	  rtx ptr, mem;
13959 
13960 	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
13961 	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13962 			       - gr_saved * UNITS_PER_WORD);
13963 	  mem = gen_frame_mem (BLKmode, ptr);
13964 	  set_mem_alias_set (mem, get_varargs_alias_set ());
13965 
13966 	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13967 			       mem, gr_saved);
13968 	}
13969       if (vr_saved > 0)
13970 	{
13971 	  /* We can't use move_block_from_reg, because it will use
13972 	     the wrong mode, storing D regs only.  */
13973 	  machine_mode mode = TImode;
13974 	  int off, i, vr_start;
13975 
13976 	  /* Set OFF to the offset from virtual_incoming_args_rtx of
13977 	     the first vector register.  The VR save area lies below
13978 	     the GR one, and is aligned to 16 bytes.  */
13979 	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13980 			   STACK_BOUNDARY / BITS_PER_UNIT);
13981 	  off -= vr_saved * UNITS_PER_VREG;
13982 
13983 	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13984 	  for (i = 0; i < vr_saved; ++i)
13985 	    {
13986 	      rtx ptr, mem;
13987 
13988 	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13989 	      mem = gen_frame_mem (mode, ptr);
13990 	      set_mem_alias_set (mem, get_varargs_alias_set ());
13991 	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13992 	      off += UNITS_PER_VREG;
13993 	    }
13994 	}
13995     }
13996 
13997   /* We don't save the size into *PRETEND_SIZE because we want to avoid
13998      any complication of having crtl->args.pretend_args_size changed.  */
13999   cfun->machine->frame.saved_varargs_size
14000     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14001 		 STACK_BOUNDARY / BITS_PER_UNIT)
14002        + vr_saved * UNITS_PER_VREG);
14003 }
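
/* Illustrative example of the save area laid out above: with three unused
   GP argument registers (x5-x7) and two unused FP/SIMD argument registers
   (v6-v7), GR_SAVED * 8 == 24 is rounded up to 32, so relative to
   virtual_incoming_args_rtx the GP saves occupy bytes [-24, 0), the vector
   saves occupy [-64, -32), and saved_varargs_size is 32 + 32 = 64.  */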
14004 
14005 static void
14006 aarch64_conditional_register_usage (void)
14007 {
14008   int i;
14009   if (!TARGET_FLOAT)
14010     {
14011       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14012 	{
14013 	  fixed_regs[i] = 1;
14014 	  call_used_regs[i] = 1;
14015 	}
14016     }
14017   if (!TARGET_SVE)
14018     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14019       {
14020 	fixed_regs[i] = 1;
14021 	call_used_regs[i] = 1;
14022       }
14023 
14024   /* When tracking speculation, we need a couple of call-clobbered registers
14025      to track the speculation state.  It would be nice to just use
14026      IP0 and IP1, but currently there are numerous places that just
14027      assume these registers are free for other uses (eg pointer
14028      authentication).  */
14029   if (aarch64_track_speculation)
14030     {
14031       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14032       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14033       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14034       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14035     }
14036 }
14037 
14038 /* Walk down the type tree of TYPE counting consecutive base elements.
14039    If *MODEP is VOIDmode, then set it to the first valid floating point
14040    type.  If a non-floating point type is found, or if a floating point
14041    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14042    otherwise return the count in the sub-tree.  */
14043 static int
14044 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14045 {
14046   machine_mode mode;
14047   HOST_WIDE_INT size;
14048 
14049   switch (TREE_CODE (type))
14050     {
14051     case REAL_TYPE:
14052       mode = TYPE_MODE (type);
14053       if (mode != DFmode && mode != SFmode
14054 	  && mode != TFmode && mode != HFmode)
14055 	return -1;
14056 
14057       if (*modep == VOIDmode)
14058 	*modep = mode;
14059 
14060       if (*modep == mode)
14061 	return 1;
14062 
14063       break;
14064 
14065     case COMPLEX_TYPE:
14066       mode = TYPE_MODE (TREE_TYPE (type));
14067       if (mode != DFmode && mode != SFmode
14068 	  && mode != TFmode && mode != HFmode)
14069 	return -1;
14070 
14071       if (*modep == VOIDmode)
14072 	*modep = mode;
14073 
14074       if (*modep == mode)
14075 	return 2;
14076 
14077       break;
14078 
14079     case VECTOR_TYPE:
14080       /* Use V2SImode and V4SImode as representatives of all 64-bit
14081 	 and 128-bit vector types.  */
14082       size = int_size_in_bytes (type);
14083       switch (size)
14084 	{
14085 	case 8:
14086 	  mode = V2SImode;
14087 	  break;
14088 	case 16:
14089 	  mode = V4SImode;
14090 	  break;
14091 	default:
14092 	  return -1;
14093 	}
14094 
14095       if (*modep == VOIDmode)
14096 	*modep = mode;
14097 
14098       /* Vector modes are considered to be opaque: two vectors are
14099 	 equivalent for the purposes of being homogeneous aggregates
14100 	 if they are the same size.  */
14101       if (*modep == mode)
14102 	return 1;
14103 
14104       break;
14105 
14106     case ARRAY_TYPE:
14107       {
14108 	int count;
14109 	tree index = TYPE_DOMAIN (type);
14110 
14111 	/* Can't handle incomplete types nor sizes that are not
14112 	   fixed.  */
14113 	if (!COMPLETE_TYPE_P (type)
14114 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14115 	  return -1;
14116 
14117 	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14118 	if (count == -1
14119 	    || !index
14120 	    || !TYPE_MAX_VALUE (index)
14121 	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14122 	    || !TYPE_MIN_VALUE (index)
14123 	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14124 	    || count < 0)
14125 	  return -1;
14126 
14127 	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14128 		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14129 
14130 	/* There must be no padding.  */
14131 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14132 		      count * GET_MODE_BITSIZE (*modep)))
14133 	  return -1;
14134 
14135 	return count;
14136       }
14137 
14138     case RECORD_TYPE:
14139       {
14140 	int count = 0;
14141 	int sub_count;
14142 	tree field;
14143 
14144 	/* Can't handle incomplete types nor sizes that are not
14145 	   fixed.  */
14146 	if (!COMPLETE_TYPE_P (type)
14147 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14148 	  return -1;
14149 
14150 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14151 	  {
14152 	    if (TREE_CODE (field) != FIELD_DECL)
14153 	      continue;
14154 
14155 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14156 	    if (sub_count < 0)
14157 	      return -1;
14158 	    count += sub_count;
14159 	  }
14160 
14161 	/* There must be no padding.  */
14162 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14163 		      count * GET_MODE_BITSIZE (*modep)))
14164 	  return -1;
14165 
14166 	return count;
14167       }
14168 
14169     case UNION_TYPE:
14170     case QUAL_UNION_TYPE:
14171       {
14172 	/* These aren't very interesting except in a degenerate case.  */
14173 	int count = 0;
14174 	int sub_count;
14175 	tree field;
14176 
14177 	/* Can't handle incomplete types nor sizes that are not
14178 	   fixed.  */
14179 	if (!COMPLETE_TYPE_P (type)
14180 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14181 	  return -1;
14182 
14183 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14184 	  {
14185 	    if (TREE_CODE (field) != FIELD_DECL)
14186 	      continue;
14187 
14188 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14189 	    if (sub_count < 0)
14190 	      return -1;
14191 	    count = count > sub_count ? count : sub_count;
14192 	  }
14193 
14194 	/* There must be no padding.  */
14195 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14196 		      count * GET_MODE_BITSIZE (*modep)))
14197 	  return -1;
14198 
14199 	return count;
14200       }
14201 
14202     default:
14203       break;
14204     }
14205 
14206   return -1;
14207 }
14208 
14209 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14210    type as described in AAPCS64 \S 4.1.2.
14211 
14212    See the comment above aarch64_composite_type_p for the notes on MODE.  */
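/* For example, 64-bit and 128-bit Advanced SIMD types such as int32x2_t
   and float64x2_t are expected to qualify as short vectors, while a
   32-byte GNU vector (e.g. int __attribute__ ((vector_size (32)))) is
   not, since only total sizes of 8 and 16 bytes are accepted.  */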
14213 
14214 static bool
14215 aarch64_short_vector_p (const_tree type,
14216 			machine_mode mode)
14217 {
14218   poly_int64 size = -1;
14219 
14220   if (type && TREE_CODE (type) == VECTOR_TYPE)
14221     size = int_size_in_bytes (type);
14222   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14223 	    || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14224     size = GET_MODE_SIZE (mode);
14225 
14226   return known_eq (size, 8) || known_eq (size, 16);
14227 }
14228 
14229 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14230    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
14231    array types.  The C99 floating-point complex types are also considered
14232    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
14233    types, which are GCC extensions and out of the scope of AAPCS64, are
14234    treated as composite types here as well.
14235 
14236    Note that MODE itself is not sufficient in determining whether a type
14237    is such a composite type or not.  This is because
14238    stor-layout.c:compute_record_mode may have already changed the MODE
14239    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
14240    structure with only one field may have its MODE set to the mode of the
14241    field.  Also an integer mode whose size matches the size of the
14242    RECORD_TYPE type may be used to substitute the original mode
14243    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
14244    solely relied on.  */
14245 
14246 static bool
14247 aarch64_composite_type_p (const_tree type,
14248 			  machine_mode mode)
14249 {
14250   if (aarch64_short_vector_p (type, mode))
14251     return false;
14252 
14253   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14254     return true;
14255 
14256   if (mode == BLKmode
14257       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14258       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14259     return true;
14260 
14261   return false;
14262 }
14263 
14264 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14265    shall be passed or returned in simd/fp register(s) (providing these
14266    parameter passing registers are available).
14267 
14268    Upon successful return, *COUNT returns the number of needed registers,
14269    *BASE_MODE returns the mode of the individual register and when IS_HA
14270    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14271    floating-point aggregate or a homogeneous short-vector aggregate.  */
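/* Illustrative expectations, based on the checks below: a homogeneous
   aggregate such as struct { double a, b; } should give *COUNT == 2,
   *BASE_MODE == DFmode and *IS_HA == true; _Complex float should give
   *COUNT == 2 with *BASE_MODE == SFmode; and struct { float f; int i; }
   should be rejected.  */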
14272 
14273 static bool
14274 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14275 					 const_tree type,
14276 					 machine_mode *base_mode,
14277 					 int *count,
14278 					 bool *is_ha)
14279 {
14280   machine_mode new_mode = VOIDmode;
14281   bool composite_p = aarch64_composite_type_p (type, mode);
14282 
14283   if (is_ha != NULL) *is_ha = false;
14284 
14285   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14286       || aarch64_short_vector_p (type, mode))
14287     {
14288       *count = 1;
14289       new_mode = mode;
14290     }
14291   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14292     {
14293       if (is_ha != NULL) *is_ha = true;
14294       *count = 2;
14295       new_mode = GET_MODE_INNER (mode);
14296     }
14297   else if (type && composite_p)
14298     {
14299       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14300 
14301       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14302 	{
14303 	  if (is_ha != NULL) *is_ha = true;
14304 	  *count = ag_count;
14305 	}
14306       else
14307 	return false;
14308     }
14309   else
14310     return false;
14311 
14312   *base_mode = new_mode;
14313   return true;
14314 }
14315 
14316 /* Implement TARGET_STRUCT_VALUE_RTX.  */
14317 
14318 static rtx
14319 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14320 			  int incoming ATTRIBUTE_UNUSED)
14321 {
14322   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14323 }
14324 
14325 /* Implements target hook vector_mode_supported_p.  */
14326 static bool
14327 aarch64_vector_mode_supported_p (machine_mode mode)
14328 {
14329   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14330   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14331 }
14332 
14333 /* Return appropriate SIMD container
14334    for MODE within a vector of WIDTH bits.  */
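/* For instance, (SFmode, 128) should map to V4SFmode and (SFmode, 64)
   to V2SFmode, while (SFmode, BITS_PER_SVE_VECTOR) maps to VNx4SFmode
   when SVE is enabled.  Unsupported combinations fall back to
   word_mode.  */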
14335 static machine_mode
14336 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14337 {
14338   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14339     switch (mode)
14340       {
14341       case E_DFmode:
14342 	return VNx2DFmode;
14343       case E_SFmode:
14344 	return VNx4SFmode;
14345       case E_HFmode:
14346 	return VNx8HFmode;
14347       case E_DImode:
14348 	return VNx2DImode;
14349       case E_SImode:
14350 	return VNx4SImode;
14351       case E_HImode:
14352 	return VNx8HImode;
14353       case E_QImode:
14354 	return VNx16QImode;
14355       default:
14356 	return word_mode;
14357       }
14358 
14359   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14360   if (TARGET_SIMD)
14361     {
14362       if (known_eq (width, 128))
14363 	switch (mode)
14364 	  {
14365 	  case E_DFmode:
14366 	    return V2DFmode;
14367 	  case E_SFmode:
14368 	    return V4SFmode;
14369 	  case E_HFmode:
14370 	    return V8HFmode;
14371 	  case E_SImode:
14372 	    return V4SImode;
14373 	  case E_HImode:
14374 	    return V8HImode;
14375 	  case E_QImode:
14376 	    return V16QImode;
14377 	  case E_DImode:
14378 	    return V2DImode;
14379 	  default:
14380 	    break;
14381 	  }
14382       else
14383 	switch (mode)
14384 	  {
14385 	  case E_SFmode:
14386 	    return V2SFmode;
14387 	  case E_HFmode:
14388 	    return V4HFmode;
14389 	  case E_SImode:
14390 	    return V2SImode;
14391 	  case E_HImode:
14392 	    return V4HImode;
14393 	  case E_QImode:
14394 	    return V8QImode;
14395 	  default:
14396 	    break;
14397 	  }
14398     }
14399   return word_mode;
14400 }
14401 
14402 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
14403 static machine_mode
14404 aarch64_preferred_simd_mode (scalar_mode mode)
14405 {
14406   /* If current tuning prefers Advanced SIMD, bypass SVE.  */
14407   bool use_sve
14408     = TARGET_SVE
14409       && !(aarch64_tune_params.extra_tuning_flags
14410 	   & AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC);
14411   poly_int64 bits = use_sve ? BITS_PER_SVE_VECTOR : 128;
14412   return aarch64_simd_container_mode (mode, bits);
14413 }
14414 
14415 /* Return a list of possible vector sizes for the vectorizer
14416    to iterate over.  */
14417 static void
14418 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14419 {
14420   bool use_sve
14421     = TARGET_SVE
14422       && !(aarch64_tune_params.extra_tuning_flags
14423 	   & AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC);
14424   if (use_sve)
14425     sizes->safe_push (BYTES_PER_SVE_VECTOR);
14426   sizes->safe_push (16);
14427   sizes->safe_push (8);
14428 }
14429 
14430 /* Implement TARGET_MANGLE_TYPE.  */
14431 
14432 static const char *
14433 aarch64_mangle_type (const_tree type)
14434 {
14435   /* The AArch64 ABI documents say that "__va_list" has to be
14436      mangled as if it is in the "std" namespace.  */
14437   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14438     return "St9__va_list";
14439 
14440   /* Half-precision float.  */
14441   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14442     return "Dh";
14443 
14444   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
14445      builtin types.  */
14446   if (TYPE_NAME (type) != NULL)
14447     return aarch64_mangle_builtin_type (type);
14448 
14449   /* Use the default mangling.  */
14450   return NULL;
14451 }
14452 
14453 /* Find the first rtx_insn before insn that will generate an assembly
14454    instruction.  */
14455 
14456 static rtx_insn *
14457 aarch64_prev_real_insn (rtx_insn *insn)
14458 {
14459   if (!insn)
14460     return NULL;
14461 
14462   do
14463     {
14464       insn = prev_real_insn (insn);
14465     }
14466   while (insn && recog_memoized (insn) < 0);
14467 
14468   return insn;
14469 }
14470 
14471 static bool
14472 is_madd_op (enum attr_type t1)
14473 {
14474   unsigned int i;
14475   /* A number of these may be AArch32 only.  */
14476   enum attr_type mlatypes[] = {
14477     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14478     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14479 	    TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14480   };
14481 
14482   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14483     {
14484       if (t1 == mlatypes[i])
14485 	return true;
14486     }
14487 
14488   return false;
14489 }
14490 
14491 /* Check if there is a register dependency between a load and the insn
14492    for which we hold recog_data.  */
14493 
14494 static bool
14495 dep_between_memop_and_curr (rtx memop)
14496 {
14497   rtx load_reg;
14498   int opno;
14499 
14500   gcc_assert (GET_CODE (memop) == SET);
14501 
14502   if (!REG_P (SET_DEST (memop)))
14503     return false;
14504 
14505   load_reg = SET_DEST (memop);
14506   for (opno = 1; opno < recog_data.n_operands; opno++)
14507     {
14508       rtx operand = recog_data.operand[opno];
14509       if (REG_P (operand)
14510           && reg_overlap_mentioned_p (load_reg, operand))
14511         return true;
14512 
14513     }
14514   return false;
14515 }
14516 
14517 
14518 /* When working around the Cortex-A53 erratum 835769,
14519    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14520    instruction and has a preceding memory instruction such that a NOP
14521    should be inserted between them.  */
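/* Roughly speaking (illustrative assembly, not taken from real output),
   the workaround turns a sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x5

   into

     ldr  x1, [x2]
     nop
     madd x0, x3, x4, x5

   when there is no register dependency between the memory operation and
   the multiply-accumulate.  */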
14522 
14523 bool
14524 aarch64_madd_needs_nop (rtx_insn* insn)
14525 {
14526   enum attr_type attr_type;
14527   rtx_insn *prev;
14528   rtx body;
14529 
14530   if (!TARGET_FIX_ERR_A53_835769)
14531     return false;
14532 
14533   if (!INSN_P (insn) || recog_memoized (insn) < 0)
14534     return false;
14535 
14536   attr_type = get_attr_type (insn);
14537   if (!is_madd_op (attr_type))
14538     return false;
14539 
14540   prev = aarch64_prev_real_insn (insn);
14541   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14542      Restore recog state to INSN to avoid state corruption.  */
14543   extract_constrain_insn_cached (insn);
14544 
14545   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14546     return false;
14547 
14548   body = single_set (prev);
14549 
14550   /* If the previous insn is a memory op and there is no dependency between
14551      it and the DImode madd, emit a NOP between them.  If body is NULL then we
14552      have a complex memory operation, probably a load/store pair.
14553      Be conservative for now and emit a NOP.  */
14554   if (GET_MODE (recog_data.operand[0]) == DImode
14555       && (!body || !dep_between_memop_and_curr (body)))
14556     return true;
14557 
14558   return false;
14559 
14560 }
14561 
14562 
14563 /* Implement FINAL_PRESCAN_INSN.  */
14564 
14565 void
14566 aarch64_final_prescan_insn (rtx_insn *insn)
14567 {
14568   if (aarch64_madd_needs_nop (insn))
14569     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14570 }
14571 
14572 
14573 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14574    instruction.  */
14575 
14576 bool
14577 aarch64_sve_index_immediate_p (rtx base_or_step)
14578 {
14579   return (CONST_INT_P (base_or_step)
14580 	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
14581 }
14582 
14583 /* Return true if X is a valid immediate for the SVE ADD and SUB
14584    instructions.  Negate X first if NEGATE_P is true.  */
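/* For example, duplicated values of 0..255, and multiples of 256 up to
   0xff00, are expected to be accepted (the latter using the shifted
   form), whereas a value such as 257 is not, because it needs both a
   low byte and a high byte.  */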
14585 
14586 bool
14587 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14588 {
14589   rtx elt;
14590 
14591   if (!const_vec_duplicate_p (x, &elt)
14592       || !CONST_INT_P (elt))
14593     return false;
14594 
14595   HOST_WIDE_INT val = INTVAL (elt);
14596   if (negate_p)
14597     val = -val;
14598   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14599 
14600   if (val & 0xff)
14601     return IN_RANGE (val, 0, 0xff);
14602   return IN_RANGE (val, 0, 0xff00);
14603 }
14604 
14605 /* Return true if X is a valid immediate operand for an SVE logical
14606    instruction such as AND.  */
14607 
14608 bool
14609 aarch64_sve_bitmask_immediate_p (rtx x)
14610 {
14611   rtx elt;
14612 
14613   return (const_vec_duplicate_p (x, &elt)
14614 	  && CONST_INT_P (elt)
14615 	  && aarch64_bitmask_imm (INTVAL (elt),
14616 				  GET_MODE_INNER (GET_MODE (x))));
14617 }
14618 
14619 /* Return true if X is a valid immediate for the SVE DUP and CPY
14620    instructions.  */
14621 
14622 bool
14623 aarch64_sve_dup_immediate_p (rtx x)
14624 {
14625   rtx elt;
14626 
14627   if (!const_vec_duplicate_p (x, &elt)
14628       || !CONST_INT_P (elt))
14629     return false;
14630 
14631   HOST_WIDE_INT val = INTVAL (elt);
14632   if (val & 0xff)
14633     return IN_RANGE (val, -0x80, 0x7f);
14634   return IN_RANGE (val, -0x8000, 0x7f00);
14635 }
14636 
14637 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14638    SIGNED_P says whether the operand is signed rather than unsigned.  */
14639 
14640 bool
14641 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14642 {
14643   rtx elt;
14644 
14645   return (const_vec_duplicate_p (x, &elt)
14646 	  && CONST_INT_P (elt)
14647 	  && (signed_p
14648 	      ? IN_RANGE (INTVAL (elt), -16, 15)
14649 	      : IN_RANGE (INTVAL (elt), 0, 127)));
14650 }
14651 
14652 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14653    instruction.  Negate X first if NEGATE_P is true.  */
14654 
14655 bool
14656 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14657 {
14658   rtx elt;
14659   REAL_VALUE_TYPE r;
14660 
14661   if (!const_vec_duplicate_p (x, &elt)
14662       || GET_CODE (elt) != CONST_DOUBLE)
14663     return false;
14664 
14665   r = *CONST_DOUBLE_REAL_VALUE (elt);
14666 
14667   if (negate_p)
14668     r = real_value_negate (&r);
14669 
14670   if (real_equal (&r, &dconst1))
14671     return true;
14672   if (real_equal (&r, &dconsthalf))
14673     return true;
14674   return false;
14675 }
14676 
14677 /* Return true if X is a valid immediate operand for an SVE FMUL
14678    instruction.  */
14679 
14680 bool
14681 aarch64_sve_float_mul_immediate_p (rtx x)
14682 {
14683   rtx elt;
14684 
14685   /* GCC will never generate a multiply with an immediate of 2, so there is no
14686      point testing for it (even though it is a valid constant).  */
14687   return (const_vec_duplicate_p (x, &elt)
14688 	  && GET_CODE (elt) == CONST_DOUBLE
14689 	  && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14690 }
14691 
14692 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14693    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
14694    is nonnull, use it to describe valid immediates.  */
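/* Worked examples of the checks below: VAL32 == 0x00ab0000 should be
   described as (SImode, 0xab, LSL, 16), and VAL32 == 0x00cd00cd as
   (HImode, 0xcd, LSL, 0).  */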
14695 static bool
14696 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14697 				    simd_immediate_info *info,
14698 				    enum simd_immediate_check which,
14699 				    simd_immediate_info::insn_type insn)
14700 {
14701   /* Try a 4-byte immediate with LSL.  */
14702   for (unsigned int shift = 0; shift < 32; shift += 8)
14703     if ((val32 & (0xff << shift)) == val32)
14704       {
14705 	if (info)
14706 	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
14707 				       simd_immediate_info::LSL, shift);
14708 	return true;
14709       }
14710 
14711   /* Try a 2-byte immediate with LSL.  */
14712   unsigned int imm16 = val32 & 0xffff;
14713   if (imm16 == (val32 >> 16))
14714     for (unsigned int shift = 0; shift < 16; shift += 8)
14715       if ((imm16 & (0xff << shift)) == imm16)
14716 	{
14717 	  if (info)
14718 	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14719 					 simd_immediate_info::LSL, shift);
14720 	  return true;
14721 	}
14722 
14723   /* Try a 4-byte immediate with MSL, except for cases that MVN
14724      can handle.  */
14725   if (which == AARCH64_CHECK_MOV)
14726     for (unsigned int shift = 8; shift < 24; shift += 8)
14727       {
14728 	unsigned int low = (1 << shift) - 1;
14729 	if (((val32 & (0xff << shift)) | low) == val32)
14730 	  {
14731 	    if (info)
14732 	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
14733 					   simd_immediate_info::MSL, shift);
14734 	    return true;
14735 	  }
14736       }
14737 
14738   return false;
14739 }
14740 
14741 /* Return true if replicating VAL64 is a valid immediate for the
14742    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
14743    use it to describe valid immediates.  */
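/* For example, for a MOV-type check, VAL64 == 0x2a2a2a2a2a2a2a2a should
   be accepted as a replicated byte (QImode, 0x2a), and VAL64 ==
   0x0000ffffffff0000 as a bit-to-bytemask (DImode) immediate, since
   every byte is either 0 or 0xff.  */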
14744 static bool
14745 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14746 				 simd_immediate_info *info,
14747 				 enum simd_immediate_check which)
14748 {
14749   unsigned int val32 = val64 & 0xffffffff;
14750   unsigned int val16 = val64 & 0xffff;
14751   unsigned int val8 = val64 & 0xff;
14752 
14753   if (val32 == (val64 >> 32))
14754     {
14755       if ((which & AARCH64_CHECK_ORR) != 0
14756 	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14757 						 simd_immediate_info::MOV))
14758 	return true;
14759 
14760       if ((which & AARCH64_CHECK_BIC) != 0
14761 	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14762 						 simd_immediate_info::MVN))
14763 	return true;
14764 
14765       /* Try using a replicated byte.  */
14766       if (which == AARCH64_CHECK_MOV
14767 	  && val16 == (val32 >> 16)
14768 	  && val8 == (val16 >> 8))
14769 	{
14770 	  if (info)
14771 	    *info = simd_immediate_info (QImode, val8);
14772 	  return true;
14773 	}
14774     }
14775 
14776   /* Try using a bit-to-bytemask.  */
14777   if (which == AARCH64_CHECK_MOV)
14778     {
14779       unsigned int i;
14780       for (i = 0; i < 64; i += 8)
14781 	{
14782 	  unsigned char byte = (val64 >> i) & 0xff;
14783 	  if (byte != 0 && byte != 0xff)
14784 	    break;
14785 	}
14786       if (i == 64)
14787 	{
14788 	  if (info)
14789 	    *info = simd_immediate_info (DImode, val64);
14790 	  return true;
14791 	}
14792     }
14793   return false;
14794 }
14795 
14796 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14797    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
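/* For example, VAL64 == 0x0101010101010101 should be treated as a
   QImode DUP of 1, and VAL64 == 0x0000ffff0000ffff as an SImode DUPM
   (logical) immediate.  */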
14798 
14799 static bool
14800 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14801 			     simd_immediate_info *info)
14802 {
14803   scalar_int_mode mode = DImode;
14804   unsigned int val32 = val64 & 0xffffffff;
14805   if (val32 == (val64 >> 32))
14806     {
14807       mode = SImode;
14808       unsigned int val16 = val32 & 0xffff;
14809       if (val16 == (val32 >> 16))
14810 	{
14811 	  mode = HImode;
14812 	  unsigned int val8 = val16 & 0xff;
14813 	  if (val8 == (val16 >> 8))
14814 	    mode = QImode;
14815 	}
14816     }
14817   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14818   if (IN_RANGE (val, -0x80, 0x7f))
14819     {
14820       /* DUP with no shift.  */
14821       if (info)
14822 	*info = simd_immediate_info (mode, val);
14823       return true;
14824     }
14825   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14826     {
14827       /* DUP with LSL #8.  */
14828       if (info)
14829 	*info = simd_immediate_info (mode, val);
14830       return true;
14831     }
14832   if (aarch64_bitmask_imm (val64, mode))
14833     {
14834       /* DUPM.  */
14835       if (info)
14836 	*info = simd_immediate_info (mode, val);
14837       return true;
14838     }
14839   return false;
14840 }
14841 
14842 /* Return true if OP is a valid SIMD immediate for the operation
14843    described by WHICH.  If INFO is nonnull, use it to describe valid
14844    immediates.  */
14845 bool
14846 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14847 			      enum simd_immediate_check which)
14848 {
14849   machine_mode mode = GET_MODE (op);
14850   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14851   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14852     return false;
14853 
14854   scalar_mode elt_mode = GET_MODE_INNER (mode);
14855   rtx base, step;
14856   unsigned int n_elts;
14857   if (GET_CODE (op) == CONST_VECTOR
14858       && CONST_VECTOR_DUPLICATE_P (op))
14859     n_elts = CONST_VECTOR_NPATTERNS (op);
14860   else if ((vec_flags & VEC_SVE_DATA)
14861 	   && const_vec_series_p (op, &base, &step))
14862     {
14863       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14864       if (!aarch64_sve_index_immediate_p (base)
14865 	  || !aarch64_sve_index_immediate_p (step))
14866 	return false;
14867 
14868       if (info)
14869 	*info = simd_immediate_info (elt_mode, base, step);
14870       return true;
14871     }
14872   else if (GET_CODE (op) == CONST_VECTOR
14873 	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14874     /* N_ELTS set above.  */;
14875   else
14876     return false;
14877 
14878   /* Handle PFALSE and PTRUE.  */
14879   if (vec_flags & VEC_SVE_PRED)
14880     return (op == CONST0_RTX (mode)
14881 	    || op == CONSTM1_RTX (mode));
14882 
14883   scalar_float_mode elt_float_mode;
14884   if (n_elts == 1
14885       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14886     {
14887       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14888       if (aarch64_float_const_zero_rtx_p (elt)
14889 	  || aarch64_float_const_representable_p (elt))
14890 	{
14891 	  if (info)
14892 	    *info = simd_immediate_info (elt_float_mode, elt);
14893 	  return true;
14894 	}
14895     }
14896 
14897   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14898   if (elt_size > 8)
14899     return false;
14900 
14901   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14902 
14903   /* Expand the vector constant out into a byte vector, with the least
14904      significant byte of the register first.  */
14905   auto_vec<unsigned char, 16> bytes;
14906   bytes.reserve (n_elts * elt_size);
14907   for (unsigned int i = 0; i < n_elts; i++)
14908     {
14909       /* The vector is provided in gcc endian-neutral fashion.
14910 	 For aarch64_be Advanced SIMD, it must be laid out in the vector
14911 	 register in reverse order.  */
14912       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14913       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14914 
14915       if (elt_mode != elt_int_mode)
14916 	elt = gen_lowpart (elt_int_mode, elt);
14917 
14918       if (!CONST_INT_P (elt))
14919 	return false;
14920 
14921       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14922       for (unsigned int byte = 0; byte < elt_size; byte++)
14923 	{
14924 	  bytes.quick_push (elt_val & 0xff);
14925 	  elt_val >>= BITS_PER_UNIT;
14926 	}
14927     }
14928 
14929   /* The immediate must repeat every eight bytes.  */
14930   unsigned int nbytes = bytes.length ();
14931   for (unsigned i = 8; i < nbytes; ++i)
14932     if (bytes[i] != bytes[i - 8])
14933       return false;
14934 
14935   /* Get the repeating 8-byte value as an integer.  No endian correction
14936      is needed here because bytes is already in lsb-first order.  */
14937   unsigned HOST_WIDE_INT val64 = 0;
14938   for (unsigned int i = 0; i < 8; i++)
14939     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14940 	      << (i * BITS_PER_UNIT));
14941 
14942   if (vec_flags & VEC_SVE_DATA)
14943     return aarch64_sve_valid_immediate (val64, info);
14944   else
14945     return aarch64_advsimd_valid_immediate (val64, info, which);
14946 }
14947 
14948 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14949    has a step in the range of the SVE INDEX instruction.  Return the step if so,
14950    otherwise return null.  */
14951 rtx
14952 aarch64_check_zero_based_sve_index_immediate (rtx x)
14953 {
14954   rtx base, step;
14955   if (const_vec_series_p (x, &base, &step)
14956       && base == const0_rtx
14957       && aarch64_sve_index_immediate_p (step))
14958     return step;
14959   return NULL_RTX;
14960 }
14961 
14962 /* Check whether immediate shift constants are within range.  */
14963 bool
14964 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14965 {
14966   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14967   if (left)
14968     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14969   else
14970     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14971 }
14972 
14973 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14974    operation of width WIDTH at bit position POS.  */
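/* For example, WIDTH == 8 and POS == 4 should produce the CONST_INT
   0xff0 (eight set bits starting at bit 4).  */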
14975 
14976 rtx
14977 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14978 {
14979   gcc_assert (CONST_INT_P (width));
14980   gcc_assert (CONST_INT_P (pos));
14981 
14982   unsigned HOST_WIDE_INT mask
14983     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14984   return GEN_INT (mask << UINTVAL (pos));
14985 }
14986 
14987 bool
14988 aarch64_mov_operand_p (rtx x, machine_mode mode)
14989 {
14990   if (GET_CODE (x) == HIGH
14991       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14992     return true;
14993 
14994   if (CONST_INT_P (x))
14995     return true;
14996 
14997   if (VECTOR_MODE_P (GET_MODE (x)))
14998     return aarch64_simd_valid_immediate (x, NULL);
14999 
15000   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15001     return true;
15002 
15003   if (aarch64_sve_cnt_immediate_p (x))
15004     return true;
15005 
15006   return aarch64_classify_symbolic_expression (x)
15007     == SYMBOL_TINY_ABSOLUTE;
15008 }
15009 
15010 /* Return a const_int vector of VAL.  */
15011 rtx
15012 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15013 {
15014   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15015   return gen_const_vec_duplicate (mode, c);
15016 }
15017 
15018 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
15019 
15020 bool
15021 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15022 {
15023   machine_mode vmode;
15024 
15025   vmode = aarch64_simd_container_mode (mode, 64);
15026   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15027   return aarch64_simd_valid_immediate (op_v, NULL);
15028 }
15029 
15030 /* Construct and return a PARALLEL RTX vector with elements numbering the
15031    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15032    the vector - from the perspective of the architecture.  This does not
15033    line up with GCC's perspective on lane numbers, so we end up with
15034    different masks depending on our target endian-ness.  The diagram
15035    below may help.  We must draw the distinction when building masks
15036    which select one half of the vector.  An instruction selecting
15037    architectural low-lanes for a big-endian target must be described using
15038    a mask selecting GCC high-lanes.
15039 
15040                  Big-Endian             Little-Endian
15041 
15042 GCC             0   1   2   3           3   2   1   0
15043               | x | x | x | x |       | x | x | x | x |
15044 Architecture    3   2   1   0           3   2   1   0
15045 
15046 Low Mask:         { 2, 3 }                { 0, 1 }
15047 High Mask:        { 0, 1 }                { 2, 3 }
15048 
15049    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
15050 
15051 rtx
15052 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15053 {
15054   rtvec v = rtvec_alloc (nunits / 2);
15055   int high_base = nunits / 2;
15056   int low_base = 0;
15057   int base;
15058   rtx t1;
15059   int i;
15060 
15061   if (BYTES_BIG_ENDIAN)
15062     base = high ? low_base : high_base;
15063   else
15064     base = high ? high_base : low_base;
15065 
15066   for (i = 0; i < nunits / 2; i++)
15067     RTVEC_ELT (v, i) = GEN_INT (base + i);
15068 
15069   t1 = gen_rtx_PARALLEL (mode, v);
15070   return t1;
15071 }
15072 
15073 /* Check OP for validity as a PARALLEL RTX vector with elements
15074    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15075    from the perspective of the architecture.  See the diagram above
15076    aarch64_simd_vect_par_cnst_half for more details.  */
15077 
15078 bool
15079 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15080 				       bool high)
15081 {
15082   int nelts;
15083   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15084     return false;
15085 
15086   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15087   HOST_WIDE_INT count_op = XVECLEN (op, 0);
15088   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15089   int i = 0;
15090 
15091   if (count_op != count_ideal)
15092     return false;
15093 
15094   for (i = 0; i < count_ideal; i++)
15095     {
15096       rtx elt_op = XVECEXP (op, 0, i);
15097       rtx elt_ideal = XVECEXP (ideal, 0, i);
15098 
15099       if (!CONST_INT_P (elt_op)
15100 	  || INTVAL (elt_ideal) != INTVAL (elt_op))
15101 	return false;
15102     }
15103   return true;
15104 }
15105 
15106 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
15107    HIGH (exclusive).  */
15108 void
15109 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15110 			  const_tree exp)
15111 {
15112   HOST_WIDE_INT lane;
15113   gcc_assert (CONST_INT_P (operand));
15114   lane = INTVAL (operand);
15115 
15116   if (lane < low || lane >= high)
15117   {
15118     if (exp)
15119       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15120     else
15121       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15122   }
15123 }
15124 
15125 /* Perform endian correction on lane number N, which indexes a vector
15126    of mode MODE, and return the result as an SImode rtx.  */
15127 
15128 rtx
15129 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15130 {
15131   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15132 }
15133 
15134 /* Return TRUE if OP is a valid vector addressing mode.  */
15135 
15136 bool
15137 aarch64_simd_mem_operand_p (rtx op)
15138 {
15139   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15140 			|| REG_P (XEXP (op, 0)));
15141 }
15142 
15143 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
15144 
15145 bool
15146 aarch64_sve_ld1r_operand_p (rtx op)
15147 {
15148   struct aarch64_address_info addr;
15149   scalar_mode mode;
15150 
15151   return (MEM_P (op)
15152 	  && is_a <scalar_mode> (GET_MODE (op), &mode)
15153 	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15154 	  && addr.type == ADDRESS_REG_IMM
15155 	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15156 }
15157 
15158 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15159    The conditions for STR are the same.  */
15160 bool
15161 aarch64_sve_ldr_operand_p (rtx op)
15162 {
15163   struct aarch64_address_info addr;
15164 
15165   return (MEM_P (op)
15166 	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15167 				       false, ADDR_QUERY_ANY)
15168 	  && addr.type == ADDRESS_REG_IMM);
15169 }
15170 
15171 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15172    We need to be able to access the individual pieces, so the range
15173    is different from LD[234] and ST[234].  */
15174 bool
15175 aarch64_sve_struct_memory_operand_p (rtx op)
15176 {
15177   if (!MEM_P (op))
15178     return false;
15179 
15180   machine_mode mode = GET_MODE (op);
15181   struct aarch64_address_info addr;
15182   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15183 				 ADDR_QUERY_ANY)
15184       || addr.type != ADDRESS_REG_IMM)
15185     return false;
15186 
15187   poly_int64 first = addr.const_offset;
15188   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15189   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15190 	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15191 }
15192 
15193 /* Emit a register copy from operand to operand, taking care not to
15194    early-clobber source registers in the process.
15195 
15196    COUNT is the number of components into which the copy needs to be
15197    decomposed.  */
15198 void
15199 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15200 				unsigned int count)
15201 {
15202   unsigned int i;
15203   int rdest = REGNO (operands[0]);
15204   int rsrc = REGNO (operands[1]);
15205 
15206   if (!reg_overlap_mentioned_p (operands[0], operands[1])
15207       || rdest < rsrc)
15208     for (i = 0; i < count; i++)
15209       emit_move_insn (gen_rtx_REG (mode, rdest + i),
15210 		      gen_rtx_REG (mode, rsrc + i));
15211   else
15212     for (i = 0; i < count; i++)
15213       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15214 		      gen_rtx_REG (mode, rsrc + count - i - 1));
15215 }
15216 
15217 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15218    one of VSTRUCT modes: OI, CI, or XI.  */
15219 int
15220 aarch64_simd_attr_length_rglist (machine_mode mode)
15221 {
15222   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
15223   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15224 }
15225 
15226 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
15227    alignment of a vector to 128 bits.  SVE predicates have an alignment of
15228    16 bits.  */
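/* For example, a 256-bit GNU vector type is capped to 128-bit
   alignment, while a 64-bit vector keeps its natural 64-bit
   alignment.  */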
15229 static HOST_WIDE_INT
15230 aarch64_simd_vector_alignment (const_tree type)
15231 {
15232   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15233     /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15234        be set for non-predicate vectors of booleans.  Modes are the most
15235        direct way we have of identifying real SVE predicate types.  */
15236     return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15237   return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15238 }
15239 
15240 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
15241 static poly_uint64
15242 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15243 {
15244   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15245     {
15246       /* If the length of the vector is fixed, try to align to that length,
15247 	 otherwise don't try to align at all.  */
15248       HOST_WIDE_INT result;
15249       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15250 	result = TYPE_ALIGN (TREE_TYPE (type));
15251       return result;
15252     }
15253   return TYPE_ALIGN (type);
15254 }
15255 
15256 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
15257 static bool
15258 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15259 {
15260   if (is_packed)
15261     return false;
15262 
15263   /* For fixed-length vectors, check that the vectorizer will aim for
15264      full-vector alignment.  This isn't true for generic GCC vectors
15265      that are wider than the ABI maximum of 128 bits.  */
15266   poly_uint64 preferred_alignment =
15267     aarch64_vectorize_preferred_vector_alignment (type);
15268   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15269       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15270 		   preferred_alignment))
15271     return false;
15272 
15273   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
15274   return true;
15275 }
15276 
15277 /* Return true if the vector misalignment factor is supported by the
15278    target.  */
15279 static bool
15280 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15281 					     const_tree type, int misalignment,
15282 					     bool is_packed)
15283 {
15284   if (TARGET_SIMD && STRICT_ALIGNMENT)
15285     {
15286       /* Return if movmisalign pattern is not supported for this mode.  */
15287       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15288         return false;
15289 
15290       /* Misalignment factor is unknown at compile time.  */
15291       if (misalignment == -1)
15292 	return false;
15293     }
15294   return default_builtin_support_vector_misalignment (mode, type, misalignment,
15295 						      is_packed);
15296 }
15297 
15298 /* If VALS is a vector constant that can be loaded into a register
15299    using DUP, generate instructions to do so and return an RTX to
15300    assign to the register.  Otherwise return NULL_RTX.  */
15301 static rtx
15302 aarch64_simd_dup_constant (rtx vals)
15303 {
15304   machine_mode mode = GET_MODE (vals);
15305   machine_mode inner_mode = GET_MODE_INNER (mode);
15306   rtx x;
15307 
15308   if (!const_vec_duplicate_p (vals, &x))
15309     return NULL_RTX;
15310 
15311   /* We can load this constant by using DUP and a constant in a
15312      single ARM register.  This will be cheaper than a vector
15313      load.  */
15314   x = copy_to_mode_reg (inner_mode, x);
15315   return gen_vec_duplicate (mode, x);
15316 }
15317 
15318 
15319 /* Generate code to load VALS, which is a PARALLEL containing only
15320    constants (for vec_init) or CONST_VECTOR, efficiently into a
15321    register.  Returns an RTX to copy into the register, or NULL_RTX
15322    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
15323 static rtx
15324 aarch64_simd_make_constant (rtx vals)
15325 {
15326   machine_mode mode = GET_MODE (vals);
15327   rtx const_dup;
15328   rtx const_vec = NULL_RTX;
15329   int n_const = 0;
15330   int i;
15331 
15332   if (GET_CODE (vals) == CONST_VECTOR)
15333     const_vec = vals;
15334   else if (GET_CODE (vals) == PARALLEL)
15335     {
15336       /* A CONST_VECTOR must contain only CONST_INTs and
15337 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15338 	 Only store valid constants in a CONST_VECTOR.  */
15339       int n_elts = XVECLEN (vals, 0);
15340       for (i = 0; i < n_elts; ++i)
15341 	{
15342 	  rtx x = XVECEXP (vals, 0, i);
15343 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15344 	    n_const++;
15345 	}
15346       if (n_const == n_elts)
15347 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15348     }
15349   else
15350     gcc_unreachable ();
15351 
15352   if (const_vec != NULL_RTX
15353       && aarch64_simd_valid_immediate (const_vec, NULL))
15354     /* Load using MOVI/MVNI.  */
15355     return const_vec;
15356   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15357     /* Loaded using DUP.  */
15358     return const_dup;
15359   else if (const_vec != NULL_RTX)
15360     /* Load from constant pool. We cannot take advantage of single-cycle
15361        LD1 because we need a PC-relative addressing mode.  */
15362     return const_vec;
15363   else
15364     /* A PARALLEL containing something not valid inside CONST_VECTOR.
15365        We cannot construct an initializer.  */
15366     return NULL_RTX;
15367 }
15368 
15369 /* Expand a vector initialisation sequence, such that TARGET is
15370    initialised to contain VALS.  */
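/* As a sketch of the strategy below: an all-variable vector such as
   { a, b, a, a } is expected to be built by duplicating the most
   frequent element (DUP a) and then inserting the remaining lanes
   (INS b into lane 1), while vectors with a constant part first load
   the constants and then insert the variable lanes.  */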
15371 
15372 void
15373 aarch64_expand_vector_init (rtx target, rtx vals)
15374 {
15375   machine_mode mode = GET_MODE (target);
15376   scalar_mode inner_mode = GET_MODE_INNER (mode);
15377   /* The number of vector elements.  */
15378   int n_elts = XVECLEN (vals, 0);
15379   /* The number of vector elements which are not constant.  */
15380   int n_var = 0;
15381   rtx any_const = NULL_RTX;
15382   /* The first element of vals.  */
15383   rtx v0 = XVECEXP (vals, 0, 0);
15384   bool all_same = true;
15385 
15386   /* Count the number of variable elements to initialise.  */
15387   for (int i = 0; i < n_elts; ++i)
15388     {
15389       rtx x = XVECEXP (vals, 0, i);
15390       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15391 	++n_var;
15392       else
15393 	any_const = x;
15394 
15395       all_same &= rtx_equal_p (x, v0);
15396     }
15397 
15398   /* No variable elements, hand off to aarch64_simd_make_constant which knows
15399      how best to handle this.  */
15400   if (n_var == 0)
15401     {
15402       rtx constant = aarch64_simd_make_constant (vals);
15403       if (constant != NULL_RTX)
15404 	{
15405 	  emit_move_insn (target, constant);
15406 	  return;
15407 	}
15408     }
15409 
15410   /* Splat a single non-constant element if we can.  */
15411   if (all_same)
15412     {
15413       rtx x = copy_to_mode_reg (inner_mode, v0);
15414       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15415       return;
15416     }
15417 
15418   enum insn_code icode = optab_handler (vec_set_optab, mode);
15419   gcc_assert (icode != CODE_FOR_nothing);
15420 
15421   /* If there are only variable elements, try to optimize
15422      the insertion using dup for the most common element
15423      followed by insertions.  */
15424 
15425   /* The algorithm will fill matches[*][0] with the earliest matching element,
15426      and matches[X][1] with the count of duplicate elements (if X is the
15427      earliest element which has duplicates).  */
15428 
15429   if (n_var == n_elts && n_elts <= 16)
15430     {
15431       int matches[16][2] = {0};
15432       for (int i = 0; i < n_elts; i++)
15433 	{
15434 	  for (int j = 0; j <= i; j++)
15435 	    {
15436 	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15437 		{
15438 		  matches[i][0] = j;
15439 		  matches[j][1]++;
15440 		  break;
15441 		}
15442 	    }
15443 	}
15444       int maxelement = 0;
15445       int maxv = 0;
15446       for (int i = 0; i < n_elts; i++)
15447 	if (matches[i][1] > maxv)
15448 	  {
15449 	    maxelement = i;
15450 	    maxv = matches[i][1];
15451 	  }
15452 
15453       /* Create a duplicate of the most common element, unless all elements
15454 	 are equally useless to us, in which case just immediately set the
15455 	 vector register using the first element.  */
15456 
15457       if (maxv == 1)
15458 	{
15459 	  /* For vectors of two 64-bit elements, we can do even better.  */
15460 	  if (n_elts == 2
15461 	      && (inner_mode == E_DImode
15462 		  || inner_mode == E_DFmode))
15463 
15464 	    {
15465 	      rtx x0 = XVECEXP (vals, 0, 0);
15466 	      rtx x1 = XVECEXP (vals, 0, 1);
15467 	      /* Combine can pick up this case, but handling it directly
15468 		 here leaves clearer RTL.
15469 
15470 		 This is load_pair_lanes<mode>, and also gives us a clean-up
15471 		 for store_pair_lanes<mode>.  */
15472 	      if (memory_operand (x0, inner_mode)
15473 		  && memory_operand (x1, inner_mode)
15474 		  && !STRICT_ALIGNMENT
15475 		  && rtx_equal_p (XEXP (x1, 0),
15476 				  plus_constant (Pmode,
15477 						 XEXP (x0, 0),
15478 						 GET_MODE_SIZE (inner_mode))))
15479 		{
15480 		  rtx t;
15481 		  if (inner_mode == DFmode)
15482 		    t = gen_load_pair_lanesdf (target, x0, x1);
15483 		  else
15484 		    t = gen_load_pair_lanesdi (target, x0, x1);
15485 		  emit_insn (t);
15486 		  return;
15487 		}
15488 	    }
15489 	  /* The subreg-move sequence below will move into lane zero of the
15490 	     vector register.  For big-endian we want that position to hold
15491 	     the last element of VALS.  */
15492 	  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15493 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15494 	  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15495 	}
15496       else
15497 	{
15498 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15499 	  aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15500 	}
15501 
15502       /* Insert the rest.  */
15503       for (int i = 0; i < n_elts; i++)
15504 	{
15505 	  rtx x = XVECEXP (vals, 0, i);
15506 	  if (matches[i][0] == maxelement)
15507 	    continue;
15508 	  x = copy_to_mode_reg (inner_mode, x);
15509 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15510 	}
15511       return;
15512     }
15513 
15514   /* Initialise a vector which is part-variable.  We want to first try
15515      to build those lanes which are constant in the most efficient way we
15516      can.  */
15517   if (n_var != n_elts)
15518     {
15519       rtx copy = copy_rtx (vals);
15520 
15521       /* Load constant part of vector.  We really don't care what goes into the
15522 	 parts we will overwrite, but we're more likely to be able to load the
15523 	 constant efficiently if it has fewer, larger, repeating parts
15524 	 (see aarch64_simd_valid_immediate).  */
15525       for (int i = 0; i < n_elts; i++)
15526 	{
15527 	  rtx x = XVECEXP (vals, 0, i);
15528 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15529 	    continue;
15530 	  rtx subst = any_const;
15531 	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
15532 	    {
15533 	      /* Look in the copied vector, as more elements are const.  */
15534 	      rtx test = XVECEXP (copy, 0, i ^ bit);
15535 	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15536 		{
15537 		  subst = test;
15538 		  break;
15539 		}
15540 	    }
15541 	  XVECEXP (copy, 0, i) = subst;
15542 	}
15543       aarch64_expand_vector_init (target, copy);
15544     }
15545 
15546   /* Insert the variable lanes directly.  */
15547   for (int i = 0; i < n_elts; i++)
15548     {
15549       rtx x = XVECEXP (vals, 0, i);
15550       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15551 	continue;
15552       x = copy_to_mode_reg (inner_mode, x);
15553       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15554     }
15555 }
15556 
15557 static unsigned HOST_WIDE_INT
15558 aarch64_shift_truncation_mask (machine_mode mode)
15559 {
15560   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15561     return 0;
15562   return GET_MODE_UNIT_BITSIZE (mode) - 1;
15563 }
15564 
15565 /* Select a format to encode pointers in exception handling data.  */
15566 int
15567 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15568 {
15569    int type;
15570    switch (aarch64_cmodel)
15571      {
15572      case AARCH64_CMODEL_TINY:
15573      case AARCH64_CMODEL_TINY_PIC:
15574      case AARCH64_CMODEL_SMALL:
15575      case AARCH64_CMODEL_SMALL_PIC:
15576      case AARCH64_CMODEL_SMALL_SPIC:
15577        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
15578 	  for everything.  */
15579        type = DW_EH_PE_sdata4;
15580        break;
15581      default:
15582        /* No assumptions here.  8-byte relocs required.  */
15583        type = DW_EH_PE_sdata8;
15584        break;
15585      }
15586    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15587 }
15588 
15589 /* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
15590 
15591 static void
15592 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15593 {
15594   if (aarch64_simd_decl_p (decl))
15595     {
15596       fprintf (stream, "\t.variant_pcs\t");
15597       assemble_name (stream, name);
15598       fprintf (stream, "\n");
15599     }
15600 }
15601 
15602 /* The last .arch and .tune assembly strings that we printed.  */
15603 static std::string aarch64_last_printed_arch_string;
15604 static std::string aarch64_last_printed_tune_string;
15605 
15606 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
15607    by the function fndecl.  */
15608 
15609 void
15610 aarch64_declare_function_name (FILE *stream, const char* name,
15611 				tree fndecl)
15612 {
15613   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15614 
15615   struct cl_target_option *targ_options;
15616   if (target_parts)
15617     targ_options = TREE_TARGET_OPTION (target_parts);
15618   else
15619     targ_options = TREE_TARGET_OPTION (target_option_current_node);
15620   gcc_assert (targ_options);
15621 
15622   const struct processor *this_arch
15623     = aarch64_get_arch (targ_options->x_explicit_arch);
15624 
15625   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15626   std::string extension
15627     = aarch64_get_extension_string_for_isa_flags (isa_flags,
15628 						  this_arch->flags);
15629   /* Only update the assembler .arch string if it is distinct from the last
15630      such string we printed.  */
15631   std::string to_print = this_arch->name + extension;
15632   if (to_print != aarch64_last_printed_arch_string)
15633     {
15634       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15635       aarch64_last_printed_arch_string = to_print;
15636     }
15637 
15638   /* Print the cpu name we're tuning for in the comments, might be
15639      useful to readers of the generated asm.  Do it only when it changes
15640      from function to function and verbose assembly is requested.  */
15641   const struct processor *this_tune
15642     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15643 
15644   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15645     {
15646       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15647 		   this_tune->name);
15648       aarch64_last_printed_tune_string = this_tune->name;
15649     }
15650 
15651   aarch64_asm_output_variant_pcs (stream, fndecl, name);
15652 
15653   /* Don't forget the type directive for ELF.  */
15654   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15655   ASM_OUTPUT_LABEL (stream, name);
15656 
15657   cfun->machine->label_is_assembled = true;
15658 }
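
/* Illustrative output, assuming the function's target options resolve to
   the armv8.2-a architecture with the crypto extension and that verbose
   assembly (-dA) is enabled; the tune name is likewise only an example:

	.arch armv8.2-a+crypto
	// .tune cortex-a75  */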
15659 
15660 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY.  Check if the patch area is after
15661    the function label and emit a BTI if necessary.  */
15662 
15663 void
15664 aarch64_print_patchable_function_entry (FILE *file,
15665 					unsigned HOST_WIDE_INT patch_area_size,
15666 					bool record_p)
15667 {
15668   if (cfun->machine->label_is_assembled
15669       && aarch64_bti_enabled ()
15670       && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
15671     {
15672       /* Remove the BTI that follows the patch area and insert a new BTI
15673 	 before the patch area right after the function label.  */
15674       rtx_insn *insn = next_real_nondebug_insn (get_insns ());
15675       if (insn
15676 	  && INSN_P (insn)
15677 	  && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
15678 	  && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
15679 	delete_insn (insn);
15680       asm_fprintf (file, "\thint\t34 // bti c\n");
15681     }
15682 
15683   default_print_patchable_function_entry (file, patch_area_size, record_p);
15684 }
15685 
15686 /* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */
15687 
15688 void
15689 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15690 {
15691   const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15692   const char *value = IDENTIFIER_POINTER (target);
15693   aarch64_asm_output_variant_pcs (stream, decl, name);
15694   ASM_OUTPUT_DEF (stream, name, value);
15695 }
15696 
15697 /* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
15698    function symbol references.  */
15699 
15700 void
15701 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15702 {
15703   default_elf_asm_output_external (stream, decl, name);
15704   aarch64_asm_output_variant_pcs (stream, decl, name);
15705 }
15706 
15707 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
15708 
15709 static void
15710 aarch64_start_file (void)
15711 {
15712   struct cl_target_option *default_options
15713     = TREE_TARGET_OPTION (target_option_default_node);
15714 
15715   const struct processor *default_arch
15716     = aarch64_get_arch (default_options->x_explicit_arch);
15717   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15718   std::string extension
15719     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15720 						  default_arch->flags);
15721 
15722    aarch64_last_printed_arch_string = default_arch->name + extension;
15723    aarch64_last_printed_tune_string = "";
15724    asm_fprintf (asm_out_file, "\t.arch %s\n",
15725 		aarch64_last_printed_arch_string.c_str ());
15726 
15727    default_file_start ();
15728 }
15729 
15730 /* Emit load exclusive.  */
15731 
15732 static void
15733 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15734 			     rtx mem, rtx model_rtx)
15735 {
15736   if (mode == TImode)
15737     emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
15738 						gen_highpart (DImode, rval),
15739 						mem, model_rtx));
15740   else
15741     emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15742 }
15743 
15744 /* Emit store exclusive.  */
15745 
15746 static void
15747 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15748 			      rtx mem, rtx rval, rtx model_rtx)
15749 {
15750   if (mode == TImode)
15751     emit_insn (gen_aarch64_store_exclusive_pair
15752 	       (bval, mem, operand_subword (rval, 0, 0, TImode),
15753 		operand_subword (rval, 1, 0, TImode), model_rtx));
15754   else
15755     emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
15756 }
15757 
15758 /* Mark the previous jump instruction as unlikely.  */
15759 
15760 static void
15761 aarch64_emit_unlikely_jump (rtx insn)
15762 {
15763   rtx_insn *jump = emit_jump_insn (insn);
15764   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15765 }
15766 
15767 /* We store the names of the various atomic helpers in a 5x4 array.
15768    Return the libcall function given MODE, MODEL and NAMES.  */
15769 
15770 rtx
15771 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
15772 			const atomic_ool_names *names)
15773 {
15774   memmodel model = memmodel_base (INTVAL (model_rtx));
15775   int mode_idx, model_idx;
15776 
15777   switch (mode)
15778     {
15779     case E_QImode:
15780       mode_idx = 0;
15781       break;
15782     case E_HImode:
15783       mode_idx = 1;
15784       break;
15785     case E_SImode:
15786       mode_idx = 2;
15787       break;
15788     case E_DImode:
15789       mode_idx = 3;
15790       break;
15791     case E_TImode:
15792       mode_idx = 4;
15793       break;
15794     default:
15795       gcc_unreachable ();
15796     }
15797 
15798   switch (model)
15799     {
15800     case MEMMODEL_RELAXED:
15801       model_idx = 0;
15802       break;
15803     case MEMMODEL_CONSUME:
15804     case MEMMODEL_ACQUIRE:
15805       model_idx = 1;
15806       break;
15807     case MEMMODEL_RELEASE:
15808       model_idx = 2;
15809       break;
15810     case MEMMODEL_ACQ_REL:
15811     case MEMMODEL_SEQ_CST:
15812       model_idx = 3;
15813       break;
15814     default:
15815       gcc_unreachable ();
15816     }
15817 
15818   return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
15819 				      VISIBILITY_HIDDEN);
15820 }
15821 
15822 #define DEF0(B, N) \
15823   { "__aarch64_" #B #N "_relax", \
15824     "__aarch64_" #B #N "_acq", \
15825     "__aarch64_" #B #N "_rel", \
15826     "__aarch64_" #B #N "_acq_rel" }
15827 
15828 #define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
15829 		 { NULL, NULL, NULL, NULL }
15830 #define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
15831 
15832 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
15833 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
15834 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
15835 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
15836 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
15837 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
15838 
15839 #undef DEF0
15840 #undef DEF4
15841 #undef DEF5
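
/* Reading aid: DEF0 (cas, 4) expands to the row
     { "__aarch64_cas4_relax", "__aarch64_cas4_acq",
       "__aarch64_cas4_rel", "__aarch64_cas4_acq_rel" }
   so, for instance, aarch64_atomic_ool_func called with SImode, an
   ACQUIRE memory-model rtx and &aarch64_ool_cas_names picks mode_idx 2
   and model_idx 1 and returns the libfunc for "__aarch64_cas4_acq".  */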
15842 
15843 /* Expand a compare and swap pattern.  */
15844 
15845 void
15846 aarch64_expand_compare_and_swap (rtx operands[])
15847 {
15848   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15849   machine_mode mode, r_mode;
15850 
15851   bval = operands[0];
15852   rval = operands[1];
15853   mem = operands[2];
15854   oldval = operands[3];
15855   newval = operands[4];
15856   is_weak = operands[5];
15857   mod_s = operands[6];
15858   mod_f = operands[7];
15859   mode = GET_MODE (mem);
15860 
15861   /* Normally the succ memory model must be stronger than fail, but in the
15862      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15863      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
15864   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15865       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15866     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15867 
15868   r_mode = mode;
15869   if (mode == QImode || mode == HImode)
15870     {
15871       r_mode = SImode;
15872       rval = gen_reg_rtx (r_mode);
15873     }
15874 
15875   if (TARGET_LSE)
15876     {
15877       /* The CAS insn requires oldval and rval overlap, but we need to
15878 	 have a copy of oldval saved across the operation to tell if
15879 	 the operation is successful.  */
15880       if (reg_overlap_mentioned_p (rval, oldval))
15881         rval = copy_to_mode_reg (r_mode, oldval);
15882       else
15883 	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15884 
15885       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15886 						   newval, mod_s));
15887       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15888     }
15889   else if (TARGET_OUTLINE_ATOMICS)
15890     {
15891       /* Oldval must satisfy compare afterward.  */
15892       if (!aarch64_plus_operand (oldval, mode))
15893 	oldval = force_reg (mode, oldval);
15894       rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
15895       rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
15896 				      oldval, mode, newval, mode,
15897 				      XEXP (mem, 0), Pmode);
15898       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15899     }
15900   else
15901     {
15902       /* The oldval predicate varies by mode.  Test it and force to reg.  */
15903       insn_code code = code_for_aarch64_compare_and_swap (mode);
15904       if (!insn_data[code].operand[2].predicate (oldval, mode))
15905 	oldval = force_reg (mode, oldval);
15906 
15907       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15908 				 is_weak, mod_s, mod_f));
15909       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15910     }
15911 
15912   if (r_mode != mode)
15913     rval = gen_lowpart (mode, rval);
15914   emit_move_insn (operands[1], rval);
15915 
15916   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15917   emit_insn (gen_rtx_SET (bval, x));
15918 }
15919 
15920 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15921    sequence implementing an atomic operation.  */
15922 
15923 static void
15924 aarch64_emit_post_barrier (enum memmodel model)
15925 {
15926   const enum memmodel base_model = memmodel_base (model);
15927 
15928   if (is_mm_sync (model)
15929       && (base_model == MEMMODEL_ACQUIRE
15930 	  || base_model == MEMMODEL_ACQ_REL
15931 	  || base_model == MEMMODEL_SEQ_CST))
15932     {
15933       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15934     }
15935 }
15936 
15937 /* Split a compare and swap pattern.  */
15938 
15939 void
15940 aarch64_split_compare_and_swap (rtx operands[])
15941 {
15942   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
15943   gcc_assert (epilogue_completed);
15944 
15945   rtx rval, mem, oldval, newval, scratch, x, model_rtx;
15946   machine_mode mode;
15947   bool is_weak;
15948   rtx_code_label *label1, *label2;
15949   enum memmodel model;
15950 
15951   rval = operands[0];
15952   mem = operands[1];
15953   oldval = operands[2];
15954   newval = operands[3];
15955   is_weak = (operands[4] != const0_rtx);
15956   model_rtx = operands[5];
15957   scratch = operands[7];
15958   mode = GET_MODE (mem);
15959   model = memmodel_from_int (INTVAL (model_rtx));
15960 
15961   /* When OLDVAL is zero and we want the strong version we can emit a tighter
15962     loop:
15963     .label1:
15964 	LD[A]XR	rval, [mem]
15965 	CBNZ	rval, .label2
15966 	ST[L]XR	scratch, newval, [mem]
15967 	CBNZ	scratch, .label1
15968     .label2:
15969 	CMP	rval, 0.  */
15970   bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
15971 			oldval == const0_rtx && mode != TImode);
15972 
15973   label1 = NULL;
15974   if (!is_weak)
15975     {
15976       label1 = gen_label_rtx ();
15977       emit_label (label1);
15978     }
15979   label2 = gen_label_rtx ();
15980 
15981   /* The initial load can be relaxed for a __sync operation since a final
15982      barrier will be emitted to stop code hoisting.  */
15983   if (is_mm_sync (model))
15984     aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
15985   else
15986     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15987 
15988   if (strong_zero_p)
15989     x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15990   else
15991     {
15992       rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15993       x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
15994     }
15995   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15996 			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15997   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15998 
15999   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16000 
16001   if (!is_weak)
16002     {
16003       if (aarch64_track_speculation)
16004 	{
16005 	  /* Emit an explicit compare instruction, so that we can correctly
16006 	     track the condition codes.  */
16007 	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16008 	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16009 	}
16010       else
16011 	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16012 
16013       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16014 				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16015       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16016     }
16017   else
16018     aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16019 
16020   emit_label (label2);
16021 
16022   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
16023      to set the condition flags.  If this is not used it will be removed by
16024      later passes.  */
16025   if (strong_zero_p)
16026     aarch64_gen_compare_reg (NE, rval, const0_rtx);
16027 
16028   /* Emit any final barrier needed for a __sync operation.  */
16029   if (is_mm_sync (model))
16030     aarch64_emit_post_barrier (model);
16031 }
16032 
16033 /* Split an atomic operation.  */
16034 
16035 void
16036 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16037 			 rtx value, rtx model_rtx, rtx cond)
16038 {
16039   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
16040   gcc_assert (epilogue_completed);
16041 
16042   machine_mode mode = GET_MODE (mem);
16043   machine_mode wmode = (mode == DImode ? DImode : SImode);
16044   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16045   const bool is_sync = is_mm_sync (model);
16046   rtx_code_label *label;
16047   rtx x;
16048 
16049   /* Split the atomic operation into a sequence.  */
16050   label = gen_label_rtx ();
16051   emit_label (label);
16052 
16053   if (new_out)
16054     new_out = gen_lowpart (wmode, new_out);
16055   if (old_out)
16056     old_out = gen_lowpart (wmode, old_out);
16057   else
16058     old_out = new_out;
16059   value = simplify_gen_subreg (wmode, value, mode, 0);
16060 
16061   /* The initial load can be relaxed for a __sync operation since a final
16062      barrier will be emitted to stop code hoisting.  */
16063   if (is_sync)
16064     aarch64_emit_load_exclusive (mode, old_out, mem,
16065 				 GEN_INT (MEMMODEL_RELAXED));
16066   else
16067     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16068 
16069   switch (code)
16070     {
16071     case SET:
16072       new_out = value;
16073       break;
16074 
16075     case NOT:
16076       x = gen_rtx_AND (wmode, old_out, value);
16077       emit_insn (gen_rtx_SET (new_out, x));
16078       x = gen_rtx_NOT (wmode, new_out);
16079       emit_insn (gen_rtx_SET (new_out, x));
16080       break;
16081 
16082     case MINUS:
16083       if (CONST_INT_P (value))
16084 	{
16085 	  value = GEN_INT (-INTVAL (value));
16086 	  code = PLUS;
16087 	}
16088       /* Fall through.  */
16089 
16090     default:
16091       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16092       emit_insn (gen_rtx_SET (new_out, x));
16093       break;
16094     }
16095 
16096   aarch64_emit_store_exclusive (mode, cond, mem,
16097 				gen_lowpart (mode, new_out), model_rtx);
16098 
16099   if (aarch64_track_speculation)
16100     {
16101       /* Emit an explicit compare instruction, so that we can correctly
16102 	 track the condition codes.  */
16103       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16104       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16105     }
16106   else
16107     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16108 
16109   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16110 			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16111   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16112 
16113   /* Emit any final barrier needed for a __sync operation.  */
16114   if (is_sync)
16115     aarch64_emit_post_barrier (model);
16116 }
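
/* Sketch of the resulting sequence for, say, an SImode atomic add (the
   register names are placeholders and the exact instructions depend on
   the memory model):

   .loop:
	ld[a]xr	w_old, [mem]
	add	w_new, w_old, w_value
	st[l]xr	w_cond, w_new, [mem]
	cbnz	w_cond, .loop  */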
16117 
16118 static void
16119 aarch64_init_libfuncs (void)
16120 {
16121    /* Half-precision float operations.  The compiler handles all operations
16122      with NULL libfuncs by converting to SFmode.  */
16123 
16124   /* Conversions.  */
16125   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16126   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16127 
16128   /* Arithmetic.  */
16129   set_optab_libfunc (add_optab, HFmode, NULL);
16130   set_optab_libfunc (sdiv_optab, HFmode, NULL);
16131   set_optab_libfunc (smul_optab, HFmode, NULL);
16132   set_optab_libfunc (neg_optab, HFmode, NULL);
16133   set_optab_libfunc (sub_optab, HFmode, NULL);
16134 
16135   /* Comparisons.  */
16136   set_optab_libfunc (eq_optab, HFmode, NULL);
16137   set_optab_libfunc (ne_optab, HFmode, NULL);
16138   set_optab_libfunc (lt_optab, HFmode, NULL);
16139   set_optab_libfunc (le_optab, HFmode, NULL);
16140   set_optab_libfunc (ge_optab, HFmode, NULL);
16141   set_optab_libfunc (gt_optab, HFmode, NULL);
16142   set_optab_libfunc (unord_optab, HFmode, NULL);
16143 }
16144 
16145 /* Target hook for c_mode_for_suffix.  */
16146 static machine_mode
16147 aarch64_c_mode_for_suffix (char suffix)
16148 {
16149   if (suffix == 'q')
16150     return TFmode;
16151 
16152   return VOIDmode;
16153 }
16154 
16155 /* We can only represent floating point constants which will fit in
16156    "quarter-precision" values.  These values are characterised by
16157    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16158    by:
16159 
16160    (-1)^s * (n/16) * 2^r
16161 
16162    Where:
16163      's' is the sign bit.
16164      'n' is an integer in the range 16 <= n <= 31.
16165      'r' is an integer in the range -3 <= r <= 4.  */
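
/* For example, 1.0 = (16/16) * 2^0 and 0.25 = (16/16) * 2^-2 are
   representable; the largest magnitude is (31/16) * 2^4 = 31.0 and the
   smallest nonzero magnitude is (16/16) * 2^-3 = 0.125.  Values such as
   0.1 or 0.0 are not representable.  */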
16166 
16167 /* Return true iff X can be represented by a quarter-precision
16168    floating point immediate operand.  Note, we cannot represent 0.0.  */
16169 bool
16170 aarch64_float_const_representable_p (rtx x)
16171 {
16172   /* This represents our current view of how many bits
16173      make up the mantissa.  */
16174   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16175   int exponent;
16176   unsigned HOST_WIDE_INT mantissa, mask;
16177   REAL_VALUE_TYPE r, m;
16178   bool fail;
16179 
16180   if (!CONST_DOUBLE_P (x))
16181     return false;
16182 
16183   if (GET_MODE (x) == VOIDmode
16184       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16185     return false;
16186 
16187   r = *CONST_DOUBLE_REAL_VALUE (x);
16188 
16189   /* We cannot represent infinities, NaNs or +/-zero.  We won't
16190      know if we have +zero until we analyse the mantissa, but we
16191      can reject the other invalid values.  */
16192   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16193       || REAL_VALUE_MINUS_ZERO (r))
16194     return false;
16195 
16196   /* Extract exponent.  */
16197   r = real_value_abs (&r);
16198   exponent = REAL_EXP (&r);
16199 
16200   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16201      highest (sign) bit, with a fixed binary point at bit point_pos.
16202      m1 holds the low part of the mantissa, m2 the high part.
16203      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16204      bits for the mantissa, this can fail (low bits will be lost).  */
16205   real_ldexp (&m, &r, point_pos - exponent);
16206   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16207 
16208   /* If the low part of the mantissa has bits set we cannot represent
16209      the value.  */
16210   if (w.ulow () != 0)
16211     return false;
16212   /* We have rejected the lower HOST_WIDE_INT, so update our
16213      understanding of how many bits lie in the mantissa and
16214      look only at the high HOST_WIDE_INT.  */
16215   mantissa = w.elt (1);
16216   point_pos -= HOST_BITS_PER_WIDE_INT;
16217 
16218   /* We can only represent values with a mantissa of the form 1.xxxx.  */
16219   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16220   if ((mantissa & mask) != 0)
16221     return false;
16222 
16223   /* Having filtered unrepresentable values, we may now remove all
16224      but the highest 5 bits.  */
16225   mantissa >>= point_pos - 5;
16226 
16227   /* We cannot represent the value 0.0, so reject it.  This is handled
16228      elsewhere.  */
16229   if (mantissa == 0)
16230     return false;
16231 
16232   /* Then, as bit 4 is always set, we can mask it off, leaving
16233      the mantissa in the range [0, 15].  */
16234   mantissa &= ~(1 << 4);
16235   gcc_assert (mantissa <= 15);
16236 
16237   /* GCC internally does not use IEEE754-like encoding (where normalized
16238      significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
16239      Our mantissa values are shifted 4 places to the left relative to
16240      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16241      by 5 places to correct for GCC's representation.  */
16242   exponent = 5 - exponent;
16243 
16244   return (exponent >= 0 && exponent <= 7);
16245 }
16246 
16247 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
16248    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
16249    output MOVI/MVNI, ORR or BIC immediate.  */
16250 char*
16251 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16252 				   enum simd_immediate_check which)
16253 {
16254   bool is_valid;
16255   static char templ[40];
16256   const char *mnemonic;
16257   const char *shift_op;
16258   unsigned int lane_count = 0;
16259   char element_char;
16260 
16261   struct simd_immediate_info info;
16262 
16263   /* This will return true to show const_vector is legal for use as either
16264      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16265      It will also update INFO to show how the immediate should be generated.
16266      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
16267   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16268   gcc_assert (is_valid);
16269 
16270   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16271   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16272 
16273   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16274     {
16275       gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
16276       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16277 	 move immediate path.  */
16278       if (aarch64_float_const_zero_rtx_p (info.value))
16279         info.value = GEN_INT (0);
16280       else
16281 	{
16282 	  const unsigned int buf_size = 20;
16283 	  char float_buf[buf_size] = {'\0'};
16284 	  real_to_decimal_for_mode (float_buf,
16285 				    CONST_DOUBLE_REAL_VALUE (info.value),
16286 				    buf_size, buf_size, 1, info.elt_mode);
16287 
16288 	  if (lane_count == 1)
16289 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16290 	  else
16291 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16292 		      lane_count, element_char, float_buf);
16293 	  return templ;
16294 	}
16295     }
16296 
16297   gcc_assert (CONST_INT_P (info.value));
16298 
16299   if (which == AARCH64_CHECK_MOV)
16300     {
16301       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16302       shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
16303       if (lane_count == 1)
16304 	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16305 		  mnemonic, UINTVAL (info.value));
16306       else if (info.shift)
16307 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16308 		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16309 		  element_char, UINTVAL (info.value), shift_op, info.shift);
16310       else
16311 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16312 		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16313 		  element_char, UINTVAL (info.value));
16314     }
16315   else
16316     {
16317       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
16318       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16319       if (info.shift)
16320 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16321 		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16322 		  element_char, UINTVAL (info.value), "lsl", info.shift);
16323       else
16324 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16325 		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16326 		  element_char, UINTVAL (info.value));
16327     }
16328   return templ;
16329 }
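
/* Example templates this can produce (illustrative; operand 0 is the
   destination): "movi\t%d0, 0xff" for a single 64-bit lane,
   "movi\t%0.4s, 0x1, lsl 8" for a shifted 32-bit immediate, and
   "fmov\t%0.4s, 1.0e+0" for a floating-point splat.  */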
16330 
16331 char*
16332 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16333 {
16334 
16335   /* If a floating point number was passed and we desire to use it in an
16336      integer mode do the conversion to integer.  */
16337   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16338     {
16339       unsigned HOST_WIDE_INT ival;
16340       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16341 	  gcc_unreachable ();
16342       immediate = gen_int_mode (ival, mode);
16343     }
16344 
16345   machine_mode vmode;
16346   /* Use a 64-bit mode for everything except DI/DF modes, where we use
16347      a 128-bit vector mode.  */
16348   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16349 
16350   vmode = aarch64_simd_container_mode (mode, width);
16351   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16352   return aarch64_output_simd_mov_immediate (v_op, width);
16353 }
16354 
16355 /* Return the output string to use for moving immediate CONST_VECTOR
16356    into an SVE register.  */
16357 
16358 char *
16359 aarch64_output_sve_mov_immediate (rtx const_vector)
16360 {
16361   static char templ[40];
16362   struct simd_immediate_info info;
16363   char element_char;
16364 
16365   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16366   gcc_assert (is_valid);
16367 
16368   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16369 
16370   if (info.step)
16371     {
16372       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16373 		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16374 		element_char, INTVAL (info.value), INTVAL (info.step));
16375       return templ;
16376     }
16377 
16378   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16379     {
16380       if (aarch64_float_const_zero_rtx_p (info.value))
16381 	info.value = GEN_INT (0);
16382       else
16383 	{
16384 	  const int buf_size = 20;
16385 	  char float_buf[buf_size] = {};
16386 	  real_to_decimal_for_mode (float_buf,
16387 				    CONST_DOUBLE_REAL_VALUE (info.value),
16388 				    buf_size, buf_size, 1, info.elt_mode);
16389 
16390 	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16391 		    element_char, float_buf);
16392 	  return templ;
16393 	}
16394     }
16395 
16396   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16397 	    element_char, INTVAL (info.value));
16398   return templ;
16399 }
16400 
16401 /* Return the asm format for a PTRUE instruction whose destination has
16402    mode MODE.  SUFFIX is the element size suffix.  */
16403 
16404 char *
16405 aarch64_output_ptrue (machine_mode mode, char suffix)
16406 {
16407   unsigned int nunits;
16408   static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16409   if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16410     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16411   else
16412     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16413   return buf;
16414 }
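
/* E.g. "ptrue\t%0.b, vl16" when the element count is known to be 16,
   or "ptrue\t%0.b, all" when the number of units is not a compile-time
   constant.  */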
16415 
16416 /* Split operands into moves from op[1] + op[2] into op[0].  */
16417 
16418 void
16419 aarch64_split_combinev16qi (rtx operands[3])
16420 {
16421   unsigned int dest = REGNO (operands[0]);
16422   unsigned int src1 = REGNO (operands[1]);
16423   unsigned int src2 = REGNO (operands[2]);
16424   machine_mode halfmode = GET_MODE (operands[1]);
16425   unsigned int halfregs = REG_NREGS (operands[1]);
16426   rtx destlo, desthi;
16427 
16428   gcc_assert (halfmode == V16QImode);
16429 
16430   if (src1 == dest && src2 == dest + halfregs)
16431     {
16432       /* No-op move.  Can't split to nothing; emit something.  */
16433       emit_note (NOTE_INSN_DELETED);
16434       return;
16435     }
16436 
16437   /* Preserve register attributes for variable tracking.  */
16438   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16439   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16440 			       GET_MODE_SIZE (halfmode));
16441 
16442   /* Special case of reversed high/low parts.  */
16443   if (reg_overlap_mentioned_p (operands[2], destlo)
16444       && reg_overlap_mentioned_p (operands[1], desthi))
16445     {
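      /* operands[1] and operands[2] are swapped in place using the
	 classic three-XOR trick.  */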
16446       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16447       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16448       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16449     }
16450   else if (!reg_overlap_mentioned_p (operands[2], destlo))
16451     {
16452       /* Try to avoid unnecessary moves if part of the result
16453 	 is in the right place already.  */
16454       if (src1 != dest)
16455 	emit_move_insn (destlo, operands[1]);
16456       if (src2 != dest + halfregs)
16457 	emit_move_insn (desthi, operands[2]);
16458     }
16459   else
16460     {
16461       if (src2 != dest + halfregs)
16462 	emit_move_insn (desthi, operands[2]);
16463       if (src1 != dest)
16464 	emit_move_insn (destlo, operands[1]);
16465     }
16466 }
16467 
16468 /* vec_perm support.  */
16469 
16470 struct expand_vec_perm_d
16471 {
16472   rtx target, op0, op1;
16473   vec_perm_indices perm;
16474   machine_mode vmode;
16475   unsigned int vec_flags;
16476   bool one_vector_p;
16477   bool testing_p;
16478 };
16479 
16480 /* Generate a variable permutation.  */
16481 
16482 static void
16483 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16484 {
16485   machine_mode vmode = GET_MODE (target);
16486   bool one_vector_p = rtx_equal_p (op0, op1);
16487 
16488   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16489   gcc_checking_assert (GET_MODE (op0) == vmode);
16490   gcc_checking_assert (GET_MODE (op1) == vmode);
16491   gcc_checking_assert (GET_MODE (sel) == vmode);
16492   gcc_checking_assert (TARGET_SIMD);
16493 
16494   if (one_vector_p)
16495     {
16496       if (vmode == V8QImode)
16497 	{
16498 	  /* Expand the argument to a V16QI mode by duplicating it.  */
16499 	  rtx pair = gen_reg_rtx (V16QImode);
16500 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16501 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16502 	}
16503       else
16504 	{
16505 	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16506 	}
16507     }
16508   else
16509     {
16510       rtx pair;
16511 
16512       if (vmode == V8QImode)
16513 	{
16514 	  pair = gen_reg_rtx (V16QImode);
16515 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16516 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16517 	}
16518       else
16519 	{
16520 	  pair = gen_reg_rtx (OImode);
16521 	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16522 	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16523 	}
16524     }
16525 }
16526 
16527 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16528    NELT is the number of elements in the vector.  */
16529 
16530 void
16531 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16532 			 unsigned int nelt)
16533 {
16534   machine_mode vmode = GET_MODE (target);
16535   bool one_vector_p = rtx_equal_p (op0, op1);
16536   rtx mask;
16537 
16538   /* The TBL instruction does not use a modulo index, so we must take care
16539      of that ourselves.  */
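  /* For example, with two V16QI inputs NELT is 16, so SEL is ANDed with
     a vector of 31s; with a single input the mask is 15 instead.  */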
16540   mask = aarch64_simd_gen_const_vector_dup (vmode,
16541       one_vector_p ? nelt - 1 : 2 * nelt - 1);
16542   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16543 
16544   /* For big-endian, we also need to reverse the index within the vector
16545      (but not which vector).  */
16546   if (BYTES_BIG_ENDIAN)
16547     {
16548       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
16549       if (!one_vector_p)
16550         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16551       sel = expand_simple_binop (vmode, XOR, sel, mask,
16552 				 NULL, 0, OPTAB_LIB_WIDEN);
16553     }
16554   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16555 }
16556 
16557 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
16558 
16559 static void
16560 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16561 {
16562   emit_insn (gen_rtx_SET (target,
16563 			  gen_rtx_UNSPEC (GET_MODE (target),
16564 					  gen_rtvec (2, op0, op1), code)));
16565 }
16566 
16567 /* Expand an SVE vec_perm with the given operands.  */
16568 
16569 void
16570 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16571 {
16572   machine_mode data_mode = GET_MODE (target);
16573   machine_mode sel_mode = GET_MODE (sel);
16574   /* Enforced by the pattern condition.  */
16575   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16576 
16577   /* Note: vec_perm indices are supposed to wrap when they go beyond the
16578      size of the two value vectors, i.e. the upper bits of the indices
16579      are effectively ignored.  SVE TBL instead produces 0 for any
16580      out-of-range indices, so we need to modulo all the vec_perm indices
16581      to ensure they are all in range.  */
16582   rtx sel_reg = force_reg (sel_mode, sel);
16583 
16584   /* Check if the sel only references the first values vector.  */
16585   if (GET_CODE (sel) == CONST_VECTOR
16586       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16587     {
16588       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16589       return;
16590     }
16591 
16592   /* Check if the two values vectors are the same.  */
16593   if (rtx_equal_p (op0, op1))
16594     {
16595       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16596       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16597 					 NULL, 0, OPTAB_DIRECT);
16598       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16599       return;
16600     }
16601 
16602   /* Run TBL on each value vector and combine the results.  */
16603 
16604   rtx res0 = gen_reg_rtx (data_mode);
16605   rtx res1 = gen_reg_rtx (data_mode);
16606   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16607   if (GET_CODE (sel) != CONST_VECTOR
16608       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16609     {
16610       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16611 						       2 * nunits - 1);
16612       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16613 				     NULL, 0, OPTAB_DIRECT);
16614     }
16615   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16616   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16617 				     NULL, 0, OPTAB_DIRECT);
16618   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16619   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16620     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16621   else
16622     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16623 }
16624 
16625 /* Recognize patterns suitable for the TRN instructions.  */
16626 static bool
16627 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16628 {
16629   HOST_WIDE_INT odd;
16630   poly_uint64 nelt = d->perm.length ();
16631   rtx out, in0, in1, x;
16632   machine_mode vmode = d->vmode;
16633 
16634   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16635     return false;
16636 
16637   /* Note that these are little-endian tests.
16638      We correct for big-endian later.  */
16639   if (!d->perm[0].is_constant (&odd)
16640       || (odd != 0 && odd != 1)
16641       || !d->perm.series_p (0, 2, odd, 2)
16642       || !d->perm.series_p (1, 2, nelt + odd, 2))
16643     return false;
16644 
16645   /* Success!  */
16646   if (d->testing_p)
16647     return true;
16648 
16649   in0 = d->op0;
16650   in1 = d->op1;
16651   /* We don't need a big-endian lane correction for SVE; see the comment
16652      at the head of aarch64-sve.md for details.  */
16653   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16654     {
16655       x = in0, in0 = in1, in1 = x;
16656       odd = !odd;
16657     }
16658   out = d->target;
16659 
16660   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16661 				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16662   return true;
16663 }
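
/* For instance, on V4SI (little-endian numbering) the permutation
   { 0, 4, 2, 6 } is matched as TRN1 and { 1, 5, 3, 7 } as TRN2.  */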
16664 
16665 /* Recognize patterns suitable for the UZP instructions.  */
16666 static bool
16667 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16668 {
16669   HOST_WIDE_INT odd;
16670   rtx out, in0, in1, x;
16671   machine_mode vmode = d->vmode;
16672 
16673   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16674     return false;
16675 
16676   /* Note that these are little-endian tests.
16677      We correct for big-endian later.  */
16678   if (!d->perm[0].is_constant (&odd)
16679       || (odd != 0 && odd != 1)
16680       || !d->perm.series_p (0, 1, odd, 2))
16681     return false;
16682 
16683   /* Success!  */
16684   if (d->testing_p)
16685     return true;
16686 
16687   in0 = d->op0;
16688   in1 = d->op1;
16689   /* We don't need a big-endian lane correction for SVE; see the comment
16690      at the head of aarch64-sve.md for details.  */
16691   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16692     {
16693       x = in0, in0 = in1, in1 = x;
16694       odd = !odd;
16695     }
16696   out = d->target;
16697 
16698   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16699 				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16700   return true;
16701 }
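
/* For instance, on V4SI the permutation { 0, 2, 4, 6 } is matched as
   UZP1 and { 1, 3, 5, 7 } as UZP2.  */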
16702 
16703 /* Recognize patterns suitable for the ZIP instructions.  */
16704 static bool
16705 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16706 {
16707   unsigned int high;
16708   poly_uint64 nelt = d->perm.length ();
16709   rtx out, in0, in1, x;
16710   machine_mode vmode = d->vmode;
16711 
16712   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16713     return false;
16714 
16715   /* Note that these are little-endian tests.
16716      We correct for big-endian later.  */
16717   poly_uint64 first = d->perm[0];
16718   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16719       || !d->perm.series_p (0, 2, first, 1)
16720       || !d->perm.series_p (1, 2, first + nelt, 1))
16721     return false;
16722   high = maybe_ne (first, 0U);
16723 
16724   /* Success!  */
16725   if (d->testing_p)
16726     return true;
16727 
16728   in0 = d->op0;
16729   in1 = d->op1;
16730   /* We don't need a big-endian lane correction for SVE; see the comment
16731      at the head of aarch64-sve.md for details.  */
16732   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16733     {
16734       x = in0, in0 = in1, in1 = x;
16735       high = !high;
16736     }
16737   out = d->target;
16738 
16739   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16740 				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16741   return true;
16742 }
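
/* For instance, on V4SI the permutation { 0, 4, 1, 5 } is matched as
   ZIP1 and { 2, 6, 3, 7 } as ZIP2.  */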
16743 
16744 /* Recognize patterns for the EXT insn.  */
16745 
16746 static bool
16747 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16748 {
16749   HOST_WIDE_INT location;
16750   rtx offset;
16751 
16752   /* The first element always refers to the first vector.
16753      Check if the extracted indices are increasing by one.  */
16754   if (d->vec_flags == VEC_SVE_PRED
16755       || !d->perm[0].is_constant (&location)
16756       || !d->perm.series_p (0, 1, location, 1))
16757     return false;
16758 
16759   /* Success! */
16760   if (d->testing_p)
16761     return true;
16762 
16763   /* The case where (location == 0) is a no-op for both big- and little-endian,
16764      and is removed by the mid-end at optimization levels -O1 and higher.
16765 
16766      We don't need a big-endian lane correction for SVE; see the comment
16767      at the head of aarch64-sve.md for details.  */
16768   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16769     {
16770       /* After setup, we want the high elements of the first vector (stored
16771          at the LSB end of the register), and the low elements of the second
16772          vector (stored at the MSB end of the register). So swap.  */
16773       std::swap (d->op0, d->op1);
16774       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16775 	 to_constant () is safe since this is restricted to Advanced SIMD
16776 	 vectors.  */
16777       location = d->perm.length ().to_constant () - location;
16778     }
16779 
16780   offset = GEN_INT (location);
16781   emit_set_insn (d->target,
16782 		 gen_rtx_UNSPEC (d->vmode,
16783 				 gen_rtvec (3, d->op0, d->op1, offset),
16784 				 UNSPEC_EXT));
16785   return true;
16786 }
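
/* For instance, on V4SI the permutation { 1, 2, 3, 4 } is matched as an
   EXT of the two inputs starting one element into the first vector.  */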
16787 
16788 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16789    within each 64-bit, 32-bit or 16-bit granule.  */
16790 
16791 static bool
16792 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16793 {
16794   HOST_WIDE_INT diff;
16795   unsigned int i, size, unspec;
16796   machine_mode pred_mode;
16797 
16798   if (d->vec_flags == VEC_SVE_PRED
16799       || !d->one_vector_p
16800       || !d->perm[0].is_constant (&diff)
16801       || !diff)
16802     return false;
16803 
16804   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16805   if (size == 8)
16806     {
16807       unspec = UNSPEC_REV64;
16808       pred_mode = VNx2BImode;
16809     }
16810   else if (size == 4)
16811     {
16812       unspec = UNSPEC_REV32;
16813       pred_mode = VNx4BImode;
16814     }
16815   else if (size == 2)
16816     {
16817       unspec = UNSPEC_REV16;
16818       pred_mode = VNx8BImode;
16819     }
16820   else
16821     return false;
16822 
16823   unsigned int step = diff + 1;
16824   for (i = 0; i < step; ++i)
16825     if (!d->perm.series_p (i, step, diff - i, step))
16826       return false;
16827 
16828   /* Success! */
16829   if (d->testing_p)
16830     return true;
16831 
16832   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16833   if (d->vec_flags == VEC_SVE_DATA)
16834     {
16835       rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16836       src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16837 			    UNSPEC_MERGE_PTRUE);
16838     }
16839   emit_set_insn (d->target, src);
16840   return true;
16841 }
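
/* For instance, on V4SI the permutation { 1, 0, 3, 2 } reverses the
   32-bit elements within each 64-bit granule and is matched as REV64.  */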
16842 
16843 /* Recognize patterns for the REV insn, which reverses elements within
16844    a full vector.  */
16845 
16846 static bool
16847 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16848 {
16849   poly_uint64 nelt = d->perm.length ();
16850 
16851   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16852     return false;
16853 
16854   if (!d->perm.series_p (0, 1, nelt - 1, -1))
16855     return false;
16856 
16857   /* Success! */
16858   if (d->testing_p)
16859     return true;
16860 
16861   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16862   emit_set_insn (d->target, src);
16863   return true;
16864 }
16865 
16866 static bool
16867 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16868 {
16869   rtx out = d->target;
16870   rtx in0;
16871   HOST_WIDE_INT elt;
16872   machine_mode vmode = d->vmode;
16873   rtx lane;
16874 
16875   if (d->vec_flags == VEC_SVE_PRED
16876       || d->perm.encoding ().encoded_nelts () != 1
16877       || !d->perm[0].is_constant (&elt))
16878     return false;
16879 
16880   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16881     return false;
16882 
16883   /* Success! */
16884   if (d->testing_p)
16885     return true;
16886 
16887   /* The generic preparation in aarch64_expand_vec_perm_const_1
16888      swaps the operand order and the permute indices if it finds
16889      d->perm[0] to be in the second operand.  Thus, we can always
16890      use d->op0 and need not do any extra arithmetic to get the
16891      correct lane number.  */
16892   in0 = d->op0;
16893   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
16894 
16895   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16896   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16897   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16898   return true;
16899 }
16900 
16901 static bool
16902 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16903 {
16904   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16905   machine_mode vmode = d->vmode;
16906 
16907   /* Make sure that the indices are constant.  */
16908   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16909   for (unsigned int i = 0; i < encoded_nelts; ++i)
16910     if (!d->perm[i].is_constant ())
16911       return false;
16912 
16913   if (d->testing_p)
16914     return true;
16915 
16916   /* Generic code will try constant permutation twice.  Once with the
16917      original mode and again with the elements lowered to QImode.
16918      So wait and don't do the selector expansion ourselves.  */
16919   if (vmode != V8QImode && vmode != V16QImode)
16920     return false;
16921 
16922   /* to_constant is safe since this routine is specific to Advanced SIMD
16923      vectors.  */
16924   unsigned int nelt = d->perm.length ().to_constant ();
16925   for (unsigned int i = 0; i < nelt; ++i)
16926     /* If big-endian and two vectors we end up with a weird mixed-endian
16927        mode on NEON.  Reverse the index within each word but not the word
16928        itself.  to_constant is safe because we checked is_constant above.  */
16929     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16930 			? d->perm[i].to_constant () ^ (nelt - 1)
16931 			: d->perm[i].to_constant ());
16932 
16933   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16934   sel = force_reg (vmode, sel);
16935 
16936   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16937   return true;
16938 }
16939 
16940 /* Try to implement D using an SVE TBL instruction.  */
16941 
16942 static bool
16943 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16944 {
16945   unsigned HOST_WIDE_INT nelt;
16946 
16947   /* Permuting two variable-length vectors could overflow the
16948      index range.  */
16949   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16950     return false;
16951 
16952   if (d->testing_p)
16953     return true;
16954 
16955   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16956   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16957   if (d->one_vector_p)
16958     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16959   else
16960     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16961   return true;
16962 }
16963 
16964 static bool
16965 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16966 {
16967   /* The pattern matching functions above are written to look for a small
16968      number to begin the sequence (0, 1, N/2).  If we begin with an index
16969      from the second operand, we can swap the operands.  */
16970   poly_int64 nelt = d->perm.length ();
16971   if (known_ge (d->perm[0], nelt))
16972     {
16973       d->perm.rotate_inputs (1);
16974       std::swap (d->op0, d->op1);
16975     }
16976 
16977   if ((d->vec_flags == VEC_ADVSIMD
16978        || d->vec_flags == VEC_SVE_DATA
16979        || d->vec_flags == VEC_SVE_PRED)
16980       && known_gt (nelt, 1))
16981     {
16982       if (aarch64_evpc_rev_local (d))
16983 	return true;
16984       else if (aarch64_evpc_rev_global (d))
16985 	return true;
16986       else if (aarch64_evpc_ext (d))
16987 	return true;
16988       else if (aarch64_evpc_dup (d))
16989 	return true;
16990       else if (aarch64_evpc_zip (d))
16991 	return true;
16992       else if (aarch64_evpc_uzp (d))
16993 	return true;
16994       else if (aarch64_evpc_trn (d))
16995 	return true;
16996       if (d->vec_flags == VEC_SVE_DATA)
16997 	return aarch64_evpc_sve_tbl (d);
16998       else if (d->vec_flags == VEC_ADVSIMD)
16999 	return aarch64_evpc_tbl (d);
17000     }
17001   return false;
17002 }
17003 
17004 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
17005 
17006 static bool
17007 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17008 				  rtx op1, const vec_perm_indices &sel)
17009 {
17010   struct expand_vec_perm_d d;
17011 
17012   /* Check whether the mask can be applied to a single vector.  */
17013   if (sel.ninputs () == 1
17014       || (op0 && rtx_equal_p (op0, op1)))
17015     d.one_vector_p = true;
17016   else if (sel.all_from_input_p (0))
17017     {
17018       d.one_vector_p = true;
17019       op1 = op0;
17020     }
17021   else if (sel.all_from_input_p (1))
17022     {
17023       d.one_vector_p = true;
17024       op0 = op1;
17025     }
17026   else
17027     d.one_vector_p = false;
17028 
17029   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17030 		     sel.nelts_per_input ());
17031   d.vmode = vmode;
17032   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17033   d.target = target;
17034   d.op0 = op0;
17035   d.op1 = op1;
17036   d.testing_p = !target;
17037 
17038   if (!d.testing_p)
17039     return aarch64_expand_vec_perm_const_1 (&d);
17040 
17041   rtx_insn *last = get_last_insn ();
17042   bool ret = aarch64_expand_vec_perm_const_1 (&d);
17043   gcc_assert (last == get_last_insn ());
17044 
17045   return ret;
17046 }
17047 
17048 /* Generate a byte permute mask for a register of mode MODE,
17049    which has NUNITS units.  */
17050 
17051 rtx
17052 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17053 {
17054   /* We have to reverse each vector because we don't have
17055      a permuted load that can reverse-load according to ABI rules.  */
17056   rtx mask;
17057   rtvec v = rtvec_alloc (16);
17058   unsigned int i, j;
17059   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17060 
17061   gcc_assert (BYTES_BIG_ENDIAN);
17062   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17063 
17064   for (i = 0; i < nunits; i++)
17065     for (j = 0; j < usize; j++)
17066       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17067   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17068   return force_reg (V16QImode, mask);
17069 }
17070 
17071 /* Return true if X is a valid second operand for the SVE instruction
17072    that implements integer comparison OP_CODE.  */
17073 
17074 static bool
17075 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17076 {
17077   if (register_operand (x, VOIDmode))
17078     return true;
17079 
17080   switch (op_code)
17081     {
17082     case LTU:
17083     case LEU:
17084     case GEU:
17085     case GTU:
17086       return aarch64_sve_cmp_immediate_p (x, false);
17087     case LT:
17088     case LE:
17089     case GE:
17090     case GT:
17091     case NE:
17092     case EQ:
17093       return aarch64_sve_cmp_immediate_p (x, true);
17094     default:
17095       gcc_unreachable ();
17096     }
17097 }
17098 
17099 /* Use predicated SVE instructions to implement the equivalent of:
17100 
17101      (set TARGET OP)
17102 
17103    given that PTRUE is an all-true predicate of the appropriate mode.  */
17104 
17105 static void
17106 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17107 {
17108   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17109 			       gen_rtvec (2, ptrue, op),
17110 			       UNSPEC_MERGE_PTRUE);
17111   rtx_insn *insn = emit_set_insn (target, unspec);
17112   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17113 }
17114 
17115 /* Likewise, but also clobber the condition codes.  */
17116 
17117 static void
17118 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17119 {
17120   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17121 			       gen_rtvec (2, ptrue, op),
17122 			       UNSPEC_MERGE_PTRUE);
17123   rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
17124   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17125 }
17126 
17127 /* Return the UNSPEC_COND_* code for comparison CODE.  */
17128 
17129 static unsigned int
17130 aarch64_unspec_cond_code (rtx_code code)
17131 {
17132   switch (code)
17133     {
17134     case NE:
17135       return UNSPEC_COND_NE;
17136     case EQ:
17137       return UNSPEC_COND_EQ;
17138     case LT:
17139       return UNSPEC_COND_LT;
17140     case GT:
17141       return UNSPEC_COND_GT;
17142     case LE:
17143       return UNSPEC_COND_LE;
17144     case GE:
17145       return UNSPEC_COND_GE;
17146     default:
17147       gcc_unreachable ();
17148     }
17149 }
17150 
17151 /* Emit:
17152 
17153       (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17154 
17155    where <X> is the operation associated with comparison CODE.  This form
17156    of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17157    semantics, such as when PRED might not be all-true and when comparing
17158    inactive lanes could have side effects.  */
17159 
17160 static void
17161 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17162 				  rtx pred, rtx op0, rtx op1)
17163 {
17164   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17165 			       gen_rtvec (3, pred, op0, op1),
17166 			       aarch64_unspec_cond_code (code));
17167   emit_set_insn (target, unspec);
17168 }
17169 
17170 /* Expand an SVE integer comparison using the SVE equivalent of:
17171 
17172      (set TARGET (CODE OP0 OP1)).  */
17173 
17174 void
17175 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17176 {
17177   machine_mode pred_mode = GET_MODE (target);
17178   machine_mode data_mode = GET_MODE (op0);
17179 
17180   if (!aarch64_sve_cmp_operand_p (code, op1))
17181     op1 = force_reg (data_mode, op1);
17182 
17183   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
17184   rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17185   aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17186 }
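
/* As a rough illustration of the expansion above (register numbers and the
   use of .s elements are arbitrary here), an SVE greater-than comparison of
   two VNx4SI vectors would typically become something like:

     ptrue   p0.s                        // all-true governing predicate
     cmpgt   p1.s, p0/z, z0.s, z1.s      // p1 = (z0 > z1), also sets the flags

   The REG_EQUAL note attached by the helper lets later RTL passes still see
   the result as a plain (gt op0 op1) comparison.  */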
17187 
17188 /* Emit the SVE equivalent of:
17189 
17190       (set TMP1 (CODE1 OP0 OP1))
17191       (set TMP2 (CODE2 OP0 OP1))
17192       (set TARGET (ior:PRED_MODE TMP1 TMP2))
17193 
17194    PTRUE is an all-true predicate with the same mode as TARGET.  */
17195 
17196 static void
17197 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17198 			   rtx ptrue, rtx op0, rtx op1)
17199 {
17200   machine_mode pred_mode = GET_MODE (ptrue);
17201   rtx tmp1 = gen_reg_rtx (pred_mode);
17202   aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17203 			     gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17204   rtx tmp2 = gen_reg_rtx (pred_mode);
17205   aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17206 			     gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17207   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17208 }
17209 
17210 /* Emit the SVE equivalent of:
17211 
17212       (set TMP (CODE OP0 OP1))
17213       (set TARGET (not TMP))
17214 
17215    PTRUE is an all-true predicate with the same mode as TARGET.  */
17216 
17217 static void
17218 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17219 				rtx op0, rtx op1)
17220 {
17221   machine_mode pred_mode = GET_MODE (ptrue);
17222   rtx tmp = gen_reg_rtx (pred_mode);
17223   aarch64_emit_sve_ptrue_op (tmp, ptrue,
17224 			     gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17225   aarch64_emit_unop (target, one_cmpl_optab, tmp);
17226 }
17227 
17228 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17229 
17230      (set TARGET (CODE OP0 OP1))
17231 
17232    If CAN_INVERT_P is true, the caller can also handle inverted results;
17233    return true if the result is in fact inverted.  */
17234 
17235 bool
17236 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17237 				  rtx op0, rtx op1, bool can_invert_p)
17238 {
17239   machine_mode pred_mode = GET_MODE (target);
17240   machine_mode data_mode = GET_MODE (op0);
17241 
17242   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
17243   switch (code)
17244     {
17245     case UNORDERED:
17246       /* UNORDERED has no immediate form.  */
17247       op1 = force_reg (data_mode, op1);
17248       /* fall through */
17249     case LT:
17250     case LE:
17251     case GT:
17252     case GE:
17253     case EQ:
17254     case NE:
17255       {
17256 	/* There is native support for the comparison.  */
17257 	rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17258 	aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17259 	return false;
17260       }
17261 
17262     case LTGT:
17263       /* This is a trapping operation (LT or GT).  */
17264       aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17265       return false;
17266 
17267     case UNEQ:
17268       if (!flag_trapping_math)
17269 	{
17270 	  /* This would trap for signaling NaNs.  */
17271 	  op1 = force_reg (data_mode, op1);
17272 	  aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17273 	  return false;
17274 	}
17275       /* fall through */
17276     case UNLT:
17277     case UNLE:
17278     case UNGT:
17279     case UNGE:
17280       if (flag_trapping_math)
17281 	{
17282 	  /* Work out which elements are ordered.  */
17283 	  rtx ordered = gen_reg_rtx (pred_mode);
17284 	  op1 = force_reg (data_mode, op1);
17285 	  aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17286 
17287 	  /* Test the opposite condition for the ordered elements,
17288 	     then invert the result.  */
17289 	  if (code == UNEQ)
17290 	    code = NE;
17291 	  else
17292 	    code = reverse_condition_maybe_unordered (code);
17293 	  if (can_invert_p)
17294 	    {
17295 	      aarch64_emit_sve_predicated_cond (target, code,
17296 						ordered, op0, op1);
17297 	      return true;
17298 	    }
17299 	  rtx tmp = gen_reg_rtx (pred_mode);
17300 	  aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17301 	  aarch64_emit_unop (target, one_cmpl_optab, tmp);
17302 	  return false;
17303 	}
17304       break;
17305 
17306     case ORDERED:
17307       /* ORDERED has no immediate form.  */
17308       op1 = force_reg (data_mode, op1);
17309       break;
17310 
17311     default:
17312       gcc_unreachable ();
17313     }
17314 
17315   /* There is native support for the inverse comparison.  */
17316   code = reverse_condition_maybe_unordered (code);
17317   if (can_invert_p)
17318     {
17319       rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17320       aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17321       return true;
17322     }
17323   aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17324   return false;
17325 }
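
/* As an example of the trapping-math path above, UNGE with -ftrapping-math
   first computes the ordered lanes, then tests the reversed condition (LT)
   only on those lanes, and finally inverts the result (or reports it as
   inverted when CAN_INVERT_P).  Schematically, with arbitrary register
   numbers, the emitted sequence resembles:

     ptrue   p0.s
     fcmuo   p1.s, p0/z, z0.s, z1.s      // unordered lanes
     not     p1.b, p0/z, p1.b            // ordered lanes
     fcmgt   p2.s, p1/z, z1.s, z0.s      // z0 < z1 on ordered lanes only
     not     p3.b, p0/z, p2.b            // invert; skipped if the caller
                                         // accepts an inverted result  */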
17326 
17327 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
17328    of the data being selected and CMP_MODE is the mode of the values being
17329    compared.  */
17330 
17331 void
17332 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17333 			  rtx *ops)
17334 {
17335   machine_mode pred_mode
17336     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17337 			     GET_MODE_SIZE (cmp_mode)).require ();
17338   rtx pred = gen_reg_rtx (pred_mode);
17339   if (FLOAT_MODE_P (cmp_mode))
17340     {
17341       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17342 					    ops[4], ops[5], true))
17343 	std::swap (ops[1], ops[2]);
17344     }
17345   else
17346     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17347 
17348   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17349   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17350 }
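
/* For instance, a vcond on VNx4SI data with an integer LT comparison
   effectively computes, per lane, res[i] = (a[i] < b[i]) ? x[i] : y[i]:
   the comparison above produces a predicate register and the UNSPEC_SEL
   then maps onto a single SEL instruction, roughly (register allocation
   is only illustrative):

     ptrue   p0.s
     cmpgt   p1.s, p0/z, z3.s, z2.s      // a < b, written as b > a
     sel     z0.s, p1, z4.s, z5.s        // pick x where true, y elsewhere  */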
17351 
17352 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
17353    true.  However, due to issues with register allocation it is preferable
17354    to avoid tying integer scalar and FP scalar modes.  Executing integer
17355    operations in general registers is better than treating them as scalar
17356    vector operations.  This reduces latency and avoids redundant int<->FP
17357    moves.  So tie modes if they are either the same class, or vector modes
17358    with other vector modes, vector structs or any scalar mode.  */
17359 
17360 static bool
17361 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17362 {
17363   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17364     return true;
17365 
17366   /* We specifically want to allow elements of "structure" modes to
17367      be tieable to the structure.  This more general condition allows
17368      other rarer situations too.  The reason we don't extend this to
17369      predicate modes is that there are no predicate structure modes
17370      nor any specific instructions for extracting part of a predicate
17371      register.  */
17372   if (aarch64_vector_data_mode_p (mode1)
17373       && aarch64_vector_data_mode_p (mode2))
17374     return true;
17375 
17376   /* Also allow any scalar modes with vectors.  */
17377   if (aarch64_vector_mode_supported_p (mode1)
17378       || aarch64_vector_mode_supported_p (mode2))
17379     return true;
17380 
17381   return false;
17382 }
17383 
17384 /* Return a new RTX holding the result of moving POINTER forward by
17385    AMOUNT bytes.  */
17386 
17387 static rtx
17388 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17389 {
17390   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17391 
17392   return adjust_automodify_address (pointer, GET_MODE (pointer),
17393 				    next, amount);
17394 }
17395 
17396 /* Return a new RTX holding the result of moving POINTER forward by the
17397    size of the mode it points to.  */
17398 
17399 static rtx
17400 aarch64_progress_pointer (rtx pointer)
17401 {
17402   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17403 }
17404 
17405 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17406    MODE bytes.  */
17407 
17408 static void
17409 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17410 					      machine_mode mode)
17411 {
17412   rtx reg = gen_reg_rtx (mode);
17413 
17414   /* "Cast" the pointers to the correct mode.  */
17415   *src = adjust_address (*src, mode, 0);
17416   *dst = adjust_address (*dst, mode, 0);
17417   /* Emit the memcpy.  */
17418   emit_move_insn (reg, *src);
17419   emit_move_insn (*dst, reg);
17420   /* Move the pointers forward.  */
17421   *src = aarch64_progress_pointer (*src);
17422   *dst = aarch64_progress_pointer (*dst);
17423 }
17424 
17425 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
17426    we succeed, otherwise return false.  */
17427 
17428 bool
17429 aarch64_expand_movmem (rtx *operands)
17430 {
17431   /* These need to be signed as we need to perform arithmetic on n as
17432      signed operations.  */
17433   int n, mode_bits;
17434   rtx dst = operands[0];
17435   rtx src = operands[1];
17436   rtx base;
17437   machine_mode cur_mode = BLKmode, next_mode;
17438   bool speed_p = !optimize_function_for_size_p (cfun);
17439 
17440   /* When optimizing for size, give a better estimate of the length of a
17441      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
17442      currently always require an even number of instructions, and each
17443      operation needs both a load and a store, so divide the max number by 2.  */
17444   unsigned int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17445 
17446   /* We can't do anything smart if the amount to copy is not constant.  */
17447   if (!CONST_INT_P (operands[2]))
17448     return false;
17449 
17450   unsigned HOST_WIDE_INT tmp = INTVAL (operands[2]);
17451 
17452   /* Try to keep the number of instructions low.  For all cases we will do at
17453      most two moves for the residual amount, since we'll always overlap the
17454      remainder.  */
17455   if (((tmp / 16) + (tmp % 16 ? 2 : 0)) > max_num_moves)
17456     return false;
17457 
17458   /* At this point tmp is known to fit inside an int.  */
17459   n = tmp;
17460 
17461   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17462   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17463 
17464   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17465   src = adjust_automodify_address (src, VOIDmode, base, 0);
17466 
17467   /* Convert n to bits to make the rest of the code simpler.  */
17468   n = n * BITS_PER_UNIT;
17469 
17470   /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
17471      larger than TImode, but we should not use them for loads/stores here.  */
17472   const int copy_limit = GET_MODE_BITSIZE (TImode);
17473 
17474   while (n > 0)
17475     {
17476       /* Find the largest mode in which to do the copy without over-reading
17477 	 or over-writing.  */
17478       opt_scalar_int_mode mode_iter;
17479       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17480 	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17481 	  cur_mode = mode_iter.require ();
17482 
17483       gcc_assert (cur_mode != BLKmode);
17484 
17485       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17486       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17487 
17488       n -= mode_bits;
17489 
17490       /* Do certain trailing copies as overlapping if it's going to be
17491 	 cheaper, i.e. fewer instructions to do so.  For instance, for a 15
17492 	 byte copy it's more efficient to do two overlapping 8 byte copies than
17493 	 separate 8, 4, 2 and 1 byte copies.  */
17494       if (n > 0 && n <= 8 * BITS_PER_UNIT)
17495 	{
17496 	  next_mode = smallest_mode_for_size (n, MODE_INT);
17497 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17498 	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17499 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17500 	  n = n_bits;
17501 	}
17502     }
17503 
17504   return true;
17505 }
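
/* To make the trailing-copy overlap above concrete: for a 15 byte copy the
   loop first picks DImode (the widest MODE_INT mode no larger than the 120
   remaining bits), leaving 7 bytes; since 7 bytes still need a DImode-sized
   access, both pointers are moved back by one byte and a second, overlapping
   8 byte block is copied.  With illustrative register numbers that is
   roughly:

     ldr  x2, [x1]          // bytes 0-7
     str  x2, [x0]
     ldr  x3, [x1, 7]       // bytes 7-14, overlapping byte 7
     str  x3, [x0, 7]

   i.e. four instructions instead of the eight needed for separate 8, 4, 2
   and 1 byte copies.  */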
17506 
17507 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17508    SImode stores.  Handle the case when the constant has identical
17509    bottom and top halves.  This is beneficial when the two stores can be
17510    merged into an STP and we avoid synthesising potentially expensive
17511    immediates twice.  Return true if such a split is possible.  */
17512 
17513 bool
17514 aarch64_split_dimode_const_store (rtx dst, rtx src)
17515 {
17516   rtx lo = gen_lowpart (SImode, src);
17517   rtx hi = gen_highpart_mode (SImode, DImode, src);
17518 
17519   bool size_p = optimize_function_for_size_p (cfun);
17520 
17521   if (!rtx_equal_p (lo, hi))
17522     return false;
17523 
17524   unsigned int orig_cost
17525     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17526   unsigned int lo_cost
17527     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17528 
17529   /* We want to transform:
17530      MOV	x1, 49370
17531      MOVK	x1, 0x140, lsl 16
17532      MOVK	x1, 0xc0da, lsl 32
17533      MOVK	x1, 0x140, lsl 48
17534      STR	x1, [x0]
17535    into:
17536      MOV	w1, 49370
17537      MOVK	w1, 0x140, lsl 16
17538      STP	w1, w1, [x0]
17539    So we want to perform this only when we save two instructions
17540    or more.  When optimizing for size, however, accept any code size
17541    savings we can.  */
17542   if (size_p && orig_cost <= lo_cost)
17543     return false;
17544 
17545   if (!size_p
17546       && (orig_cost <= lo_cost + 1))
17547     return false;
17548 
17549   rtx mem_lo = adjust_address (dst, SImode, 0);
17550   if (!aarch64_mem_pair_operand (mem_lo, SImode))
17551     return false;
17552 
17553   rtx tmp_reg = gen_reg_rtx (SImode);
17554   aarch64_expand_mov_immediate (tmp_reg, lo);
17555   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17556   /* Don't emit an explicit store pair as this may not always be profitable.
17557      Let the sched-fusion logic decide whether to merge them.  */
17558   emit_move_insn (mem_lo, tmp_reg);
17559   emit_move_insn (mem_hi, tmp_reg);
17560 
17561   return true;
17562 }
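
/* For the example in the comment above, the DImode constant is
   0x0140c0da0140c0da: building it needs a MOV plus three MOVKs (orig_cost 4),
   while its low half 0x0140c0da needs only a MOV plus one MOVK (lo_cost 2).
   Since 4 > 2 + 1 the split goes ahead, saving two immediate-building
   instructions, and sched-fusion can later turn the two SImode stores into
   a single STP.  */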
17563 
17564 /* Generate RTL for a conditional branch with rtx comparison CODE in
17565    mode CC_MODE.  The destination of the unlikely conditional branch
17566    is LABEL_REF.  */
17567 
17568 void
17569 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17570 			      rtx label_ref)
17571 {
17572   rtx x;
17573   x = gen_rtx_fmt_ee (code, VOIDmode,
17574 		      gen_rtx_REG (cc_mode, CC_REGNUM),
17575 		      const0_rtx);
17576 
17577   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17578 			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
17579 			    pc_rtx);
17580   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17581 }
17582 
17583 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17584 
17585    OP1 represents the TImode destination operand 1
17586    OP2 represents the TImode destination operand 2
17587    LOW_DEST represents the low half (DImode) of TImode operand 0
17588    LOW_IN1 represents the low half (DImode) of TImode operand 1
17589    LOW_IN2 represents the low half (DImode) of TImode operand 2
17590    HIGH_DEST represents the high half (DImode) of TImode operand 0
17591    HIGH_IN1 represents the high half (DImode) of TImode operand 1
17592    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
17593 
17594 void
17595 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17596 			    rtx *low_in1, rtx *low_in2,
17597 			    rtx *high_dest, rtx *high_in1,
17598 			    rtx *high_in2)
17599 {
17600   *low_dest = gen_reg_rtx (DImode);
17601   *low_in1 = gen_lowpart (DImode, op1);
17602   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17603 				  subreg_lowpart_offset (DImode, TImode));
17604   *high_dest = gen_reg_rtx (DImode);
17605   *high_in1 = gen_highpart (DImode, op1);
17606   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17607 				   subreg_highpart_offset (DImode, TImode));
17608 }
17609 
17610 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17611 
17612    This function differs from 'aarch64_addti_scratch_regs' in that
17613    OP1 can be an immediate constant (zero).  We must call
17614    subreg_highpart_offset with DImode and TImode arguments, otherwise
17615    VOIDmode will be used for the const_int, which generates an internal
17616    error from subreg_size_highpart_offset, which does not expect a size of zero.
17617 
17618    OP1 represents the TImode destination operand 1
17619    OP2 represents the TImode destination operand 2
17620    LOW_DEST represents the low half (DImode) of TImode operand 0
17621    LOW_IN1 represents the low half (DImode) of TImode operand 1
17622    LOW_IN2 represents the low half (DImode) of TImode operand 2
17623    HIGH_DEST represents the high half (DImode) of TImode operand 0
17624    HIGH_IN1 represents the high half (DImode) of TImode operand 1
17625    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
17626 
17627 
17628 void
17629 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17630 			     rtx *low_in1, rtx *low_in2,
17631 			     rtx *high_dest, rtx *high_in1,
17632 			     rtx *high_in2)
17633 {
17634   *low_dest = gen_reg_rtx (DImode);
17635   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17636 				  subreg_lowpart_offset (DImode, TImode));
17637 
17638   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17639 				  subreg_lowpart_offset (DImode, TImode));
17640   *high_dest = gen_reg_rtx (DImode);
17641 
17642   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17643 				   subreg_highpart_offset (DImode, TImode));
17644   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17645 				   subreg_highpart_offset (DImode, TImode));
17646 }
17647 
17648 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17649 
17650    OP0 represents the TImode destination operand 0
17651    LOW_DEST represents the low half (DImode) of TImode operand 0
17652    LOW_IN1 represents the low half (DImode) of TImode operand 1
17653    LOW_IN2 represents the low half (DImode) of TImode operand 2
17654    HIGH_DEST represents the high half (DImode) of TImode operand 0
17655    HIGH_IN1 represents the high half (DImode) of TImode operand 1
17656    HIGH_IN2 represents the high half (DImode) of TImode operand 2
17657    UNSIGNED_P is true if the operation is being performed on unsigned
17658    values.  */
17659 void
17660 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17661 		       rtx low_in2, rtx high_dest, rtx high_in1,
17662 		       rtx high_in2, bool unsigned_p)
17663 {
17664   if (low_in2 == const0_rtx)
17665     {
17666       low_dest = low_in1;
17667       high_in2 = force_reg (DImode, high_in2);
17668       if (unsigned_p)
17669 	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17670       else
17671 	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17672     }
17673   else
17674     {
17675       if (aarch64_plus_immediate (low_in2, DImode))
17676 	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17677 					    GEN_INT (-INTVAL (low_in2))));
17678       else
17679 	{
17680 	  low_in2 = force_reg (DImode, low_in2);
17681 	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17682 	}
17683       high_in2 = force_reg (DImode, high_in2);
17684 
17685       if (unsigned_p)
17686 	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17687       else
17688 	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17689     }
17690 
17691   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17692   emit_move_insn (gen_highpart (DImode, op0), high_dest);
17693 
17694 }
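
/* For the general case above (LOW_IN2 neither zero nor a suitable
   immediate), the generated code is essentially a SUBS on the low halves
   followed by an SBCS on the high halves, with the overflow indication
   (V for signed, C for unsigned) left in the flags for the caller's
   overflow check.  Roughly, with illustrative registers:

     subs  x0, x2, x4        // low halves, sets the borrow
     sbcs  x1, x3, x5        // high halves minus borrow, sets V/C  */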
17695 
17696 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
17697 
17698 static unsigned HOST_WIDE_INT
17699 aarch64_asan_shadow_offset (void)
17700 {
17701   return (HOST_WIDE_INT_1 << 36);
17702 }
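
/* libasan maps every 8 bytes of application memory onto one shadow byte,
   so with this offset the shadow address of ADDR is computed as

     shadow = (ADDR >> 3) + (1ULL << 36);

   e.g. an access at 0x100000000 is checked via the shadow byte at
   0x20000000 + 0x1000000000 == 0x1020000000.  */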
17703 
17704 static rtx
17705 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17706 			int code, tree treeop0, tree treeop1)
17707 {
17708   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17709   rtx op0, op1;
17710   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17711   insn_code icode;
17712   struct expand_operand ops[4];
17713 
17714   start_sequence ();
17715   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17716 
17717   op_mode = GET_MODE (op0);
17718   if (op_mode == VOIDmode)
17719     op_mode = GET_MODE (op1);
17720 
17721   switch (op_mode)
17722     {
17723     case E_QImode:
17724     case E_HImode:
17725     case E_SImode:
17726       cmp_mode = SImode;
17727       icode = CODE_FOR_cmpsi;
17728       break;
17729 
17730     case E_DImode:
17731       cmp_mode = DImode;
17732       icode = CODE_FOR_cmpdi;
17733       break;
17734 
17735     case E_SFmode:
17736       cmp_mode = SFmode;
17737       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17738       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17739       break;
17740 
17741     case E_DFmode:
17742       cmp_mode = DFmode;
17743       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17744       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17745       break;
17746 
17747     default:
17748       end_sequence ();
17749       return NULL_RTX;
17750     }
17751 
17752   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17753   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17754   if (!op0 || !op1)
17755     {
17756       end_sequence ();
17757       return NULL_RTX;
17758     }
17759   *prep_seq = get_insns ();
17760   end_sequence ();
17761 
17762   create_fixed_operand (&ops[0], op0);
17763   create_fixed_operand (&ops[1], op1);
17764 
17765   start_sequence ();
17766   if (!maybe_expand_insn (icode, 2, ops))
17767     {
17768       end_sequence ();
17769       return NULL_RTX;
17770     }
17771   *gen_seq = get_insns ();
17772   end_sequence ();
17773 
17774   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17775 			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17776 }
17777 
17778 static rtx
17779 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17780 		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
17781 {
17782   rtx op0, op1, target;
17783   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17784   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17785   insn_code icode;
17786   struct expand_operand ops[6];
17787   int aarch64_cond;
17788 
17789   push_to_sequence (*prep_seq);
17790   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17791 
17792   op_mode = GET_MODE (op0);
17793   if (op_mode == VOIDmode)
17794     op_mode = GET_MODE (op1);
17795 
17796   switch (op_mode)
17797     {
17798     case E_QImode:
17799     case E_HImode:
17800     case E_SImode:
17801       cmp_mode = SImode;
17802       icode = CODE_FOR_ccmpsi;
17803       break;
17804 
17805     case E_DImode:
17806       cmp_mode = DImode;
17807       icode = CODE_FOR_ccmpdi;
17808       break;
17809 
17810     case E_SFmode:
17811       cmp_mode = SFmode;
17812       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17813       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17814       break;
17815 
17816     case E_DFmode:
17817       cmp_mode = DFmode;
17818       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17819       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17820       break;
17821 
17822     default:
17823       end_sequence ();
17824       return NULL_RTX;
17825     }
17826 
17827   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17828   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17829   if (!op0 || !op1)
17830     {
17831       end_sequence ();
17832       return NULL_RTX;
17833     }
17834   *prep_seq = get_insns ();
17835   end_sequence ();
17836 
17837   target = gen_rtx_REG (cc_mode, CC_REGNUM);
17838   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17839 
17840   if (bit_code != AND)
17841     {
17842       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17843 						GET_MODE (XEXP (prev, 0))),
17844 			     VOIDmode, XEXP (prev, 0), const0_rtx);
17845       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17846     }
17847 
17848   create_fixed_operand (&ops[0], XEXP (prev, 0));
17849   create_fixed_operand (&ops[1], target);
17850   create_fixed_operand (&ops[2], op0);
17851   create_fixed_operand (&ops[3], op1);
17852   create_fixed_operand (&ops[4], prev);
17853   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17854 
17855   push_to_sequence (*gen_seq);
17856   if (!maybe_expand_insn (icode, 6, ops))
17857     {
17858       end_sequence ();
17859       return NULL_RTX;
17860     }
17861 
17862   *gen_seq = get_insns ();
17863   end_sequence ();
17864 
17865   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17866 }
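
/* Together these two hooks let conditions combined with && / || be expanded
   as a conditional-compare chain.  As a sketch (register allocation and the
   label are illustrative), "a == 0 && b > 3" for int a, b becomes roughly:

     cmp   w0, #0
     ccmp  w1, #3, #4, eq      // if EQ, compare b with 3; otherwise force
                               // Z=1 (nzcv #4) so that GT fails
     b.gt  .Ltaken

   For an || combination the code above instead reverses the previous
   comparison and inverts the condition used for the CCMP.  */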
17867 
17868 #undef TARGET_GEN_CCMP_FIRST
17869 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17870 
17871 #undef TARGET_GEN_CCMP_NEXT
17872 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17873 
17874 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
17875    instruction fusion of some sort.  */
17876 
17877 static bool
17878 aarch64_macro_fusion_p (void)
17879 {
17880   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17881 }
17882 
17883 
17884 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
17885    should be kept together during scheduling.  */
17886 
17887 static bool
17888 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17889 {
17890   rtx set_dest;
17891   rtx prev_set = single_set (prev);
17892   rtx curr_set = single_set (curr);
17893   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
17894   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17895 
17896   if (!aarch64_macro_fusion_p ())
17897     return false;
17898 
17899   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17900     {
17901       /* We are trying to match:
17902          prev (mov)  == (set (reg r0) (const_int imm16))
17903          curr (movk) == (set (zero_extract (reg r0)
17904                                            (const_int 16)
17905                                            (const_int 16))
17906                              (const_int imm16_1))  */
17907 
17908       set_dest = SET_DEST (curr_set);
17909 
17910       if (GET_CODE (set_dest) == ZERO_EXTRACT
17911           && CONST_INT_P (SET_SRC (curr_set))
17912           && CONST_INT_P (SET_SRC (prev_set))
17913           && CONST_INT_P (XEXP (set_dest, 2))
17914           && INTVAL (XEXP (set_dest, 2)) == 16
17915           && REG_P (XEXP (set_dest, 0))
17916           && REG_P (SET_DEST (prev_set))
17917           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17918         {
17919           return true;
17920         }
17921     }
17922 
17923   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17924     {
17925 
17926       /*  We're trying to match:
17927           prev (adrp) == (set (reg r1)
17928                               (high (symbol_ref ("SYM"))))
17929           curr (add) == (set (reg r0)
17930                              (lo_sum (reg r1)
17931                                      (symbol_ref ("SYM"))))
17932           Note that r0 need not necessarily be the same as r1, especially
17933           during pre-regalloc scheduling.  */
17934 
17935       if (satisfies_constraint_Ush (SET_SRC (prev_set))
17936           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17937         {
17938           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17939               && REG_P (XEXP (SET_SRC (curr_set), 0))
17940               && REGNO (XEXP (SET_SRC (curr_set), 0))
17941                  == REGNO (SET_DEST (prev_set))
17942               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17943                               XEXP (SET_SRC (curr_set), 1)))
17944             return true;
17945         }
17946     }
17947 
17948   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17949     {
17950 
17951       /* We're trying to match:
17952          prev (movk) == (set (zero_extract (reg r0)
17953                                            (const_int 16)
17954                                            (const_int 32))
17955                              (const_int imm16_1))
17956          curr (movk) == (set (zero_extract (reg r0)
17957                                            (const_int 16)
17958                                            (const_int 48))
17959                              (const_int imm16_2))  */
17960 
17961       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17962           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17963           && REG_P (XEXP (SET_DEST (prev_set), 0))
17964           && REG_P (XEXP (SET_DEST (curr_set), 0))
17965           && REGNO (XEXP (SET_DEST (prev_set), 0))
17966              == REGNO (XEXP (SET_DEST (curr_set), 0))
17967           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17968           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17969           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17970           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17971           && CONST_INT_P (SET_SRC (prev_set))
17972           && CONST_INT_P (SET_SRC (curr_set)))
17973         return true;
17974 
17975     }
17976   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17977     {
17978       /* We're trying to match:
17979           prev (adrp) == (set (reg r0)
17980                               (high (symbol_ref ("SYM"))))
17981           curr (ldr) == (set (reg r1)
17982                              (mem (lo_sum (reg r0)
17983                                              (symbol_ref ("SYM")))))
17984                  or
17985           curr (ldr) == (set (reg r1)
17986                              (zero_extend (mem
17987                                            (lo_sum (reg r0)
17988                                                    (symbol_ref ("SYM"))))))  */
17989       if (satisfies_constraint_Ush (SET_SRC (prev_set))
17990           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17991         {
17992           rtx curr_src = SET_SRC (curr_set);
17993 
17994           if (GET_CODE (curr_src) == ZERO_EXTEND)
17995             curr_src = XEXP (curr_src, 0);
17996 
17997           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17998               && REG_P (XEXP (XEXP (curr_src, 0), 0))
17999               && REGNO (XEXP (XEXP (curr_src, 0), 0))
18000                  == REGNO (SET_DEST (prev_set))
18001               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18002                               XEXP (SET_SRC (prev_set), 0)))
18003               return true;
18004         }
18005     }
18006 
18007   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
18008        && aarch_crypto_can_dual_issue (prev, curr))
18009     return true;
18010 
18011   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18012       && any_condjump_p (curr))
18013     {
18014       unsigned int condreg1, condreg2;
18015       rtx cc_reg_1;
18016       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18017       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18018 
18019       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18020 	  && prev
18021 	  && modified_in_p (cc_reg_1, prev))
18022 	{
18023 	  enum attr_type prev_type = get_attr_type (prev);
18024 
18025 	  /* FIXME: this misses some instructions that are considered simple
18026 	     arithmetic for ThunderX.  Simple shifts are missed here.  */
18027 	  if (prev_type == TYPE_ALUS_SREG
18028 	      || prev_type == TYPE_ALUS_IMM
18029 	      || prev_type == TYPE_LOGICS_REG
18030 	      || prev_type == TYPE_LOGICS_IMM)
18031 	    return true;
18032 	}
18033     }
18034 
18035   if (prev_set
18036       && curr_set
18037       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18038       && any_condjump_p (curr))
18039     {
18040       /* We're trying to match:
18041 	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18042 	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
18043 							 (const_int 0))
18044 						 (label_ref ("SYM"))
18045 						 (pc))  */
18046       if (SET_DEST (curr_set) == (pc_rtx)
18047 	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18048 	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18049 	  && REG_P (SET_DEST (prev_set))
18050 	  && REGNO (SET_DEST (prev_set))
18051 	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18052 	{
18053 	  /* Fuse ALU operations followed by conditional branch instruction.  */
18054 	  switch (get_attr_type (prev))
18055 	    {
18056 	    case TYPE_ALU_IMM:
18057 	    case TYPE_ALU_SREG:
18058 	    case TYPE_ADC_REG:
18059 	    case TYPE_ADC_IMM:
18060 	    case TYPE_ADCS_REG:
18061 	    case TYPE_ADCS_IMM:
18062 	    case TYPE_LOGIC_REG:
18063 	    case TYPE_LOGIC_IMM:
18064 	    case TYPE_CSEL:
18065 	    case TYPE_ADR:
18066 	    case TYPE_MOV_IMM:
18067 	    case TYPE_SHIFT_REG:
18068 	    case TYPE_SHIFT_IMM:
18069 	    case TYPE_BFM:
18070 	    case TYPE_RBIT:
18071 	    case TYPE_REV:
18072 	    case TYPE_EXTEND:
18073 	      return true;
18074 
18075 	    default:;
18076 	    }
18077 	}
18078     }
18079 
18080   return false;
18081 }
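
/* Schematically, the pairs recognized above correspond to instruction
   sequences such as (registers, immediates and symbols purely illustrative):

     mov   x0, #0x1234                adrp  x1, sym
     movk  x0, #0x5678, lsl #16       add   x0, x1, :lo12:sym

     cmp   w0, #3                     adrp  x1, sym
     b.eq  .Llabel                    ldr   x0, [x1, :lo12:sym]

   Keeping such a pair adjacent lets cores that implement the corresponding
   fusion treat it as a single macro-op.  */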
18082 
18083 /* Return true iff the instruction fusion described by OP is enabled.  */
18084 
18085 bool
18086 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18087 {
18088   return (aarch64_tune_params.fusible_ops & op) != 0;
18089 }
18090 
18091 /* If MEM is in the form of [base+offset], extract the two parts
18092    of the address and store them in BASE and OFFSET, otherwise return false
18093    after clearing BASE and OFFSET.  */
18094 
18095 bool
18096 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18097 {
18098   rtx addr;
18099 
18100   gcc_assert (MEM_P (mem));
18101 
18102   addr = XEXP (mem, 0);
18103 
18104   if (REG_P (addr))
18105     {
18106       *base = addr;
18107       *offset = const0_rtx;
18108       return true;
18109     }
18110 
18111   if (GET_CODE (addr) == PLUS
18112       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18113     {
18114       *base = XEXP (addr, 0);
18115       *offset = XEXP (addr, 1);
18116       return true;
18117     }
18118 
18119   *base = NULL_RTX;
18120   *offset = NULL_RTX;
18121 
18122   return false;
18123 }
18124 
18125 /* Types for scheduling fusion.  */
18126 enum sched_fusion_type
18127 {
18128   SCHED_FUSION_NONE = 0,
18129   SCHED_FUSION_LD_SIGN_EXTEND,
18130   SCHED_FUSION_LD_ZERO_EXTEND,
18131   SCHED_FUSION_LD,
18132   SCHED_FUSION_ST,
18133   SCHED_FUSION_NUM
18134 };
18135 
18136 /* If INSN is a load or store of address in the form of [base+offset],
18137    extract the two parts and store them in BASE and OFFSET.  Return the
18138    scheduling fusion type of this INSN.  */
18139 
18140 static enum sched_fusion_type
18141 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18142 {
18143   rtx x, dest, src;
18144   enum sched_fusion_type fusion = SCHED_FUSION_LD;
18145 
18146   gcc_assert (INSN_P (insn));
18147   x = PATTERN (insn);
18148   if (GET_CODE (x) != SET)
18149     return SCHED_FUSION_NONE;
18150 
18151   src = SET_SRC (x);
18152   dest = SET_DEST (x);
18153 
18154   machine_mode dest_mode = GET_MODE (dest);
18155 
18156   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18157     return SCHED_FUSION_NONE;
18158 
18159   if (GET_CODE (src) == SIGN_EXTEND)
18160     {
18161       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18162       src = XEXP (src, 0);
18163       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18164 	return SCHED_FUSION_NONE;
18165     }
18166   else if (GET_CODE (src) == ZERO_EXTEND)
18167     {
18168       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18169       src = XEXP (src, 0);
18170       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18171 	return SCHED_FUSION_NONE;
18172     }
18173 
18174   if (GET_CODE (src) == MEM && REG_P (dest))
18175     extract_base_offset_in_addr (src, base, offset);
18176   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18177     {
18178       fusion = SCHED_FUSION_ST;
18179       extract_base_offset_in_addr (dest, base, offset);
18180     }
18181   else
18182     return SCHED_FUSION_NONE;
18183 
18184   if (*base == NULL_RTX || *offset == NULL_RTX)
18185     fusion = SCHED_FUSION_NONE;
18186 
18187   return fusion;
18188 }
18189 
18190 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18191 
18192    Currently we only support fusing ldr and str instructions, so FUSION_PRI
18193    and PRI are only calculated for these instructions.  For other instructions,
18194    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
18195    types of instruction fusion can be added by returning different priorities.
18196 
18197    It's important that irrelevant instructions get the largest FUSION_PRI.  */
18198 
18199 static void
18200 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18201 			       int *fusion_pri, int *pri)
18202 {
18203   int tmp, off_val;
18204   rtx base, offset;
18205   enum sched_fusion_type fusion;
18206 
18207   gcc_assert (INSN_P (insn));
18208 
18209   tmp = max_pri - 1;
18210   fusion = fusion_load_store (insn, &base, &offset);
18211   if (fusion == SCHED_FUSION_NONE)
18212     {
18213       *pri = tmp;
18214       *fusion_pri = tmp;
18215       return;
18216     }
18217 
18218   /* Set FUSION_PRI according to fusion type and base register.  */
18219   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18220 
18221   /* Calculate PRI.  */
18222   tmp /= 2;
18223 
18224   /* INSN with smaller offset goes first.  */
18225   off_val = (int)(INTVAL (offset));
18226   if (off_val >= 0)
18227     tmp -= (off_val & 0xfffff);
18228   else
18229     tmp += ((- off_val) & 0xfffff);
18230 
18231   *pri = tmp;
18232   return;
18233 }
18234 
18235 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18236    Adjust priority of sha1h instructions so they are scheduled before
18237    other SHA1 instructions.  */
18238 
18239 static int
18240 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18241 {
18242   rtx x = PATTERN (insn);
18243 
18244   if (GET_CODE (x) == SET)
18245     {
18246       x = SET_SRC (x);
18247 
18248       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18249 	return priority + 10;
18250     }
18251 
18252   return priority;
18253 }
18254 
18255 /* Given OPERANDS of consecutive load/store, check if we can merge
18256    them into ldp/stp.  LOAD is true if they are load instructions.
18257    MODE is the mode of memory operands.  */
18258 
18259 bool
18260 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18261 				machine_mode mode)
18262 {
18263   HOST_WIDE_INT offval_1, offval_2, msize;
18264   enum reg_class rclass_1, rclass_2;
18265   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18266 
18267   if (load)
18268     {
18269       mem_1 = operands[1];
18270       mem_2 = operands[3];
18271       reg_1 = operands[0];
18272       reg_2 = operands[2];
18273       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18274       if (REGNO (reg_1) == REGNO (reg_2))
18275 	return false;
18276     }
18277   else
18278     {
18279       mem_1 = operands[0];
18280       mem_2 = operands[2];
18281       reg_1 = operands[1];
18282       reg_2 = operands[3];
18283     }
18284 
18285   /* The mems cannot be volatile.  */
18286   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18287     return false;
18288 
18289   /* If we have SImode and slow unaligned ldp,
18290      check that the alignment is at least 8 bytes.  */
18291   if (mode == SImode
18292       && (aarch64_tune_params.extra_tuning_flags
18293           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18294       && !optimize_size
18295       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18296     return false;
18297 
18298   /* Check if the addresses are in the form of [base+offset].  */
18299   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18300   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18301     return false;
18302   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18303   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18304     return false;
18305 
18306   /* Check if the bases are the same.  */
18307   if (!rtx_equal_p (base_1, base_2))
18308     return false;
18309 
18310   /* The operands must be of the same size.  */
18311   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18312 			 GET_MODE_SIZE (GET_MODE (mem_2))));
18313 
18314   offval_1 = INTVAL (offset_1);
18315   offval_2 = INTVAL (offset_2);
18316   /* We should only be trying this for fixed-sized modes.  There is no
18317      SVE LDP/STP instruction.  */
18318   msize = GET_MODE_SIZE (mode).to_constant ();
18319   /* Check if the offsets are consecutive.  */
18320   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18321     return false;
18322 
18323   /* Check if the addresses are clobbered by load.  */
18324   if (load)
18325     {
18326       if (reg_mentioned_p (reg_1, mem_1))
18327 	return false;
18328 
18329       /* In increasing order, the last load can clobber the address.  */
18330       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18331 	return false;
18332     }
18333 
18334   /* One of the memory accesses must be a mempair operand.
18335      If it is not the first one, they need to be swapped by the
18336      peephole.  */
18337   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18338        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18339     return false;
18340 
18341   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18342     rclass_1 = FP_REGS;
18343   else
18344     rclass_1 = GENERAL_REGS;
18345 
18346   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18347     rclass_2 = FP_REGS;
18348   else
18349     rclass_2 = GENERAL_REGS;
18350 
18351   /* Check if the registers are of the same class.  */
18352   if (rclass_1 != rclass_2)
18353     return false;
18354 
18355   return true;
18356 }
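
/* For example, two SImode loads

     ldr  w0, [x2]
     ldr  w1, [x2, 4]

   satisfy the checks above and can become "ldp w0, w1, [x2]", whereas loads
   from [x2] and [x2, 8] (offsets not consecutive) or two loads into the same
   destination register are rejected.  */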
18357 
18358 /* Given OPERANDS of consecutive load/store that can be merged,
18359    swap them if they are not in ascending order.  */
18360 void
18361 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18362 {
18363   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18364   HOST_WIDE_INT offval_1, offval_2;
18365 
18366   if (load)
18367     {
18368       mem_1 = operands[1];
18369       mem_2 = operands[3];
18370     }
18371   else
18372     {
18373       mem_1 = operands[0];
18374       mem_2 = operands[2];
18375     }
18376 
18377   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18378   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18379 
18380   offval_1 = INTVAL (offset_1);
18381   offval_2 = INTVAL (offset_2);
18382 
18383   if (offval_1 > offval_2)
18384     {
18385       /* Irrespective of whether this is a load or a store,
18386 	 we do the same swap.  */
18387       std::swap (operands[0], operands[2]);
18388       std::swap (operands[1], operands[3]);
18389     }
18390 }
18391 
18392 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18393    comparison between the two.  */
18394 int
18395 aarch64_host_wide_int_compare (const void *x, const void *y)
18396 {
18397   return wi::cmps (* ((const HOST_WIDE_INT *) x),
18398 		   * ((const HOST_WIDE_INT *) y));
18399 }
18400 
18401 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18402    other pointing to a REG rtx containing an offset, compare the offsets
18403    of the two pairs.
18404 
18405    Return:
18406 
18407 	1 iff offset (X) > offset (Y)
18408 	0 iff offset (X) == offset (Y)
18409 	-1 iff offset (X) < offset (Y)  */
18410 int
18411 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18412 {
18413   const rtx * operands_1 = (const rtx *) x;
18414   const rtx * operands_2 = (const rtx *) y;
18415   rtx mem_1, mem_2, base, offset_1, offset_2;
18416 
18417   if (MEM_P (operands_1[0]))
18418     mem_1 = operands_1[0];
18419   else
18420     mem_1 = operands_1[1];
18421 
18422   if (MEM_P (operands_2[0]))
18423     mem_2 = operands_2[0];
18424   else
18425     mem_2 = operands_2[1];
18426 
18427   /* Extract the offsets.  */
18428   extract_base_offset_in_addr (mem_1, &base, &offset_1);
18429   extract_base_offset_in_addr (mem_2, &base, &offset_2);
18430 
18431   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18432 
18433   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18434 }
18435 
18436 /* Given OPERANDS of consecutive load/store, check if we can merge
18437    them into ldp/stp by adjusting the offset.  LOAD is true if they
18438    are load instructions.  MODE is the mode of memory operands.
18439 
18440    Given below consecutive stores:
18441 
18442      str  w1, [xb, 0x100]
18443      str  w1, [xb, 0x104]
18444      str  w1, [xb, 0x108]
18445      str  w1, [xb, 0x10c]
18446 
18447    Though the offsets are out of the range supported by stp, we can
18448    still pair them after adjusting the offset, like:
18449 
18450      add  scratch, xb, 0x100
18451      stp  w1, w1, [scratch]
18452      stp  w1, w1, [scratch, 0x8]
18453 
18454    The peephole patterns detecting this opportunity should guarantee
18455    the scratch register is available.  */
18456 
18457 bool
18458 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18459 				       scalar_mode mode)
18460 {
18461   const int num_insns = 4;
18462   enum reg_class rclass;
18463   HOST_WIDE_INT offvals[num_insns], msize;
18464   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18465 
18466   if (load)
18467     {
18468       for (int i = 0; i < num_insns; i++)
18469 	{
18470 	  reg[i] = operands[2 * i];
18471 	  mem[i] = operands[2 * i + 1];
18472 
18473 	  gcc_assert (REG_P (reg[i]));
18474 	}
18475 
18476       /* Do not attempt to merge the loads if the loads clobber each other.  */
18477       for (int i = 0; i < 8; i += 2)
18478 	for (int j = i + 2; j < 8; j += 2)
18479 	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
18480 	    return false;
18481     }
18482   else
18483     for (int i = 0; i < num_insns; i++)
18484       {
18485 	mem[i] = operands[2 * i];
18486 	reg[i] = operands[2 * i + 1];
18487       }
18488 
18489   /* Skip if memory operand is by itself valid for ldp/stp.  */
18490   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18491     return false;
18492 
18493   for (int i = 0; i < num_insns; i++)
18494     {
18495       /* The mems cannot be volatile.  */
18496       if (MEM_VOLATILE_P (mem[i]))
18497 	return false;
18498 
18499       /* Check if the addresses are in the form of [base+offset].  */
18500       extract_base_offset_in_addr (mem[i], base + i, offset + i);
18501       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18502 	return false;
18503     }
18504 
18505   /* Check if the registers are of the same class.  */
18506   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18507     ? FP_REGS : GENERAL_REGS;
18508 
18509   for (int i = 1; i < num_insns; i++)
18510     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18511       {
18512 	if (rclass != FP_REGS)
18513 	  return false;
18514       }
18515     else
18516       {
18517 	if (rclass != GENERAL_REGS)
18518 	  return false;
18519       }
18520 
18521   /* Only the last register in the order in which they occur
18522      may be clobbered by the load.  */
18523   if (rclass == GENERAL_REGS && load)
18524     for (int i = 0; i < num_insns - 1; i++)
18525       if (reg_mentioned_p (reg[i], mem[i]))
18526 	return false;
18527 
18528   /* Check if the bases are the same.  */
18529   for (int i = 0; i < num_insns - 1; i++)
18530     if (!rtx_equal_p (base[i], base[i + 1]))
18531       return false;
18532 
18533   for (int i = 0; i < num_insns; i++)
18534     offvals[i] = INTVAL (offset[i]);
18535 
18536   msize = GET_MODE_SIZE (mode);
18537 
18538   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
18539   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18540 	 aarch64_host_wide_int_compare);
18541 
18542   if (!(offvals[1] == offvals[0] + msize
18543 	&& offvals[3] == offvals[2] + msize))
18544     return false;
18545 
18546   /* Check that offsets are within range of each other.  The ldp/stp
18547      instructions have 7 bit immediate offsets, so use 0x80.  */
18548   if (offvals[2] - offvals[0] >= msize * 0x80)
18549     return false;
18550 
18551   /* The offsets must be aligned with respect to each other.  */
18552   if (offvals[0] % msize != offvals[2] % msize)
18553     return false;
18554 
18555   /* If we have SImode and slow unaligned ldp,
18556      check that the alignment is at least 8 bytes.  */
18557   if (mode == SImode
18558       && (aarch64_tune_params.extra_tuning_flags
18559 	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18560       && !optimize_size
18561       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18562     return false;
18563 
18564   return true;
18565 }
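
/* To put numbers on the range check above: for SImode msize is 4, so after
   sorting, the four offsets must form two adjacent pairs (off[1] == off[0] + 4
   and off[3] == off[2] + 4), and the two pairs must lie within 4 * 0x80 == 512
   bytes of each other so that both LDP/STPs can be addressed from a single
   adjusted base.  */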
18566 
18567 /* Given OPERANDS of consecutive load/store, this function pairs them
18568    into LDP/STP after adjusting the offset.  It depends on the fact
18569    that the operands can be sorted so the offsets are correct for STP.
18570    MODE is the mode of memory operands.  CODE is the rtl operator
18571    which should be applied to all memory operands, it's SIGN_EXTEND,
18572    ZERO_EXTEND or UNKNOWN.  */
18573 
18574 bool
18575 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18576 			     scalar_mode mode, RTX_CODE code)
18577 {
18578   rtx base, offset_1, offset_3, t1, t2;
18579   rtx mem_1, mem_2, mem_3, mem_4;
18580   rtx temp_operands[8];
18581   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18582 		stp_off_upper_limit, stp_off_lower_limit, msize;
18583 
18584   /* We make changes on a copy as we may still bail out.  */
18585   for (int i = 0; i < 8; i ++)
18586     temp_operands[i] = operands[i];
18587 
18588   /* Sort the operands.  */
18589   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18590 
18591   /* Copy the memory operands so that if we have to bail for some
18592      reason the original addresses are unchanged.  */
18593   if (load)
18594     {
18595       mem_1 = copy_rtx (temp_operands[1]);
18596       mem_2 = copy_rtx (temp_operands[3]);
18597       mem_3 = copy_rtx (temp_operands[5]);
18598       mem_4 = copy_rtx (temp_operands[7]);
18599     }
18600   else
18601     {
18602       mem_1 = copy_rtx (temp_operands[0]);
18603       mem_2 = copy_rtx (temp_operands[2]);
18604       mem_3 = copy_rtx (temp_operands[4]);
18605       mem_4 = copy_rtx (temp_operands[6]);
18606       gcc_assert (code == UNKNOWN);
18607     }
18608 
18609   extract_base_offset_in_addr (mem_1, &base, &offset_1);
18610   extract_base_offset_in_addr (mem_3, &base, &offset_3);
18611   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18612 	      && offset_3 != NULL_RTX);
18613 
18614   /* Adjust offset so it can fit in LDP/STP instruction.  */
18615   msize = GET_MODE_SIZE (mode);
18616   stp_off_upper_limit = msize * (0x40 - 1);
18617   stp_off_lower_limit = - msize * 0x40;
18618 
18619   off_val_1 = INTVAL (offset_1);
18620   off_val_3 = INTVAL (offset_3);
18621 
18622   /* The base offset is optimally half way between the two STP/LDP offsets.  */
18623   if (msize <= 4)
18624     base_off = (off_val_1 + off_val_3) / 2;
18625   else
18626     /* However, due to issues with negative LDP/STP offset generation for
18627        larger modes (DF, DI and vector modes), we must not use negative
18628        addresses smaller than what 9 signed unadjusted bits can store.  This
18629        provides the most range in this case.  */
18630     base_off = off_val_1;
18631 
18632   /* Adjust the base so that it is aligned with the addresses but still
18633      optimal.  */
18634   if (base_off % msize != off_val_1 % msize)
18635     /* Fix the offset, bearing in mind we want to make it bigger not
18636        smaller.  */
18637     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18638   else if (msize <= 4)
18639     /* The negative range of LDP/STP is one larger than the positive range.  */
18640     base_off += msize;
18641 
18642   /* Check if base offset is too big or too small.  We can attempt to resolve
18643      this issue by setting it to the maximum value and seeing if the offsets
18644      still fit.  */
18645   if (base_off >= 0x1000)
18646     {
18647       base_off = 0x1000 - 1;
18648       /* We must still make sure that the base offset is aligned with respect
18649 	 to the address.  But it may not be made any bigger.  */
18650       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18651     }
18652 
18653   /* Likewise for the case where the base is too small.  */
18654   if (base_off <= -0x1000)
18655     {
18656       base_off = -0x1000 + 1;
18657       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18658     }
18659 
18660   /* Offset of the first STP/LDP.  */
18661   new_off_1 = off_val_1 - base_off;
18662 
18663   /* Offset of the second STP/LDP.  */
18664   new_off_3 = off_val_3 - base_off;
18665 
18666   /* The offsets must be within the range of the LDP/STP instructions.  */
18667   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18668       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18669     return false;
18670 
18671   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18672 						  new_off_1), true);
18673   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18674 						  new_off_1 + msize), true);
18675   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18676 						  new_off_3), true);
18677   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18678 						  new_off_3 + msize), true);
18679 
18680   if (!aarch64_mem_pair_operand (mem_1, mode)
18681       || !aarch64_mem_pair_operand (mem_3, mode))
18682     return false;
18683 
18684   if (code == ZERO_EXTEND)
18685     {
18686       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18687       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18688       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18689       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18690     }
18691   else if (code == SIGN_EXTEND)
18692     {
18693       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18694       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18695       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18696       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18697     }
18698 
18699   if (load)
18700     {
18701       operands[0] = temp_operands[0];
18702       operands[1] = mem_1;
18703       operands[2] = temp_operands[2];
18704       operands[3] = mem_2;
18705       operands[4] = temp_operands[4];
18706       operands[5] = mem_3;
18707       operands[6] = temp_operands[6];
18708       operands[7] = mem_4;
18709     }
18710   else
18711     {
18712       operands[0] = mem_1;
18713       operands[1] = temp_operands[1];
18714       operands[2] = mem_2;
18715       operands[3] = temp_operands[3];
18716       operands[4] = mem_3;
18717       operands[5] = temp_operands[5];
18718       operands[6] = mem_4;
18719       operands[7] = temp_operands[7];
18720     }
18721 
18722   /* Emit adjusting instruction.  */
18723   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18724   /* Emit ldp/stp instructions.  */
18725   t1 = gen_rtx_SET (operands[0], operands[1]);
18726   t2 = gen_rtx_SET (operands[2], operands[3]);
18727   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18728   t1 = gen_rtx_SET (operands[4], operands[5]);
18729   t2 = gen_rtx_SET (operands[6], operands[7]);
18730   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18731   return true;
18732 }
18733 
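/* An illustrative, non-compiled sketch of the base-offset arithmetic in
   the pair-fusion code above, for a group of 4-byte (SImode) accesses at
   offsets 0, 4, 8 and 12 from the original base.  The helper name is
   hypothetical and the block is deliberately guarded out of the build.  */
#if 0
static long long
example_ldpstp_base_off (void)
{
  long long msize = 4;			/* GET_MODE_SIZE (SImode).  */
  long long off_val_1 = 0, off_val_3 = 8;

  /* For msize <= 4 the base offset starts halfway between the two
     LDP/STP offsets.  */
  long long base_off = (off_val_1 + off_val_3) / 2;

  if (base_off % msize != off_val_1 % msize)
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else
    /* The negative range of LDP/STP is one larger than the positive one.  */
    base_off += msize;

  /* base_off is now 8, so the two pairs use offsets -8/-4 and 0/4 from
     the adjusted base, well inside the SImode LDP/STP range of
     [-256, 252].  */
  return base_off;
}
#endif
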
18734 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
18735    it isn't worth branching around empty masked ops (including masked
18736    stores).  */
18737 
18738 static bool
18739 aarch64_empty_mask_is_expensive (unsigned)
18740 {
18741   return false;
18742 }
18743 
18744 /* Return true if a pseudo register should be created and used to hold
18745    the GOT address for PIC code.  */
18746 
18747 bool
18748 aarch64_use_pseudo_pic_reg (void)
18749 {
18750   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18751 }
18752 
18753 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
18754 
18755 static int
18756 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18757 {
18758   switch (XINT (x, 1))
18759     {
18760     case UNSPEC_GOTSMALLPIC:
18761     case UNSPEC_GOTSMALLPIC28K:
18762     case UNSPEC_GOTTINYPIC:
18763       return 0;
18764     default:
18765       break;
18766     }
18767 
18768   return default_unspec_may_trap_p (x, flags);
18769 }
18770 
18771 
18772 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18773    return the log2 of that value.  Otherwise return -1.  */
18774 
18775 int
18776 aarch64_fpconst_pow_of_2 (rtx x)
18777 {
18778   const REAL_VALUE_TYPE *r;
18779 
18780   if (!CONST_DOUBLE_P (x))
18781     return -1;
18782 
18783   r = CONST_DOUBLE_REAL_VALUE (x);
18784 
18785   if (REAL_VALUE_NEGATIVE (*r)
18786       || REAL_VALUE_ISNAN (*r)
18787       || REAL_VALUE_ISINF (*r)
18788       || !real_isinteger (r, DFmode))
18789     return -1;
18790 
18791   return exact_log2 (real_to_integer (r));
18792 }
18793 
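/* For example, the function above maps 4.0 to 2 and 1.0 to 0, while 0.5
   (not an integer), 3.0 (an integer but not a power of two) and -2.0
   (negative) all yield -1.  */
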
18794 /* If X is a vector of equal CONST_DOUBLE values and that value is
18795    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
18796 
18797 int
18798 aarch64_vec_fpconst_pow_of_2 (rtx x)
18799 {
18800   int nelts;
18801   if (GET_CODE (x) != CONST_VECTOR
18802       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18803     return -1;
18804 
18805   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18806     return -1;
18807 
18808   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18809   if (firstval <= 0)
18810     return -1;
18811 
18812   for (int i = 1; i < nelts; i++)
18813     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18814       return -1;
18815 
18816   return firstval;
18817 }
18818 
18819 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18820    to float.
18821 
18822    __fp16 always promotes through this hook.
18823    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18824    through the generic excess precision logic rather than here.  */
18825 
18826 static tree
18827 aarch64_promoted_type (const_tree t)
18828 {
18829   if (SCALAR_FLOAT_TYPE_P (t)
18830       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18831     return float_type_node;
18832 
18833   return NULL_TREE;
18834 }
18835 
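/* An illustrative user-level example, guarded out of the build: both
   __fp16 operands below are promoted to float by the hook above, so the
   addition is carried out in float and the result is converted back to
   __fp16 for the return value.  */
#if 0
__fp16
example_fp16_add (__fp16 a, __fp16 b)
{
  return a + b;
}
#endif
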
18836 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
18837 
18838 static bool
18839 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18840 			   optimization_type opt_type)
18841 {
18842   switch (op)
18843     {
18844     case rsqrt_optab:
18845       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18846 
18847     default:
18848       return true;
18849     }
18850 }
18851 
18852 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
18853 
18854 static unsigned int
18855 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18856 					int *offset)
18857 {
18858   /* Polynomial invariant 1 == (VG / 2) - 1.  */
18859   gcc_assert (i == 1);
18860   *factor = 2;
18861   *offset = 1;
18862   return AARCH64_DWARF_VG;
18863 }
18864 
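/* As a worked example: for 256-bit SVE vectors VG (the number of 64-bit
   granules in a vector) is 4, so the invariant above evaluates to
   4 / 2 - 1 == 1, and a poly_int such as 16 + 16x corresponds to
   16 + 16 * 1 == 32 bytes.  */
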
18865 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18866    if MODE is HFmode, and punt to the generic implementation otherwise.  */
18867 
18868 static bool
18869 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18870 {
18871   return (mode == HFmode
18872 	  ? true
18873 	  : default_libgcc_floating_mode_supported_p (mode));
18874 }
18875 
18876 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18877    if MODE is HFmode, and punt to the generic implementation otherwise.  */
18878 
18879 static bool
18880 aarch64_scalar_mode_supported_p (scalar_mode mode)
18881 {
18882   return (mode == HFmode
18883 	  ? true
18884 	  : default_scalar_mode_supported_p (mode));
18885 }
18886 
18887 /* Set the value of FLT_EVAL_METHOD.
18888    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18889 
18890     0: evaluate all operations and constants, whose semantic type has at
18891        most the range and precision of type float, to the range and
18892        precision of float; evaluate all other operations and constants to
18893        the range and precision of the semantic type;
18894 
18895     N, where _FloatN is a supported interchange floating type:
18896        evaluate all operations and constants, whose semantic type has at
18897        most the range and precision of _FloatN type, to the range and
18898        precision of the _FloatN type; evaluate all other operations and
18899        constants to the range and precision of the semantic type;
18900 
18901    If we have the ARMv8.2-A extensions then we support _Float16 in native
18902    precision, so we should set this to 16.  Otherwise, we support the type,
18903    but want to evaluate expressions in float precision, so set this to
18904    0.  */
18905 
18906 static enum flt_eval_method
18907 aarch64_excess_precision (enum excess_precision_type type)
18908 {
18909   switch (type)
18910     {
18911       case EXCESS_PRECISION_TYPE_FAST:
18912       case EXCESS_PRECISION_TYPE_STANDARD:
18913 	/* We can calculate either in 16-bit range and precision or
18914 	   32-bit range and precision.  Make that decision based on whether
18915 	   we have native support for the ARMv8.2-A 16-bit floating-point
18916 	   instructions or not.  */
18917 	return (TARGET_FP_F16INST
18918 		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18919 		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18920       case EXCESS_PRECISION_TYPE_IMPLICIT:
18921 	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18922       default:
18923 	gcc_unreachable ();
18924     }
18925   return FLT_EVAL_METHOD_UNPREDICTABLE;
18926 }
18927 
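/* An illustrative user-level example, guarded out of the build: with the
   ARMv8.2-A 16-bit floating-point instructions available FLT_EVAL_METHOD
   is 16 and the multiplication below is performed directly in _Float16;
   without them it is evaluated in float and converted back for the
   return value.  */
#if 0
_Float16
example_fp16_mul (_Float16 x, _Float16 y)
{
  return x * y;
}
#endif
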
18928 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
18929    scheduled for speculative execution.  Reject the long-running division
18930    and square-root instructions.  */
18931 
18932 static bool
18933 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18934 {
18935   switch (get_attr_type (insn))
18936     {
18937       case TYPE_SDIV:
18938       case TYPE_UDIV:
18939       case TYPE_FDIVS:
18940       case TYPE_FDIVD:
18941       case TYPE_FSQRTS:
18942       case TYPE_FSQRTD:
18943       case TYPE_NEON_FP_SQRT_S:
18944       case TYPE_NEON_FP_SQRT_D:
18945       case TYPE_NEON_FP_SQRT_S_Q:
18946       case TYPE_NEON_FP_SQRT_D_Q:
18947       case TYPE_NEON_FP_DIV_S:
18948       case TYPE_NEON_FP_DIV_D:
18949       case TYPE_NEON_FP_DIV_S_Q:
18950       case TYPE_NEON_FP_DIV_D_Q:
18951 	return false;
18952       default:
18953 	return true;
18954     }
18955 }
18956 
18957 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
18958 
18959 static int
18960 aarch64_compute_pressure_classes (reg_class *classes)
18961 {
18962   int i = 0;
18963   classes[i++] = GENERAL_REGS;
18964   classes[i++] = FP_REGS;
18965   /* PR_REGS isn't a useful pressure class because many predicate pseudo
18966      registers need to go in PR_LO_REGS at some point during their
18967      lifetime.  Splitting it into two halves has the effect of making
18968      all predicates count against PR_LO_REGS, so that we try whenever
18969      possible to restrict the number of live predicates to 8.  This
18970      greatly reduces the amount of spilling in certain loops.  */
18971   classes[i++] = PR_LO_REGS;
18972   classes[i++] = PR_HI_REGS;
18973   return i;
18974 }
18975 
18976 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
18977 
18978 static bool
18979 aarch64_can_change_mode_class (machine_mode from,
18980 			       machine_mode to, reg_class_t)
18981 {
18982   if (BYTES_BIG_ENDIAN)
18983     {
18984       bool from_sve_p = aarch64_sve_data_mode_p (from);
18985       bool to_sve_p = aarch64_sve_data_mode_p (to);
18986 
18987       /* Don't allow changes between SVE data modes and non-SVE modes.
18988 	 See the comment at the head of aarch64-sve.md for details.  */
18989       if (from_sve_p != to_sve_p)
18990 	return false;
18991 
18992       /* Don't allow changes in element size: lane 0 of the new vector
18993 	 would not then be lane 0 of the old vector.  See the comment
18994 	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18995 	 description.
18996 
18997 	 In the worst case, this forces a register to be spilled in
18998 	 one mode and reloaded in the other, which handles the
18999 	 endianness correctly.  */
19000       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19001 	return false;
19002     }
19003   return true;
19004 }
19005 
19006 /* Implement TARGET_SELECT_EARLY_REMAT_MODES.  */
19007 
19008 static void
19009 aarch64_select_early_remat_modes (sbitmap modes)
19010 {
19011   /* SVE values are not normally live across a call, so it should be
19012      worth doing early rematerialization even in VL-specific mode.  */
19013   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19014     {
19015       machine_mode mode = (machine_mode) i;
19016       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19017       if (vec_flags & VEC_ANY_SVE)
19018 	bitmap_set_bit (modes, i);
19019     }
19020 }
19021 
19022 /* Override the default target speculation_safe_value.  */
19023 static rtx
19024 aarch64_speculation_safe_value (machine_mode mode,
19025 				rtx result, rtx val, rtx failval)
19026 {
19027   /* Maybe we should warn if falling back to hard barriers.  They are
19028      likely to be noticeably more expensive than the alternative below.  */
19029   if (!aarch64_track_speculation)
19030     return default_speculation_safe_value (mode, result, val, failval);
19031 
19032   if (!REG_P (val))
19033     val = copy_to_mode_reg (mode, val);
19034 
19035   if (!aarch64_reg_or_zero (failval, mode))
19036     failval = copy_to_mode_reg (mode, failval);
19037 
19038   emit_insn (gen_despeculate_copy (mode, result, val, failval));
19039   return result;
19040 }
19041 
19042 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19043    Look into the tuning structure for an estimate.
19044    VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19045    Advanced SIMD 128 bits.  */
19046 
19047 static HOST_WIDE_INT
19048 aarch64_estimated_poly_value (poly_int64 val)
19049 {
19050   enum aarch64_sve_vector_bits_enum width_source
19051     = aarch64_tune_params.sve_width;
19052 
19053   /* If we still don't have an estimate, use the default.  */
19054   if (width_source == SVE_SCALABLE)
19055     return default_estimated_poly_value (val);
19056 
19057   HOST_WIDE_INT over_128 = width_source - 128;
19058   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19059 }
19060 
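/* For example, if the tuning structure reports an SVE width of 256 bits
   then over_128 is 128, and a poly_int64 of 16 + 16x (the byte size of
   one SVE vector) is estimated as 16 + 16 * 128 / 128 == 32.  */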
19061 
19062 /* Return true for types that could be supported as SIMD return or
19063    argument types.  */
19064 
19065 static bool
19066 supported_simd_type (tree t)
19067 {
19068   if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19069     {
19070       HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19071       return s == 1 || s == 2 || s == 4 || s == 8;
19072     }
19073   return false;
19074 }
19075 
19076 /* Return true for types that currently are supported as SIMD return
19077    or argument types.  */
19078 
19079 static bool
19080 currently_supported_simd_type (tree t, tree b)
19081 {
19082   if (COMPLEX_FLOAT_TYPE_P (t))
19083     return false;
19084 
19085   if (TYPE_SIZE (t) != TYPE_SIZE (b))
19086     return false;
19087 
19088   return supported_simd_type (t);
19089 }
19090 
19091 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
19092 
19093 static int
19094 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19095 					struct cgraph_simd_clone *clonei,
19096 					tree base_type, int num)
19097 {
19098   tree t, ret_type, arg_type;
19099   unsigned int elt_bits, vec_bits, count;
19100 
19101   if (!TARGET_SIMD)
19102     return 0;
19103 
19104   if (clonei->simdlen
19105       && (clonei->simdlen < 2
19106 	  || clonei->simdlen > 1024
19107 	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19108     {
19109       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19110 		  "unsupported simdlen %d", clonei->simdlen);
19111       return 0;
19112     }
19113 
19114   ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19115   if (TREE_CODE (ret_type) != VOID_TYPE
19116       && !currently_supported_simd_type (ret_type, base_type))
19117     {
19118       if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19119 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19120 		    "GCC does not currently support mixed size types "
19121 		    "for %<simd%> functions");
19122       else if (supported_simd_type (ret_type))
19123 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19124 		    "GCC does not currently support return type %qT "
19125 		    "for %<simd%> functions", ret_type);
19126       else
19127 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19128 		    "unsupported return type %qT for %<simd%> functions",
19129 		    ret_type);
19130       return 0;
19131     }
19132 
19133   int i;
19134   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
19135   bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
19136 
19137   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
19138        t && t != void_list_node; t = TREE_CHAIN (t), i++)
19139     {
19140       tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
19141 
19142       if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
19143 	  && !currently_supported_simd_type (arg_type, base_type))
19144 	{
19145 	  if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19146 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19147 			"GCC does not currently support mixed size types "
19148 			"for %<simd%> functions");
19149 	  else
19150 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19151 			"GCC does not currently support argument type %qT "
19152 			"for %<simd%> functions", arg_type);
19153 	  return 0;
19154 	}
19155     }
19156 
19157   clonei->vecsize_mangle = 'n';
19158   clonei->mask_mode = VOIDmode;
19159   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19160   if (clonei->simdlen == 0)
19161     {
19162       count = 2;
19163       vec_bits = (num == 0 ? 64 : 128);
19164       clonei->simdlen = vec_bits / elt_bits;
19165     }
19166   else
19167     {
19168       count = 1;
19169       vec_bits = clonei->simdlen * elt_bits;
19170       if (vec_bits != 64 && vec_bits != 128)
19171 	{
19172 	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19173 		      "GCC does not currently support simdlen %d for type %qT",
19174 		      clonei->simdlen, base_type);
19175 	  return 0;
19176 	}
19177     }
19178   clonei->vecsize_int = vec_bits;
19179   clonei->vecsize_float = vec_bits;
19180   return count;
19181 }
19182 
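/* An illustrative user-level example, guarded out of the build: for the
   declaration below the base type is float (elt_bits == 32) and no
   simdlen clause is given, so the hook above requests two Advanced SIMD
   clones, a 64-bit one with simdlen 2 and a 128-bit one with simdlen 4,
   both using the 'n' vecsize mangling.  */
#if 0
#pragma omp declare simd
float
example_scale (float x)
{
  return x * 2.0f;
}
#endif
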
19183 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
19184 
19185 static void
19186 aarch64_simd_clone_adjust (struct cgraph_node *node)
19187 {
19188   /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19189      use the correct ABI.  */
19190 
19191   tree t = TREE_TYPE (node->decl);
19192   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19193 					TYPE_ATTRIBUTES (t));
19194 }
19195 
19196 /* Implement TARGET_SIMD_CLONE_USABLE.  */
19197 
19198 static int
19199 aarch64_simd_clone_usable (struct cgraph_node *node)
19200 {
19201   switch (node->simdclone->vecsize_mangle)
19202     {
19203     case 'n':
19204       if (!TARGET_SIMD)
19205 	return -1;
19206       return 0;
19207     default:
19208       gcc_unreachable ();
19209     }
19210 }
19211 
19212 /* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */
19213 
19214 static int
19215 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19216 {
19217   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19218       != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19219     return 0;
19220   return 1;
19221 }
19222 
19223 /* Implement TARGET_GET_MULTILIB_ABI_NAME.  */
19224 
19225 static const char *
19226 aarch64_get_multilib_abi_name (void)
19227 {
19228   if (TARGET_BIG_END)
19229     return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19230   return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19231 }
19232 
19233 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
19234    global-variable-based guard, use the default; otherwise
19235    return a null tree.  */
19236 static tree
19237 aarch64_stack_protect_guard (void)
19238 {
19239   if (aarch64_stack_protector_guard == SSP_GLOBAL)
19240     return default_stack_protect_guard ();
19241 
19242   return NULL_TREE;
19243 }
19244 
19245 /* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
19246    section at the end if needed.  */
19247 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
19248 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
19249 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
19250 void
19251 aarch64_file_end_indicate_exec_stack ()
19252 {
19253   file_end_indicate_exec_stack ();
19254 
19255   unsigned feature_1_and = 0;
19256   if (aarch64_bti_enabled ())
19257     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19258 
19259   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19260     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19261 
19262   if (feature_1_and)
19263     {
19264       /* Generate .note.gnu.property section.  */
19265       switch_to_section (get_section (".note.gnu.property",
19266 				      SECTION_NOTYPE, NULL));
19267 
19268       /* PT_NOTE header: namesz, descsz, type.
19269 	 namesz = 4 ("GNU\0")
19270 	 descsz = 16 (Size of the program property array)
19271 		  [(12 + padding) * Number of array elements]
19272 	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
19273       assemble_align (POINTER_SIZE);
19274       assemble_integer (GEN_INT (4), 4, 32, 1);
19275       assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19276       assemble_integer (GEN_INT (5), 4, 32, 1);
19277 
19278       /* PT_NOTE name.  */
19279       assemble_string ("GNU", 4);
19280 
19281       /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19282 	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19283 	 datasz = 4
19284 	 data   = feature_1_and.  */
19285       assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19286       assemble_integer (GEN_INT (4), 4, 32, 1);
19287       assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19288 
19289       /* Pad the size of the note to the required alignment.  */
19290       assemble_align (POINTER_SIZE);
19291     }
19292 }
19293 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19294 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19295 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
19296 
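/* An illustrative sketch, guarded out of the build: the 32-bit words of
   the note emitted above for an LP64 little-endian target with both BTI
   and PAC-RET enabled, i.e. feature_1_and == 3 and
   descsz == ROUND_UP (12, 8) == 16.  */
#if 0
static const unsigned int example_gnu_property_note[] =
{
  4,		/* namesz: strlen ("GNU") + 1.  */
  16,		/* descsz.  */
  5,		/* type: NT_GNU_PROPERTY_TYPE_0.  */
  0x00554e47,	/* "GNU\0" read as a little-endian word.  */
  0xc0000000,	/* GNU_PROPERTY_AARCH64_FEATURE_1_AND.  */
  4,		/* datasz.  */
  3,		/* BTI (bit 0) | PAC (bit 1).  */
  0		/* Padding up to the pointer-size alignment.  */
};
#endif
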
19297 /* Helper function for straight line speculation.
19298    Return what barrier should be emitted for straight line speculation
19299    mitigation.
19300    When not mitigating against straight line speculation this function returns
19301    an empty string.
19302    When mitigating against straight line speculation, use:
19303    * SB when the v8.5-A SB extension is enabled.
19304    * DSB+ISB otherwise.  */
19305 const char *
19306 aarch64_sls_barrier (int mitigation_required)
19307 {
19308   return mitigation_required
19309     ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
19310     : "";
19311 }
19312 
19313 static GTY (()) tree aarch64_sls_shared_thunks[30];
19314 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
19315 const char *indirect_symbol_names[30] = {
19316     "__call_indirect_x0",
19317     "__call_indirect_x1",
19318     "__call_indirect_x2",
19319     "__call_indirect_x3",
19320     "__call_indirect_x4",
19321     "__call_indirect_x5",
19322     "__call_indirect_x6",
19323     "__call_indirect_x7",
19324     "__call_indirect_x8",
19325     "__call_indirect_x9",
19326     "__call_indirect_x10",
19327     "__call_indirect_x11",
19328     "__call_indirect_x12",
19329     "__call_indirect_x13",
19330     "__call_indirect_x14",
19331     "__call_indirect_x15",
19332     "", /* "__call_indirect_x16",  */
19333     "", /* "__call_indirect_x17",  */
19334     "__call_indirect_x18",
19335     "__call_indirect_x19",
19336     "__call_indirect_x20",
19337     "__call_indirect_x21",
19338     "__call_indirect_x22",
19339     "__call_indirect_x23",
19340     "__call_indirect_x24",
19341     "__call_indirect_x25",
19342     "__call_indirect_x26",
19343     "__call_indirect_x27",
19344     "__call_indirect_x28",
19345     "__call_indirect_x29",
19346 };
19347 
19348 /* Function to create a BLR thunk.  This thunk is used to mitigate straight
19349    line speculation.  Instead of a simple BLR that can be speculated past,
19350    we emit a BL to this thunk, and this thunk contains a BR to the relevant
19351    register.  These thunks have the relevant speculation barriers put after
19352    their indirect branch so that speculation is blocked.
19353 
19354    We use such a thunk so the speculation barriers are kept off the
19355    architecturally executed path in order to reduce the performance overhead.
19356 
19357    When optimizing for size we use stubs shared by the linked object.
19358    When optimizing for performance we emit stubs for each function in the hope
19359    that the branch predictor can better train on jumps specific for a given
19360    function.  */
19361 rtx
19362 aarch64_sls_create_blr_label (int regnum)
19363 {
19364   gcc_assert (STUB_REGNUM_P (regnum));
19365   if (optimize_function_for_size_p (cfun))
19366     {
19367       /* For the thunks shared between different functions in this compilation
19368 	 unit we use a named symbol -- this is just for users to more easily
19369 	 understand the generated assembly.  */
19370       aarch64_sls_shared_thunks_needed = true;
19371       const char *thunk_name = indirect_symbol_names[regnum];
19372       if (aarch64_sls_shared_thunks[regnum] == NULL)
19373 	{
19374 	  /* Build a decl representing this function stub and record it for
19375 	     later.  We build a decl here so we can use the GCC machinery for
19376 	     handling sections automatically (through `get_named_section` and
19377 	     `make_decl_one_only`).  That saves us a lot of trouble handling
19378 	     the specifics of different output file formats.  */
19379 	  tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
19380 				  get_identifier (thunk_name),
19381 				  build_function_type_list (void_type_node,
19382 							    NULL_TREE));
19383 	  DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
19384 					   NULL_TREE, void_type_node);
19385 	  TREE_PUBLIC (decl) = 1;
19386 	  TREE_STATIC (decl) = 1;
19387 	  DECL_IGNORED_P (decl) = 1;
19388 	  DECL_ARTIFICIAL (decl) = 1;
19389 	  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
19390 	  resolve_unique_section (decl, 0, false);
19391 	  aarch64_sls_shared_thunks[regnum] = decl;
19392 	}
19393 
19394       return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
19395     }
19396 
19397   if (cfun->machine->call_via[regnum] == NULL)
19398     cfun->machine->call_via[regnum]
19399       = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
19400   return cfun->machine->call_via[regnum];
19401 }
19402 
19403 /* Helper function for aarch64_sls_emit_blr_function_thunks and
19404    aarch64_sls_emit_shared_blr_thunks below.  */
19405 static void
19406 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
19407 {
19408   /* Save in x16 and branch to that function so this transformation does
19409      not prevent jumping to `BTI c` instructions.  */
19410   asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
19411   asm_fprintf (out_file, "\tbr\tx16\n");
19412 }
19413 
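/* An illustrative summary of the transformation when optimizing for size
   (shared stubs); the actual assembly comes from the helpers in this
   file:

     blr	x1			// original indirect call

   becomes

     bl		__call_indirect_x1	// direct call to the shared stub

   and the shared stub is emitted once per object file as

     __call_indirect_x1:
	mov	x16, x1
	br	x16
	dsb	sy
	isb

   Per-function stubs emitted when optimizing for speed end with the SB
   instruction instead, when that extension is available.  */
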
19414 /* Emit all BLR stubs for this particular function.
19415    Here we emit all the BLR stubs needed for the current function.  Since we
19416    emit these stubs in a consecutive block we know there will be no speculation
19417    gadgets between each stub, and hence we only emit a speculation barrier at
19418    the end of the stub sequences.
19419 
19420    This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook.  */
19421 void
19422 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
19423 {
19424   if (! aarch64_harden_sls_blr_p ())
19425     return;
19426 
19427   bool any_functions_emitted = false;
19428   /* We must save and restore the current function section since this assembly
19429      is emitted at the end of the function.  This means it can be emitted *just
19430      after* the cold section of a function.  That cold part would be emitted in
19431      a different section.  That switch would trigger a `.cfi_endproc` directive
19432      to be emitted in the original section and a `.cfi_startproc` directive to
19433      be emitted in the new section.  Switching to the original section without
19434      restoring would mean that the `.cfi_endproc` emitted as a function ends
19435      would happen in a different section -- leaving an unmatched
19436      `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
19437      in the standard text section.  */
19438   section *save_text_section = in_section;
19439   switch_to_section (function_section (current_function_decl));
19440   for (int regnum = 0; regnum < 30; ++regnum)
19441     {
19442       rtx specu_label = cfun->machine->call_via[regnum];
19443       if (specu_label == NULL)
19444 	continue;
19445 
19446       targetm.asm_out.print_operand (out_file, specu_label, 0);
19447       asm_fprintf (out_file, ":\n");
19448       aarch64_sls_emit_function_stub (out_file, regnum);
19449       any_functions_emitted = true;
19450     }
19451   if (any_functions_emitted)
19452     /* We can use the SB instruction if need be here, since this stub will
19453        only be used by the current function, and hence for the current target.  */
19454     asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
19455   switch_to_section (save_text_section);
19456 }
19457 
19458 /* Emit shared BLR stubs for the current compilation unit.
19459    Over the course of compiling this unit we may have converted some BLR
19460    instructions to a BL to a shared stub function.  This is where we emit those
19461    stub functions.
19462    This function is for the stubs shared between different functions in this
19463    compilation unit.  We share when optimizing for size instead of speed.
19464 
19465    This function is called through the TARGET_ASM_FILE_END hook.  */
19466 void
19467 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
19468 {
19469   if (! aarch64_sls_shared_thunks_needed)
19470     return;
19471 
19472   for (int regnum = 0; regnum < 30; ++regnum)
19473     {
19474       tree decl = aarch64_sls_shared_thunks[regnum];
19475       if (!decl)
19476 	continue;
19477 
19478       const char *name = indirect_symbol_names[regnum];
19479       switch_to_section (get_named_section (decl, NULL, 0));
19480       ASM_OUTPUT_ALIGN (out_file, 2);
19481       targetm.asm_out.globalize_label (out_file, name);
19482       /* Only emits if the compiler is configured for an assembler that can
19483 	 handle visibility directives.  */
19484       targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
19485       ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
19486       ASM_OUTPUT_LABEL (out_file, name);
19487       aarch64_sls_emit_function_stub (out_file, regnum);
19488       /* Use the most conservative target to ensure it can always be used by any
19489 	 function in the translation unit.  */
19490       asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
19491       ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
19492     }
19493 }
19494 
19495 /* Implement TARGET_ASM_FILE_END.  */
19496 void
19497 aarch64_asm_file_end ()
19498 {
19499   aarch64_sls_emit_shared_blr_thunks (asm_out_file);
19500   /* Since this function will be called for the ASM_FILE_END hook, we ensure
19501      that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
19502      for FreeBSD) still gets called.  */
19503 #ifdef TARGET_ASM_FILE_END
19504   TARGET_ASM_FILE_END ();
19505 #endif
19506 }
19507 
19508 const char *
19509 aarch64_indirect_call_asm (rtx addr)
19510 {
19511   gcc_assert (REG_P (addr));
19512   if (aarch64_harden_sls_blr_p ())
19513     {
19514       rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
19515       output_asm_insn ("bl\t%0", &stub_label);
19516     }
19517   else
19518     output_asm_insn ("blr\t%0", &addr);
19519   return "";
19520 }
19521 
19522 /* Target-specific selftests.  */
19523 
19524 #if CHECKING_P
19525 
19526 namespace selftest {
19527 
19528 /* Selftest for the RTL loader.
19529    Verify that the RTL loader copes with a dump from
19530    print_rtx_function.  This is essentially just a test that class
19531    function_reader can handle a real dump, but it also verifies
19532    that lookup_reg_by_dump_name correctly handles hard regs.
19533    The presence of hard reg names in the dump means that the test is
19534    target-specific, hence it is in this file.  */
19535 
19536 static void
19537 aarch64_test_loading_full_dump ()
19538 {
19539   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19540 
19541   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19542 
19543   rtx_insn *insn_1 = get_insn_by_uid (1);
19544   ASSERT_EQ (NOTE, GET_CODE (insn_1));
19545 
19546   rtx_insn *insn_15 = get_insn_by_uid (15);
19547   ASSERT_EQ (INSN, GET_CODE (insn_15));
19548   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19549 
19550   /* Verify crtl->return_rtx.  */
19551   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19552   ASSERT_EQ (0, REGNO (crtl->return_rtx));
19553   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19554 }
19555 
19556 /* Run all target-specific selftests.  */
19557 
19558 static void
19559 aarch64_run_selftests (void)
19560 {
19561   aarch64_test_loading_full_dump ();
19562 }
19563 
19564 } // namespace selftest
19565 
19566 #endif /* #if CHECKING_P */
19567 
19568 #undef TARGET_STACK_PROTECT_GUARD
19569 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19570 
19571 #undef TARGET_ADDRESS_COST
19572 #define TARGET_ADDRESS_COST aarch64_address_cost
19573 
19574 /* This hook determines whether unnamed bitfields affect the alignment
19575    of the containing structure.  The hook returns true if the structure
19576    should inherit the alignment requirements of an unnamed bitfield's
19577    type.  */
19578 #undef TARGET_ALIGN_ANON_BITFIELD
19579 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19580 
19581 #undef TARGET_ASM_ALIGNED_DI_OP
19582 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19583 
19584 #undef TARGET_ASM_ALIGNED_HI_OP
19585 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19586 
19587 #undef TARGET_ASM_ALIGNED_SI_OP
19588 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19589 
19590 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19591 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19592   hook_bool_const_tree_hwi_hwi_const_tree_true
19593 
19594 #undef TARGET_ASM_FILE_START
19595 #define TARGET_ASM_FILE_START aarch64_start_file
19596 
19597 #undef TARGET_ASM_OUTPUT_MI_THUNK
19598 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19599 
19600 #undef TARGET_ASM_SELECT_RTX_SECTION
19601 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19602 
19603 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19604 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19605 
19606 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
19607 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
19608 
19609 #undef TARGET_BUILD_BUILTIN_VA_LIST
19610 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19611 
19612 #undef TARGET_CALLEE_COPIES
19613 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19614 
19615 #undef TARGET_CAN_ELIMINATE
19616 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19617 
19618 #undef TARGET_CAN_INLINE_P
19619 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19620 
19621 #undef TARGET_CANNOT_FORCE_CONST_MEM
19622 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19623 
19624 #undef TARGET_CASE_VALUES_THRESHOLD
19625 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19626 
19627 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19628 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19629 
19630 /* Only the least significant bit is used for initialization guard
19631    variables.  */
19632 #undef TARGET_CXX_GUARD_MASK_BIT
19633 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19634 
19635 #undef TARGET_C_MODE_FOR_SUFFIX
19636 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19637 
19638 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19639 #undef  TARGET_DEFAULT_TARGET_FLAGS
19640 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19641 #endif
19642 
19643 #undef TARGET_CLASS_MAX_NREGS
19644 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19645 
19646 #undef TARGET_BUILTIN_DECL
19647 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19648 
19649 #undef TARGET_BUILTIN_RECIPROCAL
19650 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19651 
19652 #undef TARGET_C_EXCESS_PRECISION
19653 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19654 
19655 #undef  TARGET_EXPAND_BUILTIN
19656 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19657 
19658 #undef TARGET_EXPAND_BUILTIN_VA_START
19659 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19660 
19661 #undef TARGET_FOLD_BUILTIN
19662 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19663 
19664 #undef TARGET_FUNCTION_ARG
19665 #define TARGET_FUNCTION_ARG aarch64_function_arg
19666 
19667 #undef TARGET_FUNCTION_ARG_ADVANCE
19668 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19669 
19670 #undef TARGET_FUNCTION_ARG_BOUNDARY
19671 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19672 
19673 #undef TARGET_FUNCTION_ARG_PADDING
19674 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19675 
19676 #undef TARGET_GET_RAW_RESULT_MODE
19677 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19678 #undef TARGET_GET_RAW_ARG_MODE
19679 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19680 
19681 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19682 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19683 
19684 #undef TARGET_FUNCTION_VALUE
19685 #define TARGET_FUNCTION_VALUE aarch64_function_value
19686 
19687 #undef TARGET_FUNCTION_VALUE_REGNO_P
19688 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19689 
19690 #undef TARGET_GIMPLE_FOLD_BUILTIN
19691 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19692 
19693 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19694 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19695 
19696 #undef  TARGET_INIT_BUILTINS
19697 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
19698 
19699 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19700 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19701   aarch64_ira_change_pseudo_allocno_class
19702 
19703 #undef TARGET_LEGITIMATE_ADDRESS_P
19704 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19705 
19706 #undef TARGET_LEGITIMATE_CONSTANT_P
19707 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19708 
19709 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19710 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19711   aarch64_legitimize_address_displacement
19712 
19713 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19714 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19715 
19716 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19717 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19718 aarch64_libgcc_floating_mode_supported_p
19719 
19720 #undef TARGET_MANGLE_TYPE
19721 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19722 
19723 #undef TARGET_MEMORY_MOVE_COST
19724 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19725 
19726 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19727 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19728 
19729 #undef TARGET_MUST_PASS_IN_STACK
19730 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19731 
19732 /* This target hook should return true if accesses to volatile bitfields
19733    should use the narrowest mode possible.  It should return false if these
19734    accesses should use the bitfield container type.  */
19735 #undef TARGET_NARROW_VOLATILE_BITFIELD
19736 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19737 
19738 #undef  TARGET_OPTION_OVERRIDE
19739 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19740 
19741 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19742 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19743   aarch64_override_options_after_change
19744 
19745 #undef TARGET_OPTION_SAVE
19746 #define TARGET_OPTION_SAVE aarch64_option_save
19747 
19748 #undef TARGET_OPTION_RESTORE
19749 #define TARGET_OPTION_RESTORE aarch64_option_restore
19750 
19751 #undef TARGET_OPTION_PRINT
19752 #define TARGET_OPTION_PRINT aarch64_option_print
19753 
19754 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19755 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19756 
19757 #undef TARGET_SET_CURRENT_FUNCTION
19758 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19759 
19760 #undef TARGET_PASS_BY_REFERENCE
19761 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19762 
19763 #undef TARGET_PREFERRED_RELOAD_CLASS
19764 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19765 
19766 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19767 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19768 
19769 #undef TARGET_PROMOTED_TYPE
19770 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19771 
19772 #undef TARGET_SECONDARY_RELOAD
19773 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19774 
19775 #undef TARGET_SHIFT_TRUNCATION_MASK
19776 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19777 
19778 #undef TARGET_SETUP_INCOMING_VARARGS
19779 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19780 
19781 #undef TARGET_STRUCT_VALUE_RTX
19782 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
19783 
19784 #undef TARGET_REGISTER_MOVE_COST
19785 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19786 
19787 #undef TARGET_RETURN_IN_MEMORY
19788 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19789 
19790 #undef TARGET_RETURN_IN_MSB
19791 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19792 
19793 #undef TARGET_RTX_COSTS
19794 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19795 
19796 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19797 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19798 
19799 #undef TARGET_SCHED_ISSUE_RATE
19800 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19801 
19802 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19803 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19804   aarch64_sched_first_cycle_multipass_dfa_lookahead
19805 
19806 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19807 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19808   aarch64_first_cycle_multipass_dfa_lookahead_guard
19809 
19810 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19811 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19812   aarch64_get_separate_components
19813 
19814 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19815 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19816   aarch64_components_for_bb
19817 
19818 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19819 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19820   aarch64_disqualify_components
19821 
19822 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19823 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19824   aarch64_emit_prologue_components
19825 
19826 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19827 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19828   aarch64_emit_epilogue_components
19829 
19830 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19831 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19832   aarch64_set_handled_components
19833 
19834 #undef TARGET_TRAMPOLINE_INIT
19835 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19836 
19837 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19838 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19839 
19840 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19841 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19842 
19843 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19844 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19845   aarch64_builtin_support_vector_misalignment
19846 
19847 #undef TARGET_ARRAY_MODE
19848 #define TARGET_ARRAY_MODE aarch64_array_mode
19849 
19850 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19851 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19852 
19853 #undef TARGET_VECTORIZE_ADD_STMT_COST
19854 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19855 
19856 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19857 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19858   aarch64_builtin_vectorization_cost
19859 
19860 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19861 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19862 
19863 #undef TARGET_VECTORIZE_BUILTINS
19864 #define TARGET_VECTORIZE_BUILTINS
19865 
19866 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19867 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19868   aarch64_builtin_vectorized_function
19869 
19870 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19871 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19872   aarch64_autovectorize_vector_sizes
19873 
19874 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19875 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19876   aarch64_atomic_assign_expand_fenv
19877 
19878 /* Section anchor support.  */
19879 
19880 #undef TARGET_MIN_ANCHOR_OFFSET
19881 #define TARGET_MIN_ANCHOR_OFFSET -256
19882 
19883 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19884    byte offset; we can do much more for larger data types, but have no way
19885    to determine the size of the access.  We assume accesses are aligned.  */
19886 #undef TARGET_MAX_ANCHOR_OFFSET
19887 #define TARGET_MAX_ANCHOR_OFFSET 4095
19888 
19889 #undef TARGET_VECTOR_ALIGNMENT
19890 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19891 
19892 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19893 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19894   aarch64_vectorize_preferred_vector_alignment
19895 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19896 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19897   aarch64_simd_vector_alignment_reachable
19898 
19899 /* vec_perm support.  */
19900 
19901 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19902 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19903   aarch64_vectorize_vec_perm_const
19904 
19905 #undef TARGET_VECTORIZE_GET_MASK_MODE
19906 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19907 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19908 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19909   aarch64_empty_mask_is_expensive
19910 #undef TARGET_PREFERRED_ELSE_VALUE
19911 #define TARGET_PREFERRED_ELSE_VALUE \
19912   aarch64_preferred_else_value
19913 
19914 #undef TARGET_INIT_LIBFUNCS
19915 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19916 
19917 #undef TARGET_FIXED_CONDITION_CODE_REGS
19918 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19919 
19920 #undef TARGET_FLAGS_REGNUM
19921 #define TARGET_FLAGS_REGNUM CC_REGNUM
19922 
19923 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19924 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19925 
19926 #undef TARGET_ASAN_SHADOW_OFFSET
19927 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19928 
19929 #undef TARGET_LEGITIMIZE_ADDRESS
19930 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19931 
19932 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19933 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19934 
19935 #undef TARGET_CAN_USE_DOLOOP_P
19936 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19937 
19938 #undef TARGET_SCHED_ADJUST_PRIORITY
19939 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19940 
19941 #undef TARGET_SCHED_MACRO_FUSION_P
19942 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19943 
19944 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19945 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19946 
19947 #undef TARGET_SCHED_FUSION_PRIORITY
19948 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19949 
19950 #undef TARGET_UNSPEC_MAY_TRAP_P
19951 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19952 
19953 #undef TARGET_USE_PSEUDO_PIC_REG
19954 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19955 
19956 #undef TARGET_PRINT_OPERAND
19957 #define TARGET_PRINT_OPERAND aarch64_print_operand
19958 
19959 #undef TARGET_PRINT_OPERAND_ADDRESS
19960 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19961 
19962 #undef TARGET_OPTAB_SUPPORTED_P
19963 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19964 
19965 #undef TARGET_OMIT_STRUCT_RETURN_REG
19966 #define TARGET_OMIT_STRUCT_RETURN_REG true
19967 
19968 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19969 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19970   aarch64_dwarf_poly_indeterminate_value
19971 
19972 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
19973 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19974 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19975 
19976 #undef TARGET_HARD_REGNO_NREGS
19977 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19978 #undef TARGET_HARD_REGNO_MODE_OK
19979 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19980 
19981 #undef TARGET_MODES_TIEABLE_P
19982 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19983 
19984 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19985 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19986   aarch64_hard_regno_call_part_clobbered
19987 
19988 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19989 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19990   aarch64_remove_extra_call_preserved_regs
19991 
19992 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19993 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19994   aarch64_return_call_with_max_clobbers
19995 
19996 #undef TARGET_CONSTANT_ALIGNMENT
19997 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19998 
19999 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20000 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20001   aarch64_stack_clash_protection_alloca_probe_range
20002 
20003 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20004 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20005 
20006 #undef TARGET_CAN_CHANGE_MODE_CLASS
20007 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20008 
20009 #undef TARGET_SELECT_EARLY_REMAT_MODES
20010 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20011 
20012 #undef TARGET_SPECULATION_SAFE_VALUE
20013 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20014 
20015 #undef TARGET_ESTIMATED_POLY_VALUE
20016 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20017 
20018 #undef TARGET_ATTRIBUTE_TABLE
20019 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20020 
20021 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20022 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20023   aarch64_simd_clone_compute_vecsize_and_simdlen
20024 
20025 #undef TARGET_SIMD_CLONE_ADJUST
20026 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20027 
20028 #undef TARGET_SIMD_CLONE_USABLE
20029 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20030 
20031 #undef TARGET_COMP_TYPE_ATTRIBUTES
20032 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20033 
20034 #undef TARGET_GET_MULTILIB_ABI_NAME
20035 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20036 
20037 #if CHECKING_P
20038 #undef TARGET_RUN_TARGET_SELFTESTS
20039 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20040 #endif /* #if CHECKING_P */
20041 
20042 #undef TARGET_ASM_FILE_END
20043 #define TARGET_ASM_FILE_END aarch64_asm_file_end
20044 
20045 #undef TARGET_ASM_FUNCTION_EPILOGUE
20046 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
20047 
20048 struct gcc_target targetm = TARGET_INITIALIZER;
20049 
20050 #include "gt-aarch64.h"
20051