1 /* Machine description for AArch64 architecture.
2    Copyright (C) 2009-2018 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 
75 /* This file should be included last.  */
76 #include "target-def.h"
77 
78 /* Defined for convenience.  */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80 
81 /* Classifies an address.
82 
83    ADDRESS_REG_IMM
84        A simple base register plus immediate offset.
85 
86    ADDRESS_REG_WB
87        A base register indexed by immediate offset with writeback.
88 
89    ADDRESS_REG_REG
90        A base register indexed by (optionally scaled) register.
91 
92    ADDRESS_REG_UXTW
93        A base register indexed by (optionally scaled) zero-extended register.
94 
95    ADDRESS_REG_SXTW
96        A base register indexed by (optionally scaled) sign-extended register.
97 
98    ADDRESS_LO_SUM
99        A LO_SUM rtx with a base register and "LO12" symbol relocation.
100 
101    ADDRESS_SYMBOLIC
102        A constant symbolic address, in the pc-relative literal pool.  */
103 
104 enum aarch64_address_type {
105   ADDRESS_REG_IMM,
106   ADDRESS_REG_WB,
107   ADDRESS_REG_REG,
108   ADDRESS_REG_UXTW,
109   ADDRESS_REG_SXTW,
110   ADDRESS_LO_SUM,
111   ADDRESS_SYMBOLIC
112 };
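/* Illustrative assembly forms for the classifications above (these examples
   are not part of the original source; they assume a 64-bit load with an
   X-register base):

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!    or    ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, .Lpool_entry  */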
113 
114 struct aarch64_address_info {
115   enum aarch64_address_type type;
116   rtx base;
117   rtx offset;
118   poly_int64 const_offset;
119   int shift;
120   enum aarch64_symbol_type symbol_type;
121 };
122 
123 /* Information about a legitimate vector immediate operand.  */
124 struct simd_immediate_info
125 {
126   enum insn_type { MOV, MVN };
127   enum modifier_type { LSL, MSL };
128 
129   simd_immediate_info () {}
130   simd_immediate_info (scalar_float_mode, rtx);
131   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 		       insn_type = MOV, modifier_type = LSL,
133 		       unsigned int = 0);
134   simd_immediate_info (scalar_mode, rtx, rtx);
135 
136   /* The mode of the elements.  */
137   scalar_mode elt_mode;
138 
139   /* The value of each element if all elements are the same, or the
140      first value if the constant is a series.  */
141   rtx value;
142 
143   /* The value of the step if the constant is a series, null otherwise.  */
144   rtx step;
145 
146   /* The instruction to use to move the immediate into a vector.  */
147   insn_type insn;
148 
149   /* The kind of shift modifier to use, and the number of bits to shift.
150      This is (LSL, 0) if no shift is needed.  */
151   modifier_type modifier;
152   unsigned int shift;
153 };
154 
155 /* Construct a floating-point immediate in which each element has mode
156    ELT_MODE_IN and value VALUE_IN.  */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159   : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160     modifier (LSL), shift (0)
161 {}
162 
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164    and value VALUE_IN.  The other parameters are as for the structure
165    fields.  */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 		       unsigned HOST_WIDE_INT value_in,
169 		       insn_type insn_in, modifier_type modifier_in,
170 		       unsigned int shift_in)
171   : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172     step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174 
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176    and where element I is equal to VALUE_IN + I * STEP_IN.  */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179   : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180     modifier (LSL), shift (0)
181 {}
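/* Purely illustrative examples (not taken from the original source) of how
   these constructors describe constants: a V16QImode vector whose bytes are
   all 0x20 would be described as (elt_mode=QImode, value=0x20, insn=MOV,
   modifier=LSL, shift=0), while an SVE series such as { 0, 2, 4, ... } would
   use the third constructor with value=0 and step=2.  */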
182 
183 /* The current code model.  */
184 enum aarch64_code_model aarch64_cmodel;
185 
186 /* The number of 64-bit elements in an SVE vector.  */
187 poly_uint16 aarch64_sve_vg;
188 
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193 
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 						     const_tree,
197 						     machine_mode *, int *,
198 						     bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 							 const_tree type,
206 							 int misalignment,
207 							 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
210 
211 /* Major revision number of the ARM Architecture implemented by the target.  */
212 unsigned aarch64_architecture_version;
213 
214 /* The processor for which instructions should be scheduled.  */
215 enum aarch64_processor aarch64_tune = cortexa53;
216 
217 /* Mask to specify which instruction scheduling options should be used.  */
218 unsigned long aarch64_tune_flags = 0;
219 
220 /* Global flag for PC relative loads.  */
221 bool aarch64_pcrelative_literal_loads;
222 
223 /* Support for command line parsing of boolean flags in the tuning
224    structures.  */
225 struct aarch64_flag_desc
226 {
227   const char* name;
228   unsigned int flag;
229 };
230 
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232   { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
234 {
235   { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237   { "all", AARCH64_FUSE_ALL },
238   { NULL, AARCH64_FUSE_NOTHING }
239 };
240 
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242   { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
244 {
245   { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247   { "all", AARCH64_EXTRA_TUNE_ALL },
248   { NULL, AARCH64_EXTRA_TUNE_NONE }
249 };
250 
251 /* Tuning parameters.  */
252 
253 static const struct cpu_addrcost_table generic_addrcost_table =
254 {
255     {
256       1, /* hi  */
257       0, /* si  */
258       0, /* di  */
259       1, /* ti  */
260     },
261   0, /* pre_modify  */
262   0, /* post_modify  */
263   0, /* register_offset  */
264   0, /* register_sextend  */
265   0, /* register_zextend  */
266   0 /* imm_offset  */
267 };
268 
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
270 {
271     {
272       0, /* hi  */
273       0, /* si  */
274       0, /* di  */
275       2, /* ti  */
276     },
277   0, /* pre_modify  */
278   0, /* post_modify  */
279   1, /* register_offset  */
280   1, /* register_sextend  */
281   2, /* register_zextend  */
282   0, /* imm_offset  */
283 };
284 
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
286 {
287     {
288       1, /* hi  */
289       0, /* si  */
290       0, /* di  */
291       1, /* ti  */
292     },
293   1, /* pre_modify  */
294   0, /* post_modify  */
295   0, /* register_offset  */
296   1, /* register_sextend  */
297   1, /* register_zextend  */
298   0, /* imm_offset  */
299 };
300 
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
302 {
303     {
304       1, /* hi  */
305       1, /* si  */
306       1, /* di  */
307       2, /* ti  */
308     },
309   0, /* pre_modify  */
310   0, /* post_modify  */
311   2, /* register_offset  */
312   3, /* register_sextend  */
313   3, /* register_zextend  */
314   0, /* imm_offset  */
315 };
316 
317 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
318 {
319     {
320       1, /* hi  */
321       1, /* si  */
322       1, /* di  */
323       2, /* ti  */
324     },
325   1, /* pre_modify  */
326   1, /* post_modify  */
327   3, /* register_offset  */
328   3, /* register_sextend  */
329   3, /* register_zextend  */
330   2, /* imm_offset  */
331 };
332 
333 static const struct cpu_regmove_cost generic_regmove_cost =
334 {
335   1, /* GP2GP  */
336   /* Avoid the use of slow int<->fp moves for spilling by setting
337      their cost higher than memmov_cost.  */
338   5, /* GP2FP  */
339   5, /* FP2GP  */
340   2 /* FP2FP  */
341 };
342 
343 static const struct cpu_regmove_cost cortexa57_regmove_cost =
344 {
345   1, /* GP2GP  */
346   /* Avoid the use of slow int<->fp moves for spilling by setting
347      their cost higher than memmov_cost.  */
348   5, /* GP2FP  */
349   5, /* FP2GP  */
350   2 /* FP2FP  */
351 };
352 
353 static const struct cpu_regmove_cost cortexa53_regmove_cost =
354 {
355   1, /* GP2GP  */
356   /* Avoid the use of slow int<->fp moves for spilling by setting
357      their cost higher than memmov_cost.  */
358   5, /* GP2FP  */
359   5, /* FP2GP  */
360   2 /* FP2FP  */
361 };
362 
363 static const struct cpu_regmove_cost exynosm1_regmove_cost =
364 {
365   1, /* GP2GP  */
366   /* Avoid the use of slow int<->fp moves for spilling by setting
367      their cost higher than memmov_cost (actual, 4 and 9).  */
368   9, /* GP2FP  */
369   9, /* FP2GP  */
370   1 /* FP2FP  */
371 };
372 
373 static const struct cpu_regmove_cost thunderx_regmove_cost =
374 {
375   2, /* GP2GP  */
376   2, /* GP2FP  */
377   6, /* FP2GP  */
378   4 /* FP2FP  */
379 };
380 
381 static const struct cpu_regmove_cost xgene1_regmove_cost =
382 {
383   1, /* GP2GP  */
384   /* Avoid the use of slow int<->fp moves for spilling by setting
385      their cost higher than memmov_cost.  */
386   8, /* GP2FP  */
387   8, /* FP2GP  */
388   2 /* FP2FP  */
389 };
390 
391 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
392 {
393   2, /* GP2GP  */
394   /* Avoid the use of int<->fp moves for spilling.  */
395   6, /* GP2FP  */
396   6, /* FP2GP  */
397   4 /* FP2FP  */
398 };
399 
400 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
401 {
402   1, /* GP2GP  */
403   /* Avoid the use of int<->fp moves for spilling.  */
404   8, /* GP2FP  */
405   8, /* FP2GP  */
406   4  /* FP2FP  */
407 };
408 
409 /* Generic costs for vector insn classes.  */
410 static const struct cpu_vector_cost generic_vector_cost =
411 {
412   1, /* scalar_int_stmt_cost  */
413   1, /* scalar_fp_stmt_cost  */
414   1, /* scalar_load_cost  */
415   1, /* scalar_store_cost  */
416   1, /* vec_int_stmt_cost  */
417   1, /* vec_fp_stmt_cost  */
418   2, /* vec_permute_cost  */
419   1, /* vec_to_scalar_cost  */
420   1, /* scalar_to_vec_cost  */
421   1, /* vec_align_load_cost  */
422   1, /* vec_unalign_load_cost  */
423   1, /* vec_unalign_store_cost  */
424   1, /* vec_store_cost  */
425   3, /* cond_taken_branch_cost  */
426   1 /* cond_not_taken_branch_cost  */
427 };
428 
429 /* QDF24XX costs for vector insn classes.  */
430 static const struct cpu_vector_cost qdf24xx_vector_cost =
431 {
432   1, /* scalar_int_stmt_cost  */
433   1, /* scalar_fp_stmt_cost  */
434   1, /* scalar_load_cost  */
435   1, /* scalar_store_cost  */
436   1, /* vec_int_stmt_cost  */
437   3, /* vec_fp_stmt_cost  */
438   2, /* vec_permute_cost  */
439   1, /* vec_to_scalar_cost  */
440   1, /* scalar_to_vec_cost  */
441   1, /* vec_align_load_cost  */
442   1, /* vec_unalign_load_cost  */
443   1, /* vec_unalign_store_cost  */
444   1, /* vec_store_cost  */
445   3, /* cond_taken_branch_cost  */
446   1 /* cond_not_taken_branch_cost  */
447 };
448 
449 /* ThunderX costs for vector insn classes.  */
450 static const struct cpu_vector_cost thunderx_vector_cost =
451 {
452   1, /* scalar_int_stmt_cost  */
453   1, /* scalar_fp_stmt_cost  */
454   3, /* scalar_load_cost  */
455   1, /* scalar_store_cost  */
456   4, /* vec_int_stmt_cost  */
457   1, /* vec_fp_stmt_cost  */
458   4, /* vec_permute_cost  */
459   2, /* vec_to_scalar_cost  */
460   2, /* scalar_to_vec_cost  */
461   3, /* vec_align_load_cost  */
462   5, /* vec_unalign_load_cost  */
463   5, /* vec_unalign_store_cost  */
464   1, /* vec_store_cost  */
465   3, /* cond_taken_branch_cost  */
466   3 /* cond_not_taken_branch_cost  */
467 };
468 
469 /* Cortex-A57 costs for vector insn classes.  */
470 static const struct cpu_vector_cost cortexa57_vector_cost =
471 {
472   1, /* scalar_int_stmt_cost  */
473   1, /* scalar_fp_stmt_cost  */
474   4, /* scalar_load_cost  */
475   1, /* scalar_store_cost  */
476   2, /* vec_int_stmt_cost  */
477   2, /* vec_fp_stmt_cost  */
478   3, /* vec_permute_cost  */
479   8, /* vec_to_scalar_cost  */
480   8, /* scalar_to_vec_cost  */
481   4, /* vec_align_load_cost  */
482   4, /* vec_unalign_load_cost  */
483   1, /* vec_unalign_store_cost  */
484   1, /* vec_store_cost  */
485   1, /* cond_taken_branch_cost  */
486   1 /* cond_not_taken_branch_cost  */
487 };
488 
489 static const struct cpu_vector_cost exynosm1_vector_cost =
490 {
491   1, /* scalar_int_stmt_cost  */
492   1, /* scalar_fp_stmt_cost  */
493   5, /* scalar_load_cost  */
494   1, /* scalar_store_cost  */
495   3, /* vec_int_stmt_cost  */
496   3, /* vec_fp_stmt_cost  */
497   3, /* vec_permute_cost  */
498   3, /* vec_to_scalar_cost  */
499   3, /* scalar_to_vec_cost  */
500   5, /* vec_align_load_cost  */
501   5, /* vec_unalign_load_cost  */
502   1, /* vec_unalign_store_cost  */
503   1, /* vec_store_cost  */
504   1, /* cond_taken_branch_cost  */
505   1 /* cond_not_taken_branch_cost  */
506 };
507 
508 /* X-Gene 1 costs for vector insn classes.  */
509 static const struct cpu_vector_cost xgene1_vector_cost =
510 {
511   1, /* scalar_int_stmt_cost  */
512   1, /* scalar_fp_stmt_cost  */
513   5, /* scalar_load_cost  */
514   1, /* scalar_store_cost  */
515   2, /* vec_int_stmt_cost  */
516   2, /* vec_fp_stmt_cost  */
517   2, /* vec_permute_cost  */
518   4, /* vec_to_scalar_cost  */
519   4, /* scalar_to_vec_cost  */
520   10, /* vec_align_load_cost  */
521   10, /* vec_unalign_load_cost  */
522   2, /* vec_unalign_store_cost  */
523   2, /* vec_store_cost  */
524   2, /* cond_taken_branch_cost  */
525   1 /* cond_not_taken_branch_cost  */
526 };
527 
528 /* Costs for vector insn classes for Vulcan.  */
529 static const struct cpu_vector_cost thunderx2t99_vector_cost =
530 {
531   1, /* scalar_int_stmt_cost  */
532   6, /* scalar_fp_stmt_cost  */
533   4, /* scalar_load_cost  */
534   1, /* scalar_store_cost  */
535   5, /* vec_int_stmt_cost  */
536   6, /* vec_fp_stmt_cost  */
537   3, /* vec_permute_cost  */
538   6, /* vec_to_scalar_cost  */
539   5, /* scalar_to_vec_cost  */
540   8, /* vec_align_load_cost  */
541   8, /* vec_unalign_load_cost  */
542   4, /* vec_unalign_store_cost  */
543   4, /* vec_store_cost  */
544   2, /* cond_taken_branch_cost  */
545   1  /* cond_not_taken_branch_cost  */
546 };
547 
548 /* Generic costs for branch instructions.  */
549 static const struct cpu_branch_cost generic_branch_cost =
550 {
551   1,  /* Predictable.  */
552   3   /* Unpredictable.  */
553 };
554 
555 /* Generic approximation modes.  */
556 static const cpu_approx_modes generic_approx_modes =
557 {
558   AARCH64_APPROX_NONE,	/* division  */
559   AARCH64_APPROX_NONE,	/* sqrt  */
560   AARCH64_APPROX_NONE	/* recip_sqrt  */
561 };
562 
563 /* Approximation modes for Exynos M1.  */
564 static const cpu_approx_modes exynosm1_approx_modes =
565 {
566   AARCH64_APPROX_NONE,	/* division  */
567   AARCH64_APPROX_ALL,	/* sqrt  */
568   AARCH64_APPROX_ALL	/* recip_sqrt  */
569 };
570 
571 /* Approximation modes for X-Gene 1.  */
572 static const cpu_approx_modes xgene1_approx_modes =
573 {
574   AARCH64_APPROX_NONE,	/* division  */
575   AARCH64_APPROX_NONE,	/* sqrt  */
576   AARCH64_APPROX_ALL	/* recip_sqrt  */
577 };
578 
579 /* Generic prefetch settings (which disable prefetch).  */
580 static const cpu_prefetch_tune generic_prefetch_tune =
581 {
582   0,			/* num_slots  */
583   -1,			/* l1_cache_size  */
584   -1,			/* l1_cache_line_size  */
585   -1,			/* l2_cache_size  */
586   -1			/* default_opt_level  */
587 };
588 
589 static const cpu_prefetch_tune exynosm1_prefetch_tune =
590 {
591   0,			/* num_slots  */
592   -1,			/* l1_cache_size  */
593   64,			/* l1_cache_line_size  */
594   -1,			/* l2_cache_size  */
595   -1			/* default_opt_level  */
596 };
597 
598 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
599 {
600   4,			/* num_slots  */
601   32,			/* l1_cache_size  */
602   64,			/* l1_cache_line_size  */
603   1024,			/* l2_cache_size  */
604   -1			/* default_opt_level  */
605 };
606 
607 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
608 {
609   8,			/* num_slots  */
610   32,			/* l1_cache_size  */
611   128,			/* l1_cache_line_size  */
612   16*1024,		/* l2_cache_size  */
613   3			/* default_opt_level  */
614 };
615 
616 static const cpu_prefetch_tune thunderx_prefetch_tune =
617 {
618   8,			/* num_slots  */
619   32,			/* l1_cache_size  */
620   128,			/* l1_cache_line_size  */
621   -1,			/* l2_cache_size  */
622   -1			/* default_opt_level  */
623 };
624 
625 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
626 {
627   8,			/* num_slots  */
628   32,			/* l1_cache_size  */
629   64,			/* l1_cache_line_size  */
630   256,			/* l2_cache_size  */
631   -1			/* default_opt_level  */
632 };
633 
634 static const struct tune_params generic_tunings =
635 {
636   &cortexa57_extra_costs,
637   &generic_addrcost_table,
638   &generic_regmove_cost,
639   &generic_vector_cost,
640   &generic_branch_cost,
641   &generic_approx_modes,
642   4, /* memmov_cost  */
643   2, /* issue_rate  */
644   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
645   8,	/* function_align.  */
646   4,	/* jump_align.  */
647   8,	/* loop_align.  */
648   2,	/* int_reassoc_width.  */
649   4,	/* fp_reassoc_width.  */
650   1,	/* vec_reassoc_width.  */
651   2,	/* min_div_recip_mul_sf.  */
652   2,	/* min_div_recip_mul_df.  */
653   0,	/* max_case_values.  */
654   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
655   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
656   &generic_prefetch_tune
657 };
658 
659 static const struct tune_params cortexa35_tunings =
660 {
661   &cortexa53_extra_costs,
662   &generic_addrcost_table,
663   &cortexa53_regmove_cost,
664   &generic_vector_cost,
665   &generic_branch_cost,
666   &generic_approx_modes,
667   4, /* memmov_cost  */
668   1, /* issue_rate  */
669   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
670    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
671   16,	/* function_align.  */
672   4,	/* jump_align.  */
673   8,	/* loop_align.  */
674   2,	/* int_reassoc_width.  */
675   4,	/* fp_reassoc_width.  */
676   1,	/* vec_reassoc_width.  */
677   2,	/* min_div_recip_mul_sf.  */
678   2,	/* min_div_recip_mul_df.  */
679   0,	/* max_case_values.  */
680   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
681   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
682   &generic_prefetch_tune
683 };
684 
685 static const struct tune_params cortexa53_tunings =
686 {
687   &cortexa53_extra_costs,
688   &generic_addrcost_table,
689   &cortexa53_regmove_cost,
690   &generic_vector_cost,
691   &generic_branch_cost,
692   &generic_approx_modes,
693   4, /* memmov_cost  */
694   2, /* issue_rate  */
695   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
696    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
697   16,	/* function_align.  */
698   4,	/* jump_align.  */
699   8,	/* loop_align.  */
700   2,	/* int_reassoc_width.  */
701   4,	/* fp_reassoc_width.  */
702   1,	/* vec_reassoc_width.  */
703   2,	/* min_div_recip_mul_sf.  */
704   2,	/* min_div_recip_mul_df.  */
705   0,	/* max_case_values.  */
706   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
707   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
708   &generic_prefetch_tune
709 };
710 
711 static const struct tune_params cortexa57_tunings =
712 {
713   &cortexa57_extra_costs,
714   &generic_addrcost_table,
715   &cortexa57_regmove_cost,
716   &cortexa57_vector_cost,
717   &generic_branch_cost,
718   &generic_approx_modes,
719   4, /* memmov_cost  */
720   3, /* issue_rate  */
721   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
723   16,	/* function_align.  */
724   4,	/* jump_align.  */
725   8,	/* loop_align.  */
726   2,	/* int_reassoc_width.  */
727   4,	/* fp_reassoc_width.  */
728   1,	/* vec_reassoc_width.  */
729   2,	/* min_div_recip_mul_sf.  */
730   2,	/* min_div_recip_mul_df.  */
731   0,	/* max_case_values.  */
732   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
733   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
734   &generic_prefetch_tune
735 };
736 
737 static const struct tune_params cortexa72_tunings =
738 {
739   &cortexa57_extra_costs,
740   &generic_addrcost_table,
741   &cortexa57_regmove_cost,
742   &cortexa57_vector_cost,
743   &generic_branch_cost,
744   &generic_approx_modes,
745   4, /* memmov_cost  */
746   3, /* issue_rate  */
747   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
748    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
749   16,	/* function_align.  */
750   4,	/* jump_align.  */
751   8,	/* loop_align.  */
752   2,	/* int_reassoc_width.  */
753   4,	/* fp_reassoc_width.  */
754   1,	/* vec_reassoc_width.  */
755   2,	/* min_div_recip_mul_sf.  */
756   2,	/* min_div_recip_mul_df.  */
757   0,	/* max_case_values.  */
758   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
759   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
760   &generic_prefetch_tune
761 };
762 
763 static const struct tune_params cortexa73_tunings =
764 {
765   &cortexa57_extra_costs,
766   &generic_addrcost_table,
767   &cortexa57_regmove_cost,
768   &cortexa57_vector_cost,
769   &generic_branch_cost,
770   &generic_approx_modes,
771   4, /* memmov_cost.  */
772   2, /* issue_rate.  */
773   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
774    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
775   16,	/* function_align.  */
776   4,	/* jump_align.  */
777   8,	/* loop_align.  */
778   2,	/* int_reassoc_width.  */
779   4,	/* fp_reassoc_width.  */
780   1,	/* vec_reassoc_width.  */
781   2,	/* min_div_recip_mul_sf.  */
782   2,	/* min_div_recip_mul_df.  */
783   0,	/* max_case_values.  */
784   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
785   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
786   &generic_prefetch_tune
787 };
788 
789 
790 
791 static const struct tune_params exynosm1_tunings =
792 {
793   &exynosm1_extra_costs,
794   &exynosm1_addrcost_table,
795   &exynosm1_regmove_cost,
796   &exynosm1_vector_cost,
797   &generic_branch_cost,
798   &exynosm1_approx_modes,
799   4,	/* memmov_cost  */
800   3,	/* issue_rate  */
801   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
802   4,	/* function_align.  */
803   4,	/* jump_align.  */
804   4,	/* loop_align.  */
805   2,	/* int_reassoc_width.  */
806   4,	/* fp_reassoc_width.  */
807   1,	/* vec_reassoc_width.  */
808   2,	/* min_div_recip_mul_sf.  */
809   2,	/* min_div_recip_mul_df.  */
810   48,	/* max_case_values.  */
811   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
812   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
813   &exynosm1_prefetch_tune
814 };
815 
816 static const struct tune_params thunderxt88_tunings =
817 {
818   &thunderx_extra_costs,
819   &generic_addrcost_table,
820   &thunderx_regmove_cost,
821   &thunderx_vector_cost,
822   &generic_branch_cost,
823   &generic_approx_modes,
824   6, /* memmov_cost  */
825   2, /* issue_rate  */
826   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
827   8,	/* function_align.  */
828   8,	/* jump_align.  */
829   8,	/* loop_align.  */
830   2,	/* int_reassoc_width.  */
831   4,	/* fp_reassoc_width.  */
832   1,	/* vec_reassoc_width.  */
833   2,	/* min_div_recip_mul_sf.  */
834   2,	/* min_div_recip_mul_df.  */
835   0,	/* max_case_values.  */
836   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
837   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
838   &thunderxt88_prefetch_tune
839 };
840 
841 static const struct tune_params thunderx_tunings =
842 {
843   &thunderx_extra_costs,
844   &generic_addrcost_table,
845   &thunderx_regmove_cost,
846   &thunderx_vector_cost,
847   &generic_branch_cost,
848   &generic_approx_modes,
849   6, /* memmov_cost  */
850   2, /* issue_rate  */
851   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
852   8,	/* function_align.  */
853   8,	/* jump_align.  */
854   8,	/* loop_align.  */
855   2,	/* int_reassoc_width.  */
856   4,	/* fp_reassoc_width.  */
857   1,	/* vec_reassoc_width.  */
858   2,	/* min_div_recip_mul_sf.  */
859   2,	/* min_div_recip_mul_df.  */
860   0,	/* max_case_values.  */
861   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
862   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
863    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
864   &thunderx_prefetch_tune
865 };
866 
867 static const struct tune_params xgene1_tunings =
868 {
869   &xgene1_extra_costs,
870   &xgene1_addrcost_table,
871   &xgene1_regmove_cost,
872   &xgene1_vector_cost,
873   &generic_branch_cost,
874   &xgene1_approx_modes,
875   6, /* memmov_cost  */
876   4, /* issue_rate  */
877   AARCH64_FUSE_NOTHING, /* fusible_ops  */
878   16,	/* function_align.  */
879   8,	/* jump_align.  */
880   16,	/* loop_align.  */
881   2,	/* int_reassoc_width.  */
882   4,	/* fp_reassoc_width.  */
883   1,	/* vec_reassoc_width.  */
884   2,	/* min_div_recip_mul_sf.  */
885   2,	/* min_div_recip_mul_df.  */
886   0,	/* max_case_values.  */
887   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
888   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
889   &generic_prefetch_tune
890 };
891 
892 static const struct tune_params qdf24xx_tunings =
893 {
894   &qdf24xx_extra_costs,
895   &qdf24xx_addrcost_table,
896   &qdf24xx_regmove_cost,
897   &qdf24xx_vector_cost,
898   &generic_branch_cost,
899   &generic_approx_modes,
900   4, /* memmov_cost  */
901   4, /* issue_rate  */
902   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
903    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
904   16,	/* function_align.  */
905   8,	/* jump_align.  */
906   16,	/* loop_align.  */
907   2,	/* int_reassoc_width.  */
908   4,	/* fp_reassoc_width.  */
909   1,	/* vec_reassoc_width.  */
910   2,	/* min_div_recip_mul_sf.  */
911   2,	/* min_div_recip_mul_df.  */
912   0,	/* max_case_values.  */
913   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
914   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
915   &qdf24xx_prefetch_tune
916 };
917 
918 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
919    for now.  */
920 static const struct tune_params saphira_tunings =
921 {
922   &generic_extra_costs,
923   &generic_addrcost_table,
924   &generic_regmove_cost,
925   &generic_vector_cost,
926   &generic_branch_cost,
927   &generic_approx_modes,
928   4, /* memmov_cost  */
929   4, /* issue_rate  */
930   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
931    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
932   16,	/* function_align.  */
933   8,	/* jump_align.  */
934   16,	/* loop_align.  */
935   2,	/* int_reassoc_width.  */
936   4,	/* fp_reassoc_width.  */
937   1,	/* vec_reassoc_width.  */
938   2,	/* min_div_recip_mul_sf.  */
939   2,	/* min_div_recip_mul_df.  */
940   0,	/* max_case_values.  */
941   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
942   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
943   &generic_prefetch_tune
944 };
945 
946 static const struct tune_params thunderx2t99_tunings =
947 {
948   &thunderx2t99_extra_costs,
949   &thunderx2t99_addrcost_table,
950   &thunderx2t99_regmove_cost,
951   &thunderx2t99_vector_cost,
952   &generic_branch_cost,
953   &generic_approx_modes,
954   4, /* memmov_cost.  */
955   4, /* issue_rate.  */
956   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
957    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
958   16,	/* function_align.  */
959   8,	/* jump_align.  */
960   16,	/* loop_align.  */
961   3,	/* int_reassoc_width.  */
962   2,	/* fp_reassoc_width.  */
963   2,	/* vec_reassoc_width.  */
964   2,	/* min_div_recip_mul_sf.  */
965   2,	/* min_div_recip_mul_df.  */
966   0,	/* max_case_values.  */
967   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
968   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
969   &thunderx2t99_prefetch_tune
970 };
971 
972 /* Support for fine-grained override of the tuning structures.  */
973 struct aarch64_tuning_override_function
974 {
975   const char* name;
976   void (*parse_override)(const char*, struct tune_params*);
977 };
978 
979 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
980 static void aarch64_parse_tune_string (const char*, struct tune_params*);
981 
982 static const struct aarch64_tuning_override_function
983 aarch64_tuning_override_functions[] =
984 {
985   { "fuse", aarch64_parse_fuse_string },
986   { "tune", aarch64_parse_tune_string },
987   { NULL, NULL }
988 };
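/* For example (an illustrative note, not part of the original source): the
   "fuse=" and "tune=" components of an -moverride string are dispatched to
   the parsers above, and the names they accept are the ones listed in
   aarch64-fusion-pairs.def and aarch64-tuning-flags.def, plus the "none"
   and "all" entries added to the tables earlier in this file.  */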
989 
990 /* A processor implementing AArch64.  */
991 struct processor
992 {
993   const char *const name;
994   enum aarch64_processor ident;
995   enum aarch64_processor sched_core;
996   enum aarch64_arch arch;
997   unsigned architecture_version;
998   const unsigned long flags;
999   const struct tune_params *const tune;
1000 };
1001 
1002 /* Architectures implementing AArch64.  */
1003 static const struct processor all_architectures[] =
1004 {
1005 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1006   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1007 #include "aarch64-arches.def"
1008   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1009 };
1010 
1011 /* Processor cores implementing AArch64.  */
1012 static const struct processor all_cores[] =
1013 {
1014 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1015   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
1016   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
1017   FLAGS, &COSTS##_tunings},
1018 #include "aarch64-cores.def"
1019   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1020     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1021   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1022 };
1023 
1024 
1025 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1026    handling code or by target attributes.  */
1027 static const struct processor *selected_arch;
1028 static const struct processor *selected_cpu;
1029 static const struct processor *selected_tune;
1030 
1031 /* The current tuning set.  */
1032 struct tune_params aarch64_tune_params = generic_tunings;
1033 
1034 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1035 
1036 /* An ISA extension in the co-processor and main instruction set space.  */
1037 struct aarch64_option_extension
1038 {
1039   const char *const name;
1040   const unsigned long flags_on;
1041   const unsigned long flags_off;
1042 };
1043 
1044 typedef enum aarch64_cond_code
1045 {
1046   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1047   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1048   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1049 }
1050 aarch64_cc;
1051 
1052 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1053 
1054 /* The condition codes of the processor, and the inverse function.  */
1055 static const char * const aarch64_condition_codes[] =
1056 {
1057   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1058   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1059 };
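/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT: each condition
   and its inverse differ only in the least significant bit of the encoding,
   which is why a simple XOR with 1 suffices.  */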
1060 
1061 /* Generate code to enable conditional branches in functions whose size exceeds the 1 MiB conditional-branch range.  */
1062 const char *
1063 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1064 			const char * branch_format)
1065 {
1066     rtx_code_label * tmp_label = gen_label_rtx ();
1067     char label_buf[256];
1068     char buffer[128];
1069     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1070 				 CODE_LABEL_NUMBER (tmp_label));
1071     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1072     rtx dest_label = operands[pos_label];
1073     operands[pos_label] = tmp_label;
1074 
1075     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1076     output_asm_insn (buffer, operands);
1077 
1078     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1079     operands[pos_label] = dest_label;
1080     output_asm_insn (buffer, operands);
1081     return "";
1082 }
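/* Illustrative output (the label name here is made up, not from the original
   source): for a conditional branch whose target is out of range, the caller
   typically passes the inverted condition in BRANCH_FORMAT, so the emitted
   sequence looks like

	<inverted-branch>	.Ltmp
	b	<original-target>
   .Ltmp:

   i.e. a short branch around an unconditional branch that can reach the
   far target.  */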
1083 
1084 void
1085 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1086 {
1087   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1088   if (TARGET_GENERAL_REGS_ONLY)
1089     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1090   else
1091     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1092 }
1093 
1094 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1095    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1096    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
1097    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1098    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
1099    irrespective of its cost results in bad allocations with many redundant
1100    int<->FP moves which are expensive on various cores.
1101    To avoid this we don't allow ALL_REGS as the allocno class, but force a
1102    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
1103    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
1104    Otherwise set the allocno class depending on the mode.
1105    The result of this is that it is no longer inefficient to have a higher
1106    memory move cost than the register move cost.
1107 */
1108 
1109 static reg_class_t
1110 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1111 					 reg_class_t best_class)
1112 {
1113   machine_mode mode;
1114 
1115   if (allocno_class != ALL_REGS)
1116     return allocno_class;
1117 
1118   if (best_class != ALL_REGS)
1119     return best_class;
1120 
1121   mode = PSEUDO_REGNO_MODE (regno);
1122   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1123 }
1124 
1125 static unsigned int
1126 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1127 {
1128   if (GET_MODE_UNIT_SIZE (mode) == 4)
1129     return aarch64_tune_params.min_div_recip_mul_sf;
1130   return aarch64_tune_params.min_div_recip_mul_df;
1131 }
1132 
1133 /* Return the reassociation width of treeop OPC with mode MODE.  */
1134 static int
1135 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1136 {
1137   if (VECTOR_MODE_P (mode))
1138     return aarch64_tune_params.vec_reassoc_width;
1139   if (INTEGRAL_MODE_P (mode))
1140     return aarch64_tune_params.int_reassoc_width;
1141   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1142   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1143     return aarch64_tune_params.fp_reassoc_width;
1144   return 1;
1145 }
1146 
1147 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1148 unsigned
1149 aarch64_dbx_register_number (unsigned regno)
1150 {
1151    if (GP_REGNUM_P (regno))
1152      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1153    else if (regno == SP_REGNUM)
1154      return AARCH64_DWARF_SP;
1155    else if (FP_REGNUM_P (regno))
1156      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1157    else if (PR_REGNUM_P (regno))
1158      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1159    else if (regno == VG_REGNUM)
1160      return AARCH64_DWARF_VG;
1161 
1162    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1163       equivalent DWARF register.  */
1164    return DWARF_FRAME_REGISTERS;
1165 }
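/* Illustrative values, assuming the standard AArch64 DWARF register
   numbering (an assumption, not restated from the original source):
   x0-x30 map to 0-30, sp to 31, the SVE vector-granule register VG to 46,
   p0-p15 to 48-63 and v0-v31 to 64-95.  */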
1166 
1167 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1168 static bool
1169 aarch64_advsimd_struct_mode_p (machine_mode mode)
1170 {
1171   return (TARGET_SIMD
1172 	  && (mode == OImode || mode == CImode || mode == XImode));
1173 }
1174 
1175 /* Return true if MODE is an SVE predicate mode.  */
1176 static bool
1177 aarch64_sve_pred_mode_p (machine_mode mode)
1178 {
1179   return (TARGET_SVE
1180 	  && (mode == VNx16BImode
1181 	      || mode == VNx8BImode
1182 	      || mode == VNx4BImode
1183 	      || mode == VNx2BImode));
1184 }
1185 
1186 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1187 const unsigned int VEC_ADVSIMD  = 1;
1188 const unsigned int VEC_SVE_DATA = 2;
1189 const unsigned int VEC_SVE_PRED = 4;
1190 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1191    a structure of 2, 3 or 4 vectors.  */
1192 const unsigned int VEC_STRUCT   = 8;
1193 /* Useful combinations of the above.  */
1194 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1195 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1196 
1197 /* Return a set of flags describing the vector properties of mode MODE.
1198    Ignore modes that are not supported by the current target.  */
1199 static unsigned int
1200 aarch64_classify_vector_mode (machine_mode mode)
1201 {
1202   if (aarch64_advsimd_struct_mode_p (mode))
1203     return VEC_ADVSIMD | VEC_STRUCT;
1204 
1205   if (aarch64_sve_pred_mode_p (mode))
1206     return VEC_SVE_PRED;
1207 
1208   scalar_mode inner = GET_MODE_INNER (mode);
1209   if (VECTOR_MODE_P (mode)
1210       && (inner == QImode
1211 	  || inner == HImode
1212 	  || inner == HFmode
1213 	  || inner == SImode
1214 	  || inner == SFmode
1215 	  || inner == DImode
1216 	  || inner == DFmode))
1217     {
1218       if (TARGET_SVE)
1219 	{
1220 	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1221 	    return VEC_SVE_DATA;
1222 	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1223 	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1224 	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1225 	    return VEC_SVE_DATA | VEC_STRUCT;
1226 	}
1227 
1228       /* This includes V1DF but not V1DI (which doesn't exist).  */
1229       if (TARGET_SIMD
1230 	  && (known_eq (GET_MODE_BITSIZE (mode), 64)
1231 	      || known_eq (GET_MODE_BITSIZE (mode), 128)))
1232 	return VEC_ADVSIMD;
1233     }
1234 
1235   return 0;
1236 }
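/* Some illustrative classifications (assuming the relevant TARGET_SIMD or
   TARGET_SVE feature is enabled and a non-fixed SVE vector length):
   V16QImode is VEC_ADVSIMD, OImode (a pair of 128-bit vectors) is
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode is VEC_SVE_DATA and VNx4BImode is
   VEC_SVE_PRED.  */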
1237 
1238 /* Return true if MODE is any of the data vector modes, including
1239    structure modes.  */
1240 static bool
1241 aarch64_vector_data_mode_p (machine_mode mode)
1242 {
1243   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1244 }
1245 
1246 /* Return true if MODE is an SVE data vector mode; either a single vector
1247    or a structure of vectors.  */
1248 static bool
1249 aarch64_sve_data_mode_p (machine_mode mode)
1250 {
1251   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1252 }
1253 
1254 /* Implement target hook TARGET_ARRAY_MODE.  */
1255 static opt_machine_mode
1256 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1257 {
1258   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1259       && IN_RANGE (nelems, 2, 4))
1260     return mode_for_vector (GET_MODE_INNER (mode),
1261 			    GET_MODE_NUNITS (mode) * nelems);
1262 
1263   return opt_machine_mode ();
1264 }
1265 
1266 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1267 static bool
1268 aarch64_array_mode_supported_p (machine_mode mode,
1269 				unsigned HOST_WIDE_INT nelems)
1270 {
1271   if (TARGET_SIMD
1272       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1273 	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
1274       && (nelems >= 2 && nelems <= 4))
1275     return true;
1276 
1277   return false;
1278 }
1279 
1280 /* Return the SVE predicate mode to use for elements that have
1281    ELEM_NBYTES bytes, if such a mode exists.  */
1282 
1283 opt_machine_mode
1284 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1285 {
1286   if (TARGET_SVE)
1287     {
1288       if (elem_nbytes == 1)
1289 	return VNx16BImode;
1290       if (elem_nbytes == 2)
1291 	return VNx8BImode;
1292       if (elem_nbytes == 4)
1293 	return VNx4BImode;
1294       if (elem_nbytes == 8)
1295 	return VNx2BImode;
1296     }
1297   return opt_machine_mode ();
1298 }
1299 
1300 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1301 
1302 static opt_machine_mode
1303 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1304 {
1305   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1306     {
1307       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1308       machine_mode pred_mode;
1309       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1310 	return pred_mode;
1311     }
1312 
1313   return default_get_mask_mode (nunits, nbytes);
1314 }
1315 
1316 /* Implement TARGET_HARD_REGNO_NREGS.  */
1317 
1318 static unsigned int
1319 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1320 {
1321   /* ??? Logically we should only need to provide a value when
1322      HARD_REGNO_MODE_OK says that the combination is valid,
1323      but at the moment we need to handle all modes.  Just ignore
1324      any runtime parts for registers that can't store them.  */
1325   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1326   switch (aarch64_regno_regclass (regno))
1327     {
1328     case FP_REGS:
1329     case FP_LO_REGS:
1330       if (aarch64_sve_data_mode_p (mode))
1331 	return exact_div (GET_MODE_SIZE (mode),
1332 			  BYTES_PER_SVE_VECTOR).to_constant ();
1333       return CEIL (lowest_size, UNITS_PER_VREG);
1334     case PR_REGS:
1335     case PR_LO_REGS:
1336     case PR_HI_REGS:
1337       return 1;
1338     default:
1339       return CEIL (lowest_size, UNITS_PER_WORD);
1340     }
1341   gcc_unreachable ();
1342 }
1343 
1344 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1345 
1346 static bool
1347 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1348 {
1349   if (GET_MODE_CLASS (mode) == MODE_CC)
1350     return regno == CC_REGNUM;
1351 
1352   if (regno == VG_REGNUM)
1353     /* This must have the same size as _Unwind_Word.  */
1354     return mode == DImode;
1355 
1356   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1357   if (vec_flags & VEC_SVE_PRED)
1358     return PR_REGNUM_P (regno);
1359 
1360   if (PR_REGNUM_P (regno))
1361     return 0;
1362 
1363   if (regno == SP_REGNUM)
1364     /* The purpose of comparing with ptr_mode is to support the
1365        global register variable associated with the stack pointer
1366        register via the syntax of asm ("wsp") in ILP32.  */
1367     return mode == Pmode || mode == ptr_mode;
1368 
1369   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1370     return mode == Pmode;
1371 
1372   if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1373     return true;
1374 
1375   if (FP_REGNUM_P (regno))
1376     {
1377       if (vec_flags & VEC_STRUCT)
1378 	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1379       else
1380 	return !VECTOR_MODE_P (mode) || vec_flags != 0;
1381     }
1382 
1383   return false;
1384 }
1385 
1386 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1387    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1388    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1389 
1390 static bool
1391 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1392 {
1393   return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1394 }
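/* For example, a 128-bit vector value held in q8 across a call is only
   partly preserved: the AAPCS64 requires callees to preserve just d8 (the
   low 64 bits of v8), so the upper 64 bits must be assumed clobbered.  */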
1395 
1396 /* Implement REGMODE_NATURAL_SIZE.  */
1397 poly_uint64
1398 aarch64_regmode_natural_size (machine_mode mode)
1399 {
1400   /* The natural size for SVE data modes is one SVE data vector,
1401      and similarly for predicates.  We can't independently modify
1402      anything smaller than that.  */
1403   /* ??? For now, only do this for variable-width SVE registers.
1404      Doing it for constant-sized registers breaks lower-subreg.c.  */
1405   /* ??? And once that's fixed, we should probably have similar
1406      code for Advanced SIMD.  */
1407   if (!aarch64_sve_vg.is_constant ())
1408     {
1409       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1410       if (vec_flags & VEC_SVE_PRED)
1411 	return BYTES_PER_SVE_PRED;
1412       if (vec_flags & VEC_SVE_DATA)
1413 	return BYTES_PER_SVE_VECTOR;
1414     }
1415   return UNITS_PER_WORD;
1416 }
1417 
1418 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1419 machine_mode
1420 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1421 				     machine_mode mode)
1422 {
1423   /* The predicate mode determines which bits are significant and
1424      which are "don't care".  Decreasing the number of lanes would
1425      lose data while increasing the number of lanes would make bits
1426      unnecessarily significant.  */
1427   if (PR_REGNUM_P (regno))
1428     return mode;
1429   if (known_ge (GET_MODE_SIZE (mode), 4))
1430     return mode;
1431   else
1432     return SImode;
1433 }
1434 
1435 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1436    that strcpy from constants will be faster.  */
1437 
1438 static HOST_WIDE_INT
1439 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1440 {
1441   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1442     return MAX (align, BITS_PER_WORD);
1443   return align;
1444 }
1445 
1446 /* Return true if calls to DECL should be treated as
1447    long-calls (i.e. called via a register).  */
1448 static bool
1449 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1450 {
1451   return false;
1452 }
1453 
1454 /* Return true if calls to symbol-ref SYM should be treated as
1455    long-calls (i.e. called via a register).  */
1456 bool
1457 aarch64_is_long_call_p (rtx sym)
1458 {
1459   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1460 }
1461 
1462 /* Return true if calls to symbol-ref SYM should not go through
1463    plt stubs.  */
1464 
1465 bool
1466 aarch64_is_noplt_call_p (rtx sym)
1467 {
1468   const_tree decl = SYMBOL_REF_DECL (sym);
1469 
1470   if (flag_pic
1471       && decl
1472       && (!flag_plt
1473 	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1474       && !targetm.binds_local_p (decl))
1475     return true;
1476 
1477   return false;
1478 }
1479 
1480 /* Return true if the offsets to a zero/sign-extract operation
1481    represent an expression that matches an extend operation.  The
1482    operands represent the parameters from
1483 
1484    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1485 bool
1486 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1487 				rtx extract_imm)
1488 {
1489   HOST_WIDE_INT mult_val, extract_val;
1490 
1491   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1492     return false;
1493 
1494   mult_val = INTVAL (mult_imm);
1495   extract_val = INTVAL (extract_imm);
1496 
1497   if (extract_val > 8
1498       && extract_val < GET_MODE_BITSIZE (mode)
1499       && exact_log2 (extract_val & ~7) > 0
1500       && (extract_val & 7) <= 4
1501       && mult_val == (1 << (extract_val & 7)))
1502     return true;
1503 
1504   return false;
1505 }
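/* A worked example, chosen purely for illustration: with MODE == DImode,
   MULT_IMM == 4 and EXTRACT_IMM == 34 we have 34 & ~7 == 32 (a power of
   two), 34 & 7 == 2 and 4 == 1 << 2, so the function returns true.  The
   extract of 34 bits from (reg * 4) is then equivalent to a 32-bit extend
   of the register shifted left by 2, i.e. the extended-register form used
   by AArch64 address and arithmetic operands.  */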
1506 
1507 /* Emit an insn that's a simple single-set.  Both the operands must be
1508    known to be valid.  */
1509 inline static rtx_insn *
1510 emit_set_insn (rtx x, rtx y)
1511 {
1512   return emit_insn (gen_rtx_SET (x, y));
1513 }
1514 
1515 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1516    return the rtx for register 0 in the proper mode.  */
1517 rtx
1518 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1519 {
1520   machine_mode mode = SELECT_CC_MODE (code, x, y);
1521   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1522 
1523   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1524   return cc_reg;
1525 }
1526 
1527 /* Build the SYMBOL_REF for __tls_get_addr.  */
1528 
1529 static GTY(()) rtx tls_get_addr_libfunc;
1530 
1531 rtx
1532 aarch64_tls_get_addr (void)
1533 {
1534   if (!tls_get_addr_libfunc)
1535     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1536   return tls_get_addr_libfunc;
1537 }
1538 
1539 /* Return the TLS model to use for ADDR.  */
1540 
1541 static enum tls_model
1542 tls_symbolic_operand_type (rtx addr)
1543 {
1544   enum tls_model tls_kind = TLS_MODEL_NONE;
1545   if (GET_CODE (addr) == CONST)
1546     {
1547       poly_int64 addend;
1548       rtx sym = strip_offset (addr, &addend);
1549       if (GET_CODE (sym) == SYMBOL_REF)
1550 	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1551     }
1552   else if (GET_CODE (addr) == SYMBOL_REF)
1553     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1554 
1555   return tls_kind;
1556 }
1557 
1558 /* We allow LO_SUMs in our legitimate addresses so that combine can
1559    take care of combining addresses where necessary, but for code
1560    generation purposes we generate the address
1561    as:
1562    RTL                               Absolute
1563    tmp = hi (symbol_ref);            adrp  x1, foo
1564    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1565                                      nop
1566 
1567    PIC                               TLS
1568    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1569    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1570                                      bl   __tls_get_addr
1571                                      nop
1572 
1573    Load TLS symbol, depending on TLS mechanism and TLS access model.
1574 
1575    Global Dynamic - Traditional TLS:
1576    adrp tmp, :tlsgd:imm
1577    add  dest, tmp, #:tlsgd_lo12:imm
1578    bl   __tls_get_addr
1579 
1580    Global Dynamic - TLS Descriptors:
1581    adrp dest, :tlsdesc:imm
1582    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1583    add  dest, dest, #:tlsdesc_lo12:imm
1584    blr  tmp
1585    mrs  tp, tpidr_el0
1586    add  dest, dest, tp
1587 
1588    Initial Exec:
1589    mrs  tp, tpidr_el0
1590    adrp tmp, :gottprel:imm
1591    ldr  dest, [tmp, #:gottprel_lo12:imm]
1592    add  dest, dest, tp
1593 
1594    Local Exec:
1595    mrs  tp, tpidr_el0
1596    add  t0, tp, #:tprel_hi12:imm, lsl #12
1597    add  t0, t0, #:tprel_lo12_nc:imm
1598 */
1599 
1600 static void
1601 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1602 				   enum aarch64_symbol_type type)
1603 {
1604   switch (type)
1605     {
1606     case SYMBOL_SMALL_ABSOLUTE:
1607       {
1608 	/* In ILP32, the mode of dest can be either SImode or DImode.  */
1609 	rtx tmp_reg = dest;
1610 	machine_mode mode = GET_MODE (dest);
1611 
1612 	gcc_assert (mode == Pmode || mode == ptr_mode);
1613 
1614 	if (can_create_pseudo_p ())
1615 	  tmp_reg = gen_reg_rtx (mode);
1616 
1617 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1618 	emit_insn (gen_add_losym (dest, tmp_reg, imm));
1619 	return;
1620       }
1621 
1622     case SYMBOL_TINY_ABSOLUTE:
1623       emit_insn (gen_rtx_SET (dest, imm));
1624       return;
1625 
1626     case SYMBOL_SMALL_GOT_28K:
1627       {
1628 	machine_mode mode = GET_MODE (dest);
1629 	rtx gp_rtx = pic_offset_table_rtx;
1630 	rtx insn;
1631 	rtx mem;
1632 
1633 	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1634 	   here before RTL expansion.  Tree IVOPTS generates RTL patterns to
1635 	   decide rtx costs, in which case pic_offset_table_rtx is not
1636 	   initialized.  In that case there is no need to generate the first
1637 	   adrp instruction, as the final cost for a global variable access
1638 	   is one instruction.  */
1639 	if (gp_rtx != NULL)
1640 	  {
1641 	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
1642 	       we use the page base as the GOT base, the first page may be
1643 	       wasted; in the worst case only 28K of GOT space remains).
1644 
1645 	       The generated instruction sequence for accessing a global
1646 	       variable is:
1647 
1648 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1649 
1650 	       Only one instruction is needed, but pic_offset_table_rtx must be
1651 	       initialized properly.  We generate an initialization insn for
1652 	       every global access and let CSE remove the redundant copies.
1653 
1654 	       The final instruction sequences will look like the following
1655 	       for multiply global variables access.
1656 
1657 		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1658 
1659 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1660 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1661 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1662 		 ...  */
1663 
1664 	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1665 	    crtl->uses_pic_offset_table = 1;
1666 	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1667 
1668 	    if (mode != GET_MODE (gp_rtx))
1669 	      gp_rtx = gen_lowpart (mode, gp_rtx);
1670 
1671 	  }
1672 
1673 	if (mode == ptr_mode)
1674 	  {
1675 	    if (mode == DImode)
1676 	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1677 	    else
1678 	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1679 
1680 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
1681 	  }
1682 	else
1683 	  {
1684 	    gcc_assert (mode == Pmode);
1685 
1686 	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1687 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1688 	  }
1689 
1690 	/* The operand is expected to be a MEM.  Whenever the related insn
1691 	   pattern changes, the code above that computes MEM must be
1692 	   updated as well.  */
1693 	gcc_assert (GET_CODE (mem) == MEM);
1694 	MEM_READONLY_P (mem) = 1;
1695 	MEM_NOTRAP_P (mem) = 1;
1696 	emit_insn (insn);
1697 	return;
1698       }
1699 
1700     case SYMBOL_SMALL_GOT_4G:
1701       {
1702 	/* In ILP32, the mode of dest can be either SImode or DImode,
1703 	   while the GOT entry is always SImode-sized.  The mode of
1704 	   dest depends on how dest is used: if dest is assigned to a
1705 	   pointer (e.g. stored in memory), it has SImode; it may have
1706 	   DImode if dest is dereferenced to access memory.
1707 	   This is why we have to handle three different ldr_got_small
1708 	   patterns here (two patterns for ILP32).  */
1709 
1710 	rtx insn;
1711 	rtx mem;
1712 	rtx tmp_reg = dest;
1713 	machine_mode mode = GET_MODE (dest);
1714 
1715 	if (can_create_pseudo_p ())
1716 	  tmp_reg = gen_reg_rtx (mode);
1717 
1718 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1719 	if (mode == ptr_mode)
1720 	  {
1721 	    if (mode == DImode)
1722 	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1723 	    else
1724 	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1725 
1726 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
1727 	  }
1728 	else
1729 	  {
1730 	    gcc_assert (mode == Pmode);
1731 
1732 	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1733 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1734 	  }
1735 
1736 	gcc_assert (GET_CODE (mem) == MEM);
1737 	MEM_READONLY_P (mem) = 1;
1738 	MEM_NOTRAP_P (mem) = 1;
1739 	emit_insn (insn);
1740 	return;
1741       }
1742 
1743     case SYMBOL_SMALL_TLSGD:
1744       {
1745 	rtx_insn *insns;
1746 	machine_mode mode = GET_MODE (dest);
1747 	rtx result = gen_rtx_REG (mode, R0_REGNUM);
1748 
1749 	start_sequence ();
1750 	if (TARGET_ILP32)
1751 	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1752 	else
1753 	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1754 	insns = get_insns ();
1755 	end_sequence ();
1756 
1757 	RTL_CONST_CALL_P (insns) = 1;
1758 	emit_libcall_block (insns, dest, result, imm);
1759 	return;
1760       }
1761 
1762     case SYMBOL_SMALL_TLSDESC:
1763       {
1764 	machine_mode mode = GET_MODE (dest);
1765 	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1766 	rtx tp;
1767 
1768 	gcc_assert (mode == Pmode || mode == ptr_mode);
1769 
1770 	/* In ILP32, the got entry is always of SImode size.  Unlike
1771 	   small GOT, the dest is fixed at reg 0.  */
1772 	if (TARGET_ILP32)
1773 	  emit_insn (gen_tlsdesc_small_si (imm));
1774 	else
1775 	  emit_insn (gen_tlsdesc_small_di (imm));
1776 	tp = aarch64_load_tp (NULL);
1777 
1778 	if (mode != Pmode)
1779 	  tp = gen_lowpart (mode, tp);
1780 
1781 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1782 	if (REG_P (dest))
1783 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1784 	return;
1785       }
1786 
1787     case SYMBOL_SMALL_TLSIE:
1788       {
1789 	/* In ILP32, the mode of dest can be either SImode or DImode,
1790 	   while the GOT entry is always SImode-sized.  The mode of
1791 	   dest depends on how dest is used: if dest is assigned to a
1792 	   pointer (e.g. stored in memory), it has SImode; it may have
1793 	   DImode if dest is dereferenced to access memory.
1794 	   This is why we have to handle three different tlsie_small
1795 	   patterns here (two patterns for ILP32).  */
1796 	machine_mode mode = GET_MODE (dest);
1797 	rtx tmp_reg = gen_reg_rtx (mode);
1798 	rtx tp = aarch64_load_tp (NULL);
1799 
1800 	if (mode == ptr_mode)
1801 	  {
1802 	    if (mode == DImode)
1803 	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1804 	    else
1805 	      {
1806 		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1807 		tp = gen_lowpart (mode, tp);
1808 	      }
1809 	  }
1810 	else
1811 	  {
1812 	    gcc_assert (mode == Pmode);
1813 	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1814 	  }
1815 
1816 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1817 	if (REG_P (dest))
1818 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1819 	return;
1820       }
1821 
1822     case SYMBOL_TLSLE12:
1823     case SYMBOL_TLSLE24:
1824     case SYMBOL_TLSLE32:
1825     case SYMBOL_TLSLE48:
1826       {
1827 	machine_mode mode = GET_MODE (dest);
1828 	rtx tp = aarch64_load_tp (NULL);
1829 
1830 	if (mode != Pmode)
1831 	  tp = gen_lowpart (mode, tp);
1832 
1833 	switch (type)
1834 	  {
1835 	  case SYMBOL_TLSLE12:
1836 	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1837 			(dest, tp, imm));
1838 	    break;
1839 	  case SYMBOL_TLSLE24:
1840 	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1841 			(dest, tp, imm));
1842 	    break;
1843 	  case SYMBOL_TLSLE32:
1844 	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1845 			(dest, imm));
1846 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1847 			(dest, dest, tp));
1848 	    break;
1849 	  case SYMBOL_TLSLE48:
1850 	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1851 			(dest, imm));
1852 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1853 			(dest, dest, tp));
1854 	    break;
1855 	  default:
1856 	    gcc_unreachable ();
1857 	  }
1858 
1859 	if (REG_P (dest))
1860 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1861 	return;
1862       }
1863 
1864     case SYMBOL_TINY_GOT:
1865       emit_insn (gen_ldr_got_tiny (dest, imm));
1866       return;
1867 
1868     case SYMBOL_TINY_TLSIE:
1869       {
1870 	machine_mode mode = GET_MODE (dest);
1871 	rtx tp = aarch64_load_tp (NULL);
1872 
1873 	if (mode == ptr_mode)
1874 	  {
1875 	    if (mode == DImode)
1876 	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1877 	    else
1878 	      {
1879 		tp = gen_lowpart (mode, tp);
1880 		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1881 	      }
1882 	  }
1883 	else
1884 	  {
1885 	    gcc_assert (mode == Pmode);
1886 	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1887 	  }
1888 
1889 	if (REG_P (dest))
1890 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1891 	return;
1892       }
1893 
1894     default:
1895       gcc_unreachable ();
1896     }
1897 }
1898 
1899 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1900    handle all moves if !can_create_pseudo_p ().  The distinction is
1901    important because, unlike emit_move_insn, the move expanders know
1902    how to force Pmode objects into the constant pool even when the
1903    constant pool address is not itself legitimate.  */
1904 static rtx
1905 aarch64_emit_move (rtx dest, rtx src)
1906 {
1907   return (can_create_pseudo_p ()
1908 	  ? emit_move_insn (dest, src)
1909 	  : emit_move_insn_1 (dest, src));
1910 }
1911 
1912 /* Split a 128-bit move operation into two 64-bit move operations,
1913    taking care to handle partial overlap of register to register
1914    copies.  Special cases are needed when moving between GP regs and
1915    FP regs.  SRC can be a register, constant or memory; DST a register
1916    or memory.  If either operand is memory it must not have any side
1917    effects.  */
1918 void
1919 aarch64_split_128bit_move (rtx dst, rtx src)
1920 {
1921   rtx dst_lo, dst_hi;
1922   rtx src_lo, src_hi;
1923 
1924   machine_mode mode = GET_MODE (dst);
1925 
1926   gcc_assert (mode == TImode || mode == TFmode);
1927   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1928   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1929 
1930   if (REG_P (dst) && REG_P (src))
1931     {
1932       int src_regno = REGNO (src);
1933       int dst_regno = REGNO (dst);
1934 
1935       /* Handle FP <-> GP regs.  */
1936       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1937 	{
1938 	  src_lo = gen_lowpart (word_mode, src);
1939 	  src_hi = gen_highpart (word_mode, src);
1940 
1941 	  if (mode == TImode)
1942 	    {
1943 	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1944 	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1945 	    }
1946 	  else
1947 	    {
1948 	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1949 	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1950 	    }
1951 	  return;
1952 	}
1953       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1954 	{
1955 	  dst_lo = gen_lowpart (word_mode, dst);
1956 	  dst_hi = gen_highpart (word_mode, dst);
1957 
1958 	  if (mode == TImode)
1959 	    {
1960 	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1961 	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1962 	    }
1963 	  else
1964 	    {
1965 	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1966 	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1967 	    }
1968 	  return;
1969 	}
1970     }
1971 
1972   dst_lo = gen_lowpart (word_mode, dst);
1973   dst_hi = gen_highpart (word_mode, dst);
1974   src_lo = gen_lowpart (word_mode, src);
1975   src_hi = gen_highpart_mode (word_mode, mode, src);
1976 
1977   /* At most one pairing may overlap.  */
1978   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1979     {
1980       aarch64_emit_move (dst_hi, src_hi);
1981       aarch64_emit_move (dst_lo, src_lo);
1982     }
1983   else
1984     {
1985       aarch64_emit_move (dst_lo, src_lo);
1986       aarch64_emit_move (dst_hi, src_hi);
1987     }
1988 }
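
/* An illustrative sketch (assumed register choices, not verbatim compiler
   output): a TImode copy between general registers, say x2:x3 := x0:x1,
   becomes two DImode moves.  If the low half of the destination overlaps
   the high half of the source, the high move is emitted first so that no
   source half is clobbered before it has been read.  */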
1989 
1990 bool
1991 aarch64_split_128bit_move_p (rtx dst, rtx src)
1992 {
1993   return (! REG_P (src)
1994 	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1995 }
1996 
1997 /* Split a complex SIMD combine.  */
1998 
1999 void
2000 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2001 {
2002   machine_mode src_mode = GET_MODE (src1);
2003   machine_mode dst_mode = GET_MODE (dst);
2004 
2005   gcc_assert (VECTOR_MODE_P (dst_mode));
2006   gcc_assert (register_operand (dst, dst_mode)
2007 	      && register_operand (src1, src_mode)
2008 	      && register_operand (src2, src_mode));
2009 
2010   rtx (*gen) (rtx, rtx, rtx);
2011 
2012   switch (src_mode)
2013     {
2014     case E_V8QImode:
2015       gen = gen_aarch64_simd_combinev8qi;
2016       break;
2017     case E_V4HImode:
2018       gen = gen_aarch64_simd_combinev4hi;
2019       break;
2020     case E_V2SImode:
2021       gen = gen_aarch64_simd_combinev2si;
2022       break;
2023     case E_V4HFmode:
2024       gen = gen_aarch64_simd_combinev4hf;
2025       break;
2026     case E_V2SFmode:
2027       gen = gen_aarch64_simd_combinev2sf;
2028       break;
2029     case E_DImode:
2030       gen = gen_aarch64_simd_combinedi;
2031       break;
2032     case E_DFmode:
2033       gen = gen_aarch64_simd_combinedf;
2034       break;
2035     default:
2036       gcc_unreachable ();
2037     }
2038 
2039   emit_insn (gen (dst, src1, src2));
2040   return;
2041 }
2042 
2043 /* Split a complex SIMD move.  */
2044 
2045 void
2046 aarch64_split_simd_move (rtx dst, rtx src)
2047 {
2048   machine_mode src_mode = GET_MODE (src);
2049   machine_mode dst_mode = GET_MODE (dst);
2050 
2051   gcc_assert (VECTOR_MODE_P (dst_mode));
2052 
2053   if (REG_P (dst) && REG_P (src))
2054     {
2055       rtx (*gen) (rtx, rtx);
2056 
2057       gcc_assert (VECTOR_MODE_P (src_mode));
2058 
2059       switch (src_mode)
2060 	{
2061 	case E_V16QImode:
2062 	  gen = gen_aarch64_split_simd_movv16qi;
2063 	  break;
2064 	case E_V8HImode:
2065 	  gen = gen_aarch64_split_simd_movv8hi;
2066 	  break;
2067 	case E_V4SImode:
2068 	  gen = gen_aarch64_split_simd_movv4si;
2069 	  break;
2070 	case E_V2DImode:
2071 	  gen = gen_aarch64_split_simd_movv2di;
2072 	  break;
2073 	case E_V8HFmode:
2074 	  gen = gen_aarch64_split_simd_movv8hf;
2075 	  break;
2076 	case E_V4SFmode:
2077 	  gen = gen_aarch64_split_simd_movv4sf;
2078 	  break;
2079 	case E_V2DFmode:
2080 	  gen = gen_aarch64_split_simd_movv2df;
2081 	  break;
2082 	default:
2083 	  gcc_unreachable ();
2084 	}
2085 
2086       emit_insn (gen (dst, src));
2087       return;
2088     }
2089 }
2090 
2091 bool
2092 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2093 			      machine_mode ymode, rtx y)
2094 {
2095   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2096   gcc_assert (r != NULL);
2097   return rtx_equal_p (x, r);
2098 }
2099 
2100 
2101 static rtx
2102 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2103 {
2104   if (can_create_pseudo_p ())
2105     return force_reg (mode, value);
2106   else
2107     {
2108       gcc_assert (x);
2109       aarch64_emit_move (x, value);
2110       return x;
2111     }
2112 }
2113 
2114 /* Return true if we can move VALUE into a register using a single
2115    CNT[BHWD] instruction.  */
2116 
2117 static bool
2118 aarch64_sve_cnt_immediate_p (poly_int64 value)
2119 {
2120   HOST_WIDE_INT factor = value.coeffs[0];
2121   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
2122   return (value.coeffs[1] == factor
2123 	  && IN_RANGE (factor, 2, 16 * 16)
2124 	  && (factor & 1) == 0
2125 	  && factor <= 16 * (factor & -factor));
2126 }
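
/* Some assumed examples: the poly_int64 (2, 2) corresponds to CNTD and
   (16, 16) to CNTB, while (32, 32) needs "CNTB ..., ALL, MUL #2"; all
   three are accepted.  (3, 3) is rejected because the factor is odd, and
   (544, 544) because no element size gives a multiplier of at most 16.  */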
2127 
2128 /* Likewise for rtx X.  */
2129 
2130 bool
2131 aarch64_sve_cnt_immediate_p (rtx x)
2132 {
2133   poly_int64 value;
2134   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2135 }
2136 
2137 /* Return the asm string for an instruction with a CNT-like vector size
2138    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2139    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2140    first part of the operands template (the part that comes before the
2141    vector size itself).  FACTOR is the number of quadwords.
2142    NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2143    If it is zero, we can use any element size.  */
2144 
2145 static char *
2146 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2147 				  unsigned int factor,
2148 				  unsigned int nelts_per_vq)
2149 {
2150   static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2151 
2152   if (nelts_per_vq == 0)
2153     /* There is some overlap in the ranges of the four CNT instructions.
2154        Here we always use the smallest possible element size, so that the
2155        multiplier is 1 wherever possible.  */
2156     nelts_per_vq = factor & -factor;
2157   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2158   gcc_assert (IN_RANGE (shift, 1, 4));
2159   char suffix = "dwhb"[shift - 1];
2160 
2161   factor >>= shift;
2162   unsigned int written;
2163   if (factor == 1)
2164     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2165 			prefix, suffix, operands);
2166   else
2167     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2168 			prefix, suffix, operands, factor);
2169   gcc_assert (written < sizeof (buffer));
2170   return buffer;
2171 }
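
/* For illustration (assumed operand strings): with PREFIX "cnt" and
   OPERANDS "%x0", a FACTOR of 2 with NELTS_PER_VQ 0 prints as "cntd\t%x0",
   while a FACTOR of 32 prints as "cntb\t%x0, all, mul #2".  With
   NELTS_PER_VQ 8 (halfword elements), a FACTOR of 16 prints as
   "cnth\t%x0, all, mul #2".  */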
2172 
2173 /* Return the asm string for an instruction with a CNT-like vector size
2174    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2175    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2176    first part of the operands template (the part that comes before the
2177    vector size itself).  X is the value of the vector size operand,
2178    as a polynomial integer rtx.  */
2179 
2180 char *
2181 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2182 				  rtx x)
2183 {
2184   poly_int64 value = rtx_to_poly_int64 (x);
2185   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2186   return aarch64_output_sve_cnt_immediate (prefix, operands,
2187 					   value.coeffs[1], 0);
2188 }
2189 
2190 /* Return true if we can add VALUE to a register using a single ADDVL
2191    or ADDPL instruction.  */
2192 
2193 static bool
2194 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2195 {
2196   HOST_WIDE_INT factor = value.coeffs[0];
2197   if (factor == 0 || value.coeffs[1] != factor)
2198     return false;
2199   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2200      and a value of 16 is one vector width.  */
2201   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2202 	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2203 }
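
/* Assumed examples: (16, 16) can be added with "addvl ..., #1" and (2, 2)
   with "addpl ..., #1"; (34, 34) is still representable as "addpl ..., #17".
   (3, 3) is rejected because the factor is odd, and (100, 100) because it
   is neither a multiple of 16 nor within the ADDPL range.  */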
2204 
2205 /* Likewise for rtx X.  */
2206 
2207 bool
2208 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2209 {
2210   poly_int64 value;
2211   return (poly_int_rtx_p (x, &value)
2212 	  && aarch64_sve_addvl_addpl_immediate_p (value));
2213 }
2214 
2215 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2216    and storing the result in operand 0.  */
2217 
2218 char *
2219 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2220 {
2221   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2222   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2223   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2224 
2225   /* Use INC or DEC if possible.  */
2226   if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2227     {
2228       if (aarch64_sve_cnt_immediate_p (offset_value))
2229 	return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2230 						 offset_value.coeffs[1], 0);
2231       if (aarch64_sve_cnt_immediate_p (-offset_value))
2232 	return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2233 						 -offset_value.coeffs[1], 0);
2234     }
2235 
2236   int factor = offset_value.coeffs[1];
2237   if ((factor & 15) == 0)
2238     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2239   else
2240     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2241   return buffer;
2242 }
2243 
2244 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2245    instruction.  If it is, store the number of elements in each vector
2246    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2247    factor in *FACTOR_OUT (if nonnull).  */
2248 
2249 bool
2250 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2251 				 unsigned int *nelts_per_vq_out)
2252 {
2253   rtx elt;
2254   poly_int64 value;
2255 
2256   if (!const_vec_duplicate_p (x, &elt)
2257       || !poly_int_rtx_p (elt, &value))
2258     return false;
2259 
2260   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2261   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2262     /* There's no vector INCB.  */
2263     return false;
2264 
2265   HOST_WIDE_INT factor = value.coeffs[0];
2266   if (value.coeffs[1] != factor)
2267     return false;
2268 
2269   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
2270   if ((factor % nelts_per_vq) != 0
2271       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2272     return false;
2273 
2274   if (factor_out)
2275     *factor_out = factor;
2276   if (nelts_per_vq_out)
2277     *nelts_per_vq_out = nelts_per_vq;
2278   return true;
2279 }
2280 
2281 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2282    instruction.  */
2283 
2284 bool
2285 aarch64_sve_inc_dec_immediate_p (rtx x)
2286 {
2287   return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2288 }
2289 
2290 /* Return the asm template for an SVE vector INC or DEC instruction.
2291    OPERANDS gives the operands before the vector count and X is the
2292    value of the vector count operand itself.  */
2293 
2294 char *
2295 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2296 {
2297   int factor;
2298   unsigned int nelts_per_vq;
2299   if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2300     gcc_unreachable ();
2301   if (factor < 0)
2302     return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2303 					     nelts_per_vq);
2304   else
2305     return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2306 					     nelts_per_vq);
2307 }
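
/* A rough example (assumed operands): for a VNx8HI operand, a duplicated
   value of (8, 8) prints as "inch\t<operands>", while a duplicated value
   of (-16, -16) prints as "dech\t<operands>, all, mul #2".  */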
2308 
2309 static int
2310 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2311 				scalar_int_mode mode)
2312 {
2313   int i;
2314   unsigned HOST_WIDE_INT val, val2, mask;
2315   int one_match, zero_match;
2316   int num_insns;
2317 
2318   val = INTVAL (imm);
2319 
2320   if (aarch64_move_imm (val, mode))
2321     {
2322       if (generate)
2323 	emit_insn (gen_rtx_SET (dest, imm));
2324       return 1;
2325     }
2326 
2327   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2328      (with XXXX non-zero). In that case check to see if the move can be done in
2329      a smaller mode.  */
2330   val2 = val & 0xffffffff;
2331   if (mode == DImode
2332       && aarch64_move_imm (val2, SImode)
2333       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2334     {
2335       if (generate)
2336 	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2337 
2338       /* Check if we have to emit a second instruction by checking to see
2339          if any of the upper 32 bits of the original DI mode value is set.  */
2340       if (val == val2)
2341 	return 1;
2342 
2343       i = (val >> 48) ? 48 : 32;
2344 
2345       if (generate)
2346 	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2347 				    GEN_INT ((val >> i) & 0xffff)));
2348 
2349       return 2;
2350     }
2351 
2352   if ((val >> 32) == 0 || mode == SImode)
2353     {
2354       if (generate)
2355 	{
2356 	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2357 	  if (mode == SImode)
2358 	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2359 				       GEN_INT ((val >> 16) & 0xffff)));
2360 	  else
2361 	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2362 				       GEN_INT ((val >> 16) & 0xffff)));
2363 	}
2364       return 2;
2365     }
2366 
2367   /* Remaining cases are all for DImode.  */
2368 
2369   mask = 0xffff;
2370   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2371     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2372   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2373     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2374 
2375   if (zero_match != 2 && one_match != 2)
2376     {
2377       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2378 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
2379 	 zeroes creates a valid bitmask.  To check any repeated bitmask,
2380 	 try using 16 bits from the other 32-bit half of val.  */
2381 
2382       for (i = 0; i < 64; i += 16, mask <<= 16)
2383 	{
2384 	  val2 = val & ~mask;
2385 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2386 	    break;
2387 	  val2 = val | mask;
2388 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2389 	    break;
2390 	  val2 = val2 & ~mask;
2391 	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2392 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2393 	    break;
2394 	}
2395       if (i != 64)
2396 	{
2397 	  if (generate)
2398 	    {
2399 	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2400 	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2401 					 GEN_INT ((val >> i) & 0xffff)));
2402 	    }
2403 	  return 2;
2404 	}
2405     }
2406 
2407   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2408      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
2409      otherwise skip zero bits.  */
2410 
2411   num_insns = 1;
2412   mask = 0xffff;
2413   val2 = one_match > zero_match ? ~val : val;
2414   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2415 
2416   if (generate)
2417     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2418 					   ? (val | ~(mask << i))
2419 					   : (val & (mask << i)))));
2420   for (i += 16; i < 64; i += 16)
2421     {
2422       if ((val2 & (mask << i)) == 0)
2423 	continue;
2424       if (generate)
2425 	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2426 				   GEN_INT ((val >> i) & 0xffff)));
2427       num_insns ++;
2428     }
2429 
2430   return num_insns;
2431 }
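
/* A rough worked example (illustrative, not verbatim output): the DImode
   constant 0x1234000000005678 has two all-zero 16-bit chunks, so it is
   built as "mov x0, #0x5678" followed by "movk x0, #0x1234, lsl #48",
   i.e. two instructions.  A value such as 0xffffffffffff1234 is already a
   valid MOVN immediate and is emitted as a single move.  */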
2432 
2433 /* Return whether imm is a 128-bit immediate which is simple enough to
2434    expand inline.  */
2435 bool
2436 aarch64_mov128_immediate (rtx imm)
2437 {
2438   if (GET_CODE (imm) == CONST_INT)
2439     return true;
2440 
2441   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2442 
2443   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2444   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2445 
2446   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2447 	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2448 }
2449 
2450 
2451 /* Return the number of temporary registers that aarch64_add_offset_1
2452    would need to add OFFSET to a register.  */
2453 
2454 static unsigned int
2455 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2456 {
2457   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2458 }
2459 
2460 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
2461    a non-polynomial OFFSET.  MODE is the mode of the addition.
2462    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2463    be set and CFA adjustments added to the generated instructions.
2464 
2465    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2466    temporary if register allocation is already complete.  This temporary
2467    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
2468    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2469    the immediate again.
2470 
2471    Since this function may be used to adjust the stack pointer, we must
2472    ensure that it cannot cause transient stack deallocation (for example
2473    by first incrementing SP and then decrementing when adjusting by a
2474    large immediate).  */
2475 
2476 static void
2477 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2478 		      rtx src, HOST_WIDE_INT offset, rtx temp1,
2479 		      bool frame_related_p, bool emit_move_imm)
2480 {
2481   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2482   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2483 
2484   HOST_WIDE_INT moffset = abs_hwi (offset);
2485   rtx_insn *insn;
2486 
2487   if (!moffset)
2488     {
2489       if (!rtx_equal_p (dest, src))
2490 	{
2491 	  insn = emit_insn (gen_rtx_SET (dest, src));
2492 	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
2493 	}
2494       return;
2495     }
2496 
2497   /* Single instruction adjustment.  */
2498   if (aarch64_uimm12_shift (moffset))
2499     {
2500       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2501       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2502       return;
2503     }
2504 
2505   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2506      and either:
2507 
2508      a) the offset cannot be loaded by a 16-bit move or
2509      b) there is no spare register into which we can move it.  */
2510   if (moffset < 0x1000000
2511       && ((!temp1 && !can_create_pseudo_p ())
2512 	  || !aarch64_move_imm (moffset, mode)))
2513     {
2514       HOST_WIDE_INT low_off = moffset & 0xfff;
2515 
2516       low_off = offset < 0 ? -low_off : low_off;
2517       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2518       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2519       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2520       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521       return;
2522     }
2523 
2524   /* Emit a move immediate if required and an addition/subtraction.  */
2525   if (emit_move_imm)
2526     {
2527       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2528       temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2529     }
2530   insn = emit_insn (offset < 0
2531 		    ? gen_sub3_insn (dest, src, temp1)
2532 		    : gen_add3_insn (dest, src, temp1));
2533   if (frame_related_p)
2534     {
2535       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2536       rtx adj = plus_constant (mode, src, offset);
2537       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2538     }
2539 }
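
/* An assumed example: adding the offset 0x123456 (which fits in 24 bits
   but is neither a shifted 12-bit immediate nor a MOV immediate) is split
   into "add dest, src, #0x456" followed by "add dest, dest, #0x123000".
   Larger offsets are first moved into TEMP1 (or a fresh pseudo) and then
   added or subtracted as a register operand.  */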
2540 
2541 /* Return the number of temporary registers that aarch64_add_offset
2542    would need to move OFFSET into a register or add OFFSET to a register;
2543    ADD_P is true if we want the latter rather than the former.  */
2544 
2545 static unsigned int
2546 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2547 {
2548   /* This follows the same structure as aarch64_add_offset.  */
2549   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2550     return 0;
2551 
2552   unsigned int count = 0;
2553   HOST_WIDE_INT factor = offset.coeffs[1];
2554   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2555   poly_int64 poly_offset (factor, factor);
2556   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2557     /* Need one register for the ADDVL/ADDPL result.  */
2558     count += 1;
2559   else if (factor != 0)
2560     {
2561       factor = abs (factor);
2562       if (factor > 16 * (factor & -factor))
2563 	/* Need one register for the CNT result and one for the multiplication
2564 	   factor.  If necessary, the second temporary can be reused for the
2565 	   constant part of the offset.  */
2566 	return 2;
2567       /* Need one register for the CNT result (which might then
2568 	 be shifted).  */
2569       count += 1;
2570     }
2571   return count + aarch64_add_offset_1_temporaries (constant);
2572 }
2573 
2574 /* If X can be represented as a poly_int64, return the number
2575    of temporaries that are required to add it to a register.
2576    Return -1 otherwise.  */
2577 
2578 int
2579 aarch64_add_offset_temporaries (rtx x)
2580 {
2581   poly_int64 offset;
2582   if (!poly_int_rtx_p (x, &offset))
2583     return -1;
2584   return aarch64_offset_temporaries (true, offset);
2585 }
2586 
2587 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
2588    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2589    be set and CFA adjustments added to the generated instructions.
2590 
2591    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2592    temporary if register allocation is already complete.  This temporary
2593    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2594    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2595    false to avoid emitting the immediate again.
2596 
2597    TEMP2, if nonnull, is a second temporary register that doesn't
2598    overlap either DEST or SRC.
2599 
2600    Since this function may be used to adjust the stack pointer, we must
2601    ensure that it cannot cause transient stack deallocation (for example
2602    by first incrementing SP and then decrementing when adjusting by a
2603    large immediate).  */
2604 
2605 static void
2606 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2607 		    poly_int64 offset, rtx temp1, rtx temp2,
2608 		    bool frame_related_p, bool emit_move_imm = true)
2609 {
2610   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2611   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2612   gcc_assert (temp1 == NULL_RTX
2613 	      || !frame_related_p
2614 	      || !reg_overlap_mentioned_p (temp1, dest));
2615   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2616 
2617   /* Try using ADDVL or ADDPL to add the whole value.  */
2618   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2619     {
2620       rtx offset_rtx = gen_int_mode (offset, mode);
2621       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2622       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2623       return;
2624     }
2625 
2626   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2627      SVE vector register, over and above the minimum size of 128 bits.
2628      This is equivalent to half the value returned by CNTD with a
2629      vector shape of ALL.  */
2630   HOST_WIDE_INT factor = offset.coeffs[1];
2631   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2632 
2633   /* Try using ADDVL or ADDPL to add the VG-based part.  */
2634   poly_int64 poly_offset (factor, factor);
2635   if (src != const0_rtx
2636       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2637     {
2638       rtx offset_rtx = gen_int_mode (poly_offset, mode);
2639       if (frame_related_p)
2640 	{
2641 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2642 	  RTX_FRAME_RELATED_P (insn) = true;
2643 	  src = dest;
2644 	}
2645       else
2646 	{
2647 	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2648 	  src = aarch64_force_temporary (mode, temp1, addr);
2649 	  temp1 = temp2;
2650 	  temp2 = NULL_RTX;
2651 	}
2652     }
2653   /* Otherwise use a CNT-based sequence.  */
2654   else if (factor != 0)
2655     {
2656       /* Use a subtraction if we have a negative factor.  */
2657       rtx_code code = PLUS;
2658       if (factor < 0)
2659 	{
2660 	  factor = -factor;
2661 	  code = MINUS;
2662 	}
2663 
2664       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
2665 	 into the multiplication.  */
2666       rtx val;
2667       int shift = 0;
2668       if (factor & 1)
2669 	/* Use a right shift by 1.  */
2670 	shift = -1;
2671       else
2672 	factor /= 2;
2673       HOST_WIDE_INT low_bit = factor & -factor;
2674       if (factor <= 16 * low_bit)
2675 	{
2676 	  if (factor > 16 * 8)
2677 	    {
2678 	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2679 		 the value with the minimum multiplier and shift it into
2680 		 position.  */
2681 	      int extra_shift = exact_log2 (low_bit);
2682 	      shift += extra_shift;
2683 	      factor >>= extra_shift;
2684 	    }
2685 	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2686 	}
2687       else
2688 	{
2689 	  /* Use CNTD, then multiply it by FACTOR.  */
2690 	  val = gen_int_mode (poly_int64 (2, 2), mode);
2691 	  val = aarch64_force_temporary (mode, temp1, val);
2692 
2693 	  /* Go back to using a negative multiplication factor if we have
2694 	     no register from which to subtract.  */
2695 	  if (code == MINUS && src == const0_rtx)
2696 	    {
2697 	      factor = -factor;
2698 	      code = PLUS;
2699 	    }
2700 	  rtx coeff1 = gen_int_mode (factor, mode);
2701 	  coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2702 	  val = gen_rtx_MULT (mode, val, coeff1);
2703 	}
2704 
2705       if (shift > 0)
2706 	{
2707 	  /* Multiply by 1 << SHIFT.  */
2708 	  val = aarch64_force_temporary (mode, temp1, val);
2709 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2710 	}
2711       else if (shift == -1)
2712 	{
2713 	  /* Divide by 2.  */
2714 	  val = aarch64_force_temporary (mode, temp1, val);
2715 	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2716 	}
2717 
2718       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
2719       if (src != const0_rtx)
2720 	{
2721 	  val = aarch64_force_temporary (mode, temp1, val);
2722 	  val = gen_rtx_fmt_ee (code, mode, src, val);
2723 	}
2724       else if (code == MINUS)
2725 	{
2726 	  val = aarch64_force_temporary (mode, temp1, val);
2727 	  val = gen_rtx_NEG (mode, val);
2728 	}
2729 
2730       if (constant == 0 || frame_related_p)
2731 	{
2732 	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2733 	  if (frame_related_p)
2734 	    {
2735 	      RTX_FRAME_RELATED_P (insn) = true;
2736 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
2737 			    gen_rtx_SET (dest, plus_constant (Pmode, src,
2738 							      poly_offset)));
2739 	    }
2740 	  src = dest;
2741 	  if (constant == 0)
2742 	    return;
2743 	}
2744       else
2745 	{
2746 	  src = aarch64_force_temporary (mode, temp1, val);
2747 	  temp1 = temp2;
2748 	  temp2 = NULL_RTX;
2749 	}
2750 
2751       emit_move_imm = true;
2752     }
2753 
2754   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2755 			frame_related_p, emit_move_imm);
2756 }
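
/* An illustrative sketch (assumed values): for OFFSET equal to the
   poly_int64 (48, 16), the VG-based part (16, 16) can be added with a
   single "addvl ..., #1" and the remaining constant 32 with a single
   "add ..., #32".  Factors outside the ADDVL/ADDPL range fall back to
   the CNT-based sequence above.  */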
2757 
2758 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2759    than a poly_int64.  */
2760 
2761 void
2762 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2763 			  rtx offset_rtx, rtx temp1, rtx temp2)
2764 {
2765   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2766 		      temp1, temp2, false);
2767 }
2768 
2769 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2770    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
2771    if TEMP1 already contains abs (DELTA).  */
2772 
2773 static inline void
2774 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2775 {
2776   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2777 		      temp1, temp2, true, emit_move_imm);
2778 }
2779 
2780 /* Subtract DELTA from the stack pointer, marking the instructions
2781    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
2782    if nonnull.  */
2783 
2784 static inline void
2785 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2786 {
2787   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2788 		      temp1, temp2, frame_related_p);
2789 }
2790 
2791 /* Set DEST to (vec_series BASE STEP).  */
2792 
2793 static void
2794 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2795 {
2796   machine_mode mode = GET_MODE (dest);
2797   scalar_mode inner = GET_MODE_INNER (mode);
2798 
2799   /* Each operand can be a register or an immediate in the range [-16, 15].  */
2800   if (!aarch64_sve_index_immediate_p (base))
2801     base = force_reg (inner, base);
2802   if (!aarch64_sve_index_immediate_p (step))
2803     step = force_reg (inner, step);
2804 
2805   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2806 }
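
/* For example (a sketch, with assumed register allocation): a VNx4SI
   series with base 0 and step 1 can be emitted directly as
   "index z0.s, #0, #1", since both operands are within the immediate
   range [-16, 15]; a step of 100 would instead be forced into a scalar
   register first.  */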
2807 
2808 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2809    integer of mode SRC_MODE.  Return true on success.  */
2810 
2811 static bool
2812 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2813 				      rtx src)
2814 {
2815   /* If the constant is smaller than 128 bits, we can do the move
2816      using a vector of SRC_MODEs.  */
2817   if (src_mode != TImode)
2818     {
2819       poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2820 				     GET_MODE_SIZE (src_mode));
2821       machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2822       emit_move_insn (gen_lowpart (dup_mode, dest),
2823 		      gen_const_vec_duplicate (dup_mode, src));
2824       return true;
2825     }
2826 
2827   /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
2828   src = force_const_mem (src_mode, src);
2829   if (!src)
2830     return false;
2831 
2832   /* Make sure that the address is legitimate.  */
2833   if (!aarch64_sve_ld1r_operand_p (src))
2834     {
2835       rtx addr = force_reg (Pmode, XEXP (src, 0));
2836       src = replace_equiv_address (src, addr);
2837     }
2838 
2839   machine_mode mode = GET_MODE (dest);
2840   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2841   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2842   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2843   src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2844   emit_insn (gen_rtx_SET (dest, src));
2845   return true;
2846 }
2847 
2848 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2849    isn't a simple duplicate or series.  */
2850 
2851 static void
2852 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2853 {
2854   machine_mode mode = GET_MODE (src);
2855   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2856   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2857   gcc_assert (npatterns > 1);
2858 
2859   if (nelts_per_pattern == 1)
2860     {
2861       /* The constant is a repeating sequence of at least two elements,
2862 	 where the repeating elements occupy no more than 128 bits.
2863 	 Get an integer representation of the replicated value.  */
2864       scalar_int_mode int_mode;
2865       if (BYTES_BIG_ENDIAN)
2866 	/* For now, always use LD1RQ to load the value on big-endian
2867 	   targets, since the handling of smaller integers includes a
2868 	   subreg that is semantically an element reverse.  */
2869 	int_mode = TImode;
2870       else
2871 	{
2872 	  unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2873 	  gcc_assert (int_bits <= 128);
2874 	  int_mode = int_mode_for_size (int_bits, 0).require ();
2875 	}
2876       rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2877       if (int_value
2878 	  && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2879 	return;
2880     }
2881 
2882   /* Expand each pattern individually.  */
2883   rtx_vector_builder builder;
2884   auto_vec<rtx, 16> vectors (npatterns);
2885   for (unsigned int i = 0; i < npatterns; ++i)
2886     {
2887       builder.new_vector (mode, 1, nelts_per_pattern);
2888       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2889 	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2890       vectors.quick_push (force_reg (mode, builder.build ()));
2891     }
2892 
2893   /* Use permutes to interleave the separate vectors.  */
2894   while (npatterns > 1)
2895     {
2896       npatterns /= 2;
2897       for (unsigned int i = 0; i < npatterns; ++i)
2898 	{
2899 	  rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2900 	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2901 	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2902 	  vectors[i] = tmp;
2903 	}
2904     }
2905   gcc_assert (vectors[0] == dest);
2906 }
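
/* An assumed example: a VNx4SI constant of the form { 1, 2, 1, 2, ... }
   has two single-element patterns, so on little-endian targets it is
   handled by the widened-duplicate path as a repeating 64-bit value.
   A constant such as { 1, 0, 2, 0, 3, 0, ... } instead builds the two
   per-pattern vectors separately and interleaves them with ZIP1.  */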
2907 
2908 /* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
2909    is a pattern that can be used to set DEST to a replicated scalar
2910    element.  */
2911 
2912 void
2913 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2914 			      rtx (*gen_vec_duplicate) (rtx, rtx))
2915 {
2916   machine_mode mode = GET_MODE (dest);
2917 
2918   /* Check on what type of symbol it is.  */
2919   scalar_int_mode int_mode;
2920   if ((GET_CODE (imm) == SYMBOL_REF
2921        || GET_CODE (imm) == LABEL_REF
2922        || GET_CODE (imm) == CONST
2923        || GET_CODE (imm) == CONST_POLY_INT)
2924       && is_a <scalar_int_mode> (mode, &int_mode))
2925     {
2926       rtx mem;
2927       poly_int64 offset;
2928       HOST_WIDE_INT const_offset;
2929       enum aarch64_symbol_type sty;
2930 
2931       /* If we have (const (plus symbol offset)), separate out the offset
2932 	 before we start classifying the symbol.  */
2933       rtx base = strip_offset (imm, &offset);
2934 
2935       /* We must always add an offset involving VL separately, rather than
2936 	 folding it into the relocation.  */
2937       if (!offset.is_constant (&const_offset))
2938 	{
2939 	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2940 	    emit_insn (gen_rtx_SET (dest, imm));
2941 	  else
2942 	    {
2943 	      /* Do arithmetic on 32-bit values if the result is smaller
2944 		 than that.  */
2945 	      if (partial_subreg_p (int_mode, SImode))
2946 		{
2947 		  /* It is invalid to do symbol calculations in modes
2948 		     narrower than SImode.  */
2949 		  gcc_assert (base == const0_rtx);
2950 		  dest = gen_lowpart (SImode, dest);
2951 		  int_mode = SImode;
2952 		}
2953 	      if (base != const0_rtx)
2954 		{
2955 		  base = aarch64_force_temporary (int_mode, dest, base);
2956 		  aarch64_add_offset (int_mode, dest, base, offset,
2957 				      NULL_RTX, NULL_RTX, false);
2958 		}
2959 	      else
2960 		aarch64_add_offset (int_mode, dest, base, offset,
2961 				    dest, NULL_RTX, false);
2962 	    }
2963 	  return;
2964 	}
2965 
2966       sty = aarch64_classify_symbol (base, const_offset);
2967       switch (sty)
2968 	{
2969 	case SYMBOL_FORCE_TO_MEM:
2970 	  if (const_offset != 0
2971 	      && targetm.cannot_force_const_mem (int_mode, imm))
2972 	    {
2973 	      gcc_assert (can_create_pseudo_p ());
2974 	      base = aarch64_force_temporary (int_mode, dest, base);
2975 	      aarch64_add_offset (int_mode, dest, base, const_offset,
2976 				  NULL_RTX, NULL_RTX, false);
2977 	      return;
2978 	    }
2979 
2980 	  mem = force_const_mem (ptr_mode, imm);
2981 	  gcc_assert (mem);
2982 
2983 	  /* If we aren't generating PC relative literals, then
2984 	     we need to expand the literal pool access carefully.
2985 	     This is something that needs to be done in a number
2986 	     of places, so could well live as a separate function.  */
2987 	  if (!aarch64_pcrelative_literal_loads)
2988 	    {
2989 	      gcc_assert (can_create_pseudo_p ());
2990 	      base = gen_reg_rtx (ptr_mode);
2991 	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2992 	      if (ptr_mode != Pmode)
2993 		base = convert_memory_address (Pmode, base);
2994 	      mem = gen_rtx_MEM (ptr_mode, base);
2995 	    }
2996 
2997 	  if (int_mode != ptr_mode)
2998 	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2999 
3000 	  emit_insn (gen_rtx_SET (dest, mem));
3001 
3002 	  return;
3003 
3004 	case SYMBOL_SMALL_TLSGD:
3005 	case SYMBOL_SMALL_TLSDESC:
3006 	case SYMBOL_SMALL_TLSIE:
3007 	case SYMBOL_SMALL_GOT_28K:
3008 	case SYMBOL_SMALL_GOT_4G:
3009 	case SYMBOL_TINY_GOT:
3010 	case SYMBOL_TINY_TLSIE:
3011 	  if (const_offset != 0)
3012 	    {
3013 	      gcc_assert (can_create_pseudo_p ());
3014 	      base = aarch64_force_temporary (int_mode, dest, base);
3015 	      aarch64_add_offset (int_mode, dest, base, const_offset,
3016 				  NULL_RTX, NULL_RTX, false);
3017 	      return;
3018 	    }
3019 	  /* FALLTHRU */
3020 
3021 	case SYMBOL_SMALL_ABSOLUTE:
3022 	case SYMBOL_TINY_ABSOLUTE:
3023 	case SYMBOL_TLSLE12:
3024 	case SYMBOL_TLSLE24:
3025 	case SYMBOL_TLSLE32:
3026 	case SYMBOL_TLSLE48:
3027 	  aarch64_load_symref_appropriately (dest, imm, sty);
3028 	  return;
3029 
3030 	default:
3031 	  gcc_unreachable ();
3032 	}
3033     }
3034 
3035   if (!CONST_INT_P (imm))
3036     {
3037       rtx base, step, value;
3038       if (GET_CODE (imm) == HIGH
3039 	  || aarch64_simd_valid_immediate (imm, NULL))
3040 	emit_insn (gen_rtx_SET (dest, imm));
3041       else if (const_vec_series_p (imm, &base, &step))
3042 	aarch64_expand_vec_series (dest, base, step);
3043       else if (const_vec_duplicate_p (imm, &value))
3044 	{
3045 	  /* If the constant is out of range of an SVE vector move,
3046 	     load it from memory if we can, otherwise move it into
3047 	     a register and use a DUP.  */
3048 	  scalar_mode inner_mode = GET_MODE_INNER (mode);
3049 	  rtx op = force_const_mem (inner_mode, value);
3050 	  if (!op)
3051 	    op = force_reg (inner_mode, value);
3052 	  else if (!aarch64_sve_ld1r_operand_p (op))
3053 	    {
3054 	      rtx addr = force_reg (Pmode, XEXP (op, 0));
3055 	      op = replace_equiv_address (op, addr);
3056 	    }
3057 	  emit_insn (gen_vec_duplicate (dest, op));
3058 	}
3059       else if (GET_CODE (imm) == CONST_VECTOR
3060 	       && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3061 	aarch64_expand_sve_const_vector (dest, imm);
3062       else
3063 	{
3064 	  rtx mem = force_const_mem (mode, imm);
3065 	  gcc_assert (mem);
3066 	  emit_move_insn (dest, mem);
3067 	}
3068 
3069       return;
3070     }
3071 
3072   aarch64_internal_mov_immediate (dest, imm, true,
3073 				  as_a <scalar_int_mode> (mode));
3074 }
3075 
3076 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
3077    that is known to contain PTRUE.  */
3078 
3079 void
3080 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3081 {
3082   emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3083 						gen_rtvec (2, pred, src),
3084 						UNSPEC_MERGE_PTRUE)));
3085 }
3086 
3087 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3088    operand is in memory.  In this case we need to use the predicated LD1
3089    and ST1 instead of LDR and STR, both for correctness on big-endian
3090    targets and because LD1 and ST1 support a wider range of addressing modes.
3091    PRED_MODE is the mode of the predicate.
3092 
3093    See the comment at the head of aarch64-sve.md for details about the
3094    big-endian handling.  */
3095 
3096 void
3097 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3098 {
3099   machine_mode mode = GET_MODE (dest);
3100   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3101   if (!register_operand (src, mode)
3102       && !register_operand (dest, mode))
3103     {
3104       rtx tmp = gen_reg_rtx (mode);
3105       if (MEM_P (src))
3106 	aarch64_emit_sve_pred_move (tmp, ptrue, src);
3107       else
3108 	emit_move_insn (tmp, src);
3109       src = tmp;
3110     }
3111   aarch64_emit_sve_pred_move (dest, ptrue, src);
3112 }
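
/* A rough sketch (assumed registers): copying one VNx2DI memory location
   to another becomes something like "ptrue p0.d", "ld1d z0.d, p0/z, [x0]"
   and "st1d z0.d, p0, [x1]", with the load going through a fresh register
   because neither operand is a register.  */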
3113 
3114 /* Called only on big-endian targets.  See whether an SVE vector move
3115    from SRC to DEST is effectively a REV[BHW] instruction, because at
3116    least one operand is a subreg of an SVE vector that has wider or
3117    narrower elements.  Return true and emit the instruction if so.
3118 
3119    For example:
3120 
3121      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3122 
3123    represents a VIEW_CONVERT between the following vectors, viewed
3124    in memory order:
3125 
3126      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
3127      R1: { [0],      [1],      [2],      [3],     ... }
3128 
3129    The high part of lane X in R2 should therefore correspond to lane X*2
3130    of R1, but the register representations are:
3131 
3132          msb                                      lsb
3133      R2: ...... [1].high  [1].low   [0].high  [0].low
3134      R1: ...... [3]       [2]       [1]       [0]
3135 
3136    where the low part of lane X in R2 corresponds to lane X*2 in R1.
3137    We therefore need a reverse operation to swap the high and low values
3138    around.
3139 
3140    This is purely an optimization.  Without it we would spill the
3141    subreg operand to the stack in one mode and reload it in the
3142    other mode, which has the same effect as the REV.  */
3143 
3144 bool
3145 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3146 {
3147   gcc_assert (BYTES_BIG_ENDIAN);
3148   if (GET_CODE (dest) == SUBREG)
3149     dest = SUBREG_REG (dest);
3150   if (GET_CODE (src) == SUBREG)
3151     src = SUBREG_REG (src);
3152 
3153   /* The optimization handles two single SVE REGs with different element
3154      sizes.  */
3155   if (!REG_P (dest)
3156       || !REG_P (src)
3157       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3158       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3159       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3160 	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3161     return false;
3162 
3163   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
3164   rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3165   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3166 			       UNSPEC_REV_SUBREG);
3167   emit_insn (gen_rtx_SET (dest, unspec));
3168   return true;
3169 }
3170 
3171 /* Return a copy of X with mode MODE, without changing its other
3172    attributes.  Unlike gen_lowpart, this doesn't care whether the
3173    mode change is valid.  */
3174 
3175 static rtx
3176 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3177 {
3178   if (GET_MODE (x) == mode)
3179     return x;
3180 
3181   x = shallow_copy_rtx (x);
3182   set_mode_and_regno (x, mode, REGNO (x));
3183   return x;
3184 }
3185 
3186 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3187    operands.  */
3188 
3189 void
3190 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3191 {
3192   /* Decide which REV operation we need.  The mode with narrower elements
3193      determines the mode of the operands and the mode with the wider
3194      elements determines the reverse width.  */
3195   machine_mode mode_with_wider_elts = GET_MODE (dest);
3196   machine_mode mode_with_narrower_elts = GET_MODE (src);
3197   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3198       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3199     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3200 
3201   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3202   unsigned int unspec;
3203   if (wider_bytes == 8)
3204     unspec = UNSPEC_REV64;
3205   else if (wider_bytes == 4)
3206     unspec = UNSPEC_REV32;
3207   else if (wider_bytes == 2)
3208     unspec = UNSPEC_REV16;
3209   else
3210     gcc_unreachable ();
3211   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3212 
3213   /* Emit:
3214 
3215        (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3216 			 UNSPEC_MERGE_PTRUE))
3217 
3218      with the appropriate modes.  */
3219   ptrue = gen_lowpart (pred_mode, ptrue);
3220   dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3221   src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3222   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3223   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3224 			UNSPEC_MERGE_PTRUE);
3225   emit_insn (gen_rtx_SET (dest, src));
3226 }
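/* For example (illustrative), aarch64_split_sve_subreg_move above would
   split a big-endian move between a VNx4SI register and a VNx2DI subreg
   of it with:

     mode_with_wider_elts    = VNx2DI  (8-byte elements)
     mode_with_narrower_elts = VNx4SI  (4-byte elements)
     unspec                  = UNSPEC_REV64
     pred_mode               = VNx2BI

   i.e. the emitted insn reverses the 4-byte elements within each 8-byte
   container under an all-true predicate, which maps onto an SVE
   REVW-style operation.  */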
3227 
3228 static bool
3229 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3230 				 tree exp ATTRIBUTE_UNUSED)
3231 {
3232   /* Currently, always true.  */
3233   return true;
3234 }
3235 
3236 /* Implement TARGET_PASS_BY_REFERENCE.  */
3237 
3238 static bool
3239 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3240 			   machine_mode mode,
3241 			   const_tree type,
3242 			   bool named ATTRIBUTE_UNUSED)
3243 {
3244   HOST_WIDE_INT size;
3245   machine_mode dummymode;
3246   int nregs;
3247 
3248   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
3249   if (mode == BLKmode && type)
3250     size = int_size_in_bytes (type);
3251   else
3252     /* No frontends can create types with variable-sized modes, so we
3253        shouldn't be asked to pass or return them.  */
3254     size = GET_MODE_SIZE (mode).to_constant ();
3255 
3256   /* Aggregates are passed by reference based on their size.  */
3257   if (type && AGGREGATE_TYPE_P (type))
3258     {
3259       size = int_size_in_bytes (type);
3260     }
3261 
3262   /* Variable sized arguments are always returned by reference.  */
3263   if (size < 0)
3264     return true;
3265 
3266   /* Can this be a candidate to be passed in fp/simd register(s)?  */
3267   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3268 					       &dummymode, &nregs,
3269 					       NULL))
3270     return false;
3271 
3272   /* Arguments which are variable sized or larger than 2 registers are
3273      passed by reference unless they are a homogeneous floating-point
3274      aggregate.  */
3275   return size > 2 * UNITS_PER_WORD;
3276 }
3277 
3278 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
3279 static bool
3280 aarch64_return_in_msb (const_tree valtype)
3281 {
3282   machine_mode dummy_mode;
3283   int dummy_int;
3284 
3285   /* Never happens in little-endian mode.  */
3286   if (!BYTES_BIG_ENDIAN)
3287     return false;
3288 
3289   /* Only composite types smaller than or equal to 16 bytes can
3290      be potentially returned in registers.  */
3291   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3292       || int_size_in_bytes (valtype) <= 0
3293       || int_size_in_bytes (valtype) > 16)
3294     return false;
3295 
3296   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3297      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3298      is always passed/returned in the least significant bits of fp/simd
3299      register(s).  */
3300   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3301 					       &dummy_mode, &dummy_int, NULL))
3302     return false;
3303 
3304   return true;
3305 }
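/* For example (illustrative): with aarch64_return_in_msb, a 2-byte
   struct { short s; } returned on a big-endian target occupies the most
   significant bytes of x0, which is why aarch64_function_value below
   widens the return mode to a full word for such types.  */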
3306 
3307 /* Implement TARGET_FUNCTION_VALUE.
3308    Define how to find the value returned by a function.  */
3309 
3310 static rtx
3311 aarch64_function_value (const_tree type, const_tree func,
3312 			bool outgoing ATTRIBUTE_UNUSED)
3313 {
3314   machine_mode mode;
3315   int unsignedp;
3316   int count;
3317   machine_mode ag_mode;
3318 
3319   mode = TYPE_MODE (type);
3320   if (INTEGRAL_TYPE_P (type))
3321     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3322 
3323   if (aarch64_return_in_msb (type))
3324     {
3325       HOST_WIDE_INT size = int_size_in_bytes (type);
3326 
3327       if (size % UNITS_PER_WORD != 0)
3328 	{
3329 	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3330 	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3331 	}
3332     }
3333 
3334   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3335 					       &ag_mode, &count, NULL))
3336     {
3337       if (!aarch64_composite_type_p (type, mode))
3338 	{
3339 	  gcc_assert (count == 1 && mode == ag_mode);
3340 	  return gen_rtx_REG (mode, V0_REGNUM);
3341 	}
3342       else
3343 	{
3344 	  int i;
3345 	  rtx par;
3346 
3347 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3348 	  for (i = 0; i < count; i++)
3349 	    {
3350 	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3351 	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3352 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3353 	      XVECEXP (par, 0, i) = tmp;
3354 	    }
3355 	  return par;
3356 	}
3357     }
3358   else
3359     return gen_rtx_REG (mode, R0_REGNUM);
3360 }
3361 
3362 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3363    Return true if REGNO is the number of a hard register in which the
3364    value of a called function may come back.  */
3365 
3366 static bool
3367 aarch64_function_value_regno_p (const unsigned int regno)
3368 {
3369   /* Maximum of 16 bytes can be returned in the general registers.  Examples
3370      of 16-byte return values are: 128-bit integers and 16-byte small
3371      structures (excluding homogeneous floating-point aggregates).  */
3372   if (regno == R0_REGNUM || regno == R1_REGNUM)
3373     return true;
3374 
3375   /* Up to four fp/simd registers can return a function value, e.g. a
3376      homogeneous floating-point aggregate having four members.  */
3377   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3378     return TARGET_FLOAT;
3379 
3380   return false;
3381 }
3382 
3383 /* Implement TARGET_RETURN_IN_MEMORY.
3384 
3385    If the type T of the result of a function is such that
3386      void func (T arg)
3387    would require that arg be passed as a value in a register (or set of
3388    registers) according to the parameter passing rules, then the result
3389    is returned in the same registers as would be used for such an
3390    argument.  */
3391 
3392 static bool
3393 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3394 {
3395   HOST_WIDE_INT size;
3396   machine_mode ag_mode;
3397   int count;
3398 
3399   if (!AGGREGATE_TYPE_P (type)
3400       && TREE_CODE (type) != COMPLEX_TYPE
3401       && TREE_CODE (type) != VECTOR_TYPE)
3402     /* Simple scalar types are always returned in registers.  */
3403     return false;
3404 
3405   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3406 					       type,
3407 					       &ag_mode,
3408 					       &count,
3409 					       NULL))
3410     return false;
3411 
3412   /* Types larger than 2 registers are returned in memory.  */
3413   size = int_size_in_bytes (type);
3414   return (size < 0 || size > 2 * UNITS_PER_WORD);
3415 }
3416 
3417 static bool
3418 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3419 			       const_tree type, int *nregs)
3420 {
3421   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3422   return aarch64_vfp_is_call_or_return_candidate (mode,
3423 						  type,
3424 						  &pcum->aapcs_vfp_rmode,
3425 						  nregs,
3426 						  NULL);
3427 }
3428 
3429 /* Given MODE and TYPE of a function argument, return the alignment in
3430    bits.  The idea is to suppress any stronger alignment requested by
3431    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3432    This is a helper function for local use only.  */
3433 
3434 static unsigned int
3435 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3436 {
3437   if (!type)
3438     return GET_MODE_ALIGNMENT (mode);
3439 
3440   if (integer_zerop (TYPE_SIZE (type)))
3441     return 0;
3442 
3443   gcc_assert (TYPE_MODE (type) == mode);
3444 
3445   if (!AGGREGATE_TYPE_P (type))
3446     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3447 
3448   if (TREE_CODE (type) == ARRAY_TYPE)
3449     return TYPE_ALIGN (TREE_TYPE (type));
3450 
3451   unsigned int alignment = 0;
3452   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3453     if (TREE_CODE (field) == FIELD_DECL)
3454       alignment = std::max (alignment, DECL_ALIGN (field));
3455 
3456   return alignment;
3457 }
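/* Illustrative examples of aarch64_function_arg_alignment (assuming the
   usual type layouts):

     int                        -> 32 bits
     __int128                   -> 128 bits (triggers rule C.8 below)
     struct { long long x, y; } -> 64 bits (max alignment of its fields)

   Any extra alignment the user requests on the type itself is ignored,
   as described above.  */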
3458 
3459 /* Lay out a function argument according to the AAPCS64 rules.  The rule
3460    numbers refer to the rules in the AAPCS64 parameter-passing algorithm.  */
3461 
3462 static void
3463 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3464 		    const_tree type,
3465 		    bool named ATTRIBUTE_UNUSED)
3466 {
3467   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3468   int ncrn, nvrn, nregs;
3469   bool allocate_ncrn, allocate_nvrn;
3470   HOST_WIDE_INT size;
3471 
3472   /* We need to do this once per argument.  */
3473   if (pcum->aapcs_arg_processed)
3474     return;
3475 
3476   pcum->aapcs_arg_processed = true;
3477 
3478   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
3479   if (type)
3480     size = int_size_in_bytes (type);
3481   else
3482     /* No frontends can create types with variable-sized modes, so we
3483        shouldn't be asked to pass or return them.  */
3484     size = GET_MODE_SIZE (mode).to_constant ();
3485   size = ROUND_UP (size, UNITS_PER_WORD);
3486 
3487   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3488   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3489 						 mode,
3490 						 type,
3491 						 &nregs);
3492 
3493   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite
3494      reliable, so the code below handles passing in SIMD/FP registers first.  */
3495 
3496   nvrn = pcum->aapcs_nvrn;
3497 
3498   /* C.1 - C.5 for floating point, homogeneous floating-point aggregates
3499      (HFA) and homogeneous short-vector aggregates (HVA).  */
3500   if (allocate_nvrn)
3501     {
3502       if (!TARGET_FLOAT)
3503 	aarch64_err_no_fpadvsimd (mode, "argument");
3504 
3505       if (nvrn + nregs <= NUM_FP_ARG_REGS)
3506 	{
3507 	  pcum->aapcs_nextnvrn = nvrn + nregs;
3508 	  if (!aarch64_composite_type_p (type, mode))
3509 	    {
3510 	      gcc_assert (nregs == 1);
3511 	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3512 	    }
3513 	  else
3514 	    {
3515 	      rtx par;
3516 	      int i;
3517 	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3518 	      for (i = 0; i < nregs; i++)
3519 		{
3520 		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3521 					 V0_REGNUM + nvrn + i);
3522 		  rtx offset = gen_int_mode
3523 		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3524 		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3525 		  XVECEXP (par, 0, i) = tmp;
3526 		}
3527 	      pcum->aapcs_reg = par;
3528 	    }
3529 	  return;
3530 	}
3531       else
3532 	{
3533 	  /* C.3 NSRN is set to 8.  */
3534 	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3535 	  goto on_stack;
3536 	}
3537     }
3538 
3539   ncrn = pcum->aapcs_ncrn;
3540   nregs = size / UNITS_PER_WORD;
3541 
3542   /* C.6 - C.9, though the sign- and zero-extension semantics are
3543      handled elsewhere.  This is the case where the argument fits
3544      entirely in general registers.  */
3545   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3546     {
3547 
3548       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3549 
3550       /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3551          rounded up to the next even number.  */
3552       if (nregs == 2
3553 	  && ncrn % 2
3554 	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3555 	     comparison is there because for > 16 * BITS_PER_UNIT
3556 	     alignment nregs should be > 2 and therefore it should be
3557 	     passed by reference rather than value.  */
3558 	  && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3559 	{
3560 	  ++ncrn;
3561 	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3562 	}
3563 
3564       /* NREGS can be 0 when e.g. an empty structure is to be passed.
3565          A reg is still generated for it, but the caller should be smart
3566 	 enough not to use it.  */
3567       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3568 	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3569       else
3570 	{
3571 	  rtx par;
3572 	  int i;
3573 
3574 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3575 	  for (i = 0; i < nregs; i++)
3576 	    {
3577 	      rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3578 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3579 				       GEN_INT (i * UNITS_PER_WORD));
3580 	      XVECEXP (par, 0, i) = tmp;
3581 	    }
3582 	  pcum->aapcs_reg = par;
3583 	}
3584 
3585       pcum->aapcs_nextncrn = ncrn + nregs;
3586       return;
3587     }
3588 
3589   /* C.11  */
3590   pcum->aapcs_nextncrn = NUM_ARG_REGS;
3591 
3592   /* The argument is passed on stack; record the needed number of words for
3593      this argument and align the total size if necessary.  */
3594 on_stack:
3595   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3596 
3597   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3598     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3599 				       16 / UNITS_PER_WORD);
3600   return;
3601 }
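/* Worked example (illustrative) for aarch64_layout_arg:

     void f (double d, struct { float a, b; } hfa, __int128 i);

   would be expected to lay out roughly as

     d   -> v0                        (C.1)
     hfa -> PARALLEL of v1 and v2     (HFA of two floats, C.2/C.3)
     i   -> x0/x1 as a TImode REG     (C.8 would first round the NGRN up
                                       to an even number if it were odd,
                                       since __int128 has 16-byte
                                       alignment; C.9 then assigns two
                                       consecutive general registers)

   Exact register numbers depend on any preceding arguments.  */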
3602 
3603 /* Implement TARGET_FUNCTION_ARG.  */
3604 
3605 static rtx
3606 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3607 		      const_tree type, bool named)
3608 {
3609   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3610   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3611 
3612   if (mode == VOIDmode)
3613     return NULL_RTX;
3614 
3615   aarch64_layout_arg (pcum_v, mode, type, named);
3616   return pcum->aapcs_reg;
3617 }
3618 
3619 void
3620 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3621 			   const_tree fntype ATTRIBUTE_UNUSED,
3622 			   rtx libname ATTRIBUTE_UNUSED,
3623 			   const_tree fndecl ATTRIBUTE_UNUSED,
3624 			   unsigned n_named ATTRIBUTE_UNUSED)
3625 {
3626   pcum->aapcs_ncrn = 0;
3627   pcum->aapcs_nvrn = 0;
3628   pcum->aapcs_nextncrn = 0;
3629   pcum->aapcs_nextnvrn = 0;
3630   pcum->pcs_variant = ARM_PCS_AAPCS64;
3631   pcum->aapcs_reg = NULL_RTX;
3632   pcum->aapcs_arg_processed = false;
3633   pcum->aapcs_stack_words = 0;
3634   pcum->aapcs_stack_size = 0;
3635 
3636   if (!TARGET_FLOAT
3637       && fndecl && TREE_PUBLIC (fndecl)
3638       && fntype && fntype != error_mark_node)
3639     {
3640       const_tree type = TREE_TYPE (fntype);
3641       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
3642       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
3643       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3644 						   &mode, &nregs, NULL))
3645 	aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3646     }
3647   return;
3648 }
3649 
3650 static void
3651 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3652 			      machine_mode mode,
3653 			      const_tree type,
3654 			      bool named)
3655 {
3656   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3657   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3658     {
3659       aarch64_layout_arg (pcum_v, mode, type, named);
3660       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3661 		  != (pcum->aapcs_stack_words != 0));
3662       pcum->aapcs_arg_processed = false;
3663       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3664       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3665       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3666       pcum->aapcs_stack_words = 0;
3667       pcum->aapcs_reg = NULL_RTX;
3668     }
3669 }
3670 
3671 bool
3672 aarch64_function_arg_regno_p (unsigned regno)
3673 {
3674   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3675 	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3676 }
3677 
3678 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
3679    PARM_BOUNDARY bits of alignment, but will be given anything up
3680    to STACK_BOUNDARY bits if the type requires it.  This makes sure
3681    that both before and after the layout of each argument, the Next
3682    Stacked Argument Address (NSAA) will have a minimum alignment of
3683    8 bytes.  */
3684 
3685 static unsigned int
3686 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3687 {
3688   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3689   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3690 }
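/* For example: a char argument only has an alignment of 8 bits but is
   still given PARM_BOUNDARY (64 bits) here, while a 16-byte-aligned
   __int128 is clamped to STACK_BOUNDARY (128 bits).  */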
3691 
3692 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
3693 
3694 static fixed_size_mode
3695 aarch64_get_reg_raw_mode (int regno)
3696 {
3697   if (TARGET_SVE && FP_REGNUM_P (regno))
3698     /* Don't use the SVE part of the register for __builtin_apply and
3699        __builtin_return.  The SVE registers aren't used by the normal PCS,
3700        so using them there would be a waste of time.  The PCS extensions
3701        for SVE types are fundamentally incompatible with the
3702        __builtin_return/__builtin_apply interface.  */
3703     return as_a <fixed_size_mode> (V16QImode);
3704   return default_get_reg_raw_mode (regno);
3705 }
3706 
3707 /* Implement TARGET_FUNCTION_ARG_PADDING.
3708 
3709    Small aggregate types are placed in the lowest memory address.
3710 
3711    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
3712 
3713 static pad_direction
3714 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3715 {
3716   /* On little-endian targets, the least significant byte of every stack
3717      argument is passed at the lowest byte address of the stack slot.  */
3718   if (!BYTES_BIG_ENDIAN)
3719     return PAD_UPWARD;
3720 
3721   /* Otherwise, integral, floating-point and pointer types are padded downward:
3722      the least significant byte of a stack argument is passed at the highest
3723      byte address of the stack slot.  */
3724   if (type
3725       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3726 	 || POINTER_TYPE_P (type))
3727       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3728     return PAD_DOWNWARD;
3729 
3730   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
3731   return PAD_UPWARD;
3732 }
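/* For example (illustrative): on big-endian, a char passed on the stack
   is padded downward, so its single significant byte ends up at the
   highest address of its slot, whereas a one-byte struct is padded
   upward and starts at the lowest address of its slot.  */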
3733 
3734 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3735 
3736    It specifies padding for the last (possibly the only) element of a
3737    block move between registers and memory.  Viewing the block as it
3738    sits in memory, padding upward means that the last element is
3739    padded after its most significant byte, while with downward
3740    padding the last element is padded on its least significant
3741    byte side.
3742 
3743    Small aggregates and small complex types are always padded
3744    upwards.
3745 
3746    We don't need to worry about homogeneous floating-point or
3747    short-vector aggregates; their move is not affected by the
3748    padding direction determined here.  Regardless of endianness,
3749    each element of such an aggregate is put in the least
3750    significant bits of a fp/simd register.
3751 
3752    Return !BYTES_BIG_ENDIAN if the least significant byte of the
3753    register has useful data, and return the opposite if the most
3754    significant byte does.  */
3755 
3756 bool
3757 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3758 		     bool first ATTRIBUTE_UNUSED)
3759 {
3760 
3761   /* Small composite types are always padded upward.  */
3762   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3763     {
3764       HOST_WIDE_INT size;
3765       if (type)
3766 	size = int_size_in_bytes (type);
3767       else
3768 	/* No frontends can create types with variable-sized modes, so we
3769 	   shouldn't be asked to pass or return them.  */
3770 	size = GET_MODE_SIZE (mode).to_constant ();
3771       if (size < 2 * UNITS_PER_WORD)
3772 	return true;
3773     }
3774 
3775   /* Otherwise, use the default padding.  */
3776   return !BYTES_BIG_ENDIAN;
3777 }
3778 
3779 static scalar_int_mode
3780 aarch64_libgcc_cmp_return_mode (void)
3781 {
3782   return SImode;
3783 }
3784 
3785 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3786 
3787 /* We use the 12-bit shifted immediate arithmetic instructions so values
3788    must be multiple of (1 << 12), i.e. 4096.  */
3789 #define ARITH_FACTOR 4096
3790 
3791 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3792 #error Cannot use simple address calculation for stack probing
3793 #endif
3794 
3795 /* The pair of scratch registers used for stack probing.  */
3796 #define PROBE_STACK_FIRST_REG  9
3797 #define PROBE_STACK_SECOND_REG 10
3798 
3799 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3800    inclusive.  These are offsets from the current stack pointer.  */
3801 
3802 static void
3803 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3804 {
3805   HOST_WIDE_INT size;
3806   if (!poly_size.is_constant (&size))
3807     {
3808       sorry ("stack probes for SVE frames");
3809       return;
3810     }
3811 
3812   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3813 
3814   /* See the same assertion on PROBE_INTERVAL above.  */
3815   gcc_assert ((first % ARITH_FACTOR) == 0);
3816 
3817   /* See if we have a constant small number of probes to generate.  If so,
3818      that's the easy case.  */
3819   if (size <= PROBE_INTERVAL)
3820     {
3821       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3822 
3823       emit_set_insn (reg1,
3824 		     plus_constant (Pmode,
3825 				    stack_pointer_rtx, -(first + base)));
3826       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3827     }
3828 
3829   /* The run-time loop is made up of 8 insns in the generic case while the
3830      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
3831   else if (size <= 4 * PROBE_INTERVAL)
3832     {
3833       HOST_WIDE_INT i, rem;
3834 
3835       emit_set_insn (reg1,
3836 		     plus_constant (Pmode,
3837 				    stack_pointer_rtx,
3838 				    -(first + PROBE_INTERVAL)));
3839       emit_stack_probe (reg1);
3840 
3841       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3842 	 it exceeds SIZE.  If only two probes are needed, this will not
3843 	 generate any code.  Then probe at FIRST + SIZE.  */
3844       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3845 	{
3846 	  emit_set_insn (reg1,
3847 			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3848 	  emit_stack_probe (reg1);
3849 	}
3850 
3851       rem = size - (i - PROBE_INTERVAL);
3852       if (rem > 256)
3853 	{
3854 	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3855 
3856 	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3857 	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3858 	}
3859       else
3860 	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3861     }
3862 
3863   /* Otherwise, do the same as above, but in a loop.  Note that we must be
3864      extra careful with variables wrapping around because we might be at
3865      the very top (or the very bottom) of the address space and we have
3866      to be able to handle this case properly; in particular, we use an
3867      equality test for the loop condition.  */
3868   else
3869     {
3870       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3871 
3872       /* Step 1: round SIZE to the previous multiple of the interval.  */
3873 
3874       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3875 
3876 
3877       /* Step 2: compute initial and final value of the loop counter.  */
3878 
3879       /* TEST_ADDR = SP + FIRST.  */
3880       emit_set_insn (reg1,
3881 		     plus_constant (Pmode, stack_pointer_rtx, -first));
3882 
3883       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
3884       HOST_WIDE_INT adjustment = - (first + rounded_size);
3885       if (! aarch64_uimm12_shift (adjustment))
3886 	{
3887 	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3888 					  true, Pmode);
3889 	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3890 	}
3891       else
3892 	emit_set_insn (reg2,
3893 		       plus_constant (Pmode, stack_pointer_rtx, adjustment));
3894 
3895       /* Step 3: the loop
3896 
3897 	 do
3898 	   {
3899 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3900 	     probe at TEST_ADDR
3901 	   }
3902 	 while (TEST_ADDR != LAST_ADDR)
3903 
3904 	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3905 	 until it is equal to ROUNDED_SIZE.  */
3906 
3907       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3908 
3909 
3910       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3911 	 that SIZE is equal to ROUNDED_SIZE.  */
3912 
3913       if (size != rounded_size)
3914 	{
3915 	  HOST_WIDE_INT rem = size - rounded_size;
3916 
3917 	  if (rem > 256)
3918 	    {
3919 	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3920 
3921 	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3922 	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3923 	    }
3924 	  else
3925 	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3926 	}
3927     }
3928 
3929   /* Make sure nothing is scheduled before we are done.  */
3930   emit_insn (gen_blockage ());
3931 }
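/* For example (illustrative), with FIRST == 0 and a constant SIZE of
   two probe intervals (8192 bytes), the code above emits something
   like:

     sub	x9, sp, #4096
     str	xzr, [x9]		// probe at sp - 4096
     sub	x9, x9, #4096
     str	xzr, [x9]		// probe at sp - 8192

   Larger sizes use the probe_stack_range loop instead, and SVE frames
   are not handled yet (see the sorry () above).  */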
3932 
3933 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
3934    absolute addresses.  */
3935 
3936 const char *
3937 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3938 {
3939   static int labelno = 0;
3940   char loop_lab[32];
3941   rtx xops[2];
3942 
3943   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3944 
3945   /* Loop.  */
3946   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3947 
3948   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
3949   xops[0] = reg1;
3950   xops[1] = GEN_INT (PROBE_INTERVAL);
3951   output_asm_insn ("sub\t%0, %0, %1", xops);
3952 
3953   /* Probe at TEST_ADDR.  */
3954   output_asm_insn ("str\txzr, [%0]", xops);
3955 
3956   /* Test if TEST_ADDR == LAST_ADDR.  */
3957   xops[1] = reg2;
3958   output_asm_insn ("cmp\t%0, %1", xops);
3959 
3960   /* Branch.  */
3961   fputs ("\tb.ne\t", asm_out_file);
3962   assemble_name_raw (asm_out_file, loop_lab);
3963   fputc ('\n', asm_out_file);
3964 
3965   return "";
3966 }
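/* One instance of the loop emitted above looks roughly like:

   .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0

   where x9 and x10 stand for TEST_ADDR and LAST_ADDR (illustrative
   register numbers; the real operands come from the probe_stack_range
   pattern).  */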
3967 
3968 /* Mark the registers that need to be saved by the callee and calculate
3969    the size of the callee-saved registers area and frame record (both FP
3970    and LR may be omitted).  */
3971 static void
3972 aarch64_layout_frame (void)
3973 {
3974   HOST_WIDE_INT offset = 0;
3975   int regno, last_fp_reg = INVALID_REGNUM;
3976 
3977   if (reload_completed && cfun->machine->frame.laid_out)
3978     return;
3979 
3980   /* Force a frame chain for EH returns so the return address is at FP+8.  */
3981   cfun->machine->frame.emit_frame_chain
3982     = frame_pointer_needed || crtl->calls_eh_return;
3983 
3984   /* Emit a frame chain if the frame pointer is enabled.
3985      If -momit-leaf-frame-pointer is used, do not use a frame chain
3986      in leaf functions which do not use LR.  */
3987   if (flag_omit_frame_pointer == 2
3988       && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3989 	   && !df_regs_ever_live_p (LR_REGNUM)))
3990     cfun->machine->frame.emit_frame_chain = true;
3991 
3992 #define SLOT_NOT_REQUIRED (-2)
3993 #define SLOT_REQUIRED     (-1)
3994 
3995   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3996   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3997 
3998   /* First mark all the registers that really need to be saved...  */
3999   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4000     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4001 
4002   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4003     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4004 
4005   /* ... that includes the eh data registers (if needed)...  */
4006   if (crtl->calls_eh_return)
4007     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4008       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4009 	= SLOT_REQUIRED;
4010 
4011   /* ... and any callee saved register that dataflow says is live.  */
4012   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4013     if (df_regs_ever_live_p (regno)
4014 	&& (regno == R30_REGNUM
4015 	    || !call_used_regs[regno]))
4016       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4017 
4018   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4019     if (df_regs_ever_live_p (regno)
4020 	&& !call_used_regs[regno])
4021       {
4022 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4023 	last_fp_reg = regno;
4024       }
4025 
4026   if (cfun->machine->frame.emit_frame_chain)
4027     {
4028       /* FP and LR are placed in the linkage record.  */
4029       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4030       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4031       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4032       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4033       offset = 2 * UNITS_PER_WORD;
4034     }
4035 
4036   /* Now assign stack slots for them.  */
4037   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4038     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4039       {
4040 	cfun->machine->frame.reg_offset[regno] = offset;
4041 	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4042 	  cfun->machine->frame.wb_candidate1 = regno;
4043 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4044 	  cfun->machine->frame.wb_candidate2 = regno;
4045 	offset += UNITS_PER_WORD;
4046       }
4047 
4048   HOST_WIDE_INT max_int_offset = offset;
4049   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4050   bool has_align_gap = offset != max_int_offset;
4051 
4052   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4053     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4054       {
4055 	/* If there is an alignment gap between integer and fp callee-saves,
4056 	   allocate the last fp register to it if possible.  */
4057 	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4058 	  {
4059 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
4060 	    break;
4061 	  }
4062 
4063 	cfun->machine->frame.reg_offset[regno] = offset;
4064 	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4065 	  cfun->machine->frame.wb_candidate1 = regno;
4066 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4067 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4068 	  cfun->machine->frame.wb_candidate2 = regno;
4069 	offset += UNITS_PER_WORD;
4070       }
4071 
4072   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4073 
4074   cfun->machine->frame.saved_regs_size = offset;
4075 
4076   HOST_WIDE_INT varargs_and_saved_regs_size
4077     = offset + cfun->machine->frame.saved_varargs_size;
4078 
4079   cfun->machine->frame.hard_fp_offset
4080     = aligned_upper_bound (varargs_and_saved_regs_size
4081 			   + get_frame_size (),
4082 			   STACK_BOUNDARY / BITS_PER_UNIT);
4083 
4084   /* Both these values are already aligned.  */
4085   gcc_assert (multiple_p (crtl->outgoing_args_size,
4086 			  STACK_BOUNDARY / BITS_PER_UNIT));
4087   cfun->machine->frame.frame_size
4088     = (cfun->machine->frame.hard_fp_offset
4089        + crtl->outgoing_args_size);
4090 
4091   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4092 
4093   cfun->machine->frame.initial_adjust = 0;
4094   cfun->machine->frame.final_adjust = 0;
4095   cfun->machine->frame.callee_adjust = 0;
4096   cfun->machine->frame.callee_offset = 0;
4097 
4098   HOST_WIDE_INT max_push_offset = 0;
4099   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4100     max_push_offset = 512;
4101   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4102     max_push_offset = 256;
4103 
4104   HOST_WIDE_INT const_size, const_fp_offset;
4105   if (cfun->machine->frame.frame_size.is_constant (&const_size)
4106       && const_size < max_push_offset
4107       && known_eq (crtl->outgoing_args_size, 0))
4108     {
4109       /* Simple, small frame with no outgoing arguments:
4110 	 stp reg1, reg2, [sp, -frame_size]!
4111 	 stp reg3, reg4, [sp, 16]  */
4112       cfun->machine->frame.callee_adjust = const_size;
4113     }
4114   else if (known_lt (crtl->outgoing_args_size
4115 		     + cfun->machine->frame.saved_regs_size, 512)
4116 	   && !(cfun->calls_alloca
4117 		&& known_lt (cfun->machine->frame.hard_fp_offset,
4118 			     max_push_offset)))
4119     {
4120       /* Frame with small outgoing arguments:
4121 	 sub sp, sp, frame_size
4122 	 stp reg1, reg2, [sp, outgoing_args_size]
4123 	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
4124       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4125       cfun->machine->frame.callee_offset
4126 	= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4127     }
4128   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4129 	   && const_fp_offset < max_push_offset)
4130     {
4131       /* Frame with large outgoing arguments but a small local area:
4132 	 stp reg1, reg2, [sp, -hard_fp_offset]!
4133 	 stp reg3, reg4, [sp, 16]
4134 	 sub sp, sp, outgoing_args_size  */
4135       cfun->machine->frame.callee_adjust = const_fp_offset;
4136       cfun->machine->frame.final_adjust
4137 	= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4138     }
4139   else
4140     {
4141       /* Frame with large local area and outgoing arguments using frame pointer:
4142 	 sub sp, sp, hard_fp_offset
4143 	 stp x29, x30, [sp, 0]
4144 	 add x29, sp, 0
4145 	 stp reg3, reg4, [sp, 16]
4146 	 sub sp, sp, outgoing_args_size  */
4147       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4148       cfun->machine->frame.final_adjust
4149 	= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4150     }
4151 
4152   cfun->machine->frame.laid_out = true;
4153 }
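/* Worked example (illustrative): a function that needs a frame chain,
   saves only x29/x30 and has 16 bytes of locals, with no outgoing stack
   arguments, ends up with frame_size == 32.  That is below
   max_push_offset, so the first case above applies: callee_adjust == 32
   and both initial_adjust and final_adjust stay 0, giving a prologue of
   the form

     stp	x29, x30, [sp, -32]!  */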
4154 
4155 /* Return true if the register REGNO is saved on entry to
4156    the current function.  */
4157 
4158 static bool
4159 aarch64_register_saved_on_entry (int regno)
4160 {
4161   return cfun->machine->frame.reg_offset[regno] >= 0;
4162 }
4163 
4164 /* Starting at REGNO, return the next register up to LIMIT that the
4165    callee needs to save.  */
4166 
4167 static unsigned
4168 aarch64_next_callee_save (unsigned regno, unsigned limit)
4169 {
4170   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4171     regno ++;
4172   return regno;
4173 }
4174 
4175 /* Push the register number REGNO of mode MODE to the stack with write-back
4176    adjusting the stack by ADJUSTMENT.  */
4177 
4178 static void
4179 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4180 			   HOST_WIDE_INT adjustment)
4181  {
4182   rtx base_rtx = stack_pointer_rtx;
4183   rtx insn, reg, mem;
4184 
4185   reg = gen_rtx_REG (mode, regno);
4186   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4187 			    plus_constant (Pmode, base_rtx, -adjustment));
4188   mem = gen_frame_mem (mode, mem);
4189 
4190   insn = emit_move_insn (mem, reg);
4191   RTX_FRAME_RELATED_P (insn) = 1;
4192 }
4193 
4194 /* Generate and return an instruction to store the pair of registers
4195    REG and REG2 of mode MODE to location BASE with write-back adjusting
4196    the stack location BASE by ADJUSTMENT.  */
4197 
4198 static rtx
4199 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4200 			  HOST_WIDE_INT adjustment)
4201 {
4202   switch (mode)
4203     {
4204     case E_DImode:
4205       return gen_storewb_pairdi_di (base, base, reg, reg2,
4206 				    GEN_INT (-adjustment),
4207 				    GEN_INT (UNITS_PER_WORD - adjustment));
4208     case E_DFmode:
4209       return gen_storewb_pairdf_di (base, base, reg, reg2,
4210 				    GEN_INT (-adjustment),
4211 				    GEN_INT (UNITS_PER_WORD - adjustment));
4212     default:
4213       gcc_unreachable ();
4214     }
4215 }
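/* E.g. for E_DImode with BASE == sp and ADJUSTMENT == 16, the insn
   returned above corresponds to the usual prologue push

     stp	x29, x30, [sp, -16]!

   (illustrative registers; REG and REG2 are whatever the caller
   passes).  */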
4216 
4217 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4218    stack pointer by ADJUSTMENT.  */
4219 
4220 static void
4221 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4222 {
4223   rtx_insn *insn;
4224   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4225 
4226   if (regno2 == INVALID_REGNUM)
4227     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4228 
4229   rtx reg1 = gen_rtx_REG (mode, regno1);
4230   rtx reg2 = gen_rtx_REG (mode, regno2);
4231 
4232   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4233 					      reg2, adjustment));
4234   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4235   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4236   RTX_FRAME_RELATED_P (insn) = 1;
4237 }
4238 
4239 /* Load the pair of registers REG and REG2 of mode MODE from stack location
4240    BASE, adjusting BASE by ADJUSTMENT afterwards.  */
4241 
4242 static rtx
4243 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4244 			 HOST_WIDE_INT adjustment)
4245 {
4246   switch (mode)
4247     {
4248     case E_DImode:
4249       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4250 				   GEN_INT (UNITS_PER_WORD));
4251     case E_DFmode:
4252       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4253 				   GEN_INT (UNITS_PER_WORD));
4254     default:
4255       gcc_unreachable ();
4256     }
4257 }
4258 
4259 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4260    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4261    into CFI_OPS.  */
4262 
4263 static void
4264 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4265 		  rtx *cfi_ops)
4266 {
4267   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4268   rtx reg1 = gen_rtx_REG (mode, regno1);
4269 
4270   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4271 
4272   if (regno2 == INVALID_REGNUM)
4273     {
4274       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4275       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4276       emit_move_insn (reg1, gen_frame_mem (mode, mem));
4277     }
4278   else
4279     {
4280       rtx reg2 = gen_rtx_REG (mode, regno2);
4281       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4282       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4283 					  reg2, adjustment));
4284     }
4285 }
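/* E.g. popping x29/x30 with ADJUSTMENT == 16 emits

     ldp	x29, x30, [sp], 16

   and queues REG_CFA_RESTORE notes for both registers in CFI_OPS
   (illustrative register choice).  */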
4286 
4287 /* Generate and return a store pair instruction of mode MODE to store
4288    register REG1 to MEM1 and register REG2 to MEM2.  */
4289 
4290 static rtx
4291 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4292 			rtx reg2)
4293 {
4294   switch (mode)
4295     {
4296     case E_DImode:
4297       return gen_store_pairdi (mem1, reg1, mem2, reg2);
4298 
4299     case E_DFmode:
4300       return gen_store_pairdf (mem1, reg1, mem2, reg2);
4301 
4302     default:
4303       gcc_unreachable ();
4304     }
4305 }
4306 
4307 /* Generate and return a load pair instruction of mode MODE to load register
4308    REG1 from MEM1 and register REG2 from MEM2.  */
4309 
4310 static rtx
4311 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4312 		       rtx mem2)
4313 {
4314   switch (mode)
4315     {
4316     case E_DImode:
4317       return gen_load_pairdi (reg1, mem1, reg2, mem2);
4318 
4319     case E_DFmode:
4320       return gen_load_pairdf (reg1, mem1, reg2, mem2);
4321 
4322     default:
4323       gcc_unreachable ();
4324     }
4325 }
4326 
4327 /* Return TRUE if return address signing should be enabled for the current
4328    function, otherwise return FALSE.  */
4329 
4330 bool
4331 aarch64_return_address_signing_enabled (void)
4332 {
4333   /* This function should only be called after the frame has been laid out.  */
4334   gcc_assert (cfun->machine->frame.laid_out);
4335 
4336   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4337      if its LR is pushed onto the stack.  */
4338   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4339 	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4340 	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4341 }
4342 
4343 /* Emit code to save the callee-saved registers from register number START
4344    to LIMIT to the stack at the location starting at offset START_OFFSET,
4345    skipping any write-back candidates if SKIP_WB is true.  */
4346 
4347 static void
4348 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4349 			   unsigned start, unsigned limit, bool skip_wb)
4350 {
4351   rtx_insn *insn;
4352   unsigned regno;
4353   unsigned regno2;
4354 
4355   for (regno = aarch64_next_callee_save (start, limit);
4356        regno <= limit;
4357        regno = aarch64_next_callee_save (regno + 1, limit))
4358     {
4359       rtx reg, mem;
4360       poly_int64 offset;
4361 
4362       if (skip_wb
4363 	  && (regno == cfun->machine->frame.wb_candidate1
4364 	      || regno == cfun->machine->frame.wb_candidate2))
4365 	continue;
4366 
4367       if (cfun->machine->reg_is_wrapped_separately[regno])
4368        continue;
4369 
4370       reg = gen_rtx_REG (mode, regno);
4371       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4372       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4373 						offset));
4374 
4375       regno2 = aarch64_next_callee_save (regno + 1, limit);
4376 
4377       if (regno2 <= limit
4378 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
4379 	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4380 	      == cfun->machine->frame.reg_offset[regno2]))
4381 
4382 	{
4383 	  rtx reg2 = gen_rtx_REG (mode, regno2);
4384 	  rtx mem2;
4385 
4386 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4387 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4388 						     offset));
4389 	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4390 						    reg2));
4391 
4392 	  /* The first part of a frame-related parallel insn is
4393 	     always assumed to be relevant to the frame
4394 	     calculations; subsequent parts are only
4395 	     frame-related if explicitly marked.  */
4396 	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4397 	  regno = regno2;
4398 	}
4399       else
4400 	insn = emit_move_insn (mem, reg);
4401 
4402       RTX_FRAME_RELATED_P (insn) = 1;
4403     }
4404 }
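/* For example (illustrative): if x19 and x20 both need saving and their
   frame offsets differ by exactly UNITS_PER_WORD, the loop above emits
   a single

     stp	x19, x20, [sp, #offset]

   rather than two separate stores.  */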
4405 
4406 /* Emit code to restore the callee registers of mode MODE from register
4407    number START up to and including LIMIT.  Restore from the stack offset
4408    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4409    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
4410 
4411 static void
4412 aarch64_restore_callee_saves (machine_mode mode,
4413 			      poly_int64 start_offset, unsigned start,
4414 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
4415 {
4416   rtx base_rtx = stack_pointer_rtx;
4417   unsigned regno;
4418   unsigned regno2;
4419   poly_int64 offset;
4420 
4421   for (regno = aarch64_next_callee_save (start, limit);
4422        regno <= limit;
4423        regno = aarch64_next_callee_save (regno + 1, limit))
4424     {
4425       if (cfun->machine->reg_is_wrapped_separately[regno])
4426        continue;
4427 
4428       rtx reg, mem;
4429 
4430       if (skip_wb
4431 	  && (regno == cfun->machine->frame.wb_candidate1
4432 	      || regno == cfun->machine->frame.wb_candidate2))
4433 	continue;
4434 
4435       reg = gen_rtx_REG (mode, regno);
4436       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4437       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4438 
4439       regno2 = aarch64_next_callee_save (regno + 1, limit);
4440 
4441       if (regno2 <= limit
4442 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
4443 	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4444 	      == cfun->machine->frame.reg_offset[regno2]))
4445 	{
4446 	  rtx reg2 = gen_rtx_REG (mode, regno2);
4447 	  rtx mem2;
4448 
4449 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4450 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4451 	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4452 
4453 	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4454 	  regno = regno2;
4455 	}
4456       else
4457 	emit_move_insn (reg, mem);
4458       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4459     }
4460 }
4461 
4462 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4463    of MODE.  */
4464 
4465 static inline bool
4466 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4467 {
4468   HOST_WIDE_INT multiple;
4469   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4470 	  && IN_RANGE (multiple, -8, 7));
4471 }
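/* E.g. for DImode (8-byte size) this accepts multiples of 8 in the
   range [-64, 56].  */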
4472 
4473 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4474    of MODE.  */
4475 
4476 static inline bool
4477 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4478 {
4479   HOST_WIDE_INT multiple;
4480   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4481 	  && IN_RANGE (multiple, 0, 63));
4482 }
4483 
4484 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4485    of MODE.  */
4486 
4487 bool
4488 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4489 {
4490   HOST_WIDE_INT multiple;
4491   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4492 	  && IN_RANGE (multiple, -64, 63));
4493 }
4494 
4495 /* Return true if OFFSET is a signed 9-bit value.  */
4496 
4497 static inline bool
4498 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4499 			       poly_int64 offset)
4500 {
4501   HOST_WIDE_INT const_offset;
4502   return (offset.is_constant (&const_offset)
4503 	  && IN_RANGE (const_offset, -256, 255));
4504 }
4505 
4506 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4507    of MODE.  */
4508 
4509 static inline bool
4510 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4511 {
4512   HOST_WIDE_INT multiple;
4513   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4514 	  && IN_RANGE (multiple, -256, 255));
4515 }
4516 
4517 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4518    of MODE.  */
4519 
4520 static inline bool
4521 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4522 {
4523   HOST_WIDE_INT multiple;
4524   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4525 	  && IN_RANGE (multiple, 0, 4095));
4526 }
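/* E.g. for DImode (8-byte size) this accepts multiples of 8 in the
   range [0, 32760], the unsigned scaled offset range of a single
   64-bit LDR/STR.  */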
4527 
4528 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
4529 
4530 static sbitmap
4531 aarch64_get_separate_components (void)
4532 {
4533   aarch64_layout_frame ();
4534 
4535   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4536   bitmap_clear (components);
4537 
4538   /* The registers we need saved to the frame.  */
4539   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4540     if (aarch64_register_saved_on_entry (regno))
4541       {
4542 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4543 	if (!frame_pointer_needed)
4544 	  offset += cfun->machine->frame.frame_size
4545 		    - cfun->machine->frame.hard_fp_offset;
4546 	/* Check that we can access the stack slot of the register with one
4547 	   direct load with no adjustments needed.  */
4548 	if (offset_12bit_unsigned_scaled_p (DImode, offset))
4549 	  bitmap_set_bit (components, regno);
4550       }
4551 
4552   /* Don't mess with the hard frame pointer.  */
4553   if (frame_pointer_needed)
4554     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4555 
4556   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4557   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4558   /* If aarch64_layout_frame has chosen registers to store/restore with
4559      writeback don't interfere with them to avoid having to output explicit
4560      stack adjustment instructions.  */
4561   if (reg2 != INVALID_REGNUM)
4562     bitmap_clear_bit (components, reg2);
4563   if (reg1 != INVALID_REGNUM)
4564     bitmap_clear_bit (components, reg1);
4565 
4566   bitmap_clear_bit (components, LR_REGNUM);
4567   bitmap_clear_bit (components, SP_REGNUM);
4568 
4569   return components;
4570 }
4571 
4572 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
4573 
4574 static sbitmap
4575 aarch64_components_for_bb (basic_block bb)
4576 {
4577   bitmap in = DF_LIVE_IN (bb);
4578   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4579   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4580 
4581   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4582   bitmap_clear (components);
4583 
4584   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
4585   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4586     if ((!call_used_regs[regno])
4587        && (bitmap_bit_p (in, regno)
4588 	   || bitmap_bit_p (gen, regno)
4589 	   || bitmap_bit_p (kill, regno)))
4590       {
4591 	unsigned regno2, offset, offset2;
4592 	bitmap_set_bit (components, regno);
4593 
4594 	/* If there is a callee-save at an adjacent offset, add it as well
4595 	   to increase the use of LDP/STP.  */
4596 	offset = cfun->machine->frame.reg_offset[regno];
4597 	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4598 
4599 	if (regno2 <= LAST_SAVED_REGNUM)
4600 	  {
4601 	    offset2 = cfun->machine->frame.reg_offset[regno2];
4602 	    if ((offset & ~8) == (offset2 & ~8))
4603 	      bitmap_set_bit (components, regno2);
4604 	  }
4605       }
4606 
4607   return components;
4608 }
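/* For example (illustrative): if x20 is used in the block and is saved
   at offset 16 while x21 is saved at offset 24, then (offset & ~8) is
   the same for both, so x21 is added as well and the pair can later be
   handled with a single STP/LDP.  */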
4609 
4610 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4611    Nothing to do for aarch64.  */
4612 
4613 static void
4614 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4615 {
4616 }
4617 
4618 /* Return the next set bit in BMP from START onwards.  Return the total number
4619    of bits in BMP if no set bit is found at or after START.  */
4620 
4621 static unsigned int
4622 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4623 {
4624   unsigned int nbits = SBITMAP_SIZE (bmp);
4625   if (start == nbits)
4626     return start;
4627 
4628   gcc_assert (start < nbits);
4629   for (unsigned int i = start; i < nbits; i++)
4630     if (bitmap_bit_p (bmp, i))
4631       return i;
4632 
4633   return nbits;
4634 }
4635 
4636 /* Do the work for aarch64_emit_prologue_components and
4637    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
4638    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4639    for these components or the epilogue sequence.  That is, it determines
4640    whether we should emit stores or loads and what kind of CFA notes to attach
4641    to the insns.  Otherwise the logic for the two sequences is very
4642    similar.  */
4643 
4644 static void
4645 aarch64_process_components (sbitmap components, bool prologue_p)
4646 {
4647   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4648 			     ? HARD_FRAME_POINTER_REGNUM
4649 			     : STACK_POINTER_REGNUM);
4650 
4651   unsigned last_regno = SBITMAP_SIZE (components);
4652   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4653   rtx_insn *insn = NULL;
4654 
4655   while (regno != last_regno)
4656     {
4657       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4658 	 so DFmode for the vector registers is enough.  */
4659       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4660       rtx reg = gen_rtx_REG (mode, regno);
4661       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4662       if (!frame_pointer_needed)
4663 	offset += cfun->machine->frame.frame_size
4664 		  - cfun->machine->frame.hard_fp_offset;
4665       rtx addr = plus_constant (Pmode, ptr_reg, offset);
4666       rtx mem = gen_frame_mem (mode, addr);
4667 
4668       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4669       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4670       /* No more registers to handle after REGNO.
4671 	 Emit a single save/restore and exit.  */
4672       if (regno2 == last_regno)
4673 	{
4674 	  insn = emit_insn (set);
4675 	  RTX_FRAME_RELATED_P (insn) = 1;
4676 	  if (prologue_p)
4677 	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4678 	  else
4679 	    add_reg_note (insn, REG_CFA_RESTORE, reg);
4680 	  break;
4681 	}
4682 
4683       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4684       /* The next register is not of the same class or its offset is not
4685 	 mergeable with the current one into a pair.  */
4686       if (!satisfies_constraint_Ump (mem)
4687 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4688 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4689 		       GET_MODE_SIZE (mode)))
4690 	{
4691 	  insn = emit_insn (set);
4692 	  RTX_FRAME_RELATED_P (insn) = 1;
4693 	  if (prologue_p)
4694 	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4695 	  else
4696 	    add_reg_note (insn, REG_CFA_RESTORE, reg);
4697 
4698 	  regno = regno2;
4699 	  continue;
4700 	}
4701 
4702       /* REGNO2 can be saved/restored in a pair with REGNO.  */
4703       rtx reg2 = gen_rtx_REG (mode, regno2);
4704       if (!frame_pointer_needed)
4705 	offset2 += cfun->machine->frame.frame_size
4706 		  - cfun->machine->frame.hard_fp_offset;
4707       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4708       rtx mem2 = gen_frame_mem (mode, addr2);
4709       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4710 			     : gen_rtx_SET (reg2, mem2);
4711 
4712       if (prologue_p)
4713 	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4714       else
4715 	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4716 
4717       RTX_FRAME_RELATED_P (insn) = 1;
4718       if (prologue_p)
4719 	{
4720 	  add_reg_note (insn, REG_CFA_OFFSET, set);
4721 	  add_reg_note (insn, REG_CFA_OFFSET, set2);
4722 	}
4723       else
4724 	{
4725 	  add_reg_note (insn, REG_CFA_RESTORE, reg);
4726 	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
4727 	}
4728 
4729       regno = aarch64_get_next_set_bit (components, regno2 + 1);
4730     }
4731 }
4732 
4733 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
4734 
4735 static void
4736 aarch64_emit_prologue_components (sbitmap components)
4737 {
4738   aarch64_process_components (components, true);
4739 }
4740 
4741 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
4742 
4743 static void
4744 aarch64_emit_epilogue_components (sbitmap components)
4745 {
4746   aarch64_process_components (components, false);
4747 }
4748 
4749 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
4750 
4751 static void
4752 aarch64_set_handled_components (sbitmap components)
4753 {
4754   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4755     if (bitmap_bit_p (components, regno))
4756       cfun->machine->reg_is_wrapped_separately[regno] = true;
4757 }
4758 
4759 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4760    is saved at BASE + OFFSET.  */
4761 
4762 static void
4763 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4764 			    rtx base, poly_int64 offset)
4765 {
4766   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4767   add_reg_note (insn, REG_CFA_EXPRESSION,
4768 		gen_rtx_SET (mem, regno_reg_rtx[reg]));
4769 }
4770 
4771 /* AArch64 stack frames generated by this compiler look like:
4772 
4773 	+-------------------------------+
4774 	|                               |
4775 	|  incoming stack arguments     |
4776 	|                               |
4777 	+-------------------------------+
4778 	|                               | <-- incoming stack pointer (aligned)
4779 	|  callee-allocated save area   |
4780 	|  for register varargs         |
4781 	|                               |
4782 	+-------------------------------+
4783 	|  local variables              | <-- frame_pointer_rtx
4784 	|                               |
4785 	+-------------------------------+
4786 	|  padding0                     | \
4787 	+-------------------------------+  |
4788 	|  callee-saved registers       |  | frame.saved_regs_size
4789 	+-------------------------------+  |
4790 	|  LR'                          |  |
4791 	+-------------------------------+  |
4792 	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
4793         +-------------------------------+
4794 	|  dynamic allocation           |
4795 	+-------------------------------+
4796 	|  padding                      |
4797 	+-------------------------------+
4798 	|  outgoing stack arguments     | <-- arg_pointer
4799         |                               |
4800 	+-------------------------------+
4801 	|                               | <-- stack_pointer_rtx (aligned)
4802 
4803    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4804    but leave frame_pointer_rtx and hard_frame_pointer_rtx
4805    unchanged.  */
4806 
4807 /* Generate the prologue instructions for entry into a function.
4808    Establish the stack frame by decreasing the stack pointer with a
4809    properly calculated size and, if necessary, create a frame record
4810    filled with the values of LR and previous frame pointer.  The
4811    current FP is also set up if it is in use.  */
4812 
4813 void
4814 aarch64_expand_prologue (void)
4815 {
4816   aarch64_layout_frame ();
4817 
4818   poly_int64 frame_size = cfun->machine->frame.frame_size;
4819   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4820   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4821   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4822   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4823   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4824   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4825   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4826   rtx_insn *insn;
4827 
4828   /* Sign return address for functions.  */
4829   if (aarch64_return_address_signing_enabled ())
4830     {
4831       insn = emit_insn (gen_pacisp ());
4832       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4833       RTX_FRAME_RELATED_P (insn) = 1;
4834     }
4835 
4836   if (flag_stack_usage_info)
4837     current_function_static_stack_size = constant_lower_bound (frame_size);
4838 
4839   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4840     {
4841       if (crtl->is_leaf && !cfun->calls_alloca)
4842 	{
4843 	  if (maybe_gt (frame_size, PROBE_INTERVAL)
4844 	      && maybe_gt (frame_size, get_stack_check_protect ()))
4845 	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
4846 					    (frame_size
4847 					     - get_stack_check_protect ()));
4848 	}
4849       else if (maybe_gt (frame_size, 0))
4850 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4851     }
4852 
4853   rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4854   rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4855 
4856   aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4857 
4858   if (callee_adjust != 0)
4859     aarch64_push_regs (reg1, reg2, callee_adjust);
4860 
4861   if (emit_frame_chain)
4862     {
4863       poly_int64 reg_offset = callee_adjust;
4864       if (callee_adjust == 0)
4865 	{
4866 	  reg1 = R29_REGNUM;
4867 	  reg2 = R30_REGNUM;
4868 	  reg_offset = callee_offset;
4869 	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4870 	}
4871       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4872 			  stack_pointer_rtx, callee_offset,
4873 			  ip1_rtx, ip0_rtx, frame_pointer_needed);
4874       if (frame_pointer_needed && !frame_size.is_constant ())
4875 	{
4876 	  /* Variable-sized frames need to describe the save slot
4877 	     address using DW_CFA_expression rather than DW_CFA_offset.
4878 	     This means that, without taking further action, the
4879 	     locations of the registers that we've already saved would
4880 	     remain based on the stack pointer even after we redefine
4881 	     the CFA based on the frame pointer.  We therefore need new
4882 	     DW_CFA_expressions to re-express the save slots with addresses
4883 	     based on the frame pointer.  */
4884 	  rtx_insn *insn = get_last_insn ();
4885 	  gcc_assert (RTX_FRAME_RELATED_P (insn));
4886 
4887 	  /* Add an explicit CFA definition if this was previously
4888 	     implicit.  */
4889 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4890 	    {
4891 	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
4892 				       callee_offset);
4893 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
4894 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
4895 	    }
4896 
4897 	  /* Change the save slot expressions for the registers that
4898 	     we've already saved.  */
4899 	  reg_offset -= callee_offset;
4900 	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4901 				      reg_offset + UNITS_PER_WORD);
4902 	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4903 				      reg_offset);
4904 	}
4905       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4906     }
4907 
4908   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4909 			     callee_adjust != 0 || emit_frame_chain);
4910   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4911 			     callee_adjust != 0 || emit_frame_chain);
4912   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4913 }
4914 
4915 /* Return TRUE if we can use a simple_return insn.
4916 
4917    This function checks whether the callee-saved stack is empty, which
4918    means no restore actions are needed.  The pro_and_epilogue pass uses
4919    this to check whether the shrink-wrapping optimization is feasible.  */
4920 
4921 bool
4922 aarch64_use_return_insn_p (void)
4923 {
4924   if (!reload_completed)
4925     return false;
4926 
4927   if (crtl->profile)
4928     return false;
4929 
4930   aarch64_layout_frame ();
4931 
4932   return known_eq (cfun->machine->frame.frame_size, 0);
4933 }
4934 
4935 /* Generate the epilogue instructions for returning from a function.
4936    This is almost exactly the reverse of the prologue sequence, except
4937    that we need to insert barriers to avoid scheduling loads that read
4938    from a deallocated stack, and we optimize the unwind records by
4939    emitting them all together if possible.  */
4940 void
4941 aarch64_expand_epilogue (bool for_sibcall)
4942 {
4943   aarch64_layout_frame ();
4944 
4945   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4946   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4947   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4948   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4949   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4950   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4951   rtx cfi_ops = NULL;
4952   rtx_insn *insn;
4953   /* A stack clash protection prologue may not have left IP0_REGNUM or
4954      IP1_REGNUM in a usable state.  The same is true for allocations
4955      with an SVE component, since we then need both temporary registers
4956      for each allocation.  */
4957   bool can_inherit_p = (initial_adjust.is_constant ()
4958 			&& final_adjust.is_constant ()
4959 			&& !flag_stack_clash_protection);
4960 
4961   /* We need to add memory barrier to prevent read from deallocated stack.  */
4962   bool need_barrier_p
4963     = maybe_ne (get_frame_size ()
4964 		+ cfun->machine->frame.saved_varargs_size, 0);
4965 
4966   /* Emit a barrier to prevent loads from a deallocated stack.  */
4967   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4968       || cfun->calls_alloca
4969       || crtl->calls_eh_return)
4970     {
4971       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4972       need_barrier_p = false;
4973     }
4974 
4975   /* Restore the stack pointer from the frame pointer if it may not
4976      be the same as the stack pointer.  */
4977   rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4978   rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4979   if (frame_pointer_needed
4980       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4981     /* If writeback is used when restoring callee-saves, the CFA
4982        is restored on the instruction doing the writeback.  */
4983     aarch64_add_offset (Pmode, stack_pointer_rtx,
4984 			hard_frame_pointer_rtx, -callee_offset,
4985 			ip1_rtx, ip0_rtx, callee_adjust == 0);
4986   else
4987     aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4988 		    !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4989 
4990   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4991 				callee_adjust != 0, &cfi_ops);
4992   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4993 				callee_adjust != 0, &cfi_ops);
4994 
4995   if (need_barrier_p)
4996     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4997 
4998   if (callee_adjust != 0)
4999     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5000 
5001   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5002     {
5003       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
5004       insn = get_last_insn ();
5005       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5006       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5007       RTX_FRAME_RELATED_P (insn) = 1;
5008       cfi_ops = NULL;
5009     }
5010 
5011   aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5012 		  !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5013 
5014   if (cfi_ops)
5015     {
5016       /* Emit delayed restores and reset the CFA to be SP.  */
5017       insn = get_last_insn ();
5018       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5019       REG_NOTES (insn) = cfi_ops;
5020       RTX_FRAME_RELATED_P (insn) = 1;
5021     }
5022 
5023   /* We prefer to emit the combined return/authenticate instruction RETAA,
5024      however there are three cases in which we must instead emit an explicit
5025      authentication instruction.
5026 
5027 	1) Sibcalls don't return in a normal way, so if we're about to call one
5028 	   we must authenticate.
5029 
5030 	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5031 	   generating code for !TARGET_ARMV8_3 we can't use it and must
5032 	   explicitly authenticate.
5033 
5034 	3) On an eh_return path we make extra stack adjustments to update the
5035 	   canonical frame address to be the exception handler's CFA.  We want
5036 	   to authenticate using the CFA of the function which calls eh_return.
5037     */
5038   if (aarch64_return_address_signing_enabled ()
5039       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5040     {
5041       insn = emit_insn (gen_autisp ());
5042       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5043       RTX_FRAME_RELATED_P (insn) = 1;
5044     }
5045 
5046   /* Stack adjustment for exception handler.  */
5047   if (crtl->calls_eh_return)
5048     {
5049       /* We need to unwind the stack by the offset computed by
5050 	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
5051 	 to be SP; letting the CFA move during this adjustment
5052 	 is just as correct as retaining the CFA from the body
5053 	 of the function.  Therefore, do nothing special.  */
5054       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5055     }
5056 
5057   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5058   if (!for_sibcall)
5059     emit_jump_insn (ret_rtx);
5060 }
5061 
5062 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
5063    normally or return to a previous frame after unwinding.
5064 
5065    An EH return uses a single shared return sequence.  The epilogue is
5066    exactly like a normal epilogue except that it has an extra input
5067    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5068    that must be applied after the frame has been destroyed.  An extra label
5069    is inserted before the epilogue which initializes this register to zero,
5070    and this is the entry point for a normal return.
5071 
5072    An actual EH return updates the return address, initializes the stack
5073    adjustment and jumps directly into the epilogue (bypassing the zeroing
5074    of the adjustment).  Since the return address is typically saved on the
5075    stack when a function makes a call, the saved LR must be updated outside
5076    the epilogue.
5077 
5078    This poses problems as the store is generated well before the epilogue,
5079    so the offset of LR is not known yet.  Also, optimizations will remove the
5080    store because it appears dead, even after the epilogue is generated (as the
5081    base or offset for loading LR is different in many cases).
5082 
5083    To avoid these problems this implementation forces the frame pointer
5084    in eh_return functions so that the location of LR is fixed and known early.
5085    It also marks the store volatile, so no optimization is permitted to
5086    remove the store.  */
5087 rtx
5088 aarch64_eh_return_handler_rtx (void)
5089 {
5090   rtx tmp = gen_frame_mem (Pmode,
5091     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5092 
5093   /* Mark the store volatile, so no optimization is permitted to remove it.  */
5094   MEM_VOLATILE_P (tmp) = true;
5095   return tmp;
5096 }
5097 
5098 /* Output code to add DELTA to the first argument, and then jump
5099    to FUNCTION.  Used for C++ multiple inheritance.  */
5100 static void
5101 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5102 			 HOST_WIDE_INT delta,
5103 			 HOST_WIDE_INT vcall_offset,
5104 			 tree function)
5105 {
5106   /* The this pointer is always in x0.  Note that this differs from
5107      Arm, where the this pointer may be bumped to r1 if r0 is required
5108      to return a pointer to an aggregate.  On AArch64 a result value
5109      pointer will be in x8.  */
5110   int this_regno = R0_REGNUM;
5111   rtx this_rtx, temp0, temp1, addr, funexp;
5112   rtx_insn *insn;
5113 
5114   reload_completed = 1;
5115   emit_note (NOTE_INSN_PROLOGUE_END);
5116 
5117   this_rtx = gen_rtx_REG (Pmode, this_regno);
5118   temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5119   temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5120 
5121   if (vcall_offset == 0)
5122     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5123   else
5124     {
5125       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5126 
5127       addr = this_rtx;
5128       if (delta != 0)
5129 	{
5130 	  if (delta >= -256 && delta < 256)
5131 	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5132 				       plus_constant (Pmode, this_rtx, delta));
5133 	  else
5134 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5135 				temp1, temp0, false);
5136 	}
5137 
5138       if (Pmode == ptr_mode)
5139 	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5140       else
5141 	aarch64_emit_move (temp0,
5142 			   gen_rtx_ZERO_EXTEND (Pmode,
5143 						gen_rtx_MEM (ptr_mode, addr)));
5144 
5145       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5146 	  addr = plus_constant (Pmode, temp0, vcall_offset);
5147       else
5148 	{
5149 	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5150 					  Pmode);
5151 	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5152 	}
5153 
5154       if (Pmode == ptr_mode)
5155 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5156       else
5157 	aarch64_emit_move (temp1,
5158 			   gen_rtx_SIGN_EXTEND (Pmode,
5159 						gen_rtx_MEM (ptr_mode, addr)));
5160 
5161       emit_insn (gen_add2_insn (this_rtx, temp1));
5162     }
5163 
5164   /* Generate a tail call to the target function.  */
5165   if (!TREE_USED (function))
5166     {
5167       assemble_external (function);
5168       TREE_USED (function) = 1;
5169     }
5170   funexp = XEXP (DECL_RTL (function), 0);
5171   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5172   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5173   SIBLING_CALL_P (insn) = 1;
5174 
5175   insn = get_insns ();
5176   shorten_branches (insn);
5177   final_start_function (insn, file, 1);
5178   final (insn, file, 1);
5179   final_end_function ();
5180 
5181   /* Stop pretending to be a post-reload pass.  */
5182   reload_completed = 0;
5183 }
5184 
5185 static bool
5186 aarch64_tls_referenced_p (rtx x)
5187 {
5188   if (!TARGET_HAVE_TLS)
5189     return false;
5190   subrtx_iterator::array_type array;
5191   FOR_EACH_SUBRTX (iter, array, x, ALL)
5192     {
5193       const_rtx x = *iter;
5194       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5195 	return true;
5196       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5197 	 TLS offsets, not real symbol references.  */
5198       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5199 	iter.skip_subrtxes ();
5200     }
5201   return false;
5202 }
5203 
5204 
5205 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5206    a left shift of 0 or 12 bits.  */
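/* For example, 0x123 (shift of 0) and 0x123000 (shift of 12) can both be
   encoded this way, whereas 0x123456 cannot, since its set bits span the
   two shifted ranges.  (Illustrative values only.)  */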
5207 bool
5208 aarch64_uimm12_shift (HOST_WIDE_INT val)
5209 {
5210   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5211 	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5212 	  );
5213 }
5214 
5215 
5216 /* Return true if val is an immediate that can be loaded into a
5217    register by a MOVZ instruction.  */
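/* For instance, 0xffff0000 and 0x1234 are single-MOVZ values, while
   0x00120034 is not, because its set bits straddle two 16-bit chunks.
   (Illustrative values, not an exhaustive description.)  */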
5218 static bool
5219 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5220 {
5221   if (GET_MODE_SIZE (mode) > 4)
5222     {
5223       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5224 	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5225 	return 1;
5226     }
5227   else
5228     {
5229       /* Ignore sign extension.  */
5230       val &= (HOST_WIDE_INT) 0xffffffff;
5231     }
5232   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5233 	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5234 }
5235 
5236 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
5237    64-bit (DImode) integer.  */
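/* For example, replicating the QImode value 0xab gives
   0xabababababababab.  (Worked example for illustration.)  */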
5238 
5239 static unsigned HOST_WIDE_INT
5240 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5241 {
5242   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5243   while (size < 64)
5244     {
5245       val &= (HOST_WIDE_INT_1U << size) - 1;
5246       val |= val << size;
5247       size *= 2;
5248     }
5249   return val;
5250 }
5251 
5252 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
5253 
5254 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5255   {
5256     0x0000000100000001ull,
5257     0x0001000100010001ull,
5258     0x0101010101010101ull,
5259     0x1111111111111111ull,
5260     0x5555555555555555ull,
5261   };
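/* aarch64_bitmask_imm indexes this table as
   bitmask_imm_mul[__builtin_clz (bits) - 26]; assuming the usual 32-bit
   unsigned int for __builtin_clz, a repeat width of 32 selects the first
   entry and a width of 2 selects the last.  */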
5262 
5263 
5264 /* Return true if val is a valid bitmask immediate.  */
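/* An AArch64 bitmask (logical) immediate is a rotated contiguous run of
   ones within an element of 2, 4, 8, 16, 32 or 64 bits, replicated to fill
   the register; the all-zeros and all-ones values are excluded.  */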
5265 
5266 bool
5267 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5268 {
5269   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5270   int bits;
5271 
5272   /* Check for a single sequence of one bits and return quickly if so.
5273      The special cases of all ones and all zeroes return false.  */
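  /* Adding the lowest set bit to VAL carries through its lowest run of ones,
     so the sum is zero or a power of two exactly when VAL is a single
     contiguous run.  "(val + 1) > 1" then rejects the all-zeros and all-ones
     cases, which are not valid bitmask immediates.  */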
5274   val = aarch64_replicate_bitmask_imm (val_in, mode);
5275   tmp = val + (val & -val);
5276 
5277   if (tmp == (tmp & -tmp))
5278     return (val + 1) > 1;
5279 
5280   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
5281   if (mode == SImode)
5282     val = (val << 32) | (val & 0xffffffff);
5283 
5284   /* Invert if the immediate doesn't start with a zero bit - this means we
5285      only need to search for sequences of one bits.  */
5286   if (val & 1)
5287     val = ~val;
5288 
5289   /* Find the first set bit and set tmp to val with the first sequence of one
5290      bits removed.  Return success if there is a single sequence of ones.  */
5291   first_one = val & -val;
5292   tmp = val & (val + first_one);
5293 
5294   if (tmp == 0)
5295     return true;
5296 
5297   /* Find the next set bit and compute the difference in bit position.  */
5298   next_one = tmp & -tmp;
5299   bits = clz_hwi (first_one) - clz_hwi (next_one);
5300   mask = val ^ tmp;
5301 
5302   /* Check the bit position difference is a power of 2, and that the first
5303      sequence of one bits fits within 'bits' bits.  */
5304   if ((mask >> bits) != 0 || bits != (bits & -bits))
5305     return false;
5306 
5307   /* Check the sequence of one bits is repeated 64/bits times.  */
5308   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5309 }
5310 
5311 /* Create a mask of ones covering the range from the lowest to the highest
5312    bit set in VAL_IN.  Assumed precondition: VAL_IN is not zero.  */
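/* For example, aarch64_and_split_imm1 (0x00f000f0) returns 0x00fffff0:
   the lowest set bit is bit 4 and the highest is bit 23, so the result is
   (2 << 23) - (1 << 4).  (Worked example for illustration.)  */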
5313 
5314 unsigned HOST_WIDE_INT
5315 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5316 {
5317   int lowest_bit_set = ctz_hwi (val_in);
5318   int highest_bit_set = floor_log2 (val_in);
5319   gcc_assert (val_in != 0);
5320 
5321   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5322 	  (HOST_WIDE_INT_1U << lowest_bit_set));
5323 }
5324 
5325 /* Create constant where bits outside of lowest bit set to highest bit set
5326    are set to 1.  */
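/* By construction, VAL_IN == (aarch64_and_split_imm1 (VAL_IN)
   & aarch64_and_split_imm2 (VAL_IN)), so an AND with VAL_IN can be
   synthesized as two ANDs whenever both halves are encodable as
   bitmask immediates.  */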
5327 
5328 unsigned HOST_WIDE_INT
5329 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5330 {
5331   return val_in | ~aarch64_and_split_imm1 (val_in);
5332 }
5333 
5334 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
5335 
5336 bool
5337 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5338 {
5339   scalar_int_mode int_mode;
5340   if (!is_a <scalar_int_mode> (mode, &int_mode))
5341     return false;
5342 
5343   if (aarch64_bitmask_imm (val_in, int_mode))
5344     return false;
5345 
5346   if (aarch64_move_imm (val_in, int_mode))
5347     return false;
5348 
5349   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5350 
5351   return aarch64_bitmask_imm (imm2, int_mode);
5352 }
5353 
5354 /* Return true if val is an immediate that can be loaded into a
5355    register in a single instruction.  */
5356 bool
5357 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5358 {
5359   scalar_int_mode int_mode;
5360   if (!is_a <scalar_int_mode> (mode, &int_mode))
5361     return false;
5362 
5363   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5364     return 1;
5365   return aarch64_bitmask_imm (val, int_mode);
5366 }
5367 
5368 static bool
5369 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5370 {
5371   rtx base, offset;
5372 
5373   if (GET_CODE (x) == HIGH)
5374     return true;
5375 
5376   /* There's no way to calculate VL-based values using relocations.  */
5377   subrtx_iterator::array_type array;
5378   FOR_EACH_SUBRTX (iter, array, x, ALL)
5379     if (GET_CODE (*iter) == CONST_POLY_INT)
5380       return true;
5381 
5382   split_const (x, &base, &offset);
5383   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5384     {
5385       if (aarch64_classify_symbol (base, INTVAL (offset))
5386 	  != SYMBOL_FORCE_TO_MEM)
5387 	return true;
5388       else
5389 	/* Avoid generating a 64-bit relocation in ILP32; leave
5390 	   to aarch64_expand_mov_immediate to handle it properly.  */
5391 	return mode != ptr_mode;
5392     }
5393 
5394   return aarch64_tls_referenced_p (x);
5395 }
5396 
5397 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5398    The expansion for a table switch is quite expensive due to the number
5399    of instructions, the table lookup and the hard-to-predict indirect jump.
5400    When optimizing for speed with -O3 enabled, use the per-core tuning if
5401    set; otherwise use tables for more than 16 cases as a tradeoff between size and
5402    performance.  When optimizing for size, use the default setting.  */
5403 
5404 static unsigned int
5405 aarch64_case_values_threshold (void)
5406 {
5407   /* Use the specified limit for the number of cases before using jump
5408      tables at higher optimization levels.  */
5409   if (optimize > 2
5410       && selected_cpu->tune->max_case_values != 0)
5411     return selected_cpu->tune->max_case_values;
5412   else
5413     return optimize_size ? default_case_values_threshold () : 17;
5414 }
5415 
5416 /* Return true if register REGNO is a valid index register.
5417    STRICT_P is true if REG_OK_STRICT is in effect.  */
5418 
5419 bool
5420 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5421 {
5422   if (!HARD_REGISTER_NUM_P (regno))
5423     {
5424       if (!strict_p)
5425 	return true;
5426 
5427       if (!reg_renumber)
5428 	return false;
5429 
5430       regno = reg_renumber[regno];
5431     }
5432   return GP_REGNUM_P (regno);
5433 }
5434 
5435 /* Return true if register REGNO is a valid base register.
5436    STRICT_P is true if REG_OK_STRICT is in effect.  */
5437 
5438 bool
5439 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5440 {
5441   if (!HARD_REGISTER_NUM_P (regno))
5442     {
5443       if (!strict_p)
5444 	return true;
5445 
5446       if (!reg_renumber)
5447 	return false;
5448 
5449       regno = reg_renumber[regno];
5450     }
5451 
5452   /* The fake registers will be eliminated to either the stack or
5453      hard frame pointer, both of which are usually valid base registers.
5454      Reload deals with the cases where the eliminated form isn't valid.  */
5455   return (GP_REGNUM_P (regno)
5456 	  || regno == SP_REGNUM
5457 	  || regno == FRAME_POINTER_REGNUM
5458 	  || regno == ARG_POINTER_REGNUM);
5459 }
5460 
5461 /* Return true if X is a valid base register.
5462    STRICT_P is true if REG_OK_STRICT is in effect.  */
5463 
5464 static bool
5465 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5466 {
5467   if (!strict_p
5468       && GET_CODE (x) == SUBREG
5469       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5470     x = SUBREG_REG (x);
5471 
5472   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5473 }
5474 
5475 /* Return true if address offset is a valid index.  If it is, fill in INFO
5476    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
5477 
5478 static bool
5479 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5480 			machine_mode mode, bool strict_p)
5481 {
5482   enum aarch64_address_type type;
5483   rtx index;
5484   int shift;
5485 
5486   /* (reg:P) */
5487   if ((REG_P (x) || GET_CODE (x) == SUBREG)
5488       && GET_MODE (x) == Pmode)
5489     {
5490       type = ADDRESS_REG_REG;
5491       index = x;
5492       shift = 0;
5493     }
5494   /* (sign_extend:DI (reg:SI)) */
5495   else if ((GET_CODE (x) == SIGN_EXTEND
5496 	    || GET_CODE (x) == ZERO_EXTEND)
5497 	   && GET_MODE (x) == DImode
5498 	   && GET_MODE (XEXP (x, 0)) == SImode)
5499     {
5500       type = (GET_CODE (x) == SIGN_EXTEND)
5501 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5502       index = XEXP (x, 0);
5503       shift = 0;
5504     }
5505   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5506   else if (GET_CODE (x) == MULT
5507 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5508 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5509 	   && GET_MODE (XEXP (x, 0)) == DImode
5510 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5511 	   && CONST_INT_P (XEXP (x, 1)))
5512     {
5513       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5514 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5515       index = XEXP (XEXP (x, 0), 0);
5516       shift = exact_log2 (INTVAL (XEXP (x, 1)));
5517     }
5518   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5519   else if (GET_CODE (x) == ASHIFT
5520 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5521 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5522 	   && GET_MODE (XEXP (x, 0)) == DImode
5523 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5524 	   && CONST_INT_P (XEXP (x, 1)))
5525     {
5526       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5527 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5528       index = XEXP (XEXP (x, 0), 0);
5529       shift = INTVAL (XEXP (x, 1));
5530     }
5531   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5532   else if ((GET_CODE (x) == SIGN_EXTRACT
5533 	    || GET_CODE (x) == ZERO_EXTRACT)
5534 	   && GET_MODE (x) == DImode
5535 	   && GET_CODE (XEXP (x, 0)) == MULT
5536 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5537 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5538     {
5539       type = (GET_CODE (x) == SIGN_EXTRACT)
5540 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5541       index = XEXP (XEXP (x, 0), 0);
5542       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5543       if (INTVAL (XEXP (x, 1)) != 32 + shift
5544 	  || INTVAL (XEXP (x, 2)) != 0)
5545 	shift = -1;
5546     }
5547   /* (and:DI (mult:DI (reg:DI) (const_int scale))
5548      (const_int 0xffffffff<<shift)) */
5549   else if (GET_CODE (x) == AND
5550 	   && GET_MODE (x) == DImode
5551 	   && GET_CODE (XEXP (x, 0)) == MULT
5552 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5553 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5554 	   && CONST_INT_P (XEXP (x, 1)))
5555     {
5556       type = ADDRESS_REG_UXTW;
5557       index = XEXP (XEXP (x, 0), 0);
5558       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5559       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5560 	shift = -1;
5561     }
5562   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5563   else if ((GET_CODE (x) == SIGN_EXTRACT
5564 	    || GET_CODE (x) == ZERO_EXTRACT)
5565 	   && GET_MODE (x) == DImode
5566 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
5567 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5568 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5569     {
5570       type = (GET_CODE (x) == SIGN_EXTRACT)
5571 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5572       index = XEXP (XEXP (x, 0), 0);
5573       shift = INTVAL (XEXP (XEXP (x, 0), 1));
5574       if (INTVAL (XEXP (x, 1)) != 32 + shift
5575 	  || INTVAL (XEXP (x, 2)) != 0)
5576 	shift = -1;
5577     }
5578   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5579      (const_int 0xffffffff<<shift)) */
5580   else if (GET_CODE (x) == AND
5581 	   && GET_MODE (x) == DImode
5582 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
5583 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5584 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5585 	   && CONST_INT_P (XEXP (x, 1)))
5586     {
5587       type = ADDRESS_REG_UXTW;
5588       index = XEXP (XEXP (x, 0), 0);
5589       shift = INTVAL (XEXP (XEXP (x, 0), 1));
5590       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5591 	shift = -1;
5592     }
5593   /* (mult:P (reg:P) (const_int scale)) */
5594   else if (GET_CODE (x) == MULT
5595 	   && GET_MODE (x) == Pmode
5596 	   && GET_MODE (XEXP (x, 0)) == Pmode
5597 	   && CONST_INT_P (XEXP (x, 1)))
5598     {
5599       type = ADDRESS_REG_REG;
5600       index = XEXP (x, 0);
5601       shift = exact_log2 (INTVAL (XEXP (x, 1)));
5602     }
5603   /* (ashift:P (reg:P) (const_int shift)) */
5604   else if (GET_CODE (x) == ASHIFT
5605 	   && GET_MODE (x) == Pmode
5606 	   && GET_MODE (XEXP (x, 0)) == Pmode
5607 	   && CONST_INT_P (XEXP (x, 1)))
5608     {
5609       type = ADDRESS_REG_REG;
5610       index = XEXP (x, 0);
5611       shift = INTVAL (XEXP (x, 1));
5612     }
5613   else
5614     return false;
5615 
5616   if (!strict_p
5617       && GET_CODE (index) == SUBREG
5618       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5619     index = SUBREG_REG (index);
5620 
5621   if (aarch64_sve_data_mode_p (mode))
5622     {
5623       if (type != ADDRESS_REG_REG
5624 	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5625 	return false;
5626     }
5627   else
5628     {
5629       if (shift != 0
5630 	  && !(IN_RANGE (shift, 1, 3)
5631 	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5632 	return false;
5633     }
5634 
5635   if (REG_P (index)
5636       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5637     {
5638       info->type = type;
5639       info->offset = index;
5640       info->shift = shift;
5641       return true;
5642     }
5643 
5644   return false;
5645 }
5646 
5647 /* Return true if MODE is one of the modes for which we
5648    support LDP/STP operations.  */
5649 
5650 static bool
5651 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5652 {
5653   return mode == SImode || mode == DImode
5654 	 || mode == SFmode || mode == DFmode
5655 	 || (aarch64_vector_mode_supported_p (mode)
5656 	     && known_eq (GET_MODE_SIZE (mode), 8));
5657 }
5658 
5659 /* Return true if REGNO is a virtual pointer register, or an eliminable
5660    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
5661    include stack_pointer or hard_frame_pointer.  */
5662 static bool
5663 virt_or_elim_regno_p (unsigned regno)
5664 {
5665   return ((regno >= FIRST_VIRTUAL_REGISTER
5666 	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5667 	  || regno == FRAME_POINTER_REGNUM
5668 	  || regno == ARG_POINTER_REGNUM);
5669 }
5670 
5671 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5672    If it is, fill in INFO appropriately.  STRICT_P is true if
5673    REG_OK_STRICT is in effect.  */
5674 
5675 static bool
5676 aarch64_classify_address (struct aarch64_address_info *info,
5677 			  rtx x, machine_mode mode, bool strict_p,
5678 			  aarch64_addr_query_type type = ADDR_QUERY_M)
5679 {
5680   enum rtx_code code = GET_CODE (x);
5681   rtx op0, op1;
5682   poly_int64 offset;
5683 
5684   HOST_WIDE_INT const_size;
5685 
5686   /* On BE, we use load/store pair for all large int mode load/stores.
5687      TI/TFmode may also use a load/store pair.  */
5688   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5689   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5690   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5691 			    || mode == TImode
5692 			    || mode == TFmode
5693 			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5694 
5695   bool allow_reg_index_p = (!load_store_pair_p
5696 			    && (known_lt (GET_MODE_SIZE (mode), 16)
5697 				|| vec_flags == VEC_ADVSIMD
5698 				|| vec_flags == VEC_SVE_DATA));
5699 
5700   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5701      [Rn, #offset, MUL VL].  */
5702   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5703       && (code != REG && code != PLUS))
5704     return false;
5705 
5706   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5707      REG addressing.  */
5708   if (advsimd_struct_p
5709       && !BYTES_BIG_ENDIAN
5710       && (code != POST_INC && code != REG))
5711     return false;
5712 
5713   gcc_checking_assert (GET_MODE (x) == VOIDmode
5714 		       || SCALAR_INT_MODE_P (GET_MODE (x)));
5715 
5716   switch (code)
5717     {
5718     case REG:
5719     case SUBREG:
5720       info->type = ADDRESS_REG_IMM;
5721       info->base = x;
5722       info->offset = const0_rtx;
5723       info->const_offset = 0;
5724       return aarch64_base_register_rtx_p (x, strict_p);
5725 
5726     case PLUS:
5727       op0 = XEXP (x, 0);
5728       op1 = XEXP (x, 1);
5729 
5730       if (! strict_p
5731 	  && REG_P (op0)
5732 	  && virt_or_elim_regno_p (REGNO (op0))
5733 	  && poly_int_rtx_p (op1, &offset))
5734 	{
5735 	  info->type = ADDRESS_REG_IMM;
5736 	  info->base = op0;
5737 	  info->offset = op1;
5738 	  info->const_offset = offset;
5739 
5740 	  return true;
5741 	}
5742 
5743       if (maybe_ne (GET_MODE_SIZE (mode), 0)
5744 	  && aarch64_base_register_rtx_p (op0, strict_p)
5745 	  && poly_int_rtx_p (op1, &offset))
5746 	{
5747 	  info->type = ADDRESS_REG_IMM;
5748 	  info->base = op0;
5749 	  info->offset = op1;
5750 	  info->const_offset = offset;
5751 
5752 	  /* TImode and TFmode values are allowed in both pairs of X
5753 	     registers and individual Q registers.  The available
5754 	     address modes are:
5755 	     X,X: 7-bit signed scaled offset
5756 	     Q:   9-bit signed offset
5757 	     We conservatively require an offset representable in either mode.
5758 	     When performing the check for pairs of X registers i.e.  LDP/STP
5759 	     pass down DImode since that is the natural size of the LDP/STP
5760 	     instruction memory accesses.  */
5761 	  if (mode == TImode || mode == TFmode)
5762 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5763 		    && (offset_9bit_signed_unscaled_p (mode, offset)
5764 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
5765 
5766 	  /* A 7-bit offset check because OImode will emit an ldp/stp
5767 	     instruction (only big endian will get here).
5768 	     For ldp/stp instructions, the offset is scaled for the size of a
5769 	     single element of the pair.  */
5770 	  if (mode == OImode)
5771 	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5772 
5773 	  /* Three 9/12-bit offset checks because CImode will emit three
5774 	     ldr/str instructions (only big endian will get here).  */
5775 	  if (mode == CImode)
5776 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5777 		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5778 			|| offset_12bit_unsigned_scaled_p (V16QImode,
5779 							   offset + 32)));
5780 
5781 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
5782 	     instructions (only big endian will get here).  */
5783 	  if (mode == XImode)
5784 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5785 		    && aarch64_offset_7bit_signed_scaled_p (TImode,
5786 							    offset + 32));
5787 
5788 	  /* Make "m" use the LD1 offset range for SVE data modes, so
5789 	     that pre-RTL optimizers like ivopts will work to that
5790 	     instead of the wider LDR/STR range.  */
5791 	  if (vec_flags == VEC_SVE_DATA)
5792 	    return (type == ADDR_QUERY_M
5793 		    ? offset_4bit_signed_scaled_p (mode, offset)
5794 		    : offset_9bit_signed_scaled_p (mode, offset));
5795 
5796 	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5797 	    {
5798 	      poly_int64 end_offset = (offset
5799 				       + GET_MODE_SIZE (mode)
5800 				       - BYTES_PER_SVE_VECTOR);
5801 	      return (type == ADDR_QUERY_M
5802 		      ? offset_4bit_signed_scaled_p (mode, offset)
5803 		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5804 			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5805 							 end_offset)));
5806 	    }
5807 
5808 	  if (vec_flags == VEC_SVE_PRED)
5809 	    return offset_9bit_signed_scaled_p (mode, offset);
5810 
5811 	  if (load_store_pair_p)
5812 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
5813 		     || known_eq (GET_MODE_SIZE (mode), 8))
5814 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5815 	  else
5816 	    return (offset_9bit_signed_unscaled_p (mode, offset)
5817 		    || offset_12bit_unsigned_scaled_p (mode, offset));
5818 	}
5819 
5820       if (allow_reg_index_p)
5821 	{
5822 	  /* Look for base + (scaled/extended) index register.  */
5823 	  if (aarch64_base_register_rtx_p (op0, strict_p)
5824 	      && aarch64_classify_index (info, op1, mode, strict_p))
5825 	    {
5826 	      info->base = op0;
5827 	      return true;
5828 	    }
5829 	  if (aarch64_base_register_rtx_p (op1, strict_p)
5830 	      && aarch64_classify_index (info, op0, mode, strict_p))
5831 	    {
5832 	      info->base = op1;
5833 	      return true;
5834 	    }
5835 	}
5836 
5837       return false;
5838 
5839     case POST_INC:
5840     case POST_DEC:
5841     case PRE_INC:
5842     case PRE_DEC:
5843       info->type = ADDRESS_REG_WB;
5844       info->base = XEXP (x, 0);
5845       info->offset = NULL_RTX;
5846       return aarch64_base_register_rtx_p (info->base, strict_p);
5847 
5848     case POST_MODIFY:
5849     case PRE_MODIFY:
5850       info->type = ADDRESS_REG_WB;
5851       info->base = XEXP (x, 0);
5852       if (GET_CODE (XEXP (x, 1)) == PLUS
5853 	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5854 	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5855 	  && aarch64_base_register_rtx_p (info->base, strict_p))
5856 	{
5857 	  info->offset = XEXP (XEXP (x, 1), 1);
5858 	  info->const_offset = offset;
5859 
5860 	  /* TImode and TFmode values are allowed in both pairs of X
5861 	     registers and individual Q registers.  The available
5862 	     address modes are:
5863 	     X,X: 7-bit signed scaled offset
5864 	     Q:   9-bit signed offset
5865 	     We conservatively require an offset representable in either mode.
5866 	   */
5867 	  if (mode == TImode || mode == TFmode)
5868 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5869 		    && offset_9bit_signed_unscaled_p (mode, offset));
5870 
5871 	  if (load_store_pair_p)
5872 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
5873 		     || known_eq (GET_MODE_SIZE (mode), 8))
5874 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5875 	  else
5876 	    return offset_9bit_signed_unscaled_p (mode, offset);
5877 	}
5878       return false;
5879 
5880     case CONST:
5881     case SYMBOL_REF:
5882     case LABEL_REF:
5883       /* load literal: pc-relative constant pool entry.  Only supported
5884          for SI mode or larger.  */
5885       info->type = ADDRESS_SYMBOLIC;
5886 
5887       if (!load_store_pair_p
5888 	  && GET_MODE_SIZE (mode).is_constant (&const_size)
5889 	  && const_size >= 4)
5890 	{
5891 	  rtx sym, addend;
5892 
5893 	  split_const (x, &sym, &addend);
5894 	  return ((GET_CODE (sym) == LABEL_REF
5895 		   || (GET_CODE (sym) == SYMBOL_REF
5896 		       && CONSTANT_POOL_ADDRESS_P (sym)
5897 		       && aarch64_pcrelative_literal_loads)));
5898 	}
5899       return false;
5900 
5901     case LO_SUM:
5902       info->type = ADDRESS_LO_SUM;
5903       info->base = XEXP (x, 0);
5904       info->offset = XEXP (x, 1);
5905       if (allow_reg_index_p
5906 	  && aarch64_base_register_rtx_p (info->base, strict_p))
5907 	{
5908 	  rtx sym, offs;
5909 	  split_const (info->offset, &sym, &offs);
5910 	  if (GET_CODE (sym) == SYMBOL_REF
5911 	      && (aarch64_classify_symbol (sym, INTVAL (offs))
5912 		  == SYMBOL_SMALL_ABSOLUTE))
5913 	    {
5914 	      /* The symbol and offset must be aligned to the access size.  */
5915 	      unsigned int align;
5916 
5917 	      if (CONSTANT_POOL_ADDRESS_P (sym))
5918 		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5919 	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5920 		{
5921 		  tree exp = SYMBOL_REF_DECL (sym);
5922 		  align = TYPE_ALIGN (TREE_TYPE (exp));
5923 		  align = aarch64_constant_alignment (exp, align);
5924 		}
5925 	      else if (SYMBOL_REF_DECL (sym))
5926 		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5927 	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5928 		       && SYMBOL_REF_BLOCK (sym) != NULL)
5929 		align = SYMBOL_REF_BLOCK (sym)->alignment;
5930 	      else
5931 		align = BITS_PER_UNIT;
5932 
5933 	      poly_int64 ref_size = GET_MODE_SIZE (mode);
5934 	      if (known_eq (ref_size, 0))
5935 		ref_size = GET_MODE_SIZE (DImode);
5936 
5937 	      return (multiple_p (INTVAL (offs), ref_size)
5938 		      && multiple_p (align / BITS_PER_UNIT, ref_size));
5939 	    }
5940 	}
5941       return false;
5942 
5943     default:
5944       return false;
5945     }
5946 }
5947 
5948 /* Return true if the address X is valid for a PRFM instruction.
5949    STRICT_P is true if we should do strict checking with
5950    aarch64_classify_address.  */
5951 
5952 bool
5953 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5954 {
5955   struct aarch64_address_info addr;
5956 
5957   /* PRFM accepts the same addresses as DImode...  */
5958   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5959   if (!res)
5960     return false;
5961 
5962   /* ... except writeback forms.  */
5963   return addr.type != ADDRESS_REG_WB;
5964 }
5965 
5966 bool
5967 aarch64_symbolic_address_p (rtx x)
5968 {
5969   rtx offset;
5970 
5971   split_const (x, &x, &offset);
5972   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5973 }
5974 
5975 /* Classify the base of symbolic expression X.  */
5976 
5977 enum aarch64_symbol_type
5978 aarch64_classify_symbolic_expression (rtx x)
5979 {
5980   rtx offset;
5981 
5982   split_const (x, &x, &offset);
5983   return aarch64_classify_symbol (x, INTVAL (offset));
5984 }
5985 
5986 
5987 /* Return TRUE if X is a legitimate address for accessing memory in
5988    mode MODE.  */
5989 static bool
5990 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5991 {
5992   struct aarch64_address_info addr;
5993 
5994   return aarch64_classify_address (&addr, x, mode, strict_p);
5995 }
5996 
5997 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5998    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
5999 bool
6000 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6001 			      aarch64_addr_query_type type)
6002 {
6003   struct aarch64_address_info addr;
6004 
6005   return aarch64_classify_address (&addr, x, mode, strict_p, type);
6006 }
6007 
6008 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
6009 
6010 static bool
6011 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6012 					 poly_int64 orig_offset,
6013 					 machine_mode mode)
6014 {
6015   HOST_WIDE_INT size;
6016   if (GET_MODE_SIZE (mode).is_constant (&size))
6017     {
6018       HOST_WIDE_INT const_offset, second_offset;
6019 
6020       /* A general SVE offset is A * VQ + B.  Remove the A component from
6021 	 coefficient 0 in order to get the constant B.  */
6022       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6023 
6024       /* Split an out-of-range address displacement into a base and
6025 	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
6026 	 range otherwise to increase opportunities for sharing the base
6027 	 address of different sizes.  Unaligned accesses use the signed
6028 	 9-bit range, TImode/TFmode use the intersection of signed
6029 	 scaled 7-bit and signed 9-bit offset.  */
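      /* For example (illustration only), a DImode access at offset 0x10008
	 is split into an anchor offset of 0x10000 plus a second offset of
	 0x8, so the 0x10000 anchor can be shared with neighbouring
	 accesses.  */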
6030       if (mode == TImode || mode == TFmode)
6031 	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6032       else if ((const_offset & (size - 1)) != 0)
6033 	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6034       else
6035 	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6036 
6037       if (second_offset == 0 || known_eq (orig_offset, second_offset))
6038 	return false;
6039 
6040       /* Split the offset into second_offset and the rest.  */
6041       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6042       *offset2 = gen_int_mode (second_offset, Pmode);
6043       return true;
6044     }
6045   else
6046     {
6047       /* Get the mode we should use as the basis of the range.  For structure
6048 	 modes this is the mode of one vector.  */
6049       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6050       machine_mode step_mode
6051 	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6052 
6053       /* Get the "mul vl" multiplier we'd like to use.  */
6054       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6055       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6056       if (vec_flags & VEC_SVE_DATA)
6057 	/* LDR supports a 9-bit range, but the move patterns for
6058 	   structure modes require all vectors to be in range of the
6059 	   same base.  The simplest way of accommodating that while still
6060 	   promoting reuse of anchor points between different modes is
6061 	   to use an 8-bit range unconditionally.  */
6062 	vnum = ((vnum + 128) & 255) - 128;
6063       else
6064 	/* Predicates are only handled singly, so we might as well use
6065 	   the full range.  */
6066 	vnum = ((vnum + 256) & 511) - 256;
6067       if (vnum == 0)
6068 	return false;
6069 
6070       /* Convert the "mul vl" multiplier into a byte offset.  */
6071       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6072       if (known_eq (second_offset, orig_offset))
6073 	return false;
6074 
6075       /* Split the offset into second_offset and the rest.  */
6076       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6077       *offset2 = gen_int_mode (second_offset, Pmode);
6078       return true;
6079     }
6080 }
6081 
6082 /* Return the binary representation of floating point constant VALUE in INTVAL.
6083    If the value cannot be converted, return false without setting INTVAL.
6084    The conversion is done in the given MODE.  */
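/* For example, the DFmode constant 1.0 is returned as the IEEE 754 bit
   pattern 0x3ff0000000000000.  */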
6085 bool
6086 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6087 {
6088 
6089   /* We make a general exception for 0.  */
6090   if (aarch64_float_const_zero_rtx_p (value))
6091     {
6092       *intval = 0;
6093       return true;
6094     }
6095 
6096   scalar_float_mode mode;
6097   if (GET_CODE (value) != CONST_DOUBLE
6098       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6099       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6100       /* Only support up to DF mode.  */
6101       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6102     return false;
6103 
6104   unsigned HOST_WIDE_INT ival = 0;
6105 
6106   long res[2];
6107   real_to_target (res,
6108 		  CONST_DOUBLE_REAL_VALUE (value),
6109 		  REAL_MODE_FORMAT (mode));
6110 
6111   if (mode == DFmode)
6112     {
6113       int order = BYTES_BIG_ENDIAN ? 1 : 0;
6114       ival = zext_hwi (res[order], 32);
6115       ival |= (zext_hwi (res[1 - order], 32) << 32);
6116     }
6117   else
6118       ival = zext_hwi (res[0], 32);
6119 
6120   *intval = ival;
6121   return true;
6122 }
6123 
6124 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6125    single MOV(+MOVK) followed by an FMOV.  */
6126 bool
6127 aarch64_float_const_rtx_p (rtx x)
6128 {
6129   machine_mode mode = GET_MODE (x);
6130   if (mode == VOIDmode)
6131     return false;
6132 
6133   /* Determine whether it's cheaper to write float constants as
6134      mov/movk pairs rather than as ldr/adrp pairs.  */
6135   unsigned HOST_WIDE_INT ival;
6136 
6137   if (GET_CODE (x) == CONST_DOUBLE
6138       && SCALAR_FLOAT_MODE_P (mode)
6139       && aarch64_reinterpret_float_as_int (x, &ival))
6140     {
6141       scalar_int_mode imode = (mode == HFmode
6142 			       ? SImode
6143 			       : int_mode_for_mode (mode).require ());
6144       int num_instr = aarch64_internal_mov_immediate
6145 			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
6146       return num_instr < 3;
6147     }
6148 
6149   return false;
6150 }
6151 
6152 /* Return TRUE if rtx X is the immediate floating-point constant 0.0.  */
6153 bool
6154 aarch64_float_const_zero_rtx_p (rtx x)
6155 {
6156   if (GET_MODE (x) == VOIDmode)
6157     return false;
6158 
6159   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6160     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6161   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6162 }
6163 
6164 /* Return TRUE if rtx X is an immediate constant that can be loaded with
6165    a single MOVI immediate operation.  */
6166 bool
6167 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6168 {
6169   if (!TARGET_SIMD)
6170      return false;
6171 
6172   machine_mode vmode;
6173   scalar_int_mode imode;
6174   unsigned HOST_WIDE_INT ival;
6175 
6176   if (GET_CODE (x) == CONST_DOUBLE
6177       && SCALAR_FLOAT_MODE_P (mode))
6178     {
6179       if (!aarch64_reinterpret_float_as_int (x, &ival))
6180 	return false;
6181 
6182       /* We make a general exception for 0.  */
6183       if (aarch64_float_const_zero_rtx_p (x))
6184 	return true;
6185 
6186       imode = int_mode_for_mode (mode).require ();
6187     }
6188   else if (GET_CODE (x) == CONST_INT
6189 	   && is_a <scalar_int_mode> (mode, &imode))
6190     ival = INTVAL (x);
6191   else
6192     return false;
6193 
6194   /* Use a 64-bit vector container mode for everything except DImode/DFmode,
6195      where we use a 128-bit vector mode.  */
6196   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6197 
6198   vmode = aarch64_simd_container_mode (imode, width);
6199   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6200 
6201   return aarch64_simd_valid_immediate (v_op, NULL);
6202 }
6203 
6204 
6205 /* Return the fixed registers used for condition codes.  */
6206 
6207 static bool
6208 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6209 {
6210   *p1 = CC_REGNUM;
6211   *p2 = INVALID_REGNUM;
6212   return true;
6213 }
6214 
6215 /* This function is used by the call expanders of the machine description.
6216    RESULT is the register in which the result is returned.  It's NULL for
6217    "call" and "sibcall".
6218    MEM is the location of the function call.
6219    SIBCALL indicates whether this function call is a normal call or a sibling
6220    call; a different pattern is generated accordingly.  */
6221 
6222 void
6223 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6224 {
6225   rtx call, callee, tmp;
6226   rtvec vec;
6227   machine_mode mode;
6228 
6229   gcc_assert (MEM_P (mem));
6230   callee = XEXP (mem, 0);
6231   mode = GET_MODE (callee);
6232   gcc_assert (mode == Pmode);
6233 
6234   /* Decide if we should generate indirect calls by loading the
6235      address of the callee into a register before performing
6236      the branch-and-link.  */
6237   if (SYMBOL_REF_P (callee)
6238       ? (aarch64_is_long_call_p (callee)
6239 	 || aarch64_is_noplt_call_p (callee))
6240       : !REG_P (callee))
6241     XEXP (mem, 0) = force_reg (mode, callee);
6242 
6243   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6244 
6245   if (result != NULL_RTX)
6246     call = gen_rtx_SET (result, call);
6247 
6248   if (sibcall)
6249     tmp = ret_rtx;
6250   else
6251     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6252 
6253   vec = gen_rtvec (2, call, tmp);
6254   call = gen_rtx_PARALLEL (VOIDmode, vec);
6255 
6256   aarch64_emit_call_insn (call);
6257 }
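
/* For illustration, the pattern emitted above is roughly:
     (parallel [(call (mem ADDR) (const_int 0))
		(clobber (reg LR))])
   for a normal call (wrapped in a SET of RESULT when a value is returned),
   with the CLOBBER replaced by (return) for a sibling call.  */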
6258 
6259 /* Emit call insn with PAT and do aarch64-specific handling.  */
6260 
6261 void
6262 aarch64_emit_call_insn (rtx pat)
6263 {
6264   rtx insn = emit_call_insn (pat);
6265 
6266   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6267   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6268   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6269 }
6270 
6271 machine_mode
6272 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6273 {
6274   /* All floating point compares return CCFP if it is an equality
6275      comparison, and CCFPE otherwise.  */
6276   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6277     {
6278       switch (code)
6279 	{
6280 	case EQ:
6281 	case NE:
6282 	case UNORDERED:
6283 	case ORDERED:
6284 	case UNLT:
6285 	case UNLE:
6286 	case UNGT:
6287 	case UNGE:
6288 	case UNEQ:
6289 	  return CCFPmode;
6290 
6291 	case LT:
6292 	case LE:
6293 	case GT:
6294 	case GE:
6295 	case LTGT:
6296 	  return CCFPEmode;
6297 
6298 	default:
6299 	  gcc_unreachable ();
6300 	}
6301     }
6302 
6303   /* Equality comparisons of short modes against zero can be performed
6304      using the TST instruction with the appropriate bitmask.  */
6305   if (y == const0_rtx && REG_P (x)
6306       && (code == EQ || code == NE)
6307       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6308     return CC_NZmode;
6309 
6310   /* Similarly, comparisons of zero_extends from shorter modes can
6311      be performed using an ANDS with an immediate mask.  */
6312   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6313       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6314       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6315       && (code == EQ || code == NE))
6316     return CC_NZmode;
6317 
6318   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6319       && y == const0_rtx
6320       && (code == EQ || code == NE || code == LT || code == GE)
6321       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6322 	  || GET_CODE (x) == NEG
6323 	  || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6324 	      && CONST_INT_P (XEXP (x, 2)))))
6325     return CC_NZmode;
6326 
6327   /* A compare with a shifted operand.  Because of canonicalization,
6328      the comparison will have to be swapped when we emit the assembly
6329      code.  */
6330   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6331       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6332       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6333 	  || GET_CODE (x) == LSHIFTRT
6334 	  || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6335     return CC_SWPmode;
6336 
6337   /* Similarly for a negated operand, but we can only do this for
6338      equalities.  */
6339   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6340       && (REG_P (y) || GET_CODE (y) == SUBREG)
6341       && (code == EQ || code == NE)
6342       && GET_CODE (x) == NEG)
6343     return CC_Zmode;
6344 
6345   /* A test for unsigned overflow.  */
6346   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6347       && code == NE
6348       && GET_CODE (x) == PLUS
6349       && GET_CODE (y) == ZERO_EXTEND)
6350     return CC_Cmode;
6351 
6352   /* For everything else, return CCmode.  */
6353   return CCmode;
6354 }
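
/* For illustration: an EQ/NE test of (plus x y) against zero selects
   CC_NZmode, so the comparison can be implemented by ADDS; a comparison
   against a shifted operand such as (ashift (reg) (const_int 2)) selects
   CC_SWPmode, recording that the operands must be swapped when the
   assembly is output.  */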
6355 
6356 static int
6357 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6358 
6359 int
6360 aarch64_get_condition_code (rtx x)
6361 {
6362   machine_mode mode = GET_MODE (XEXP (x, 0));
6363   enum rtx_code comp_code = GET_CODE (x);
6364 
6365   if (GET_MODE_CLASS (mode) != MODE_CC)
6366     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6367   return aarch64_get_condition_code_1 (mode, comp_code);
6368 }
6369 
6370 static int
6371 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6372 {
6373   switch (mode)
6374     {
6375     case E_CCFPmode:
6376     case E_CCFPEmode:
6377       switch (comp_code)
6378 	{
6379 	case GE: return AARCH64_GE;
6380 	case GT: return AARCH64_GT;
6381 	case LE: return AARCH64_LS;
6382 	case LT: return AARCH64_MI;
6383 	case NE: return AARCH64_NE;
6384 	case EQ: return AARCH64_EQ;
6385 	case ORDERED: return AARCH64_VC;
6386 	case UNORDERED: return AARCH64_VS;
6387 	case UNLT: return AARCH64_LT;
6388 	case UNLE: return AARCH64_LE;
6389 	case UNGT: return AARCH64_HI;
6390 	case UNGE: return AARCH64_PL;
6391 	default: return -1;
6392 	}
6393       break;
6394 
6395     case E_CCmode:
6396       switch (comp_code)
6397 	{
6398 	case NE: return AARCH64_NE;
6399 	case EQ: return AARCH64_EQ;
6400 	case GE: return AARCH64_GE;
6401 	case GT: return AARCH64_GT;
6402 	case LE: return AARCH64_LE;
6403 	case LT: return AARCH64_LT;
6404 	case GEU: return AARCH64_CS;
6405 	case GTU: return AARCH64_HI;
6406 	case LEU: return AARCH64_LS;
6407 	case LTU: return AARCH64_CC;
6408 	default: return -1;
6409 	}
6410       break;
6411 
6412     case E_CC_SWPmode:
6413       switch (comp_code)
6414 	{
6415 	case NE: return AARCH64_NE;
6416 	case EQ: return AARCH64_EQ;
6417 	case GE: return AARCH64_LE;
6418 	case GT: return AARCH64_LT;
6419 	case LE: return AARCH64_GE;
6420 	case LT: return AARCH64_GT;
6421 	case GEU: return AARCH64_LS;
6422 	case GTU: return AARCH64_CC;
6423 	case LEU: return AARCH64_CS;
6424 	case LTU: return AARCH64_HI;
6425 	default: return -1;
6426 	}
6427       break;
6428 
6429     case E_CC_NZmode:
6430       switch (comp_code)
6431 	{
6432 	case NE: return AARCH64_NE;
6433 	case EQ: return AARCH64_EQ;
6434 	case GE: return AARCH64_PL;
6435 	case LT: return AARCH64_MI;
6436 	default: return -1;
6437 	}
6438       break;
6439 
6440     case E_CC_Zmode:
6441       switch (comp_code)
6442 	{
6443 	case NE: return AARCH64_NE;
6444 	case EQ: return AARCH64_EQ;
6445 	default: return -1;
6446 	}
6447       break;
6448 
6449     case E_CC_Cmode:
6450       switch (comp_code)
6451 	{
6452 	case NE: return AARCH64_CS;
6453 	case EQ: return AARCH64_CC;
6454 	default: return -1;
6455 	}
6456       break;
6457 
6458     default:
6459       return -1;
6460     }
6461 
6462   return -1;
6463 }
6464 
6465 bool
6466 aarch64_const_vec_all_same_in_range_p (rtx x,
6467 				       HOST_WIDE_INT minval,
6468 				       HOST_WIDE_INT maxval)
6469 {
6470   rtx elt;
6471   return (const_vec_duplicate_p (x, &elt)
6472 	  && CONST_INT_P (elt)
6473 	  && IN_RANGE (INTVAL (elt), minval, maxval));
6474 }
6475 
6476 bool
6477 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6478 {
6479   return aarch64_const_vec_all_same_in_range_p (x, val, val);
6480 }
6481 
6482 /* Return true if VEC is a constant in which every element is in the range
6483    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
6484 
6485 static bool
6486 aarch64_const_vec_all_in_range_p (rtx vec,
6487 				  HOST_WIDE_INT minval,
6488 				  HOST_WIDE_INT maxval)
6489 {
6490   if (GET_CODE (vec) != CONST_VECTOR
6491       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6492     return false;
6493 
6494   int nunits;
6495   if (!CONST_VECTOR_STEPPED_P (vec))
6496     nunits = const_vector_encoded_nelts (vec);
6497   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6498     return false;
6499 
6500   for (int i = 0; i < nunits; i++)
6501     {
6502       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6503       if (!CONST_INT_P (vec_elem)
6504 	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6505 	return false;
6506     }
6507   return true;
6508 }
6509 
6510 /* N Z C V.  */
6511 #define AARCH64_CC_V 1
6512 #define AARCH64_CC_C (1 << 1)
6513 #define AARCH64_CC_Z (1 << 2)
6514 #define AARCH64_CC_N (1 << 3)
6515 
6516 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
6517 static const int aarch64_nzcv_codes[] =
6518 {
6519   0,		/* EQ, Z == 1.  */
6520   AARCH64_CC_Z,	/* NE, Z == 0.  */
6521   0,		/* CS, C == 1.  */
6522   AARCH64_CC_C,	/* CC, C == 0.  */
6523   0,		/* MI, N == 1.  */
6524   AARCH64_CC_N, /* PL, N == 0.  */
6525   0,		/* VS, V == 1.  */
6526   AARCH64_CC_V, /* VC, V == 0.  */
6527   0,		/* HI, C == 1 && Z == 0.  */
6528   AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
6529   AARCH64_CC_V,	/* GE, N == V.  */
6530   0,		/* LT, N != V.  */
6531   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
6532   0,		/* LE, !(Z == 0 && N == V).  */
6533   0,		/* AL, Any.  */
6534   0		/* NV, Any.  */
6535 };
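
/* For illustration: each entry gives flag values under which the listed
   condition does not hold, e.g. aarch64_nzcv_codes[AARCH64_GE] is
   AARCH64_CC_V (N = 0, V = 1, hence N != V).  These values are printed
   by the '%k' operand code below as the NZCV immediate of a CCMP.  */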
6536 
6537 /* Print floating-point vector immediate operand X to F, negating it
6538    first if NEGATE is true.  Return true on success, false if it isn't
6539    a constant we can handle.  */
6540 
6541 static bool
6542 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6543 {
6544   rtx elt;
6545 
6546   if (!const_vec_duplicate_p (x, &elt))
6547     return false;
6548 
6549   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6550   if (negate)
6551     r = real_value_negate (&r);
6552 
6553   /* We only handle the SVE single-bit immediates here.  */
6554   if (real_equal (&r, &dconst0))
6555     asm_fprintf (f, "0.0");
6556   else if (real_equal (&r, &dconst1))
6557     asm_fprintf (f, "1.0");
6558   else if (real_equal (&r, &dconsthalf))
6559     asm_fprintf (f, "0.5");
6560   else
6561     return false;
6562 
6563   return true;
6564 }
6565 
6566 /* Return the equivalent letter for size.  */
6567 static char
6568 sizetochar (int size)
6569 {
6570   switch (size)
6571     {
6572     case 64: return 'd';
6573     case 32: return 's';
6574     case 16: return 'h';
6575     case 8 : return 'b';
6576     default: gcc_unreachable ();
6577     }
6578 }
6579 
6580 /* Print operand X to file F in a target specific manner according to CODE.
6581    The acceptable formatting commands given by CODE are:
6582      'c':		An integer or symbol address without a preceding #
6583 			sign.
6584      'C':		Take the duplicated element in a vector constant
6585 			and print it in hex.
6586      'D':		Take the duplicated element in a vector constant
6587 			and print it as an unsigned integer, in decimal.
6588      'e':		Print the sign/zero-extend size as a character 8->b,
6589 			16->h, 32->w.
6590      'p':		Prints N such that 2^N == X (X must be power of 2 and
6591 			const int).
6592      'P':		Print the number of non-zero bits in X (a const_int).
6593      'H':		Print the higher numbered register of a pair (TImode)
6594 			of regs.
6595      'm':		Print a condition (eq, ne, etc).
6596      'M':		Same as 'm', but invert condition.
6597      'N':		Take the duplicated element in a vector constant
6598 			and print the negative of it in decimal.
6599      'b/h/s/d/q':	Print a scalar FP/SIMD register name.
6600      'S/T/U/V':		Print a FP/SIMD register name for a register list.
6601 			The register printed is the FP/SIMD register name
6602 			of X + 0/1/2/3 for S/T/U/V.
6603      'R':		Print a scalar FP/SIMD register name + 1.
6604      'X':		Print bottom 16 bits of integer constant in hex.
6605      'w/x':		Print a general register name or the zero register
6606 			(32-bit or 64-bit).
6607      '0':		Print a normal operand, if it's a general register,
6608 			then we assume DImode.
6609      'k':		Print NZCV for conditional compare instructions.
6610      'A':		Output address constant representing the first
6611 			argument of X, specifying a relocation offset
6612 			if appropriate.
6613      'L':		Output constant address specified by X
6614 			with a relocation offset if appropriate.
6615      'G':		Prints address of X, specifying a PC relative
6616 			relocation mode if appropriate.
6617      'y':		Output address of LDP or STP - this is used for
6618 			some LDP/STPs which don't use a PARALLEL in their
6619 			pattern (so the mode needs to be adjusted).
6620      'z':		Output address of a typical LDP or STP.  */
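
/* For illustration (hypothetical operands): with operand 0 = (reg:DI x3),
   "%x0" prints "x3", "%w0" prints "w3" and "%0" prints "x3"; with
   operand 0 = (const_int 16), "%p0" prints "4" (log2) and "%P0" prints "1"
   (popcount).  */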
6621 
6622 static void
6623 aarch64_print_operand (FILE *f, rtx x, int code)
6624 {
6625   rtx elt;
6626   switch (code)
6627     {
6628     case 'c':
6629       switch (GET_CODE (x))
6630 	{
6631 	case CONST_INT:
6632 	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6633 	  break;
6634 
6635 	case SYMBOL_REF:
6636 	  output_addr_const (f, x);
6637 	  break;
6638 
6639 	case CONST:
6640 	  if (GET_CODE (XEXP (x, 0)) == PLUS
6641 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6642 	    {
6643 	      output_addr_const (f, x);
6644 	      break;
6645 	    }
6646 	  /* Fall through.  */
6647 
6648 	default:
6649 	  output_operand_lossage ("unsupported operand for code '%c'", code);
6650 	}
6651       break;
6652 
6653     case 'e':
6654       {
6655 	int n;
6656 
6657 	if (!CONST_INT_P (x)
6658 	    || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6659 	  {
6660 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6661 	    return;
6662 	  }
6663 
6664 	switch (n)
6665 	  {
6666 	  case 3:
6667 	    fputc ('b', f);
6668 	    break;
6669 	  case 4:
6670 	    fputc ('h', f);
6671 	    break;
6672 	  case 5:
6673 	    fputc ('w', f);
6674 	    break;
6675 	  default:
6676 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6677 	    return;
6678 	  }
6679       }
6680       break;
6681 
6682     case 'p':
6683       {
6684 	int n;
6685 
6686 	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6687 	  {
6688 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6689 	    return;
6690 	  }
6691 
6692 	asm_fprintf (f, "%d", n);
6693       }
6694       break;
6695 
6696     case 'P':
6697       if (!CONST_INT_P (x))
6698 	{
6699 	  output_operand_lossage ("invalid operand for '%%%c'", code);
6700 	  return;
6701 	}
6702 
6703       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6704       break;
6705 
6706     case 'H':
6707       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6708 	{
6709 	  output_operand_lossage ("invalid operand for '%%%c'", code);
6710 	  return;
6711 	}
6712 
6713       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6714       break;
6715 
6716     case 'M':
6717     case 'm':
6718       {
6719         int cond_code;
6720 	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
6721 	if (x == const_true_rtx)
6722 	  {
6723 	    if (code == 'M')
6724 	      fputs ("nv", f);
6725 	    return;
6726 	  }
6727 
6728         if (!COMPARISON_P (x))
6729 	  {
6730 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6731 	    return;
6732 	  }
6733 
6734         cond_code = aarch64_get_condition_code (x);
6735         gcc_assert (cond_code >= 0);
6736 	if (code == 'M')
6737 	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6738 	fputs (aarch64_condition_codes[cond_code], f);
6739       }
6740       break;
6741 
6742     case 'N':
6743       if (!const_vec_duplicate_p (x, &elt))
6744 	{
6745 	  output_operand_lossage ("invalid vector constant");
6746 	  return;
6747 	}
6748 
6749       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6750 	asm_fprintf (f, "%wd", -INTVAL (elt));
6751       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6752 	       && aarch64_print_vector_float_operand (f, x, true))
6753 	;
6754       else
6755 	{
6756 	  output_operand_lossage ("invalid vector constant");
6757 	  return;
6758 	}
6759       break;
6760 
6761     case 'b':
6762     case 'h':
6763     case 's':
6764     case 'd':
6765     case 'q':
6766       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6767 	{
6768 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6769 	  return;
6770 	}
6771       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6772       break;
6773 
6774     case 'S':
6775     case 'T':
6776     case 'U':
6777     case 'V':
6778       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6779 	{
6780 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6781 	  return;
6782 	}
6783       asm_fprintf (f, "%c%d",
6784 		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6785 		   REGNO (x) - V0_REGNUM + (code - 'S'));
6786       break;
6787 
6788     case 'R':
6789       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6790 	{
6791 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6792 	  return;
6793 	}
6794       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6795       break;
6796 
6797     case 'X':
6798       if (!CONST_INT_P (x))
6799 	{
6800 	  output_operand_lossage ("invalid operand for '%%%c'", code);
6801 	  return;
6802 	}
6803       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6804       break;
6805 
6806     case 'C':
6807       {
6808 	/* Print a replicated constant in hex.  */
6809 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6810 	  {
6811 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6812 	    return;
6813 	  }
6814 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6815 	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6816       }
6817       break;
6818 
6819     case 'D':
6820       {
6821 	/* Print a replicated constant in decimal, treating it as
6822 	   unsigned.  */
6823 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6824 	  {
6825 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6826 	    return;
6827 	  }
6828 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6829 	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6830       }
6831       break;
6832 
6833     case 'w':
6834     case 'x':
6835       if (x == const0_rtx
6836 	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6837 	{
6838 	  asm_fprintf (f, "%czr", code);
6839 	  break;
6840 	}
6841 
6842       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6843 	{
6844 	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6845 	  break;
6846 	}
6847 
6848       if (REG_P (x) && REGNO (x) == SP_REGNUM)
6849 	{
6850 	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6851 	  break;
6852 	}
6853 
6854       /* Fall through */
6855 
6856     case 0:
6857       if (x == NULL)
6858 	{
6859 	  output_operand_lossage ("missing operand");
6860 	  return;
6861 	}
6862 
6863       switch (GET_CODE (x))
6864 	{
6865 	case REG:
6866 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
6867 	    {
6868 	      if (REG_NREGS (x) == 1)
6869 		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6870 	      else
6871 		{
6872 		  char suffix
6873 		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6874 		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
6875 			       REGNO (x) - V0_REGNUM, suffix,
6876 			       END_REGNO (x) - V0_REGNUM - 1, suffix);
6877 		}
6878 	    }
6879 	  else
6880 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6881 	  break;
6882 
6883 	case MEM:
6884 	  output_address (GET_MODE (x), XEXP (x, 0));
6885 	  break;
6886 
6887 	case LABEL_REF:
6888 	case SYMBOL_REF:
6889 	  output_addr_const (asm_out_file, x);
6890 	  break;
6891 
6892 	case CONST_INT:
6893 	  asm_fprintf (f, "%wd", INTVAL (x));
6894 	  break;
6895 
6896 	case CONST:
6897 	  if (!VECTOR_MODE_P (GET_MODE (x)))
6898 	    {
6899 	      output_addr_const (asm_out_file, x);
6900 	      break;
6901 	    }
6902 	  /* fall through */
6903 
6904 	case CONST_VECTOR:
6905 	  if (!const_vec_duplicate_p (x, &elt))
6906 	    {
6907 	      output_operand_lossage ("invalid vector constant");
6908 	      return;
6909 	    }
6910 
6911 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6912 	    asm_fprintf (f, "%wd", INTVAL (elt));
6913 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6914 		   && aarch64_print_vector_float_operand (f, x, false))
6915 	    ;
6916 	  else
6917 	    {
6918 	      output_operand_lossage ("invalid vector constant");
6919 	      return;
6920 	    }
6921 	  break;
6922 
6923 	case CONST_DOUBLE:
6924 	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6925 	     be getting CONST_DOUBLEs holding integers.  */
6926 	  gcc_assert (GET_MODE (x) != VOIDmode);
6927 	  if (aarch64_float_const_zero_rtx_p (x))
6928 	    {
6929 	      fputc ('0', f);
6930 	      break;
6931 	    }
6932 	  else if (aarch64_float_const_representable_p (x))
6933 	    {
6934 #define buf_size 20
6935 	      char float_buf[buf_size] = {'\0'};
6936 	      real_to_decimal_for_mode (float_buf,
6937 					CONST_DOUBLE_REAL_VALUE (x),
6938 					buf_size, buf_size,
6939 					1, GET_MODE (x));
6940 	      asm_fprintf (asm_out_file, "%s", float_buf);
6941 	      break;
6942 #undef buf_size
6943 	    }
6944 	  output_operand_lossage ("invalid constant");
6945 	  return;
6946 	default:
6947 	  output_operand_lossage ("invalid operand");
6948 	  return;
6949 	}
6950       break;
6951 
6952     case 'A':
6953       if (GET_CODE (x) == HIGH)
6954 	x = XEXP (x, 0);
6955 
6956       switch (aarch64_classify_symbolic_expression (x))
6957 	{
6958 	case SYMBOL_SMALL_GOT_4G:
6959 	  asm_fprintf (asm_out_file, ":got:");
6960 	  break;
6961 
6962 	case SYMBOL_SMALL_TLSGD:
6963 	  asm_fprintf (asm_out_file, ":tlsgd:");
6964 	  break;
6965 
6966 	case SYMBOL_SMALL_TLSDESC:
6967 	  asm_fprintf (asm_out_file, ":tlsdesc:");
6968 	  break;
6969 
6970 	case SYMBOL_SMALL_TLSIE:
6971 	  asm_fprintf (asm_out_file, ":gottprel:");
6972 	  break;
6973 
6974 	case SYMBOL_TLSLE24:
6975 	  asm_fprintf (asm_out_file, ":tprel:");
6976 	  break;
6977 
6978 	case SYMBOL_TINY_GOT:
6979 	  gcc_unreachable ();
6980 	  break;
6981 
6982 	default:
6983 	  break;
6984 	}
6985       output_addr_const (asm_out_file, x);
6986       break;
6987 
6988     case 'L':
6989       switch (aarch64_classify_symbolic_expression (x))
6990 	{
6991 	case SYMBOL_SMALL_GOT_4G:
6992 	  asm_fprintf (asm_out_file, ":lo12:");
6993 	  break;
6994 
6995 	case SYMBOL_SMALL_TLSGD:
6996 	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6997 	  break;
6998 
6999 	case SYMBOL_SMALL_TLSDESC:
7000 	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7001 	  break;
7002 
7003 	case SYMBOL_SMALL_TLSIE:
7004 	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
7005 	  break;
7006 
7007 	case SYMBOL_TLSLE12:
7008 	  asm_fprintf (asm_out_file, ":tprel_lo12:");
7009 	  break;
7010 
7011 	case SYMBOL_TLSLE24:
7012 	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7013 	  break;
7014 
7015 	case SYMBOL_TINY_GOT:
7016 	  asm_fprintf (asm_out_file, ":got:");
7017 	  break;
7018 
7019 	case SYMBOL_TINY_TLSIE:
7020 	  asm_fprintf (asm_out_file, ":gottprel:");
7021 	  break;
7022 
7023 	default:
7024 	  break;
7025 	}
7026       output_addr_const (asm_out_file, x);
7027       break;
7028 
7029     case 'G':
7030       switch (aarch64_classify_symbolic_expression (x))
7031 	{
7032 	case SYMBOL_TLSLE24:
7033 	  asm_fprintf (asm_out_file, ":tprel_hi12:");
7034 	  break;
7035 	default:
7036 	  break;
7037 	}
7038       output_addr_const (asm_out_file, x);
7039       break;
7040 
7041     case 'k':
7042       {
7043 	HOST_WIDE_INT cond_code;
7044 
7045 	if (!CONST_INT_P (x))
7046 	  {
7047 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7048 	    return;
7049 	  }
7050 
7051 	cond_code = INTVAL (x);
7052 	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7053 	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7054       }
7055       break;
7056 
7057     case 'y':
7058     case 'z':
7059       {
7060 	machine_mode mode = GET_MODE (x);
7061 
7062 	if (GET_CODE (x) != MEM
7063 	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7064 	  {
7065 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7066 	    return;
7067 	  }
7068 
7069 	if (code == 'y')
7070 	  /* LDP/STP which uses a single double-width memory operand.
7071 	     Adjust the mode to appear like a typical LDP/STP.
7072 	     Currently this is supported for 16-byte accesses only.  */
7073 	  mode = DFmode;
7074 
7075 	if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7076 	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
7077       }
7078       break;
7079 
7080     default:
7081       output_operand_lossage ("invalid operand prefix '%%%c'", code);
7082       return;
7083     }
7084 }
7085 
7086 /* Print address 'x' of a memory access with mode 'mode'.
7087    'type' is the aarch64_addr_query_type context passed to
7088    aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP address.  */
7089 static bool
7090 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7091 				aarch64_addr_query_type type)
7092 {
7093   struct aarch64_address_info addr;
7094   unsigned int size;
7095 
7096   /* Check all addresses are Pmode - including ILP32.  */
7097   if (GET_MODE (x) != Pmode
7098       && (!CONST_INT_P (x)
7099 	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7100     {
7101       output_operand_lossage ("invalid address mode");
7102       return false;
7103     }
7104 
7105   if (aarch64_classify_address (&addr, x, mode, true, type))
7106     switch (addr.type)
7107       {
7108       case ADDRESS_REG_IMM:
7109 	if (known_eq (addr.const_offset, 0))
7110 	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7111 	else if (aarch64_sve_data_mode_p (mode))
7112 	  {
7113 	    HOST_WIDE_INT vnum
7114 	      = exact_div (addr.const_offset,
7115 			   BYTES_PER_SVE_VECTOR).to_constant ();
7116 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
7117 			 reg_names[REGNO (addr.base)], vnum);
7118 	  }
7119 	else if (aarch64_sve_pred_mode_p (mode))
7120 	  {
7121 	    HOST_WIDE_INT vnum
7122 	      = exact_div (addr.const_offset,
7123 			   BYTES_PER_SVE_PRED).to_constant ();
7124 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
7125 			 reg_names[REGNO (addr.base)], vnum);
7126 	  }
7127 	else
7128 	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7129 		       INTVAL (addr.offset));
7130 	return true;
7131 
7132       case ADDRESS_REG_REG:
7133 	if (addr.shift == 0)
7134 	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7135 		       reg_names [REGNO (addr.offset)]);
7136 	else
7137 	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7138 		       reg_names [REGNO (addr.offset)], addr.shift);
7139 	return true;
7140 
7141       case ADDRESS_REG_UXTW:
7142 	if (addr.shift == 0)
7143 	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7144 		       REGNO (addr.offset) - R0_REGNUM);
7145 	else
7146 	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7147 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
7148 	return true;
7149 
7150       case ADDRESS_REG_SXTW:
7151 	if (addr.shift == 0)
7152 	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7153 		       REGNO (addr.offset) - R0_REGNUM);
7154 	else
7155 	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7156 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
7157 	return true;
7158 
7159       case ADDRESS_REG_WB:
7160 	/* Writeback is only supported for fixed-width modes.  */
7161 	size = GET_MODE_SIZE (mode).to_constant ();
7162 	switch (GET_CODE (x))
7163 	  {
7164 	  case PRE_INC:
7165 	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7166 	    return true;
7167 	  case POST_INC:
7168 	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7169 	    return true;
7170 	  case PRE_DEC:
7171 	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7172 	    return true;
7173 	  case POST_DEC:
7174 	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7175 	    return true;
7176 	  case PRE_MODIFY:
7177 	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7178 			 INTVAL (addr.offset));
7179 	    return true;
7180 	  case POST_MODIFY:
7181 	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7182 			 INTVAL (addr.offset));
7183 	    return true;
7184 	  default:
7185 	    break;
7186 	  }
7187 	break;
7188 
7189       case ADDRESS_LO_SUM:
7190 	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7191 	output_addr_const (f, addr.offset);
7192 	asm_fprintf (f, "]");
7193 	return true;
7194 
7195       case ADDRESS_SYMBOLIC:
7196 	output_addr_const (f, x);
7197 	return true;
7198       }
7199 
7200   return false;
7201 }
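
/* For illustration (hypothetical addresses): a base x0 with constant offset
   16 is printed as "[x0, 16]"; a register index shifted by 3 as
   "[x0, x1, lsl 3]"; a zero-extended index shifted by 2 as
   "[x0, w1, uxtw 2]"; and a LO_SUM address as "[x0, #:lo12:symbol]".  */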
7202 
7203 /* Print address 'x' of a LDP/STP with mode 'mode'.  */
7204 static bool
7205 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7206 {
7207   return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7208 }
7209 
7210 /* Print address 'x' of a memory access with mode 'mode'.  */
7211 static void
7212 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7213 {
7214   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7215     output_addr_const (f, x);
7216 }
7217 
7218 bool
7219 aarch64_label_mentioned_p (rtx x)
7220 {
7221   const char *fmt;
7222   int i;
7223 
7224   if (GET_CODE (x) == LABEL_REF)
7225     return true;
7226 
7227   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7228      referencing instruction, but they are constant offsets, not
7229      symbols.  */
7230   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7231     return false;
7232 
7233   fmt = GET_RTX_FORMAT (GET_CODE (x));
7234   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7235     {
7236       if (fmt[i] == 'E')
7237 	{
7238 	  int j;
7239 
7240 	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7241 	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7242 	      return 1;
7243 	}
7244       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7245 	return 1;
7246     }
7247 
7248   return 0;
7249 }
7250 
7251 /* Implement REGNO_REG_CLASS.  */
7252 
7253 enum reg_class
7254 aarch64_regno_regclass (unsigned regno)
7255 {
7256   if (GP_REGNUM_P (regno))
7257     return GENERAL_REGS;
7258 
7259   if (regno == SP_REGNUM)
7260     return STACK_REG;
7261 
7262   if (regno == FRAME_POINTER_REGNUM
7263       || regno == ARG_POINTER_REGNUM)
7264     return POINTER_REGS;
7265 
7266   if (FP_REGNUM_P (regno))
7267     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
7268 
7269   if (PR_REGNUM_P (regno))
7270     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7271 
7272   return NO_REGS;
7273 }
7274 
7275 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7276    If OFFSET is out of range, return an offset of an anchor point
7277    that is in range.  Return 0 otherwise.  */
7278 
7279 static HOST_WIDE_INT
7280 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7281 		       machine_mode mode)
7282 {
7283   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
7284   if (size > 16)
7285     return (offset + 0x400) & ~0x7f0;
7286 
7287   /* For offsets that aren't a multiple of the access size, the limit is
7288      -256...255.  */
7289   if (offset & (size - 1))
7290     {
7291       /* BLKmode typically uses LDP of X-registers.  */
7292       if (mode == BLKmode)
7293 	return (offset + 512) & ~0x3ff;
7294       return (offset + 0x100) & ~0x1ff;
7295     }
7296 
7297   /* Small negative offsets are supported.  */
7298   if (IN_RANGE (offset, -256, 0))
7299     return 0;
7300 
7301   if (mode == TImode || mode == TFmode)
7302     return (offset + 0x100) & ~0x1ff;
7303 
7304   /* Use a 12-bit offset, scaled by the access size.  */
7305   return offset & (~0xfff * size);
7306 }
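
/* For illustration: with SIZE == 8 and MODE == DImode, an offset of 0x9008
   yields an anchor offset of 0x8000 (0x9008 & ~0x7fff), leaving a residual
   offset of 0x1008 that fits the scaled 12-bit LDR/STR immediate range;
   an offset of 0x38 is already in range, so 0 is returned.  */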
7307 
7308 static rtx
7309 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
7310 {
7311   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7312      where mask is selected by alignment and size of the offset.
7313      We try to pick as large a range for the offset as possible to
7314      maximize the chance of a CSE.  However, for aligned addresses
7315      we limit the range to 4k so that structures with different sized
7316      elements are likely to use the same base.  We need to be careful
7317      not to split a CONST for some forms of address expression, otherwise
7318      it will generate sub-optimal code.  */
7319 
7320   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7321     {
7322       rtx base = XEXP (x, 0);
7323       rtx offset_rtx = XEXP (x, 1);
7324       HOST_WIDE_INT offset = INTVAL (offset_rtx);
7325 
7326       if (GET_CODE (base) == PLUS)
7327 	{
7328 	  rtx op0 = XEXP (base, 0);
7329 	  rtx op1 = XEXP (base, 1);
7330 
7331 	  /* Force any scaling into a temp for CSE.  */
7332 	  op0 = force_reg (Pmode, op0);
7333 	  op1 = force_reg (Pmode, op1);
7334 
7335 	  /* Let the pointer register be in op0.  */
7336 	  if (REG_POINTER (op1))
7337 	    std::swap (op0, op1);
7338 
7339 	  /* If the pointer is virtual or frame related, then we know that
7340 	     virtual register instantiation or register elimination is going
7341 	     to apply a second constant.  We want the two constants folded
7342 	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
7343 	  if (virt_or_elim_regno_p (REGNO (op0)))
7344 	    {
7345 	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7346 				   NULL_RTX, true, OPTAB_DIRECT);
7347 	      return gen_rtx_PLUS (Pmode, base, op1);
7348 	    }
7349 
7350 	  /* Otherwise, in order to encourage CSE (and thence loop strength
7351 	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
7352 	  base = expand_binop (Pmode, add_optab, op0, op1,
7353 			       NULL_RTX, true, OPTAB_DIRECT);
7354 	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7355 	}
7356 
7357       HOST_WIDE_INT size;
7358       if (GET_MODE_SIZE (mode).is_constant (&size))
7359 	{
7360 	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7361 							     mode);
7362 	  if (base_offset != 0)
7363 	    {
7364 	      base = plus_constant (Pmode, base, base_offset);
7365 	      base = force_operand (base, NULL_RTX);
7366 	      return plus_constant (Pmode, base, offset - base_offset);
7367 	    }
7368 	}
7369     }
7370 
7371   return x;
7372 }
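
/* For illustration: for an SImode access to (plus (reg x0) (const_int 0x4008)),
   aarch64_anchor_offset returns 0x4000, so the address is rewritten so that
   the 0x4000 part is added into a base register which neighbouring accesses
   can share via CSE, leaving a residual offset of 8 that fits the LDR/STR
   immediate range.  */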
7373 
7374 /* Return the reload icode required for a constant-pool load in MODE.  */
7375 static enum insn_code
7376 aarch64_constant_pool_reload_icode (machine_mode mode)
7377 {
7378   switch (mode)
7379     {
7380     case E_SFmode:
7381       return CODE_FOR_aarch64_reload_movcpsfdi;
7382 
7383     case E_DFmode:
7384       return CODE_FOR_aarch64_reload_movcpdfdi;
7385 
7386     case E_TFmode:
7387       return CODE_FOR_aarch64_reload_movcptfdi;
7388 
7389     case E_V8QImode:
7390       return CODE_FOR_aarch64_reload_movcpv8qidi;
7391 
7392     case E_V16QImode:
7393       return CODE_FOR_aarch64_reload_movcpv16qidi;
7394 
7395     case E_V4HImode:
7396       return CODE_FOR_aarch64_reload_movcpv4hidi;
7397 
7398     case E_V8HImode:
7399       return CODE_FOR_aarch64_reload_movcpv8hidi;
7400 
7401     case E_V2SImode:
7402       return CODE_FOR_aarch64_reload_movcpv2sidi;
7403 
7404     case E_V4SImode:
7405       return CODE_FOR_aarch64_reload_movcpv4sidi;
7406 
7407     case E_V2DImode:
7408       return CODE_FOR_aarch64_reload_movcpv2didi;
7409 
7410     case E_V2DFmode:
7411       return CODE_FOR_aarch64_reload_movcpv2dfdi;
7412 
7413     default:
7414       gcc_unreachable ();
7415     }
7416 
7417   gcc_unreachable ();
7418 }
7419 static reg_class_t
7420 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7421 			  reg_class_t rclass,
7422 			  machine_mode mode,
7423 			  secondary_reload_info *sri)
7424 {
7425   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7426      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
7427      comment at the head of aarch64-sve.md for more details about the
7428      big-endian handling.  */
7429   if (BYTES_BIG_ENDIAN
7430       && reg_class_subset_p (rclass, FP_REGS)
7431       && !((REG_P (x) && HARD_REGISTER_P (x))
7432 	   || aarch64_simd_valid_immediate (x, NULL))
7433       && aarch64_sve_data_mode_p (mode))
7434     {
7435       sri->icode = CODE_FOR_aarch64_sve_reload_be;
7436       return NO_REGS;
7437     }
7438 
7439   /* If we have to disable direct literal pool loads and stores because the
7440      function is too big, then we need a scratch register.  */
7441   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7442       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7443 	  || targetm.vector_mode_supported_p (GET_MODE (x)))
7444       && !aarch64_pcrelative_literal_loads)
7445     {
7446       sri->icode = aarch64_constant_pool_reload_icode (mode);
7447       return NO_REGS;
7448     }
7449 
7450   /* Without the TARGET_SIMD instructions we cannot move a Q register
7451      to a Q register directly.  We need a scratch.  */
7452   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7453       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7454       && reg_class_subset_p (rclass, FP_REGS))
7455     {
7456       if (mode == TFmode)
7457         sri->icode = CODE_FOR_aarch64_reload_movtf;
7458       else if (mode == TImode)
7459         sri->icode = CODE_FOR_aarch64_reload_movti;
7460       return NO_REGS;
7461     }
7462 
7463   /* A TFmode or TImode memory access should be handled via an FP register
7464      because AArch64 has richer addressing modes for LDR/STR instructions
7465      than LDP/STP instructions.  */
7466   if (TARGET_FLOAT && rclass == GENERAL_REGS
7467       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7468     return FP_REGS;
7469 
7470   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
7471     return GENERAL_REGS;
7472 
7473   return NO_REGS;
7474 }
7475 
7476 static bool
7477 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7478 {
7479   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7480 
7481   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7482      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
7483   if (frame_pointer_needed)
7484     return to == HARD_FRAME_POINTER_REGNUM;
7485   return true;
7486 }
7487 
7488 poly_int64
7489 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7490 {
7491   aarch64_layout_frame ();
7492 
7493   if (to == HARD_FRAME_POINTER_REGNUM)
7494     {
7495       if (from == ARG_POINTER_REGNUM)
7496 	return cfun->machine->frame.hard_fp_offset;
7497 
7498       if (from == FRAME_POINTER_REGNUM)
7499 	return cfun->machine->frame.hard_fp_offset
7500 	       - cfun->machine->frame.locals_offset;
7501     }
7502 
7503   if (to == STACK_POINTER_REGNUM)
7504     {
7505       if (from == FRAME_POINTER_REGNUM)
7506 	  return cfun->machine->frame.frame_size
7507 		 - cfun->machine->frame.locals_offset;
7508     }
7509 
7510   return cfun->machine->frame.frame_size;
7511 }
7512 
7513 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
7514    previous frame.  */
7515 
7516 rtx
7517 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7518 {
7519   if (count != 0)
7520     return const0_rtx;
7521   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7522 }
7523 
7524 
7525 static void
7526 aarch64_asm_trampoline_template (FILE *f)
7527 {
7528   if (TARGET_ILP32)
7529     {
7530       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7531       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7532     }
7533   else
7534     {
7535       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7536       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7537     }
7538   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7539   assemble_aligned_integer (4, const0_rtx);
7540   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7541   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7542 }
7543 
7544 static void
7545 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7546 {
7547   rtx fnaddr, mem, a_tramp;
7548   const int tramp_code_sz = 16;
7549 
7550   /* Don't need to copy the trailing D-words; we fill those in below.  */
7551   emit_block_move (m_tramp, assemble_trampoline_template (),
7552 		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7553   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7554   fnaddr = XEXP (DECL_RTL (fndecl), 0);
7555   if (GET_MODE (fnaddr) != ptr_mode)
7556     fnaddr = convert_memory_address (ptr_mode, fnaddr);
7557   emit_move_insn (mem, fnaddr);
7558 
7559   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7560   emit_move_insn (mem, chain_value);
7561 
7562   /* XXX We should really define a "clear_cache" pattern and use
7563      gen_clear_cache().  */
7564   a_tramp = XEXP (m_tramp, 0);
7565   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7566 		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7567 		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7568 		     ptr_mode);
7569 }
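
/* For illustration, the trampoline laid out by the two functions above is,
   for LP64 and the default IP1/STATIC_CHAIN register assignments, roughly:
	ldr	x17, .+16	// load the target address (IP1)
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding to 16 bytes of code
	.dword	<function address>	// written by aarch64_trampoline_init
	.dword	<static chain value>
   (register numbers here are an assumption for the sketch).  */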
7570 
7571 static unsigned char
7572 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7573 {
7574   /* ??? Logically we should only need to provide a value when
7575      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7576      can hold MODE, but at the moment we need to handle all modes.
7577      Just ignore any runtime parts for registers that can't store them.  */
7578   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7579   unsigned int nregs;
7580   switch (regclass)
7581     {
7582     case TAILCALL_ADDR_REGS:
7583     case POINTER_REGS:
7584     case GENERAL_REGS:
7585     case ALL_REGS:
7586     case POINTER_AND_FP_REGS:
7587     case FP_REGS:
7588     case FP_LO_REGS:
7589       if (aarch64_sve_data_mode_p (mode)
7590 	  && constant_multiple_p (GET_MODE_SIZE (mode),
7591 				  BYTES_PER_SVE_VECTOR, &nregs))
7592 	return nregs;
7593       return (aarch64_vector_data_mode_p (mode)
7594 	      ? CEIL (lowest_size, UNITS_PER_VREG)
7595 	      : CEIL (lowest_size, UNITS_PER_WORD));
7596     case STACK_REG:
7597     case PR_REGS:
7598     case PR_LO_REGS:
7599     case PR_HI_REGS:
7600       return 1;
7601 
7602     case NO_REGS:
7603       return 0;
7604 
7605     default:
7606       break;
7607     }
7608   gcc_unreachable ();
7609 }
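
/* For illustration: SImode and DImode yield 1 register, TImode yields 2
   (16 bytes at 8 bytes per word), V4SImode yields 1 (one 128-bit vector
   register), and an SVE data mode such as VNx4SImode yields 1 regardless
   of the runtime vector length.  */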
7610 
7611 static reg_class_t
7612 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7613 {
7614   if (regclass == POINTER_REGS)
7615     return GENERAL_REGS;
7616 
7617   if (regclass == STACK_REG)
7618     {
7619       if (REG_P (x)
7620 	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7621 	  return regclass;
7622 
7623       return NO_REGS;
7624     }
7625 
7626   /* Register elimination can result in a request for
7627      SP+constant->FP_REGS.  We cannot support such operations, which
7628      use SP as the source and an FP_REG as the destination, so reject
7629      them right now.  */
7630   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7631     {
7632       rtx lhs = XEXP (x, 0);
7633 
7634       /* Look through a possible SUBREG introduced by ILP32.  */
7635       if (GET_CODE (lhs) == SUBREG)
7636 	lhs = SUBREG_REG (lhs);
7637 
7638       gcc_assert (REG_P (lhs));
7639       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7640 				      POINTER_REGS));
7641       return NO_REGS;
7642     }
7643 
7644   return regclass;
7645 }
7646 
7647 void
7648 aarch64_asm_output_labelref (FILE* f, const char *name)
7649 {
7650   asm_fprintf (f, "%U%s", name);
7651 }
7652 
7653 static void
7654 aarch64_elf_asm_constructor (rtx symbol, int priority)
7655 {
7656   if (priority == DEFAULT_INIT_PRIORITY)
7657     default_ctor_section_asm_out_constructor (symbol, priority);
7658   else
7659     {
7660       section *s;
7661       /* Although priority is known to be in the range [0, 65535], and so
7662          18 bytes would be enough, the compiler might not know that.  To
7663          avoid a -Wformat-truncation false positive, use a larger size.  */
7664       char buf[23];
7665       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7666       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7667       switch_to_section (s);
7668       assemble_align (POINTER_SIZE);
7669       assemble_aligned_integer (POINTER_BYTES, symbol);
7670     }
7671 }
7672 
7673 static void
7674 aarch64_elf_asm_destructor (rtx symbol, int priority)
7675 {
7676   if (priority == DEFAULT_INIT_PRIORITY)
7677     default_dtor_section_asm_out_destructor (symbol, priority);
7678   else
7679     {
7680       section *s;
7681       /* Although priority is known to be in the range [0, 65535], and so
7682          18 bytes would be enough, the compiler might not know that.  To
7683          avoid a -Wformat-truncation false positive, use a larger size.  */
7684       char buf[23];
7685       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7686       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7687       switch_to_section (s);
7688       assemble_align (POINTER_SIZE);
7689       assemble_aligned_integer (POINTER_BYTES, symbol);
7690     }
7691 }
7692 
7693 const char*
7694 aarch64_output_casesi (rtx *operands)
7695 {
7696   char buf[100];
7697   char label[100];
7698   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7699   int index;
7700   static const char *const patterns[4][2] =
7701   {
7702     {
7703       "ldrb\t%w3, [%0,%w1,uxtw]",
7704       "add\t%3, %4, %w3, sxtb #2"
7705     },
7706     {
7707       "ldrh\t%w3, [%0,%w1,uxtw #1]",
7708       "add\t%3, %4, %w3, sxth #2"
7709     },
7710     {
7711       "ldr\t%w3, [%0,%w1,uxtw #2]",
7712       "add\t%3, %4, %w3, sxtw #2"
7713     },
7714     /* We assume that DImode is only generated when not optimizing and
7715        that we don't really need 64-bit address offsets.  That would
7716        imply an object file with 8GB of code in a single function!  */
7717     {
7718       "ldr\t%w3, [%0,%w1,uxtw #2]",
7719       "add\t%3, %4, %w3, sxtw #2"
7720     }
7721   };
7722 
7723   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7724 
7725   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7726   index = exact_log2 (GET_MODE_SIZE (mode));
7727 
7728   gcc_assert (index >= 0 && index <= 3);
7729 
7730   /* Need to implement table size reduction, by changing the code below.  */
7731   output_asm_insn (patterns[index][0], operands);
7732   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7733   snprintf (buf, sizeof (buf),
7734 	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
7735   output_asm_insn (buf, operands);
7736   output_asm_insn (patterns[index][1], operands);
7737   output_asm_insn ("br\t%3", operands);
7738   assemble_label (asm_out_file, label);
7739   return "";
7740 }
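
/* For illustration, for an SImode dispatch table the sequence emitted above
   is of the form (hypothetical register allocation; %0 is the table base,
   %1 the index, %3/%4 scratch registers):
	ldr	w3, [x0, w1, uxtw #2]	// load the table entry
	adr	x4, .Lrtx<N>		// address of the anchor label
	add	x3, x4, w3, sxtw #2	// entries are scaled offsets from .Lrtx
	br	x3
     .Lrtx<N>:  */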
7741 
7742 
7743 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7744    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7745    operator.  */
7746 
7747 int
7748 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7749 {
7750   if (shift >= 0 && shift <= 3)
7751     {
7752       int size;
7753       for (size = 8; size <= 32; size *= 2)
7754 	{
7755 	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7756 	  if (mask == bits << shift)
7757 	    return size;
7758 	}
7759     }
7760   return 0;
7761 }
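
/* For illustration: aarch64_uxt_size (0, 0xff) == 8 (UXTB),
   aarch64_uxt_size (1, 0x1fffe) == 16 (UXTH, value shifted left by 1) and
   aarch64_uxt_size (3, 0x7fffffff8) == 32 (UXTW shifted by 3); a mask that
   is not a shifted 8/16/32-bit block yields 0.  */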
7762 
7763 /* Constant pools are per-function only when PC-relative
7764    literal loads are enabled or we are using the large memory
7765    model.  */
7766 
7767 static inline bool
7768 aarch64_can_use_per_function_literal_pools_p (void)
7769 {
7770   return (aarch64_pcrelative_literal_loads
7771 	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7772 }
7773 
7774 static bool
7775 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7776 {
7777   /* We can't use blocks for constants when we're using a per-function
7778      constant pool.  */
7779   return !aarch64_can_use_per_function_literal_pools_p ();
7780 }
7781 
7782 /* Select appropriate section for constants depending
7783    on where we place literal pools.  */
7784 
7785 static section *
7786 aarch64_select_rtx_section (machine_mode mode,
7787 			    rtx x,
7788 			    unsigned HOST_WIDE_INT align)
7789 {
7790   if (aarch64_can_use_per_function_literal_pools_p ())
7791     return function_section (current_function_decl);
7792 
7793   return default_elf_select_rtx_section (mode, x, align);
7794 }
7795 
7796 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
7797 void
7798 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7799 				  HOST_WIDE_INT offset)
7800 {
7801   /* When using per-function literal pools, we must ensure that any code
7802      section is aligned to the minimal instruction length, lest we get
7803      errors from the assembler re "unaligned instructions".  */
7804   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7805     ASM_OUTPUT_ALIGN (f, 2);
7806 }
7807 
7808 /* Costs.  */
7809 
7810 /* Helper function for rtx cost calculation.  Strip a shift expression
7811    from X.  Returns the inner operand if successful, or the original
7812    expression on failure.  */
7813 static rtx
7814 aarch64_strip_shift (rtx x)
7815 {
7816   rtx op = x;
7817 
7818   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7819      we can convert both to ROR during final output.  */
7820   if ((GET_CODE (op) == ASHIFT
7821        || GET_CODE (op) == ASHIFTRT
7822        || GET_CODE (op) == LSHIFTRT
7823        || GET_CODE (op) == ROTATERT
7824        || GET_CODE (op) == ROTATE)
7825       && CONST_INT_P (XEXP (op, 1)))
7826     return XEXP (op, 0);
7827 
7828   if (GET_CODE (op) == MULT
7829       && CONST_INT_P (XEXP (op, 1))
7830       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7831     return XEXP (op, 0);
7832 
7833   return x;
7834 }
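
/* For illustration: (ashift (reg x1) (const_int 2)) and
   (mult (reg x1) (const_int 4)) both strip to (reg x1), since a
   multiplication by a power of two will be output as a shift; a shift by
   a register amount is left unchanged.  */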
7835 
7836 /* Helper function for rtx cost calculation.  Strip an extend
7837    expression from X.  Returns the inner operand if successful, or the
7838    original expression on failure.  We deal with a number of possible
7839    canonicalization variations here. If STRIP_SHIFT is true, then
7840    we can strip off a shift also.  */
7841 static rtx
7842 aarch64_strip_extend (rtx x, bool strip_shift)
7843 {
7844   scalar_int_mode mode;
7845   rtx op = x;
7846 
7847   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7848     return op;
7849 
7850   /* Zero and sign extraction of a widened value.  */
7851   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7852       && XEXP (op, 2) == const0_rtx
7853       && GET_CODE (XEXP (op, 0)) == MULT
7854       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7855 					 XEXP (op, 1)))
7856     return XEXP (XEXP (op, 0), 0);
7857 
7858   /* It can also be represented (for zero-extend) as an AND with an
7859      immediate.  */
7860   if (GET_CODE (op) == AND
7861       && GET_CODE (XEXP (op, 0)) == MULT
7862       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7863       && CONST_INT_P (XEXP (op, 1))
7864       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7865 			   INTVAL (XEXP (op, 1))) != 0)
7866     return XEXP (XEXP (op, 0), 0);
7867 
7868   /* Now handle extended register, as this may also have an optional
7869      left shift by 1..4.  */
7870   if (strip_shift
7871       && GET_CODE (op) == ASHIFT
7872       && CONST_INT_P (XEXP (op, 1))
7873       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7874     op = XEXP (op, 0);
7875 
7876   if (GET_CODE (op) == ZERO_EXTEND
7877       || GET_CODE (op) == SIGN_EXTEND)
7878     op = XEXP (op, 0);
7879 
7880   if (op != x)
7881     return op;
7882 
7883   return x;
7884 }
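
/* For illustration: (zero_extend:DI (reg:SI w1)) strips to (reg:SI w1);
   with STRIP_SHIFT true, (ashift:DI (sign_extend:DI (reg:SI w1))
   (const_int 2)) also strips to the inner register, matching the
   extended-register forms of ADD/SUB with an optional left shift of 1..4.  */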
7885 
7886 /* Return true iff CODE is a shift supported in combination
7887    with arithmetic instructions.  */
7888 
7889 static bool
7890 aarch64_shift_p (enum rtx_code code)
7891 {
7892   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7893 }
7894 
7895 
7896 /* Return true iff X is a cheap shift without a sign extend. */
7897 
7898 static bool
7899 aarch64_cheap_mult_shift_p (rtx x)
7900 {
7901   rtx op0, op1;
7902 
7903   op0 = XEXP (x, 0);
7904   op1 = XEXP (x, 1);
7905 
7906   if (!(aarch64_tune_params.extra_tuning_flags
7907                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7908     return false;
7909 
7910   if (GET_CODE (op0) == SIGN_EXTEND)
7911     return false;
7912 
7913   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7914       && UINTVAL (op1) <= 4)
7915     return true;
7916 
7917   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7918     return false;
7919 
7920   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7921 
7922   if (l2 > 0 && l2 <= 4)
7923     return true;
7924 
7925   return false;
7926 }
7927 
7928 /* Helper function for rtx cost calculation.  Calculate the cost of
7929    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7930    Return the calculated cost of the expression, recursing manually in to
7931    operands where needed.  */
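/* For example, under this scheme (plus (mult (reg) (reg)) (reg)) is costed
   as a MADD, (plus (ashift (reg) (const_int 2)) (reg)) as an ADD with a
   shifted operand, and a bare (mult (reg) (const_int 8)) as an LSL
   (immediate).  */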
7932 
7933 static int
7934 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7935 {
7936   rtx op0, op1;
7937   const struct cpu_cost_table *extra_cost
7938     = aarch64_tune_params.insn_extra_cost;
7939   int cost = 0;
7940   bool compound_p = (outer == PLUS || outer == MINUS);
7941   machine_mode mode = GET_MODE (x);
7942 
7943   gcc_checking_assert (code == MULT);
7944 
7945   op0 = XEXP (x, 0);
7946   op1 = XEXP (x, 1);
7947 
7948   if (VECTOR_MODE_P (mode))
7949     mode = GET_MODE_INNER (mode);
7950 
7951   /* Integer multiply/fma.  */
7952   if (GET_MODE_CLASS (mode) == MODE_INT)
7953     {
7954       /* The multiply will be canonicalized as a shift, so cost it as such.  */
7955       if (aarch64_shift_p (GET_CODE (x))
7956 	  || (CONST_INT_P (op1)
7957 	      && exact_log2 (INTVAL (op1)) > 0))
7958 	{
7959 	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7960 	                   || GET_CODE (op0) == SIGN_EXTEND;
7961 	  if (speed)
7962 	    {
7963 	      if (compound_p)
7964 	        {
7965 		  /* If the shift is considered cheap,
7966 		     then don't add any cost. */
7967 		  if (aarch64_cheap_mult_shift_p (x))
7968 		    ;
7969 	          else if (REG_P (op1))
7970 		    /* ARITH + shift-by-register.  */
7971 		    cost += extra_cost->alu.arith_shift_reg;
7972 		  else if (is_extend)
7973 		    /* ARITH + extended register.  We don't have a cost field
7974 		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
7975 		    cost += extra_cost->alu.extend_arith;
7976 		  else
7977 		    /* ARITH + shift-by-immediate.  */
7978 		    cost += extra_cost->alu.arith_shift;
7979 		}
7980 	      else
7981 		/* LSL (immediate).  */
7982 	        cost += extra_cost->alu.shift;
7983 
7984 	    }
7985 	  /* Strip extends as we will have costed them in the case above.  */
7986 	  if (is_extend)
7987 	    op0 = aarch64_strip_extend (op0, true);
7988 
7989 	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7990 
7991 	  return cost;
7992 	}
7993 
7994       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
7995 	 compound and let the below cases handle it.  After all, MNEG is a
7996 	 special-case alias of MSUB.  */
7997       if (GET_CODE (op0) == NEG)
7998 	{
7999 	  op0 = XEXP (op0, 0);
8000 	  compound_p = true;
8001 	}
8002 
8003       /* Integer multiplies or FMAs have zero/sign extending variants.  */
8004       if ((GET_CODE (op0) == ZERO_EXTEND
8005 	   && GET_CODE (op1) == ZERO_EXTEND)
8006 	  || (GET_CODE (op0) == SIGN_EXTEND
8007 	      && GET_CODE (op1) == SIGN_EXTEND))
8008 	{
8009 	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8010 	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8011 
8012 	  if (speed)
8013 	    {
8014 	      if (compound_p)
8015 		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
8016 		cost += extra_cost->mult[0].extend_add;
8017 	      else
8018 		/* MUL/SMULL/UMULL.  */
8019 		cost += extra_cost->mult[0].extend;
8020 	    }
8021 
8022 	  return cost;
8023 	}
8024 
8025       /* This is either an integer multiply or a MADD.  In both cases
8026 	 we want to recurse and cost the operands.  */
8027       cost += rtx_cost (op0, mode, MULT, 0, speed);
8028       cost += rtx_cost (op1, mode, MULT, 1, speed);
8029 
8030       if (speed)
8031 	{
8032 	  if (compound_p)
8033 	    /* MADD/MSUB.  */
8034 	    cost += extra_cost->mult[mode == DImode].add;
8035 	  else
8036 	    /* MUL.  */
8037 	    cost += extra_cost->mult[mode == DImode].simple;
8038 	}
8039 
8040       return cost;
8041     }
8042   else
8043     {
8044       if (speed)
8045 	{
8046 	  /* Floating-point FMA/FMUL can also support negations of the
8047 	     operands, unless the rounding mode is upward or downward, in
8048 	     which case FNMUL is not equivalent to FMUL with operand negation.  */
8049 	  bool neg0 = GET_CODE (op0) == NEG;
8050 	  bool neg1 = GET_CODE (op1) == NEG;
8051 	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
8052 	    {
8053 	      if (neg0)
8054 		op0 = XEXP (op0, 0);
8055 	      if (neg1)
8056 		op1 = XEXP (op1, 0);
8057 	    }
8058 
8059 	  if (compound_p)
8060 	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
8061 	    cost += extra_cost->fp[mode == DFmode].fma;
8062 	  else
8063 	    /* FMUL/FNMUL.  */
8064 	    cost += extra_cost->fp[mode == DFmode].mult;
8065 	}
8066 
8067       cost += rtx_cost (op0, mode, MULT, 0, speed);
8068       cost += rtx_cost (op1, mode, MULT, 1, speed);
8069       return cost;
8070     }
8071 }
8072 
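/* Return the cost of the address expression X, used to access memory of
   mode MODE, in the units of aarch64_tune_params.addr_cost.  SPEED is
   true when optimizing for speed rather than size.  */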
8073 static int
8074 aarch64_address_cost (rtx x,
8075 		      machine_mode mode,
8076 		      addr_space_t as ATTRIBUTE_UNUSED,
8077 		      bool speed)
8078 {
8079   enum rtx_code c = GET_CODE (x);
8080   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8081   struct aarch64_address_info info;
8082   int cost = 0;
8083   info.shift = 0;
8084 
8085   if (!aarch64_classify_address (&info, x, mode, false))
8086     {
8087       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8088 	{
8089 	  /* This is a CONST or SYMBOL ref which will be split
8090 	     in a different way depending on the code model in use.
8091 	     Cost it through the generic infrastructure.  */
8092 	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8093 	  /* Divide through by the cost of one instruction to
8094 	     bring it to the same units as the address costs.  */
8095 	  cost_symbol_ref /= COSTS_N_INSNS (1);
8096 	  /* The cost is then the cost of preparing the address,
8097 	     followed by an immediate (possibly 0) offset.  */
8098 	  return cost_symbol_ref + addr_cost->imm_offset;
8099 	}
8100       else
8101 	{
8102 	  /* This is most likely a jump table from a case
8103 	     statement.  */
8104 	  return addr_cost->register_offset;
8105 	}
8106     }
8107 
8108   switch (info.type)
8109     {
8110       case ADDRESS_LO_SUM:
8111       case ADDRESS_SYMBOLIC:
8112       case ADDRESS_REG_IMM:
8113 	cost += addr_cost->imm_offset;
8114 	break;
8115 
8116       case ADDRESS_REG_WB:
8117 	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8118 	  cost += addr_cost->pre_modify;
8119 	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8120 	  cost += addr_cost->post_modify;
8121 	else
8122 	  gcc_unreachable ();
8123 
8124 	break;
8125 
8126       case ADDRESS_REG_REG:
8127 	cost += addr_cost->register_offset;
8128 	break;
8129 
8130       case ADDRESS_REG_SXTW:
8131 	cost += addr_cost->register_sextend;
8132 	break;
8133 
8134       case ADDRESS_REG_UXTW:
8135 	cost += addr_cost->register_zextend;
8136 	break;
8137 
8138       default:
8139 	gcc_unreachable ();
8140     }
8141 
8142 
8143   if (info.shift > 0)
8144     {
8145       /* For the sake of calculating the cost of the shifted register
8146 	 component, we can treat same-sized modes in the same way.  */
8147       if (known_eq (GET_MODE_BITSIZE (mode), 16))
8148 	cost += addr_cost->addr_scale_costs.hi;
8149       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8150 	cost += addr_cost->addr_scale_costs.si;
8151       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8152 	cost += addr_cost->addr_scale_costs.di;
8153       else
8154 	/* We can't tell, or this is a 128-bit vector.  */
8155 	cost += addr_cost->addr_scale_costs.ti;
8156     }
8157 
8158   return cost;
8159 }
8160 
8161 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
8162    optimizing for speed.  If PREDICTABLE_P is true then the branch is
8163    expected to be well predicted.  */
8164 
8165 int
8166 aarch64_branch_cost (bool speed_p, bool predictable_p)
8167 {
8168   /* Only when optimizing for speed do unpredictable branches cost more.  */
8169   const struct cpu_branch_cost *branch_costs =
8170     aarch64_tune_params.branch_costs;
8171 
8172   if (!speed_p || predictable_p)
8173     return branch_costs->predictable;
8174   else
8175     return branch_costs->unpredictable;
8176 }
8177 
8178 /* Return true if the RTX X in mode MODE is a zero or sign extract
8179    usable in an ADD or SUB (extended register) instruction.  */
8180 static bool
8181 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8182 {
8183   /* Catch add with a sign extract.
8184      This is add_<optab><mode>_multp2.  */
8185   if (GET_CODE (x) == SIGN_EXTRACT
8186       || GET_CODE (x) == ZERO_EXTRACT)
8187     {
8188       rtx op0 = XEXP (x, 0);
8189       rtx op1 = XEXP (x, 1);
8190       rtx op2 = XEXP (x, 2);
8191 
8192       if (GET_CODE (op0) == MULT
8193 	  && CONST_INT_P (op1)
8194 	  && op2 == const0_rtx
8195 	  && CONST_INT_P (XEXP (op0, 1))
8196 	  && aarch64_is_extend_from_extract (mode,
8197 					     XEXP (op0, 1),
8198 					     op1))
8199 	{
8200 	  return true;
8201 	}
8202     }
8203   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8204      No shift.  */
8205   else if (GET_CODE (x) == SIGN_EXTEND
8206 	   || GET_CODE (x) == ZERO_EXTEND)
8207     return REG_P (XEXP (x, 0));
8208 
8209   return false;
8210 }
8211 
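/* Return true if U is the UNSPEC number of one of the FRINT* instructions
   (floating-point round to integral).  */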
8212 static bool
8213 aarch64_frint_unspec_p (unsigned int u)
8214 {
8215   switch (u)
8216     {
8217       case UNSPEC_FRINTZ:
8218       case UNSPEC_FRINTP:
8219       case UNSPEC_FRINTM:
8220       case UNSPEC_FRINTA:
8221       case UNSPEC_FRINTN:
8222       case UNSPEC_FRINTX:
8223       case UNSPEC_FRINTI:
8224         return true;
8225 
8226       default:
8227         return false;
8228     }
8229 }
8230 
8231 /* Return true iff X is an rtx that will match an EXTR instruction,
8232    i.e. one described by the *extr<mode>5_insn family of patterns.
8233    *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
8234    on success and will be NULL_RTX otherwise.  */
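/* For example, in DImode
     (ior:DI (ashift:DI (reg:DI x) (const_int 16))
	     (lshiftrt:DI (reg:DI y) (const_int 48)))
   matches, roughly corresponding to EXTR Xd, Xx, Xy, #48, since the two
   shift amounts sum to 64.  */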
8235 
8236 static bool
8237 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8238 {
8239   rtx op0, op1;
8240   scalar_int_mode mode;
8241   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8242     return false;
8243 
8244   *res_op0 = NULL_RTX;
8245   *res_op1 = NULL_RTX;
8246 
8247   if (GET_CODE (x) != IOR)
8248     return false;
8249 
8250   op0 = XEXP (x, 0);
8251   op1 = XEXP (x, 1);
8252 
8253   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8254       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8255     {
8256      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
8257       if (GET_CODE (op1) == ASHIFT)
8258         std::swap (op0, op1);
8259 
8260       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8261         return false;
8262 
8263       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8264       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8265 
8266       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8267           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8268         {
8269           *res_op0 = XEXP (op0, 0);
8270           *res_op1 = XEXP (op1, 0);
8271           return true;
8272         }
8273     }
8274 
8275   return false;
8276 }
8277 
8278 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8279    storing it in *COST.  Result is true if the total cost of the operation
8280    has now been calculated.  */
8281 static bool
8282 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8283 {
8284   rtx inner;
8285   rtx comparator;
8286   enum rtx_code cmpcode;
8287 
8288   if (COMPARISON_P (op0))
8289     {
8290       inner = XEXP (op0, 0);
8291       comparator = XEXP (op0, 1);
8292       cmpcode = GET_CODE (op0);
8293     }
8294   else
8295     {
8296       inner = op0;
8297       comparator = const0_rtx;
8298       cmpcode = NE;
8299     }
8300 
8301   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8302     {
8303       /* Conditional branch.  */
8304       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8305 	return true;
8306       else
8307 	{
8308 	  if (cmpcode == NE || cmpcode == EQ)
8309 	    {
8310 	      if (comparator == const0_rtx)
8311 		{
8312 		  /* TBZ/TBNZ/CBZ/CBNZ.  */
8313 		  if (GET_CODE (inner) == ZERO_EXTRACT)
8314 		    /* TBZ/TBNZ.  */
8315 		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8316 				       ZERO_EXTRACT, 0, speed);
8317 		  else
8318 		    /* CBZ/CBNZ.  */
8319 		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8320 
8321 	        return true;
8322 	      }
8323 	    }
8324 	  else if (cmpcode == LT || cmpcode == GE)
8325 	    {
8326 	      /* TBZ/TBNZ.  */
8327 	      if (comparator == const0_rtx)
8328 		return true;
8329 	    }
8330 	}
8331     }
8332   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8333     {
8334       /* CCMP.  */
8335       if (GET_CODE (op1) == COMPARE)
8336 	{
8337 	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
8338 	  if (XEXP (op1, 1) == const0_rtx)
8339 	    *cost += 1;
8340 	  if (speed)
8341 	    {
8342 	      machine_mode mode = GET_MODE (XEXP (op1, 0));
8343 	      const struct cpu_cost_table *extra_cost
8344 		= aarch64_tune_params.insn_extra_cost;
8345 
8346 	      if (GET_MODE_CLASS (mode) == MODE_INT)
8347 		*cost += extra_cost->alu.arith;
8348 	      else
8349 		*cost += extra_cost->fp[mode == DFmode].compare;
8350 	    }
8351 	  return true;
8352 	}
8353 
8354       /* It's a conditional operation based on the status flags,
8355 	 so it must be some flavor of CSEL.  */
8356 
8357       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
8358       if (GET_CODE (op1) == NEG
8359           || GET_CODE (op1) == NOT
8360           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8361 	op1 = XEXP (op1, 0);
8362       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8363 	{
8364 	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
8365 	  op1 = XEXP (op1, 0);
8366 	  op2 = XEXP (op2, 0);
8367 	}
8368 
8369       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8370       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8371       return true;
8372     }
8373 
8374   /* We don't know what this is, cost all operands.  */
8375   return false;
8376 }
8377 
8378 /* Check whether X is a bitfield operation of the form shift + extend that
8379    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
8380    operand to which the bitfield operation is applied.  Otherwise return
8381    NULL_RTX.  */
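/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   maps to a UBFX and (sign_extend:DI (ashift:QI (reg:QI x) (const_int 2)))
   to an SBFIZ; in both cases the inner (reg ...) is returned.  */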
8382 
8383 static rtx
8384 aarch64_extend_bitfield_pattern_p (rtx x)
8385 {
8386   rtx_code outer_code = GET_CODE (x);
8387   machine_mode outer_mode = GET_MODE (x);
8388 
8389   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8390       && outer_mode != SImode && outer_mode != DImode)
8391     return NULL_RTX;
8392 
8393   rtx inner = XEXP (x, 0);
8394   rtx_code inner_code = GET_CODE (inner);
8395   machine_mode inner_mode = GET_MODE (inner);
8396   rtx op = NULL_RTX;
8397 
8398   switch (inner_code)
8399     {
8400       case ASHIFT:
8401 	if (CONST_INT_P (XEXP (inner, 1))
8402 	    && (inner_mode == QImode || inner_mode == HImode))
8403 	  op = XEXP (inner, 0);
8404 	break;
8405       case LSHIFTRT:
8406 	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8407 	    && (inner_mode == QImode || inner_mode == HImode))
8408 	  op = XEXP (inner, 0);
8409 	break;
8410       case ASHIFTRT:
8411 	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8412 	    && (inner_mode == QImode || inner_mode == HImode))
8413 	  op = XEXP (inner, 0);
8414 	break;
8415       default:
8416 	break;
8417     }
8418 
8419   return op;
8420 }
8421 
8422 /* Return true if the mask and a shift amount from an RTX of the form
8423    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8424    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
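/* For example, in SImode a shift amount of 3 and a mask of 0x7f8 qualify:
   0x7f8 >> 3 is 0xff (a contiguous low mask) and the low three bits of
   0x7f8 are clear, so (x << 3) & 0x7f8 maps to UBFIZ Wd, Ws, #3, #8.  */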
8425 
8426 bool
8427 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8428 				    rtx shft_amnt)
8429 {
8430   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8431 	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8432 	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8433 	 && (INTVAL (mask)
8434 	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
8435 }
8436 
8437 /* Calculate the cost of calculating X, storing it in *COST.  Result
8438    is true if the total cost of the operation has now been calculated.  */
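/* A false result tells the generic rtx_cost machinery to recurse into the
   operands of X and add their costs as well.  */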
8439 static bool
8440 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8441 		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8442 {
8443   rtx op0, op1, op2;
8444   const struct cpu_cost_table *extra_cost
8445     = aarch64_tune_params.insn_extra_cost;
8446   int code = GET_CODE (x);
8447   scalar_int_mode int_mode;
8448 
8449   /* By default, assume that everything has equivalent cost to the
8450      cheapest instruction.  Any additional costs are applied as a delta
8451      above this default.  */
8452   *cost = COSTS_N_INSNS (1);
8453 
8454   switch (code)
8455     {
8456     case SET:
8457       /* The cost depends entirely on the operands to SET.  */
8458       *cost = 0;
8459       op0 = SET_DEST (x);
8460       op1 = SET_SRC (x);
8461 
8462       switch (GET_CODE (op0))
8463 	{
8464 	case MEM:
8465 	  if (speed)
8466 	    {
8467 	      rtx address = XEXP (op0, 0);
8468 	      if (VECTOR_MODE_P (mode))
8469 		*cost += extra_cost->ldst.storev;
8470 	      else if (GET_MODE_CLASS (mode) == MODE_INT)
8471 		*cost += extra_cost->ldst.store;
8472 	      else if (mode == SFmode)
8473 		*cost += extra_cost->ldst.storef;
8474 	      else if (mode == DFmode)
8475 		*cost += extra_cost->ldst.stored;
8476 
8477 	      *cost +=
8478 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
8479 						     0, speed));
8480 	    }
8481 
8482 	  *cost += rtx_cost (op1, mode, SET, 1, speed);
8483 	  return true;
8484 
8485 	case SUBREG:
8486 	  if (! REG_P (SUBREG_REG (op0)))
8487 	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8488 
8489 	  /* Fall through.  */
8490 	case REG:
8491 	  /* The cost is one per vector-register copied.  */
8492 	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8493 	    {
8494 	      int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8495 	      *cost = COSTS_N_INSNS (nregs);
8496 	    }
8497 	  /* const0_rtx is in general free, but we will use an
8498 	     instruction to set a register to 0.  */
8499 	  else if (REG_P (op1) || op1 == const0_rtx)
8500 	    {
8501 	      /* The cost is 1 per register copied.  */
8502 	      int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8503 	      *cost = COSTS_N_INSNS (nregs);
8504 	    }
8505           else
8506 	    /* Cost is just the cost of the RHS of the set.  */
8507 	    *cost += rtx_cost (op1, mode, SET, 1, speed);
8508 	  return true;
8509 
8510 	case ZERO_EXTRACT:
8511 	case SIGN_EXTRACT:
8512 	  /* Bit-field insertion.  Strip any redundant widening of
8513 	     the RHS to meet the width of the target.  */
8514 	  if (GET_CODE (op1) == SUBREG)
8515 	    op1 = SUBREG_REG (op1);
8516 	  if ((GET_CODE (op1) == ZERO_EXTEND
8517 	       || GET_CODE (op1) == SIGN_EXTEND)
8518 	      && CONST_INT_P (XEXP (op0, 1))
8519 	      && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8520 	      && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8521 	    op1 = XEXP (op1, 0);
8522 
8523           if (CONST_INT_P (op1))
8524             {
8525               /* MOV immediate is assumed to always be cheap.  */
8526               *cost = COSTS_N_INSNS (1);
8527             }
8528           else
8529             {
8530               /* BFM.  */
8531 	      if (speed)
8532 		*cost += extra_cost->alu.bfi;
8533 	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8534             }
8535 
8536 	  return true;
8537 
8538 	default:
8539 	  /* We can't make sense of this, assume default cost.  */
8540           *cost = COSTS_N_INSNS (1);
8541 	  return false;
8542 	}
8543       return false;
8544 
8545     case CONST_INT:
8546       /* If an instruction can incorporate a constant within the
8547 	 instruction, the instruction's expression avoids calling
8548 	 rtx_cost() on the constant.  If rtx_cost() is called on a
8549 	 constant, then it is usually because the constant must be
8550 	 moved into a register by one or more instructions.
8551 
8552 	 The exception is constant 0, which can be expressed
8553 	 as XZR/WZR and is therefore free.  The exception to this is
8554 	 if we have (set (reg) (const0_rtx)) in which case we must cost
8555 	 the move.  However, we can catch that when we cost the SET, so
8556 	 we don't need to consider that here.  */
8557       if (x == const0_rtx)
8558 	*cost = 0;
8559       else
8560 	{
8561 	  /* To a first approximation, the cost of building any other
8562 	     constant is proportional to the number of instructions
8563 	     required to build that constant.  This is true whether we
8564 	     are compiling for SPEED or otherwise.  */
8565 	  if (!is_a <scalar_int_mode> (mode, &int_mode))
8566 	    int_mode = word_mode;
8567 	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8568 				 (NULL_RTX, x, false, int_mode));
8569 	}
8570       return true;
8571 
8572     case CONST_DOUBLE:
8573 
8574       /* First determine the number of instructions needed to do the
8575 	 move as an integer constant.  */
8576       if (!aarch64_float_const_representable_p (x)
8577 	   && !aarch64_can_const_movi_rtx_p (x, mode)
8578 	   && aarch64_float_const_rtx_p (x))
8579 	{
8580 	  unsigned HOST_WIDE_INT ival;
8581 	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8582 	  gcc_assert (succeed);
8583 
8584 	  scalar_int_mode imode = (mode == HFmode
8585 				   ? SImode
8586 				   : int_mode_for_mode (mode).require ());
8587 	  int ncost = aarch64_internal_mov_immediate
8588 		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
8589 	  *cost += COSTS_N_INSNS (ncost);
8590 	  return true;
8591 	}
8592 
8593       if (speed)
8594 	{
8595 	  /* mov[df,sf]_aarch64.  */
8596 	  if (aarch64_float_const_representable_p (x))
8597 	    /* FMOV (scalar immediate).  */
8598 	    *cost += extra_cost->fp[mode == DFmode].fpconst;
8599 	  else if (!aarch64_float_const_zero_rtx_p (x))
8600 	    {
8601 	      /* This will be a load from memory.  */
8602 	      if (mode == DFmode)
8603 		*cost += extra_cost->ldst.loadd;
8604 	      else
8605 		*cost += extra_cost->ldst.loadf;
8606 	    }
8607 	  else
8608 	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
8609 	       or MOV v0.s[0], wzr - neither of which are modeled by the
8610 	       or MOV v0.s[0], wzr - neither of which is modeled by the
8611 	    {
8612 	    }
8613 	}
8614 
8615       return true;
8616 
8617     case MEM:
8618       if (speed)
8619 	{
8620 	  /* For loads we want the base cost of a load, plus an
8621 	     approximation for the additional cost of the addressing
8622 	     mode.  */
8623 	  rtx address = XEXP (x, 0);
8624 	  if (VECTOR_MODE_P (mode))
8625 	    *cost += extra_cost->ldst.loadv;
8626 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
8627 	    *cost += extra_cost->ldst.load;
8628 	  else if (mode == SFmode)
8629 	    *cost += extra_cost->ldst.loadf;
8630 	  else if (mode == DFmode)
8631 	    *cost += extra_cost->ldst.loadd;
8632 
8633 	  *cost +=
8634 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
8635 						     0, speed));
8636 	}
8637 
8638       return true;
8639 
8640     case NEG:
8641       op0 = XEXP (x, 0);
8642 
8643       if (VECTOR_MODE_P (mode))
8644 	{
8645 	  if (speed)
8646 	    {
8647 	      /* FNEG.  */
8648 	      *cost += extra_cost->vect.alu;
8649 	    }
8650 	  return false;
8651 	}
8652 
8653       if (GET_MODE_CLASS (mode) == MODE_INT)
8654 	{
8655           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8656               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8657             {
8658               /* CSETM.  */
8659 	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8660               return true;
8661             }
8662 
8663 	  /* Cost this as SUB wzr, X.  */
8664           op0 = CONST0_RTX (mode);
8665           op1 = XEXP (x, 0);
8666           goto cost_minus;
8667         }
8668 
8669       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8670         {
8671           /* Support (neg(fma...)) as a single instruction only if
8672              sign of zeros is unimportant.  This matches the decision
8673              making in aarch64.md.  */
8674           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8675             {
8676 	      /* FNMADD.  */
8677 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
8678               return true;
8679             }
8680 	  if (GET_CODE (op0) == MULT)
8681 	    {
8682 	      /* FNMUL.  */
8683 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
8684 	      return true;
8685 	    }
8686 	  if (speed)
8687 	    /* FNEG.  */
8688 	    *cost += extra_cost->fp[mode == DFmode].neg;
8689           return false;
8690         }
8691 
8692       return false;
8693 
8694     case CLRSB:
8695     case CLZ:
8696       if (speed)
8697 	{
8698 	  if (VECTOR_MODE_P (mode))
8699 	    *cost += extra_cost->vect.alu;
8700 	  else
8701 	    *cost += extra_cost->alu.clz;
8702 	}
8703 
8704       return false;
8705 
8706     case COMPARE:
8707       op0 = XEXP (x, 0);
8708       op1 = XEXP (x, 1);
8709 
8710       if (op1 == const0_rtx
8711 	  && GET_CODE (op0) == AND)
8712 	{
8713 	  x = op0;
8714 	  mode = GET_MODE (op0);
8715 	  goto cost_logic;
8716 	}
8717 
8718       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8719         {
8720           /* TODO: A write to the CC flags possibly costs extra, this
8721 	     needs encoding in the cost tables.  */
8722 
8723 	  mode = GET_MODE (op0);
8724           /* ANDS.  */
8725           if (GET_CODE (op0) == AND)
8726             {
8727               x = op0;
8728               goto cost_logic;
8729             }
8730 
8731           if (GET_CODE (op0) == PLUS)
8732             {
8733 	      /* ADDS (and CMN alias).  */
8734               x = op0;
8735               goto cost_plus;
8736             }
8737 
8738           if (GET_CODE (op0) == MINUS)
8739             {
8740 	      /* SUBS.  */
8741               x = op0;
8742               goto cost_minus;
8743             }
8744 
8745 	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8746 	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8747 	      && CONST_INT_P (XEXP (op0, 2)))
8748 	    {
8749 	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8750 		 Handle it here directly rather than going to cost_logic
8751 		 since we know the immediate generated for the TST is valid
8752 		 so we can avoid creating an intermediate rtx for it only
8753 		 for costing purposes.  */
8754 	      if (speed)
8755 		*cost += extra_cost->alu.logical;
8756 
8757 	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8758 				 ZERO_EXTRACT, 0, speed);
8759 	      return true;
8760 	    }
8761 
8762           if (GET_CODE (op1) == NEG)
8763             {
8764 	      /* CMN.  */
8765 	      if (speed)
8766 		*cost += extra_cost->alu.arith;
8767 
8768 	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8769 	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8770               return true;
8771             }
8772 
8773           /* CMP.
8774 
8775 	     Compare can freely swap the order of operands, and
8776              canonicalization puts the more complex operation first.
8777              But the integer MINUS logic expects the shift/extend
8778              operation in op1.  */
8779           if (! (REG_P (op0)
8780                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8781           {
8782             op0 = XEXP (x, 1);
8783             op1 = XEXP (x, 0);
8784           }
8785           goto cost_minus;
8786         }
8787 
8788       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8789         {
8790 	  /* FCMP.  */
8791 	  if (speed)
8792 	    *cost += extra_cost->fp[mode == DFmode].compare;
8793 
8794           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8795             {
8796 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8797               /* FCMP supports constant 0.0 for no extra cost. */
8798               return true;
8799             }
8800           return false;
8801         }
8802 
8803       if (VECTOR_MODE_P (mode))
8804 	{
8805 	  /* Vector compare.  */
8806 	  if (speed)
8807 	    *cost += extra_cost->vect.alu;
8808 
8809 	  if (aarch64_float_const_zero_rtx_p (op1))
8810 	    {
8811 	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8812 		 cost.  */
8813 	      return true;
8814 	    }
8815 	  return false;
8816 	}
8817       return false;
8818 
8819     case MINUS:
8820       {
8821 	op0 = XEXP (x, 0);
8822 	op1 = XEXP (x, 1);
8823 
8824 cost_minus:
8825 	*cost += rtx_cost (op0, mode, MINUS, 0, speed);
8826 
8827 	/* Detect valid immediates.  */
8828 	if ((GET_MODE_CLASS (mode) == MODE_INT
8829 	     || (GET_MODE_CLASS (mode) == MODE_CC
8830 		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8831 	    && CONST_INT_P (op1)
8832 	    && aarch64_uimm12_shift (INTVAL (op1)))
8833 	  {
8834 	    if (speed)
8835 	      /* SUB(S) (immediate).  */
8836 	      *cost += extra_cost->alu.arith;
8837 	    return true;
8838 	  }
8839 
8840 	/* Look for SUB (extended register).  */
8841 	if (is_a <scalar_int_mode> (mode, &int_mode)
8842 	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8843 	  {
8844 	    if (speed)
8845 	      *cost += extra_cost->alu.extend_arith;
8846 
8847 	    op1 = aarch64_strip_extend (op1, true);
8848 	    *cost += rtx_cost (op1, VOIDmode,
8849 			       (enum rtx_code) GET_CODE (op1), 0, speed);
8850 	    return true;
8851 	  }
8852 
8853 	rtx new_op1 = aarch64_strip_extend (op1, false);
8854 
8855 	/* Cost this as an FMA-alike operation.  */
8856 	if ((GET_CODE (new_op1) == MULT
8857 	     || aarch64_shift_p (GET_CODE (new_op1)))
8858 	    && code != COMPARE)
8859 	  {
8860 	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8861 					    (enum rtx_code) code,
8862 					    speed);
8863 	    return true;
8864 	  }
8865 
8866 	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8867 
8868 	if (speed)
8869 	  {
8870 	    if (VECTOR_MODE_P (mode))
8871 	      {
8872 		/* Vector SUB.  */
8873 		*cost += extra_cost->vect.alu;
8874 	      }
8875 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
8876 	      {
8877 		/* SUB(S).  */
8878 		*cost += extra_cost->alu.arith;
8879 	      }
8880 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8881 	      {
8882 		/* FSUB.  */
8883 		*cost += extra_cost->fp[mode == DFmode].addsub;
8884 	      }
8885 	  }
8886 	return true;
8887       }
8888 
8889     case PLUS:
8890       {
8891 	rtx new_op0;
8892 
8893 	op0 = XEXP (x, 0);
8894 	op1 = XEXP (x, 1);
8895 
8896 cost_plus:
8897 	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8898 	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8899 	  {
8900 	    /* CSINC.  */
8901 	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8902 	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8903 	    return true;
8904 	  }
8905 
8906 	if (GET_MODE_CLASS (mode) == MODE_INT
8907 	    && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8908 		|| aarch64_sve_addvl_addpl_immediate (op1, mode)))
8909 	  {
8910 	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8911 
8912 	    if (speed)
8913 	      /* ADD (immediate).  */
8914 	      *cost += extra_cost->alu.arith;
8915 	    return true;
8916 	  }
8917 
8918 	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
8919 
8920 	/* Look for ADD (extended register).  */
8921 	if (is_a <scalar_int_mode> (mode, &int_mode)
8922 	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8923 	  {
8924 	    if (speed)
8925 	      *cost += extra_cost->alu.extend_arith;
8926 
8927 	    op0 = aarch64_strip_extend (op0, true);
8928 	    *cost += rtx_cost (op0, VOIDmode,
8929 			       (enum rtx_code) GET_CODE (op0), 0, speed);
8930 	    return true;
8931 	  }
8932 
8933 	/* Strip any extend, leave shifts behind as we will
8934 	   cost them through mult_cost.  */
8935 	new_op0 = aarch64_strip_extend (op0, false);
8936 
8937 	if (GET_CODE (new_op0) == MULT
8938 	    || aarch64_shift_p (GET_CODE (new_op0)))
8939 	  {
8940 	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8941 					    speed);
8942 	    return true;
8943 	  }
8944 
8945 	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8946 
8947 	if (speed)
8948 	  {
8949 	    if (VECTOR_MODE_P (mode))
8950 	      {
8951 		/* Vector ADD.  */
8952 		*cost += extra_cost->vect.alu;
8953 	      }
8954 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
8955 	      {
8956 		/* ADD.  */
8957 		*cost += extra_cost->alu.arith;
8958 	      }
8959 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8960 	      {
8961 		/* FADD.  */
8962 		*cost += extra_cost->fp[mode == DFmode].addsub;
8963 	      }
8964 	  }
8965 	return true;
8966       }
8967 
8968     case BSWAP:
8969       *cost = COSTS_N_INSNS (1);
8970 
8971       if (speed)
8972 	{
8973 	  if (VECTOR_MODE_P (mode))
8974 	    *cost += extra_cost->vect.alu;
8975 	  else
8976 	    *cost += extra_cost->alu.rev;
8977 	}
8978       return false;
8979 
8980     case IOR:
8981       if (aarch_rev16_p (x))
8982         {
8983           *cost = COSTS_N_INSNS (1);
8984 
8985 	  if (speed)
8986 	    {
8987 	      if (VECTOR_MODE_P (mode))
8988 		*cost += extra_cost->vect.alu;
8989 	      else
8990 		*cost += extra_cost->alu.rev;
8991 	    }
8992 	  return true;
8993         }
8994 
8995       if (aarch64_extr_rtx_p (x, &op0, &op1))
8996         {
8997 	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
8998 	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
8999           if (speed)
9000             *cost += extra_cost->alu.shift;
9001 
9002           return true;
9003         }
9004     /* Fall through.  */
9005     case XOR:
9006     case AND:
9007     cost_logic:
9008       op0 = XEXP (x, 0);
9009       op1 = XEXP (x, 1);
9010 
9011       if (VECTOR_MODE_P (mode))
9012 	{
9013 	  if (speed)
9014 	    *cost += extra_cost->vect.alu;
9015 	  return true;
9016 	}
9017 
9018       if (code == AND
9019           && GET_CODE (op0) == MULT
9020           && CONST_INT_P (XEXP (op0, 1))
9021           && CONST_INT_P (op1)
9022           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9023                                INTVAL (op1)) != 0)
9024         {
9025           /* This is a UBFM/SBFM.  */
9026 	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9027 	  if (speed)
9028 	    *cost += extra_cost->alu.bfx;
9029           return true;
9030         }
9031 
9032       if (is_int_mode (mode, &int_mode))
9033 	{
9034 	  if (CONST_INT_P (op1))
9035 	    {
9036 	      /* We have a mask + shift version of a UBFIZ
9037 		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
9038 	      if (GET_CODE (op0) == ASHIFT
9039 		  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9040 							 XEXP (op0, 1)))
9041 		{
9042 		  *cost += rtx_cost (XEXP (op0, 0), int_mode,
9043 				     (enum rtx_code) code, 0, speed);
9044 		  if (speed)
9045 		    *cost += extra_cost->alu.bfx;
9046 
9047 		  return true;
9048 		}
9049 	      else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9050 		{
9051 		  /* We possibly get the immediate for free; this is not
9052 		     modelled.  */
9053 		  *cost += rtx_cost (op0, int_mode,
9054 				     (enum rtx_code) code, 0, speed);
9055 		  if (speed)
9056 		    *cost += extra_cost->alu.logical;
9057 
9058 		  return true;
9059 		}
9060 	    }
9061 	  else
9062 	    {
9063 	      rtx new_op0 = op0;
9064 
9065 	      /* Handle ORN, EON, or BIC.  */
9066 	      if (GET_CODE (op0) == NOT)
9067 		op0 = XEXP (op0, 0);
9068 
9069 	      new_op0 = aarch64_strip_shift (op0);
9070 
9071 	      /* If we had a shift on op0 then this is a logical-shift-
9072 		 by-register/immediate operation.  Otherwise, this is just
9073 		 a logical operation.  */
9074 	      if (speed)
9075 		{
9076 		  if (new_op0 != op0)
9077 		    {
9078 		      /* Shift by immediate.  */
9079 		      if (CONST_INT_P (XEXP (op0, 1)))
9080 			*cost += extra_cost->alu.log_shift;
9081 		      else
9082 			*cost += extra_cost->alu.log_shift_reg;
9083 		    }
9084 		  else
9085 		    *cost += extra_cost->alu.logical;
9086 		}
9087 
9088 	      /* In both cases we want to cost both operands.  */
9089 	      *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9090 				 0, speed);
9091 	      *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9092 				 1, speed);
9093 
9094 	      return true;
9095 	    }
9096 	}
9097       return false;
9098 
9099     case NOT:
9100       x = XEXP (x, 0);
9101       op0 = aarch64_strip_shift (x);
9102 
9103       if (VECTOR_MODE_P (mode))
9104 	{
9105 	  /* Vector NOT.  */
9106 	  *cost += extra_cost->vect.alu;
9107 	  return false;
9108 	}
9109 
9110       /* MVN-shifted-reg.  */
9111       if (op0 != x)
9112         {
9113 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9114 
9115           if (speed)
9116             *cost += extra_cost->alu.log_shift;
9117 
9118           return true;
9119         }
9120       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9121          Handle the second form here, taking care that 'a' in the above can
9122          be a shift.  */
9123       else if (GET_CODE (op0) == XOR)
9124         {
9125           rtx newop0 = XEXP (op0, 0);
9126           rtx newop1 = XEXP (op0, 1);
9127           rtx op0_stripped = aarch64_strip_shift (newop0);
9128 
9129 	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9130 	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9131 
9132           if (speed)
9133             {
9134               if (op0_stripped != newop0)
9135                 *cost += extra_cost->alu.log_shift;
9136               else
9137                 *cost += extra_cost->alu.logical;
9138             }
9139 
9140           return true;
9141         }
9142       /* MVN.  */
9143       if (speed)
9144 	*cost += extra_cost->alu.logical;
9145 
9146       return false;
9147 
9148     case ZERO_EXTEND:
9149 
9150       op0 = XEXP (x, 0);
9151       /* If a value is written in SI mode, then zero extended to DI
9152 	 mode, the operation will in general be free as a write to
9153 	 a 'w' register implicitly zeroes the upper bits of an 'x'
9154 	 register.  However, if this is
9155 
9156 	   (set (reg) (zero_extend (reg)))
9157 
9158 	 we must cost the explicit register move.  */
9159       if (mode == DImode
9160 	  && GET_MODE (op0) == SImode
9161 	  && outer == SET)
9162 	{
9163 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9164 
9165 	/* If OP_COST is non-zero, then the cost of the zero extend
9166 	   is effectively the cost of the inner operation.  Otherwise
9167 	   we have a MOV instruction and we take the cost from the MOV
9168 	   itself.  This is true independently of whether we are
9169 	   optimizing for space or time.  */
9170 	  if (op_cost)
9171 	    *cost = op_cost;
9172 
9173 	  return true;
9174 	}
9175       else if (MEM_P (op0))
9176 	{
9177 	  /* All loads can zero extend to any size for free.  */
9178 	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9179 	  return true;
9180 	}
9181 
9182       op0 = aarch64_extend_bitfield_pattern_p (x);
9183       if (op0)
9184 	{
9185 	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9186 	  if (speed)
9187 	    *cost += extra_cost->alu.bfx;
9188 	  return true;
9189 	}
9190 
9191       if (speed)
9192 	{
9193 	  if (VECTOR_MODE_P (mode))
9194 	    {
9195 	      /* UMOV.  */
9196 	      *cost += extra_cost->vect.alu;
9197 	    }
9198 	  else
9199 	    {
9200 	      /* We generate an AND instead of UXTB/UXTH.  */
9201 	      *cost += extra_cost->alu.logical;
9202 	    }
9203 	}
9204       return false;
9205 
9206     case SIGN_EXTEND:
9207       if (MEM_P (XEXP (x, 0)))
9208 	{
9209 	  /* LDRSH.  */
9210 	  if (speed)
9211 	    {
9212 	      rtx address = XEXP (XEXP (x, 0), 0);
9213 	      *cost += extra_cost->ldst.load_sign_extend;
9214 
9215 	      *cost +=
9216 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
9217 						     0, speed));
9218 	    }
9219 	  return true;
9220 	}
9221 
9222       op0 = aarch64_extend_bitfield_pattern_p (x);
9223       if (op0)
9224 	{
9225 	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9226 	  if (speed)
9227 	    *cost += extra_cost->alu.bfx;
9228 	  return true;
9229 	}
9230 
9231       if (speed)
9232 	{
9233 	  if (VECTOR_MODE_P (mode))
9234 	    *cost += extra_cost->vect.alu;
9235 	  else
9236 	    *cost += extra_cost->alu.extend;
9237 	}
9238       return false;
9239 
9240     case ASHIFT:
9241       op0 = XEXP (x, 0);
9242       op1 = XEXP (x, 1);
9243 
9244       if (CONST_INT_P (op1))
9245         {
9246 	  if (speed)
9247 	    {
9248 	      if (VECTOR_MODE_P (mode))
9249 		{
9250 		  /* Vector shift (immediate).  */
9251 		  *cost += extra_cost->vect.alu;
9252 		}
9253 	      else
9254 		{
9255 		  /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
9256 		     aliases.  */
9257 		  *cost += extra_cost->alu.shift;
9258 		}
9259 	    }
9260 
9261           /* We can incorporate zero/sign extend for free.  */
9262           if (GET_CODE (op0) == ZERO_EXTEND
9263               || GET_CODE (op0) == SIGN_EXTEND)
9264             op0 = XEXP (op0, 0);
9265 
9266 	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9267           return true;
9268         }
9269       else
9270         {
9271 	  if (VECTOR_MODE_P (mode))
9272 	    {
9273 	      if (speed)
9274 		/* Vector shift (register).  */
9275 		*cost += extra_cost->vect.alu;
9276 	    }
9277 	  else
9278 	    {
9279 	      if (speed)
9280 		/* LSLV.  */
9281 		*cost += extra_cost->alu.shift_reg;
9282 
9283 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9284 		  && CONST_INT_P (XEXP (op1, 1))
9285 		  && known_eq (INTVAL (XEXP (op1, 1)),
9286 			       GET_MODE_BITSIZE (mode) - 1))
9287 		{
9288 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9289 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
9290 		     don't recurse into it.  */
9291 		  return true;
9292 		}
9293 	    }
9294 	  return false;  /* All arguments need to be in registers.  */
9295         }
9296 
9297     case ROTATE:
9298     case ROTATERT:
9299     case LSHIFTRT:
9300     case ASHIFTRT:
9301       op0 = XEXP (x, 0);
9302       op1 = XEXP (x, 1);
9303 
9304       if (CONST_INT_P (op1))
9305 	{
9306 	  /* ASR (immediate) and friends.  */
9307 	  if (speed)
9308 	    {
9309 	      if (VECTOR_MODE_P (mode))
9310 		*cost += extra_cost->vect.alu;
9311 	      else
9312 		*cost += extra_cost->alu.shift;
9313 	    }
9314 
9315 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9316 	  return true;
9317 	}
9318       else
9319 	{
9320 	  if (VECTOR_MODE_P (mode))
9321 	    {
9322 	      if (speed)
9323 		/* Vector shift (register).  */
9324 		*cost += extra_cost->vect.alu;
9325 	    }
9326 	  else
9327 	    {
9328 	      if (speed)
9329 		/* ASR (register) and friends.  */
9330 		*cost += extra_cost->alu.shift_reg;
9331 
9332 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9333 		  && CONST_INT_P (XEXP (op1, 1))
9334 		  && known_eq (INTVAL (XEXP (op1, 1)),
9335 			       GET_MODE_BITSIZE (mode) - 1))
9336 		{
9337 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9338 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
9339 		     don't recurse into it.  */
9340 		  return true;
9341 		}
9342 	    }
9343 	  return false;  /* All arguments need to be in registers.  */
9344 	}
9345 
9346     case SYMBOL_REF:
9347 
9348       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9349 	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9350 	{
9351 	  /* LDR.  */
9352 	  if (speed)
9353 	    *cost += extra_cost->ldst.load;
9354 	}
9355       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9356 	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9357 	{
9358 	  /* ADRP, followed by ADD.  */
9359 	  *cost += COSTS_N_INSNS (1);
9360 	  if (speed)
9361 	    *cost += 2 * extra_cost->alu.arith;
9362 	}
9363       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9364 	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9365 	{
9366 	  /* ADR.  */
9367 	  if (speed)
9368 	    *cost += extra_cost->alu.arith;
9369 	}
9370 
9371       if (flag_pic)
9372 	{
9373 	  /* One extra load instruction, after accessing the GOT.  */
9374 	  *cost += COSTS_N_INSNS (1);
9375 	  if (speed)
9376 	    *cost += extra_cost->ldst.load;
9377 	}
9378       return true;
9379 
9380     case HIGH:
9381     case LO_SUM:
9382       /* ADRP/ADD (immediate).  */
9383       if (speed)
9384 	*cost += extra_cost->alu.arith;
9385       return true;
9386 
9387     case ZERO_EXTRACT:
9388     case SIGN_EXTRACT:
9389       /* UBFX/SBFX.  */
9390       if (speed)
9391 	{
9392 	  if (VECTOR_MODE_P (mode))
9393 	    *cost += extra_cost->vect.alu;
9394 	  else
9395 	    *cost += extra_cost->alu.bfx;
9396 	}
9397 
9398       /* We can trust that the immediates used will be correct (there
9399 	 are no by-register forms), so we need only cost op0.  */
9400       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9401       return true;
9402 
9403     case MULT:
9404       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9405       /* aarch64_rtx_mult_cost always handles recursion to its
9406 	 operands.  */
9407       return true;
9408 
9409     case MOD:
9410     /* We can expand signed mod by a power of 2 using a NEGS, two parallel
9411        ANDs and a CSNEG.  Assume here that the cost of a CSNEG is the same
9412        as that of an unconditional negate.  This case should only ever be
9413        reached through the set_smod_pow2_cheap check in expmed.c.  */
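    /* For SImode x % 8 the expansion is roughly (register numbers are
       illustrative only):
	 negs	w1, w0
	 and	w0, w0, 7
	 and	w1, w1, 7
	 csneg	w0, w0, w1, mi  */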
9414       if (CONST_INT_P (XEXP (x, 1))
9415 	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9416 	  && (mode == SImode || mode == DImode))
9417 	{
9418 	  /* We expand to 4 instructions.  Reset the baseline.  */
9419 	  *cost = COSTS_N_INSNS (4);
9420 
9421 	  if (speed)
9422 	    *cost += 2 * extra_cost->alu.logical
9423 		     + 2 * extra_cost->alu.arith;
9424 
9425 	  return true;
9426 	}
9427 
9428     /* Fall-through.  */
9429     case UMOD:
9430       if (speed)
9431 	{
9432 	  /* Slightly prefer UMOD over SMOD.  */
9433 	  if (VECTOR_MODE_P (mode))
9434 	    *cost += extra_cost->vect.alu;
9435 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
9436 	    *cost += (extra_cost->mult[mode == DImode].add
9437 		      + extra_cost->mult[mode == DImode].idiv
9438 		      + (code == MOD ? 1 : 0));
9439 	}
9440       return false;  /* All arguments need to be in registers.  */
9441 
9442     case DIV:
9443     case UDIV:
9444     case SQRT:
9445       if (speed)
9446 	{
9447 	  if (VECTOR_MODE_P (mode))
9448 	    *cost += extra_cost->vect.alu;
9449 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
9450 	    /* There is no integer SQRT, so only DIV and UDIV can get
9451 	       here.  */
9452 	    *cost += (extra_cost->mult[mode == DImode].idiv
9453 		     /* Slightly prefer UDIV over SDIV.  */
9454 		     + (code == DIV ? 1 : 0));
9455 	  else
9456 	    *cost += extra_cost->fp[mode == DFmode].div;
9457 	}
9458       return false;  /* All arguments need to be in registers.  */
9459 
9460     case IF_THEN_ELSE:
9461       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9462 					 XEXP (x, 2), cost, speed);
9463 
9464     case EQ:
9465     case NE:
9466     case GT:
9467     case GTU:
9468     case LT:
9469     case LTU:
9470     case GE:
9471     case GEU:
9472     case LE:
9473     case LEU:
9474 
9475       return false; /* All arguments must be in registers.  */
9476 
9477     case FMA:
9478       op0 = XEXP (x, 0);
9479       op1 = XEXP (x, 1);
9480       op2 = XEXP (x, 2);
9481 
9482       if (speed)
9483 	{
9484 	  if (VECTOR_MODE_P (mode))
9485 	    *cost += extra_cost->vect.alu;
9486 	  else
9487 	    *cost += extra_cost->fp[mode == DFmode].fma;
9488 	}
9489 
9490       /* FMSUB, FNMADD, and FNMSUB are free.  */
9491       if (GET_CODE (op0) == NEG)
9492         op0 = XEXP (op0, 0);
9493 
9494       if (GET_CODE (op2) == NEG)
9495         op2 = XEXP (op2, 0);
9496 
9497       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9498 	 and the by-element operand as operand 0.  */
9499       if (GET_CODE (op1) == NEG)
9500         op1 = XEXP (op1, 0);
9501 
9502       /* Catch vector-by-element operations.  The by-element operand can
9503 	 either be (vec_duplicate (vec_select (x))) or just
9504 	 (vec_select (x)), depending on whether we are multiplying by
9505 	 a vector or a scalar.
9506 
9507 	 Canonicalization is not very good in these cases: FMA4 will put the
9508 	 by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
9509       if (GET_CODE (op0) == VEC_DUPLICATE)
9510 	op0 = XEXP (op0, 0);
9511       else if (GET_CODE (op1) == VEC_DUPLICATE)
9512 	op1 = XEXP (op1, 0);
9513 
9514       if (GET_CODE (op0) == VEC_SELECT)
9515 	op0 = XEXP (op0, 0);
9516       else if (GET_CODE (op1) == VEC_SELECT)
9517 	op1 = XEXP (op1, 0);
9518 
9519       /* If the remaining parameters are not registers,
9520          get the cost to put them into registers.  */
9521       *cost += rtx_cost (op0, mode, FMA, 0, speed);
9522       *cost += rtx_cost (op1, mode, FMA, 1, speed);
9523       *cost += rtx_cost (op2, mode, FMA, 2, speed);
9524       return true;
9525 
9526     case FLOAT:
9527     case UNSIGNED_FLOAT:
9528       if (speed)
9529 	*cost += extra_cost->fp[mode == DFmode].fromint;
9530       return false;
9531 
9532     case FLOAT_EXTEND:
9533       if (speed)
9534 	{
9535 	  if (VECTOR_MODE_P (mode))
9536 	    {
9537 	      /* Vector widen.  */
9538 	      *cost += extra_cost->vect.alu;
9539 	    }
9540 	  else
9541 	    *cost += extra_cost->fp[mode == DFmode].widen;
9542 	}
9543       return false;
9544 
9545     case FLOAT_TRUNCATE:
9546       if (speed)
9547 	{
9548 	  if (VECTOR_MODE_P (mode))
9549 	    {
9550 	      /* Vector narrow.  */
9551 	      *cost += extra_cost->vect.alu;
9552 	    }
9553 	  else
9554 	    *cost += extra_cost->fp[mode == DFmode].narrow;
9555 	}
9556       return false;
9557 
9558     case FIX:
9559     case UNSIGNED_FIX:
9560       x = XEXP (x, 0);
9561       /* Strip the rounding part.  They will all be implemented
9562          by the fcvt* family of instructions anyway.  */
9563       if (GET_CODE (x) == UNSPEC)
9564         {
9565           unsigned int uns_code = XINT (x, 1);
9566 
9567           if (uns_code == UNSPEC_FRINTA
9568               || uns_code == UNSPEC_FRINTM
9569               || uns_code == UNSPEC_FRINTN
9570               || uns_code == UNSPEC_FRINTP
9571               || uns_code == UNSPEC_FRINTZ)
9572             x = XVECEXP (x, 0, 0);
9573         }
9574 
9575       if (speed)
9576 	{
9577 	  if (VECTOR_MODE_P (mode))
9578 	    *cost += extra_cost->vect.alu;
9579 	  else
9580 	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9581 	}
9582 
9583       /* We can combine fmul by a power of 2 followed by a fcvt into a single
9584 	 fixed-point fcvt.  */
9585       if (GET_CODE (x) == MULT
9586 	  && ((VECTOR_MODE_P (mode)
9587 	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9588 	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9589 	{
9590 	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9591 			     0, speed);
9592 	  return true;
9593 	}
9594 
9595       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9596       return true;
9597 
9598     case ABS:
9599       if (VECTOR_MODE_P (mode))
9600 	{
9601 	  /* ABS (vector).  */
9602 	  if (speed)
9603 	    *cost += extra_cost->vect.alu;
9604 	}
9605       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9606 	{
9607 	  op0 = XEXP (x, 0);
9608 
9609 	  /* FABD, which is analogous to FADD.  */
9610 	  if (GET_CODE (op0) == MINUS)
9611 	    {
9612 	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9613 	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9614 	      if (speed)
9615 		*cost += extra_cost->fp[mode == DFmode].addsub;
9616 
9617 	      return true;
9618 	    }
9619 	  /* Simple FABS is analogous to FNEG.  */
9620 	  if (speed)
9621 	    *cost += extra_cost->fp[mode == DFmode].neg;
9622 	}
9623       else
9624 	{
9625 	  /* Integer ABS will either be split to
9626 	     two arithmetic instructions, or will be an ABS
9627 	     (scalar), which we don't model.  */
9628 	  *cost = COSTS_N_INSNS (2);
9629 	  if (speed)
9630 	    *cost += 2 * extra_cost->alu.arith;
9631 	}
9632       return false;
9633 
9634     case SMAX:
9635     case SMIN:
9636       if (speed)
9637 	{
9638 	  if (VECTOR_MODE_P (mode))
9639 	    *cost += extra_cost->vect.alu;
9640 	  else
9641 	    {
9642 	      /* FMAXNM/FMINNM/FMAX/FMIN.
9643 	         TODO: This may not be accurate for all implementations, but
9644 	         we do not model this in the cost tables.  */
9645 	      *cost += extra_cost->fp[mode == DFmode].addsub;
9646 	    }
9647 	}
9648       return false;
9649 
9650     case UNSPEC:
9651       /* The floating point round to integer frint* instructions.  */
9652       if (aarch64_frint_unspec_p (XINT (x, 1)))
9653         {
9654           if (speed)
9655             *cost += extra_cost->fp[mode == DFmode].roundint;
9656 
9657           return false;
9658         }
9659 
9660       if (XINT (x, 1) == UNSPEC_RBIT)
9661         {
9662           if (speed)
9663             *cost += extra_cost->alu.rev;
9664 
9665           return false;
9666         }
9667       break;
9668 
9669     case TRUNCATE:
9670 
9671       /* Decompose <su>muldi3_highpart.  */
9672       if (/* (truncate:DI  */
9673 	  mode == DImode
9674 	  /*   (lshiftrt:TI  */
9675           && GET_MODE (XEXP (x, 0)) == TImode
9676           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9677 	  /*      (mult:TI  */
9678           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9679 	  /*        (ANY_EXTEND:TI (reg:DI))
9680 	            (ANY_EXTEND:TI (reg:DI)))  */
9681           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9682                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9683               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9684                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9685           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9686           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9687 	  /*     (const_int 64)  */
9688           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9689           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9690         {
9691           /* UMULH/SMULH.  */
9692 	  if (speed)
9693 	    *cost += extra_cost->mult[mode == DImode].extend;
9694 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9695 			     mode, MULT, 0, speed);
9696 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9697 			     mode, MULT, 1, speed);
9698           return true;
9699         }
9700 
9701       /* Fall through.  */
9702     default:
9703       break;
9704     }
9705 
9706   if (dump_file
9707       && flag_aarch64_verbose_cost)
9708     fprintf (dump_file,
9709       "\nFailed to cost RTX.  Assuming default cost.\n");
9710 
9711   return true;
9712 }
9713 
9714 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9715    calculated for X.  This cost is stored in *COST.  Returns true
9716    if the total cost of X was calculated.  */
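   When dump_file is active and flag_aarch64_verbose_cost is set, the wrapper
   prints the RTX followed by a line in the format used below, e.g.

       Hot cost: 4 (final)

   where "Hot"/"Cold" reflects SPEED, the number is whatever *COST ended up
   as (4 is purely illustrative), and "final"/"partial" reflects the return
   value of aarch64_rtx_costs.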
9717 static bool
9718 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9719 		   int param, int *cost, bool speed)
9720 {
9721   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9722 
9723   if (dump_file
9724       && flag_aarch64_verbose_cost)
9725     {
9726       print_rtl_single (dump_file, x);
9727       fprintf (dump_file, "\n%s cost: %d (%s)\n",
9728 	       speed ? "Hot" : "Cold",
9729 	       *cost, result ? "final" : "partial");
9730     }
9731 
9732   return result;
9733 }
9734 
9735 static int
9736 aarch64_register_move_cost (machine_mode mode,
9737 			    reg_class_t from_i, reg_class_t to_i)
9738 {
9739   enum reg_class from = (enum reg_class) from_i;
9740   enum reg_class to = (enum reg_class) to_i;
9741   const struct cpu_regmove_cost *regmove_cost
9742     = aarch64_tune_params.regmove_cost;
9743 
9744   /* Tail-call address and pointer regs are equivalent to GENERAL_REGS.  */
9745   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9746     to = GENERAL_REGS;
9747 
9748   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9749     from = GENERAL_REGS;
9750 
9751   /* Moving between GPR and stack cost is the same as GP2GP.  */
9752   if ((from == GENERAL_REGS && to == STACK_REG)
9753       || (to == GENERAL_REGS && from == STACK_REG))
9754     return regmove_cost->GP2GP;
9755 
9756   /* To/From the stack register, we move via the gprs.  */
9757   if (to == STACK_REG || from == STACK_REG)
9758     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9759             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9760 
9761   if (known_eq (GET_MODE_SIZE (mode), 16))
9762     {
9763       /* 128-bit operations on general registers require 2 instructions.  */
9764       if (from == GENERAL_REGS && to == GENERAL_REGS)
9765 	return regmove_cost->GP2GP * 2;
9766       else if (from == GENERAL_REGS)
9767 	return regmove_cost->GP2FP * 2;
9768       else if (to == GENERAL_REGS)
9769 	return regmove_cost->FP2GP * 2;
9770 
9771       /* When AdvSIMD instructions are disabled it is not possible to move
9772 	 a 128-bit value directly between Q registers.  This is handled in
9773 	 secondary reload.  A general register is used as a scratch to move
9774 	 the upper DI value and the lower DI value is moved directly,
9775 	 hence the cost is the sum of three moves. */
9776       if (! TARGET_SIMD)
9777 	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9778 
9779       return regmove_cost->FP2FP;
9780     }
9781 
9782   if (from == GENERAL_REGS && to == GENERAL_REGS)
9783     return regmove_cost->GP2GP;
9784   else if (from == GENERAL_REGS)
9785     return regmove_cost->GP2FP;
9786   else if (to == GENERAL_REGS)
9787     return regmove_cost->FP2GP;
9788 
9789   return regmove_cost->FP2FP;
9790 }
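
/* A couple of worked examples for the cost logic above, expressed in terms
   of the regmove_cost fields rather than any particular CPU's numbers: a
   128-bit copy from a general register to an FP/SIMD register is costed as
   2 * GP2FP, and with !TARGET_SIMD a 128-bit FP-to-FP copy is costed as
   GP2FP + FP2GP + FP2FP, matching the three-move sequence described above.
   Moves to or from STACK_REG are costed recursively via GENERAL_REGS.  */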
9791 
9792 static int
9793 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9794 			  reg_class_t rclass ATTRIBUTE_UNUSED,
9795 			  bool in ATTRIBUTE_UNUSED)
9796 {
9797   return aarch64_tune_params.memmov_cost;
9798 }
9799 
9800 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9801    to optimize 1.0/sqrt.  */
9802 
9803 static bool
9804 use_rsqrt_p (machine_mode mode)
9805 {
9806   return (!flag_trapping_math
9807 	  && flag_unsafe_math_optimizations
9808 	  && ((aarch64_tune_params.approx_modes->recip_sqrt
9809 	       & AARCH64_APPROX_MODE (mode))
9810 	      || flag_mrecip_low_precision_sqrt));
9811 }
9812 
9813 /* Function to decide when to use the approximate reciprocal square root
9814    builtin.  */
9815 
9816 static tree
9817 aarch64_builtin_reciprocal (tree fndecl)
9818 {
9819   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9820 
9821   if (!use_rsqrt_p (mode))
9822     return NULL_TREE;
9823   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9824 }
9825 
9826 typedef rtx (*rsqrte_type) (rtx, rtx);
9827 
9828 /* Select reciprocal square root initial estimate insn depending on machine
9829    mode.  */
9830 
9831 static rsqrte_type
9832 get_rsqrte_type (machine_mode mode)
9833 {
9834   switch (mode)
9835   {
9836     case E_DFmode:   return gen_aarch64_rsqrtedf;
9837     case E_SFmode:   return gen_aarch64_rsqrtesf;
9838     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9839     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9840     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9841     default: gcc_unreachable ();
9842   }
9843 }
9844 
9845 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9846 
9847 /* Select reciprocal square root series step insn depending on machine mode.  */
9848 
9849 static rsqrts_type
9850 get_rsqrts_type (machine_mode mode)
9851 {
9852   switch (mode)
9853   {
9854     case E_DFmode:   return gen_aarch64_rsqrtsdf;
9855     case E_SFmode:   return gen_aarch64_rsqrtssf;
9856     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9857     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9858     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9859     default: gcc_unreachable ();
9860   }
9861 }
9862 
9863 /* Emit the instruction sequence to compute either the approximate square root
9864    or its approximate reciprocal, depending on the flag RECP, and return
9865    whether the sequence was emitted or not.  */
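
   In outline, the expansion relies on the usual Newton-Raphson iteration
   for the reciprocal square root: starting from the FRSQRTE estimate
   e0 ~= 1/sqrt(d), each step computes

       e_{n+1} = e_n * (3 - d * e_n * e_n) / 2

   where the (3 - a * b) / 2 part is what the FRSQRTS step instruction
   provides.  For the non-reciprocal case the result is multiplied by the
   argument at the end (sqrt(d) = d * 1/sqrt(d)), and a mask forces the
   result for a 0.0 argument to 0.0, since the intermediate 1/sqrt(0.0)
   would otherwise be infinite.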
9866 
9867 bool
9868 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9869 {
9870   machine_mode mode = GET_MODE (dst);
9871 
9872   if (GET_MODE_INNER (mode) == HFmode)
9873     {
9874       gcc_assert (!recp);
9875       return false;
9876     }
9877 
9878   if (!recp)
9879     {
9880       if (!(flag_mlow_precision_sqrt
9881 	    || (aarch64_tune_params.approx_modes->sqrt
9882 		& AARCH64_APPROX_MODE (mode))))
9883 	return false;
9884 
9885       if (flag_finite_math_only
9886 	  || flag_trapping_math
9887 	  || !flag_unsafe_math_optimizations
9888 	  || optimize_function_for_size_p (cfun))
9889 	return false;
9890     }
9891   else
9892     /* Caller assumes we cannot fail.  */
9893     gcc_assert (use_rsqrt_p (mode));
9894 
9895   machine_mode mmsk = mode_for_int_vector (mode).require ();
9896   rtx xmsk = gen_reg_rtx (mmsk);
9897   if (!recp)
9898     /* When calculating the approximate square root, compare the
9899        argument with 0.0 and create a mask.  */
9900     emit_insn (gen_rtx_SET (xmsk,
9901 			    gen_rtx_NEG (mmsk,
9902 					 gen_rtx_EQ (mmsk, src,
9903 						     CONST0_RTX (mode)))));
9904 
9905   /* Estimate the approximate reciprocal square root.  */
9906   rtx xdst = gen_reg_rtx (mode);
9907   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9908 
9909   /* Iterate over the series twice for SF and thrice for DF.  */
9910   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9911 
9912   /* Optionally iterate over the series one time fewer for faster
9913      performance, at the cost of some accuracy.  */
9914   if ((recp && flag_mrecip_low_precision_sqrt)
9915       || (!recp && flag_mlow_precision_sqrt))
9916     iterations--;
9917 
9918   /* Iterate over the series to calculate the approximate reciprocal square
9919      root.  */
9920   rtx x1 = gen_reg_rtx (mode);
9921   while (iterations--)
9922     {
9923       rtx x2 = gen_reg_rtx (mode);
9924       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9925 
9926       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9927 
9928       if (iterations > 0)
9929 	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9930     }
9931 
9932   if (!recp)
9933     {
9934       /* Qualify the approximate reciprocal square root when the argument is
9935 	 0.0 by squashing the intermediate result to 0.0.  */
9936       rtx xtmp = gen_reg_rtx (mmsk);
9937       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9938 					      gen_rtx_SUBREG (mmsk, xdst, 0)));
9939       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9940 
9941       /* Calculate the approximate square root.  */
9942       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9943     }
9944 
9945   /* Finalize the approximation.  */
9946   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9947 
9948   return true;
9949 }
9950 
9951 typedef rtx (*recpe_type) (rtx, rtx);
9952 
9953 /* Select reciprocal initial estimate insn depending on machine mode.  */
9954 
9955 static recpe_type
9956 get_recpe_type (machine_mode mode)
9957 {
9958   switch (mode)
9959   {
9960     case E_SFmode:   return (gen_aarch64_frecpesf);
9961     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9962     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9963     case E_DFmode:   return (gen_aarch64_frecpedf);
9964     case E_V2DFmode: return (gen_aarch64_frecpev2df);
9965     default:         gcc_unreachable ();
9966   }
9967 }
9968 
9969 typedef rtx (*recps_type) (rtx, rtx, rtx);
9970 
9971 /* Select reciprocal series step insn depending on machine mode.  */
9972 
9973 static recps_type
9974 get_recps_type (machine_mode mode)
9975 {
9976   switch (mode)
9977   {
9978     case E_SFmode:   return (gen_aarch64_frecpssf);
9979     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9980     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9981     case E_DFmode:   return (gen_aarch64_frecpsdf);
9982     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9983     default:         gcc_unreachable ();
9984   }
9985 }
9986 
9987 /* Emit the instruction sequence to compute the approximation for the division
9988    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
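
   In outline, this uses the standard Newton-Raphson iteration for the
   reciprocal: starting from the FRECPE estimate e0 ~= 1/den, each step
   computes

       e_{n+1} = e_n * (2 - den * e_n)

   where the (2 - a * b) part is what the FRECPS step instruction provides.
   The quotient is then obtained by multiplying the approximate reciprocal
   by NUM, which is skipped when NUM is 1.0.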
9989 
9990 bool
9991 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9992 {
9993   machine_mode mode = GET_MODE (quo);
9994 
9995   if (GET_MODE_INNER (mode) == HFmode)
9996     return false;
9997 
9998   bool use_approx_division_p = (flag_mlow_precision_div
9999 			        || (aarch64_tune_params.approx_modes->division
10000 				    & AARCH64_APPROX_MODE (mode)));
10001 
10002   if (!flag_finite_math_only
10003       || flag_trapping_math
10004       || !flag_unsafe_math_optimizations
10005       || optimize_function_for_size_p (cfun)
10006       || !use_approx_division_p)
10007     return false;
10008 
10009   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10010     return false;
10011 
10012   /* Estimate the approximate reciprocal.  */
10013   rtx xrcp = gen_reg_rtx (mode);
10014   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10015 
10016   /* Iterate over the series twice for SF and thrice for DF.  */
10017   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10018 
10019   /* Optionally iterate over the series one time fewer for faster
10020      performance, at the cost of some accuracy.  */
10021   if (flag_mlow_precision_div)
10022     iterations--;
10023 
10024   /* Iterate over the series to calculate the approximate reciprocal.  */
10025   rtx xtmp = gen_reg_rtx (mode);
10026   while (iterations--)
10027     {
10028       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10029 
10030       if (iterations > 0)
10031 	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10032     }
10033 
10034   if (num != CONST1_RTX (mode))
10035     {
10036       /* As the approximate reciprocal of DEN is already calculated, only
10037 	 calculate the approximate division when NUM is not 1.0.  */
10038       rtx xnum = force_reg (mode, num);
10039       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10040     }
10041 
10042   /* Finalize the approximation.  */
10043   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10044   return true;
10045 }
10046 
10047 /* Return the number of instructions that can be issued per cycle.  */
10048 static int
10049 aarch64_sched_issue_rate (void)
10050 {
10051   return aarch64_tune_params.issue_rate;
10052 }
10053 
10054 static int
10055 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10056 {
10057   int issue_rate = aarch64_sched_issue_rate ();
10058 
10059   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10060 }
10061 
10062 
10063 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10064    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
10065    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
10066 
10067 static int
10068 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10069 						    int ready_index)
10070 {
10071   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10072 }
10073 
10074 
10075 /* Vectorizer cost model target hooks.  */
10076 
10077 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
10078 static int
10079 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10080 				    tree vectype,
10081 				    int misalign ATTRIBUTE_UNUSED)
10082 {
10083   unsigned elements;
10084   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10085   bool fp = false;
10086 
10087   if (vectype != NULL)
10088     fp = FLOAT_TYPE_P (vectype);
10089 
10090   switch (type_of_cost)
10091     {
10092       case scalar_stmt:
10093 	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10094 
10095       case scalar_load:
10096 	return costs->scalar_load_cost;
10097 
10098       case scalar_store:
10099 	return costs->scalar_store_cost;
10100 
10101       case vector_stmt:
10102 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10103 
10104       case vector_load:
10105 	return costs->vec_align_load_cost;
10106 
10107       case vector_store:
10108 	return costs->vec_store_cost;
10109 
10110       case vec_to_scalar:
10111 	return costs->vec_to_scalar_cost;
10112 
10113       case scalar_to_vec:
10114 	return costs->scalar_to_vec_cost;
10115 
10116       case unaligned_load:
10117       case vector_gather_load:
10118 	return costs->vec_unalign_load_cost;
10119 
10120       case unaligned_store:
10121       case vector_scatter_store:
10122 	return costs->vec_unalign_store_cost;
10123 
10124       case cond_branch_taken:
10125 	return costs->cond_taken_branch_cost;
10126 
10127       case cond_branch_not_taken:
10128 	return costs->cond_not_taken_branch_cost;
10129 
10130       case vec_perm:
10131 	return costs->vec_permute_cost;
10132 
10133       case vec_promote_demote:
10134 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10135 
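      /* Building a vector from scalar elements is approximated below as one
	 operation per pair of elements plus one, e.g. a four-element vector
	 is costed as 4 / 2 + 1 = 3.  This is a rough heuristic rather than
	 a model of a specific instruction sequence.  */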
10136       case vec_construct:
10137 	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10138 	return elements / 2 + 1;
10139 
10140       default:
10141 	gcc_unreachable ();
10142     }
10143 }
10144 
10145 /* Implement targetm.vectorize.add_stmt_cost.  */
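/* The cost recorded for a statement is COUNT times the per-statement cost
   from aarch64_builtin_vectorization_cost, with statements in an inner loop
   weighted by the factor of 50 noted below; e.g. two vector loads in the
   body of an inner loop are recorded as 2 * 50 * vec_align_load_cost.  */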
10146 static unsigned
10147 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10148 		       struct _stmt_vec_info *stmt_info, int misalign,
10149 		       enum vect_cost_model_location where)
10150 {
10151   unsigned *cost = (unsigned *) data;
10152   unsigned retval = 0;
10153 
10154   if (flag_vect_cost_model)
10155     {
10156       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10157       int stmt_cost =
10158 	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10159 
10160       /* Statements in an inner loop relative to the loop being
10161 	 vectorized are weighted more heavily.  The value here is
10162 	 arbitrary and could potentially be improved with analysis.  */
10163       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10164 	count *= 50; /*  FIXME  */
10165 
10166       retval = (unsigned) (count * stmt_cost);
10167       cost[where] += retval;
10168     }
10169 
10170   return retval;
10171 }
10172 
10173 static void initialize_aarch64_code_model (struct gcc_options *);
10174 
10175 /* Parse the TO_PARSE string and put the architecture struct that it
10176    selects into RES and the architectural features into ISA_FLAGS.
10177    Return an aarch64_parse_opt_result describing the parse result.
10178    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
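
   For example, a TO_PARSE string of "armv8.2-a+crypto" is split at the
   first '+': "armv8.2-a" is looked up in all_architectures and the
   remaining "+crypto" is handed to aarch64_parse_extension to adjust the
   ISA flags.  (The particular names are only illustrative.)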
10179 
10180 static enum aarch64_parse_opt_result
10181 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10182 		    unsigned long *isa_flags)
10183 {
10184   char *ext;
10185   const struct processor *arch;
10186   char *str = (char *) alloca (strlen (to_parse) + 1);
10187   size_t len;
10188 
10189   strcpy (str, to_parse);
10190 
10191   ext = strchr (str, '+');
10192 
10193   if (ext != NULL)
10194     len = ext - str;
10195   else
10196     len = strlen (str);
10197 
10198   if (len == 0)
10199     return AARCH64_PARSE_MISSING_ARG;
10200 
10201 
10202   /* Loop through the list of supported ARCHes to find a match.  */
10203   for (arch = all_architectures; arch->name != NULL; arch++)
10204     {
10205       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10206 	{
10207 	  unsigned long isa_temp = arch->flags;
10208 
10209 	  if (ext != NULL)
10210 	    {
10211 	      /* TO_PARSE string contains at least one extension.  */
10212 	      enum aarch64_parse_opt_result ext_res
10213 		= aarch64_parse_extension (ext, &isa_temp);
10214 
10215 	      if (ext_res != AARCH64_PARSE_OK)
10216 		return ext_res;
10217 	    }
10218 	  /* Extension parsing was successful.  Confirm the result
10219 	     arch and ISA flags.  */
10220 	  *res = arch;
10221 	  *isa_flags = isa_temp;
10222 	  return AARCH64_PARSE_OK;
10223 	}
10224     }
10225 
10226   /* ARCH name not found in list.  */
10227   return AARCH64_PARSE_INVALID_ARG;
10228 }
10229 
10230 /* Parse the TO_PARSE string and put the cpu struct that it selects into
10231    RES and the architectural features into ISA_FLAGS.  Return an
10232    aarch64_parse_opt_result describing the parse result.  If there is an
10233    error parsing, RES and ISA_FLAGS are left unchanged.  */
10234 
10235 static enum aarch64_parse_opt_result
10236 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10237 		   unsigned long *isa_flags)
10238 {
10239   char *ext;
10240   const struct processor *cpu;
10241   char *str = (char *) alloca (strlen (to_parse) + 1);
10242   size_t len;
10243 
10244   strcpy (str, to_parse);
10245 
10246   ext = strchr (str, '+');
10247 
10248   if (ext != NULL)
10249     len = ext - str;
10250   else
10251     len = strlen (str);
10252 
10253   if (len == 0)
10254     return AARCH64_PARSE_MISSING_ARG;
10255 
10256 
10257   /* Loop through the list of supported CPUs to find a match.  */
10258   for (cpu = all_cores; cpu->name != NULL; cpu++)
10259     {
10260       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10261 	{
10262 	  unsigned long isa_temp = cpu->flags;
10263 
10264 
10265 	  if (ext != NULL)
10266 	    {
10267 	      /* TO_PARSE string contains at least one extension.  */
10268 	      enum aarch64_parse_opt_result ext_res
10269 		= aarch64_parse_extension (ext, &isa_temp);
10270 
10271 	      if (ext_res != AARCH64_PARSE_OK)
10272 		return ext_res;
10273 	    }
10274 	  /* Extension parsing was successful.  Confirm the result
10275 	     cpu and ISA flags.  */
10276 	  *res = cpu;
10277 	  *isa_flags = isa_temp;
10278 	  return AARCH64_PARSE_OK;
10279 	}
10280     }
10281 
10282   /* CPU name not found in list.  */
10283   return AARCH64_PARSE_INVALID_ARG;
10284 }
10285 
10286 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10287    Return an aarch64_parse_opt_result describing the parse result.
10288    If there is an error parsing, RES is left unchanged.  */
10289 
10290 static enum aarch64_parse_opt_result
10291 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10292 {
10293   const struct processor *cpu;
10294   char *str = (char *) alloca (strlen (to_parse) + 1);
10295 
10296   strcpy (str, to_parse);
10297 
10298   /* Loop through the list of supported CPUs to find a match.  */
10299   for (cpu = all_cores; cpu->name != NULL; cpu++)
10300     {
10301       if (strcmp (cpu->name, str) == 0)
10302 	{
10303 	  *res = cpu;
10304 	  return AARCH64_PARSE_OK;
10305 	}
10306     }
10307 
10308   /* CPU name not found in list.  */
10309   return AARCH64_PARSE_INVALID_ARG;
10310 }
10311 
10312 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10313    described in FLAG.  If it is, return the index bit for that flag.
10314    If not, report an error mentioning OPTION_NAME and return zero.  */
10315 
10316 static unsigned int
10317 aarch64_parse_one_option_token (const char *token,
10318 				size_t length,
10319 				const struct aarch64_flag_desc *flag,
10320 				const char *option_name)
10321 {
10322   for (; flag->name != NULL; flag++)
10323     {
10324       if (length == strlen (flag->name)
10325 	  && !strncmp (flag->name, token, length))
10326 	return flag->flag;
10327     }
10328 
10329   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10330   return 0;
10331 }
10332 
10333 /* Parse OPTION which is a comma-separated list of flags to enable.
10334    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10335    default state we inherit from the CPU tuning structures.  OPTION_NAME
10336    gives the top-level option we are parsing in the -moverride string,
10337    for use in error messages.  */
10338 
10339 static unsigned int
10340 aarch64_parse_boolean_options (const char *option,
10341 			       const struct aarch64_flag_desc *flags,
10342 			       unsigned int initial_state,
10343 			       const char *option_name)
10344 {
10345   const char separator = '.';
10346   const char* specs = option;
10347   const char* ntoken = option;
10348   unsigned int found_flags = initial_state;
10349 
10350   while ((ntoken = strchr (specs, separator)))
10351     {
10352       size_t token_length = ntoken - specs;
10353       unsigned token_ops = aarch64_parse_one_option_token (specs,
10354 							   token_length,
10355 							   flags,
10356 							   option_name);
10357       /* If we find "none" (or, for simplicity's sake, an error) anywhere
10358 	 in the token stream, reset the supported operations.  So:
10359 
10360 	   adrp+add.cmp+branch.none.adrp+add
10361 
10362 	   would have the result of turning on only adrp+add fusion.  */
10363       if (!token_ops)
10364 	found_flags = 0;
10365 
10366       found_flags |= token_ops;
10367       specs = ++ntoken;
10368     }
10369 
10370   /* A trailing separator (or an empty string) leaves no final token; report an error.  */
10371   if (!(*specs))
10372     {
10373       error ("%s string ill-formed\n", option_name);
10374       return 0;
10375     }
10376 
10377   /* We still have one more token to parse.  */
10378   size_t token_length = strlen (specs);
10379   unsigned token_ops = aarch64_parse_one_option_token (specs,
10380 						       token_length,
10381 						       flags,
10382 						       option_name);
10383   if (!token_ops)
10384     found_flags = 0;
10385 
10386   found_flags |= token_ops;
10387   return found_flags;
10388 }
10389 
10390 /* Support for overriding instruction fusion.  */
10391 
10392 static void
10393 aarch64_parse_fuse_string (const char *fuse_string,
10394 			    struct tune_params *tune)
10395 {
10396   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10397 						     aarch64_fusible_pairs,
10398 						     tune->fusible_ops,
10399 						     "fuse=");
10400 }
10401 
10402 /* Support for overriding other tuning flags.  */
10403 
10404 static void
10405 aarch64_parse_tune_string (const char *tune_string,
10406 			    struct tune_params *tune)
10407 {
10408   tune->extra_tuning_flags
10409     = aarch64_parse_boolean_options (tune_string,
10410 				     aarch64_tuning_flags,
10411 				     tune->extra_tuning_flags,
10412 				     "tune=");
10413 }
10414 
10415 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10416    we understand.  If it is, extract the option string and hand it off to
10417    the appropriate function.  */
10418 
10419 void
10420 aarch64_parse_one_override_token (const char* token,
10421 				  size_t length,
10422 				  struct tune_params *tune)
10423 {
10424   const struct aarch64_tuning_override_function *fn
10425     = aarch64_tuning_override_functions;
10426 
10427   const char *option_part = strchr (token, '=');
10428   if (!option_part)
10429     {
10430       error ("tuning string missing in option (%s)", token);
10431       return;
10432     }
10433 
10434   /* Get the length of the option name.  */
10435   length = option_part - token;
10436   /* Skip the '=' to get to the option string.  */
10437   option_part++;
10438 
10439   for (; fn->name != NULL; fn++)
10440     {
10441       if (!strncmp (fn->name, token, length))
10442 	{
10443 	  fn->parse_override (option_part, tune);
10444 	  return;
10445 	}
10446     }
10447 
10448   error ("unknown tuning option (%s)", token);
10449   return;
10450 }
10451 
10452 /* Validate the TLS size and clamp it to the maximum the code model supports.  */
10453 
10454 static void
10455 initialize_aarch64_tls_size (struct gcc_options *opts)
10456 {
10457   if (aarch64_tls_size == 0)
10458     aarch64_tls_size = 24;
10459 
10460   switch (opts->x_aarch64_cmodel_var)
10461     {
10462     case AARCH64_CMODEL_TINY:
10463       /* Both the default and the maximum TLS size allowed under tiny are 1M,
10464 	 which needs two instructions to address, so we clamp the size to 24.  */
10465       if (aarch64_tls_size > 24)
10466 	aarch64_tls_size = 24;
10467       break;
10468     case AARCH64_CMODEL_SMALL:
10469       /* The maximum TLS size allowed under small is 4G.  */
10470       if (aarch64_tls_size > 32)
10471 	aarch64_tls_size = 32;
10472       break;
10473     case AARCH64_CMODEL_LARGE:
10474       /* The maximum TLS size allowed under large is 16E.
10475 	 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now.  */
10476       if (aarch64_tls_size > 48)
10477 	aarch64_tls_size = 48;
10478       break;
10479     default:
10480       gcc_unreachable ();
10481     }
10482 
10483   return;
10484 }
10485 
10486 /* Parse STRING looking for options in the format:
10487      string	:: option:string
10488      option	:: name=substring
10489      name	:: {a-z}
10490      substring	:: defined by option.  */
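
   For example, an -moverride string of the form

     fuse=adrp+add.cmp+branch:tune=<flag>

   is split on ':' into the options "fuse=..." and "tune=...", each of
   which is then dispatched by aarch64_parse_one_override_token; the valid
   "tune=" values come from aarch64_tuning_flags and are elided here.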
10491 
10492 static void
10493 aarch64_parse_override_string (const char* input_string,
10494 			       struct tune_params* tune)
10495 {
10496   const char separator = ':';
10497   size_t string_length = strlen (input_string) + 1;
10498   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10499   char *string = string_root;
10500   strncpy (string, input_string, string_length);
10501   string[string_length - 1] = '\0';
10502 
10503   char* ntoken = string;
10504 
10505   while ((ntoken = strchr (string, separator)))
10506     {
10507       size_t token_length = ntoken - string;
10508       /* Make this substring look like a string.  */
10509       *ntoken = '\0';
10510       aarch64_parse_one_override_token (string, token_length, tune);
10511       string = ++ntoken;
10512     }
10513 
10514   /* One last option to parse.  */
10515   aarch64_parse_one_override_token (string, strlen (string), tune);
10516   free (string_root);
10517 }
10518 
10519 
10520 static void
10521 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10522 {
10523   /* PR 70044: We have to be careful about being called multiple times for the
10524      same function.  This means all changes should be repeatable.  */
10525 
10526   /* If the frame pointer is enabled, set it to a special value that behaves
10527      similar to frame pointer omission.  If we don't do this all leaf functions
10528      will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10529      If flag_omit_frame_pointer has this special value, we must force the
10530      frame pointer if not in a leaf function.  We also need to force it in a
10531      leaf function if flag_omit_frame_pointer is not set or if LR is used.  */
10532   if (opts->x_flag_omit_frame_pointer == 0)
10533     opts->x_flag_omit_frame_pointer = 2;
10534 
10535   /* If not optimizing for size, set the default
10536      alignment to what the target wants.  */
10537   if (!opts->x_optimize_size)
10538     {
10539       if (opts->x_align_loops <= 0)
10540 	opts->x_align_loops = aarch64_tune_params.loop_align;
10541       if (opts->x_align_jumps <= 0)
10542 	opts->x_align_jumps = aarch64_tune_params.jump_align;
10543       if (opts->x_align_functions <= 0)
10544 	opts->x_align_functions = aarch64_tune_params.function_align;
10545     }
10546 
10547   /* We default to no pc-relative literal loads.  */
10548 
10549   aarch64_pcrelative_literal_loads = false;
10550 
10551   /* If -mpc-relative-literal-loads is set on the command line, this
10552      implies that the user asked for PC relative literal loads.  */
10553   if (opts->x_pcrelative_literal_loads == 1)
10554     aarch64_pcrelative_literal_loads = true;
10555 
10556   /* In the tiny memory model it makes no sense to disallow PC relative
10557      literal pool loads.  */
10558   if (aarch64_cmodel == AARCH64_CMODEL_TINY
10559       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10560     aarch64_pcrelative_literal_loads = true;
10561 
10562   /* When enabling the lower precision Newton series for the square root, also
10563      enable it for the reciprocal square root, since the latter is an
10564      intermediary step for the former.  */
10565   if (flag_mlow_precision_sqrt)
10566     flag_mrecip_low_precision_sqrt = true;
10567 }
10568 
10569 /* 'Unpack' the internal tuning structs and update the options
10570     in OPTS.  The caller must have set up selected_tune and selected_arch
10571     as all the other target-specific codegen decisions are
10572     derived from them.  */
10573 
10574 void
10575 aarch64_override_options_internal (struct gcc_options *opts)
10576 {
10577   aarch64_tune_flags = selected_tune->flags;
10578   aarch64_tune = selected_tune->sched_core;
10579   /* Make a copy of the tuning parameters attached to the core, which
10580      we may later overwrite.  */
10581   aarch64_tune_params = *(selected_tune->tune);
10582   aarch64_architecture_version = selected_arch->architecture_version;
10583 
10584   if (opts->x_aarch64_override_tune_string)
10585     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10586 				  &aarch64_tune_params);
10587 
10588   /* This target defaults to strict volatile bitfields.  */
10589   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10590     opts->x_flag_strict_volatile_bitfields = 1;
10591 
10592   initialize_aarch64_code_model (opts);
10593   initialize_aarch64_tls_size (opts);
10594 
10595   int queue_depth = 0;
10596   switch (aarch64_tune_params.autoprefetcher_model)
10597     {
10598       case tune_params::AUTOPREFETCHER_OFF:
10599 	queue_depth = -1;
10600 	break;
10601       case tune_params::AUTOPREFETCHER_WEAK:
10602 	queue_depth = 0;
10603 	break;
10604       case tune_params::AUTOPREFETCHER_STRONG:
10605 	queue_depth = max_insn_queue_index + 1;
10606 	break;
10607       default:
10608 	gcc_unreachable ();
10609     }
10610 
10611   /* We don't mind passing in global_options_set here as we don't use
10612      the *options_set structs anyway.  */
10613   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10614 			 queue_depth,
10615 			 opts->x_param_values,
10616 			 global_options_set.x_param_values);
10617 
10618   /* Set up parameters to be used in prefetching algorithm.  Do not
10619      override the defaults unless we are tuning for a core we have
10620      researched values for.  */
10621   if (aarch64_tune_params.prefetch->num_slots > 0)
10622     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10623 			   aarch64_tune_params.prefetch->num_slots,
10624 			   opts->x_param_values,
10625 			   global_options_set.x_param_values);
10626   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10627     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10628 			   aarch64_tune_params.prefetch->l1_cache_size,
10629 			   opts->x_param_values,
10630 			   global_options_set.x_param_values);
10631   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10632     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10633 			   aarch64_tune_params.prefetch->l1_cache_line_size,
10634 			   opts->x_param_values,
10635 			   global_options_set.x_param_values);
10636   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10637     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10638 			   aarch64_tune_params.prefetch->l2_cache_size,
10639 			   opts->x_param_values,
10640 			   global_options_set.x_param_values);
10641 
10642   /* Use the alternative scheduling-pressure algorithm by default.  */
10643   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10644 			 opts->x_param_values,
10645 			 global_options_set.x_param_values);
10646 
10647   /* Enable sw prefetching at specified optimization level for
10648      CPUS that have prefetch.  Lower optimization level threshold by 1
10649      when profiling is enabled.  */
10650   if (opts->x_flag_prefetch_loop_arrays < 0
10651       && !opts->x_optimize_size
10652       && aarch64_tune_params.prefetch->default_opt_level >= 0
10653       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10654     opts->x_flag_prefetch_loop_arrays = 1;
10655 
10656   aarch64_override_options_after_change_1 (opts);
10657 }
10658 
10659 /* Print a hint with a suggestion for a core or architecture name that
10660    most closely resembles what the user passed in STR.  ARCH is true if
10661    the user is asking for an architecture name.  ARCH is false if the user
10662    is asking for a core name.  */
10663 
10664 static void
10665 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10666 {
10667   auto_vec<const char *> candidates;
10668   const struct processor *entry = arch ? all_architectures : all_cores;
10669   for (; entry->name != NULL; entry++)
10670     candidates.safe_push (entry->name);
10671 
10672 #ifdef HAVE_LOCAL_CPU_DETECT
10673   /* Also add "native" as a possible value.  */
10674   if (arch)
10675     candidates.safe_push ("native");
10676 #endif
10677 
10678   char *s;
10679   const char *hint = candidates_list_and_hint (str, s, candidates);
10680   if (hint)
10681     inform (input_location, "valid arguments are: %s;"
10682 			     " did you mean %qs?", s, hint);
10683   else
10684     inform (input_location, "valid arguments are: %s", s);
10685 
10686   XDELETEVEC (s);
10687 }
10688 
10689 /* Print a hint with a suggestion for a core name that most closely resembles
10690    what the user passed in STR.  */
10691 
10692 inline static void
10693 aarch64_print_hint_for_core (const char *str)
10694 {
10695   aarch64_print_hint_for_core_or_arch (str, false);
10696 }
10697 
10698 /* Print a hint with a suggestion for an architecture name that most closely
10699    resembles what the user passed in STR.  */
10700 
10701 inline static void
10702 aarch64_print_hint_for_arch (const char *str)
10703 {
10704   aarch64_print_hint_for_core_or_arch (str, true);
10705 }
10706 
10707 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
10708    specified in STR and throw errors if appropriate.  Put the results, if
10709    they are valid, in RES and ISA_FLAGS.  Return whether the option is
10710    valid.  */
10711 
10712 static bool
10713 aarch64_validate_mcpu (const char *str, const struct processor **res,
10714 		       unsigned long *isa_flags)
10715 {
10716   enum aarch64_parse_opt_result parse_res
10717     = aarch64_parse_cpu (str, res, isa_flags);
10718 
10719   if (parse_res == AARCH64_PARSE_OK)
10720     return true;
10721 
10722   switch (parse_res)
10723     {
10724       case AARCH64_PARSE_MISSING_ARG:
10725 	error ("missing cpu name in %<-mcpu=%s%>", str);
10726 	break;
10727       case AARCH64_PARSE_INVALID_ARG:
10728 	error ("unknown value %qs for -mcpu", str);
10729 	aarch64_print_hint_for_core (str);
10730 	break;
10731       case AARCH64_PARSE_INVALID_FEATURE:
10732 	error ("invalid feature modifier in %<-mcpu=%s%>", str);
10733 	break;
10734       default:
10735 	gcc_unreachable ();
10736     }
10737 
10738   return false;
10739 }
10740 
10741 /* Validate a command-line -march option.  Parse the arch and extensions
10742    (if any) specified in STR and throw errors if appropriate.  Put the
10743    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
10744    option is valid.  */
10745 
10746 static bool
10747 aarch64_validate_march (const char *str, const struct processor **res,
10748 			 unsigned long *isa_flags)
10749 {
10750   enum aarch64_parse_opt_result parse_res
10751     = aarch64_parse_arch (str, res, isa_flags);
10752 
10753   if (parse_res == AARCH64_PARSE_OK)
10754     return true;
10755 
10756   switch (parse_res)
10757     {
10758       case AARCH64_PARSE_MISSING_ARG:
10759 	error ("missing arch name in %<-march=%s%>", str);
10760 	break;
10761       case AARCH64_PARSE_INVALID_ARG:
10762 	error ("unknown value %qs for -march", str);
10763 	aarch64_print_hint_for_arch (str);
10764 	break;
10765       case AARCH64_PARSE_INVALID_FEATURE:
10766 	error ("invalid feature modifier in %<-march=%s%>", str);
10767 	break;
10768       default:
10769 	gcc_unreachable ();
10770     }
10771 
10772   return false;
10773 }
10774 
10775 /* Validate a command-line -mtune option.  Parse the cpu
10776    specified in STR and throw errors if appropriate.  Put the
10777    result, if it is valid, in RES.  Return whether the option is
10778    valid.  */
10779 
10780 static bool
10781 aarch64_validate_mtune (const char *str, const struct processor **res)
10782 {
10783   enum aarch64_parse_opt_result parse_res
10784     = aarch64_parse_tune (str, res);
10785 
10786   if (parse_res == AARCH64_PARSE_OK)
10787     return true;
10788 
10789   switch (parse_res)
10790     {
10791       case AARCH64_PARSE_MISSING_ARG:
10792 	error ("missing cpu name in %<-mtune=%s%>", str);
10793 	break;
10794       case AARCH64_PARSE_INVALID_ARG:
10795 	error ("unknown value %qs for -mtune", str);
10796 	aarch64_print_hint_for_core (str);
10797 	break;
10798       default:
10799 	gcc_unreachable ();
10800     }
10801   return false;
10802 }
10803 
10804 /* Return the CPU corresponding to the enum CPU.
10805    If it doesn't specify a cpu, return the default.  */
10806 
10807 static const struct processor *
10808 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10809 {
10810   if (cpu != aarch64_none)
10811     return &all_cores[cpu];
10812 
10813   /* The & 0x3f is to extract the bottom 6 bits that encode the
10814      default cpu as selected by the --with-cpu GCC configure option
10815      in config.gcc.
10816      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10817      flags mechanism should be reworked to make it more sane.  */
10818   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10819 }
10820 
10821 /* Return the architecture corresponding to the enum ARCH.
10822    If it doesn't specify a valid architecture, return the default.  */
10823 
10824 static const struct processor *
10825 aarch64_get_arch (enum aarch64_arch arch)
10826 {
10827   if (arch != aarch64_no_arch)
10828     return &all_architectures[arch];
10829 
10830   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10831 
10832   return &all_architectures[cpu->arch];
10833 }
10834 
10835 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
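/* VG is the vector length in 64-bit granules, so for a fixed length it is
   simply bits / 64: e.g. -msve-vector-bits=256 gives a VG of 4, while
   "scalable" (and, for now, 128) gives the runtime-variable value
   2 + 2 * n encoded as poly_uint16 (2, 2).  */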
10836 
10837 static poly_uint16
10838 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10839 {
10840   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10841      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10842      deciding which .md file patterns to use and when deciding whether
10843      something is a legitimate address or constant.  */
10844   if (value == SVE_SCALABLE || value == SVE_128)
10845     return poly_uint16 (2, 2);
10846   else
10847     return (int) value / 64;
10848 }
10849 
10850 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
10851    and is used to parse the -m{cpu,tune,arch} strings and set up the initial
10852    tuning structs.  In particular it must set selected_tune and
10853    aarch64_isa_flags that define the available ISA features and tuning
10854    decisions.  It must also set selected_arch as this will be used to
10855    output the .arch asm tags for each function.  */
10856 
10857 static void
10858 aarch64_override_options (void)
10859 {
10860   unsigned long cpu_isa = 0;
10861   unsigned long arch_isa = 0;
10862   aarch64_isa_flags = 0;
10863 
10864   bool valid_cpu = true;
10865   bool valid_tune = true;
10866   bool valid_arch = true;
10867 
10868   selected_cpu = NULL;
10869   selected_arch = NULL;
10870   selected_tune = NULL;
10871 
10872   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10873      If either of -march or -mtune is given, they override their
10874      respective component of -mcpu.  */
10875   if (aarch64_cpu_string)
10876     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10877 					&cpu_isa);
10878 
10879   if (aarch64_arch_string)
10880     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10881 					  &arch_isa);
10882 
10883   if (aarch64_tune_string)
10884     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10885 
10886   /* If the user did not specify a processor, choose the default
10887      one for them.  This will be the CPU set during configuration using
10888      --with-cpu, otherwise it is "generic".  */
10889   if (!selected_cpu)
10890     {
10891       if (selected_arch)
10892 	{
10893 	  selected_cpu = &all_cores[selected_arch->ident];
10894 	  aarch64_isa_flags = arch_isa;
10895 	  explicit_arch = selected_arch->arch;
10896 	}
10897       else
10898 	{
10899 	  /* Get default configure-time CPU.  */
10900 	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10901 	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10902 	}
10903 
10904       if (selected_tune)
10905 	explicit_tune_core = selected_tune->ident;
10906     }
10907   /* If both -mcpu and -march are specified check that they are architecturally
10908      compatible, warn if they're not and prefer the -march ISA flags.  */
10909   else if (selected_arch)
10910     {
10911       if (selected_arch->arch != selected_cpu->arch)
10912 	{
10913 	  warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10914 		       all_architectures[selected_cpu->arch].name,
10915 		       selected_arch->name);
10916 	}
10917       aarch64_isa_flags = arch_isa;
10918       explicit_arch = selected_arch->arch;
10919       explicit_tune_core = selected_tune ? selected_tune->ident
10920 					  : selected_cpu->ident;
10921     }
10922   else
10923     {
10924       /* -mcpu but no -march.  */
10925       aarch64_isa_flags = cpu_isa;
10926       explicit_tune_core = selected_tune ? selected_tune->ident
10927 					  : selected_cpu->ident;
10928       gcc_assert (selected_cpu);
10929       selected_arch = &all_architectures[selected_cpu->arch];
10930       explicit_arch = selected_arch->arch;
10931     }
10932 
10933   /* Set the arch as well, as we will need it when outputting
10934      the .arch directive in assembly.  */
10935   if (!selected_arch)
10936     {
10937       gcc_assert (selected_cpu);
10938       selected_arch = &all_architectures[selected_cpu->arch];
10939     }
10940 
10941   if (!selected_tune)
10942     selected_tune = selected_cpu;
10943 
10944 #ifndef HAVE_AS_MABI_OPTION
10945   /* The compiler may have been configured with 2.23.* binutils, which does
10946      not have support for ILP32.  */
10947   if (TARGET_ILP32)
10948     error ("assembler does not support -mabi=ilp32");
10949 #endif
10950 
10951   /* Convert -msve-vector-bits to a VG count.  */
10952   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10953 
10954   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10955     sorry ("return address signing is only supported for -mabi=lp64");
10956 
10957   /* Make sure we properly set up the explicit options.  */
10958   if ((aarch64_cpu_string && valid_cpu)
10959        || (aarch64_tune_string && valid_tune))
10960     gcc_assert (explicit_tune_core != aarch64_none);
10961 
10962   if ((aarch64_cpu_string && valid_cpu)
10963        || (aarch64_arch_string && valid_arch))
10964     gcc_assert (explicit_arch != aarch64_no_arch);
10965 
10966   aarch64_override_options_internal (&global_options);
10967 
10968   /* Save these options as the default ones in case we push and pop them later
10969      while processing functions with potential target attributes.  */
10970   target_option_default_node = target_option_current_node
10971       = build_target_option_node (&global_options);
10972 }
10973 
10974 /* Implement targetm.override_options_after_change.  */
10975 
10976 static void
10977 aarch64_override_options_after_change (void)
10978 {
10979   aarch64_override_options_after_change_1 (&global_options);
10980 }
10981 
10982 static struct machine_function *
10983 aarch64_init_machine_status (void)
10984 {
10985   struct machine_function *machine;
10986   machine = ggc_cleared_alloc<machine_function> ();
10987   return machine;
10988 }
10989 
10990 void
10991 aarch64_init_expanders (void)
10992 {
10993   init_machine_status = aarch64_init_machine_status;
10994 }
10995 
10996 /* A checking mechanism for the implementation of the various code models.  */
10997 static void
10998 initialize_aarch64_code_model (struct gcc_options *opts)
10999 {
11000    if (opts->x_flag_pic)
11001      {
11002        switch (opts->x_aarch64_cmodel_var)
11003 	 {
11004 	 case AARCH64_CMODEL_TINY:
11005 	   aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11006 	   break;
11007 	 case AARCH64_CMODEL_SMALL:
11008 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11009 	   aarch64_cmodel = (flag_pic == 2
11010 			     ? AARCH64_CMODEL_SMALL_PIC
11011 			     : AARCH64_CMODEL_SMALL_SPIC);
11012 #else
11013 	   aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11014 #endif
11015 	   break;
11016 	 case AARCH64_CMODEL_LARGE:
11017 	   sorry ("code model %qs with -f%s", "large",
11018 		  opts->x_flag_pic > 1 ? "PIC" : "pic");
11019 	   break;
11020 	 default:
11021 	   gcc_unreachable ();
11022 	 }
11023      }
11024    else
11025      aarch64_cmodel = opts->x_aarch64_cmodel_var;
11026 }
11027 
11028 /* Implement TARGET_OPTION_SAVE.  */
11029 
11030 static void
11031 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11032 {
11033   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11034 }
11035 
11036 /* Implement TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
11037    using the information saved in PTR.  */
11038 
11039 static void
11040 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11041 {
11042   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11043   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11044   opts->x_explicit_arch = ptr->x_explicit_arch;
11045   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11046   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11047 
11048   aarch64_override_options_internal (opts);
11049 }
11050 
11051 /* Implement TARGET_OPTION_PRINT.  */
11052 
11053 static void
11054 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11055 {
11056   const struct processor *cpu
11057     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11058   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11059   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11060   std::string extension
11061     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11062 
11063   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11064   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11065 	   arch->name, extension.c_str ());
11066 }
11067 
11068 static GTY(()) tree aarch64_previous_fndecl;
11069 
11070 void
11071 aarch64_reset_previous_fndecl (void)
11072 {
11073   aarch64_previous_fndecl = NULL;
11074 }
11075 
11076 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11077    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11078    make sure optab availability predicates are recomputed when necessary.  */
11079 
11080 void
11081 aarch64_save_restore_target_globals (tree new_tree)
11082 {
11083   if (TREE_TARGET_GLOBALS (new_tree))
11084     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11085   else if (new_tree == target_option_default_node)
11086     restore_target_globals (&default_target_globals);
11087   else
11088     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11089 }
11090 
11091 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
11092    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11093    of the function, if such exists.  This function may be called multiple
11094    times on a single function so use aarch64_previous_fndecl to avoid
11095    setting up identical state.  */
11096 
11097 static void
11098 aarch64_set_current_function (tree fndecl)
11099 {
11100   if (!fndecl || fndecl == aarch64_previous_fndecl)
11101     return;
11102 
11103   tree old_tree = (aarch64_previous_fndecl
11104 		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11105 		   : NULL_TREE);
11106 
11107   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11108 
11109   /* If current function has no attributes but the previous one did,
11110      use the default node.  */
11111   if (!new_tree && old_tree)
11112     new_tree = target_option_default_node;
11113 
11114   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
11115      the default have been handled by aarch64_save_restore_target_globals from
11116      aarch64_pragma_target_parse.  */
11117   if (old_tree == new_tree)
11118     return;
11119 
11120   aarch64_previous_fndecl = fndecl;
11121 
11122   /* First set the target options.  */
11123   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11124 
11125   aarch64_save_restore_target_globals (new_tree);
11126 }
11127 
11128 /* Enum describing the various ways we can handle attributes.
11129    In many cases we can reuse the generic option handling machinery.  */
11130 
11131 enum aarch64_attr_opt_type
11132 {
11133   aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
11134   aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
11135   aarch64_attr_enum,	/* Attribute sets an enum variable.  */
11136   aarch64_attr_custom	/* Attribute requires a custom handling function.  */
11137 };
11138 
11139 /* All the information needed to handle a target attribute.
11140    NAME is the name of the attribute.
11141    ATTR_TYPE specifies the type of behavior of the attribute as described
11142    in the definition of enum aarch64_attr_opt_type.
11143    ALLOW_NEG is true if the attribute supports a "no-" form.
11144    HANDLER is the function that takes the attribute string as an argument.
11145    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11146    OPT_NUM is the enum specifying the option that the attribute modifies.
11147    This is needed for attributes that mirror the behavior of a command-line
11148    option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11149    aarch64_attr_enum.  */
11150 
11151 struct aarch64_attribute_info
11152 {
11153   const char *name;
11154   enum aarch64_attr_opt_type attr_type;
11155   bool allow_neg;
11156   bool (*handler) (const char *);
11157   enum opt_code opt_num;
11158 };
11159 
11160 /* Handle the ARCH_STR argument to the arch= target attribute.  */
11161 
11162 static bool
11163 aarch64_handle_attr_arch (const char *str)
11164 {
11165   const struct processor *tmp_arch = NULL;
11166   enum aarch64_parse_opt_result parse_res
11167     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11168 
11169   if (parse_res == AARCH64_PARSE_OK)
11170     {
11171       gcc_assert (tmp_arch);
11172       selected_arch = tmp_arch;
11173       explicit_arch = selected_arch->arch;
11174       return true;
11175     }
11176 
11177   switch (parse_res)
11178     {
11179       case AARCH64_PARSE_MISSING_ARG:
11180 	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11181 	break;
11182       case AARCH64_PARSE_INVALID_ARG:
11183 	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11184 	aarch64_print_hint_for_arch (str);
11185 	break;
11186       case AARCH64_PARSE_INVALID_FEATURE:
11187 	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11188 	break;
11189       default:
11190 	gcc_unreachable ();
11191     }
11192 
11193   return false;
11194 }
11195 
11196 /* Handle the argument CPU_STR to the cpu= target attribute.  */
11197 
11198 static bool
11199 aarch64_handle_attr_cpu (const char *str)
11200 {
11201   const struct processor *tmp_cpu = NULL;
11202   enum aarch64_parse_opt_result parse_res
11203     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11204 
11205   if (parse_res == AARCH64_PARSE_OK)
11206     {
11207       gcc_assert (tmp_cpu);
11208       selected_tune = tmp_cpu;
11209       explicit_tune_core = selected_tune->ident;
11210 
11211       selected_arch = &all_architectures[tmp_cpu->arch];
11212       explicit_arch = selected_arch->arch;
11213       return true;
11214     }
11215 
11216   switch (parse_res)
11217     {
11218       case AARCH64_PARSE_MISSING_ARG:
11219 	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11220 	break;
11221       case AARCH64_PARSE_INVALID_ARG:
11222 	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11223 	aarch64_print_hint_for_core (str);
11224 	break;
11225       case AARCH64_PARSE_INVALID_FEATURE:
11226 	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11227 	break;
11228       default:
11229 	gcc_unreachable ();
11230     }
11231 
11232   return false;
11233 }
11234 
11235 /* Handle the argument STR to the tune= target attribute.  */
11236 
11237 static bool
11238 aarch64_handle_attr_tune (const char *str)
11239 {
11240   const struct processor *tmp_tune = NULL;
11241   enum aarch64_parse_opt_result parse_res
11242     = aarch64_parse_tune (str, &tmp_tune);
11243 
11244   if (parse_res == AARCH64_PARSE_OK)
11245     {
11246       gcc_assert (tmp_tune);
11247       selected_tune = tmp_tune;
11248       explicit_tune_core = selected_tune->ident;
11249       return true;
11250     }
11251 
11252   switch (parse_res)
11253     {
11254       case AARCH64_PARSE_INVALID_ARG:
11255 	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11256 	aarch64_print_hint_for_core (str);
11257 	break;
11258       default:
11259 	gcc_unreachable ();
11260     }
11261 
11262   return false;
11263 }
11264 
11265 /* Parse an architecture extensions target attribute string specified in STR.
11266    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
11267    if successful.  Update aarch64_isa_flags to reflect the ISA features
11268    modified.  */
11269 
11270 static bool
11271 aarch64_handle_attr_isa_flags (char *str)
11272 {
11273   enum aarch64_parse_opt_result parse_res;
11274   unsigned long isa_flags = aarch64_isa_flags;
11275 
11276   /* We allow "+nothing" at the beginning to clear out all architectural
11277      features if the user wants to handpick specific features.  */
11278   if (strncmp ("+nothing", str, 8) == 0)
11279     {
11280       isa_flags = 0;
11281       str += 8;
11282     }
11283 
11284   parse_res = aarch64_parse_extension (str, &isa_flags);
11285 
11286   if (parse_res == AARCH64_PARSE_OK)
11287     {
11288       aarch64_isa_flags = isa_flags;
11289       return true;
11290     }
11291 
11292   switch (parse_res)
11293     {
11294       case AARCH64_PARSE_MISSING_ARG:
11295 	error ("missing value in %<target()%> pragma or attribute");
11296 	break;
11297 
11298       case AARCH64_PARSE_INVALID_FEATURE:
11299 	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11300 	break;
11301 
11302       default:
11303 	gcc_unreachable ();
11304     }
11305 
11306   return false;
11307 }
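
/* For illustration (not an exhaustive description of the extension syntax):
   given the attribute string "+nothing+fp", the "+nothing" prefix above
   resets ISA_FLAGS to zero and aarch64_parse_extension then enables only the
   floating-point feature, so for example TARGET_SIMD would be false within
   the affected function.  */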
11308 
11309 /* The target attributes that we support.  On top of these we also support just
11310    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
11311    handled explicitly in aarch64_process_one_target_attr.  */
11312 
11313 static const struct aarch64_attribute_info aarch64_attributes[] =
11314 {
11315   { "general-regs-only", aarch64_attr_mask, false, NULL,
11316      OPT_mgeneral_regs_only },
11317   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11318      OPT_mfix_cortex_a53_835769 },
11319   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11320      OPT_mfix_cortex_a53_843419 },
11321   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11322   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11323   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11324      OPT_momit_leaf_frame_pointer },
11325   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11326   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11327      OPT_march_ },
11328   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11329   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11330      OPT_mtune_ },
11331   { "sign-return-address", aarch64_attr_enum, false, NULL,
11332      OPT_msign_return_address_ },
11333   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11334 };
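
/* A few illustrative uses of the entries above (the architecture and model
   names below are only examples):

     __attribute__ ((target ("arch=armv8-a+crc")))           "arch" entry
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  negated bool
     __attribute__ ((target ("cmodel=small")))                enum entry

   A bare extension string such as "+crc" does not go through this table;
   it is recognized by its leading '+' in aarch64_process_one_target_attr
   and handed directly to aarch64_handle_attr_isa_flags.  */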
11335 
11336 /* Parse ARG_STR which contains the definition of one target attribute.
11337    Show appropriate errors if any or return true if the attribute is valid.  */
11338 
11339 static bool
11340 aarch64_process_one_target_attr (char *arg_str)
11341 {
11342   bool invert = false;
11343 
11344   size_t len = strlen (arg_str);
11345 
11346   if (len == 0)
11347     {
11348       error ("malformed %<target()%> pragma or attribute");
11349       return false;
11350     }
11351 
11352   char *str_to_check = (char *) alloca (len + 1);
11353   strcpy (str_to_check, arg_str);
11354 
11355   /* Skip leading whitespace.  */
11356   while (*str_to_check == ' ' || *str_to_check == '\t')
11357     str_to_check++;
11358 
11359   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11360      It is easier to detect and handle it explicitly here rather than going
11361      through the machinery for the rest of the target attributes in this
11362      function.  */
11363   if (*str_to_check == '+')
11364     return aarch64_handle_attr_isa_flags (str_to_check);
11365 
11366   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11367     {
11368       invert = true;
11369       str_to_check += 3;
11370     }
11371   char *arg = strchr (str_to_check, '=');
11372 
11373   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11374      and point ARG to "foo".  */
11375   if (arg)
11376     {
11377       *arg = '\0';
11378       arg++;
11379     }
11380   const struct aarch64_attribute_info *p_attr;
11381   bool found = false;
11382   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11383     {
11384       /* If the names don't match up, or the user has given an argument
11385 	 to an attribute that doesn't accept one, or didn't give an argument
11386 	 to an attribute that expects one, fail to match.  */
11387       if (strcmp (str_to_check, p_attr->name) != 0)
11388 	continue;
11389 
11390       found = true;
11391       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11392 			      || p_attr->attr_type == aarch64_attr_enum;
11393 
11394       if (attr_need_arg_p ^ (arg != NULL))
11395 	{
11396 	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11397 	  return false;
11398 	}
11399 
11400       /* If the name matches but the attribute does not allow "no-" versions
11401 	 then we can't match.  */
11402       if (invert && !p_attr->allow_neg)
11403 	{
11404 	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11405 	  return false;
11406 	}
11407 
11408       switch (p_attr->attr_type)
11409 	{
11410 	/* Has a custom handler registered.
11411 	   For example, cpu=, arch=, tune=.  */
11412 	  case aarch64_attr_custom:
11413 	    gcc_assert (p_attr->handler);
11414 	    if (!p_attr->handler (arg))
11415 	      return false;
11416 	    break;
11417 
11418 	  /* Either set or unset a boolean option.  */
11419 	  case aarch64_attr_bool:
11420 	    {
11421 	      struct cl_decoded_option decoded;
11422 
11423 	      generate_option (p_attr->opt_num, NULL, !invert,
11424 			       CL_TARGET, &decoded);
11425 	      aarch64_handle_option (&global_options, &global_options_set,
11426 				      &decoded, input_location);
11427 	      break;
11428 	    }
11429 	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
11430 	     should know what mask to apply given the option number.  */
11431 	  case aarch64_attr_mask:
11432 	    {
11433 	      struct cl_decoded_option decoded;
11434 	      /* We only need to specify the option number.
11435 		 aarch64_handle_option will know which mask to apply.  */
11436 	      decoded.opt_index = p_attr->opt_num;
11437 	      decoded.value = !invert;
11438 	      aarch64_handle_option (&global_options, &global_options_set,
11439 				      &decoded, input_location);
11440 	      break;
11441 	    }
11442 	  /* Use the option setting machinery to set an option to an enum.  */
11443 	  case aarch64_attr_enum:
11444 	    {
11445 	      gcc_assert (arg);
11446 	      bool valid;
11447 	      int value;
11448 	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11449 					      &value, CL_TARGET);
11450 	      if (valid)
11451 		{
11452 		  set_option (&global_options, NULL, p_attr->opt_num, value,
11453 			      NULL, DK_UNSPECIFIED, input_location,
11454 			      global_dc);
11455 		}
11456 	      else
11457 		{
11458 		  error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11459 		}
11460 	      break;
11461 	    }
11462 	  default:
11463 	    gcc_unreachable ();
11464 	}
11465     }
11466 
11467   /* If we reached here we either have found an attribute and validated
11468      it or didn't match any.  If we matched an attribute but its arguments
11469      were malformed we will have returned false already.  */
11470   return found;
11471 }
11472 
11473 /* Count how many times the character C appears in
11474    NULL-terminated string STR.  */
11475 
11476 static unsigned int
11477 num_occurences_in_str (char c, char *str)
11478 {
11479   unsigned int res = 0;
11480   while (*str != '\0')
11481     {
11482       if (*str == c)
11483 	res++;
11484 
11485       str++;
11486     }
11487 
11488   return res;
11489 }
11490 
11491 /* Parse the tree in ARGS that contains the target attribute information
11492    and update the global target options space.  */
11493 
11494 bool
11495 aarch64_process_target_attr (tree args)
11496 {
11497   if (TREE_CODE (args) == TREE_LIST)
11498     {
11499       do
11500 	{
11501 	  tree head = TREE_VALUE (args);
11502 	  if (head)
11503 	    {
11504 	      if (!aarch64_process_target_attr (head))
11505 		return false;
11506 	    }
11507 	  args = TREE_CHAIN (args);
11508 	} while (args);
11509 
11510       return true;
11511     }
11512 
11513   if (TREE_CODE (args) != STRING_CST)
11514     {
11515       error ("attribute %<target%> argument not a string");
11516       return false;
11517     }
11518 
11519   size_t len = strlen (TREE_STRING_POINTER (args));
11520   char *str_to_check = (char *) alloca (len + 1);
11521   strcpy (str_to_check, TREE_STRING_POINTER (args));
11522 
11523   if (len == 0)
11524     {
11525       error ("malformed %<target()%> pragma or attribute");
11526       return false;
11527     }
11528 
11529   /* Used to catch empty spaces between commas i.e.
11530      attribute ((target ("attr1,,attr2"))).  */
11531   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11532 
11533   /* Handle multiple target attributes separated by ','.  */
11534   char *token = strtok (str_to_check, ",");
11535 
11536   unsigned int num_attrs = 0;
11537   while (token)
11538     {
11539       num_attrs++;
11540       if (!aarch64_process_one_target_attr (token))
11541 	{
11542 	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11543 	  return false;
11544 	}
11545 
11546       token = strtok (NULL, ",");
11547     }
11548 
11549   if (num_attrs != num_commas + 1)
11550     {
11551       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11552       return false;
11553     }
11554 
11555   return true;
11556 }
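
/* A worked example of the comma check above: for a string of the form
   "fix-cortex-a53-835769,,strict-align" strtok yields only two (individually
   valid) tokens, but the string contains two commas, so NUM_ATTRS (2) differs
   from NUM_COMMAS + 1 (3) and the empty entry between the commas is diagnosed
   instead of being silently ignored.  */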
11557 
11558 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
11559    process attribute ((target ("..."))).  */
11560 
11561 static bool
11562 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11563 {
11564   struct cl_target_option cur_target;
11565   bool ret;
11566   tree old_optimize;
11567   tree new_target, new_optimize;
11568   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11569 
11570   /* If what we're processing is the current pragma string then the
11571      target option node is already stored in target_option_current_node
11572      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
11573      having to re-parse the string.  This is especially useful to keep
11574      arm_neon.h compile times down since that header contains a lot
11575      of intrinsics enclosed in pragmas.  */
11576   if (!existing_target && args == current_target_pragma)
11577     {
11578       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11579       return true;
11580     }
11581   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11582 
11583   old_optimize = build_optimization_node (&global_options);
11584   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11585 
11586   /* If the function changed the optimization levels as well as setting
11587      target options, start with the optimizations specified.  */
11588   if (func_optimize && func_optimize != old_optimize)
11589     cl_optimization_restore (&global_options,
11590 			     TREE_OPTIMIZATION (func_optimize));
11591 
11592   /* Save the current target options to restore at the end.  */
11593   cl_target_option_save (&cur_target, &global_options);
11594 
11595   /* If fndecl already has some target attributes applied to it, unpack
11596      them so that we add this attribute on top of them, rather than
11597      overwriting them.  */
11598   if (existing_target)
11599     {
11600       struct cl_target_option *existing_options
11601 	= TREE_TARGET_OPTION (existing_target);
11602 
11603       if (existing_options)
11604 	cl_target_option_restore (&global_options, existing_options);
11605     }
11606   else
11607     cl_target_option_restore (&global_options,
11608 			TREE_TARGET_OPTION (target_option_current_node));
11609 
11610   ret = aarch64_process_target_attr (args);
11611 
11612   /* Set up any additional state.  */
11613   if (ret)
11614     {
11615       aarch64_override_options_internal (&global_options);
11616       /* Initialize SIMD builtins if we haven't already.
11617 	 Set current_target_pragma to NULL for the duration so that
11618 	 the builtin initialization code doesn't try to tag the functions
11619 	 being built with the attributes specified by any current pragma, thus
11620 	 going into an infinite recursion.  */
11621       if (TARGET_SIMD)
11622 	{
11623 	  tree saved_current_target_pragma = current_target_pragma;
11624 	  current_target_pragma = NULL;
11625 	  aarch64_init_simd_builtins ();
11626 	  current_target_pragma = saved_current_target_pragma;
11627 	}
11628       new_target = build_target_option_node (&global_options);
11629     }
11630   else
11631     new_target = NULL;
11632 
11633   new_optimize = build_optimization_node (&global_options);
11634 
11635   if (fndecl && ret)
11636     {
11637       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11638 
11639       if (old_optimize != new_optimize)
11640 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11641     }
11642 
11643   cl_target_option_restore (&global_options, &cur_target);
11644 
11645   if (old_optimize != new_optimize)
11646     cl_optimization_restore (&global_options,
11647 			     TREE_OPTIMIZATION (old_optimize));
11648   return ret;
11649 }
11650 
11651 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
11652    tri-bool options (yes, no, don't care) and the default value is
11653    DEF, determine whether to reject inlining.  */
11654 
11655 static bool
11656 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11657 				     int dont_care, int def)
11658 {
11659   /* If the callee doesn't care, always allow inlining.  */
11660   if (callee == dont_care)
11661     return true;
11662 
11663   /* If the caller doesn't care, always allow inlining.  */
11664   if (caller == dont_care)
11665     return true;
11666 
11667   /* Otherwise, allow inlining if either the callee and caller values
11668      agree, or if the callee is using the default value.  */
11669   return (callee == caller || callee == def);
11670 }
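
/* Illustration for the errata-workaround checks below, where DONT_CARE is 2
   and DEF is the configured default: a callee with no explicit setting
   (callee == 2) never blocks inlining, whereas a callee that explicitly
   enables a workaround (callee == 1) is rejected by a caller that explicitly
   disables it (caller == 0) unless 1 also happens to be the default.  */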
11671 
11672 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
11673    to inline CALLEE into CALLER based on target-specific info.
11674    Make sure that the caller and callee have compatible architectural
11675    features.  Then go through the other possible target attributes
11676    and see if they can block inlining.  Try not to reject always_inline
11677    callees unless they are incompatible architecturally.  */
11678 
11679 static bool
11680 aarch64_can_inline_p (tree caller, tree callee)
11681 {
11682   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11683   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11684 
11685   /* If callee has no option attributes, then it is ok to inline.  */
11686   if (!callee_tree)
11687     return true;
11688 
11689   struct cl_target_option *caller_opts
11690 	= TREE_TARGET_OPTION (caller_tree ? caller_tree
11691 					   : target_option_default_node);
11692 
11693   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11694 
11695 
11696   /* Callee's ISA flags should be a subset of the caller's.  */
11697   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11698        != callee_opts->x_aarch64_isa_flags)
11699     return false;
11700 
11701   /* Allow non-strict aligned functions inlining into strict
11702      aligned ones.  */
11703   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11704        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11705       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11706 	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11707     return false;
11708 
11709   bool always_inline = lookup_attribute ("always_inline",
11710 					  DECL_ATTRIBUTES (callee));
11711 
11712   /* If the architectural features match up and the callee is always_inline
11713      then the other attributes don't matter.  */
11714   if (always_inline)
11715     return true;
11716 
11717   if (caller_opts->x_aarch64_cmodel_var
11718       != callee_opts->x_aarch64_cmodel_var)
11719     return false;
11720 
11721   if (caller_opts->x_aarch64_tls_dialect
11722       != callee_opts->x_aarch64_tls_dialect)
11723     return false;
11724 
11725   /* Honour explicit requests to workaround errata.  */
11726   if (!aarch64_tribools_ok_for_inlining_p (
11727 	  caller_opts->x_aarch64_fix_a53_err835769,
11728 	  callee_opts->x_aarch64_fix_a53_err835769,
11729 	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11730     return false;
11731 
11732   if (!aarch64_tribools_ok_for_inlining_p (
11733 	  caller_opts->x_aarch64_fix_a53_err843419,
11734 	  callee_opts->x_aarch64_fix_a53_err843419,
11735 	  2, TARGET_FIX_ERR_A53_843419))
11736     return false;
11737 
11738   /* If the user explicitly specified -momit-leaf-frame-pointer for the
11739      caller and callee and they don't match up, reject inlining.  */
11740   if (!aarch64_tribools_ok_for_inlining_p (
11741 	  caller_opts->x_flag_omit_leaf_frame_pointer,
11742 	  callee_opts->x_flag_omit_leaf_frame_pointer,
11743 	  2, 1))
11744     return false;
11745 
11746   /* If the callee has specific tuning overrides, respect them.  */
11747   if (callee_opts->x_aarch64_override_tune_string != NULL
11748       && caller_opts->x_aarch64_override_tune_string == NULL)
11749     return false;
11750 
11751   /* If the user specified tuning override strings for the
11752      caller and callee and they don't match up, reject inlining.
11753      We just do a string compare here, we don't analyze the meaning
11754      of the string, as it would be too costly for little gain.  */
11755   if (callee_opts->x_aarch64_override_tune_string
11756       && caller_opts->x_aarch64_override_tune_string
11757       && (strcmp (callee_opts->x_aarch64_override_tune_string,
11758 		  caller_opts->x_aarch64_override_tune_string) != 0))
11759     return false;
11760 
11761   return true;
11762 }
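
/* For example (the extensions named here are only illustrative): a caller
   built with +simd+crc may inline a callee limited to +simd, since the
   callee's ISA flags form a subset of the caller's, whereas inlining a +crc
   callee into a plain +simd caller fails the subset test above.  */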
11763 
11764 /* Return true if SYMBOL_REF X binds locally.  */
11765 
11766 static bool
11767 aarch64_symbol_binds_local_p (const_rtx x)
11768 {
11769   return (SYMBOL_REF_DECL (x)
11770 	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11771 	  : SYMBOL_REF_LOCAL_P (x));
11772 }
11773 
11774 /* Return true if SYMBOL_REF X is thread local.  */
11775 static bool
11776 aarch64_tls_symbol_p (rtx x)
11777 {
11778   if (! TARGET_HAVE_TLS)
11779     return false;
11780 
11781   if (GET_CODE (x) != SYMBOL_REF)
11782     return false;
11783 
11784   return SYMBOL_REF_TLS_MODEL (x) != 0;
11785 }
11786 
11787 /* Classify a TLS symbol into one of the TLS kinds.  */
11788 enum aarch64_symbol_type
11789 aarch64_classify_tls_symbol (rtx x)
11790 {
11791   enum tls_model tls_kind = tls_symbolic_operand_type (x);
11792 
11793   switch (tls_kind)
11794     {
11795     case TLS_MODEL_GLOBAL_DYNAMIC:
11796     case TLS_MODEL_LOCAL_DYNAMIC:
11797       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11798 
11799     case TLS_MODEL_INITIAL_EXEC:
11800       switch (aarch64_cmodel)
11801 	{
11802 	case AARCH64_CMODEL_TINY:
11803 	case AARCH64_CMODEL_TINY_PIC:
11804 	  return SYMBOL_TINY_TLSIE;
11805 	default:
11806 	  return SYMBOL_SMALL_TLSIE;
11807 	}
11808 
11809     case TLS_MODEL_LOCAL_EXEC:
11810       if (aarch64_tls_size == 12)
11811 	return SYMBOL_TLSLE12;
11812       else if (aarch64_tls_size == 24)
11813 	return SYMBOL_TLSLE24;
11814       else if (aarch64_tls_size == 32)
11815 	return SYMBOL_TLSLE32;
11816       else if (aarch64_tls_size == 48)
11817 	return SYMBOL_TLSLE48;
11818       else
11819 	gcc_unreachable ();
11820 
11821     case TLS_MODEL_EMULATED:
11822     case TLS_MODEL_NONE:
11823       return SYMBOL_FORCE_TO_MEM;
11824 
11825     default:
11826       gcc_unreachable ();
11827     }
11828 }
11829 
11830 /* Return the correct method for accessing X + OFFSET, where X is either
11831    a SYMBOL_REF or LABEL_REF.  */
11832 
11833 enum aarch64_symbol_type
11834 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11835 {
11836   if (GET_CODE (x) == LABEL_REF)
11837     {
11838       switch (aarch64_cmodel)
11839 	{
11840 	case AARCH64_CMODEL_LARGE:
11841 	  return SYMBOL_FORCE_TO_MEM;
11842 
11843 	case AARCH64_CMODEL_TINY_PIC:
11844 	case AARCH64_CMODEL_TINY:
11845 	  return SYMBOL_TINY_ABSOLUTE;
11846 
11847 	case AARCH64_CMODEL_SMALL_SPIC:
11848 	case AARCH64_CMODEL_SMALL_PIC:
11849 	case AARCH64_CMODEL_SMALL:
11850 	  return SYMBOL_SMALL_ABSOLUTE;
11851 
11852 	default:
11853 	  gcc_unreachable ();
11854 	}
11855     }
11856 
11857   if (GET_CODE (x) == SYMBOL_REF)
11858     {
11859       if (aarch64_tls_symbol_p (x))
11860 	return aarch64_classify_tls_symbol (x);
11861 
11862       switch (aarch64_cmodel)
11863 	{
11864 	case AARCH64_CMODEL_TINY:
11865 	  /* When we retrieve symbol + offset address, we have to make sure
11866 	     the offset does not cause overflow of the final address.  But
11867 	     we have no way of knowing the address of symbol at compile time
11868 	     so we can't accurately say if the distance between the PC and
11869 	     symbol + offset is outside the addressable range of +/-1M in the
11870 	     TINY code model.  So we rely on images not being greater than
11871 	     1M and cap the offset at 1M and anything beyond 1M will have to
11872 	     be loaded using an alternative mechanism.  Furthermore if the
11873 	     symbol is a weak reference to something that isn't known to
11874 	     resolve to a symbol in this module, then force to memory.  */
11875 	  if ((SYMBOL_REF_WEAK (x)
11876 	       && !aarch64_symbol_binds_local_p (x))
11877 	      || !IN_RANGE (offset, -1048575, 1048575))
11878 	    return SYMBOL_FORCE_TO_MEM;
11879 	  return SYMBOL_TINY_ABSOLUTE;
11880 
11881 	case AARCH64_CMODEL_SMALL:
11882 	  /* Same reasoning as the tiny code model, but the offset cap here is
11883 	     4G.  */
11884 	  if ((SYMBOL_REF_WEAK (x)
11885 	       && !aarch64_symbol_binds_local_p (x))
11886 	      || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11887 			    HOST_WIDE_INT_C (4294967264)))
11888 	    return SYMBOL_FORCE_TO_MEM;
11889 	  return SYMBOL_SMALL_ABSOLUTE;
11890 
11891 	case AARCH64_CMODEL_TINY_PIC:
11892 	  if (!aarch64_symbol_binds_local_p (x))
11893 	    return SYMBOL_TINY_GOT;
11894 	  return SYMBOL_TINY_ABSOLUTE;
11895 
11896 	case AARCH64_CMODEL_SMALL_SPIC:
11897 	case AARCH64_CMODEL_SMALL_PIC:
11898 	  if (!aarch64_symbol_binds_local_p (x))
11899 	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11900 		    ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11901 	  return SYMBOL_SMALL_ABSOLUTE;
11902 
11903 	case AARCH64_CMODEL_LARGE:
11904 	  /* This is alright even in PIC code as the constant
11905 	     pool reference is always PC relative and within
11906 	     the same translation unit.  */
11907 	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11908 	    return SYMBOL_SMALL_ABSOLUTE;
11909 	  else
11910 	    return SYMBOL_FORCE_TO_MEM;
11911 
11912 	default:
11913 	  gcc_unreachable ();
11914 	}
11915     }
11916 
11917   /* By default push everything into the constant pool.  */
11918   return SYMBOL_FORCE_TO_MEM;
11919 }
11920 
11921 bool
11922 aarch64_constant_address_p (rtx x)
11923 {
11924   return (CONSTANT_P (x) && memory_address_p (DImode, x));
11925 }
11926 
11927 bool
11928 aarch64_legitimate_pic_operand_p (rtx x)
11929 {
11930   if (GET_CODE (x) == SYMBOL_REF
11931       || (GET_CODE (x) == CONST
11932 	  && GET_CODE (XEXP (x, 0)) == PLUS
11933 	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11934      return false;
11935 
11936   return true;
11937 }
11938 
11939 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
11940    that should be rematerialized rather than spilled.  */
11941 
11942 static bool
11943 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11944 {
11945   /* Support CSE and rematerialization of common constants.  */
11946   if (CONST_INT_P (x)
11947       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11948       || GET_CODE (x) == CONST_VECTOR)
11949     return true;
11950 
11951   /* Do not allow vector struct mode constants for Advanced SIMD.
11952      We could support 0 and -1 easily, but they need support in
11953      aarch64-simd.md.  */
11954   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11955   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11956     return false;
11957 
11958   /* Only accept variable-length vector constants if they can be
11959      handled directly.
11960 
11961      ??? It would be possible to handle rematerialization of other
11962      constants via secondary reloads.  */
11963   if (vec_flags & VEC_ANY_SVE)
11964     return aarch64_simd_valid_immediate (x, NULL);
11965 
11966   if (GET_CODE (x) == HIGH)
11967     x = XEXP (x, 0);
11968 
11969   /* Accept polynomial constants that can be calculated by using the
11970      destination of a move as the sole temporary.  Constants that
11971      require a second temporary cannot be rematerialized (they can't be
11972      forced to memory and also aren't legitimate constants).  */
11973   poly_int64 offset;
11974   if (poly_int_rtx_p (x, &offset))
11975     return aarch64_offset_temporaries (false, offset) <= 1;
11976 
11977   /* If an offset is being added to something else, we need to allow the
11978      base to be moved into the destination register, meaning that there
11979      are no free temporaries for the offset.  */
11980   x = strip_offset (x, &offset);
11981   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11982     return false;
11983 
11984   /* Do not allow const (plus (anchor_symbol, const_int)).  */
11985   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11986     return false;
11987 
11988   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
11989      so spilling them is better than rematerialization.  */
11990   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11991     return true;
11992 
11993   /* Label references are always constant.  */
11994   if (GET_CODE (x) == LABEL_REF)
11995     return true;
11996 
11997   return false;
11998 }
11999 
12000 rtx
12001 aarch64_load_tp (rtx target)
12002 {
12003   if (!target
12004       || GET_MODE (target) != Pmode
12005       || !register_operand (target, Pmode))
12006     target = gen_reg_rtx (Pmode);
12007 
12008   /* Can return in any reg.  */
12009   emit_insn (gen_aarch64_load_tp_hard (target));
12010   return target;
12011 }
12012 
12013 /* On AAPCS systems, this is the "struct __va_list".  */
12014 static GTY(()) tree va_list_type;
12015 
12016 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12017    Return the type to use as __builtin_va_list.
12018 
12019    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12020 
12021    struct __va_list
12022    {
12023      void *__stack;
12024      void *__gr_top;
12025      void *__vr_top;
12026      int   __gr_offs;
12027      int   __vr_offs;
12028    };  */
12029 
12030 static tree
12031 aarch64_build_builtin_va_list (void)
12032 {
12033   tree va_list_name;
12034   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12035 
12036   /* Create the type.  */
12037   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12038   /* Give it the required name.  */
12039   va_list_name = build_decl (BUILTINS_LOCATION,
12040 			     TYPE_DECL,
12041 			     get_identifier ("__va_list"),
12042 			     va_list_type);
12043   DECL_ARTIFICIAL (va_list_name) = 1;
12044   TYPE_NAME (va_list_type) = va_list_name;
12045   TYPE_STUB_DECL (va_list_type) = va_list_name;
12046 
12047   /* Create the fields.  */
12048   f_stack = build_decl (BUILTINS_LOCATION,
12049 			FIELD_DECL, get_identifier ("__stack"),
12050 			ptr_type_node);
12051   f_grtop = build_decl (BUILTINS_LOCATION,
12052 			FIELD_DECL, get_identifier ("__gr_top"),
12053 			ptr_type_node);
12054   f_vrtop = build_decl (BUILTINS_LOCATION,
12055 			FIELD_DECL, get_identifier ("__vr_top"),
12056 			ptr_type_node);
12057   f_groff = build_decl (BUILTINS_LOCATION,
12058 			FIELD_DECL, get_identifier ("__gr_offs"),
12059 			integer_type_node);
12060   f_vroff = build_decl (BUILTINS_LOCATION,
12061 			FIELD_DECL, get_identifier ("__vr_offs"),
12062 			integer_type_node);
12063 
12064   /* Tell tree-stdarg pass about our internal offset fields.
12065      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12066      purposes, to identify whether the code is updating the va_list internal
12067      offset fields in an irregular way.  */
12068   va_list_gpr_counter_field = f_groff;
12069   va_list_fpr_counter_field = f_vroff;
12070 
12071   DECL_ARTIFICIAL (f_stack) = 1;
12072   DECL_ARTIFICIAL (f_grtop) = 1;
12073   DECL_ARTIFICIAL (f_vrtop) = 1;
12074   DECL_ARTIFICIAL (f_groff) = 1;
12075   DECL_ARTIFICIAL (f_vroff) = 1;
12076 
12077   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12078   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12079   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12080   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12081   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12082 
12083   TYPE_FIELDS (va_list_type) = f_stack;
12084   DECL_CHAIN (f_stack) = f_grtop;
12085   DECL_CHAIN (f_grtop) = f_vrtop;
12086   DECL_CHAIN (f_vrtop) = f_groff;
12087   DECL_CHAIN (f_groff) = f_vroff;
12088 
12089   /* Compute its layout.  */
12090   layout_type (va_list_type);
12091 
12092   return va_list_type;
12093 }
12094 
12095 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
12096 static void
12097 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12098 {
12099   const CUMULATIVE_ARGS *cum;
12100   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12101   tree stack, grtop, vrtop, groff, vroff;
12102   tree t;
12103   int gr_save_area_size = cfun->va_list_gpr_size;
12104   int vr_save_area_size = cfun->va_list_fpr_size;
12105   int vr_offset;
12106 
12107   cum = &crtl->args.info;
12108   if (cfun->va_list_gpr_size)
12109     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12110 			     cfun->va_list_gpr_size);
12111   if (cfun->va_list_fpr_size)
12112     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12113 			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
12114 
12115   if (!TARGET_FLOAT)
12116     {
12117       gcc_assert (cum->aapcs_nvrn == 0);
12118       vr_save_area_size = 0;
12119     }
12120 
12121   f_stack = TYPE_FIELDS (va_list_type_node);
12122   f_grtop = DECL_CHAIN (f_stack);
12123   f_vrtop = DECL_CHAIN (f_grtop);
12124   f_groff = DECL_CHAIN (f_vrtop);
12125   f_vroff = DECL_CHAIN (f_groff);
12126 
12127   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12128 		  NULL_TREE);
12129   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12130 		  NULL_TREE);
12131   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12132 		  NULL_TREE);
12133   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12134 		  NULL_TREE);
12135   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12136 		  NULL_TREE);
12137 
12138   /* Emit code to initialize STACK, which points to the next varargs stack
12139      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
12140      by named arguments.  STACK is 8-byte aligned.  */
12141   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12142   if (cum->aapcs_stack_size > 0)
12143     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12144   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12145   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12146 
12147   /* Emit code to initialize GRTOP, the top of the GR save area.
12148      virtual_incoming_args_rtx should have been 16 byte aligned.  */
12149   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12150   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12151   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12152 
12153   /* Emit code to initialize VRTOP, the top of the VR save area.
12154      This address is gr_save_area_bytes below GRTOP, rounded
12155      down to the next 16-byte boundary.  */
12156   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12157   vr_offset = ROUND_UP (gr_save_area_size,
12158 			STACK_BOUNDARY / BITS_PER_UNIT);
12159 
12160   if (vr_offset)
12161     t = fold_build_pointer_plus_hwi (t, -vr_offset);
12162   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12163   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12164 
12165   /* Emit code to initialize GROFF, the offset from GRTOP of the
12166      next GPR argument.  */
12167   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12168 	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12169   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12170 
12171   /* Likewise emit code to initialize VROFF, the offset from VRTOP
12172      of the next VR argument.  */
12173   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12174 	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12175   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12176 }
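
/* A sketch of the resulting va_list for  int f (int x, ...)  assuming the
   full save areas are live (the tree-stdarg pass may shrink them): one
   general register is taken by the named argument, so the GR save area
   covers the remaining 7 registers and __gr_offs is set to -7 * 8 = -56,
   while the VR save area covers all 8 vector registers and __vr_offs is
   set to -8 * 16 = -128.  __gr_top and __vr_top point just past their
   respective save areas and __stack points at the first anonymous argument
   passed on the stack.  */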
12177 
12178 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
12179 
12180 static tree
12181 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12182 			      gimple_seq *post_p ATTRIBUTE_UNUSED)
12183 {
12184   tree addr;
12185   bool indirect_p;
12186   bool is_ha;		/* is HFA or HVA.  */
12187   bool dw_align;	/* double-word align.  */
12188   machine_mode ag_mode = VOIDmode;
12189   int nregs;
12190   machine_mode mode;
12191 
12192   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12193   tree stack, f_top, f_off, off, arg, roundup, on_stack;
12194   HOST_WIDE_INT size, rsize, adjust, align;
12195   tree t, u, cond1, cond2;
12196 
12197   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12198   if (indirect_p)
12199     type = build_pointer_type (type);
12200 
12201   mode = TYPE_MODE (type);
12202 
12203   f_stack = TYPE_FIELDS (va_list_type_node);
12204   f_grtop = DECL_CHAIN (f_stack);
12205   f_vrtop = DECL_CHAIN (f_grtop);
12206   f_groff = DECL_CHAIN (f_vrtop);
12207   f_vroff = DECL_CHAIN (f_groff);
12208 
12209   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12210 		  f_stack, NULL_TREE);
12211   size = int_size_in_bytes (type);
12212   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12213 
12214   dw_align = false;
12215   adjust = 0;
12216   if (aarch64_vfp_is_call_or_return_candidate (mode,
12217 					       type,
12218 					       &ag_mode,
12219 					       &nregs,
12220 					       &is_ha))
12221     {
12222       /* No frontends can create types with variable-sized modes, so we
12223 	 shouldn't be asked to pass or return them.  */
12224       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12225 
12226       /* TYPE passed in fp/simd registers.  */
12227       if (!TARGET_FLOAT)
12228 	aarch64_err_no_fpadvsimd (mode, "varargs");
12229 
12230       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12231 		      unshare_expr (valist), f_vrtop, NULL_TREE);
12232       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12233 		      unshare_expr (valist), f_vroff, NULL_TREE);
12234 
12235       rsize = nregs * UNITS_PER_VREG;
12236 
12237       if (is_ha)
12238 	{
12239 	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12240 	    adjust = UNITS_PER_VREG - ag_size;
12241 	}
12242       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12243 	       && size < UNITS_PER_VREG)
12244 	{
12245 	  adjust = UNITS_PER_VREG - size;
12246 	}
12247     }
12248   else
12249     {
12250       /* TYPE passed in general registers.  */
12251       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12252 		      unshare_expr (valist), f_grtop, NULL_TREE);
12253       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12254 		      unshare_expr (valist), f_groff, NULL_TREE);
12255       rsize = ROUND_UP (size, UNITS_PER_WORD);
12256       nregs = rsize / UNITS_PER_WORD;
12257 
12258       if (align > 8)
12259 	dw_align = true;
12260 
12261       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12262 	  && size < UNITS_PER_WORD)
12263 	{
12264 	  adjust = UNITS_PER_WORD  - size;
12265 	}
12266     }
12267 
12268   /* Get a local temporary for the field value.  */
12269   off = get_initialized_tmp_var (f_off, pre_p, NULL);
12270 
12271   /* Emit code to branch if off >= 0.  */
12272   t = build2 (GE_EXPR, boolean_type_node, off,
12273 	      build_int_cst (TREE_TYPE (off), 0));
12274   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12275 
12276   if (dw_align)
12277     {
12278       /* Emit: offs = (offs + 15) & -16.  */
12279       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12280 		  build_int_cst (TREE_TYPE (off), 15));
12281       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12282 		  build_int_cst (TREE_TYPE (off), -16));
12283       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12284     }
12285   else
12286     roundup = NULL;
12287 
12288   /* Update ap.__[g|v]r_offs  */
12289   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12290 	      build_int_cst (TREE_TYPE (off), rsize));
12291   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12292 
12293   /* String up.  */
12294   if (roundup)
12295     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12296 
12297   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
12298   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12299 	      build_int_cst (TREE_TYPE (f_off), 0));
12300   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12301 
12302   /* String up: make sure the assignment happens before the use.  */
12303   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12304   COND_EXPR_ELSE (cond1) = t;
12305 
12306   /* Prepare the trees handling the argument that is passed on the stack;
12307      the top-level node is stored in ON_STACK.  */
12308   arg = get_initialized_tmp_var (stack, pre_p, NULL);
12309   if (align > 8)
12310     {
12311       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
12312       t = fold_convert (intDI_type_node, arg);
12313       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12314 		  build_int_cst (TREE_TYPE (t), 15));
12315       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12316 		  build_int_cst (TREE_TYPE (t), -16));
12317       t = fold_convert (TREE_TYPE (arg), t);
12318       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12319     }
12320   else
12321     roundup = NULL;
12322   /* Advance ap.__stack  */
12323   t = fold_convert (intDI_type_node, arg);
12324   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12325 	      build_int_cst (TREE_TYPE (t), size + 7));
12326   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12327 	      build_int_cst (TREE_TYPE (t), -8));
12328   t = fold_convert (TREE_TYPE (arg), t);
12329   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12330   /* String up roundup and advance.  */
12331   if (roundup)
12332     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12333   /* String up with arg */
12334   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12335   /* Big-endianness related address adjustment.  */
12336   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12337       && size < UNITS_PER_WORD)
12338   {
12339     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12340 		size_int (UNITS_PER_WORD - size));
12341     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12342   }
12343 
12344   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12345   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12346 
12347   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
12348   t = off;
12349   if (adjust)
12350     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12351 		build_int_cst (TREE_TYPE (off), adjust));
12352 
12353   t = fold_convert (sizetype, t);
12354   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12355 
12356   if (is_ha)
12357     {
12358       /* type ha; // treat as "struct {ftype field[n];}"
12359          ... [computing offs]
12360          for (i = 0; i <nregs; ++i, offs += 16)
12361 	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12362 	 return ha;  */
12363       int i;
12364       tree tmp_ha, field_t, field_ptr_t;
12365 
12366       /* Declare a local variable.  */
12367       tmp_ha = create_tmp_var_raw (type, "ha");
12368       gimple_add_tmp_var (tmp_ha);
12369 
12370       /* Establish the base type.  */
12371       switch (ag_mode)
12372 	{
12373 	case E_SFmode:
12374 	  field_t = float_type_node;
12375 	  field_ptr_t = float_ptr_type_node;
12376 	  break;
12377 	case E_DFmode:
12378 	  field_t = double_type_node;
12379 	  field_ptr_t = double_ptr_type_node;
12380 	  break;
12381 	case E_TFmode:
12382 	  field_t = long_double_type_node;
12383 	  field_ptr_t = long_double_ptr_type_node;
12384 	  break;
12385 	case E_HFmode:
12386 	  field_t = aarch64_fp16_type_node;
12387 	  field_ptr_t = aarch64_fp16_ptr_type_node;
12388 	  break;
12389 	case E_V2SImode:
12390 	case E_V4SImode:
12391 	    {
12392 	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12393 	      field_t = build_vector_type_for_mode (innertype, ag_mode);
12394 	      field_ptr_t = build_pointer_type (field_t);
12395 	    }
12396 	  break;
12397 	default:
12398 	  gcc_assert (0);
12399 	}
12400 
12401       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
12402       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12403       addr = t;
12404       t = fold_convert (field_ptr_t, addr);
12405       t = build2 (MODIFY_EXPR, field_t,
12406 		  build1 (INDIRECT_REF, field_t, tmp_ha),
12407 		  build1 (INDIRECT_REF, field_t, t));
12408 
12409       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
12410       for (i = 1; i < nregs; ++i)
12411 	{
12412 	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12413 	  u = fold_convert (field_ptr_t, addr);
12414 	  u = build2 (MODIFY_EXPR, field_t,
12415 		      build2 (MEM_REF, field_t, tmp_ha,
12416 			      build_int_cst (field_ptr_t,
12417 					     (i *
12418 					      int_size_in_bytes (field_t)))),
12419 		      build1 (INDIRECT_REF, field_t, u));
12420 	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12421 	}
12422 
12423       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12424       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12425     }
12426 
12427   COND_EXPR_ELSE (cond2) = t;
12428   addr = fold_convert (build_pointer_type (type), cond1);
12429   addr = build_va_arg_indirect_ref (addr);
12430 
12431   if (indirect_p)
12432     addr = build_va_arg_indirect_ref (addr);
12433 
12434   return addr;
12435 }
12436 
12437 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
12438 
12439 static void
12440 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12441 				tree type, int *pretend_size ATTRIBUTE_UNUSED,
12442 				int no_rtl)
12443 {
12444   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12445   CUMULATIVE_ARGS local_cum;
12446   int gr_saved = cfun->va_list_gpr_size;
12447   int vr_saved = cfun->va_list_fpr_size;
12448 
12449   /* The caller has advanced CUM up to, but not beyond, the last named
12450      argument.  Advance a local copy of CUM past the last "real" named
12451      argument, to find out how many registers are left over.  */
12452   local_cum = *cum;
12453   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12454 
12455   /* Find out how many registers we need to save.
12456      Honor tree-stdarg analysis results.  */
12457   if (cfun->va_list_gpr_size)
12458     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12459 		    cfun->va_list_gpr_size / UNITS_PER_WORD);
12460   if (cfun->va_list_fpr_size)
12461     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12462 		    cfun->va_list_fpr_size / UNITS_PER_VREG);
12463 
12464   if (!TARGET_FLOAT)
12465     {
12466       gcc_assert (local_cum.aapcs_nvrn == 0);
12467       vr_saved = 0;
12468     }
12469 
12470   if (!no_rtl)
12471     {
12472       if (gr_saved > 0)
12473 	{
12474 	  rtx ptr, mem;
12475 
12476 	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
12477 	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12478 			       - gr_saved * UNITS_PER_WORD);
12479 	  mem = gen_frame_mem (BLKmode, ptr);
12480 	  set_mem_alias_set (mem, get_varargs_alias_set ());
12481 
12482 	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12483 			       mem, gr_saved);
12484 	}
12485       if (vr_saved > 0)
12486 	{
12487 	  /* We can't use move_block_from_reg, because it will use
12488 	     the wrong mode, storing D regs only.  */
12489 	  machine_mode mode = TImode;
12490 	  int off, i, vr_start;
12491 
12492 	  /* Set OFF to the offset from virtual_incoming_args_rtx of
12493 	     the first vector register.  The VR save area lies below
12494 	     the GR one, and is aligned to 16 bytes.  */
12495 	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12496 			   STACK_BOUNDARY / BITS_PER_UNIT);
12497 	  off -= vr_saved * UNITS_PER_VREG;
12498 
12499 	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12500 	  for (i = 0; i < vr_saved; ++i)
12501 	    {
12502 	      rtx ptr, mem;
12503 
12504 	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12505 	      mem = gen_frame_mem (mode, ptr);
12506 	      set_mem_alias_set (mem, get_varargs_alias_set ());
12507 	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12508 	      off += UNITS_PER_VREG;
12509 	    }
12510 	}
12511     }
12512 
12513   /* We don't save the size into *PRETEND_SIZE because we want to avoid
12514      any complication of having crtl->args.pretend_args_size changed.  */
12515   cfun->machine->frame.saved_varargs_size
12516     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12517 		 STACK_BOUNDARY / BITS_PER_UNIT)
12518        + vr_saved * UNITS_PER_VREG);
12519 }
12520 
12521 static void
12522 aarch64_conditional_register_usage (void)
12523 {
12524   int i;
12525   if (!TARGET_FLOAT)
12526     {
12527       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12528 	{
12529 	  fixed_regs[i] = 1;
12530 	  call_used_regs[i] = 1;
12531 	}
12532     }
12533   if (!TARGET_SVE)
12534     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12535       {
12536 	fixed_regs[i] = 1;
12537 	call_used_regs[i] = 1;
12538       }
12539 }
12540 
12541 /* Walk down the type tree of TYPE counting consecutive base elements.
12542    If *MODEP is VOIDmode, then set it to the first valid floating point
12543    type.  If a non-floating point type is found, or if a floating point
12544    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12545    otherwise return the count in the sub-tree.  */
12546 static int
12547 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12548 {
12549   machine_mode mode;
12550   HOST_WIDE_INT size;
12551 
12552   switch (TREE_CODE (type))
12553     {
12554     case REAL_TYPE:
12555       mode = TYPE_MODE (type);
12556       if (mode != DFmode && mode != SFmode
12557 	  && mode != TFmode && mode != HFmode)
12558 	return -1;
12559 
12560       if (*modep == VOIDmode)
12561 	*modep = mode;
12562 
12563       if (*modep == mode)
12564 	return 1;
12565 
12566       break;
12567 
12568     case COMPLEX_TYPE:
12569       mode = TYPE_MODE (TREE_TYPE (type));
12570       if (mode != DFmode && mode != SFmode
12571 	  && mode != TFmode && mode != HFmode)
12572 	return -1;
12573 
12574       if (*modep == VOIDmode)
12575 	*modep = mode;
12576 
12577       if (*modep == mode)
12578 	return 2;
12579 
12580       break;
12581 
12582     case VECTOR_TYPE:
12583       /* Use V2SImode and V4SImode as representatives of all 64-bit
12584 	 and 128-bit vector types.  */
12585       size = int_size_in_bytes (type);
12586       switch (size)
12587 	{
12588 	case 8:
12589 	  mode = V2SImode;
12590 	  break;
12591 	case 16:
12592 	  mode = V4SImode;
12593 	  break;
12594 	default:
12595 	  return -1;
12596 	}
12597 
12598       if (*modep == VOIDmode)
12599 	*modep = mode;
12600 
12601       /* Vector modes are considered to be opaque: two vectors are
12602 	 equivalent for the purposes of being homogeneous aggregates
12603 	 if they are the same size.  */
12604       if (*modep == mode)
12605 	return 1;
12606 
12607       break;
12608 
12609     case ARRAY_TYPE:
12610       {
12611 	int count;
12612 	tree index = TYPE_DOMAIN (type);
12613 
12614 	/* Can't handle incomplete types nor sizes that are not
12615 	   fixed.  */
12616 	if (!COMPLETE_TYPE_P (type)
12617 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12618 	  return -1;
12619 
12620 	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12621 	if (count == -1
12622 	    || !index
12623 	    || !TYPE_MAX_VALUE (index)
12624 	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12625 	    || !TYPE_MIN_VALUE (index)
12626 	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12627 	    || count < 0)
12628 	  return -1;
12629 
12630 	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12631 		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12632 
12633 	/* There must be no padding.  */
12634 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12635 		      count * GET_MODE_BITSIZE (*modep)))
12636 	  return -1;
12637 
12638 	return count;
12639       }
12640 
12641     case RECORD_TYPE:
12642       {
12643 	int count = 0;
12644 	int sub_count;
12645 	tree field;
12646 
12647 	/* Can't handle incomplete types nor sizes that are not
12648 	   fixed.  */
12649 	if (!COMPLETE_TYPE_P (type)
12650 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12651 	  return -1;
12652 
12653 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12654 	  {
12655 	    if (TREE_CODE (field) != FIELD_DECL)
12656 	      continue;
12657 
12658 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12659 	    if (sub_count < 0)
12660 	      return -1;
12661 	    count += sub_count;
12662 	  }
12663 
12664 	/* There must be no padding.  */
12665 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12666 		      count * GET_MODE_BITSIZE (*modep)))
12667 	  return -1;
12668 
12669 	return count;
12670       }
12671 
12672     case UNION_TYPE:
12673     case QUAL_UNION_TYPE:
12674       {
12675 	/* These aren't very interesting except in a degenerate case.  */
12676 	int count = 0;
12677 	int sub_count;
12678 	tree field;
12679 
12680 	/* Can't handle incomplete types nor sizes that are not
12681 	   fixed.  */
12682 	if (!COMPLETE_TYPE_P (type)
12683 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12684 	  return -1;
12685 
12686 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12687 	  {
12688 	    if (TREE_CODE (field) != FIELD_DECL)
12689 	      continue;
12690 
12691 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12692 	    if (sub_count < 0)
12693 	      return -1;
12694 	    count = count > sub_count ? count : sub_count;
12695 	  }
12696 
12697 	/* There must be no padding.  */
12698 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12699 		      count * GET_MODE_BITSIZE (*modep)))
12700 	  return -1;
12701 
12702 	return count;
12703       }
12704 
12705     default:
12706       break;
12707     }
12708 
12709   return -1;
12710 }
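
/* Two illustrative inputs for the walk above: for
     struct { double d; double v[3]; }
   it returns 4 with *MODEP set to DFmode, which is within HA_MAX_NUM_FLDS
   and therefore a homogeneous floating-point aggregate; for
     struct { float f; double d; }
   the second field's DFmode disagrees with the recorded SFmode, so the
   result is -1 and the type is not treated as homogeneous.  */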
12711 
12712 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12713    type as described in AAPCS64 \S 4.1.2.
12714 
12715    See the comment above aarch64_composite_type_p for the notes on MODE.  */
12716 
12717 static bool
12718 aarch64_short_vector_p (const_tree type,
12719 			machine_mode mode)
12720 {
12721   poly_int64 size = -1;
12722 
12723   if (type && TREE_CODE (type) == VECTOR_TYPE)
12724     size = int_size_in_bytes (type);
12725   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12726 	    || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12727     size = GET_MODE_SIZE (mode);
12728 
12729   return known_eq (size, 8) || known_eq (size, 16);
12730 }
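
/* For example, a 16-byte Advanced SIMD type such as int32x4_t counts as a
   short vector here, while a 32-byte GNU vector type does not, since its
   size is neither 8 nor 16 bytes.  */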
12731 
12732 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12733    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
12734    array types.  The C99 floating-point complex types are also considered
12735    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
12736    types, which are GCC extensions and out of the scope of AAPCS64, are
12737    treated as composite types here as well.
12738 
12739    Note that MODE itself is not sufficient in determining whether a type
12740    is such a composite type or not.  This is because
12741    stor-layout.c:compute_record_mode may have already changed the MODE
12742    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
12743    structure with only one field may have its MODE set to the mode of the
12744    field.  Also an integer mode whose size matches the size of the
12745    RECORD_TYPE type may be used to substitute the original mode
12746    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
12747    solely relied on.  */
12748 
12749 static bool
12750 aarch64_composite_type_p (const_tree type,
12751 			  machine_mode mode)
12752 {
12753   if (aarch64_short_vector_p (type, mode))
12754     return false;
12755 
12756   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12757     return true;
12758 
12759   if (mode == BLKmode
12760       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12761       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12762     return true;
12763 
12764   return false;
12765 }
12766 
12767 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12768    shall be passed or returned in simd/fp register(s) (providing these
12769    parameter passing registers are available).
12770 
12771    Upon successful return, *COUNT returns the number of needed registers,
12772    *BASE_MODE returns the mode of the individual register and when IS_HA
12773    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12774    floating-point aggregate or a homogeneous short-vector aggregate.  */
12775 
12776 static bool
12777 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12778 					 const_tree type,
12779 					 machine_mode *base_mode,
12780 					 int *count,
12781 					 bool *is_ha)
12782 {
12783   machine_mode new_mode = VOIDmode;
12784   bool composite_p = aarch64_composite_type_p (type, mode);
12785 
12786   if (is_ha != NULL) *is_ha = false;
12787 
12788   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12789       || aarch64_short_vector_p (type, mode))
12790     {
12791       *count = 1;
12792       new_mode = mode;
12793     }
12794   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12795     {
12796       if (is_ha != NULL) *is_ha = true;
12797       *count = 2;
12798       new_mode = GET_MODE_INNER (mode);
12799     }
12800   else if (type && composite_p)
12801     {
12802       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12803 
12804       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12805 	{
12806 	  if (is_ha != NULL) *is_ha = true;
12807 	  *count = ag_count;
12808 	}
12809       else
12810 	return false;
12811     }
12812   else
12813     return false;
12814 
12815   *base_mode = new_mode;
12816   return true;
12817 }
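
/* Worked example (added for illustration, not from the original sources),
   assuming the usual AAPCS64 homogeneous-aggregate rules:

     struct rgb { float r; float g; float b; };

   aapcs_vfp_sub_candidate returns 3 with *MODEP set to SFmode, which is
   within HA_MAX_NUM_FLDS, so *COUNT = 3 and *IS_HA = true, making the
   argument eligible for three consecutive SIMD/FP registers when enough
   of them remain.  A fifth float member would exceed HA_MAX_NUM_FLDS, and
   mixing element types makes aapcs_vfp_sub_candidate return -1; both
   cases fall back to the normal argument-passing rules.  */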
12818 
12819 /* Implement TARGET_STRUCT_VALUE_RTX.  */
12820 
12821 static rtx
12822 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12823 			  int incoming ATTRIBUTE_UNUSED)
12824 {
12825   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12826 }
12827 
12828 /* Implements target hook vector_mode_supported_p.  */
12829 static bool
12830 aarch64_vector_mode_supported_p (machine_mode mode)
12831 {
12832   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12833   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12834 }
12835 
12836 /* Return the appropriate SIMD container
12837    for MODE within a vector of WIDTH bits.  */
12838 static machine_mode
12839 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12840 {
12841   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12842     switch (mode)
12843       {
12844       case E_DFmode:
12845 	return VNx2DFmode;
12846       case E_SFmode:
12847 	return VNx4SFmode;
12848       case E_HFmode:
12849 	return VNx8HFmode;
12850       case E_DImode:
12851 	return VNx2DImode;
12852       case E_SImode:
12853 	return VNx4SImode;
12854       case E_HImode:
12855 	return VNx8HImode;
12856       case E_QImode:
12857 	return VNx16QImode;
12858       default:
12859 	return word_mode;
12860       }
12861 
12862   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12863   if (TARGET_SIMD)
12864     {
12865       if (known_eq (width, 128))
12866 	switch (mode)
12867 	  {
12868 	  case E_DFmode:
12869 	    return V2DFmode;
12870 	  case E_SFmode:
12871 	    return V4SFmode;
12872 	  case E_HFmode:
12873 	    return V8HFmode;
12874 	  case E_SImode:
12875 	    return V4SImode;
12876 	  case E_HImode:
12877 	    return V8HImode;
12878 	  case E_QImode:
12879 	    return V16QImode;
12880 	  case E_DImode:
12881 	    return V2DImode;
12882 	  default:
12883 	    break;
12884 	  }
12885       else
12886 	switch (mode)
12887 	  {
12888 	  case E_SFmode:
12889 	    return V2SFmode;
12890 	  case E_HFmode:
12891 	    return V4HFmode;
12892 	  case E_SImode:
12893 	    return V2SImode;
12894 	  case E_HImode:
12895 	    return V4HImode;
12896 	  case E_QImode:
12897 	    return V8QImode;
12898 	  default:
12899 	    break;
12900 	  }
12901     }
12902   return word_mode;
12903 }
12904 
12905 /* Return the preferred SIMD container mode for MODE.  */
12906 static machine_mode
12907 aarch64_preferred_simd_mode (scalar_mode mode)
12908 {
12909   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12910   return aarch64_simd_container_mode (mode, bits);
12911 }
12912 
12913 /* Return a list of possible vector sizes for the vectorizer
12914    to iterate over.  */
12915 static void
12916 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12917 {
12918   if (TARGET_SVE)
12919     sizes->safe_push (BYTES_PER_SVE_VECTOR);
12920   sizes->safe_push (16);
12921   sizes->safe_push (8);
12922 }
12923 
12924 /* Implement TARGET_MANGLE_TYPE.  */
12925 
12926 static const char *
12927 aarch64_mangle_type (const_tree type)
12928 {
12929   /* The AArch64 ABI documents say that "__va_list" has to be
12930      mangled as if it is in the "std" namespace.  */
12931   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12932     return "St9__va_list";
12933 
12934   /* Half-precision float.  */
12935   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12936     return "Dh";
12937 
12938   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
12939      builtin types.  */
12940   if (TYPE_NAME (type) != NULL)
12941     return aarch64_mangle_builtin_type (type);
12942 
12943   /* Use the default mangling.  */
12944   return NULL;
12945 }
12946 
12947 /* Find the first rtx_insn before insn that will generate an assembly
12948    instruction.  */
12949 
12950 static rtx_insn *
12951 aarch64_prev_real_insn (rtx_insn *insn)
12952 {
12953   if (!insn)
12954     return NULL;
12955 
12956   do
12957     {
12958       insn = prev_real_insn (insn);
12959     }
12960   while (insn && recog_memoized (insn) < 0);
12961 
12962   return insn;
12963 }
12964 
12965 static bool
12966 is_madd_op (enum attr_type t1)
12967 {
12968   unsigned int i;
12969   /* A number of these may be AArch32 only.  */
12970   enum attr_type mlatypes[] = {
12971     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12972     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12973     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12974   };
12975 
12976   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12977     {
12978       if (t1 == mlatypes[i])
12979 	return true;
12980     }
12981 
12982   return false;
12983 }
12984 
12985 /* Check if there is a register dependency between a load and the insn
12986    for which we hold recog_data.  */
12987 
12988 static bool
12989 dep_between_memop_and_curr (rtx memop)
12990 {
12991   rtx load_reg;
12992   int opno;
12993 
12994   gcc_assert (GET_CODE (memop) == SET);
12995 
12996   if (!REG_P (SET_DEST (memop)))
12997     return false;
12998 
12999   load_reg = SET_DEST (memop);
13000   for (opno = 1; opno < recog_data.n_operands; opno++)
13001     {
13002       rtx operand = recog_data.operand[opno];
13003       if (REG_P (operand)
13004           && reg_overlap_mentioned_p (load_reg, operand))
13005         return true;
13006 
13007     }
13008   return false;
13009 }
13010 
13011 
13012 /* When working around the Cortex-A53 erratum 835769,
13013    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13014    instruction and has a preceding memory instruction such that a NOP
13015    should be inserted between them.  */
13016 
13017 bool
13018 aarch64_madd_needs_nop (rtx_insn* insn)
13019 {
13020   enum attr_type attr_type;
13021   rtx_insn *prev;
13022   rtx body;
13023 
13024   if (!TARGET_FIX_ERR_A53_835769)
13025     return false;
13026 
13027   if (!INSN_P (insn) || recog_memoized (insn) < 0)
13028     return false;
13029 
13030   attr_type = get_attr_type (insn);
13031   if (!is_madd_op (attr_type))
13032     return false;
13033 
13034   prev = aarch64_prev_real_insn (insn);
13035   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13036      Restore recog state to INSN to avoid state corruption.  */
13037   extract_constrain_insn_cached (insn);
13038 
13039   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13040     return false;
13041 
13042   body = single_set (prev);
13043 
13044   /* If the previous insn is a memory op and there is no dependency between
13045      it and the DImode madd, emit a NOP between them.  If body is NULL then we
13046      have a complex memory operation, probably a load/store pair.
13047      Be conservative for now and emit a NOP.  */
13048   if (GET_MODE (recog_data.operand[0]) == DImode
13049       && (!body || !dep_between_memop_and_curr (body)))
13050     return true;
13051 
13052   return false;
13053 
13054 }
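
/* Illustrative example (added here, not from the original sources): with
   -mfix-cortex-a53-835769, a sequence such as

	ldr	x1, [x2]
	madd	x0, x3, x4, x5

   has no register dependency between the load and the 64-bit
   multiply-accumulate, so the check above returns true and
   FINAL_PRESCAN_INSN emits a NOP between the two instructions.  If the
   load instead wrote one of the madd's input registers,
   dep_between_memop_and_curr would find the dependency and no NOP would
   be emitted.  */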
13055 
13056 
13057 /* Implement FINAL_PRESCAN_INSN.  */
13058 
13059 void
13060 aarch64_final_prescan_insn (rtx_insn *insn)
13061 {
13062   if (aarch64_madd_needs_nop (insn))
13063     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13064 }
13065 
13066 
13067 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13068    instruction.  */
13069 
13070 bool
13071 aarch64_sve_index_immediate_p (rtx base_or_step)
13072 {
13073   return (CONST_INT_P (base_or_step)
13074 	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
13075 }
13076 
13077 /* Return true if X is a valid immediate for the SVE ADD and SUB
13078    instructions.  Negate X first if NEGATE_P is true.  */
13079 
13080 bool
13081 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13082 {
13083   rtx elt;
13084 
13085   if (!const_vec_duplicate_p (x, &elt)
13086       || !CONST_INT_P (elt))
13087     return false;
13088 
13089   HOST_WIDE_INT val = INTVAL (elt);
13090   if (negate_p)
13091     val = -val;
13092   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13093 
13094   if (val & 0xff)
13095     return IN_RANGE (val, 0, 0xff);
13096   return IN_RANGE (val, 0, 0xff00);
13097 }
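
/* Worked example (added for illustration, not from the original sources):
   for a duplicated HImode element, 1, 0xff and 0x4500 are accepted
   (0x4500 via the shifted "#0x45, LSL #8" form), whereas 0x101 is
   rejected because its low byte is nonzero and the value does not fit in
   eight bits.  */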
13098 
13099 /* Return true if X is a valid immediate operand for an SVE logical
13100    instruction such as AND.  */
13101 
13102 bool
13103 aarch64_sve_bitmask_immediate_p (rtx x)
13104 {
13105   rtx elt;
13106 
13107   return (const_vec_duplicate_p (x, &elt)
13108 	  && CONST_INT_P (elt)
13109 	  && aarch64_bitmask_imm (INTVAL (elt),
13110 				  GET_MODE_INNER (GET_MODE (x))));
13111 }
13112 
13113 /* Return true if X is a valid immediate for the SVE DUP and CPY
13114    instructions.  */
13115 
13116 bool
13117 aarch64_sve_dup_immediate_p (rtx x)
13118 {
13119   rtx elt;
13120 
13121   if (!const_vec_duplicate_p (x, &elt)
13122       || !CONST_INT_P (elt))
13123     return false;
13124 
13125   HOST_WIDE_INT val = INTVAL (elt);
13126   if (val & 0xff)
13127     return IN_RANGE (val, -0x80, 0x7f);
13128   return IN_RANGE (val, -0x8000, 0x7f00);
13129 }
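
/* Illustrative example (added here, not from the original sources): the
   ranges above correspond to an 8-bit signed immediate with an optional
   LSL #8, so for an HImode element -128, 127 and 0x7f00 are accepted
   while 0x7f80 is rejected (nonzero low byte outside -0x80..0x7f).  */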
13130 
13131 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13132    SIGNED_P says whether the operand is signed rather than unsigned.  */
13133 
13134 bool
13135 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13136 {
13137   rtx elt;
13138 
13139   return (const_vec_duplicate_p (x, &elt)
13140 	  && CONST_INT_P (elt)
13141 	  && (signed_p
13142 	      ? IN_RANGE (INTVAL (elt), -16, 15)
13143 	      : IN_RANGE (INTVAL (elt), 0, 127)));
13144 }
13145 
13146 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13147    instruction.  Negate X first if NEGATE_P is true.  */
13148 
13149 bool
13150 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13151 {
13152   rtx elt;
13153   REAL_VALUE_TYPE r;
13154 
13155   if (!const_vec_duplicate_p (x, &elt)
13156       || GET_CODE (elt) != CONST_DOUBLE)
13157     return false;
13158 
13159   r = *CONST_DOUBLE_REAL_VALUE (elt);
13160 
13161   if (negate_p)
13162     r = real_value_negate (&r);
13163 
13164   if (real_equal (&r, &dconst1))
13165     return true;
13166   if (real_equal (&r, &dconsthalf))
13167     return true;
13168   return false;
13169 }
13170 
13171 /* Return true if X is a valid immediate operand for an SVE FMUL
13172    instruction.  */
13173 
13174 bool
13175 aarch64_sve_float_mul_immediate_p (rtx x)
13176 {
13177   rtx elt;
13178 
13179   /* GCC will never generate a multiply with an immediate of 2, so there is no
13180      point testing for it (even though it is a valid constant).  */
13181   return (const_vec_duplicate_p (x, &elt)
13182 	  && GET_CODE (elt) == CONST_DOUBLE
13183 	  && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13184 }
13185 
13186 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13187    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
13188    is nonnull, use it to describe valid immediates.  */
13189 static bool
13190 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13191 				    simd_immediate_info *info,
13192 				    enum simd_immediate_check which,
13193 				    simd_immediate_info::insn_type insn)
13194 {
13195   /* Try a 4-byte immediate with LSL.  */
13196   for (unsigned int shift = 0; shift < 32; shift += 8)
13197     if ((val32 & (0xff << shift)) == val32)
13198       {
13199 	if (info)
13200 	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
13201 				       simd_immediate_info::LSL, shift);
13202 	return true;
13203       }
13204 
13205   /* Try a 2-byte immediate with LSL.  */
13206   unsigned int imm16 = val32 & 0xffff;
13207   if (imm16 == (val32 >> 16))
13208     for (unsigned int shift = 0; shift < 16; shift += 8)
13209       if ((imm16 & (0xff << shift)) == imm16)
13210 	{
13211 	  if (info)
13212 	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13213 					 simd_immediate_info::LSL, shift);
13214 	  return true;
13215 	}
13216 
13217   /* Try a 4-byte immediate with MSL, except for cases that MVN
13218      can handle.  */
13219   if (which == AARCH64_CHECK_MOV)
13220     for (unsigned int shift = 8; shift < 24; shift += 8)
13221       {
13222 	unsigned int low = (1 << shift) - 1;
13223 	if (((val32 & (0xff << shift)) | low) == val32)
13224 	  {
13225 	    if (info)
13226 	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
13227 					   simd_immediate_info::MSL, shift);
13228 	    return true;
13229 	  }
13230       }
13231 
13232   return false;
13233 }
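
/* Worked examples (added for illustration, not from the original sources)
   for the MOV (MOVI) form of the checks above:

     val32 = 0x00ab0000  ->  SImode immediate 0xab, LSL #16
     val32 = 0x0000abff  ->  SImode immediate 0xab, MSL #8
			     (the bits below the shift must all be ones)

   A value such as 0x00abcd00 matches none of the forms and is rejected.  */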
13234 
13235 /* Return true if replicating VAL64 is a valid immediate for the
13236    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
13237    use it to describe valid immediates.  */
13238 static bool
13239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13240 				 simd_immediate_info *info,
13241 				 enum simd_immediate_check which)
13242 {
13243   unsigned int val32 = val64 & 0xffffffff;
13244   unsigned int val16 = val64 & 0xffff;
13245   unsigned int val8 = val64 & 0xff;
13246 
13247   if (val32 == (val64 >> 32))
13248     {
13249       if ((which & AARCH64_CHECK_ORR) != 0
13250 	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13251 						 simd_immediate_info::MOV))
13252 	return true;
13253 
13254       if ((which & AARCH64_CHECK_BIC) != 0
13255 	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13256 						 simd_immediate_info::MVN))
13257 	return true;
13258 
13259       /* Try using a replicated byte.  */
13260       if (which == AARCH64_CHECK_MOV
13261 	  && val16 == (val32 >> 16)
13262 	  && val8 == (val16 >> 8))
13263 	{
13264 	  if (info)
13265 	    *info = simd_immediate_info (QImode, val8);
13266 	  return true;
13267 	}
13268     }
13269 
13270   /* Try using a bit-to-bytemask.  */
13271   if (which == AARCH64_CHECK_MOV)
13272     {
13273       unsigned int i;
13274       for (i = 0; i < 64; i += 8)
13275 	{
13276 	  unsigned char byte = (val64 >> i) & 0xff;
13277 	  if (byte != 0 && byte != 0xff)
13278 	    break;
13279 	}
13280       if (i == 64)
13281 	{
13282 	  if (info)
13283 	    *info = simd_immediate_info (DImode, val64);
13284 	  return true;
13285 	}
13286     }
13287   return false;
13288 }
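
/* Illustrative examples (added here, not from the original sources):
   0x4242424242424242 is accepted as the replicated byte 0x42, and for a
   MOV check 0xff00ffff0000ff00 is accepted through the bit-to-bytemask
   path because every byte of it is either 0x00 or 0xff.  */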
13289 
13290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13291    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
13292 
13293 static bool
13294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13295 			     simd_immediate_info *info)
13296 {
13297   scalar_int_mode mode = DImode;
13298   unsigned int val32 = val64 & 0xffffffff;
13299   if (val32 == (val64 >> 32))
13300     {
13301       mode = SImode;
13302       unsigned int val16 = val32 & 0xffff;
13303       if (val16 == (val32 >> 16))
13304 	{
13305 	  mode = HImode;
13306 	  unsigned int val8 = val16 & 0xff;
13307 	  if (val8 == (val16 >> 8))
13308 	    mode = QImode;
13309 	}
13310     }
13311   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13312   if (IN_RANGE (val, -0x80, 0x7f))
13313     {
13314       /* DUP with no shift.  */
13315       if (info)
13316 	*info = simd_immediate_info (mode, val);
13317       return true;
13318     }
13319   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13320     {
13321       /* DUP with LSL #8.  */
13322       if (info)
13323 	*info = simd_immediate_info (mode, val);
13324       return true;
13325     }
13326   if (aarch64_bitmask_imm (val64, mode))
13327     {
13328       /* DUPM.  */
13329       if (info)
13330 	*info = simd_immediate_info (mode, val);
13331       return true;
13332     }
13333   return false;
13334 }
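
/* Worked examples (added for illustration, not from the original sources):
   0x2a2a2a2a2a2a2a2a narrows to QImode and is a DUP of #42,
   0x0001000100010001 narrows to HImode and is a DUP of #1, while
   0x0123456789abcdef fits none of the DUP or DUPM forms and is
   rejected.  */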
13335 
13336 /* Return true if OP is a valid SIMD immediate for the operation
13337    described by WHICH.  If INFO is nonnull, use it to describe valid
13338    immediates.  */
13339 bool
13340 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13341 			      enum simd_immediate_check which)
13342 {
13343   machine_mode mode = GET_MODE (op);
13344   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13345   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13346     return false;
13347 
13348   scalar_mode elt_mode = GET_MODE_INNER (mode);
13349   rtx base, step;
13350   unsigned int n_elts;
13351   if (GET_CODE (op) == CONST_VECTOR
13352       && CONST_VECTOR_DUPLICATE_P (op))
13353     n_elts = CONST_VECTOR_NPATTERNS (op);
13354   else if ((vec_flags & VEC_SVE_DATA)
13355 	   && const_vec_series_p (op, &base, &step))
13356     {
13357       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13358       if (!aarch64_sve_index_immediate_p (base)
13359 	  || !aarch64_sve_index_immediate_p (step))
13360 	return false;
13361 
13362       if (info)
13363 	*info = simd_immediate_info (elt_mode, base, step);
13364       return true;
13365     }
13366   else if (GET_CODE (op) == CONST_VECTOR
13367 	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13368     /* N_ELTS set above.  */;
13369   else
13370     return false;
13371 
13372   /* Handle PFALSE and PTRUE.  */
13373   if (vec_flags & VEC_SVE_PRED)
13374     return (op == CONST0_RTX (mode)
13375 	    || op == CONSTM1_RTX (mode));
13376 
13377   scalar_float_mode elt_float_mode;
13378   if (n_elts == 1
13379       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13380     {
13381       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13382       if (aarch64_float_const_zero_rtx_p (elt)
13383 	  || aarch64_float_const_representable_p (elt))
13384 	{
13385 	  if (info)
13386 	    *info = simd_immediate_info (elt_float_mode, elt);
13387 	  return true;
13388 	}
13389     }
13390 
13391   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13392   if (elt_size > 8)
13393     return false;
13394 
13395   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13396 
13397   /* Expand the vector constant out into a byte vector, with the least
13398      significant byte of the register first.  */
13399   auto_vec<unsigned char, 16> bytes;
13400   bytes.reserve (n_elts * elt_size);
13401   for (unsigned int i = 0; i < n_elts; i++)
13402     {
13403       /* The vector is provided in gcc endian-neutral fashion.
13404 	 For aarch64_be Advanced SIMD, it must be laid out in the vector
13405 	 register in reverse order.  */
13406       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13407       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13408 
13409       if (elt_mode != elt_int_mode)
13410 	elt = gen_lowpart (elt_int_mode, elt);
13411 
13412       if (!CONST_INT_P (elt))
13413 	return false;
13414 
13415       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13416       for (unsigned int byte = 0; byte < elt_size; byte++)
13417 	{
13418 	  bytes.quick_push (elt_val & 0xff);
13419 	  elt_val >>= BITS_PER_UNIT;
13420 	}
13421     }
13422 
13423   /* The immediate must repeat every eight bytes.  */
13424   unsigned int nbytes = bytes.length ();
13425   for (unsigned i = 8; i < nbytes; ++i)
13426     if (bytes[i] != bytes[i - 8])
13427       return false;
13428 
13429   /* Get the repeating 8-byte value as an integer.  No endian correction
13430      is needed here because bytes is already in lsb-first order.  */
13431   unsigned HOST_WIDE_INT val64 = 0;
13432   for (unsigned int i = 0; i < 8; i++)
13433     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13434 	      << (i * BITS_PER_UNIT));
13435 
13436   if (vec_flags & VEC_SVE_DATA)
13437     return aarch64_sve_valid_immediate (val64, info);
13438   else
13439     return aarch64_advsimd_valid_immediate (val64, info, which);
13440 }
13441 
13442 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13443    has a step in the range of INDEX.  Return the index expression if so,
13444    otherwise return null.  */
13445 rtx
13446 aarch64_check_zero_based_sve_index_immediate (rtx x)
13447 {
13448   rtx base, step;
13449   if (const_vec_series_p (x, &base, &step)
13450       && base == const0_rtx
13451       && aarch64_sve_index_immediate_p (step))
13452     return step;
13453   return NULL_RTX;
13454 }
13455 
13456 /* Check whether immediate shift constants are within range.  */
13457 bool
13458 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13459 {
13460   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13461   if (left)
13462     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13463   else
13464     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13465 }
13466 
13467 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13468    operation of width WIDTH at bit position POS.  */
13469 
13470 rtx
13471 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13472 {
13473   gcc_assert (CONST_INT_P (width));
13474   gcc_assert (CONST_INT_P (pos));
13475 
13476   unsigned HOST_WIDE_INT mask
13477     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13478   return GEN_INT (mask << UINTVAL (pos));
13479 }
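
/* Worked example (added for illustration, not from the original sources):
   for WIDTH = 8 and POS = 16 the function returns
   ((1 << 8) - 1) << 16 = 0x00ff0000, i.e. the mask selecting bits
   16..23.  */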
13480 
13481 bool
13482 aarch64_mov_operand_p (rtx x, machine_mode mode)
13483 {
13484   if (GET_CODE (x) == HIGH
13485       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13486     return true;
13487 
13488   if (CONST_INT_P (x))
13489     return true;
13490 
13491   if (VECTOR_MODE_P (GET_MODE (x)))
13492     return aarch64_simd_valid_immediate (x, NULL);
13493 
13494   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13495     return true;
13496 
13497   if (aarch64_sve_cnt_immediate_p (x))
13498     return true;
13499 
13500   return aarch64_classify_symbolic_expression (x)
13501     == SYMBOL_TINY_ABSOLUTE;
13502 }
13503 
13504 /* Return a const_int vector of VAL.  */
13505 rtx
13506 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13507 {
13508   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13509   return gen_const_vec_duplicate (mode, c);
13510 }
13511 
13512 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
13513 
13514 bool
13515 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13516 {
13517   machine_mode vmode;
13518 
13519   vmode = aarch64_simd_container_mode (mode, 64);
13520   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13521   return aarch64_simd_valid_immediate (op_v, NULL);
13522 }
13523 
13524 /* Construct and return a PARALLEL RTX vector with elements numbering the
13525    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13526    the vector - from the perspective of the architecture.  This does not
13527    line up with GCC's perspective on lane numbers, so we end up with
13528    different masks depending on our target endian-ness.  The diagram
13529    below may help.  We must draw the distinction when building masks
13530    which select one half of the vector.  An instruction selecting
13531    architectural low-lanes for a big-endian target must be described using
13532    a mask selecting GCC high-lanes.
13533 
13534                  Big-Endian             Little-Endian
13535 
13536 GCC             0   1   2   3           3   2   1   0
13537               | x | x | x | x |       | x | x | x | x |
13538 Architecture    3   2   1   0           3   2   1   0
13539 
13540 Low Mask:         { 2, 3 }                { 0, 1 }
13541 High Mask:        { 0, 1 }                { 2, 3 }
13542 
13543    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
13544 
13545 rtx
13546 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13547 {
13548   rtvec v = rtvec_alloc (nunits / 2);
13549   int high_base = nunits / 2;
13550   int low_base = 0;
13551   int base;
13552   rtx t1;
13553   int i;
13554 
13555   if (BYTES_BIG_ENDIAN)
13556     base = high ? low_base : high_base;
13557   else
13558     base = high ? high_base : low_base;
13559 
13560   for (i = 0; i < nunits / 2; i++)
13561     RTVEC_ELT (v, i) = GEN_INT (base + i);
13562 
13563   t1 = gen_rtx_PARALLEL (mode, v);
13564   return t1;
13565 }
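
/* Worked example (added for illustration, not from the original sources),
   continuing the diagram above for V4SImode (NUNITS == 4): HIGH == true
   yields the PARALLEL [2, 3] on little-endian but [0, 1] on big-endian,
   because the architectural high half corresponds to GCC's low lane
   numbers there.  */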
13566 
13567 /* Check OP for validity as a PARALLEL RTX vector with elements
13568    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13569    from the perspective of the architecture.  See the diagram above
13570    aarch64_simd_vect_par_cnst_half for more details.  */
13571 
13572 bool
13573 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13574 				       bool high)
13575 {
13576   int nelts;
13577   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13578     return false;
13579 
13580   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13581   HOST_WIDE_INT count_op = XVECLEN (op, 0);
13582   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13583   int i = 0;
13584 
13585   if (count_op != count_ideal)
13586     return false;
13587 
13588   for (i = 0; i < count_ideal; i++)
13589     {
13590       rtx elt_op = XVECEXP (op, 0, i);
13591       rtx elt_ideal = XVECEXP (ideal, 0, i);
13592 
13593       if (!CONST_INT_P (elt_op)
13594 	  || INTVAL (elt_ideal) != INTVAL (elt_op))
13595 	return false;
13596     }
13597   return true;
13598 }
13599 
13600 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
13601    HIGH (exclusive).  */
13602 void
13603 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13604 			  const_tree exp)
13605 {
13606   HOST_WIDE_INT lane;
13607   gcc_assert (CONST_INT_P (operand));
13608   lane = INTVAL (operand);
13609 
13610   if (lane < low || lane >= high)
13611   {
13612     if (exp)
13613       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13614     else
13615       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13616   }
13617 }
13618 
13619 /* Perform endian correction on lane number N, which indexes a vector
13620    of mode MODE, and return the result as an SImode rtx.  */
13621 
13622 rtx
13623 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13624 {
13625   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13626 }
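
/* Illustrative example (added here): assuming the usual definition of
   ENDIAN_LANE_N (N on little-endian, NUNITS - 1 - N on big-endian),
   lane 0 of a V4SImode vector maps to 0 on little-endian and to 3 on
   big-endian, matching the reversed register layout used for aarch64_be
   Advanced SIMD.  */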
13627 
13628 /* Return TRUE if OP is a valid vector addressing mode.  */
13629 
13630 bool
13631 aarch64_simd_mem_operand_p (rtx op)
13632 {
13633   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13634 			|| REG_P (XEXP (op, 0)));
13635 }
13636 
13637 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
13638 
13639 bool
13640 aarch64_sve_ld1r_operand_p (rtx op)
13641 {
13642   struct aarch64_address_info addr;
13643   scalar_mode mode;
13644 
13645   return (MEM_P (op)
13646 	  && is_a <scalar_mode> (GET_MODE (op), &mode)
13647 	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13648 	  && addr.type == ADDRESS_REG_IMM
13649 	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13650 }
13651 
13652 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13653    The conditions for STR are the same.  */
13654 bool
13655 aarch64_sve_ldr_operand_p (rtx op)
13656 {
13657   struct aarch64_address_info addr;
13658 
13659   return (MEM_P (op)
13660 	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13661 				       false, ADDR_QUERY_ANY)
13662 	  && addr.type == ADDRESS_REG_IMM);
13663 }
13664 
13665 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13666    We need to be able to access the individual pieces, so the range
13667    is different from LD[234] and ST[234].  */
13668 bool
13669 aarch64_sve_struct_memory_operand_p (rtx op)
13670 {
13671   if (!MEM_P (op))
13672     return false;
13673 
13674   machine_mode mode = GET_MODE (op);
13675   struct aarch64_address_info addr;
13676   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13677 				 ADDR_QUERY_ANY)
13678       || addr.type != ADDRESS_REG_IMM)
13679     return false;
13680 
13681   poly_int64 first = addr.const_offset;
13682   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13683   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13684 	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13685 }
13686 
13687 /* Emit a register copy from operand to operand, taking care not to
13688    early-clobber source registers in the process.
13689 
13690    COUNT is the number of components into which the copy needs to be
13691    decomposed.  */
13692 void
13693 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13694 				unsigned int count)
13695 {
13696   unsigned int i;
13697   int rdest = REGNO (operands[0]);
13698   int rsrc = REGNO (operands[1]);
13699 
13700   if (!reg_overlap_mentioned_p (operands[0], operands[1])
13701       || rdest < rsrc)
13702     for (i = 0; i < count; i++)
13703       emit_move_insn (gen_rtx_REG (mode, rdest + i),
13704 		      gen_rtx_REG (mode, rsrc + i));
13705   else
13706     for (i = 0; i < count; i++)
13707       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13708 		      gen_rtx_REG (mode, rsrc + count - i - 1));
13709 }
13710 
13711 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13712    one of VSTRUCT modes: OI, CI, or XI.  */
13713 int
13714 aarch64_simd_attr_length_rglist (machine_mode mode)
13715 {
13716   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
13717   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13718 }
13719 
13720 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
13721    alignment of a vector to 128 bits.  SVE predicates have an alignment of
13722    16 bits.  */
13723 static HOST_WIDE_INT
13724 aarch64_simd_vector_alignment (const_tree type)
13725 {
13726   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13727     /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13728        be set for non-predicate vectors of booleans.  Modes are the most
13729        direct way we have of identifying real SVE predicate types.  */
13730     return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13731   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13732   return MIN (align, 128);
13733 }
13734 
13735 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
13736 static HOST_WIDE_INT
13737 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13738 {
13739   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13740     {
13741       /* If the length of the vector is fixed, try to align to that length,
13742 	 otherwise don't try to align at all.  */
13743       HOST_WIDE_INT result;
13744       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13745 	result = TYPE_ALIGN (TREE_TYPE (type));
13746       return result;
13747     }
13748   return TYPE_ALIGN (type);
13749 }
13750 
13751 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
13752 static bool
13753 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13754 {
13755   if (is_packed)
13756     return false;
13757 
13758   /* For fixed-length vectors, check that the vectorizer will aim for
13759      full-vector alignment.  This isn't true for generic GCC vectors
13760      that are wider than the ABI maximum of 128 bits.  */
13761   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13762       && (wi::to_widest (TYPE_SIZE (type))
13763 	  != aarch64_vectorize_preferred_vector_alignment (type)))
13764     return false;
13765 
13766   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
13767   return true;
13768 }
13769 
13770 /* Return true if the vector misalignment factor is supported by the
13771    target.  */
13772 static bool
13773 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13774 					     const_tree type, int misalignment,
13775 					     bool is_packed)
13776 {
13777   if (TARGET_SIMD && STRICT_ALIGNMENT)
13778     {
13779       /* Return false if movmisalign is not supported for this mode.  */
13780       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13781         return false;
13782 
13783       /* Misalignment factor is unknown at compile time.  */
13784       if (misalignment == -1)
13785 	return false;
13786     }
13787   return default_builtin_support_vector_misalignment (mode, type, misalignment,
13788 						      is_packed);
13789 }
13790 
13791 /* If VALS is a vector constant that can be loaded into a register
13792    using DUP, generate instructions to do so and return an RTX to
13793    assign to the register.  Otherwise return NULL_RTX.  */
13794 static rtx
13795 aarch64_simd_dup_constant (rtx vals)
13796 {
13797   machine_mode mode = GET_MODE (vals);
13798   machine_mode inner_mode = GET_MODE_INNER (mode);
13799   rtx x;
13800 
13801   if (!const_vec_duplicate_p (vals, &x))
13802     return NULL_RTX;
13803 
13804   /* We can load this constant by using DUP and a constant in a
13805      single ARM register.  This will be cheaper than a vector
13806      load.  */
13807   x = copy_to_mode_reg (inner_mode, x);
13808   return gen_vec_duplicate (mode, x);
13809 }
13810 
13811 
13812 /* Generate code to load VALS, which is a PARALLEL containing only
13813    constants (for vec_init) or CONST_VECTOR, efficiently into a
13814    register.  Returns an RTX to copy into the register, or NULL_RTX
13815    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
13816 static rtx
13817 aarch64_simd_make_constant (rtx vals)
13818 {
13819   machine_mode mode = GET_MODE (vals);
13820   rtx const_dup;
13821   rtx const_vec = NULL_RTX;
13822   int n_const = 0;
13823   int i;
13824 
13825   if (GET_CODE (vals) == CONST_VECTOR)
13826     const_vec = vals;
13827   else if (GET_CODE (vals) == PARALLEL)
13828     {
13829       /* A CONST_VECTOR must contain only CONST_INTs and
13830 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13831 	 Only store valid constants in a CONST_VECTOR.  */
13832       int n_elts = XVECLEN (vals, 0);
13833       for (i = 0; i < n_elts; ++i)
13834 	{
13835 	  rtx x = XVECEXP (vals, 0, i);
13836 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13837 	    n_const++;
13838 	}
13839       if (n_const == n_elts)
13840 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13841     }
13842   else
13843     gcc_unreachable ();
13844 
13845   if (const_vec != NULL_RTX
13846       && aarch64_simd_valid_immediate (const_vec, NULL))
13847     /* Load using MOVI/MVNI.  */
13848     return const_vec;
13849   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13850     /* Loaded using DUP.  */
13851     return const_dup;
13852   else if (const_vec != NULL_RTX)
13853     /* Load from constant pool.  We cannot take advantage of single-cycle
13854        LD1 because we need a PC-relative addressing mode.  */
13855     return const_vec;
13856   else
13857     /* A PARALLEL containing something not valid inside CONST_VECTOR.
13858        We cannot construct an initializer.  */
13859     return NULL_RTX;
13860 }
13861 
13862 /* Expand a vector initialisation sequence, such that TARGET is
13863    initialised to contain VALS.  */
13864 
13865 void
13866 aarch64_expand_vector_init (rtx target, rtx vals)
13867 {
13868   machine_mode mode = GET_MODE (target);
13869   scalar_mode inner_mode = GET_MODE_INNER (mode);
13870   /* The number of vector elements.  */
13871   int n_elts = XVECLEN (vals, 0);
13872   /* The number of vector elements which are not constant.  */
13873   int n_var = 0;
13874   rtx any_const = NULL_RTX;
13875   /* The first element of vals.  */
13876   rtx v0 = XVECEXP (vals, 0, 0);
13877   bool all_same = true;
13878 
13879   /* Count the number of variable elements to initialise.  */
13880   for (int i = 0; i < n_elts; ++i)
13881     {
13882       rtx x = XVECEXP (vals, 0, i);
13883       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13884 	++n_var;
13885       else
13886 	any_const = x;
13887 
13888       all_same &= rtx_equal_p (x, v0);
13889     }
13890 
13891   /* No variable elements, hand off to aarch64_simd_make_constant which knows
13892      how best to handle this.  */
13893   if (n_var == 0)
13894     {
13895       rtx constant = aarch64_simd_make_constant (vals);
13896       if (constant != NULL_RTX)
13897 	{
13898 	  emit_move_insn (target, constant);
13899 	  return;
13900 	}
13901     }
13902 
13903   /* Splat a single non-constant element if we can.  */
13904   if (all_same)
13905     {
13906       rtx x = copy_to_mode_reg (inner_mode, v0);
13907       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13908       return;
13909     }
13910 
13911   enum insn_code icode = optab_handler (vec_set_optab, mode);
13912   gcc_assert (icode != CODE_FOR_nothing);
13913 
13914   /* If there are only variable elements, try to optimize
13915      the insertion using dup for the most common element
13916      followed by insertions.  */
13917 
13918   /* The algorithm will fill matches[*][0] with the earliest matching element,
13919      and matches[X][1] with the count of duplicate elements (if X is the
13920      earliest element which has duplicates).  */
13921 
13922   if (n_var == n_elts && n_elts <= 16)
13923     {
13924       int matches[16][2] = {0};
13925       for (int i = 0; i < n_elts; i++)
13926 	{
13927 	  for (int j = 0; j <= i; j++)
13928 	    {
13929 	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13930 		{
13931 		  matches[i][0] = j;
13932 		  matches[j][1]++;
13933 		  break;
13934 		}
13935 	    }
13936 	}
13937       int maxelement = 0;
13938       int maxv = 0;
13939       for (int i = 0; i < n_elts; i++)
13940 	if (matches[i][1] > maxv)
13941 	  {
13942 	    maxelement = i;
13943 	    maxv = matches[i][1];
13944 	  }
13945 
13946       /* Create a duplicate of the most common element.  */
13947       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13948       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13949 
13950       /* Insert the rest.  */
13951       for (int i = 0; i < n_elts; i++)
13952 	{
13953 	  rtx x = XVECEXP (vals, 0, i);
13954 	  if (matches[i][0] == maxelement)
13955 	    continue;
13956 	  x = copy_to_mode_reg (inner_mode, x);
13957 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13958 	}
13959       return;
13960     }
13961 
13962   /* Initialise a vector which is part-variable.  We want to first try
13963      to build those lanes which are constant in the most efficient way we
13964      can.  */
13965   if (n_var != n_elts)
13966     {
13967       rtx copy = copy_rtx (vals);
13968 
13969       /* Load constant part of vector.  We really don't care what goes into the
13970 	 parts we will overwrite, but we're more likely to be able to load the
13971 	 constant efficiently if it has fewer, larger, repeating parts
13972 	 (see aarch64_simd_valid_immediate).  */
13973       for (int i = 0; i < n_elts; i++)
13974 	{
13975 	  rtx x = XVECEXP (vals, 0, i);
13976 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13977 	    continue;
13978 	  rtx subst = any_const;
13979 	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
13980 	    {
13981 	      /* Look in the copied vector, as more elements are const.  */
13982 	      rtx test = XVECEXP (copy, 0, i ^ bit);
13983 	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13984 		{
13985 		  subst = test;
13986 		  break;
13987 		}
13988 	    }
13989 	  XVECEXP (copy, 0, i) = subst;
13990 	}
13991       aarch64_expand_vector_init (target, copy);
13992     }
13993 
13994   /* Insert the variable lanes directly.  */
13995   for (int i = 0; i < n_elts; i++)
13996     {
13997       rtx x = XVECEXP (vals, 0, i);
13998       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13999 	continue;
14000       x = copy_to_mode_reg (inner_mode, x);
14001       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14002     }
14003 }
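
/* Worked example (added for illustration, not from the original sources):
   for an all-variable V4SImode initialiser {x, y, x, x}, the matches[]
   scan finds the duplicates of x at lanes 0, 2 and 3, so the expansion is
   a DUP of x into the whole vector followed by a single vec_set of y into
   lane 1, rather than four separate lane insertions.  */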
14004 
14005 static unsigned HOST_WIDE_INT
14006 aarch64_shift_truncation_mask (machine_mode mode)
14007 {
14008   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14009     return 0;
14010   return GET_MODE_UNIT_BITSIZE (mode) - 1;
14011 }
14012 
14013 /* Select a format to encode pointers in exception handling data.  */
14014 int
14015 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14016 {
14017    int type;
14018    switch (aarch64_cmodel)
14019      {
14020      case AARCH64_CMODEL_TINY:
14021      case AARCH64_CMODEL_TINY_PIC:
14022      case AARCH64_CMODEL_SMALL:
14023      case AARCH64_CMODEL_SMALL_PIC:
14024      case AARCH64_CMODEL_SMALL_SPIC:
14025        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
14026 	  for everything.  */
14027        type = DW_EH_PE_sdata4;
14028        break;
14029      default:
14030        /* No assumptions here.  8-byte relocs required.  */
14031        type = DW_EH_PE_sdata8;
14032        break;
14033      }
14034    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14035 }
14036 
14037 /* The last .arch and .tune assembly strings that we printed.  */
14038 static std::string aarch64_last_printed_arch_string;
14039 static std::string aarch64_last_printed_tune_string;
14040 
14041 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
14042    by the function fndecl.  */
14043 
14044 void
14045 aarch64_declare_function_name (FILE *stream, const char* name,
14046 				tree fndecl)
14047 {
14048   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14049 
14050   struct cl_target_option *targ_options;
14051   if (target_parts)
14052     targ_options = TREE_TARGET_OPTION (target_parts);
14053   else
14054     targ_options = TREE_TARGET_OPTION (target_option_current_node);
14055   gcc_assert (targ_options);
14056 
14057   const struct processor *this_arch
14058     = aarch64_get_arch (targ_options->x_explicit_arch);
14059 
14060   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14061   std::string extension
14062     = aarch64_get_extension_string_for_isa_flags (isa_flags,
14063 						  this_arch->flags);
14064   /* Only update the assembler .arch string if it is distinct from the last
14065      such string we printed.  */
14066   std::string to_print = this_arch->name + extension;
14067   if (to_print != aarch64_last_printed_arch_string)
14068     {
14069       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14070       aarch64_last_printed_arch_string = to_print;
14071     }
14072 
14073   /* Print the cpu name we're tuning for in the comments; this might be
14074      useful to readers of the generated asm.  Do it only when it changes
14075      from function to function and verbose assembly is requested.  */
14076   const struct processor *this_tune
14077     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14078 
14079   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14080     {
14081       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14082 		   this_tune->name);
14083       aarch64_last_printed_tune_string = this_tune->name;
14084     }
14085 
14086   /* Don't forget the type directive for ELF.  */
14087   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14088   ASM_OUTPUT_LABEL (stream, name);
14089 }
14090 
14091 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
14092 
14093 static void
14094 aarch64_start_file (void)
14095 {
14096   struct cl_target_option *default_options
14097     = TREE_TARGET_OPTION (target_option_default_node);
14098 
14099   const struct processor *default_arch
14100     = aarch64_get_arch (default_options->x_explicit_arch);
14101   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14102   std::string extension
14103     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14104 						  default_arch->flags);
14105 
14106    aarch64_last_printed_arch_string = default_arch->name + extension;
14107    aarch64_last_printed_tune_string = "";
14108    asm_fprintf (asm_out_file, "\t.arch %s\n",
14109 		aarch64_last_printed_arch_string.c_str ());
14110 
14111    default_file_start ();
14112 }
14113 
14114 /* Emit load exclusive.  */
14115 
14116 static void
14117 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14118 			     rtx mem, rtx model_rtx)
14119 {
14120   rtx (*gen) (rtx, rtx, rtx);
14121 
14122   switch (mode)
14123     {
14124     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14125     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14126     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14127     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14128     default:
14129       gcc_unreachable ();
14130     }
14131 
14132   emit_insn (gen (rval, mem, model_rtx));
14133 }
14134 
14135 /* Emit store exclusive.  */
14136 
14137 static void
14138 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14139 			      rtx rval, rtx mem, rtx model_rtx)
14140 {
14141   rtx (*gen) (rtx, rtx, rtx, rtx);
14142 
14143   switch (mode)
14144     {
14145     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14146     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14147     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14148     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14149     default:
14150       gcc_unreachable ();
14151     }
14152 
14153   emit_insn (gen (bval, rval, mem, model_rtx));
14154 }
14155 
14156 /* Emit INSN as a jump and mark it as unlikely to be taken.  */
14157 
14158 static void
14159 aarch64_emit_unlikely_jump (rtx insn)
14160 {
14161   rtx_insn *jump = emit_jump_insn (insn);
14162   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14163 }
14164 
14165 /* Expand a compare and swap pattern.  */
14166 
14167 void
14168 aarch64_expand_compare_and_swap (rtx operands[])
14169 {
14170   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14171   machine_mode mode, cmp_mode;
14172   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14173   int idx;
14174   gen_cas_fn gen;
14175   const gen_cas_fn split_cas[] =
14176   {
14177     gen_aarch64_compare_and_swapqi,
14178     gen_aarch64_compare_and_swaphi,
14179     gen_aarch64_compare_and_swapsi,
14180     gen_aarch64_compare_and_swapdi
14181   };
14182   const gen_cas_fn atomic_cas[] =
14183   {
14184     gen_aarch64_compare_and_swapqi_lse,
14185     gen_aarch64_compare_and_swaphi_lse,
14186     gen_aarch64_compare_and_swapsi_lse,
14187     gen_aarch64_compare_and_swapdi_lse
14188   };
14189 
14190   bval = operands[0];
14191   rval = operands[1];
14192   mem = operands[2];
14193   oldval = operands[3];
14194   newval = operands[4];
14195   is_weak = operands[5];
14196   mod_s = operands[6];
14197   mod_f = operands[7];
14198   mode = GET_MODE (mem);
14199   cmp_mode = mode;
14200 
14201   /* Normally the succ memory model must be stronger than fail, but in the
14202      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14203      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
14204 
14205   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14206       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14207     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14208 
14209   switch (mode)
14210     {
14211     case E_QImode:
14212     case E_HImode:
14213       /* For short modes, we're going to perform the comparison in SImode,
14214 	 so do the zero-extension now.  */
14215       cmp_mode = SImode;
14216       rval = gen_reg_rtx (SImode);
14217       oldval = convert_modes (SImode, mode, oldval, true);
14218       /* Fall through.  */
14219 
14220     case E_SImode:
14221     case E_DImode:
14222       /* Force the value into a register if needed.  */
14223       if (!aarch64_plus_operand (oldval, mode))
14224 	oldval = force_reg (cmp_mode, oldval);
14225       break;
14226 
14227     default:
14228       gcc_unreachable ();
14229     }
14230 
14231   switch (mode)
14232     {
14233     case E_QImode: idx = 0; break;
14234     case E_HImode: idx = 1; break;
14235     case E_SImode: idx = 2; break;
14236     case E_DImode: idx = 3; break;
14237     default:
14238       gcc_unreachable ();
14239     }
14240   if (TARGET_LSE)
14241     gen = atomic_cas[idx];
14242   else
14243     gen = split_cas[idx];
14244 
14245   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14246 
14247   if (mode == QImode || mode == HImode)
14248     emit_move_insn (operands[1], gen_lowpart (mode, rval));
14249 
14250   x = gen_rtx_REG (CCmode, CC_REGNUM);
14251   x = gen_rtx_EQ (SImode, x, const0_rtx);
14252   emit_insn (gen_rtx_SET (bval, x));
14253 }
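
/* Illustrative example (added here, not from the original sources): the
   memory-model promotion above means that a call such as

     __atomic_compare_exchange_n (p, &expected, desired, 0,
				  __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)

   is expanded with an ACQ_REL success ordering, so the acquire semantics
   requested for the failure path are not lost.  */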
14254 
14255 /* Test whether the target supports using an atomic load-operate instruction.
14256    CODE is the operation and AFTER is TRUE if the data in memory after the
14257    operation should be returned and FALSE if the data before the operation
14258    should be returned.  Returns FALSE if the operation isn't supported by the
14259    architecture.  */
14260 
14261 bool
14262 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14263 {
14264   if (!TARGET_LSE)
14265     return false;
14266 
14267   switch (code)
14268     {
14269     case SET:
14270     case AND:
14271     case IOR:
14272     case XOR:
14273     case MINUS:
14274     case PLUS:
14275       return true;
14276     default:
14277       return false;
14278     }
14279 }
14280 
14281 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14282    sequence implementing an atomic operation.  */
14283 
14284 static void
14285 aarch64_emit_post_barrier (enum memmodel model)
14286 {
14287   const enum memmodel base_model = memmodel_base (model);
14288 
14289   if (is_mm_sync (model)
14290       && (base_model == MEMMODEL_ACQUIRE
14291 	  || base_model == MEMMODEL_ACQ_REL
14292 	  || base_model == MEMMODEL_SEQ_CST))
14293     {
14294       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14295     }
14296 }
14297 
14298 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
14299    for the data in memory.  EXPECTED is the value expected to be in memory.
14300    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
14301    is the memory ordering to use.  */
14302 
14303 void
14304 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14305 			rtx expected, rtx desired,
14306 			rtx model)
14307 {
14308   rtx (*gen) (rtx, rtx, rtx, rtx);
14309   machine_mode mode;
14310 
14311   mode = GET_MODE (mem);
14312 
14313   switch (mode)
14314     {
14315     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14316     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14317     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14318     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14319     default:
14320       gcc_unreachable ();
14321     }
14322 
14323   /* Move the expected value into the CAS destination register.  */
14324   emit_insn (gen_rtx_SET (rval, expected));
14325 
14326   /* Emit the CAS.  */
14327   emit_insn (gen (rval, mem, desired, model));
14328 
14329   /* Compare the expected value with the value loaded by the CAS, to establish
14330      whether the swap was made.  */
14331   aarch64_gen_compare_reg (EQ, rval, expected);
14332 }
14333 
14334 /* Split a compare and swap pattern.  */
14335 
14336 void
14337 aarch64_split_compare_and_swap (rtx operands[])
14338 {
14339   rtx rval, mem, oldval, newval, scratch;
14340   machine_mode mode;
14341   bool is_weak;
14342   rtx_code_label *label1, *label2;
14343   rtx x, cond;
14344   enum memmodel model;
14345   rtx model_rtx;
14346 
14347   rval = operands[0];
14348   mem = operands[1];
14349   oldval = operands[2];
14350   newval = operands[3];
14351   is_weak = (operands[4] != const0_rtx);
14352   model_rtx = operands[5];
14353   scratch = operands[7];
14354   mode = GET_MODE (mem);
14355   model = memmodel_from_int (INTVAL (model_rtx));
14356 
14357   /* When OLDVAL is zero and we want the strong version we can emit a tighter
14358      loop:
14359     .label1:
14360 	LD[A]XR	rval, [mem]
14361 	CBNZ	rval, .label2
14362 	ST[L]XR	scratch, newval, [mem]
14363 	CBNZ	scratch, .label1
14364     .label2:
14365 	CMP	rval, 0.  */
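  /* Otherwise (the strong case with a non-zero OLDVAL) the emitted sequence
     is roughly:
    .label1:
	LD[A]XR	rval, [mem]
	CMP	rval, oldval
	B.NE	.label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
    .label2:  */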
14366   bool strong_zero_p = !is_weak && oldval == const0_rtx;
14367 
14368   label1 = NULL;
14369   if (!is_weak)
14370     {
14371       label1 = gen_label_rtx ();
14372       emit_label (label1);
14373     }
14374   label2 = gen_label_rtx ();
14375 
14376   /* The initial load can be relaxed for a __sync operation since a final
14377      barrier will be emitted to stop code hoisting.  */
14378   if (is_mm_sync (model))
14379     aarch64_emit_load_exclusive (mode, rval, mem,
14380 				 GEN_INT (MEMMODEL_RELAXED));
14381   else
14382     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14383 
14384   if (strong_zero_p)
14385     {
14386       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14387       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14388 				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14389       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14390     }
14391   else
14392     {
14393       cond = aarch64_gen_compare_reg (NE, rval, oldval);
14394       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14395       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14396 				 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14397       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14398     }
14399 
14400   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14401 
14402   if (!is_weak)
14403     {
14404       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14405       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14406 				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14407       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14408     }
14409   else
14410     {
14411       cond = gen_rtx_REG (CCmode, CC_REGNUM);
14412       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14413       emit_insn (gen_rtx_SET (cond, x));
14414     }
14415 
14416   emit_label (label2);
14417   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14418      to set the condition flags.  If this is not used it will be removed by
14419      later passes.  */
14420   if (strong_zero_p)
14421     {
14422       cond = gen_rtx_REG (CCmode, CC_REGNUM);
14423       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14424       emit_insn (gen_rtx_SET (cond, x));
14425     }
14426   /* Emit any final barrier needed for a __sync operation.  */
14427   if (is_mm_sync (model))
14428     aarch64_emit_post_barrier (model);
14429 }
14430 
14431 /* Emit a BIC instruction.  */
14432 
14433 static void
14434 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14435 {
14436   rtx shift_rtx = GEN_INT (shift);
14437   rtx (*gen) (rtx, rtx, rtx, rtx);
14438 
14439   switch (mode)
14440     {
14441     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14442     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14443     default:
14444       gcc_unreachable ();
14445     }
14446 
14447   emit_insn (gen (dst, s2, shift_rtx, s1));
14448 }
14449 
14450 /* Emit an atomic swap.  */
14451 
14452 static void
14453 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14454 			  rtx mem, rtx model)
14455 {
14456   rtx (*gen) (rtx, rtx, rtx, rtx);
14457 
14458   switch (mode)
14459     {
14460     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14461     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14462     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14463     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14464     default:
14465       gcc_unreachable ();
14466     }
14467 
14468   emit_insn (gen (dst, mem, value, model));
14469 }
14470 
14471 /* Operations supported by aarch64_emit_atomic_load_op.  */
14472 
14473 enum aarch64_atomic_load_op_code
14474 {
14475   AARCH64_LDOP_PLUS,	/* A + B  */
14476   AARCH64_LDOP_XOR,	/* A ^ B  */
14477   AARCH64_LDOP_OR,	/* A | B  */
14478   AARCH64_LDOP_BIC	/* A & ~B  */
14479 };
14480 
14481 /* Emit an atomic load-operate.  */
14482 
14483 static void
14484 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14485 			     machine_mode mode, rtx dst, rtx src,
14486 			     rtx mem, rtx model)
14487 {
14488   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14489   const aarch64_atomic_load_op_fn plus[] =
14490   {
14491     gen_aarch64_atomic_loadaddqi,
14492     gen_aarch64_atomic_loadaddhi,
14493     gen_aarch64_atomic_loadaddsi,
14494     gen_aarch64_atomic_loadadddi
14495   };
14496   const aarch64_atomic_load_op_fn eor[] =
14497   {
14498     gen_aarch64_atomic_loadeorqi,
14499     gen_aarch64_atomic_loadeorhi,
14500     gen_aarch64_atomic_loadeorsi,
14501     gen_aarch64_atomic_loadeordi
14502   };
14503   const aarch64_atomic_load_op_fn ior[] =
14504   {
14505     gen_aarch64_atomic_loadsetqi,
14506     gen_aarch64_atomic_loadsethi,
14507     gen_aarch64_atomic_loadsetsi,
14508     gen_aarch64_atomic_loadsetdi
14509   };
14510   const aarch64_atomic_load_op_fn bic[] =
14511   {
14512     gen_aarch64_atomic_loadclrqi,
14513     gen_aarch64_atomic_loadclrhi,
14514     gen_aarch64_atomic_loadclrsi,
14515     gen_aarch64_atomic_loadclrdi
14516   };
14517   aarch64_atomic_load_op_fn gen;
14518   int idx = 0;
14519 
14520   switch (mode)
14521     {
14522     case E_QImode: idx = 0; break;
14523     case E_HImode: idx = 1; break;
14524     case E_SImode: idx = 2; break;
14525     case E_DImode: idx = 3; break;
14526     default:
14527       gcc_unreachable ();
14528     }
14529 
14530   switch (code)
14531     {
14532     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14533     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14534     case AARCH64_LDOP_OR: gen = ior[idx]; break;
14535     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14536     default:
14537       gcc_unreachable ();
14538     }
14539 
14540   emit_insn (gen (dst, mem, src, model));
14541 }
14542 
14543 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
14544    location to store the data read from memory.  OUT_RESULT is the location to
14545    store the result of the operation.  MEM is the memory location to read and
14546    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
14547    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
14548    be NULL.  */
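/* For example, an atomic fetch-and-AND inverts VALUE and uses the atomic
   bit-clear form (LDCLR); if OUT_RESULT is also wanted, the value after the
   operation is recomputed below from the loaded data using BIC.  */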
14549 
14550 void
14551 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14552 			 rtx mem, rtx value, rtx model_rtx)
14553 {
14554   machine_mode mode = GET_MODE (mem);
14555   machine_mode wmode = (mode == DImode ? DImode : SImode);
14556   const bool short_mode = (mode < SImode);
14557   aarch64_atomic_load_op_code ldop_code;
14558   rtx src;
14559   rtx x;
14560 
14561   if (out_data)
14562     out_data = gen_lowpart (mode, out_data);
14563 
14564   if (out_result)
14565     out_result = gen_lowpart (mode, out_result);
14566 
14567   /* Make sure the value is in a register, putting it into a destination
14568      register if it needs to be manipulated.  */
14569   if (!register_operand (value, mode)
14570       || code == AND || code == MINUS)
14571     {
14572       src = out_result ? out_result : out_data;
14573       emit_move_insn (src, gen_lowpart (mode, value));
14574     }
14575   else
14576     src = value;
14577   gcc_assert (register_operand (src, mode));
14578 
14579   /* Preprocess the data for the operation as necessary.  If the operation is
14580      a SET then emit a swap instruction and finish.  */
14581   switch (code)
14582     {
14583     case SET:
14584       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14585       return;
14586 
14587     case MINUS:
14588       /* Negate the value and treat it as a PLUS.  */
14589       {
14590 	rtx neg_src;
14591 
14592 	/* Resize the value if necessary.  */
14593 	if (short_mode)
14594 	  src = gen_lowpart (wmode, src);
14595 
14596 	neg_src = gen_rtx_NEG (wmode, src);
14597 	emit_insn (gen_rtx_SET (src, neg_src));
14598 
14599 	if (short_mode)
14600 	  src = gen_lowpart (mode, src);
14601       }
14602       /* Fall-through.  */
14603     case PLUS:
14604       ldop_code = AARCH64_LDOP_PLUS;
14605       break;
14606 
14607     case IOR:
14608       ldop_code = AARCH64_LDOP_OR;
14609       break;
14610 
14611     case XOR:
14612       ldop_code = AARCH64_LDOP_XOR;
14613       break;
14614 
14615     case AND:
14616       {
14617 	rtx not_src;
14618 
14619 	/* Resize the value if necessary.  */
14620 	if (short_mode)
14621 	  src = gen_lowpart (wmode, src);
14622 
14623 	not_src = gen_rtx_NOT (wmode, src);
14624 	emit_insn (gen_rtx_SET (src, not_src));
14625 
14626 	if (short_mode)
14627 	  src = gen_lowpart (mode, src);
14628       }
14629       ldop_code = AARCH64_LDOP_BIC;
14630       break;
14631 
14632     default:
14633       /* The operation can't be done with atomic instructions.  */
14634       gcc_unreachable ();
14635     }
14636 
14637   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14638 
14639   /* If necessary, calculate the data in memory after the update by redoing the
14640      operation from values in registers.  */
14641   if (!out_result)
14642     return;
14643 
14644   if (short_mode)
14645     {
14646       src = gen_lowpart (wmode, src);
14647       out_data = gen_lowpart (wmode, out_data);
14648       out_result = gen_lowpart (wmode, out_result);
14649     }
14650 
14651   x = NULL_RTX;
14652 
14653   switch (code)
14654     {
14655     case MINUS:
14656     case PLUS:
14657       x = gen_rtx_PLUS (wmode, out_data, src);
14658       break;
14659     case IOR:
14660       x = gen_rtx_IOR (wmode, out_data, src);
14661       break;
14662     case XOR:
14663       x = gen_rtx_XOR (wmode, out_data, src);
14664       break;
14665     case AND:
14666       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14667       return;
14668     default:
14669       gcc_unreachable ();
14670     }
14671 
14672   emit_set_insn (out_result, x);
14673 
14674   return;
14675 }
14676 
14677 /* Split an atomic operation.  */
14678 
14679 void
14680 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14681 			 rtx value, rtx model_rtx, rtx cond)
14682 {
14683   machine_mode mode = GET_MODE (mem);
14684   machine_mode wmode = (mode == DImode ? DImode : SImode);
14685   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14686   const bool is_sync = is_mm_sync (model);
14687   rtx_code_label *label;
14688   rtx x;
14689 
14690   /* Split the atomic operation into a sequence.  */
14691   label = gen_label_rtx ();
14692   emit_label (label);
14693 
14694   if (new_out)
14695     new_out = gen_lowpart (wmode, new_out);
14696   if (old_out)
14697     old_out = gen_lowpart (wmode, old_out);
14698   else
14699     old_out = new_out;
14700   value = simplify_gen_subreg (wmode, value, mode, 0);
14701 
14702   /* The initial load can be relaxed for a __sync operation since a final
14703      barrier will be emitted to stop code hoisting.  */
14704   if (is_sync)
14705     aarch64_emit_load_exclusive (mode, old_out, mem,
14706 				 GEN_INT (MEMMODEL_RELAXED));
14707   else
14708     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14709 
14710   switch (code)
14711     {
14712     case SET:
14713       new_out = value;
14714       break;
14715 
14716     case NOT:
14717       x = gen_rtx_AND (wmode, old_out, value);
14718       emit_insn (gen_rtx_SET (new_out, x));
14719       x = gen_rtx_NOT (wmode, new_out);
14720       emit_insn (gen_rtx_SET (new_out, x));
14721       break;
14722 
14723     case MINUS:
14724       if (CONST_INT_P (value))
14725 	{
14726 	  value = GEN_INT (-INTVAL (value));
14727 	  code = PLUS;
14728 	}
14729       /* Fall through.  */
14730 
14731     default:
14732       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14733       emit_insn (gen_rtx_SET (new_out, x));
14734       break;
14735     }
14736 
14737   aarch64_emit_store_exclusive (mode, cond, mem,
14738 				gen_lowpart (mode, new_out), model_rtx);
14739 
14740   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14741   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14742 			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14743   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14744 
14745   /* Emit any final barrier needed for a __sync operation.  */
14746   if (is_sync)
14747     aarch64_emit_post_barrier (model);
14748 }
14749 
14750 static void
14751 aarch64_init_libfuncs (void)
14752 {
14753   /* Half-precision float operations.  The compiler handles all operations
14754      with NULL libfuncs by converting to SFmode.  */
14755 
14756   /* Conversions.  */
14757   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14758   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14759 
14760   /* Arithmetic.  */
14761   set_optab_libfunc (add_optab, HFmode, NULL);
14762   set_optab_libfunc (sdiv_optab, HFmode, NULL);
14763   set_optab_libfunc (smul_optab, HFmode, NULL);
14764   set_optab_libfunc (neg_optab, HFmode, NULL);
14765   set_optab_libfunc (sub_optab, HFmode, NULL);
14766 
14767   /* Comparisons.  */
14768   set_optab_libfunc (eq_optab, HFmode, NULL);
14769   set_optab_libfunc (ne_optab, HFmode, NULL);
14770   set_optab_libfunc (lt_optab, HFmode, NULL);
14771   set_optab_libfunc (le_optab, HFmode, NULL);
14772   set_optab_libfunc (ge_optab, HFmode, NULL);
14773   set_optab_libfunc (gt_optab, HFmode, NULL);
14774   set_optab_libfunc (unord_optab, HFmode, NULL);
14775 }
14776 
14777 /* Target hook for c_mode_for_suffix.  */
14778 static machine_mode
14779 aarch64_c_mode_for_suffix (char suffix)
14780 {
14781   if (suffix == 'q')
14782     return TFmode;
14783 
14784   return VOIDmode;
14785 }
14786 
14787 /* We can only represent floating point constants which will fit in
14788    "quarter-precision" values.  These values are characterised by
14789    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14790    by:
14791 
14792    (-1)^s * (n/16) * 2^r
14793 
14794    Where:
14795      's' is the sign bit.
14796      'n' is an integer in the range 16 <= n <= 31.
14797      'r' is an integer in the range -3 <= r <= 4.  */
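/* For example, 1.0 is (16/16) * 2^0 and 0.5 is (16/16) * 2^-1; the largest
   representable magnitude is (31/16) * 2^4 = 31.0 and the smallest is
   (16/16) * 2^-3 = 0.125.  */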
14798 
14799 /* Return true iff X can be represented as a quarter-precision
14800    floating point immediate operand.  Note, we cannot represent 0.0.  */
14801 bool
14802 aarch64_float_const_representable_p (rtx x)
14803 {
14804   /* This represents our current view of how many bits
14805      make up the mantissa.  */
14806   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14807   int exponent;
14808   unsigned HOST_WIDE_INT mantissa, mask;
14809   REAL_VALUE_TYPE r, m;
14810   bool fail;
14811 
14812   if (!CONST_DOUBLE_P (x))
14813     return false;
14814 
14815   /* We don't support HFmode constants yet.  */
14816   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14817     return false;
14818 
14819   r = *CONST_DOUBLE_REAL_VALUE (x);
14820 
14821   /* We cannot represent infinities, NaNs or +/-zero.  We won't
14822      know if we have +zero until we analyse the mantissa, but we
14823      can reject the other invalid values.  */
14824   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14825       || REAL_VALUE_MINUS_ZERO (r))
14826     return false;
14827 
14828   /* Extract exponent.  */
14829   r = real_value_abs (&r);
14830   exponent = REAL_EXP (&r);
14831 
14832   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14833      highest (sign) bit, with a fixed binary point at bit point_pos.
14834      The low half of W holds the low part, the high half the high part.
14835      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14836      bits for the mantissa, this can fail (low bits will be lost).  */
14837   real_ldexp (&m, &r, point_pos - exponent);
14838   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14839 
14840   /* If the low part of the mantissa has bits set we cannot represent
14841      the value.  */
14842   if (w.ulow () != 0)
14843     return false;
14844   /* We have rejected the lower HOST_WIDE_INT, so update our
14845      understanding of how many bits lie in the mantissa and
14846      look only at the high HOST_WIDE_INT.  */
14847   mantissa = w.elt (1);
14848   point_pos -= HOST_BITS_PER_WIDE_INT;
14849 
14850   /* We can only represent values with a mantissa of the form 1.xxxx.  */
14851   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14852   if ((mantissa & mask) != 0)
14853     return false;
14854 
14855   /* Having filtered unrepresentable values, we may now remove all
14856      but the highest 5 bits.  */
14857   mantissa >>= point_pos - 5;
14858 
14859   /* We cannot represent the value 0.0, so reject it.  This is handled
14860      elsewhere.  */
14861   if (mantissa == 0)
14862     return false;
14863 
14864   /* Then, as bit 4 is always set, we can mask it off, leaving
14865      the mantissa in the range [0, 15].  */
14866   mantissa &= ~(1 << 4);
14867   gcc_assert (mantissa <= 15);
14868 
14869   /* GCC internally does not use IEEE754-like encoding (where normalized
14870      significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
14871      Our mantissa values are shifted 4 places to the left relative to
14872      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14873      by 5 places to correct for GCC's representation.  */
14874   exponent = 5 - exponent;
14875 
14876   return (exponent >= 0 && exponent <= 7);
14877 }
14878 
14879 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14880    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
14881    output MOVI/MVNI, ORR or BIC immediate.  */
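/* For example, a V8QImode vector of 42s gives "movi\t%0.8b, 0x2a", and the
   ORR form with a shifted immediate gives strings like
   "orr\t%0.4h, #1, lsl #8".  */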
14882 char*
14883 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14884 				   enum simd_immediate_check which)
14885 {
14886   bool is_valid;
14887   static char templ[40];
14888   const char *mnemonic;
14889   const char *shift_op;
14890   unsigned int lane_count = 0;
14891   char element_char;
14892 
14893   struct simd_immediate_info info;
14894 
14895   /* This will return true to show const_vector is legal for use as either
14896      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14897      It will also update INFO to show how the immediate should be generated.
14898      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
14899   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14900   gcc_assert (is_valid);
14901 
14902   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14903   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14904 
14905   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14906     {
14907       gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14908       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14909 	 move immediate path.  */
14910       if (aarch64_float_const_zero_rtx_p (info.value))
14911         info.value = GEN_INT (0);
14912       else
14913 	{
14914 	  const unsigned int buf_size = 20;
14915 	  char float_buf[buf_size] = {'\0'};
14916 	  real_to_decimal_for_mode (float_buf,
14917 				    CONST_DOUBLE_REAL_VALUE (info.value),
14918 				    buf_size, buf_size, 1, info.elt_mode);
14919 
14920 	  if (lane_count == 1)
14921 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14922 	  else
14923 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14924 		      lane_count, element_char, float_buf);
14925 	  return templ;
14926 	}
14927     }
14928 
14929   gcc_assert (CONST_INT_P (info.value));
14930 
14931   if (which == AARCH64_CHECK_MOV)
14932     {
14933       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14934       shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14935       if (lane_count == 1)
14936 	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14937 		  mnemonic, UINTVAL (info.value));
14938       else if (info.shift)
14939 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14940 		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14941 		  element_char, UINTVAL (info.value), shift_op, info.shift);
14942       else
14943 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14944 		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14945 		  element_char, UINTVAL (info.value));
14946     }
14947   else
14948     {
14949       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
14950       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14951       if (info.shift)
14952 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14953 		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14954 		  element_char, UINTVAL (info.value), "lsl", info.shift);
14955       else
14956 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14957 		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14958 		  element_char, UINTVAL (info.value));
14959     }
14960   return templ;
14961 }
14962 
14963 char*
14964 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14965 {
14966 
14967   /* If a floating-point number was passed and we want to use it in an
14968      integer mode, do the conversion to integer.  */
14969   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14970     {
14971       unsigned HOST_WIDE_INT ival;
14972       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14973 	  gcc_unreachable ();
14974       immediate = gen_int_mode (ival, mode);
14975     }
14976 
14977   machine_mode vmode;
14978   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
14979      a 128-bit vector mode.  */
14980   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14981 
14982   vmode = aarch64_simd_container_mode (mode, width);
14983   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14984   return aarch64_output_simd_mov_immediate (v_op, width);
14985 }
14986 
14987 /* Return the output string to use for moving immediate CONST_VECTOR
14988    into an SVE register.  */
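/* For example, with 32-bit elements a stepped constant such as
   { 0, 2, 4, ... } gives "index\t%0.s, #0, #2" and a splat of 1 gives
   "mov\t%0.s, #1".  */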
14989 
14990 char *
14991 aarch64_output_sve_mov_immediate (rtx const_vector)
14992 {
14993   static char templ[40];
14994   struct simd_immediate_info info;
14995   char element_char;
14996 
14997   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14998   gcc_assert (is_valid);
14999 
15000   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15001 
15002   if (info.step)
15003     {
15004       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15005 		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15006 		element_char, INTVAL (info.value), INTVAL (info.step));
15007       return templ;
15008     }
15009 
15010   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15011     {
15012       if (aarch64_float_const_zero_rtx_p (info.value))
15013 	info.value = GEN_INT (0);
15014       else
15015 	{
15016 	  const int buf_size = 20;
15017 	  char float_buf[buf_size] = {};
15018 	  real_to_decimal_for_mode (float_buf,
15019 				    CONST_DOUBLE_REAL_VALUE (info.value),
15020 				    buf_size, buf_size, 1, info.elt_mode);
15021 
15022 	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15023 		    element_char, float_buf);
15024 	  return templ;
15025 	}
15026     }
15027 
15028   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15029 	    element_char, INTVAL (info.value));
15030   return templ;
15031 }
15032 
15033 /* Return the asm format for a PTRUE instruction whose destination has
15034    mode MODE.  SUFFIX is the element size suffix.  */
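/* For example, a fixed-length predicate mode with four elements (as with
   -msve-vector-bits=128) gives "ptrue\t%0.s, vl4", while a variable-length
   mode gives "ptrue\t%0.s, all".  */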
15035 
15036 char *
15037 aarch64_output_ptrue (machine_mode mode, char suffix)
15038 {
15039   unsigned int nunits;
15040   static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15041   if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15042     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15043   else
15044     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15045   return buf;
15046 }
15047 
15048 /* Split operands into moves from op[1] + op[2] into op[0].  */
15049 
15050 void
15051 aarch64_split_combinev16qi (rtx operands[3])
15052 {
15053   unsigned int dest = REGNO (operands[0]);
15054   unsigned int src1 = REGNO (operands[1]);
15055   unsigned int src2 = REGNO (operands[2]);
15056   machine_mode halfmode = GET_MODE (operands[1]);
15057   unsigned int halfregs = REG_NREGS (operands[1]);
15058   rtx destlo, desthi;
15059 
15060   gcc_assert (halfmode == V16QImode);
15061 
15062   if (src1 == dest && src2 == dest + halfregs)
15063     {
15064       /* No-op move.  Can't split to nothing; emit something.  */
15065       emit_note (NOTE_INSN_DELETED);
15066       return;
15067     }
15068 
15069   /* Preserve register attributes for variable tracking.  */
15070   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15071   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15072 			       GET_MODE_SIZE (halfmode));
15073 
15074   /* Special case of reversed high/low parts.  */
15075   if (reg_overlap_mentioned_p (operands[2], destlo)
15076       && reg_overlap_mentioned_p (operands[1], desthi))
15077     {
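      /* Exchange operands[1] and operands[2] in place with three XORs, so
	 that no scratch register is needed.  */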
15078       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15079       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15080       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15081     }
15082   else if (!reg_overlap_mentioned_p (operands[2], destlo))
15083     {
15084       /* Try to avoid unnecessary moves if part of the result
15085 	 is in the right place already.  */
15086       if (src1 != dest)
15087 	emit_move_insn (destlo, operands[1]);
15088       if (src2 != dest + halfregs)
15089 	emit_move_insn (desthi, operands[2]);
15090     }
15091   else
15092     {
15093       if (src2 != dest + halfregs)
15094 	emit_move_insn (desthi, operands[2]);
15095       if (src1 != dest)
15096 	emit_move_insn (destlo, operands[1]);
15097     }
15098 }
15099 
15100 /* vec_perm support.  */
15101 
15102 struct expand_vec_perm_d
15103 {
15104   rtx target, op0, op1;
15105   vec_perm_indices perm;
15106   machine_mode vmode;
15107   unsigned int vec_flags;
15108   bool one_vector_p;
15109   bool testing_p;
15110 };
15111 
15112 /* Generate a variable permutation.  */
15113 
15114 static void
15115 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15116 {
15117   machine_mode vmode = GET_MODE (target);
15118   bool one_vector_p = rtx_equal_p (op0, op1);
15119 
15120   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15121   gcc_checking_assert (GET_MODE (op0) == vmode);
15122   gcc_checking_assert (GET_MODE (op1) == vmode);
15123   gcc_checking_assert (GET_MODE (sel) == vmode);
15124   gcc_checking_assert (TARGET_SIMD);
15125 
15126   if (one_vector_p)
15127     {
15128       if (vmode == V8QImode)
15129 	{
15130 	  /* Expand the argument to a V16QI mode by duplicating it.  */
15131 	  rtx pair = gen_reg_rtx (V16QImode);
15132 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15133 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15134 	}
15135       else
15136 	{
15137 	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15138 	}
15139     }
15140   else
15141     {
15142       rtx pair;
15143 
15144       if (vmode == V8QImode)
15145 	{
15146 	  pair = gen_reg_rtx (V16QImode);
15147 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15148 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15149 	}
15150       else
15151 	{
15152 	  pair = gen_reg_rtx (OImode);
15153 	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15154 	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15155 	}
15156     }
15157 }
15158 
15159 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15160    NELT is the number of elements in the vector.  */
15161 
15162 void
15163 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15164 			 unsigned int nelt)
15165 {
15166   machine_mode vmode = GET_MODE (target);
15167   bool one_vector_p = rtx_equal_p (op0, op1);
15168   rtx mask;
15169 
15170   /* The TBL instruction does not use a modulo index, so we must take care
15171      of that ourselves.  */
15172   mask = aarch64_simd_gen_const_vector_dup (vmode,
15173       one_vector_p ? nelt - 1 : 2 * nelt - 1);
15174   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15175 
15176   /* For big-endian, we also need to reverse the index within the vector
15177      (but not which vector).  */
15178   if (BYTES_BIG_ENDIAN)
15179     {
15180       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
15181       if (!one_vector_p)
15182         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15183       sel = expand_simple_binop (vmode, XOR, sel, mask,
15184 				 NULL, 0, OPTAB_LIB_WIDEN);
15185     }
15186   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15187 }
15188 
15189 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
15190 
15191 static void
15192 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15193 {
15194   emit_insn (gen_rtx_SET (target,
15195 			  gen_rtx_UNSPEC (GET_MODE (target),
15196 					  gen_rtvec (2, op0, op1), code)));
15197 }
15198 
15199 /* Expand an SVE vec_perm with the given operands.  */
15200 
15201 void
15202 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15203 {
15204   machine_mode data_mode = GET_MODE (target);
15205   machine_mode sel_mode = GET_MODE (sel);
15206   /* Enforced by the pattern condition.  */
15207   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15208 
15209   /* Note: vec_perm indices are supposed to wrap when they go beyond the
15210      size of the two value vectors, i.e. the upper bits of the indices
15211      are effectively ignored.  SVE TBL instead produces 0 for any
15212      out-of-range indices, so we need to modulo all the vec_perm indices
15213      to ensure they are all in range.  */
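  /* For example, with four elements per input an index of 5 must select
     element 1 of OP1, but a single TBL would return 0 for it; the masking,
     subtraction and final OR below produce in-range indices for each
     input.  */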
15214   rtx sel_reg = force_reg (sel_mode, sel);
15215 
15216   /* Check if the sel only references the first values vector.  */
15217   if (GET_CODE (sel) == CONST_VECTOR
15218       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15219     {
15220       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15221       return;
15222     }
15223 
15224   /* Check if the two values vectors are the same.  */
15225   if (rtx_equal_p (op0, op1))
15226     {
15227       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15228       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15229 					 NULL, 0, OPTAB_DIRECT);
15230       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15231       return;
15232     }
15233 
15234   /* Run TBL on each value vector and combine the results.  */
15235 
15236   rtx res0 = gen_reg_rtx (data_mode);
15237   rtx res1 = gen_reg_rtx (data_mode);
15238   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15239   if (GET_CODE (sel) != CONST_VECTOR
15240       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15241     {
15242       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15243 						       2 * nunits - 1);
15244       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15245 				     NULL, 0, OPTAB_DIRECT);
15246     }
15247   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15248   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15249 				     NULL, 0, OPTAB_DIRECT);
15250   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15251   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15252     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15253   else
15254     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15255 }
15256 
15257 /* Recognize patterns suitable for the TRN instructions.  */
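/* For example, on V4SImode { 0, 4, 2, 6 } maps to TRN1 and { 1, 5, 3, 7 }
   to TRN2.  */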
15258 static bool
15259 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15260 {
15261   HOST_WIDE_INT odd;
15262   poly_uint64 nelt = d->perm.length ();
15263   rtx out, in0, in1, x;
15264   machine_mode vmode = d->vmode;
15265 
15266   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15267     return false;
15268 
15269   /* Note that these are little-endian tests.
15270      We correct for big-endian later.  */
15271   if (!d->perm[0].is_constant (&odd)
15272       || (odd != 0 && odd != 1)
15273       || !d->perm.series_p (0, 2, odd, 2)
15274       || !d->perm.series_p (1, 2, nelt + odd, 2))
15275     return false;
15276 
15277   /* Success!  */
15278   if (d->testing_p)
15279     return true;
15280 
15281   in0 = d->op0;
15282   in1 = d->op1;
15283   /* We don't need a big-endian lane correction for SVE; see the comment
15284      at the head of aarch64-sve.md for details.  */
15285   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15286     {
15287       x = in0, in0 = in1, in1 = x;
15288       odd = !odd;
15289     }
15290   out = d->target;
15291 
15292   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15293 				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15294   return true;
15295 }
15296 
15297 /* Recognize patterns suitable for the UZP instructions.  */
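/* For example, on V4SImode { 0, 2, 4, 6 } maps to UZP1 and { 1, 3, 5, 7 }
   to UZP2.  */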
15298 static bool
15299 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15300 {
15301   HOST_WIDE_INT odd;
15302   rtx out, in0, in1, x;
15303   machine_mode vmode = d->vmode;
15304 
15305   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15306     return false;
15307 
15308   /* Note that these are little-endian tests.
15309      We correct for big-endian later.  */
15310   if (!d->perm[0].is_constant (&odd)
15311       || (odd != 0 && odd != 1)
15312       || !d->perm.series_p (0, 1, odd, 2))
15313     return false;
15314 
15315   /* Success!  */
15316   if (d->testing_p)
15317     return true;
15318 
15319   in0 = d->op0;
15320   in1 = d->op1;
15321   /* We don't need a big-endian lane correction for SVE; see the comment
15322      at the head of aarch64-sve.md for details.  */
15323   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15324     {
15325       x = in0, in0 = in1, in1 = x;
15326       odd = !odd;
15327     }
15328   out = d->target;
15329 
15330   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15331 				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15332   return true;
15333 }
15334 
15335 /* Recognize patterns suitable for the ZIP instructions.  */
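/* For example, on V4SImode { 0, 4, 1, 5 } maps to ZIP1 and { 2, 6, 3, 7 }
   to ZIP2.  */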
15336 static bool
15337 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15338 {
15339   unsigned int high;
15340   poly_uint64 nelt = d->perm.length ();
15341   rtx out, in0, in1, x;
15342   machine_mode vmode = d->vmode;
15343 
15344   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15345     return false;
15346 
15347   /* Note that these are little-endian tests.
15348      We correct for big-endian later.  */
15349   poly_uint64 first = d->perm[0];
15350   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15351       || !d->perm.series_p (0, 2, first, 1)
15352       || !d->perm.series_p (1, 2, first + nelt, 1))
15353     return false;
15354   high = maybe_ne (first, 0U);
15355 
15356   /* Success!  */
15357   if (d->testing_p)
15358     return true;
15359 
15360   in0 = d->op0;
15361   in1 = d->op1;
15362   /* We don't need a big-endian lane correction for SVE; see the comment
15363      at the head of aarch64-sve.md for details.  */
15364   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15365     {
15366       x = in0, in0 = in1, in1 = x;
15367       high = !high;
15368     }
15369   out = d->target;
15370 
15371   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15372 				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15373   return true;
15374 }
15375 
15376 /* Recognize patterns for the EXT insn.  */
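/* For example, on V4SImode { 1, 2, 3, 4 } maps to an EXT with an offset of
   one element.  */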
15377 
15378 static bool
15379 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15380 {
15381   HOST_WIDE_INT location;
15382   rtx offset;
15383 
15384   /* The first element always refers to the first vector.
15385      Check if the extracted indices are increasing by one.  */
15386   if (d->vec_flags == VEC_SVE_PRED
15387       || !d->perm[0].is_constant (&location)
15388       || !d->perm.series_p (0, 1, location, 1))
15389     return false;
15390 
15391   /* Success! */
15392   if (d->testing_p)
15393     return true;
15394 
15395   /* The case where (location == 0) is a no-op for both big- and little-endian,
15396      and is removed by the mid-end at optimization levels -O1 and higher.
15397 
15398      We don't need a big-endian lane correction for SVE; see the comment
15399      at the head of aarch64-sve.md for details.  */
15400   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15401     {
15402       /* After setup, we want the high elements of the first vector (stored
15403          at the LSB end of the register), and the low elements of the second
15404          vector (stored at the MSB end of the register). So swap.  */
15405       std::swap (d->op0, d->op1);
15406       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15407 	 to_constant () is safe since this is restricted to Advanced SIMD
15408 	 vectors.  */
15409       location = d->perm.length ().to_constant () - location;
15410     }
15411 
15412   offset = GEN_INT (location);
15413   emit_set_insn (d->target,
15414 		 gen_rtx_UNSPEC (d->vmode,
15415 				 gen_rtvec (3, d->op0, d->op1, offset),
15416 				 UNSPEC_EXT));
15417   return true;
15418 }
15419 
15420 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15421    within each 64-bit, 32-bit or 16-bit granule.  */
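/* For example, on V8HImode { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the 16-bit
   elements within each 64-bit granule and so maps to REV64.  */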
15422 
15423 static bool
15424 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15425 {
15426   HOST_WIDE_INT diff;
15427   unsigned int i, size, unspec;
15428   machine_mode pred_mode;
15429 
15430   if (d->vec_flags == VEC_SVE_PRED
15431       || !d->one_vector_p
15432       || !d->perm[0].is_constant (&diff))
15433     return false;
15434 
15435   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15436   if (size == 8)
15437     {
15438       unspec = UNSPEC_REV64;
15439       pred_mode = VNx2BImode;
15440     }
15441   else if (size == 4)
15442     {
15443       unspec = UNSPEC_REV32;
15444       pred_mode = VNx4BImode;
15445     }
15446   else if (size == 2)
15447     {
15448       unspec = UNSPEC_REV16;
15449       pred_mode = VNx8BImode;
15450     }
15451   else
15452     return false;
15453 
15454   unsigned int step = diff + 1;
15455   for (i = 0; i < step; ++i)
15456     if (!d->perm.series_p (i, step, diff - i, step))
15457       return false;
15458 
15459   /* Success! */
15460   if (d->testing_p)
15461     return true;
15462 
15463   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15464   if (d->vec_flags == VEC_SVE_DATA)
15465     {
15466       rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15467       src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15468 			    UNSPEC_MERGE_PTRUE);
15469     }
15470   emit_set_insn (d->target, src);
15471   return true;
15472 }
15473 
15474 /* Recognize patterns for the REV insn, which reverses elements within
15475    a full vector.  */
15476 
15477 static bool
15478 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15479 {
15480   poly_uint64 nelt = d->perm.length ();
15481 
15482   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15483     return false;
15484 
15485   if (!d->perm.series_p (0, 1, nelt - 1, -1))
15486     return false;
15487 
15488   /* Success! */
15489   if (d->testing_p)
15490     return true;
15491 
15492   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15493   emit_set_insn (d->target, src);
15494   return true;
15495 }
15496 
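/* Recognize permutations that broadcast a single element, such as
   { 2, 2, 2, 2 }, which map to a DUP of one lane.  */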
15497 static bool
15498 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15499 {
15500   rtx out = d->target;
15501   rtx in0;
15502   HOST_WIDE_INT elt;
15503   machine_mode vmode = d->vmode;
15504   rtx lane;
15505 
15506   if (d->vec_flags == VEC_SVE_PRED
15507       || d->perm.encoding ().encoded_nelts () != 1
15508       || !d->perm[0].is_constant (&elt))
15509     return false;
15510 
15511   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15512     return false;
15513 
15514   /* Success! */
15515   if (d->testing_p)
15516     return true;
15517 
15518   /* The generic preparation in aarch64_expand_vec_perm_const_1
15519      swaps the operand order and the permute indices if it finds
15520      d->perm[0] to be in the second operand.  Thus, we can always
15521      use d->op0 and need not do any extra arithmetic to get the
15522      correct lane number.  */
15523   in0 = d->op0;
15524   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
15525 
15526   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15527   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15528   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15529   return true;
15530 }
15531 
15532 static bool
15533 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15534 {
15535   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15536   machine_mode vmode = d->vmode;
15537 
15538   /* Make sure that the indices are constant.  */
15539   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15540   for (unsigned int i = 0; i < encoded_nelts; ++i)
15541     if (!d->perm[i].is_constant ())
15542       return false;
15543 
15544   if (d->testing_p)
15545     return true;
15546 
15547   /* Generic code will try constant permutation twice: once with the
15548      original mode and again with the elements lowered to QImode.
15549      So wait and don't do the selector expansion ourselves.  */
15550   if (vmode != V8QImode && vmode != V16QImode)
15551     return false;
15552 
15553   /* to_constant is safe since this routine is specific to Advanced SIMD
15554      vectors.  */
15555   unsigned int nelt = d->perm.length ().to_constant ();
15556   for (unsigned int i = 0; i < nelt; ++i)
15557     /* If big-endian and two vectors we end up with a weird mixed-endian
15558        mode on NEON.  Reverse the index within each word but not the word
15559        itself.  to_constant is safe because we checked is_constant above.  */
15560     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15561 			? d->perm[i].to_constant () ^ (nelt - 1)
15562 			: d->perm[i].to_constant ());
15563 
15564   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15565   sel = force_reg (vmode, sel);
15566 
15567   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15568   return true;
15569 }
15570 
15571 /* Try to implement D using an SVE TBL instruction.  */
15572 
15573 static bool
15574 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15575 {
15576   unsigned HOST_WIDE_INT nelt;
15577 
15578   /* Permuting two variable-length vectors could overflow the
15579      index range.  */
15580   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15581     return false;
15582 
15583   if (d->testing_p)
15584     return true;
15585 
15586   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15587   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15588   aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15589   return true;
15590 }
15591 
15592 static bool
15593 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15594 {
15595   /* The pattern matching functions above are written to look for a small
15596      number to begin the sequence (0, 1, N/2).  If we begin with an index
15597      from the second operand, we can swap the operands.  */
15598   poly_int64 nelt = d->perm.length ();
15599   if (known_ge (d->perm[0], nelt))
15600     {
15601       d->perm.rotate_inputs (1);
15602       std::swap (d->op0, d->op1);
15603     }
15604 
15605   if ((d->vec_flags == VEC_ADVSIMD
15606        || d->vec_flags == VEC_SVE_DATA
15607        || d->vec_flags == VEC_SVE_PRED)
15608       && known_gt (nelt, 1))
15609     {
15610       if (aarch64_evpc_rev_local (d))
15611 	return true;
15612       else if (aarch64_evpc_rev_global (d))
15613 	return true;
15614       else if (aarch64_evpc_ext (d))
15615 	return true;
15616       else if (aarch64_evpc_dup (d))
15617 	return true;
15618       else if (aarch64_evpc_zip (d))
15619 	return true;
15620       else if (aarch64_evpc_uzp (d))
15621 	return true;
15622       else if (aarch64_evpc_trn (d))
15623 	return true;
15624       if (d->vec_flags == VEC_SVE_DATA)
15625 	return aarch64_evpc_sve_tbl (d);
15626       else if (d->vec_flags == VEC_ADVSIMD)
15627 	return aarch64_evpc_tbl (d);
15628     }
15629   return false;
15630 }
15631 
15632 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
15633 
15634 static bool
15635 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15636 				  rtx op1, const vec_perm_indices &sel)
15637 {
15638   struct expand_vec_perm_d d;
15639 
15640   /* Check whether the mask can be applied to a single vector.  */
15641   if (op0 && rtx_equal_p (op0, op1))
15642     d.one_vector_p = true;
15643   else if (sel.all_from_input_p (0))
15644     {
15645       d.one_vector_p = true;
15646       op1 = op0;
15647     }
15648   else if (sel.all_from_input_p (1))
15649     {
15650       d.one_vector_p = true;
15651       op0 = op1;
15652     }
15653   else
15654     d.one_vector_p = false;
15655 
15656   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15657 		     sel.nelts_per_input ());
15658   d.vmode = vmode;
15659   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15660   d.target = target;
15661   d.op0 = op0;
15662   d.op1 = op1;
15663   d.testing_p = !target;
15664 
15665   if (!d.testing_p)
15666     return aarch64_expand_vec_perm_const_1 (&d);
15667 
15668   rtx_insn *last = get_last_insn ();
15669   bool ret = aarch64_expand_vec_perm_const_1 (&d);
15670   gcc_assert (last == get_last_insn ());
15671 
15672   return ret;
15673 }
15674 
15675 /* Generate a byte permute mask for a register of mode MODE,
15676    which has NUNITS units.  */
15677 
15678 rtx
15679 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15680 {
15681   /* We have to reverse each vector because we don't have
15682      a permuted load that can reverse-load according to ABI rules.  */
15683   rtx mask;
15684   rtvec v = rtvec_alloc (16);
15685   unsigned int i, j;
15686   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15687 
15688   gcc_assert (BYTES_BIG_ENDIAN);
15689   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15690 
15691   for (i = 0; i < nunits; i++)
15692     for (j = 0; j < usize; j++)
15693       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15694   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15695   return force_reg (V16QImode, mask);
15696 }
15697 
15698 /* Return true if X is a valid second operand for the SVE instruction
15699    that implements integer comparison OP_CODE.  */
15700 
15701 static bool
15702 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15703 {
15704   if (register_operand (x, VOIDmode))
15705     return true;
15706 
15707   switch (op_code)
15708     {
15709     case LTU:
15710     case LEU:
15711     case GEU:
15712     case GTU:
15713       return aarch64_sve_cmp_immediate_p (x, false);
15714     case LT:
15715     case LE:
15716     case GE:
15717     case GT:
15718     case NE:
15719     case EQ:
15720       return aarch64_sve_cmp_immediate_p (x, true);
15721     default:
15722       gcc_unreachable ();
15723     }
15724 }
15725 
15726 /* Return the UNSPEC_COND_* code for comparison CODE.  */
15727 
15728 static unsigned int
15729 aarch64_unspec_cond_code (rtx_code code)
15730 {
15731   switch (code)
15732     {
15733     case NE:
15734       return UNSPEC_COND_NE;
15735     case EQ:
15736       return UNSPEC_COND_EQ;
15737     case LT:
15738       return UNSPEC_COND_LT;
15739     case GT:
15740       return UNSPEC_COND_GT;
15741     case LE:
15742       return UNSPEC_COND_LE;
15743     case GE:
15744       return UNSPEC_COND_GE;
15745     case LTU:
15746       return UNSPEC_COND_LO;
15747     case GTU:
15748       return UNSPEC_COND_HI;
15749     case LEU:
15750       return UNSPEC_COND_LS;
15751     case GEU:
15752       return UNSPEC_COND_HS;
15753     case UNORDERED:
15754       return UNSPEC_COND_UO;
15755     default:
15756       gcc_unreachable ();
15757     }
15758 }
15759 
15760 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15761    where <X> is the operation associated with comparison CODE.  */
15762 
15763 static rtx
15764 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15765 			 rtx pred, rtx op0, rtx op1)
15766 {
15767   rtvec vec = gen_rtvec (3, pred, op0, op1);
15768   return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15769 }
15770 
15771 /* Expand an SVE integer comparison:
15772 
15773      TARGET = CODE (OP0, OP1).  */
15774 
15775 void
15776 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15777 {
15778   machine_mode pred_mode = GET_MODE (target);
15779   machine_mode data_mode = GET_MODE (op0);
15780 
15781   if (!aarch64_sve_cmp_operand_p (code, op1))
15782     op1 = force_reg (data_mode, op1);
15783 
15784   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15785   rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15786   emit_insn (gen_set_clobber_cc (target, unspec));
15787 }
15788 
15789 /* Emit an instruction:
15790 
15791       (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15792 
15793    where <X> is the operation associated with comparison CODE.  */
15794 
15795 static void
15796 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15797 			  rtx pred, rtx op0, rtx op1)
15798 {
15799   rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15800   emit_set_insn (target, unspec);
15801 }
15802 
15803 /* Emit:
15804 
15805       (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15806       (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15807       (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15808 
15809    where <Xi> is the operation associated with comparison CODEi.  */
15810 
15811 static void
15812 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15813 			     machine_mode pred_mode, rtx ptrue,
15814 			     rtx op0, rtx op1)
15815 {
15816   rtx tmp1 = gen_reg_rtx (pred_mode);
15817   aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15818   rtx tmp2 = gen_reg_rtx (pred_mode);
15819   aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15820   emit_set_insn (target, gen_rtx_AND (pred_mode,
15821 				      gen_rtx_IOR (pred_mode, tmp1, tmp2),
15822 				      ptrue));
15823 }
15824 
15825 /* If CAN_INVERT_P, emit an instruction:
15826 
15827       (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15828 
15829    where <X> is the operation associated with comparison CODE.  Otherwise
15830    emit:
15831 
15832       (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15833       (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15834 
15835    where the second instruction sets TARGET to the inverse of TMP.  */
15836 
15837 static void
15838 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15839 				   machine_mode pred_mode, rtx ptrue, rtx pred,
15840 				   rtx op0, rtx op1, bool can_invert_p)
15841 {
15842   if (can_invert_p)
15843     aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15844   else
15845     {
15846       rtx tmp = gen_reg_rtx (pred_mode);
15847       aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15848       emit_set_insn (target, gen_rtx_AND (pred_mode,
15849 					  gen_rtx_NOT (pred_mode, tmp),
15850 					  ptrue));
15851     }
15852 }
15853 
15854 /* Expand an SVE floating-point comparison:
15855 
15856      TARGET = CODE (OP0, OP1)
15857 
15858    If CAN_INVERT_P is true, the caller can also handle inverted results;
15859    return true if the result is in fact inverted.  */
15860 
15861 bool
15862 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15863 				  rtx op0, rtx op1, bool can_invert_p)
15864 {
15865   machine_mode pred_mode = GET_MODE (target);
15866   machine_mode data_mode = GET_MODE (op0);
15867 
15868   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15869   switch (code)
15870     {
15871     case UNORDERED:
15872       /* UNORDERED has no immediate form.  */
15873       op1 = force_reg (data_mode, op1);
15874       aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15875       return false;
15876 
15877     case LT:
15878     case LE:
15879     case GT:
15880     case GE:
15881     case EQ:
15882     case NE:
15883       /* There is native support for the comparison.  */
15884       aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15885       return false;
15886 
15887     case ORDERED:
15888       /* There is native support for the inverse comparison.  */
15889       op1 = force_reg (data_mode, op1);
15890       aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15891 					 pred_mode, ptrue, ptrue, op0, op1,
15892 					 can_invert_p);
15893       return can_invert_p;
15894 
15895     case LTGT:
15896       /* This is a trapping operation (LT or GT).  */
15897       aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15898       return false;
15899 
15900     case UNEQ:
15901       if (!flag_trapping_math)
15902 	{
15903 	  /* This would trap for signaling NaNs.  */
15904 	  op1 = force_reg (data_mode, op1);
15905 	  aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15906 				       pred_mode, ptrue, op0, op1);
15907 	  return false;
15908 	}
15909       /* fall through */
15910 
15911     case UNLT:
15912     case UNLE:
15913     case UNGT:
15914     case UNGE:
15915       {
15916 	rtx ordered = ptrue;
15917 	if (flag_trapping_math)
15918 	  {
15919 	    /* Only compare the elements that are known to be ordered.  */
15920 	    ordered = gen_reg_rtx (pred_mode);
15921 	    op1 = force_reg (data_mode, op1);
15922 	    aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15923 					       ptrue, ptrue, op0, op1, false);
15924 	  }
15925 	if (code == UNEQ)
15926 	  code = NE;
15927 	else
15928 	  code = reverse_condition_maybe_unordered (code);
15929 	aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15930 					   ordered, op0, op1, can_invert_p);
15931 	return can_invert_p;
15932       }
15933 
15934     default:
15935       gcc_unreachable ();
15936     }
15937 }
15938 
15939 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
15940    of the data being selected and CMP_MODE is the mode of the values being
15941    compared.  */
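
/* For instance (illustrative), for a vcond equivalent to
   "r = a < b ? x : y", OPS[3] is the LT comparison, OPS[4] and OPS[5]
   are a and b, and OPS[1] and OPS[2] are x and y.  The comparison is
   computed into a predicate register PRED and the selection is emitted as:

      (set OPS[0] (unspec:DATA_MODE [PRED OPS[1] OPS[2]] UNSPEC_SEL))

   with OPS[1] and OPS[2] swapped first if the floating-point comparison
   could only be computed in inverted form.  */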
15942 
15943 void
15944 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15945 			  rtx *ops)
15946 {
15947   machine_mode pred_mode
15948     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15949 			     GET_MODE_SIZE (cmp_mode)).require ();
15950   rtx pred = gen_reg_rtx (pred_mode);
15951   if (FLOAT_MODE_P (cmp_mode))
15952     {
15953       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15954 					    ops[4], ops[5], true))
15955 	std::swap (ops[1], ops[2]);
15956     }
15957   else
15958     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15959 
15960   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15961   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15962 }
15963 
15964 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
15965    true.  However due to issues with register allocation it is preferable
15966    to avoid tying integer scalar and FP scalar modes.  Executing integer
15967    operations in general registers is better than treating them as scalar
15968    vector operations.  This reduces latency and avoids redundant int<->FP
15969    moves.  So tie modes if they are either the same class, or vector modes
15970    with other vector modes, vector structs or any scalar mode.  */
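
/* For example, DImode ties with V2DImode (scalar with vector) and
   V4SImode ties with V16QImode (same class), but DImode does not tie
   with DFmode: that is precisely the int<->FP case described above.  */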
15971 
15972 static bool
15973 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15974 {
15975   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15976     return true;
15977 
15978   /* We specifically want to allow elements of "structure" modes to
15979      be tieable to the structure.  This more general condition allows
15980      other rarer situations too.  The reason we don't extend this to
15981      predicate modes is that there are no predicate structure modes
15982      nor any specific instructions for extracting part of a predicate
15983      register.  */
15984   if (aarch64_vector_data_mode_p (mode1)
15985       && aarch64_vector_data_mode_p (mode2))
15986     return true;
15987 
15988   /* Also allow any scalar modes with vectors.  */
15989   if (aarch64_vector_mode_supported_p (mode1)
15990       || aarch64_vector_mode_supported_p (mode2))
15991     return true;
15992 
15993   return false;
15994 }
15995 
15996 /* Return a new RTX holding the result of moving POINTER forward by
15997    AMOUNT bytes.  */
15998 
15999 static rtx
16000 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16001 {
16002   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16003 
16004   return adjust_automodify_address (pointer, GET_MODE (pointer),
16005 				    next, amount);
16006 }
16007 
16008 /* Return a new RTX holding the result of moving POINTER forward by the
16009    size of the mode it points to.  */
16010 
16011 static rtx
16012 aarch64_progress_pointer (rtx pointer)
16013 {
16014   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16015 }
16016 
16017 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16018    MODE bytes.  */
16019 
16020 static void
16021 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16022 					      machine_mode mode)
16023 {
16024   rtx reg = gen_reg_rtx (mode);
16025 
16026   /* "Cast" the pointers to the correct mode.  */
16027   *src = adjust_address (*src, mode, 0);
16028   *dst = adjust_address (*dst, mode, 0);
16029   /* Emit the memcpy.  */
16030   emit_move_insn (reg, *src);
16031   emit_move_insn (*dst, reg);
16032   /* Move the pointers forward.  */
16033   *src = aarch64_progress_pointer (*src);
16034   *dst = aarch64_progress_pointer (*dst);
16035 }
16036 
16037 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
16038    we succeed, otherwise return false.  */
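
/* As an illustration of the overlapping strategy used below: for a
   15-byte copy we emit one 8-byte load/store pair for bytes 0-7, then
   step the pointers back by one byte and emit a second 8-byte pair for
   bytes 7-14, so two accesses cover the tail instead of a series of
   smaller chunks.  */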
16039 
16040 bool
16041 aarch64_expand_movmem (rtx *operands)
16042 {
16043   unsigned int n;
16044   rtx dst = operands[0];
16045   rtx src = operands[1];
16046   rtx base;
16047   bool speed_p = !optimize_function_for_size_p (cfun);
16048 
16049   /* When optimizing for size, give a better estimate of the length of a
16050      memcpy call, but use the default otherwise.  */
16051   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16052 
16053   /* We can't do anything smart if the amount to copy is not constant.  */
16054   if (!CONST_INT_P (operands[2]))
16055     return false;
16056 
16057   n = UINTVAL (operands[2]);
16058 
16059   /* Try to keep the number of instructions low.  For cases below 16 bytes we
16060      need to make at most two moves.  For cases above 16 bytes it will be one
16061      move for each 16 byte chunk, then at most two additional moves.  */
16062   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16063     return false;
16064 
16065   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16066   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16067 
16068   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16069   src = adjust_automodify_address (src, VOIDmode, base, 0);
16070 
16071   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16072      1-byte chunk.  */
16073   if (n < 4)
16074     {
16075       if (n >= 2)
16076 	{
16077 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16078 	  n -= 2;
16079 	}
16080 
16081       if (n == 1)
16082 	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16083 
16084       return true;
16085     }
16086 
16087   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
16088      4-byte chunk, partially overlapping with the previously copied chunk.  */
16089   if (n < 8)
16090     {
16091       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16092       n -= 4;
16093       if (n > 0)
16094 	{
16095 	  int move = n - 4;
16096 
16097 	  src = aarch64_move_pointer (src, move);
16098 	  dst = aarch64_move_pointer (dst, move);
16099 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16100 	}
16101       return true;
16102     }
16103 
16104   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
16105      them, then (if applicable) an 8-byte chunk.  */
16106   while (n >= 8)
16107     {
16108       if (n / 16)
16109 	{
16110 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16111 	  n -= 16;
16112 	}
16113       else
16114 	{
16115 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16116 	  n -= 8;
16117 	}
16118     }
16119 
16120   /* Finish the final bytes of the copy.  We can always do this in one
16121      instruction.  We either copy the exact amount we need, or partially
16122      overlap with the previous chunk we copied and copy 8 bytes.  */
16123   if (n == 0)
16124     return true;
16125   else if (n == 1)
16126     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16127   else if (n == 2)
16128     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16129   else if (n == 4)
16130     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16131   else
16132     {
16133       if (n == 3)
16134 	{
16135 	  src = aarch64_move_pointer (src, -1);
16136 	  dst = aarch64_move_pointer (dst, -1);
16137 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16138 	}
16139       else
16140 	{
16141 	  int move = n - 8;
16142 
16143 	  src = aarch64_move_pointer (src, move);
16144 	  dst = aarch64_move_pointer (dst, move);
16145 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16146 	}
16147     }
16148 
16149   return true;
16150 }
16151 
16152 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16153    SImode stores.  Handle the case when the constant has identical
16154    bottom and top halves.  This is beneficial when the two stores can be
16155    merged into an STP and we avoid synthesising potentially expensive
16156    immediates twice.  Return true if such a split is possible.  */
16157 
16158 bool
16159 aarch64_split_dimode_const_store (rtx dst, rtx src)
16160 {
16161   rtx lo = gen_lowpart (SImode, src);
16162   rtx hi = gen_highpart_mode (SImode, DImode, src);
16163 
16164   bool size_p = optimize_function_for_size_p (cfun);
16165 
16166   if (!rtx_equal_p (lo, hi))
16167     return false;
16168 
16169   unsigned int orig_cost
16170     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16171   unsigned int lo_cost
16172     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16173 
16174   /* We want to transform:
16175      MOV	x1, 49370
16176      MOVK	x1, 0x140, lsl 16
16177      MOVK	x1, 0xc0da, lsl 32
16178      MOVK	x1, 0x140, lsl 48
16179      STR	x1, [x0]
16180    into:
16181      MOV	w1, 49370
16182      MOVK	w1, 0x140, lsl 16
16183      STP	w1, w1, [x0]
16184    So we want to perform this only when we save two instructions
16185    or more.  When optimizing for size, however, accept any code size
16186    savings we can.  */
16187   if (size_p && orig_cost <= lo_cost)
16188     return false;
16189 
16190   if (!size_p
16191       && (orig_cost <= lo_cost + 1))
16192     return false;
16193 
16194   rtx mem_lo = adjust_address (dst, SImode, 0);
16195   if (!aarch64_mem_pair_operand (mem_lo, SImode))
16196     return false;
16197 
16198   rtx tmp_reg = gen_reg_rtx (SImode);
16199   aarch64_expand_mov_immediate (tmp_reg, lo);
16200   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16201   /* Don't emit an explicit store pair as this may not always be profitable.
16202      Let the sched-fusion logic decide whether to merge them.  */
16203   emit_move_insn (mem_lo, tmp_reg);
16204   emit_move_insn (mem_hi, tmp_reg);
16205 
16206   return true;
16207 }
16208 
16209 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
16210 
16211 static unsigned HOST_WIDE_INT
16212 aarch64_asan_shadow_offset (void)
16213 {
16214   return (HOST_WIDE_INT_1 << 36);
16215 }
16216 
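/* The two hooks below implement conditional-compare generation.  For a
   condition such as "a == 0 && b > 42" (an illustrative example, not a
   pattern taken from elsewhere in this file), the first hook expands
   "a == 0" into an ordinary compare and the second chains "b > 42" onto
   it as a CCMP, which performs the second comparison when the first
   condition held and otherwise sets the flags to a fixed value chosen so
   that the combined condition comes out correctly.  */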
16217 static rtx
16218 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16219 			int code, tree treeop0, tree treeop1)
16220 {
16221   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16222   rtx op0, op1;
16223   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16224   insn_code icode;
16225   struct expand_operand ops[4];
16226 
16227   start_sequence ();
16228   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16229 
16230   op_mode = GET_MODE (op0);
16231   if (op_mode == VOIDmode)
16232     op_mode = GET_MODE (op1);
16233 
16234   switch (op_mode)
16235     {
16236     case E_QImode:
16237     case E_HImode:
16238     case E_SImode:
16239       cmp_mode = SImode;
16240       icode = CODE_FOR_cmpsi;
16241       break;
16242 
16243     case E_DImode:
16244       cmp_mode = DImode;
16245       icode = CODE_FOR_cmpdi;
16246       break;
16247 
16248     case E_SFmode:
16249       cmp_mode = SFmode;
16250       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16251       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16252       break;
16253 
16254     case E_DFmode:
16255       cmp_mode = DFmode;
16256       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16257       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16258       break;
16259 
16260     default:
16261       end_sequence ();
16262       return NULL_RTX;
16263     }
16264 
16265   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16266   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16267   if (!op0 || !op1)
16268     {
16269       end_sequence ();
16270       return NULL_RTX;
16271     }
16272   *prep_seq = get_insns ();
16273   end_sequence ();
16274 
16275   create_fixed_operand (&ops[0], op0);
16276   create_fixed_operand (&ops[1], op1);
16277 
16278   start_sequence ();
16279   if (!maybe_expand_insn (icode, 2, ops))
16280     {
16281       end_sequence ();
16282       return NULL_RTX;
16283     }
16284   *gen_seq = get_insns ();
16285   end_sequence ();
16286 
16287   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16288 			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16289 }
16290 
16291 static rtx
16292 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16293 		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
16294 {
16295   rtx op0, op1, target;
16296   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16297   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16298   insn_code icode;
16299   struct expand_operand ops[6];
16300   int aarch64_cond;
16301 
16302   push_to_sequence (*prep_seq);
16303   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16304 
16305   op_mode = GET_MODE (op0);
16306   if (op_mode == VOIDmode)
16307     op_mode = GET_MODE (op1);
16308 
16309   switch (op_mode)
16310     {
16311     case E_QImode:
16312     case E_HImode:
16313     case E_SImode:
16314       cmp_mode = SImode;
16315       icode = CODE_FOR_ccmpsi;
16316       break;
16317 
16318     case E_DImode:
16319       cmp_mode = DImode;
16320       icode = CODE_FOR_ccmpdi;
16321       break;
16322 
16323     case E_SFmode:
16324       cmp_mode = SFmode;
16325       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16326       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16327       break;
16328 
16329     case E_DFmode:
16330       cmp_mode = DFmode;
16331       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16332       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16333       break;
16334 
16335     default:
16336       end_sequence ();
16337       return NULL_RTX;
16338     }
16339 
16340   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16341   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16342   if (!op0 || !op1)
16343     {
16344       end_sequence ();
16345       return NULL_RTX;
16346     }
16347   *prep_seq = get_insns ();
16348   end_sequence ();
16349 
16350   target = gen_rtx_REG (cc_mode, CC_REGNUM);
16351   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16352 
16353   if (bit_code != AND)
16354     {
16355       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16356 						GET_MODE (XEXP (prev, 0))),
16357 			     VOIDmode, XEXP (prev, 0), const0_rtx);
16358       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16359     }
16360 
16361   create_fixed_operand (&ops[0], XEXP (prev, 0));
16362   create_fixed_operand (&ops[1], target);
16363   create_fixed_operand (&ops[2], op0);
16364   create_fixed_operand (&ops[3], op1);
16365   create_fixed_operand (&ops[4], prev);
16366   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16367 
16368   push_to_sequence (*gen_seq);
16369   if (!maybe_expand_insn (icode, 6, ops))
16370     {
16371       end_sequence ();
16372       return NULL_RTX;
16373     }
16374 
16375   *gen_seq = get_insns ();
16376   end_sequence ();
16377 
16378   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16379 }
16380 
16381 #undef TARGET_GEN_CCMP_FIRST
16382 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16383 
16384 #undef TARGET_GEN_CCMP_NEXT
16385 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16386 
16387 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
16388    instruction fusion of some sort.  */
16389 
16390 static bool
16391 aarch64_macro_fusion_p (void)
16392 {
16393   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16394 }
16395 
16396 
16397 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
16398    should be kept together during scheduling.  */
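
/* For example (illustrative), on cores that enable AARCH64_FUSE_MOV_MOVK
   the pair

      mov  x0, 0x1234
      movk x0, 0x5678, lsl 16

   is kept adjacent so the hardware can fuse it, and similarly an adrp is
   kept next to the add or ldr that consumes its result when the
   corresponding fusion flags are enabled.  */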
16399 
16400 static bool
16401 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16402 {
16403   rtx set_dest;
16404   rtx prev_set = single_set (prev);
16405   rtx curr_set = single_set (curr);
16406   /* prev and curr are simple SET insns, i.e. no flag setting or branching.  */
16407   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16408 
16409   if (!aarch64_macro_fusion_p ())
16410     return false;
16411 
16412   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16413     {
16414       /* We are trying to match:
16415          prev (mov)  == (set (reg r0) (const_int imm16))
16416          curr (movk) == (set (zero_extract (reg r0)
16417                                            (const_int 16)
16418                                            (const_int 16))
16419                              (const_int imm16_1))  */
16420 
16421       set_dest = SET_DEST (curr_set);
16422 
16423       if (GET_CODE (set_dest) == ZERO_EXTRACT
16424           && CONST_INT_P (SET_SRC (curr_set))
16425           && CONST_INT_P (SET_SRC (prev_set))
16426           && CONST_INT_P (XEXP (set_dest, 2))
16427           && INTVAL (XEXP (set_dest, 2)) == 16
16428           && REG_P (XEXP (set_dest, 0))
16429           && REG_P (SET_DEST (prev_set))
16430           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16431         {
16432           return true;
16433         }
16434     }
16435 
16436   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16437     {
16438 
16439       /*  We're trying to match:
16440           prev (adrp) == (set (reg r1)
16441                               (high (symbol_ref ("SYM"))))
16442           curr (add) == (set (reg r0)
16443                              (lo_sum (reg r1)
16444                                      (symbol_ref ("SYM"))))
16445           Note that r0 need not necessarily be the same as r1, especially
16446           during pre-regalloc scheduling.  */
16447 
16448       if (satisfies_constraint_Ush (SET_SRC (prev_set))
16449           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16450         {
16451           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16452               && REG_P (XEXP (SET_SRC (curr_set), 0))
16453               && REGNO (XEXP (SET_SRC (curr_set), 0))
16454                  == REGNO (SET_DEST (prev_set))
16455               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16456                               XEXP (SET_SRC (curr_set), 1)))
16457             return true;
16458         }
16459     }
16460 
16461   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16462     {
16463 
16464       /* We're trying to match:
16465          prev (movk) == (set (zero_extract (reg r0)
16466                                            (const_int 16)
16467                                            (const_int 32))
16468                              (const_int imm16_1))
16469          curr (movk) == (set (zero_extract (reg r0)
16470                                            (const_int 16)
16471                                            (const_int 48))
16472                              (const_int imm16_2))  */
16473 
16474       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16475           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16476           && REG_P (XEXP (SET_DEST (prev_set), 0))
16477           && REG_P (XEXP (SET_DEST (curr_set), 0))
16478           && REGNO (XEXP (SET_DEST (prev_set), 0))
16479              == REGNO (XEXP (SET_DEST (curr_set), 0))
16480           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16481           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16482           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16483           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16484           && CONST_INT_P (SET_SRC (prev_set))
16485           && CONST_INT_P (SET_SRC (curr_set)))
16486         return true;
16487 
16488     }
16489   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16490     {
16491       /* We're trying to match:
16492           prev (adrp) == (set (reg r0)
16493                               (high (symbol_ref ("SYM"))))
16494           curr (ldr) == (set (reg r1)
16495                              (mem (lo_sum (reg r0)
16496                                              (symbol_ref ("SYM")))))
16497                  or
16498           curr (ldr) == (set (reg r1)
16499                              (zero_extend (mem
16500                                            (lo_sum (reg r0)
16501                                                    (symbol_ref ("SYM"))))))  */
16502       if (satisfies_constraint_Ush (SET_SRC (prev_set))
16503           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16504         {
16505           rtx curr_src = SET_SRC (curr_set);
16506 
16507           if (GET_CODE (curr_src) == ZERO_EXTEND)
16508             curr_src = XEXP (curr_src, 0);
16509 
16510           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16511               && REG_P (XEXP (XEXP (curr_src, 0), 0))
16512               && REGNO (XEXP (XEXP (curr_src, 0), 0))
16513                  == REGNO (SET_DEST (prev_set))
16514               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16515                               XEXP (SET_SRC (prev_set), 0)))
16516             return true;
16517         }
16518     }
16519 
16520   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16521        && aarch_crypto_can_dual_issue (prev, curr))
16522     return true;
16523 
16524   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16525       && any_condjump_p (curr))
16526     {
16527       unsigned int condreg1, condreg2;
16528       rtx cc_reg_1;
16529       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16530       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16531 
16532       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16533 	  && prev
16534 	  && modified_in_p (cc_reg_1, prev))
16535 	{
16536 	  enum attr_type prev_type = get_attr_type (prev);
16537 
16538 	  /* FIXME: this misses some instructions that ThunderX considers
16539 	     simple arithmetic; simple shifts, for example, are missed here.  */
16540 	  if (prev_type == TYPE_ALUS_SREG
16541 	      || prev_type == TYPE_ALUS_IMM
16542 	      || prev_type == TYPE_LOGICS_REG
16543 	      || prev_type == TYPE_LOGICS_IMM)
16544 	    return true;
16545 	}
16546     }
16547 
16548   if (prev_set
16549       && curr_set
16550       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16551       && any_condjump_p (curr))
16552     {
16553       /* We're trying to match:
16554 	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16555 	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
16556 							 (const_int 0))
16557 						 (label_ref ("SYM"))
16558 						 (pc))  */
16559       if (SET_DEST (curr_set) == (pc_rtx)
16560 	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16561 	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16562 	  && REG_P (SET_DEST (prev_set))
16563 	  && REGNO (SET_DEST (prev_set))
16564 	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16565 	{
16566 	  /* Fuse ALU operations followed by a conditional branch instruction.  */
16567 	  switch (get_attr_type (prev))
16568 	    {
16569 	    case TYPE_ALU_IMM:
16570 	    case TYPE_ALU_SREG:
16571 	    case TYPE_ADC_REG:
16572 	    case TYPE_ADC_IMM:
16573 	    case TYPE_ADCS_REG:
16574 	    case TYPE_ADCS_IMM:
16575 	    case TYPE_LOGIC_REG:
16576 	    case TYPE_LOGIC_IMM:
16577 	    case TYPE_CSEL:
16578 	    case TYPE_ADR:
16579 	    case TYPE_MOV_IMM:
16580 	    case TYPE_SHIFT_REG:
16581 	    case TYPE_SHIFT_IMM:
16582 	    case TYPE_BFM:
16583 	    case TYPE_RBIT:
16584 	    case TYPE_REV:
16585 	    case TYPE_EXTEND:
16586 	      return true;
16587 
16588 	    default:;
16589 	    }
16590 	}
16591     }
16592 
16593   return false;
16594 }
16595 
16596 /* Return true iff the instruction fusion described by OP is enabled.  */
16597 
16598 bool
16599 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16600 {
16601   return (aarch64_tune_params.fusible_ops & op) != 0;
16602 }
16603 
16604 /* If MEM is in the form of [base+offset], extract the two parts of the
16605    address into BASE and OFFSET and return true; otherwise return false
16606    after clearing BASE and OFFSET.  */
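
/* For example, for a MEM whose address is (plus (reg x1) (const_int 16))
   this sets BASE to the register and OFFSET to (const_int 16); for a
   plain (reg x1) address, OFFSET is set to const0_rtx.  */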
16607 
16608 bool
16609 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16610 {
16611   rtx addr;
16612 
16613   gcc_assert (MEM_P (mem));
16614 
16615   addr = XEXP (mem, 0);
16616 
16617   if (REG_P (addr))
16618     {
16619       *base = addr;
16620       *offset = const0_rtx;
16621       return true;
16622     }
16623 
16624   if (GET_CODE (addr) == PLUS
16625       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16626     {
16627       *base = XEXP (addr, 0);
16628       *offset = XEXP (addr, 1);
16629       return true;
16630     }
16631 
16632   *base = NULL_RTX;
16633   *offset = NULL_RTX;
16634 
16635   return false;
16636 }
16637 
16638 /* Types for scheduling fusion.  */
16639 enum sched_fusion_type
16640 {
16641   SCHED_FUSION_NONE = 0,
16642   SCHED_FUSION_LD_SIGN_EXTEND,
16643   SCHED_FUSION_LD_ZERO_EXTEND,
16644   SCHED_FUSION_LD,
16645   SCHED_FUSION_ST,
16646   SCHED_FUSION_NUM
16647 };
16648 
16649 /* If INSN is a load or store whose address is in the form [base+offset],
16650    extract the two parts into BASE and OFFSET.  Return the scheduling
16651    fusion type of this INSN.  */
16652 
16653 static enum sched_fusion_type
16654 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16655 {
16656   rtx x, dest, src;
16657   enum sched_fusion_type fusion = SCHED_FUSION_LD;
16658 
16659   gcc_assert (INSN_P (insn));
16660   x = PATTERN (insn);
16661   if (GET_CODE (x) != SET)
16662     return SCHED_FUSION_NONE;
16663 
16664   src = SET_SRC (x);
16665   dest = SET_DEST (x);
16666 
16667   machine_mode dest_mode = GET_MODE (dest);
16668 
16669   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16670     return SCHED_FUSION_NONE;
16671 
16672   if (GET_CODE (src) == SIGN_EXTEND)
16673     {
16674       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16675       src = XEXP (src, 0);
16676       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16677 	return SCHED_FUSION_NONE;
16678     }
16679   else if (GET_CODE (src) == ZERO_EXTEND)
16680     {
16681       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16682       src = XEXP (src, 0);
16683       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16684 	return SCHED_FUSION_NONE;
16685     }
16686 
16687   if (GET_CODE (src) == MEM && REG_P (dest))
16688     extract_base_offset_in_addr (src, base, offset);
16689   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16690     {
16691       fusion = SCHED_FUSION_ST;
16692       extract_base_offset_in_addr (dest, base, offset);
16693     }
16694   else
16695     return SCHED_FUSION_NONE;
16696 
16697   if (*base == NULL_RTX || *offset == NULL_RTX)
16698     fusion = SCHED_FUSION_NONE;
16699 
16700   return fusion;
16701 }
16702 
16703 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16704 
16705    Currently we only support fusing ldr and str instructions, so FUSION_PRI
16706    and PRI are only calculated for these instructions.  For other instructions,
16707    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16708    types of instruction fusion can be added by returning different priorities.
16709 
16710    It's important that irrelevant instructions get the largest FUSION_PRI.  */
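
/* For example (illustrative), given the two stores "str w1, [x2, 4]" and
   "str w1, [x2, 8]", both receive the same FUSION_PRI (same fusion type
   and same base register), while the store with the smaller offset
   receives the larger PRI and is therefore scheduled first, which places
   the pair next to each other in offset order.  */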
16711 
16712 static void
16713 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16714 			       int *fusion_pri, int *pri)
16715 {
16716   int tmp, off_val;
16717   rtx base, offset;
16718   enum sched_fusion_type fusion;
16719 
16720   gcc_assert (INSN_P (insn));
16721 
16722   tmp = max_pri - 1;
16723   fusion = fusion_load_store (insn, &base, &offset);
16724   if (fusion == SCHED_FUSION_NONE)
16725     {
16726       *pri = tmp;
16727       *fusion_pri = tmp;
16728       return;
16729     }
16730 
16731   /* Set FUSION_PRI according to fusion type and base register.  */
16732   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16733 
16734   /* Calculate PRI.  */
16735   tmp /= 2;
16736 
16737   /* INSN with smaller offset goes first.  */
16738   off_val = (int)(INTVAL (offset));
16739   if (off_val >= 0)
16740     tmp -= (off_val & 0xfffff);
16741   else
16742     tmp += ((- off_val) & 0xfffff);
16743 
16744   *pri = tmp;
16745   return;
16746 }
16747 
16748 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16749    Adjust priority of sha1h instructions so they are scheduled before
16750    other SHA1 instructions.  */
16751 
16752 static int
16753 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16754 {
16755   rtx x = PATTERN (insn);
16756 
16757   if (GET_CODE (x) == SET)
16758     {
16759       x = SET_SRC (x);
16760 
16761       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16762 	return priority + 10;
16763     }
16764 
16765   return priority;
16766 }
16767 
16768 /* Given OPERANDS of consecutive load/store, check if we can merge
16769    them into ldp/stp.  LOAD is true if they are load instructions.
16770    MODE is the mode of memory operands.  */
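
/* For example (illustrative), the consecutive DImode loads

     ldr  x0, [x2]
     ldr  x1, [x2, 8]

   pass these checks and can be merged into "ldp x0, x1, [x2]", whereas
   loads from different bases, from volatile memory, or into the same
   destination register cannot.  */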
16771 
16772 bool
16773 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16774 				machine_mode mode)
16775 {
16776   HOST_WIDE_INT offval_1, offval_2, msize;
16777   enum reg_class rclass_1, rclass_2;
16778   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16779 
16780   if (load)
16781     {
16782       mem_1 = operands[1];
16783       mem_2 = operands[3];
16784       reg_1 = operands[0];
16785       reg_2 = operands[2];
16786       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16787       if (REGNO (reg_1) == REGNO (reg_2))
16788 	return false;
16789     }
16790   else
16791     {
16792       mem_1 = operands[0];
16793       mem_2 = operands[2];
16794       reg_1 = operands[1];
16795       reg_2 = operands[3];
16796     }
16797 
16798   /* The mems cannot be volatile.  */
16799   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16800     return false;
16801 
16802   /* If we have SImode and slow unaligned ldp,
16803      check that the alignment is at least 8 bytes.  */
16804   if (mode == SImode
16805       && (aarch64_tune_params.extra_tuning_flags
16806           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16807       && !optimize_size
16808       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16809     return false;
16810 
16811   /* Check if the addresses are in the form of [base+offset].  */
16812   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16813   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16814     return false;
16815   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16816   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16817     return false;
16818 
16819   /* Check if the bases are same.  */
16820   if (!rtx_equal_p (base_1, base_2))
16821     return false;
16822 
16823   offval_1 = INTVAL (offset_1);
16824   offval_2 = INTVAL (offset_2);
16825   /* We should only be trying this for fixed-sized modes.  There is no
16826      SVE LDP/STP instruction.  */
16827   msize = GET_MODE_SIZE (mode).to_constant ();
16828   /* Check if the offsets are consecutive.  */
16829   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16830     return false;
16831 
16832   /* Check if the addresses are clobbered by load.  */
16833   if (load)
16834     {
16835       if (reg_mentioned_p (reg_1, mem_1))
16836 	return false;
16837 
16838       /* In increasing order, the last load can clobber the address.  */
16839       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16840 	return false;
16841     }
16842 
16843   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16844     rclass_1 = FP_REGS;
16845   else
16846     rclass_1 = GENERAL_REGS;
16847 
16848   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16849     rclass_2 = FP_REGS;
16850   else
16851     rclass_2 = GENERAL_REGS;
16852 
16853   /* Check if the registers are of same class.  */
16854   if (rclass_1 != rclass_2)
16855     return false;
16856 
16857   return true;
16858 }
16859 
16860 /* Given OPERANDS of consecutive load/store, check if we can merge
16861    them into ldp/stp by adjusting the offset.  LOAD is true if they
16862    are load instructions.  MODE is the mode of memory operands.
16863 
16864    Given below consecutive stores:
16865 
16866      str  w1, [xb, 0x100]
16867      str  w1, [xb, 0x104]
16868      str  w1, [xb, 0x108]
16869      str  w1, [xb, 0x10c]
16870 
16871    Though the offsets are out of the range supported by stp, we can
16872    still pair them after adjusting the offset, like:
16873 
16874      add  scratch, xb, 0x100
16875      stp  w1, w1, [scratch]
16876      stp  w1, w1, [scratch, 0x8]
16877 
16878    The peephole patterns detecting this opportunity should guarantee
16879    the scratch register is available.  */
16880 
16881 bool
16882 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16883 				       scalar_mode mode)
16884 {
16885   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16886   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16887   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16888   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16889 
16890   if (load)
16891     {
16892       reg_1 = operands[0];
16893       mem_1 = operands[1];
16894       reg_2 = operands[2];
16895       mem_2 = operands[3];
16896       reg_3 = operands[4];
16897       mem_3 = operands[5];
16898       reg_4 = operands[6];
16899       mem_4 = operands[7];
16900       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16901 		  && REG_P (reg_3) && REG_P (reg_4));
16902       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16903 	return false;
16904     }
16905   else
16906     {
16907       mem_1 = operands[0];
16908       reg_1 = operands[1];
16909       mem_2 = operands[2];
16910       reg_2 = operands[3];
16911       mem_3 = operands[4];
16912       reg_3 = operands[5];
16913       mem_4 = operands[6];
16914       reg_4 = operands[7];
16915     }
16916   /* Skip if the memory operand is by itself valid for ldp/stp.  */
16917   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16918     return false;
16919 
16920   /* The mems cannot be volatile.  */
16921   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16922       || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16923     return false;
16924 
16925   /* Check if the addresses are in the form of [base+offset].  */
16926   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16927   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16928     return false;
16929   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16930   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16931     return false;
16932   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16933   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16934     return false;
16935   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16936   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16937     return false;
16938 
16939   /* Check if the bases are same.  */
16940   if (!rtx_equal_p (base_1, base_2)
16941       || !rtx_equal_p (base_2, base_3)
16942       || !rtx_equal_p (base_3, base_4))
16943     return false;
16944 
16945   offval_1 = INTVAL (offset_1);
16946   offval_2 = INTVAL (offset_2);
16947   offval_3 = INTVAL (offset_3);
16948   offval_4 = INTVAL (offset_4);
16949   msize = GET_MODE_SIZE (mode);
16950   /* Check if the offsets are consecutive.  */
16951   if ((offval_1 != (offval_2 + msize)
16952        || offval_1 != (offval_3 + msize * 2)
16953        || offval_1 != (offval_4 + msize * 3))
16954       && (offval_4 != (offval_3 + msize)
16955 	  || offval_4 != (offval_2 + msize * 2)
16956 	  || offval_4 != (offval_1 + msize * 3)))
16957     return false;
16958 
16959   /* Check if the addresses are clobbered by load.  */
16960   if (load)
16961     {
16962       if (reg_mentioned_p (reg_1, mem_1)
16963 	  || reg_mentioned_p (reg_2, mem_2)
16964 	  || reg_mentioned_p (reg_3, mem_3))
16965 	return false;
16966 
16967       /* In increasing order, the last load can clobber the address.  */
16968       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16969 	return false;
16970     }
16971 
16972   /* If we have SImode and slow unaligned ldp,
16973      check that the alignment is at least 8 bytes.  */
16974   if (mode == SImode
16975       && (aarch64_tune_params.extra_tuning_flags
16976           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16977       && !optimize_size
16978       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16979     return false;
16980 
16981   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16982     rclass_1 = FP_REGS;
16983   else
16984     rclass_1 = GENERAL_REGS;
16985 
16986   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16987     rclass_2 = FP_REGS;
16988   else
16989     rclass_2 = GENERAL_REGS;
16990 
16991   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16992     rclass_3 = FP_REGS;
16993   else
16994     rclass_3 = GENERAL_REGS;
16995 
16996   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
16997     rclass_4 = FP_REGS;
16998   else
16999     rclass_4 = GENERAL_REGS;
17000 
17001   /* Check if the registers are of same class.  */
17002   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17003     return false;
17004 
17005   return true;
17006 }
17007 
17008 /* Given OPERANDS of consecutive load/store, this function pairs them
17009    into ldp/stp after adjusting the offset.  It depends on the fact
17010    that addresses of load/store instructions are in increasing order.
17011    MODE is the mode of memory operands.  CODE is the rtl operator
17012    which should be applied to all memory operands; it is SIGN_EXTEND,
17013    ZERO_EXTEND or UNKNOWN.  */
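
/* A worked example (illustrative) for SImode, where MSIZE is 4 and the
   in-range window is MSIZE * 0x40 = 256 bytes: for a first offset of
   0x104 we get NEW_OFF = 4 and ADJ_OFF = 0x100, so we emit
   "add scratch, base, 0x100" and then pair the four accesses at offsets
   4/8 and 12/16 from the scratch register.  */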
17014 
17015 bool
17016 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17017 			     scalar_mode mode, RTX_CODE code)
17018 {
17019   rtx base, offset, t1, t2;
17020   rtx mem_1, mem_2, mem_3, mem_4;
17021   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
17022 
17023   if (load)
17024     {
17025       mem_1 = operands[1];
17026       mem_2 = operands[3];
17027       mem_3 = operands[5];
17028       mem_4 = operands[7];
17029     }
17030   else
17031     {
17032       mem_1 = operands[0];
17033       mem_2 = operands[2];
17034       mem_3 = operands[4];
17035       mem_4 = operands[6];
17036       gcc_assert (code == UNKNOWN);
17037     }
17038 
17039   extract_base_offset_in_addr (mem_1, &base, &offset);
17040   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
17041 
17042   /* Adjust the offset so that it can fit in an ldp/stp instruction.  */
17043   msize = GET_MODE_SIZE (mode);
17044   stp_off_limit = msize * 0x40;
17045   off_val = INTVAL (offset);
17046   abs_off = (off_val < 0) ? -off_val : off_val;
17047   new_off = abs_off % stp_off_limit;
17048   adj_off = abs_off - new_off;
17049 
17050   /* Further adjust to make sure all offsets are OK.  */
17051   if ((new_off + msize * 2) >= stp_off_limit)
17052     {
17053       adj_off += stp_off_limit;
17054       new_off -= stp_off_limit;
17055     }
17056 
17057   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
17058   if (adj_off >= 0x1000)
17059     return false;
17060 
17061   if (off_val < 0)
17062     {
17063       adj_off = -adj_off;
17064       new_off = -new_off;
17065     }
17066 
17067   /* Create new memory references.  */
17068   mem_1 = change_address (mem_1, VOIDmode,
17069 			  plus_constant (DImode, operands[8], new_off));
17070 
17071   /* Check if the adjusted address is OK for ldp/stp.  */
17072   if (!aarch64_mem_pair_operand (mem_1, mode))
17073     return false;
17074 
17075   msize = GET_MODE_SIZE (mode);
17076   mem_2 = change_address (mem_2, VOIDmode,
17077 			  plus_constant (DImode,
17078 					 operands[8],
17079 					 new_off + msize));
17080   mem_3 = change_address (mem_3, VOIDmode,
17081 			  plus_constant (DImode,
17082 					 operands[8],
17083 					 new_off + msize * 2));
17084   mem_4 = change_address (mem_4, VOIDmode,
17085 			  plus_constant (DImode,
17086 					 operands[8],
17087 					 new_off + msize * 3));
17088 
17089   if (code == ZERO_EXTEND)
17090     {
17091       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17092       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17093       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17094       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17095     }
17096   else if (code == SIGN_EXTEND)
17097     {
17098       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17099       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17100       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17101       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17102     }
17103 
17104   if (load)
17105     {
17106       operands[1] = mem_1;
17107       operands[3] = mem_2;
17108       operands[5] = mem_3;
17109       operands[7] = mem_4;
17110     }
17111   else
17112     {
17113       operands[0] = mem_1;
17114       operands[2] = mem_2;
17115       operands[4] = mem_3;
17116       operands[6] = mem_4;
17117     }
17118 
17119   /* Emit adjusting instruction.  */
17120   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17121   /* Emit ldp/stp instructions.  */
17122   t1 = gen_rtx_SET (operands[0], operands[1]);
17123   t2 = gen_rtx_SET (operands[2], operands[3]);
17124   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17125   t1 = gen_rtx_SET (operands[4], operands[5]);
17126   t2 = gen_rtx_SET (operands[6], operands[7]);
17127   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17128   return true;
17129 }
17130 
17131 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
17132    it isn't worth branching around empty masked ops (including masked
17133    stores).  */
17134 
17135 static bool
17136 aarch64_empty_mask_is_expensive (unsigned)
17137 {
17138   return false;
17139 }
17140 
17141 /* Return true if a pseudo register should be created and used to hold
17142    the GOT address for PIC code.  */
17143 
17144 bool
17145 aarch64_use_pseudo_pic_reg (void)
17146 {
17147   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17148 }
17149 
17150 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
17151 
17152 static int
17153 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17154 {
17155   switch (XINT (x, 1))
17156     {
17157     case UNSPEC_GOTSMALLPIC:
17158     case UNSPEC_GOTSMALLPIC28K:
17159     case UNSPEC_GOTTINYPIC:
17160       return 0;
17161     default:
17162       break;
17163     }
17164 
17165   return default_unspec_may_trap_p (x, flags);
17166 }
17167 
17168 
17169 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17170    return the log2 of that value.  Otherwise return -1.  */
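
/* For example, 8.0 yields 3, whereas 6.0, 0.25, -4.0, NaN and infinity
   all yield -1.  */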
17171 
17172 int
17173 aarch64_fpconst_pow_of_2 (rtx x)
17174 {
17175   const REAL_VALUE_TYPE *r;
17176 
17177   if (!CONST_DOUBLE_P (x))
17178     return -1;
17179 
17180   r = CONST_DOUBLE_REAL_VALUE (x);
17181 
17182   if (REAL_VALUE_NEGATIVE (*r)
17183       || REAL_VALUE_ISNAN (*r)
17184       || REAL_VALUE_ISINF (*r)
17185       || !real_isinteger (r, DFmode))
17186     return -1;
17187 
17188   return exact_log2 (real_to_integer (r));
17189 }
17190 
17191 /* If X is a vector of equal CONST_DOUBLE values and that value is
17192    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
17193 
17194 int
17195 aarch64_vec_fpconst_pow_of_2 (rtx x)
17196 {
17197   int nelts;
17198   if (GET_CODE (x) != CONST_VECTOR
17199       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17200     return -1;
17201 
17202   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17203     return -1;
17204 
17205   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17206   if (firstval <= 0)
17207     return -1;
17208 
17209   for (int i = 1; i < nelts; i++)
17210     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17211       return -1;
17212 
17213   return firstval;
17214 }
17215 
17216 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17217    to float.
17218 
17219    __fp16 always promotes through this hook.
17220    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17221    through the generic excess precision logic rather than here.  */
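
/* For example, given "__fp16 a, b;", the expression "a + b" is evaluated
   in float because __fp16 is promoted by this hook.  */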
17222 
17223 static tree
17224 aarch64_promoted_type (const_tree t)
17225 {
17226   if (SCALAR_FLOAT_TYPE_P (t)
17227       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17228     return float_type_node;
17229 
17230   return NULL_TREE;
17231 }
17232 
17233 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
17234 
17235 static bool
17236 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17237 			   optimization_type opt_type)
17238 {
17239   switch (op)
17240     {
17241     case rsqrt_optab:
17242       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17243 
17244     default:
17245       return true;
17246     }
17247 }
17248 
17249 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
17250 
17251 static unsigned int
17252 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17253 					int *offset)
17254 {
17255   /* Polynomial invariant 1 == (VG / 2) - 1.  */
17256   gcc_assert (i == 1);
17257   *factor = 2;
17258   *offset = 1;
17259   return AARCH64_DWARF_VG;
17260 }
17261 
17262 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17263    if MODE is HFmode, and punt to the generic implementation otherwise.  */
17264 
17265 static bool
17266 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17267 {
17268   return (mode == HFmode
17269 	  ? true
17270 	  : default_libgcc_floating_mode_supported_p (mode));
17271 }
17272 
17273 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17274    if MODE is HFmode, and punt to the generic implementation otherwise.  */
17275 
17276 static bool
17277 aarch64_scalar_mode_supported_p (scalar_mode mode)
17278 {
17279   return (mode == HFmode
17280 	  ? true
17281 	  : default_scalar_mode_supported_p (mode));
17282 }
17283 
17284 /* Set the value of FLT_EVAL_METHOD.
17285    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17286 
17287     0: evaluate all operations and constants, whose semantic type has at
17288        most the range and precision of type float, to the range and
17289        precision of float; evaluate all other operations and constants to
17290        the range and precision of the semantic type;
17291 
17292     N, where _FloatN is a supported interchange floating type
17293        evaluate all operations and constants, whose semantic type has at
17294        most the range and precision of _FloatN type, to the range and
17295        precision of the _FloatN type; evaluate all other operations and
17296        constants to the range and precision of the semantic type;
17297 
17298    If we have the ARMv8.2-A extensions then we support _Float16 in native
17299    precision, so we should set this to 16.  Otherwise, we support the type,
17300    but want to evaluate expressions in float precision, so set this to
17301    0.  */
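
/* For example (illustrative): with the ARMv8.2-A FP16 instructions
   available, "_Float16 x, y;" followed by "x * y" is evaluated directly
   in half precision (FLT_EVAL_METHOD == 16); without them the operands
   are promoted and the multiplication is carried out in float.  */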
17302 
17303 static enum flt_eval_method
17304 aarch64_excess_precision (enum excess_precision_type type)
17305 {
17306   switch (type)
17307     {
17308       case EXCESS_PRECISION_TYPE_FAST:
17309       case EXCESS_PRECISION_TYPE_STANDARD:
17310 	/* We can calculate either in 16-bit range and precision or
17311 	   32-bit range and precision.  Make that decision based on whether
17312 	   we have native support for the ARMv8.2-A 16-bit floating-point
17313 	   instructions or not.  */
17314 	return (TARGET_FP_F16INST
17315 		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17316 		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17317       case EXCESS_PRECISION_TYPE_IMPLICIT:
17318 	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17319       default:
17320 	gcc_unreachable ();
17321     }
17322   return FLT_EVAL_METHOD_UNPREDICTABLE;
17323 }
17324 
17325 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
17326    scheduled for speculative execution.  Reject the long-running division
17327    and square-root instructions.  */
17328 
17329 static bool
17330 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17331 {
17332   switch (get_attr_type (insn))
17333     {
17334       case TYPE_SDIV:
17335       case TYPE_UDIV:
17336       case TYPE_FDIVS:
17337       case TYPE_FDIVD:
17338       case TYPE_FSQRTS:
17339       case TYPE_FSQRTD:
17340       case TYPE_NEON_FP_SQRT_S:
17341       case TYPE_NEON_FP_SQRT_D:
17342       case TYPE_NEON_FP_SQRT_S_Q:
17343       case TYPE_NEON_FP_SQRT_D_Q:
17344       case TYPE_NEON_FP_DIV_S:
17345       case TYPE_NEON_FP_DIV_D:
17346       case TYPE_NEON_FP_DIV_S_Q:
17347       case TYPE_NEON_FP_DIV_D_Q:
17348 	return false;
17349       default:
17350 	return true;
17351     }
17352 }
17353 
17354 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
17355 
17356 static int
17357 aarch64_compute_pressure_classes (reg_class *classes)
17358 {
17359   int i = 0;
17360   classes[i++] = GENERAL_REGS;
17361   classes[i++] = FP_REGS;
17362   /* PR_REGS isn't a useful pressure class because many predicate pseudo
17363      registers need to go in PR_LO_REGS at some point during their
17364      lifetime.  Splitting it into two halves has the effect of making
17365      all predicates count against PR_LO_REGS, so that we try whenever
17366      possible to restrict the number of live predicates to 8.  This
17367      greatly reduces the amount of spilling in certain loops.  */
17368   classes[i++] = PR_LO_REGS;
17369   classes[i++] = PR_HI_REGS;
17370   return i;
17371 }
17372 
17373 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
17374 
17375 static bool
17376 aarch64_can_change_mode_class (machine_mode from,
17377 			       machine_mode to, reg_class_t)
17378 {
17379   if (BYTES_BIG_ENDIAN)
17380     {
17381       bool from_sve_p = aarch64_sve_data_mode_p (from);
17382       bool to_sve_p = aarch64_sve_data_mode_p (to);
17383 
17384       /* Don't allow changes between SVE data modes and non-SVE modes.
17385 	 See the comment at the head of aarch64-sve.md for details.  */
17386       if (from_sve_p != to_sve_p)
17387 	return false;
17388 
17389       /* Don't allow changes in element size: lane 0 of the new vector
17390 	 would not then be lane 0 of the old vector.  See the comment
17391 	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17392 	 description.
17393 
17394 	 In the worst case, this forces a register to be spilled in
17395 	 one mode and reloaded in the other, which handles the
17396 	 endianness correctly.  */
17397       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17398 	return false;
17399     }
17400   return true;
17401 }
17402 
17403 /* Implement TARGET_EARLY_REMAT_MODES.  */
17404 
17405 static void
17406 aarch64_select_early_remat_modes (sbitmap modes)
17407 {
17408   /* SVE values are not normally live across a call, so it should be
17409      worth doing early rematerialization even in VL-specific mode.  */
17410   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17411     {
17412       machine_mode mode = (machine_mode) i;
17413       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17414       if (vec_flags & VEC_ANY_SVE)
17415 	bitmap_set_bit (modes, i);
17416     }
17417 }
17418 
17419 /* Target-specific selftests.  */
17420 
17421 #if CHECKING_P
17422 
17423 namespace selftest {
17424 
17425 /* Selftest for the RTL loader.
17426    Verify that the RTL loader copes with a dump from
17427    print_rtx_function.  This is essentially just a test that class
17428    function_reader can handle a real dump, but it also verifies
17429    that lookup_reg_by_dump_name correctly handles hard regs.
17430    The presence of hard reg names in the dump means that the test is
17431    target-specific, hence it is in this file.  */
17432 
17433 static void
17434 aarch64_test_loading_full_dump ()
17435 {
17436   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17437 
17438   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17439 
17440   rtx_insn *insn_1 = get_insn_by_uid (1);
17441   ASSERT_EQ (NOTE, GET_CODE (insn_1));
17442 
17443   rtx_insn *insn_15 = get_insn_by_uid (15);
17444   ASSERT_EQ (INSN, GET_CODE (insn_15));
17445   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17446 
17447   /* Verify crtl->return_rtx.  */
17448   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17449   ASSERT_EQ (0, REGNO (crtl->return_rtx));
17450   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17451 }
17452 
17453 /* Run all target-specific selftests.  */
17454 
17455 static void
17456 aarch64_run_selftests (void)
17457 {
17458   aarch64_test_loading_full_dump ();
17459 }
17460 
17461 } // namespace selftest
17462 
17463 #endif /* #if CHECKING_P */
17464 
17465 #undef TARGET_ADDRESS_COST
17466 #define TARGET_ADDRESS_COST aarch64_address_cost
17467 
17468 /* This hook determines whether unnamed bitfields affect the alignment
17469    of the containing structure.  The hook returns true if the structure
17470    should inherit the alignment requirements of an unnamed bitfield's
17471    type.  */
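/* For example, with this hook returning true, a structure such as
   "struct s { char c; int : 4; };" should take the alignment of "int",
   just as it would if the bit-field were named.  */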
17472 #undef TARGET_ALIGN_ANON_BITFIELD
17473 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17474 
17475 #undef TARGET_ASM_ALIGNED_DI_OP
17476 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17477 
17478 #undef TARGET_ASM_ALIGNED_HI_OP
17479 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17480 
17481 #undef TARGET_ASM_ALIGNED_SI_OP
17482 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17483 
17484 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17485 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17486   hook_bool_const_tree_hwi_hwi_const_tree_true
17487 
17488 #undef TARGET_ASM_FILE_START
17489 #define TARGET_ASM_FILE_START aarch64_start_file
17490 
17491 #undef TARGET_ASM_OUTPUT_MI_THUNK
17492 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17493 
17494 #undef TARGET_ASM_SELECT_RTX_SECTION
17495 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17496 
17497 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17498 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17499 
17500 #undef TARGET_BUILD_BUILTIN_VA_LIST
17501 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17502 
17503 #undef TARGET_CALLEE_COPIES
17504 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17505 
17506 #undef TARGET_CAN_ELIMINATE
17507 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17508 
17509 #undef TARGET_CAN_INLINE_P
17510 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17511 
17512 #undef TARGET_CANNOT_FORCE_CONST_MEM
17513 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17514 
17515 #undef TARGET_CASE_VALUES_THRESHOLD
17516 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17517 
17518 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17519 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17520 
17521 /* Only the least significant bit is used for initialization guard
17522    variables.  */
17523 #undef TARGET_CXX_GUARD_MASK_BIT
17524 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17525 
17526 #undef TARGET_C_MODE_FOR_SUFFIX
17527 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17528 
17529 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17530 #undef  TARGET_DEFAULT_TARGET_FLAGS
17531 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17532 #endif
17533 
17534 #undef TARGET_CLASS_MAX_NREGS
17535 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17536 
17537 #undef TARGET_BUILTIN_DECL
17538 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17539 
17540 #undef TARGET_BUILTIN_RECIPROCAL
17541 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17542 
17543 #undef TARGET_C_EXCESS_PRECISION
17544 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17545 
17546 #undef  TARGET_EXPAND_BUILTIN
17547 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17548 
17549 #undef TARGET_EXPAND_BUILTIN_VA_START
17550 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17551 
17552 #undef TARGET_FOLD_BUILTIN
17553 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17554 
17555 #undef TARGET_FUNCTION_ARG
17556 #define TARGET_FUNCTION_ARG aarch64_function_arg
17557 
17558 #undef TARGET_FUNCTION_ARG_ADVANCE
17559 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17560 
17561 #undef TARGET_FUNCTION_ARG_BOUNDARY
17562 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17563 
17564 #undef TARGET_FUNCTION_ARG_PADDING
17565 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17566 
17567 #undef TARGET_GET_RAW_RESULT_MODE
17568 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17569 #undef TARGET_GET_RAW_ARG_MODE
17570 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17571 
17572 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17573 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17574 
17575 #undef TARGET_FUNCTION_VALUE
17576 #define TARGET_FUNCTION_VALUE aarch64_function_value
17577 
17578 #undef TARGET_FUNCTION_VALUE_REGNO_P
17579 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17580 
17581 #undef TARGET_GIMPLE_FOLD_BUILTIN
17582 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17583 
17584 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17585 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17586 
17587 #undef  TARGET_INIT_BUILTINS
17588 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
17589 
17590 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17591 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17592   aarch64_ira_change_pseudo_allocno_class
17593 
17594 #undef TARGET_LEGITIMATE_ADDRESS_P
17595 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17596 
17597 #undef TARGET_LEGITIMATE_CONSTANT_P
17598 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17599 
17600 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17601 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17602   aarch64_legitimize_address_displacement
17603 
17604 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17605 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17606 
17607 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17608 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17609   aarch64_libgcc_floating_mode_supported_p
17610 
17611 #undef TARGET_MANGLE_TYPE
17612 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17613 
17614 #undef TARGET_MEMORY_MOVE_COST
17615 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17616 
17617 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17618 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17619 
17620 #undef TARGET_MUST_PASS_IN_STACK
17621 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17622 
17623 /* This target hook should return true if accesses to volatile bitfields
17624    should use the narrowest mode possible.  It should return false if these
17625    accesses should use the bitfield container type.  */
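/* With the hook returning false, a read of the 8-bit field in
   "struct { volatile int f : 8; }" would therefore use a 32-bit access
   to the "int" container rather than a single-byte access.  */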
17626 #undef TARGET_NARROW_VOLATILE_BITFIELD
17627 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17628 
17629 #undef  TARGET_OPTION_OVERRIDE
17630 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17631 
17632 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17633 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17634   aarch64_override_options_after_change
17635 
17636 #undef TARGET_OPTION_SAVE
17637 #define TARGET_OPTION_SAVE aarch64_option_save
17638 
17639 #undef TARGET_OPTION_RESTORE
17640 #define TARGET_OPTION_RESTORE aarch64_option_restore
17641 
17642 #undef TARGET_OPTION_PRINT
17643 #define TARGET_OPTION_PRINT aarch64_option_print
17644 
17645 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17646 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17647 
17648 #undef TARGET_SET_CURRENT_FUNCTION
17649 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17650 
17651 #undef TARGET_PASS_BY_REFERENCE
17652 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17653 
17654 #undef TARGET_PREFERRED_RELOAD_CLASS
17655 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17656 
17657 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17658 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17659 
17660 #undef TARGET_PROMOTED_TYPE
17661 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17662 
17663 #undef TARGET_SECONDARY_RELOAD
17664 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17665 
17666 #undef TARGET_SHIFT_TRUNCATION_MASK
17667 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17668 
17669 #undef TARGET_SETUP_INCOMING_VARARGS
17670 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17671 
17672 #undef TARGET_STRUCT_VALUE_RTX
17673 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
17674 
17675 #undef TARGET_REGISTER_MOVE_COST
17676 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17677 
17678 #undef TARGET_RETURN_IN_MEMORY
17679 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17680 
17681 #undef TARGET_RETURN_IN_MSB
17682 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17683 
17684 #undef TARGET_RTX_COSTS
17685 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17686 
17687 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17688 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17689 
17690 #undef TARGET_SCHED_ISSUE_RATE
17691 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17692 
17693 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17694 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17695   aarch64_sched_first_cycle_multipass_dfa_lookahead
17696 
17697 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17698 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17699   aarch64_first_cycle_multipass_dfa_lookahead_guard
17700 
17701 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17702 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17703   aarch64_get_separate_components
17704 
17705 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17706 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17707   aarch64_components_for_bb
17708 
17709 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17710 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17711   aarch64_disqualify_components
17712 
17713 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17714 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17715   aarch64_emit_prologue_components
17716 
17717 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17718 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17719   aarch64_emit_epilogue_components
17720 
17721 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17722 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17723   aarch64_set_handled_components
17724 
17725 #undef TARGET_TRAMPOLINE_INIT
17726 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17727 
17728 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17729 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17730 
17731 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17732 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17733 
17734 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17735 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17736   aarch64_builtin_support_vector_misalignment
17737 
17738 #undef TARGET_ARRAY_MODE
17739 #define TARGET_ARRAY_MODE aarch64_array_mode
17740 
17741 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17742 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17743 
17744 #undef TARGET_VECTORIZE_ADD_STMT_COST
17745 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17746 
17747 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17748 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17749   aarch64_builtin_vectorization_cost
17750 
17751 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17752 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17753 
17754 #undef TARGET_VECTORIZE_BUILTINS
17755 #define TARGET_VECTORIZE_BUILTINS
17756 
17757 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17758 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17759   aarch64_builtin_vectorized_function
17760 
17761 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17762 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17763   aarch64_autovectorize_vector_sizes
17764 
17765 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17766 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17767   aarch64_atomic_assign_expand_fenv
17768 
17769 /* Section anchor support.  */
17770 
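/* The minimum offset presumably mirrors the signed 9-bit unscaled
   addressing forms (LDUR/STUR), which accept byte offsets in the
   range -256..255.  */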
17771 #undef TARGET_MIN_ANCHOR_OFFSET
17772 #define TARGET_MIN_ANCHOR_OFFSET -256
17773 
17774 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17775    byte offset; we can do much more for larger data types, but have no way
17776    to determine the size of the access.  We assume accesses are aligned.  */
17777 #undef TARGET_MAX_ANCHOR_OFFSET
17778 #define TARGET_MAX_ANCHOR_OFFSET 4095
17779 
17780 #undef TARGET_VECTOR_ALIGNMENT
17781 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17782 
17783 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17784 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17785   aarch64_vectorize_preferred_vector_alignment
17786 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17787 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17788   aarch64_simd_vector_alignment_reachable
17789 
17790 /* vec_perm support.  */
17791 
17792 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17793 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17794   aarch64_vectorize_vec_perm_const
17795 
17796 #undef TARGET_VECTORIZE_GET_MASK_MODE
17797 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17798 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17799 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17800   aarch64_empty_mask_is_expensive
17801 
17802 #undef TARGET_INIT_LIBFUNCS
17803 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17804 
17805 #undef TARGET_FIXED_CONDITION_CODE_REGS
17806 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17807 
17808 #undef TARGET_FLAGS_REGNUM
17809 #define TARGET_FLAGS_REGNUM CC_REGNUM
17810 
17811 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17812 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17813 
17814 #undef TARGET_ASAN_SHADOW_OFFSET
17815 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17816 
17817 #undef TARGET_LEGITIMIZE_ADDRESS
17818 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17819 
17820 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17821 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17822 
17823 #undef TARGET_CAN_USE_DOLOOP_P
17824 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17825 
17826 #undef TARGET_SCHED_ADJUST_PRIORITY
17827 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17828 
17829 #undef TARGET_SCHED_MACRO_FUSION_P
17830 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17831 
17832 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17833 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17834 
17835 #undef TARGET_SCHED_FUSION_PRIORITY
17836 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17837 
17838 #undef TARGET_UNSPEC_MAY_TRAP_P
17839 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17840 
17841 #undef TARGET_USE_PSEUDO_PIC_REG
17842 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17843 
17844 #undef TARGET_PRINT_OPERAND
17845 #define TARGET_PRINT_OPERAND aarch64_print_operand
17846 
17847 #undef TARGET_PRINT_OPERAND_ADDRESS
17848 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17849 
17850 #undef TARGET_OPTAB_SUPPORTED_P
17851 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17852 
17853 #undef TARGET_OMIT_STRUCT_RETURN_REG
17854 #define TARGET_OMIT_STRUCT_RETURN_REG true
17855 
17856 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17857 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17858   aarch64_dwarf_poly_indeterminate_value
17859 
17860 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
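/* 4 is 1 << 2, i.e. bit 2 expressed as a single-bit value.  */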
17861 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17862 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17863 
17864 #undef TARGET_HARD_REGNO_NREGS
17865 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17866 #undef TARGET_HARD_REGNO_MODE_OK
17867 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17868 
17869 #undef TARGET_MODES_TIEABLE_P
17870 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17871 
17872 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17873 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17874   aarch64_hard_regno_call_part_clobbered
17875 
17876 #undef TARGET_CONSTANT_ALIGNMENT
17877 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17878 
17879 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17880 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17881 
17882 #undef TARGET_CAN_CHANGE_MODE_CLASS
17883 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17884 
17885 #undef TARGET_SELECT_EARLY_REMAT_MODES
17886 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17887 
17888 #if CHECKING_P
17889 #undef TARGET_RUN_TARGET_SELFTESTS
17890 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17891 #endif /* #if CHECKING_P */
17892 
17893 struct gcc_target targetm = TARGET_INITIALIZER;
17894 
17895 #include "gt-aarch64.h"
17896