1 /* Machine description for AArch64 architecture.
2    Copyright (C) 2009-2018 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 
75 /* This file should be included last.  */
76 #include "target-def.h"
77 
78 /* Defined for convenience.  */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
80 
81 /* Classifies an address.
82 
83    ADDRESS_REG_IMM
84        A simple base register plus immediate offset.
85 
86    ADDRESS_REG_WB
87        A base register indexed by immediate offset with writeback.
88 
89    ADDRESS_REG_REG
90        A base register indexed by (optionally scaled) register.
91 
92    ADDRESS_REG_UXTW
93        A base register indexed by (optionally scaled) zero-extended register.
94 
95    ADDRESS_REG_SXTW
96        A base register indexed by (optionally scaled) sign-extended register.
97 
98    ADDRESS_LO_SUM
99        A LO_SUM rtx with a base register and "LO12" symbol relocation.
100 
   ADDRESS_SYMBOLIC
       A constant symbolic address, in a pc-relative literal pool.  */
103 
104 enum aarch64_address_type {
105   ADDRESS_REG_IMM,
106   ADDRESS_REG_WB,
107   ADDRESS_REG_REG,
108   ADDRESS_REG_UXTW,
109   ADDRESS_REG_SXTW,
110   ADDRESS_LO_SUM,
111   ADDRESS_SYMBOLIC
112 };
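
/* Typical assembly forms for the classes above (illustrative only):
     ADDRESS_REG_IMM     [x0, #16]
     ADDRESS_REG_WB      [x0, #16]! or [x0], #16
     ADDRESS_REG_REG     [x0, x1, lsl #3]
     ADDRESS_REG_UXTW    [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW    [x0, w1, sxtw #2]
     ADDRESS_LO_SUM      [x0, #:lo12:sym]
     ADDRESS_SYMBOLIC    ldr x0, .LC0 (pc-relative literal)  */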
113 
114 struct aarch64_address_info {
115   enum aarch64_address_type type;
116   rtx base;
117   rtx offset;
118   poly_int64 const_offset;
119   int shift;
120   enum aarch64_symbol_type symbol_type;
121 };
122 
123 /* Information about a legitimate vector immediate operand.  */
124 struct simd_immediate_info
125 {
126   enum insn_type { MOV, MVN };
127   enum modifier_type { LSL, MSL };
128 
  simd_immediate_info () {}
130   simd_immediate_info (scalar_float_mode, rtx);
131   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 		       insn_type = MOV, modifier_type = LSL,
133 		       unsigned int = 0);
134   simd_immediate_info (scalar_mode, rtx, rtx);
135 
136   /* The mode of the elements.  */
137   scalar_mode elt_mode;
138 
139   /* The value of each element if all elements are the same, or the
140      first value if the constant is a series.  */
141   rtx value;
142 
143   /* The value of the step if the constant is a series, null otherwise.  */
144   rtx step;
145 
146   /* The instruction to use to move the immediate into a vector.  */
147   insn_type insn;
148 
149   /* The kind of shift modifier to use, and the number of bits to shift.
150      This is (LSL, 0) if no shift is needed.  */
151   modifier_type modifier;
152   unsigned int shift;
153 };
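
/* For example, an Advanced SIMD constant in which every SImode element is
   0x00AB0000 could be described as value 0xAB, insn MOV, modifier LSL and
   shift 16, matching "movi v0.4s, #0xab, lsl #16" (an illustrative encoding,
   not code taken from this file).  */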
154 
155 /* Construct a floating-point immediate in which each element has mode
156    ELT_MODE_IN and value VALUE_IN.  */
157 inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159   : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160     modifier (LSL), shift (0)
161 {}
162 
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164    and value VALUE_IN.  The other parameters are as for the structure
165    fields.  */
166 inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
168 		       unsigned HOST_WIDE_INT value_in,
169 		       insn_type insn_in, modifier_type modifier_in,
170 		       unsigned int shift_in)
171   : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172     step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
173 {}
174 
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176    and where element I is equal to VALUE_IN + I * STEP_IN.  */
177 inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179   : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180     modifier (LSL), shift (0)
181 {}
182 
183 /* The current code model.  */
184 enum aarch64_code_model aarch64_cmodel;
185 
186 /* The number of 64-bit elements in an SVE vector.  */
187 poly_uint16 aarch64_sve_vg;
188 
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
193 
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 						     const_tree,
197 						     machine_mode *, int *,
198 						     bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 							 const_tree type,
206 							 int misalignment,
207 							 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
210 
211 /* Major revision number of the ARM Architecture implemented by the target.  */
212 unsigned aarch64_architecture_version;
213 
214 /* The processor for which instructions should be scheduled.  */
215 enum aarch64_processor aarch64_tune = cortexa53;
216 
217 /* Mask to specify which instruction scheduling options should be used.  */
218 unsigned long aarch64_tune_flags = 0;
219 
220 /* Global flag for PC relative loads.  */
221 bool aarch64_pcrelative_literal_loads;
222 
223 /* Support for command line parsing of boolean flags in the tuning
224    structures.  */
225 struct aarch64_flag_desc
226 {
227   const char* name;
228   unsigned int flag;
229 };
230 
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232   { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
234 {
235   { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237   { "all", AARCH64_FUSE_ALL },
238   { NULL, AARCH64_FUSE_NOTHING }
239 };
240 
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242   { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
244 {
245   { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247   { "all", AARCH64_EXTRA_TUNE_ALL },
248   { NULL, AARCH64_EXTRA_TUNE_NONE }
249 };
250 
251 /* Tuning parameters.  */
252 
253 static const struct cpu_addrcost_table generic_addrcost_table =
254 {
255     {
256       1, /* hi  */
257       0, /* si  */
258       0, /* di  */
259       1, /* ti  */
260     },
261   0, /* pre_modify  */
262   0, /* post_modify  */
263   0, /* register_offset  */
264   0, /* register_sextend  */
265   0, /* register_zextend  */
266   0 /* imm_offset  */
267 };
268 
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
270 {
271     {
272       0, /* hi  */
273       0, /* si  */
274       0, /* di  */
275       2, /* ti  */
276     },
277   0, /* pre_modify  */
278   0, /* post_modify  */
279   1, /* register_offset  */
280   1, /* register_sextend  */
281   2, /* register_zextend  */
282   0, /* imm_offset  */
283 };
284 
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
286 {
287     {
288       1, /* hi  */
289       0, /* si  */
290       0, /* di  */
291       1, /* ti  */
292     },
293   1, /* pre_modify  */
294   0, /* post_modify  */
295   0, /* register_offset  */
296   1, /* register_sextend  */
297   1, /* register_zextend  */
298   0, /* imm_offset  */
299 };
300 
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
302 {
303     {
304       1, /* hi  */
305       1, /* si  */
306       1, /* di  */
307       2, /* ti  */
308     },
309   0, /* pre_modify  */
310   0, /* post_modify  */
311   2, /* register_offset  */
312   3, /* register_sextend  */
313   3, /* register_zextend  */
314   0, /* imm_offset  */
315 };
316 
317 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
318 {
319     {
320       1, /* hi  */
321       1, /* si  */
322       1, /* di  */
323       2, /* ti  */
324     },
325   1, /* pre_modify  */
326   1, /* post_modify  */
327   3, /* register_offset  */
328   3, /* register_sextend  */
329   3, /* register_zextend  */
330   2, /* imm_offset  */
331 };
332 
333 static const struct cpu_regmove_cost generic_regmove_cost =
334 {
335   1, /* GP2GP  */
336   /* Avoid the use of slow int<->fp moves for spilling by setting
337      their cost higher than memmov_cost.  */
338   5, /* GP2FP  */
339   5, /* FP2GP  */
340   2 /* FP2FP  */
341 };
342 
343 static const struct cpu_regmove_cost cortexa57_regmove_cost =
344 {
345   1, /* GP2GP  */
346   /* Avoid the use of slow int<->fp moves for spilling by setting
347      their cost higher than memmov_cost.  */
348   5, /* GP2FP  */
349   5, /* FP2GP  */
350   2 /* FP2FP  */
351 };
352 
353 static const struct cpu_regmove_cost cortexa53_regmove_cost =
354 {
355   1, /* GP2GP  */
356   /* Avoid the use of slow int<->fp moves for spilling by setting
357      their cost higher than memmov_cost.  */
358   5, /* GP2FP  */
359   5, /* FP2GP  */
360   2 /* FP2FP  */
361 };
362 
363 static const struct cpu_regmove_cost exynosm1_regmove_cost =
364 {
365   1, /* GP2GP  */
366   /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (the actual costs are 4 and 9).  */
368   9, /* GP2FP  */
369   9, /* FP2GP  */
370   1 /* FP2FP  */
371 };
372 
373 static const struct cpu_regmove_cost thunderx_regmove_cost =
374 {
375   2, /* GP2GP  */
376   2, /* GP2FP  */
377   6, /* FP2GP  */
378   4 /* FP2FP  */
379 };
380 
381 static const struct cpu_regmove_cost xgene1_regmove_cost =
382 {
383   1, /* GP2GP  */
384   /* Avoid the use of slow int<->fp moves for spilling by setting
385      their cost higher than memmov_cost.  */
386   8, /* GP2FP  */
387   8, /* FP2GP  */
388   2 /* FP2FP  */
389 };
390 
391 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
392 {
393   2, /* GP2GP  */
394   /* Avoid the use of int<->fp moves for spilling.  */
395   6, /* GP2FP  */
396   6, /* FP2GP  */
397   4 /* FP2FP  */
398 };
399 
400 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
401 {
402   1, /* GP2GP  */
403   /* Avoid the use of int<->fp moves for spilling.  */
404   8, /* GP2FP  */
405   8, /* FP2GP  */
406   4  /* FP2FP  */
407 };
408 
409 /* Generic costs for vector insn classes.  */
410 static const struct cpu_vector_cost generic_vector_cost =
411 {
412   1, /* scalar_int_stmt_cost  */
413   1, /* scalar_fp_stmt_cost  */
414   1, /* scalar_load_cost  */
415   1, /* scalar_store_cost  */
416   1, /* vec_int_stmt_cost  */
417   1, /* vec_fp_stmt_cost  */
418   2, /* vec_permute_cost  */
419   1, /* vec_to_scalar_cost  */
420   1, /* scalar_to_vec_cost  */
421   1, /* vec_align_load_cost  */
422   1, /* vec_unalign_load_cost  */
423   1, /* vec_unalign_store_cost  */
424   1, /* vec_store_cost  */
425   3, /* cond_taken_branch_cost  */
426   1 /* cond_not_taken_branch_cost  */
427 };
428 
429 /* QDF24XX costs for vector insn classes.  */
430 static const struct cpu_vector_cost qdf24xx_vector_cost =
431 {
432   1, /* scalar_int_stmt_cost  */
433   1, /* scalar_fp_stmt_cost  */
434   1, /* scalar_load_cost  */
435   1, /* scalar_store_cost  */
436   1, /* vec_int_stmt_cost  */
437   3, /* vec_fp_stmt_cost  */
438   2, /* vec_permute_cost  */
439   1, /* vec_to_scalar_cost  */
440   1, /* scalar_to_vec_cost  */
441   1, /* vec_align_load_cost  */
442   1, /* vec_unalign_load_cost  */
443   1, /* vec_unalign_store_cost  */
444   1, /* vec_store_cost  */
445   3, /* cond_taken_branch_cost  */
446   1 /* cond_not_taken_branch_cost  */
447 };
448 
449 /* ThunderX costs for vector insn classes.  */
450 static const struct cpu_vector_cost thunderx_vector_cost =
451 {
452   1, /* scalar_int_stmt_cost  */
453   1, /* scalar_fp_stmt_cost  */
454   3, /* scalar_load_cost  */
455   1, /* scalar_store_cost  */
456   4, /* vec_int_stmt_cost  */
457   1, /* vec_fp_stmt_cost  */
458   4, /* vec_permute_cost  */
459   2, /* vec_to_scalar_cost  */
460   2, /* scalar_to_vec_cost  */
461   3, /* vec_align_load_cost  */
462   5, /* vec_unalign_load_cost  */
463   5, /* vec_unalign_store_cost  */
464   1, /* vec_store_cost  */
465   3, /* cond_taken_branch_cost  */
466   3 /* cond_not_taken_branch_cost  */
467 };
468 
/* Cortex-A57 costs for vector insn classes.  */
470 static const struct cpu_vector_cost cortexa57_vector_cost =
471 {
472   1, /* scalar_int_stmt_cost  */
473   1, /* scalar_fp_stmt_cost  */
474   4, /* scalar_load_cost  */
475   1, /* scalar_store_cost  */
476   2, /* vec_int_stmt_cost  */
477   2, /* vec_fp_stmt_cost  */
478   3, /* vec_permute_cost  */
479   8, /* vec_to_scalar_cost  */
480   8, /* scalar_to_vec_cost  */
481   4, /* vec_align_load_cost  */
482   4, /* vec_unalign_load_cost  */
483   1, /* vec_unalign_store_cost  */
484   1, /* vec_store_cost  */
485   1, /* cond_taken_branch_cost  */
486   1 /* cond_not_taken_branch_cost  */
487 };
488 
489 static const struct cpu_vector_cost exynosm1_vector_cost =
490 {
491   1, /* scalar_int_stmt_cost  */
492   1, /* scalar_fp_stmt_cost  */
493   5, /* scalar_load_cost  */
494   1, /* scalar_store_cost  */
495   3, /* vec_int_stmt_cost  */
496   3, /* vec_fp_stmt_cost  */
497   3, /* vec_permute_cost  */
498   3, /* vec_to_scalar_cost  */
499   3, /* scalar_to_vec_cost  */
500   5, /* vec_align_load_cost  */
501   5, /* vec_unalign_load_cost  */
502   1, /* vec_unalign_store_cost  */
503   1, /* vec_store_cost  */
504   1, /* cond_taken_branch_cost  */
505   1 /* cond_not_taken_branch_cost  */
506 };
507 
/* X-Gene 1 costs for vector insn classes.  */
509 static const struct cpu_vector_cost xgene1_vector_cost =
510 {
511   1, /* scalar_int_stmt_cost  */
512   1, /* scalar_fp_stmt_cost  */
513   5, /* scalar_load_cost  */
514   1, /* scalar_store_cost  */
515   2, /* vec_int_stmt_cost  */
516   2, /* vec_fp_stmt_cost  */
517   2, /* vec_permute_cost  */
518   4, /* vec_to_scalar_cost  */
519   4, /* scalar_to_vec_cost  */
520   10, /* vec_align_load_cost  */
521   10, /* vec_unalign_load_cost  */
522   2, /* vec_unalign_store_cost  */
523   2, /* vec_store_cost  */
524   2, /* cond_taken_branch_cost  */
525   1 /* cond_not_taken_branch_cost  */
526 };
527 
528 /* Costs for vector insn classes for Vulcan.  */
529 static const struct cpu_vector_cost thunderx2t99_vector_cost =
530 {
531   1, /* scalar_int_stmt_cost  */
532   6, /* scalar_fp_stmt_cost  */
533   4, /* scalar_load_cost  */
534   1, /* scalar_store_cost  */
535   5, /* vec_int_stmt_cost  */
536   6, /* vec_fp_stmt_cost  */
537   3, /* vec_permute_cost  */
538   6, /* vec_to_scalar_cost  */
539   5, /* scalar_to_vec_cost  */
540   8, /* vec_align_load_cost  */
541   8, /* vec_unalign_load_cost  */
542   4, /* vec_unalign_store_cost  */
543   4, /* vec_store_cost  */
544   2, /* cond_taken_branch_cost  */
545   1  /* cond_not_taken_branch_cost  */
546 };
547 
548 /* Generic costs for branch instructions.  */
549 static const struct cpu_branch_cost generic_branch_cost =
550 {
551   1,  /* Predictable.  */
552   3   /* Unpredictable.  */
553 };
554 
555 /* Generic approximation modes.  */
556 static const cpu_approx_modes generic_approx_modes =
557 {
558   AARCH64_APPROX_NONE,	/* division  */
559   AARCH64_APPROX_NONE,	/* sqrt  */
560   AARCH64_APPROX_NONE	/* recip_sqrt  */
561 };
562 
563 /* Approximation modes for Exynos M1.  */
564 static const cpu_approx_modes exynosm1_approx_modes =
565 {
566   AARCH64_APPROX_NONE,	/* division  */
567   AARCH64_APPROX_ALL,	/* sqrt  */
568   AARCH64_APPROX_ALL	/* recip_sqrt  */
569 };
570 
571 /* Approximation modes for X-Gene 1.  */
572 static const cpu_approx_modes xgene1_approx_modes =
573 {
574   AARCH64_APPROX_NONE,	/* division  */
575   AARCH64_APPROX_NONE,	/* sqrt  */
576   AARCH64_APPROX_ALL	/* recip_sqrt  */
577 };
578 
579 /* Generic prefetch settings (which disable prefetch).  */
580 static const cpu_prefetch_tune generic_prefetch_tune =
581 {
582   0,			/* num_slots  */
583   -1,			/* l1_cache_size  */
584   -1,			/* l1_cache_line_size  */
585   -1,			/* l2_cache_size  */
586   -1			/* default_opt_level  */
587 };
588 
589 static const cpu_prefetch_tune exynosm1_prefetch_tune =
590 {
591   0,			/* num_slots  */
592   -1,			/* l1_cache_size  */
593   64,			/* l1_cache_line_size  */
594   -1,			/* l2_cache_size  */
595   -1			/* default_opt_level  */
596 };
597 
598 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
599 {
600   4,			/* num_slots  */
601   32,			/* l1_cache_size  */
602   64,			/* l1_cache_line_size  */
603   1024,			/* l2_cache_size  */
604   -1			/* default_opt_level  */
605 };
606 
607 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
608 {
609   8,			/* num_slots  */
610   32,			/* l1_cache_size  */
611   128,			/* l1_cache_line_size  */
612   16*1024,		/* l2_cache_size  */
613   3			/* default_opt_level  */
614 };
615 
616 static const cpu_prefetch_tune thunderx_prefetch_tune =
617 {
618   8,			/* num_slots  */
619   32,			/* l1_cache_size  */
620   128,			/* l1_cache_line_size  */
621   -1,			/* l2_cache_size  */
622   -1			/* default_opt_level  */
623 };
624 
625 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
626 {
627   8,			/* num_slots  */
628   32,			/* l1_cache_size  */
629   64,			/* l1_cache_line_size  */
630   256,			/* l2_cache_size  */
631   -1			/* default_opt_level  */
632 };
633 
634 static const struct tune_params generic_tunings =
635 {
636   &cortexa57_extra_costs,
637   &generic_addrcost_table,
638   &generic_regmove_cost,
639   &generic_vector_cost,
640   &generic_branch_cost,
641   &generic_approx_modes,
642   4, /* memmov_cost  */
643   2, /* issue_rate  */
644   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
645   8,	/* function_align.  */
646   4,	/* jump_align.  */
647   8,	/* loop_align.  */
648   2,	/* int_reassoc_width.  */
649   4,	/* fp_reassoc_width.  */
650   1,	/* vec_reassoc_width.  */
651   2,	/* min_div_recip_mul_sf.  */
652   2,	/* min_div_recip_mul_df.  */
653   0,	/* max_case_values.  */
654   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
655   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
656   &generic_prefetch_tune
657 };
658 
659 static const struct tune_params cortexa35_tunings =
660 {
661   &cortexa53_extra_costs,
662   &generic_addrcost_table,
663   &cortexa53_regmove_cost,
664   &generic_vector_cost,
665   &generic_branch_cost,
666   &generic_approx_modes,
667   4, /* memmov_cost  */
668   1, /* issue_rate  */
669   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
670    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
671   16,	/* function_align.  */
672   4,	/* jump_align.  */
673   8,	/* loop_align.  */
674   2,	/* int_reassoc_width.  */
675   4,	/* fp_reassoc_width.  */
676   1,	/* vec_reassoc_width.  */
677   2,	/* min_div_recip_mul_sf.  */
678   2,	/* min_div_recip_mul_df.  */
679   0,	/* max_case_values.  */
680   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
681   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
682   &generic_prefetch_tune
683 };
684 
685 static const struct tune_params cortexa53_tunings =
686 {
687   &cortexa53_extra_costs,
688   &generic_addrcost_table,
689   &cortexa53_regmove_cost,
690   &generic_vector_cost,
691   &generic_branch_cost,
692   &generic_approx_modes,
693   4, /* memmov_cost  */
694   2, /* issue_rate  */
695   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
696    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
697   16,	/* function_align.  */
698   4,	/* jump_align.  */
699   8,	/* loop_align.  */
700   2,	/* int_reassoc_width.  */
701   4,	/* fp_reassoc_width.  */
702   1,	/* vec_reassoc_width.  */
703   2,	/* min_div_recip_mul_sf.  */
704   2,	/* min_div_recip_mul_df.  */
705   0,	/* max_case_values.  */
706   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
707   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
708   &generic_prefetch_tune
709 };
710 
711 static const struct tune_params cortexa57_tunings =
712 {
713   &cortexa57_extra_costs,
714   &generic_addrcost_table,
715   &cortexa57_regmove_cost,
716   &cortexa57_vector_cost,
717   &generic_branch_cost,
718   &generic_approx_modes,
719   4, /* memmov_cost  */
720   3, /* issue_rate  */
721   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
723   16,	/* function_align.  */
724   4,	/* jump_align.  */
725   8,	/* loop_align.  */
726   2,	/* int_reassoc_width.  */
727   4,	/* fp_reassoc_width.  */
728   1,	/* vec_reassoc_width.  */
729   2,	/* min_div_recip_mul_sf.  */
730   2,	/* min_div_recip_mul_df.  */
731   0,	/* max_case_values.  */
732   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
733   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
734   &generic_prefetch_tune
735 };
736 
737 static const struct tune_params cortexa72_tunings =
738 {
739   &cortexa57_extra_costs,
740   &generic_addrcost_table,
741   &cortexa57_regmove_cost,
742   &cortexa57_vector_cost,
743   &generic_branch_cost,
744   &generic_approx_modes,
745   4, /* memmov_cost  */
746   3, /* issue_rate  */
747   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
748    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
749   16,	/* function_align.  */
750   4,	/* jump_align.  */
751   8,	/* loop_align.  */
752   2,	/* int_reassoc_width.  */
753   4,	/* fp_reassoc_width.  */
754   1,	/* vec_reassoc_width.  */
755   2,	/* min_div_recip_mul_sf.  */
756   2,	/* min_div_recip_mul_df.  */
757   0,	/* max_case_values.  */
758   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
759   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
760   &generic_prefetch_tune
761 };
762 
763 static const struct tune_params cortexa73_tunings =
764 {
765   &cortexa57_extra_costs,
766   &generic_addrcost_table,
767   &cortexa57_regmove_cost,
768   &cortexa57_vector_cost,
769   &generic_branch_cost,
770   &generic_approx_modes,
771   4, /* memmov_cost.  */
772   2, /* issue_rate.  */
773   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
774    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
775   16,	/* function_align.  */
776   4,	/* jump_align.  */
777   8,	/* loop_align.  */
778   2,	/* int_reassoc_width.  */
779   4,	/* fp_reassoc_width.  */
780   1,	/* vec_reassoc_width.  */
781   2,	/* min_div_recip_mul_sf.  */
782   2,	/* min_div_recip_mul_df.  */
783   0,	/* max_case_values.  */
784   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
785   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
786   &generic_prefetch_tune
787 };
788 
789 
790 
791 static const struct tune_params exynosm1_tunings =
792 {
793   &exynosm1_extra_costs,
794   &exynosm1_addrcost_table,
795   &exynosm1_regmove_cost,
796   &exynosm1_vector_cost,
797   &generic_branch_cost,
798   &exynosm1_approx_modes,
799   4,	/* memmov_cost  */
800   3,	/* issue_rate  */
801   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
802   4,	/* function_align.  */
803   4,	/* jump_align.  */
804   4,	/* loop_align.  */
805   2,	/* int_reassoc_width.  */
806   4,	/* fp_reassoc_width.  */
807   1,	/* vec_reassoc_width.  */
808   2,	/* min_div_recip_mul_sf.  */
809   2,	/* min_div_recip_mul_df.  */
810   48,	/* max_case_values.  */
811   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
812   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
813   &exynosm1_prefetch_tune
814 };
815 
816 static const struct tune_params thunderxt88_tunings =
817 {
818   &thunderx_extra_costs,
819   &generic_addrcost_table,
820   &thunderx_regmove_cost,
821   &thunderx_vector_cost,
822   &generic_branch_cost,
823   &generic_approx_modes,
824   6, /* memmov_cost  */
825   2, /* issue_rate  */
826   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
827   8,	/* function_align.  */
828   8,	/* jump_align.  */
829   8,	/* loop_align.  */
830   2,	/* int_reassoc_width.  */
831   4,	/* fp_reassoc_width.  */
832   1,	/* vec_reassoc_width.  */
833   2,	/* min_div_recip_mul_sf.  */
834   2,	/* min_div_recip_mul_df.  */
835   0,	/* max_case_values.  */
836   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
837   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
838   &thunderxt88_prefetch_tune
839 };
840 
841 static const struct tune_params thunderx_tunings =
842 {
843   &thunderx_extra_costs,
844   &generic_addrcost_table,
845   &thunderx_regmove_cost,
846   &thunderx_vector_cost,
847   &generic_branch_cost,
848   &generic_approx_modes,
849   6, /* memmov_cost  */
850   2, /* issue_rate  */
851   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
852   8,	/* function_align.  */
853   8,	/* jump_align.  */
854   8,	/* loop_align.  */
855   2,	/* int_reassoc_width.  */
856   4,	/* fp_reassoc_width.  */
857   1,	/* vec_reassoc_width.  */
858   2,	/* min_div_recip_mul_sf.  */
859   2,	/* min_div_recip_mul_df.  */
860   0,	/* max_case_values.  */
861   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
862   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
863    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
864   &thunderx_prefetch_tune
865 };
866 
867 static const struct tune_params xgene1_tunings =
868 {
869   &xgene1_extra_costs,
870   &xgene1_addrcost_table,
871   &xgene1_regmove_cost,
872   &xgene1_vector_cost,
873   &generic_branch_cost,
874   &xgene1_approx_modes,
875   6, /* memmov_cost  */
876   4, /* issue_rate  */
877   AARCH64_FUSE_NOTHING, /* fusible_ops  */
878   16,	/* function_align.  */
879   8,	/* jump_align.  */
880   16,	/* loop_align.  */
881   2,	/* int_reassoc_width.  */
882   4,	/* fp_reassoc_width.  */
883   1,	/* vec_reassoc_width.  */
884   2,	/* min_div_recip_mul_sf.  */
885   2,	/* min_div_recip_mul_df.  */
886   0,	/* max_case_values.  */
887   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
888   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
889   &generic_prefetch_tune
890 };
891 
892 static const struct tune_params qdf24xx_tunings =
893 {
894   &qdf24xx_extra_costs,
895   &qdf24xx_addrcost_table,
896   &qdf24xx_regmove_cost,
897   &qdf24xx_vector_cost,
898   &generic_branch_cost,
899   &generic_approx_modes,
900   4, /* memmov_cost  */
901   4, /* issue_rate  */
902   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
904   16,	/* function_align.  */
905   8,	/* jump_align.  */
906   16,	/* loop_align.  */
907   2,	/* int_reassoc_width.  */
908   4,	/* fp_reassoc_width.  */
909   1,	/* vec_reassoc_width.  */
910   2,	/* min_div_recip_mul_sf.  */
911   2,	/* min_div_recip_mul_df.  */
912   0,	/* max_case_values.  */
913   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
914   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
915   &qdf24xx_prefetch_tune
916 };
917 
918 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
919    for now.  */
920 static const struct tune_params saphira_tunings =
921 {
922   &generic_extra_costs,
923   &generic_addrcost_table,
924   &generic_regmove_cost,
925   &generic_vector_cost,
926   &generic_branch_cost,
927   &generic_approx_modes,
928   4, /* memmov_cost  */
929   4, /* issue_rate  */
930   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
932   16,	/* function_align.  */
933   8,	/* jump_align.  */
934   16,	/* loop_align.  */
935   2,	/* int_reassoc_width.  */
936   4,	/* fp_reassoc_width.  */
937   1,	/* vec_reassoc_width.  */
938   2,	/* min_div_recip_mul_sf.  */
939   2,	/* min_div_recip_mul_df.  */
940   0,	/* max_case_values.  */
941   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
942   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
943   &generic_prefetch_tune
944 };
945 
946 static const struct tune_params thunderx2t99_tunings =
947 {
948   &thunderx2t99_extra_costs,
949   &thunderx2t99_addrcost_table,
950   &thunderx2t99_regmove_cost,
951   &thunderx2t99_vector_cost,
952   &generic_branch_cost,
953   &generic_approx_modes,
954   4, /* memmov_cost.  */
955   4, /* issue_rate.  */
956   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
957    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
958   16,	/* function_align.  */
959   8,	/* jump_align.  */
960   16,	/* loop_align.  */
961   3,	/* int_reassoc_width.  */
962   2,	/* fp_reassoc_width.  */
963   2,	/* vec_reassoc_width.  */
964   2,	/* min_div_recip_mul_sf.  */
965   2,	/* min_div_recip_mul_df.  */
966   0,	/* max_case_values.  */
967   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
968   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
969   &thunderx2t99_prefetch_tune
970 };
971 
972 /* Support for fine-grained override of the tuning structures.  */
973 struct aarch64_tuning_override_function
974 {
975   const char* name;
976   void (*parse_override)(const char*, struct tune_params*);
977 };
978 
979 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
980 static void aarch64_parse_tune_string (const char*, struct tune_params*);
981 
982 static const struct aarch64_tuning_override_function
983 aarch64_tuning_override_functions[] =
984 {
985   { "fuse", aarch64_parse_fuse_string },
986   { "tune", aarch64_parse_tune_string },
987   { NULL, NULL }
988 };
989 
990 /* A processor implementing AArch64.  */
991 struct processor
992 {
993   const char *const name;
994   enum aarch64_processor ident;
995   enum aarch64_processor sched_core;
996   enum aarch64_arch arch;
997   unsigned architecture_version;
998   const unsigned long flags;
999   const struct tune_params *const tune;
1000 };
1001 
1002 /* Architectures implementing AArch64.  */
1003 static const struct processor all_architectures[] =
1004 {
1005 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1006   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1007 #include "aarch64-arches.def"
1008   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1009 };
1010 
1011 /* Processor cores implementing AArch64.  */
1012 static const struct processor all_cores[] =
1013 {
1014 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1015   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
1016   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
1017   FLAGS, &COSTS##_tunings},
1018 #include "aarch64-cores.def"
1019   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1020     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1021   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1022 };
1023 
1024 
1025 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1026    handling code or by target attributes.  */
1027 static const struct processor *selected_arch;
1028 static const struct processor *selected_cpu;
1029 static const struct processor *selected_tune;
1030 
1031 /* The current tuning set.  */
1032 struct tune_params aarch64_tune_params = generic_tunings;
1033 
1034 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1035 
1036 /* An ISA extension in the co-processor and main instruction set space.  */
1037 struct aarch64_option_extension
1038 {
1039   const char *const name;
1040   const unsigned long flags_on;
1041   const unsigned long flags_off;
1042 };
1043 
1044 typedef enum aarch64_cond_code
1045 {
1046   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1047   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1048   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1049 }
1050 aarch64_cc;
1051 
1052 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1053 
1054 /* The condition codes of the processor, and the inverse function.  */
1055 static const char * const aarch64_condition_codes[] =
1056 {
1057   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1058   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1059 };
1060 
1061 /* Generate code to enable conditional branches in functions over 1 MiB.  */
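/* For example, when a conditional branch such as "b.eq .Ldest" is out of
   range, the caller passes the already-inverted branch in BRANCH_FORMAT and
   the emitted sequence is roughly (label names illustrative):
     b.ne .Lbcond
     b .Ldest
   .Lbcond:  */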
1062 const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1064 			const char * branch_format)
1065 {
1066     rtx_code_label * tmp_label = gen_label_rtx ();
1067     char label_buf[256];
1068     char buffer[128];
1069     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1070 				 CODE_LABEL_NUMBER (tmp_label));
1071     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1072     rtx dest_label = operands[pos_label];
1073     operands[pos_label] = tmp_label;
1074 
1075     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1076     output_asm_insn (buffer, operands);
1077 
1078     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1079     operands[pos_label] = dest_label;
1080     output_asm_insn (buffer, operands);
1081     return "";
1082 }
1083 
1084 void
aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1086 {
1087   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1088   if (TARGET_GENERAL_REGS_ONLY)
1089     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1090   else
1091     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1092 }
1093 
1094 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1095    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1096    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
1097    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1098    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
1099    irrespectively of its cost results in bad allocations with many redundant
1100    int<->FP moves which are expensive on various cores.
1101    To avoid this we don't allow ALL_REGS as the allocno class, but force a
1102    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
1103    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
1104    Otherwise set the allocno class depending on the mode.
1105    The result of this is that it is no longer inefficient to have a higher
1106    memory move cost than the register move cost.
1107 */
1108 
1109 static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1111 					 reg_class_t best_class)
1112 {
1113   machine_mode mode;
1114 
1115   if (allocno_class != ALL_REGS)
1116     return allocno_class;
1117 
1118   if (best_class != ALL_REGS)
1119     return best_class;
1120 
1121   mode = PSEUDO_REGNO_MODE (regno);
1122   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1123 }
1124 
1125 static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
1127 {
1128   if (GET_MODE_UNIT_SIZE (mode) == 4)
1129     return aarch64_tune_params.min_div_recip_mul_sf;
1130   return aarch64_tune_params.min_div_recip_mul_df;
1131 }
1132 
1133 /* Return the reassociation width of treeop OPC with mode MODE.  */
1134 static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
1136 {
1137   if (VECTOR_MODE_P (mode))
1138     return aarch64_tune_params.vec_reassoc_width;
1139   if (INTEGRAL_MODE_P (mode))
1140     return aarch64_tune_params.int_reassoc_width;
1141   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1142   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1143     return aarch64_tune_params.fp_reassoc_width;
1144   return 1;
1145 }
1146 
1147 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
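/* For example, with the AArch64 DWARF numbering used by the constants below,
   x5 maps to 5, sp to 31, v3 to 67 and p2 to 50 (illustrative values).  */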
1148 unsigned
aarch64_dbx_register_number (unsigned regno)
1150 {
1151    if (GP_REGNUM_P (regno))
1152      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1153    else if (regno == SP_REGNUM)
1154      return AARCH64_DWARF_SP;
1155    else if (FP_REGNUM_P (regno))
1156      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1157    else if (PR_REGNUM_P (regno))
1158      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1159    else if (regno == VG_REGNUM)
1160      return AARCH64_DWARF_VG;
1161 
1162    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1163       equivalent DWARF register.  */
1164    return DWARF_FRAME_REGISTERS;
1165 }
1166 
1167 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1168 static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
1170 {
1171   return (TARGET_SIMD
1172 	  && (mode == OImode || mode == CImode || mode == XImode));
1173 }
1174 
1175 /* Return true if MODE is an SVE predicate mode.  */
1176 static bool
aarch64_sve_pred_mode_p (machine_mode mode)
1178 {
1179   return (TARGET_SVE
1180 	  && (mode == VNx16BImode
1181 	      || mode == VNx8BImode
1182 	      || mode == VNx4BImode
1183 	      || mode == VNx2BImode));
1184 }
1185 
1186 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1187 const unsigned int VEC_ADVSIMD  = 1;
1188 const unsigned int VEC_SVE_DATA = 2;
1189 const unsigned int VEC_SVE_PRED = 4;
1190 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1191    a structure of 2, 3 or 4 vectors.  */
1192 const unsigned int VEC_STRUCT   = 8;
1193 /* Useful combinations of the above.  */
1194 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1195 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1196 
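/* For example (illustrative): with TARGET_SIMD, V8QImode and V4SImode are
   normally VEC_ADVSIMD and OImode is VEC_ADVSIMD | VEC_STRUCT; with
   TARGET_SVE, VNx4SImode is VEC_SVE_DATA and VNx4BImode is VEC_SVE_PRED.  */
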
1197 /* Return a set of flags describing the vector properties of mode MODE.
1198    Ignore modes that are not supported by the current target.  */
1199 static unsigned int
aarch64_classify_vector_mode (machine_mode mode)
1201 {
1202   if (aarch64_advsimd_struct_mode_p (mode))
1203     return VEC_ADVSIMD | VEC_STRUCT;
1204 
1205   if (aarch64_sve_pred_mode_p (mode))
1206     return VEC_SVE_PRED;
1207 
1208   scalar_mode inner = GET_MODE_INNER (mode);
1209   if (VECTOR_MODE_P (mode)
1210       && (inner == QImode
1211 	  || inner == HImode
1212 	  || inner == HFmode
1213 	  || inner == SImode
1214 	  || inner == SFmode
1215 	  || inner == DImode
1216 	  || inner == DFmode))
1217     {
1218       if (TARGET_SVE)
1219 	{
1220 	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1221 	    return VEC_SVE_DATA;
1222 	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1223 	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1224 	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1225 	    return VEC_SVE_DATA | VEC_STRUCT;
1226 	}
1227 
1228       /* This includes V1DF but not V1DI (which doesn't exist).  */
1229       if (TARGET_SIMD
1230 	  && (known_eq (GET_MODE_BITSIZE (mode), 64)
1231 	      || known_eq (GET_MODE_BITSIZE (mode), 128)))
1232 	return VEC_ADVSIMD;
1233     }
1234 
1235   return 0;
1236 }
1237 
1238 /* Return true if MODE is any of the data vector modes, including
1239    structure modes.  */
1240 static bool
aarch64_vector_data_mode_p (machine_mode mode)
1242 {
1243   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1244 }
1245 
1246 /* Return true if MODE is an SVE data vector mode; either a single vector
1247    or a structure of vectors.  */
1248 static bool
aarch64_sve_data_mode_p (machine_mode mode)
1250 {
1251   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1252 }
1253 
1254 /* Implement target hook TARGET_ARRAY_MODE.  */
1255 static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1257 {
1258   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1259       && IN_RANGE (nelems, 2, 4))
1260     return mode_for_vector (GET_MODE_INNER (mode),
1261 			    GET_MODE_NUNITS (mode) * nelems);
1262 
1263   return opt_machine_mode ();
1264 }
1265 
1266 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1267 static bool
aarch64_array_mode_supported_p (machine_mode mode,
1269 				unsigned HOST_WIDE_INT nelems)
1270 {
1271   if (TARGET_SIMD
1272       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1273 	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
1274       && (nelems >= 2 && nelems <= 4))
1275     return true;
1276 
1277   return false;
1278 }
1279 
1280 /* Return the SVE predicate mode to use for elements that have
1281    ELEM_NBYTES bytes, if such a mode exists.  */
1282 
1283 opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
1285 {
1286   if (TARGET_SVE)
1287     {
1288       if (elem_nbytes == 1)
1289 	return VNx16BImode;
1290       if (elem_nbytes == 2)
1291 	return VNx8BImode;
1292       if (elem_nbytes == 4)
1293 	return VNx4BImode;
1294       if (elem_nbytes == 8)
1295 	return VNx2BImode;
1296     }
1297   return opt_machine_mode ();
1298 }
1299 
1300 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1301 
1302 static opt_machine_mode
aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1304 {
1305   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1306     {
1307       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1308       machine_mode pred_mode;
1309       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1310 	return pred_mode;
1311     }
1312 
1313   return default_get_mask_mode (nunits, nbytes);
1314 }
1315 
1316 /* Implement TARGET_HARD_REGNO_NREGS.  */
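/* For example, a 16-byte TImode value occupies two GP registers
   (CEIL (16, UNITS_PER_WORD)) but a single vector register
   (CEIL (16, UNITS_PER_VREG)); SVE data modes instead count whole SVE
   vectors.  (Illustrative, based on the cases handled below.)  */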
1317 
1318 static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1320 {
1321   /* ??? Logically we should only need to provide a value when
1322      HARD_REGNO_MODE_OK says that the combination is valid,
1323      but at the moment we need to handle all modes.  Just ignore
1324      any runtime parts for registers that can't store them.  */
1325   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1326   switch (aarch64_regno_regclass (regno))
1327     {
1328     case FP_REGS:
1329     case FP_LO_REGS:
1330       if (aarch64_sve_data_mode_p (mode))
1331 	return exact_div (GET_MODE_SIZE (mode),
1332 			  BYTES_PER_SVE_VECTOR).to_constant ();
1333       return CEIL (lowest_size, UNITS_PER_VREG);
1334     case PR_REGS:
1335     case PR_LO_REGS:
1336     case PR_HI_REGS:
1337       return 1;
1338     default:
1339       return CEIL (lowest_size, UNITS_PER_WORD);
1340     }
1341   gcc_unreachable ();
1342 }
1343 
1344 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1345 
1346 static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1348 {
1349   if (GET_MODE_CLASS (mode) == MODE_CC)
1350     return regno == CC_REGNUM;
1351 
1352   if (regno == VG_REGNUM)
1353     /* This must have the same size as _Unwind_Word.  */
1354     return mode == DImode;
1355 
1356   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1357   if (vec_flags & VEC_SVE_PRED)
1358     return PR_REGNUM_P (regno);
1359 
1360   if (PR_REGNUM_P (regno))
1361     return 0;
1362 
1363   if (regno == SP_REGNUM)
1364     /* The purpose of comparing with ptr_mode is to support the
1365        global register variable associated with the stack pointer
1366        register via the syntax of asm ("wsp") in ILP32.  */
1367     return mode == Pmode || mode == ptr_mode;
1368 
1369   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1370     return mode == Pmode;
1371 
1372   if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1373     return true;
1374 
1375   if (FP_REGNUM_P (regno))
1376     {
1377       if (vec_flags & VEC_STRUCT)
1378 	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1379       else
1380 	return !VECTOR_MODE_P (mode) || vec_flags != 0;
1381     }
1382 
1383   return false;
1384 }
1385 
1386 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1387    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1388    clobbers the top 64 bits when restoring the bottom 64 bits.  */
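/* For example, a V2DImode value held in v8 across a call only has its low
   64 bits (d8) preserved by the callee under the AAPCS64, so the upper half
   must be treated as call-clobbered (an illustrative case).  */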
1389 
1390 static bool
aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1392 {
1393   return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1394 }
1395 
1396 /* Implement REGMODE_NATURAL_SIZE.  */
1397 poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
1399 {
1400   /* The natural size for SVE data modes is one SVE data vector,
1401      and similarly for predicates.  We can't independently modify
1402      anything smaller than that.  */
1403   /* ??? For now, only do this for variable-width SVE registers.
1404      Doing it for constant-sized registers breaks lower-subreg.c.  */
1405   /* ??? And once that's fixed, we should probably have similar
1406      code for Advanced SIMD.  */
1407   if (!aarch64_sve_vg.is_constant ())
1408     {
1409       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1410       if (vec_flags & VEC_SVE_PRED)
1411 	return BYTES_PER_SVE_PRED;
1412       if (vec_flags & VEC_SVE_DATA)
1413 	return BYTES_PER_SVE_VECTOR;
1414     }
1415   return UNITS_PER_WORD;
1416 }
1417 
1418 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1419 machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1421 				     machine_mode mode)
1422 {
1423   /* The predicate mode determines which bits are significant and
1424      which are "don't care".  Decreasing the number of lanes would
1425      lose data while increasing the number of lanes would make bits
1426      unnecessarily significant.  */
1427   if (PR_REGNUM_P (regno))
1428     return mode;
1429   if (known_ge (GET_MODE_SIZE (mode), 4))
1430     return mode;
1431   else
1432     return SImode;
1433 }
1434 
1435 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1436    that strcpy from constants will be faster.  */
1437 
1438 static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1440 {
1441   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1442     return MAX (align, BITS_PER_WORD);
1443   return align;
1444 }
1445 
1446 /* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
1448 static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1450 {
1451   return false;
1452 }
1453 
1454 /* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
1456 bool
aarch64_is_long_call_p (rtx sym)
1458 {
1459   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1460 }
1461 
1462 /* Return true if calls to symbol-ref SYM should not go through
1463    plt stubs.  */
1464 
1465 bool
aarch64_is_noplt_call_p (rtx sym)
1467 {
1468   const_tree decl = SYMBOL_REF_DECL (sym);
1469 
1470   if (flag_pic
1471       && decl
1472       && (!flag_plt
1473 	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1474       && !targetm.binds_local_p (decl))
1475     return true;
1476 
1477   return false;
1478 }
1479 
1480 /* Return true if the offsets to a zero/sign-extract operation
1481    represent an expression that matches an extend operation.  The
   operands represent the parameters from
1483 
1484    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1485 bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1487 				rtx extract_imm)
1488 {
1489   HOST_WIDE_INT mult_val, extract_val;
1490 
1491   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1492     return false;
1493 
1494   mult_val = INTVAL (mult_imm);
1495   extract_val = INTVAL (extract_imm);
1496 
1497   if (extract_val > 8
1498       && extract_val < GET_MODE_BITSIZE (mode)
1499       && exact_log2 (extract_val & ~7) > 0
1500       && (extract_val & 7) <= 4
1501       && mult_val == (1 << (extract_val & 7)))
1502     return true;
1503 
1504   return false;
1505 }
1506 
1507 /* Emit an insn that's a simple single-set.  Both the operands must be
1508    known to be valid.  */
1509 inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
1511 {
1512   return emit_insn (gen_rtx_SET (x, y));
1513 }
1514 
1515 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1516    return the rtx for register 0 in the proper mode.  */
1517 rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1519 {
1520   machine_mode mode = SELECT_CC_MODE (code, x, y);
1521   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1522 
1523   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1524   return cc_reg;
1525 }
1526 
1527 /* Build the SYMBOL_REF for __tls_get_addr.  */
1528 
1529 static GTY(()) rtx tls_get_addr_libfunc;
1530 
1531 rtx
aarch64_tls_get_addr (void)
1533 {
1534   if (!tls_get_addr_libfunc)
1535     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1536   return tls_get_addr_libfunc;
1537 }
1538 
1539 /* Return the TLS model to use for ADDR.  */
1540 
1541 static enum tls_model
tls_symbolic_operand_type (rtx addr)
1543 {
1544   enum tls_model tls_kind = TLS_MODEL_NONE;
1545   if (GET_CODE (addr) == CONST)
1546     {
1547       poly_int64 addend;
1548       rtx sym = strip_offset (addr, &addend);
1549       if (GET_CODE (sym) == SYMBOL_REF)
1550 	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1551     }
1552   else if (GET_CODE (addr) == SYMBOL_REF)
1553     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1554 
1555   return tls_kind;
1556 }
1557 
/* We allow LO_SUMs in our legitimate addresses so that combine can
   take care of combining addresses where necessary, but for code
   generation purposes we generate the address as:
1562    RTL                               Absolute
1563    tmp = hi (symbol_ref);            adrp  x1, foo
1564    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1565                                      nop
1566 
1567    PIC                               TLS
1568    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1569    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1570                                      bl   __tls_get_addr
1571                                      nop
1572 
1573    Load TLS symbol, depending on TLS mechanism and TLS access model.
1574 
1575    Global Dynamic - Traditional TLS:
1576    adrp tmp, :tlsgd:imm
1577    add  dest, tmp, #:tlsgd_lo12:imm
1578    bl   __tls_get_addr
1579 
1580    Global Dynamic - TLS Descriptors:
1581    adrp dest, :tlsdesc:imm
1582    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1583    add  dest, dest, #:tlsdesc_lo12:imm
1584    blr  tmp
1585    mrs  tp, tpidr_el0
1586    add  dest, dest, tp
1587 
1588    Initial Exec:
1589    mrs  tp, tpidr_el0
1590    adrp tmp, :gottprel:imm
1591    ldr  dest, [tmp, #:gottprel_lo12:imm]
1592    add  dest, dest, tp
1593 
1594    Local Exec:
1595    mrs  tp, tpidr_el0
1596    add  t0, tp, #:tprel_hi12:imm, lsl #12
1597    add  t0, t0, #:tprel_lo12_nc:imm
1598 */
1599 
1600 static void
aarch64_load_symref_appropriately (rtx dest, rtx imm,
1602 				   enum aarch64_symbol_type type)
1603 {
1604   switch (type)
1605     {
1606     case SYMBOL_SMALL_ABSOLUTE:
1607       {
1608 	/* In ILP32, the mode of dest can be either SImode or DImode.  */
1609 	rtx tmp_reg = dest;
1610 	machine_mode mode = GET_MODE (dest);
1611 
1612 	gcc_assert (mode == Pmode || mode == ptr_mode);
1613 
1614 	if (can_create_pseudo_p ())
1615 	  tmp_reg = gen_reg_rtx (mode);
1616 
1617 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1618 	emit_insn (gen_add_losym (dest, tmp_reg, imm));
1619 	return;
1620       }
1621 
1622     case SYMBOL_TINY_ABSOLUTE:
1623       emit_insn (gen_rtx_SET (dest, imm));
1624       return;
1625 
1626     case SYMBOL_SMALL_GOT_28K:
1627       {
1628 	machine_mode mode = GET_MODE (dest);
1629 	rtx gp_rtx = pic_offset_table_rtx;
1630 	rtx insn;
1631 	rtx mem;
1632 
	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before RTL expansion.  Tree IVOPTS generates RTL patterns to
	   compute rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  In that case there is no need to generate the first
	   adrp instruction, as the final cost of a global variable access
	   is one instruction.  */
1639 	if (gp_rtx != NULL)
1640 	  {
1641 	    /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1642 	       using the page base as GOT base, the first page may be wasted,
1643 	       in the worst scenario, there is only 28K space for GOT).
1644 
1645 	       The generate instruction sequence for accessing global variable
1646 	       is:
1647 
1648 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1649 
1650 	       Only one instruction is needed, but we must initialize
1651 	       pic_offset_table_rtx properly.  We generate an initialization
1652 	       insn for every global access and let CSE remove the redundant ones.
1653 
1654 	       The final instruction sequence for accessing multiple global
1655 	       variables will look like this:
1656 
1657 		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1658 
1659 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1660 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1661 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1662 		 ...  */
1663 
1664 	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1665 	    crtl->uses_pic_offset_table = 1;
1666 	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1667 
1668 	    if (mode != GET_MODE (gp_rtx))
1669 	      gp_rtx = gen_lowpart (mode, gp_rtx);
1670 
1671 	  }
1672 
1673 	if (mode == ptr_mode)
1674 	  {
1675 	    if (mode == DImode)
1676 	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1677 	    else
1678 	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1679 
1680 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
1681 	  }
1682 	else
1683 	  {
1684 	    gcc_assert (mode == Pmode);
1685 
1686 	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1687 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1688 	  }
1689 
1690 	/* The operand is expected to be a MEM.  Whenever the related insn
1691 	   pattern changes, the code above that calculates MEM should be
1692 	   updated.  */
1693 	gcc_assert (GET_CODE (mem) == MEM);
1694 	MEM_READONLY_P (mem) = 1;
1695 	MEM_NOTRAP_P (mem) = 1;
1696 	emit_insn (insn);
1697 	return;
1698       }
1699 
1700     case SYMBOL_SMALL_GOT_4G:
1701       {
1702 	/* In ILP32, the mode of dest can be either SImode or DImode,
1703 	   while the got entry is always of SImode size.  The mode of
1704 	   dest depends on how dest is used: if dest is assigned to a
1705 	   pointer (e.g. stored in memory), it has SImode; it may have
1706 	   DImode if dest is dereferenced to access the memory.
1707 	   This is why we have to handle three different ldr_got_small
1708 	   patterns here (two patterns for ILP32).  */
1709 
1710 	rtx insn;
1711 	rtx mem;
1712 	rtx tmp_reg = dest;
1713 	machine_mode mode = GET_MODE (dest);
1714 
1715 	if (can_create_pseudo_p ())
1716 	  tmp_reg = gen_reg_rtx (mode);
1717 
1718 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1719 	if (mode == ptr_mode)
1720 	  {
1721 	    if (mode == DImode)
1722 	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1723 	    else
1724 	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1725 
1726 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
1727 	  }
1728 	else
1729 	  {
1730 	    gcc_assert (mode == Pmode);
1731 
1732 	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1733 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1734 	  }
1735 
1736 	gcc_assert (GET_CODE (mem) == MEM);
1737 	MEM_READONLY_P (mem) = 1;
1738 	MEM_NOTRAP_P (mem) = 1;
1739 	emit_insn (insn);
1740 	return;
1741       }
1742 
1743     case SYMBOL_SMALL_TLSGD:
1744       {
1745 	rtx_insn *insns;
1746 	machine_mode mode = GET_MODE (dest);
1747 	rtx result = gen_rtx_REG (mode, R0_REGNUM);
1748 
1749 	start_sequence ();
1750 	if (TARGET_ILP32)
1751 	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1752 	else
1753 	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1754 	insns = get_insns ();
1755 	end_sequence ();
1756 
1757 	RTL_CONST_CALL_P (insns) = 1;
1758 	emit_libcall_block (insns, dest, result, imm);
1759 	return;
1760       }
1761 
1762     case SYMBOL_SMALL_TLSDESC:
1763       {
1764 	machine_mode mode = GET_MODE (dest);
1765 	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1766 	rtx tp;
1767 
1768 	gcc_assert (mode == Pmode || mode == ptr_mode);
1769 
1770 	/* In ILP32, the got entry is always of SImode size.  Unlike
1771 	   small GOT, the dest is fixed at reg 0.  */
1772 	if (TARGET_ILP32)
1773 	  emit_insn (gen_tlsdesc_small_si (imm));
1774 	else
1775 	  emit_insn (gen_tlsdesc_small_di (imm));
1776 	tp = aarch64_load_tp (NULL);
1777 
1778 	if (mode != Pmode)
1779 	  tp = gen_lowpart (mode, tp);
1780 
1781 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1782 	if (REG_P (dest))
1783 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1784 	return;
1785       }
1786 
1787     case SYMBOL_SMALL_TLSIE:
1788       {
1789 	/* In ILP32, the mode of dest can be either SImode or DImode,
1790 	   while the got entry is always of SImode size.  The mode of
1791 	   dest depends on how dest is used: if dest is assigned to a
1792 	   pointer (e.g. stored in memory), it has SImode; it may have
1793 	   DImode if dest is dereferenced to access the memory.
1794 	   This is why we have to handle three different tlsie_small
1795 	   patterns here (two patterns for ILP32).  */
1796 	machine_mode mode = GET_MODE (dest);
1797 	rtx tmp_reg = gen_reg_rtx (mode);
1798 	rtx tp = aarch64_load_tp (NULL);
1799 
1800 	if (mode == ptr_mode)
1801 	  {
1802 	    if (mode == DImode)
1803 	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1804 	    else
1805 	      {
1806 		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1807 		tp = gen_lowpart (mode, tp);
1808 	      }
1809 	  }
1810 	else
1811 	  {
1812 	    gcc_assert (mode == Pmode);
1813 	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1814 	  }
1815 
1816 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1817 	if (REG_P (dest))
1818 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1819 	return;
1820       }
1821 
1822     case SYMBOL_TLSLE12:
1823     case SYMBOL_TLSLE24:
1824     case SYMBOL_TLSLE32:
1825     case SYMBOL_TLSLE48:
1826       {
1827 	machine_mode mode = GET_MODE (dest);
1828 	rtx tp = aarch64_load_tp (NULL);
1829 
1830 	if (mode != Pmode)
1831 	  tp = gen_lowpart (mode, tp);
1832 
1833 	switch (type)
1834 	  {
1835 	  case SYMBOL_TLSLE12:
1836 	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1837 			(dest, tp, imm));
1838 	    break;
1839 	  case SYMBOL_TLSLE24:
1840 	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1841 			(dest, tp, imm));
1842 	    break;
1843 	  case SYMBOL_TLSLE32:
1844 	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1845 			(dest, imm));
1846 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1847 			(dest, dest, tp));
1848 	    break;
1849 	  case SYMBOL_TLSLE48:
1850 	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1851 			(dest, imm));
1852 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1853 			(dest, dest, tp));
1854 	    break;
1855 	  default:
1856 	    gcc_unreachable ();
1857 	  }
1858 
1859 	if (REG_P (dest))
1860 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1861 	return;
1862       }
1863 
1864     case SYMBOL_TINY_GOT:
1865       emit_insn (gen_ldr_got_tiny (dest, imm));
1866       return;
1867 
1868     case SYMBOL_TINY_TLSIE:
1869       {
1870 	machine_mode mode = GET_MODE (dest);
1871 	rtx tp = aarch64_load_tp (NULL);
1872 
1873 	if (mode == ptr_mode)
1874 	  {
1875 	    if (mode == DImode)
1876 	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1877 	    else
1878 	      {
1879 		tp = gen_lowpart (mode, tp);
1880 		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1881 	      }
1882 	  }
1883 	else
1884 	  {
1885 	    gcc_assert (mode == Pmode);
1886 	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1887 	  }
1888 
1889 	if (REG_P (dest))
1890 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1891 	return;
1892       }
1893 
1894     default:
1895       gcc_unreachable ();
1896     }
1897 }
1898 
1899 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1900    handle all moves if !can_create_pseudo_p ().  The distinction is
1901    important because, unlike emit_move_insn, the move expanders know
1902    how to force Pmode objects into the constant pool even when the
1903    constant pool address is not itself legitimate.  */
1904 static rtx
1905 aarch64_emit_move (rtx dest, rtx src)
1906 {
1907   return (can_create_pseudo_p ()
1908 	  ? emit_move_insn (dest, src)
1909 	  : emit_move_insn_1 (dest, src));
1910 }
1911 
1912 /* Split a 128-bit move operation into two 64-bit move operations,
1913    taking care to handle partial overlap of register to register
1914    copies.  Special cases are needed when moving between GP regs and
1915    FP regs.  SRC can be a register, constant or memory; DST a register
1916    or memory.  If either operand is memory it must not have any side
1917    effects.  */
1918 void
1919 aarch64_split_128bit_move (rtx dst, rtx src)
1920 {
1921   rtx dst_lo, dst_hi;
1922   rtx src_lo, src_hi;
1923 
1924   machine_mode mode = GET_MODE (dst);
1925 
1926   gcc_assert (mode == TImode || mode == TFmode);
1927   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1928   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1929 
1930   if (REG_P (dst) && REG_P (src))
1931     {
1932       int src_regno = REGNO (src);
1933       int dst_regno = REGNO (dst);
1934 
1935       /* Handle FP <-> GP regs.  */
1936       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1937 	{
1938 	  src_lo = gen_lowpart (word_mode, src);
1939 	  src_hi = gen_highpart (word_mode, src);
1940 
1941 	  if (mode == TImode)
1942 	    {
1943 	      emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1944 	      emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1945 	    }
1946 	  else
1947 	    {
1948 	      emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1949 	      emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1950 	    }
1951 	  return;
1952 	}
1953       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1954 	{
1955 	  dst_lo = gen_lowpart (word_mode, dst);
1956 	  dst_hi = gen_highpart (word_mode, dst);
1957 
1958 	  if (mode == TImode)
1959 	    {
1960 	      emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1961 	      emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1962 	    }
1963 	  else
1964 	    {
1965 	      emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1966 	      emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1967 	    }
1968 	  return;
1969 	}
1970     }
1971 
1972   dst_lo = gen_lowpart (word_mode, dst);
1973   dst_hi = gen_highpart (word_mode, dst);
1974   src_lo = gen_lowpart (word_mode, src);
1975   src_hi = gen_highpart_mode (word_mode, mode, src);
1976 
1977   /* At most one pairing may overlap.  */
1978   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1979     {
1980       aarch64_emit_move (dst_hi, src_hi);
1981       aarch64_emit_move (dst_lo, src_lo);
1982     }
1983   else
1984     {
1985       aarch64_emit_move (dst_lo, src_lo);
1986       aarch64_emit_move (dst_hi, src_hi);
1987     }
1988 }
1989 
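/* Return true if a 128-bit move from SRC to DST should be split into
   two 64-bit halves; this is the case unless both operands are FP
   registers, in which case the move can be done with a single
   instruction.  */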
1990 bool
1991 aarch64_split_128bit_move_p (rtx dst, rtx src)
1992 {
1993   return (! REG_P (src)
1994 	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1995 }
1996 
1997 /* Split a complex SIMD combine.  */
1998 
1999 void
2000 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2001 {
2002   machine_mode src_mode = GET_MODE (src1);
2003   machine_mode dst_mode = GET_MODE (dst);
2004 
2005   gcc_assert (VECTOR_MODE_P (dst_mode));
2006   gcc_assert (register_operand (dst, dst_mode)
2007 	      && register_operand (src1, src_mode)
2008 	      && register_operand (src2, src_mode));
2009 
2010   rtx (*gen) (rtx, rtx, rtx);
2011 
2012   switch (src_mode)
2013     {
2014     case E_V8QImode:
2015       gen = gen_aarch64_simd_combinev8qi;
2016       break;
2017     case E_V4HImode:
2018       gen = gen_aarch64_simd_combinev4hi;
2019       break;
2020     case E_V2SImode:
2021       gen = gen_aarch64_simd_combinev2si;
2022       break;
2023     case E_V4HFmode:
2024       gen = gen_aarch64_simd_combinev4hf;
2025       break;
2026     case E_V2SFmode:
2027       gen = gen_aarch64_simd_combinev2sf;
2028       break;
2029     case E_DImode:
2030       gen = gen_aarch64_simd_combinedi;
2031       break;
2032     case E_DFmode:
2033       gen = gen_aarch64_simd_combinedf;
2034       break;
2035     default:
2036       gcc_unreachable ();
2037     }
2038 
2039   emit_insn (gen (dst, src1, src2));
2040   return;
2041 }
2042 
2043 /* Split a complex SIMD move.  */
2044 
2045 void
2046 aarch64_split_simd_move (rtx dst, rtx src)
2047 {
2048   machine_mode src_mode = GET_MODE (src);
2049   machine_mode dst_mode = GET_MODE (dst);
2050 
2051   gcc_assert (VECTOR_MODE_P (dst_mode));
2052 
2053   if (REG_P (dst) && REG_P (src))
2054     {
2055       rtx (*gen) (rtx, rtx);
2056 
2057       gcc_assert (VECTOR_MODE_P (src_mode));
2058 
2059       switch (src_mode)
2060 	{
2061 	case E_V16QImode:
2062 	  gen = gen_aarch64_split_simd_movv16qi;
2063 	  break;
2064 	case E_V8HImode:
2065 	  gen = gen_aarch64_split_simd_movv8hi;
2066 	  break;
2067 	case E_V4SImode:
2068 	  gen = gen_aarch64_split_simd_movv4si;
2069 	  break;
2070 	case E_V2DImode:
2071 	  gen = gen_aarch64_split_simd_movv2di;
2072 	  break;
2073 	case E_V8HFmode:
2074 	  gen = gen_aarch64_split_simd_movv8hf;
2075 	  break;
2076 	case E_V4SFmode:
2077 	  gen = gen_aarch64_split_simd_movv4sf;
2078 	  break;
2079 	case E_V2DFmode:
2080 	  gen = gen_aarch64_split_simd_movv2df;
2081 	  break;
2082 	default:
2083 	  gcc_unreachable ();
2084 	}
2085 
2086       emit_insn (gen (dst, src));
2087       return;
2088     }
2089 }
2090 
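/* Return true if the constant X, of mode XMODE, is equal to the
   constant Y, of mode YMODE, zero-extended to XMODE.  */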
2091 bool
2092 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2093 			      machine_mode ymode, rtx y)
2094 {
2095   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2096   gcc_assert (r != NULL);
2097   return rtx_equal_p (x, r);
2098 }
2099 
2100 
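/* Return a register of mode MODE that contains VALUE.  Create a new
   pseudo when possible; otherwise copy VALUE into X (which must be
   nonnull in that case) and return X.  */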
2101 static rtx
2102 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2103 {
2104   if (can_create_pseudo_p ())
2105     return force_reg (mode, value);
2106   else
2107     {
2108       gcc_assert (x);
2109       aarch64_emit_move (x, value);
2110       return x;
2111     }
2112 }
2113 
2114 /* Return true if we can move VALUE into a register using a single
2115    CNT[BHWD] instruction.  */
2116 
2117 static bool
2118 aarch64_sve_cnt_immediate_p (poly_int64 value)
2119 {
2120   HOST_WIDE_INT factor = value.coeffs[0];
2121   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
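  /* For example, 2 * the number of 128-bit quadwords can be loaded with
     a single CNTD and 16 * the number of quadwords (the vector length
     in bytes) with a single CNTB, whereas an odd multiple such as
     3 * the number of quadwords cannot.  */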
2122   return (value.coeffs[1] == factor
2123 	  && IN_RANGE (factor, 2, 16 * 16)
2124 	  && (factor & 1) == 0
2125 	  && factor <= 16 * (factor & -factor));
2126 }
2127 
2128 /* Likewise for rtx X.  */
2129 
2130 bool
2131 aarch64_sve_cnt_immediate_p (rtx x)
2132 {
2133   poly_int64 value;
2134   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2135 }
2136 
2137 /* Return the asm string for an instruction with a CNT-like vector size
2138    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2139    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2140    first part of the operands template (the part that comes before the
2141    vector size itself).  FACTOR is the number of quadwords.
2142    NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2143    If it is zero, we can use any element size.  */
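/* For example, FACTOR == 2 with NELTS_PER_VQ == 0 is printed as
   "<prefix>d\t<operands>", while FACTOR == 8 with NELTS_PER_VQ == 2 is
   printed as "<prefix>d\t<operands>, all, mul #4".  */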
2144 
2145 static char *
2146 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2147 				  unsigned int factor,
2148 				  unsigned int nelts_per_vq)
2149 {
2150   static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2151 
2152   if (nelts_per_vq == 0)
2153     /* There is some overlap in the ranges of the four CNT instructions.
2154        Here we always use the smallest possible element size, so that the
2155        multiplier is 1 wherever possible.  */
2156     nelts_per_vq = factor & -factor;
2157   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2158   gcc_assert (IN_RANGE (shift, 1, 4));
2159   char suffix = "dwhb"[shift - 1];
2160 
2161   factor >>= shift;
2162   unsigned int written;
2163   if (factor == 1)
2164     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2165 			prefix, suffix, operands);
2166   else
2167     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2168 			prefix, suffix, operands, factor);
2169   gcc_assert (written < sizeof (buffer));
2170   return buffer;
2171 }
2172 
2173 /* Return the asm string for an instruction with a CNT-like vector size
2174    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2175    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2176    first part of the operands template (the part that comes before the
2177    vector size itself).  X is the value of the vector size operand,
2178    as a polynomial integer rtx.  */
2179 
2180 char *
2181 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2182 				  rtx x)
2183 {
2184   poly_int64 value = rtx_to_poly_int64 (x);
2185   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2186   return aarch64_output_sve_cnt_immediate (prefix, operands,
2187 					   value.coeffs[1], 0);
2188 }
2189 
2190 /* Return true if we can add VALUE to a register using a single ADDVL
2191    or ADDPL instruction.  */
2192 
2193 static bool
2194 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2195 {
2196   HOST_WIDE_INT factor = value.coeffs[0];
2197   if (factor == 0 || value.coeffs[1] != factor)
2198     return false;
2199   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2200      and a value of 16 is one vector width.  */
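  /* For example, ADDVL accepts multiples of the vector width in the
     range [-32, 31] vectors, while ADDPL accepts multiples of the
     predicate width in the range [-32, 31] predicates.  */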
2201   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2202 	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2203 }
2204 
2205 /* Likewise for rtx X.  */
2206 
2207 bool
2208 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2209 {
2210   poly_int64 value;
2211   return (poly_int_rtx_p (x, &value)
2212 	  && aarch64_sve_addvl_addpl_immediate_p (value));
2213 }
2214 
2215 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to
2216    operand 1 and storing the result in operand 0.  */
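/* For example, an offset of one full vector is printed as
   "addvl\t%x0, %x1, #1" and an offset of minus one predicate register
   as "addpl\t%x0, %x1, #-1"; when the destination is the same GP
   register as the base, INCB/DECB and friends are used instead where
   possible.  */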
2217 
2218 char *
2219 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2220 {
2221   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2222   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2223   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2224 
2225   /* Use INC or DEC if possible.  */
2226   if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2227     {
2228       if (aarch64_sve_cnt_immediate_p (offset_value))
2229 	return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2230 						 offset_value.coeffs[1], 0);
2231       if (aarch64_sve_cnt_immediate_p (-offset_value))
2232 	return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2233 						 -offset_value.coeffs[1], 0);
2234     }
2235 
2236   int factor = offset_value.coeffs[1];
2237   if ((factor & 15) == 0)
2238     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2239   else
2240     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2241   return buffer;
2242 }
2243 
2244 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2245    instruction.  If it is, store the number of elements in each vector
2246    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2247    factor in *FACTOR_OUT (if nonnull).  */
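/* For example, a VNx2DI constant in which every element equals twice
   the number of 128-bit quadwords can be handled by a single INCD with
   a multiplier of one, since each quadword holds two 64-bit
   elements.  */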
2248 
2249 bool
2250 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2251 				 unsigned int *nelts_per_vq_out)
2252 {
2253   rtx elt;
2254   poly_int64 value;
2255 
2256   if (!const_vec_duplicate_p (x, &elt)
2257       || !poly_int_rtx_p (elt, &value))
2258     return false;
2259 
2260   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2261   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2262     /* There's no vector INCB.  */
2263     return false;
2264 
2265   HOST_WIDE_INT factor = value.coeffs[0];
2266   if (value.coeffs[1] != factor)
2267     return false;
2268 
2269   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
2270   if ((factor % nelts_per_vq) != 0
2271       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2272     return false;
2273 
2274   if (factor_out)
2275     *factor_out = factor;
2276   if (nelts_per_vq_out)
2277     *nelts_per_vq_out = nelts_per_vq;
2278   return true;
2279 }
2280 
2281 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2282    instruction.  */
2283 
2284 bool
2285 aarch64_sve_inc_dec_immediate_p (rtx x)
2286 {
2287   return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2288 }
2289 
2290 /* Return the asm template for an SVE vector INC or DEC instruction.
2291    OPERANDS gives the operands before the vector count and X is the
2292    value of the vector count operand itself.  */
2293 
2294 char *
2295 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2296 {
2297   int factor;
2298   unsigned int nelts_per_vq;
2299   if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2300     gcc_unreachable ();
2301   if (factor < 0)
2302     return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2303 					     nelts_per_vq);
2304   else
2305     return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2306 					     nelts_per_vq);
2307 }
2308 
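/* Return the number of instructions required to move immediate IMM
   (of mode MODE) into DEST, and emit those instructions if GENERATE
   is true.  */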
2309 static int
2310 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2311 				scalar_int_mode mode)
2312 {
2313   int i;
2314   unsigned HOST_WIDE_INT val, val2, mask;
2315   int one_match, zero_match;
2316   int num_insns;
2317 
2318   val = INTVAL (imm);
2319 
2320   if (aarch64_move_imm (val, mode))
2321     {
2322       if (generate)
2323 	emit_insn (gen_rtx_SET (dest, imm));
2324       return 1;
2325     }
2326 
2327   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2328      (with XXXX non-zero). In that case check to see if the move can be done in
2329      a smaller mode.  */
2330   val2 = val & 0xffffffff;
2331   if (mode == DImode
2332       && aarch64_move_imm (val2, SImode)
2333       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2334     {
2335       if (generate)
2336 	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2337 
2338       /* Check whether we have to emit a second instruction by seeing
2339          whether any of the upper 32 bits of the original DImode value are set.  */
2340       if (val == val2)
2341 	return 1;
2342 
2343       i = (val >> 48) ? 48 : 32;
2344 
2345       if (generate)
2346 	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2347 				    GEN_INT ((val >> i) & 0xffff)));
2348 
2349       return 2;
2350     }
2351 
2352   if ((val >> 32) == 0 || mode == SImode)
2353     {
2354       if (generate)
2355 	{
2356 	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2357 	  if (mode == SImode)
2358 	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2359 				       GEN_INT ((val >> 16) & 0xffff)));
2360 	  else
2361 	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2362 				       GEN_INT ((val >> 16) & 0xffff)));
2363 	}
2364       return 2;
2365     }
2366 
2367   /* Remaining cases are all for DImode.  */
2368 
2369   mask = 0xffff;
2370   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2371     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2372   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2373     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2374 
2375   if (zero_match != 2 && one_match != 2)
2376     {
2377       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2378 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
2379 	 zeroes creates a valid bitmask.  To check any repeated bitmask,
2380 	 try using 16 bits from the other 32-bit half of val.  */
2381 
2382       for (i = 0; i < 64; i += 16, mask <<= 16)
2383 	{
2384 	  val2 = val & ~mask;
2385 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2386 	    break;
2387 	  val2 = val | mask;
2388 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2389 	    break;
2390 	  val2 = val2 & ~mask;
2391 	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2392 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2393 	    break;
2394 	}
2395       if (i != 64)
2396 	{
2397 	  if (generate)
2398 	    {
2399 	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2400 	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2401 					 GEN_INT ((val >> i) & 0xffff)));
2402 	    }
2403 	  return 2;
2404 	}
2405     }
2406 
2407   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2408      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
2409      otherwise skip zero bits.  */
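  /* For example, 0x1234ffff00005678 has one_match == zero_match == 1
     and is built from a MOV of 0x5678 followed by MOVKs inserting
     0xffff at bit 32 and 0x1234 at bit 48, skipping the all-zero
     16-bit chunk at bit 16 (three instructions in total).  */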
2410 
2411   num_insns = 1;
2412   mask = 0xffff;
2413   val2 = one_match > zero_match ? ~val : val;
2414   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2415 
2416   if (generate)
2417     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2418 					   ? (val | ~(mask << i))
2419 					   : (val & (mask << i)))));
2420   for (i += 16; i < 64; i += 16)
2421     {
2422       if ((val2 & (mask << i)) == 0)
2423 	continue;
2424       if (generate)
2425 	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2426 				   GEN_INT ((val >> i) & 0xffff)));
2427       num_insns ++;
2428     }
2429 
2430   return num_insns;
2431 }
2432 
2433 /* Return whether imm is a 128-bit immediate which is simple enough to
2434    expand inline.  */
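/* This is the case when the two 64-bit halves can together be built
   using at most four instructions.  */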
2435 bool
2436 aarch64_mov128_immediate (rtx imm)
2437 {
2438   if (GET_CODE (imm) == CONST_INT)
2439     return true;
2440 
2441   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2442 
2443   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2444   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2445 
2446   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2447 	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2448 }
2449 
2450 
2451 /* Return the number of temporary registers that aarch64_add_offset_1
2452    would need to add OFFSET to a register.  */
2453 
2454 static unsigned int
2455 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2456 {
2457   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2458 }
2459 
2460 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
2461    a non-polynomial OFFSET.  MODE is the mode of the addition.
2462    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2463    be set and CFA adjustments added to the generated instructions.
2464 
2465    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2466    temporary if register allocation is already complete.  This temporary
2467    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
2468    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2469    the immediate again.
2470 
2471    Since this function may be used to adjust the stack pointer, we must
2472    ensure that it cannot cause transient stack deallocation (for example
2473    by first incrementing SP and then decrementing when adjusting by a
2474    large immediate).  */
2475 
2476 static void
2477 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2478 		      rtx src, HOST_WIDE_INT offset, rtx temp1,
2479 		      bool frame_related_p, bool emit_move_imm)
2480 {
2481   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2482   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2483 
2484   HOST_WIDE_INT moffset = abs_hwi (offset);
2485   rtx_insn *insn;
2486 
2487   if (!moffset)
2488     {
2489       if (!rtx_equal_p (dest, src))
2490 	{
2491 	  insn = emit_insn (gen_rtx_SET (dest, src));
2492 	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
2493 	}
2494       return;
2495     }
2496 
2497   /* Single instruction adjustment.  */
2498   if (aarch64_uimm12_shift (moffset))
2499     {
2500       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2501       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2502       return;
2503     }
2504 
2505   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2506      and either:
2507 
2508      a) the offset cannot be loaded by a 16-bit move or
2509      b) there is no spare register into which we can move it.  */
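  /* For example, an adjustment by 0x123456 can be done as an addition
     of 0x456 followed by an addition of 0x123000 (a shifted 12-bit
     immediate).  */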
2510   if (moffset < 0x1000000
2511       && ((!temp1 && !can_create_pseudo_p ())
2512 	  || !aarch64_move_imm (moffset, mode)))
2513     {
2514       HOST_WIDE_INT low_off = moffset & 0xfff;
2515 
2516       low_off = offset < 0 ? -low_off : low_off;
2517       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2518       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2519       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2520       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521       return;
2522     }
2523 
2524   /* Emit a move immediate if required and an addition/subtraction.  */
2525   if (emit_move_imm)
2526     {
2527       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2528       temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2529     }
2530   insn = emit_insn (offset < 0
2531 		    ? gen_sub3_insn (dest, src, temp1)
2532 		    : gen_add3_insn (dest, src, temp1));
2533   if (frame_related_p)
2534     {
2535       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2536       rtx adj = plus_constant (mode, src, offset);
2537       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2538     }
2539 }
2540 
2541 /* Return the number of temporary registers that aarch64_add_offset
2542    would need to move OFFSET into a register or add OFFSET to a register;
2543    ADD_P is true if we want the latter rather than the former.  */
2544 
2545 static unsigned int
2546 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2547 {
2548   /* This follows the same structure as aarch64_add_offset.  */
2549   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2550     return 0;
2551 
2552   unsigned int count = 0;
2553   HOST_WIDE_INT factor = offset.coeffs[1];
2554   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2555   poly_int64 poly_offset (factor, factor);
2556   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2557     /* Need one register for the ADDVL/ADDPL result.  */
2558     count += 1;
2559   else if (factor != 0)
2560     {
2561       factor = abs (factor);
2562       if (factor > 16 * (factor & -factor))
2563 	/* Need one register for the CNT result and one for the multiplication
2564 	   factor.  If necessary, the second temporary can be reused for the
2565 	   constant part of the offset.  */
2566 	return 2;
2567       /* Need one register for the CNT result (which might then
2568 	 be shifted).  */
2569       count += 1;
2570     }
2571   return count + aarch64_add_offset_1_temporaries (constant);
2572 }
2573 
2574 /* If X can be represented as a poly_int64, return the number
2575    of temporaries that are required to add it to a register.
2576    Return -1 otherwise.  */
2577 
2578 int
2579 aarch64_add_offset_temporaries (rtx x)
2580 {
2581   poly_int64 offset;
2582   if (!poly_int_rtx_p (x, &offset))
2583     return -1;
2584   return aarch64_offset_temporaries (true, offset);
2585 }
2586 
2587 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
2588    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2589    be set and CFA adjustments added to the generated instructions.
2590 
2591    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2592    temporary if register allocation is already complete.  This temporary
2593    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2594    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2595    false to avoid emitting the immediate again.
2596 
2597    TEMP2, if nonnull, is a second temporary register that doesn't
2598    overlap either DEST or SRC.
2599 
2600    Since this function may be used to adjust the stack pointer, we must
2601    ensure that it cannot cause transient stack deallocation (for example
2602    by first incrementing SP and then decrementing when adjusting by a
2603    large immediate).  */
2604 
2605 static void
2606 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2607 		    poly_int64 offset, rtx temp1, rtx temp2,
2608 		    bool frame_related_p, bool emit_move_imm = true)
2609 {
2610   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2611   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2612   gcc_assert (temp1 == NULL_RTX
2613 	      || !frame_related_p
2614 	      || !reg_overlap_mentioned_p (temp1, dest));
2615   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2616 
2617   /* Try using ADDVL or ADDPL to add the whole value.  */
2618   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2619     {
2620       rtx offset_rtx = gen_int_mode (offset, mode);
2621       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2622       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2623       return;
2624     }
2625 
2626   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2627      SVE vector register, over and above the minimum size of 128 bits.
2628      This is equivalent to half the value returned by CNTD with a
2629      vector shape of ALL.  */
2630   HOST_WIDE_INT factor = offset.coeffs[1];
2631   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2632 
2633   /* Try using ADDVL or ADDPL to add the VG-based part.  */
2634   poly_int64 poly_offset (factor, factor);
2635   if (src != const0_rtx
2636       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2637     {
2638       rtx offset_rtx = gen_int_mode (poly_offset, mode);
2639       if (frame_related_p)
2640 	{
2641 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2642 	  RTX_FRAME_RELATED_P (insn) = true;
2643 	  src = dest;
2644 	}
2645       else
2646 	{
2647 	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2648 	  src = aarch64_force_temporary (mode, temp1, addr);
2649 	  temp1 = temp2;
2650 	  temp2 = NULL_RTX;
2651 	}
2652     }
2653   /* Otherwise use a CNT-based sequence.  */
2654   else if (factor != 0)
2655     {
2656       /* Use a subtraction if we have a negative factor.  */
2657       rtx_code code = PLUS;
2658       if (factor < 0)
2659 	{
2660 	  factor = -factor;
2661 	  code = MINUS;
2662 	}
2663 
2664       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
2665 	 into the multiplication.  */
2666       rtx val;
2667       int shift = 0;
2668       if (factor & 1)
2669 	/* Use a right shift by 1.  */
2670 	shift = -1;
2671       else
2672 	factor /= 2;
2673       HOST_WIDE_INT low_bit = factor & -factor;
2674       if (factor <= 16 * low_bit)
2675 	{
2676 	  if (factor > 16 * 8)
2677 	    {
2678 	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2679 		 the value with the minimum multiplier and shift it into
2680 		 position.  */
2681 	      int extra_shift = exact_log2 (low_bit);
2682 	      shift += extra_shift;
2683 	      factor >>= extra_shift;
2684 	    }
2685 	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2686 	}
2687       else
2688 	{
2689 	  /* Use CNTD, then multiply it by FACTOR.  */
2690 	  val = gen_int_mode (poly_int64 (2, 2), mode);
2691 	  val = aarch64_force_temporary (mode, temp1, val);
2692 
2693 	  /* Go back to using a negative multiplication factor if we have
2694 	     no register from which to subtract.  */
2695 	  if (code == MINUS && src == const0_rtx)
2696 	    {
2697 	      factor = -factor;
2698 	      code = PLUS;
2699 	    }
2700 	  rtx coeff1 = gen_int_mode (factor, mode);
2701 	  coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2702 	  val = gen_rtx_MULT (mode, val, coeff1);
2703 	}
2704 
2705       if (shift > 0)
2706 	{
2707 	  /* Multiply by 1 << SHIFT.  */
2708 	  val = aarch64_force_temporary (mode, temp1, val);
2709 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2710 	}
2711       else if (shift == -1)
2712 	{
2713 	  /* Divide by 2.  */
2714 	  val = aarch64_force_temporary (mode, temp1, val);
2715 	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2716 	}
2717 
2718       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
2719       if (src != const0_rtx)
2720 	{
2721 	  val = aarch64_force_temporary (mode, temp1, val);
2722 	  val = gen_rtx_fmt_ee (code, mode, src, val);
2723 	}
2724       else if (code == MINUS)
2725 	{
2726 	  val = aarch64_force_temporary (mode, temp1, val);
2727 	  val = gen_rtx_NEG (mode, val);
2728 	}
2729 
2730       if (constant == 0 || frame_related_p)
2731 	{
2732 	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2733 	  if (frame_related_p)
2734 	    {
2735 	      RTX_FRAME_RELATED_P (insn) = true;
2736 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
2737 			    gen_rtx_SET (dest, plus_constant (Pmode, src,
2738 							      poly_offset)));
2739 	    }
2740 	  src = dest;
2741 	  if (constant == 0)
2742 	    return;
2743 	}
2744       else
2745 	{
2746 	  src = aarch64_force_temporary (mode, temp1, val);
2747 	  temp1 = temp2;
2748 	  temp2 = NULL_RTX;
2749 	}
2750 
2751       emit_move_imm = true;
2752     }
2753 
2754   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2755 			frame_related_p, emit_move_imm);
2756 }
2757 
2758 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2759    than a poly_int64.  */
2760 
2761 void
2762 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2763 			  rtx offset_rtx, rtx temp1, rtx temp2)
2764 {
2765   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2766 		      temp1, temp2, false);
2767 }
2768 
2769 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2770    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
2771    if TEMP1 already contains abs (DELTA).  */
2772 
2773 static inline void
2774 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2775 {
2776   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2777 		      temp1, temp2, true, emit_move_imm);
2778 }
2779 
2780 /* Subtract DELTA from the stack pointer, marking the instructions
2781    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
2782    if nonnull.  */
2783 
2784 static inline void
2785 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2786 {
2787   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2788 		      temp1, temp2, frame_related_p);
2789 }
2790 
2791 /* Set DEST to (vec_series BASE STEP).  */
2792 
2793 static void
2794 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2795 {
2796   machine_mode mode = GET_MODE (dest);
2797   scalar_mode inner = GET_MODE_INNER (mode);
2798 
2799   /* Each operand can be a register or an immediate in the range [-16, 15].  */
2800   if (!aarch64_sve_index_immediate_p (base))
2801     base = force_reg (inner, base);
2802   if (!aarch64_sve_index_immediate_p (step))
2803     step = force_reg (inner, step);
2804 
2805   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2806 }
2807 
2808 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2809    integer of mode SRC_MODE.  Return true on success.  */
2810 
2811 static bool
2812 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2813 				      rtx src)
2814 {
2815   /* If the constant is smaller than 128 bits, we can do the move
2816      using a vector of SRC_MODEs.  */
2817   if (src_mode != TImode)
2818     {
2819       poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2820 				     GET_MODE_SIZE (src_mode));
2821       machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2822       emit_move_insn (gen_lowpart (dup_mode, dest),
2823 		      gen_const_vec_duplicate (dup_mode, src));
2824       return true;
2825     }
2826 
2827   /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
2828   src = force_const_mem (src_mode, src);
2829   if (!src)
2830     return false;
2831 
2832   /* Make sure that the address is legitimate.  */
2833   if (!aarch64_sve_ld1r_operand_p (src))
2834     {
2835       rtx addr = force_reg (Pmode, XEXP (src, 0));
2836       src = replace_equiv_address (src, addr);
2837     }
2838 
2839   machine_mode mode = GET_MODE (dest);
2840   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2841   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2842   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2843   src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2844   emit_insn (gen_rtx_SET (dest, src));
2845   return true;
2846 }
2847 
2848 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2849    isn't a simple duplicate or series.  */
2850 
2851 static void
2852 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2853 {
2854   machine_mode mode = GET_MODE (src);
2855   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2856   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2857   gcc_assert (npatterns > 1);
2858 
2859   if (nelts_per_pattern == 1)
2860     {
2861       /* The constant is a repeating sequence of at least two elements,
2862 	 where the repeating elements occupy no more than 128 bits.
2863 	 Get an integer representation of the replicated value.  */
2864       scalar_int_mode int_mode;
2865       if (BYTES_BIG_ENDIAN)
2866 	/* For now, always use LD1RQ to load the value on big-endian
2867 	   targets, since the handling of smaller integers includes a
2868 	   subreg that is semantically an element reverse.  */
2869 	int_mode = TImode;
2870       else
2871 	{
2872 	  unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2873 	  gcc_assert (int_bits <= 128);
2874 	  int_mode = int_mode_for_size (int_bits, 0).require ();
2875 	}
2876       rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2877       if (int_value
2878 	  && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2879 	return;
2880     }
2881 
2882   /* Expand each pattern individually.  */
2883   rtx_vector_builder builder;
2884   auto_vec<rtx, 16> vectors (npatterns);
2885   for (unsigned int i = 0; i < npatterns; ++i)
2886     {
2887       builder.new_vector (mode, 1, nelts_per_pattern);
2888       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2889 	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2890       vectors.quick_push (force_reg (mode, builder.build ()));
2891     }
2892 
2893   /* Use permutes to interleave the separate vectors.  */
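  /* For example, with four pattern vectors A, B, C and D, the first
     round produces ZIP1 (A, C) and ZIP1 (B, D), and the second round
     zips those two results together into DEST.  */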
2894   while (npatterns > 1)
2895     {
2896       npatterns /= 2;
2897       for (unsigned int i = 0; i < npatterns; ++i)
2898 	{
2899 	  rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2900 	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2901 	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2902 	  vectors[i] = tmp;
2903 	}
2904     }
2905   gcc_assert (vectors[0] == dest);
2906 }
2907 
2908 /* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
2909    is a pattern that can be used to set DEST to a replicated scalar
2910    element.  */
2911 
2912 void
2913 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2914 			      rtx (*gen_vec_duplicate) (rtx, rtx))
2915 {
2916   machine_mode mode = GET_MODE (dest);
2917 
2918   /* Check on what type of symbol it is.  */
2919   scalar_int_mode int_mode;
2920   if ((GET_CODE (imm) == SYMBOL_REF
2921        || GET_CODE (imm) == LABEL_REF
2922        || GET_CODE (imm) == CONST
2923        || GET_CODE (imm) == CONST_POLY_INT)
2924       && is_a <scalar_int_mode> (mode, &int_mode))
2925     {
2926       rtx mem;
2927       poly_int64 offset;
2928       HOST_WIDE_INT const_offset;
2929       enum aarch64_symbol_type sty;
2930 
2931       /* If we have (const (plus symbol offset)), separate out the offset
2932 	 before we start classifying the symbol.  */
2933       rtx base = strip_offset (imm, &offset);
2934 
2935       /* We must always add an offset involving VL separately, rather than
2936 	 folding it into the relocation.  */
2937       if (!offset.is_constant (&const_offset))
2938 	{
2939 	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2940 	    emit_insn (gen_rtx_SET (dest, imm));
2941 	  else
2942 	    {
2943 	      /* Do arithmetic on 32-bit values if the result is smaller
2944 		 than that.  */
2945 	      if (partial_subreg_p (int_mode, SImode))
2946 		{
2947 		  /* It is invalid to do symbol calculations in modes
2948 		     narrower than SImode.  */
2949 		  gcc_assert (base == const0_rtx);
2950 		  dest = gen_lowpart (SImode, dest);
2951 		  int_mode = SImode;
2952 		}
2953 	      if (base != const0_rtx)
2954 		{
2955 		  base = aarch64_force_temporary (int_mode, dest, base);
2956 		  aarch64_add_offset (int_mode, dest, base, offset,
2957 				      NULL_RTX, NULL_RTX, false);
2958 		}
2959 	      else
2960 		aarch64_add_offset (int_mode, dest, base, offset,
2961 				    dest, NULL_RTX, false);
2962 	    }
2963 	  return;
2964 	}
2965 
2966       sty = aarch64_classify_symbol (base, const_offset);
2967       switch (sty)
2968 	{
2969 	case SYMBOL_FORCE_TO_MEM:
2970 	  if (const_offset != 0
2971 	      && targetm.cannot_force_const_mem (int_mode, imm))
2972 	    {
2973 	      gcc_assert (can_create_pseudo_p ());
2974 	      base = aarch64_force_temporary (int_mode, dest, base);
2975 	      aarch64_add_offset (int_mode, dest, base, const_offset,
2976 				  NULL_RTX, NULL_RTX, false);
2977 	      return;
2978 	    }
2979 
2980 	  mem = force_const_mem (ptr_mode, imm);
2981 	  gcc_assert (mem);
2982 
2983 	  /* If we aren't generating PC relative literals, then
2984 	     we need to expand the literal pool access carefully.
2985 	     This is something that needs to be done in a number
2986 	     of places, so could well live as a separate function.  */
2987 	  if (!aarch64_pcrelative_literal_loads)
2988 	    {
2989 	      gcc_assert (can_create_pseudo_p ());
2990 	      base = gen_reg_rtx (ptr_mode);
2991 	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2992 	      if (ptr_mode != Pmode)
2993 		base = convert_memory_address (Pmode, base);
2994 	      mem = gen_rtx_MEM (ptr_mode, base);
2995 	    }
2996 
2997 	  if (int_mode != ptr_mode)
2998 	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2999 
3000 	  emit_insn (gen_rtx_SET (dest, mem));
3001 
3002 	  return;
3003 
3004         case SYMBOL_SMALL_TLSGD:
3005         case SYMBOL_SMALL_TLSDESC:
3006 	case SYMBOL_SMALL_TLSIE:
3007 	case SYMBOL_SMALL_GOT_28K:
3008 	case SYMBOL_SMALL_GOT_4G:
3009 	case SYMBOL_TINY_GOT:
3010 	case SYMBOL_TINY_TLSIE:
3011 	  if (const_offset != 0)
3012 	    {
3013 	      gcc_assert (can_create_pseudo_p ());
3014 	      base = aarch64_force_temporary (int_mode, dest, base);
3015 	      aarch64_add_offset (int_mode, dest, base, const_offset,
3016 				  NULL_RTX, NULL_RTX, false);
3017 	      return;
3018 	    }
3019 	  /* FALLTHRU */
3020 
3021 	case SYMBOL_SMALL_ABSOLUTE:
3022 	case SYMBOL_TINY_ABSOLUTE:
3023 	case SYMBOL_TLSLE12:
3024 	case SYMBOL_TLSLE24:
3025 	case SYMBOL_TLSLE32:
3026 	case SYMBOL_TLSLE48:
3027 	  aarch64_load_symref_appropriately (dest, imm, sty);
3028 	  return;
3029 
3030 	default:
3031 	  gcc_unreachable ();
3032 	}
3033     }
3034 
3035   if (!CONST_INT_P (imm))
3036     {
3037       rtx base, step, value;
3038       if (GET_CODE (imm) == HIGH
3039 	  || aarch64_simd_valid_immediate (imm, NULL))
3040 	emit_insn (gen_rtx_SET (dest, imm));
3041       else if (const_vec_series_p (imm, &base, &step))
3042 	aarch64_expand_vec_series (dest, base, step);
3043       else if (const_vec_duplicate_p (imm, &value))
3044 	{
3045 	  /* If the constant is out of range of an SVE vector move,
3046 	     load it from memory if we can, otherwise move it into
3047 	     a register and use a DUP.  */
3048 	  scalar_mode inner_mode = GET_MODE_INNER (mode);
3049 	  rtx op = force_const_mem (inner_mode, value);
3050 	  if (!op)
3051 	    op = force_reg (inner_mode, value);
3052 	  else if (!aarch64_sve_ld1r_operand_p (op))
3053 	    {
3054 	      rtx addr = force_reg (Pmode, XEXP (op, 0));
3055 	      op = replace_equiv_address (op, addr);
3056 	    }
3057 	  emit_insn (gen_vec_duplicate (dest, op));
3058 	}
3059       else if (GET_CODE (imm) == CONST_VECTOR
3060 	       && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3061 	aarch64_expand_sve_const_vector (dest, imm);
3062       else
3063 	{
3064 	  rtx mem = force_const_mem (mode, imm);
3065 	  gcc_assert (mem);
3066 	  emit_move_insn (dest, mem);
3067 	}
3068 
3069       return;
3070     }
3071 
3072   aarch64_internal_mov_immediate (dest, imm, true,
3073 				  as_a <scalar_int_mode> (mode));
3074 }
3075 
3076 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
3077    that is known to contain PTRUE.  */
3078 
3079 void
3080 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3081 {
3082   emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3083 						gen_rtvec (2, pred, src),
3084 						UNSPEC_MERGE_PTRUE)));
3085 }
3086 
3087 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3088    operand is in memory.  In this case we need to use the predicated LD1
3089    and ST1 instead of LDR and STR, both for correctness on big-endian
3090    targets and because LD1 and ST1 support a wider range of addressing modes.
3091    PRED_MODE is the mode of the predicate.
3092 
3093    See the comment at the head of aarch64-sve.md for details about the
3094    big-endian handling.  */
3095 
3096 void
3097 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3098 {
3099   machine_mode mode = GET_MODE (dest);
3100   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3101   if (!register_operand (src, mode)
3102       && !register_operand (dest, mode))
3103     {
3104       rtx tmp = gen_reg_rtx (mode);
3105       if (MEM_P (src))
3106 	aarch64_emit_sve_pred_move (tmp, ptrue, src);
3107       else
3108 	emit_move_insn (tmp, src);
3109       src = tmp;
3110     }
3111   aarch64_emit_sve_pred_move (dest, ptrue, src);
3112 }
3113 
3114 /* Called only on big-endian targets.  See whether an SVE vector move
3115    from SRC to DEST is effectively a REV[BHW] instruction, because at
3116    least one operand is a subreg of an SVE vector that has wider or
3117    narrower elements.  Return true and emit the instruction if so.
3118 
3119    For example:
3120 
3121      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3122 
3123    represents a VIEW_CONVERT between the following vectors, viewed
3124    in memory order:
3125 
3126      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
3127      R1: { [0],      [1],      [2],      [3],     ... }
3128 
3129    The high part of lane X in R2 should therefore correspond to lane X*2
3130    of R1, but the register representations are:
3131 
3132          msb                                      lsb
3133      R2: ...... [1].high  [1].low   [0].high  [0].low
3134      R1: ...... [3]       [2]       [1]       [0]
3135 
3136    where the low part of lane X in R2 corresponds to lane X*2 in R1.
3137    We therefore need a reverse operation to swap the high and low values
3138    around.
3139 
3140    This is purely an optimization.  Without it we would spill the
3141    subreg operand to the stack in one mode and reload it in the
3142    other mode, which has the same effect as the REV.  */
3143 
3144 bool
3145 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3146 {
3147   gcc_assert (BYTES_BIG_ENDIAN);
3148   if (GET_CODE (dest) == SUBREG)
3149     dest = SUBREG_REG (dest);
3150   if (GET_CODE (src) == SUBREG)
3151     src = SUBREG_REG (src);
3152 
3153   /* The optimization handles only the case of two single SVE registers
3154      with different element sizes.  */
3155   if (!REG_P (dest)
3156       || !REG_P (src)
3157       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3158       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3159       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3160 	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3161     return false;
3162 
3163   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
3164   rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3165   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3166 			       UNSPEC_REV_SUBREG);
3167   emit_insn (gen_rtx_SET (dest, unspec));
3168   return true;
3169 }
3170 
3171 /* Return a copy of X with mode MODE, without changing its other
3172    attributes.  Unlike gen_lowpart, this doesn't care whether the
3173    mode change is valid.  */
3174 
3175 static rtx
3176 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3177 {
3178   if (GET_MODE (x) == mode)
3179     return x;
3180 
3181   x = shallow_copy_rtx (x);
3182   set_mode_and_regno (x, mode, REGNO (x));
3183   return x;
3184 }
3185 
3186 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3187    operands.  */
3188 
3189 void
3190 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3191 {
3192   /* Decide which REV operation we need.  The mode with narrower elements
3193      determines the mode of the operands and the mode with the wider
3194      elements determines the reverse width.  */
3195   machine_mode mode_with_wider_elts = GET_MODE (dest);
3196   machine_mode mode_with_narrower_elts = GET_MODE (src);
3197   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3198       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3199     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3200 
3201   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3202   unsigned int unspec;
3203   if (wider_bytes == 8)
3204     unspec = UNSPEC_REV64;
3205   else if (wider_bytes == 4)
3206     unspec = UNSPEC_REV32;
3207   else if (wider_bytes == 2)
3208     unspec = UNSPEC_REV16;
3209   else
3210     gcc_unreachable ();
3211   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3212 
3213   /* Emit:
3214 
3215        (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3216 			 UNSPEC_MERGE_PTRUE))
3217 
3218      with the appropriate modes.  */
3219   ptrue = gen_lowpart (pred_mode, ptrue);
3220   dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3221   src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3222   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3223   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3224 			UNSPEC_MERGE_PTRUE);
3225   emit_insn (gen_rtx_SET (dest, src));
3226 }
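
/* Illustrative sketch (an assumed example, not taken from a real expansion):
   if DEST has mode VNx4SI (4-byte elements) and SRC has mode VNx8HI
   (2-byte elements), the wider element size is 4 bytes, so the split above
   picks UNSPEC_REV32 with a VNx4BI predicate, recasts both operands to the
   narrower-element mode VNx8HI, and emits roughly:

     (set (reg:VNx8HI DEST)
	  (unspec:VNx8HI [(reg:VNx4BI PTRUE)
			  (unspec:VNx8HI [(reg:VNx8HI SRC)] UNSPEC_REV32)]
			 UNSPEC_MERGE_PTRUE))  */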
3227 
3228 static bool
3229 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3230 				 tree exp ATTRIBUTE_UNUSED)
3231 {
3232   /* Currently, always true.  */
3233   return true;
3234 }
3235 
3236 /* Implement TARGET_PASS_BY_REFERENCE.  */
3237 
3238 static bool
3239 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3240 			   machine_mode mode,
3241 			   const_tree type,
3242 			   bool named ATTRIBUTE_UNUSED)
3243 {
3244   HOST_WIDE_INT size;
3245   machine_mode dummymode;
3246   int nregs;
3247 
3248   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
3249   if (mode == BLKmode && type)
3250     size = int_size_in_bytes (type);
3251   else
3252     /* No frontends can create types with variable-sized modes, so we
3253        shouldn't be asked to pass or return them.  */
3254     size = GET_MODE_SIZE (mode).to_constant ();
3255 
3256   /* Aggregates are passed by reference based on their size.  */
3257   if (type && AGGREGATE_TYPE_P (type))
3258     {
3259       size = int_size_in_bytes (type);
3260     }
3261 
3262   /* Variable-sized arguments are always passed by reference.  */
3263   if (size < 0)
3264     return true;
3265 
3266   /* Can this be a candidate to be passed in fp/simd register(s)?  */
3267   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3268 					       &dummymode, &nregs,
3269 					       NULL))
3270     return false;
3271 
3272   /* Arguments which are variable-sized or larger than 2 registers are
3273      passed by reference unless they are a homogeneous floating-point
3274      aggregate.  */
3275   return size > 2 * UNITS_PER_WORD;
3276 }
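
/* For example (a sketch, not from the original sources): a plain structure
   of three 64-bit integers is 24 bytes, is not an fp/simd candidate and is
   larger than 2 * UNITS_PER_WORD, so it is passed by reference; a structure
   of three doubles of the same size is a homogeneous floating-point
   aggregate, so aarch64_vfp_is_call_or_return_candidate accepts it and it
   is passed by value in fp/simd registers.  */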
3277 
3278 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
3279 static bool
3280 aarch64_return_in_msb (const_tree valtype)
3281 {
3282   machine_mode dummy_mode;
3283   int dummy_int;
3284 
3285   /* Never happens in little-endian mode.  */
3286   if (!BYTES_BIG_ENDIAN)
3287     return false;
3288 
3289   /* Only composite types smaller than or equal to 16 bytes can
3290      be potentially returned in registers.  */
3291   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3292       || int_size_in_bytes (valtype) <= 0
3293       || int_size_in_bytes (valtype) > 16)
3294     return false;
3295 
3296   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3297      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3298      is always passed/returned in the least significant bits of fp/simd
3299      register(s).  */
3300   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3301 					       &dummy_mode, &dummy_int, NULL))
3302     return false;
3303 
3304   return true;
3305 }
3306 
3307 /* Implement TARGET_FUNCTION_VALUE.
3308    Define how to find the value returned by a function.  */
3309 
3310 static rtx
3311 aarch64_function_value (const_tree type, const_tree func,
3312 			bool outgoing ATTRIBUTE_UNUSED)
3313 {
3314   machine_mode mode;
3315   int unsignedp;
3316   int count;
3317   machine_mode ag_mode;
3318 
3319   mode = TYPE_MODE (type);
3320   if (INTEGRAL_TYPE_P (type))
3321     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3322 
3323   if (aarch64_return_in_msb (type))
3324     {
3325       HOST_WIDE_INT size = int_size_in_bytes (type);
3326 
3327       if (size % UNITS_PER_WORD != 0)
3328 	{
3329 	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3330 	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3331 	}
3332     }
3333 
3334   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3335 					       &ag_mode, &count, NULL))
3336     {
3337       if (!aarch64_composite_type_p (type, mode))
3338 	{
3339 	  gcc_assert (count == 1 && mode == ag_mode);
3340 	  return gen_rtx_REG (mode, V0_REGNUM);
3341 	}
3342       else
3343 	{
3344 	  int i;
3345 	  rtx par;
3346 
3347 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3348 	  for (i = 0; i < count; i++)
3349 	    {
3350 	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3351 	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3352 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3353 	      XVECEXP (par, 0, i) = tmp;
3354 	    }
3355 	  return par;
3356 	}
3357     }
3358   else
3359     return gen_rtx_REG (mode, R0_REGNUM);
3360 }
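
/* As an illustration (assumed examples, not from the sources): a function
   returning __int128 gets (reg:TI R0_REGNUM), while one returning a
   homogeneous aggregate of four floats gets a PARALLEL of
   (reg:SF V0_REGNUM) ... (reg:SF V3_REGNUM) with byte offsets 0, 4, 8
   and 12.  */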
3361 
3362 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3363    Return true if REGNO is the number of a hard register in which the values
3364    of a called function may come back.  */
3365 
3366 static bool
3367 aarch64_function_value_regno_p (const unsigned int regno)
3368 {
3369   /* A maximum of 16 bytes can be returned in the general registers.  Examples
3370      of 16-byte return values are: 128-bit integers and 16-byte small
3371      structures (excluding homogeneous floating-point aggregates).  */
3372   if (regno == R0_REGNUM || regno == R1_REGNUM)
3373     return true;
3374 
3375   /* Up to four fp/simd registers can return a function value, e.g. a
3376      homogeneous floating-point aggregate having four members.  */
3377   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3378     return TARGET_FLOAT;
3379 
3380   return false;
3381 }
3382 
3383 /* Implement TARGET_RETURN_IN_MEMORY.
3384 
3385    If the type T of the result of a function is such that
3386      void func (T arg)
3387    would require that arg be passed as a value in a register (or set of
3388    registers) according to the parameter passing rules, then the result
3389    is returned in the same registers as would be used for such an
3390    argument.  */
3391 
3392 static bool
3393 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3394 {
3395   HOST_WIDE_INT size;
3396   machine_mode ag_mode;
3397   int count;
3398 
3399   if (!AGGREGATE_TYPE_P (type)
3400       && TREE_CODE (type) != COMPLEX_TYPE
3401       && TREE_CODE (type) != VECTOR_TYPE)
3402     /* Simple scalar types are always returned in registers.  */
3403     return false;
3404 
3405   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3406 					       type,
3407 					       &ag_mode,
3408 					       &count,
3409 					       NULL))
3410     return false;
3411 
3412   /* Types larger than 2 registers are returned in memory.  */
3413   size = int_size_in_bytes (type);
3414   return (size < 0 || size > 2 * UNITS_PER_WORD);
3415 }
3416 
3417 static bool
3418 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3419 			       const_tree type, int *nregs)
3420 {
3421   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3422   return aarch64_vfp_is_call_or_return_candidate (mode,
3423 						  type,
3424 						  &pcum->aapcs_vfp_rmode,
3425 						  nregs,
3426 						  NULL);
3427 }
3428 
3429 /* Given MODE and TYPE of a function argument, return the alignment in
3430    bits.  The idea is to suppress any stronger alignment requested by
3431    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3432    This is a helper function for local use only.  */
3433 
3434 static unsigned int
3435 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3436 {
3437   if (!type)
3438     return GET_MODE_ALIGNMENT (mode);
3439 
3440   if (integer_zerop (TYPE_SIZE (type)))
3441     return 0;
3442 
3443   gcc_assert (TYPE_MODE (type) == mode);
3444 
3445   if (!AGGREGATE_TYPE_P (type))
3446     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3447 
3448   if (TREE_CODE (type) == ARRAY_TYPE)
3449     return TYPE_ALIGN (TREE_TYPE (type));
3450 
3451   unsigned int alignment = 0;
3452   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3453     if (TREE_CODE (field) == FIELD_DECL)
3454       alignment = std::max (alignment, DECL_ALIGN (field));
3455 
3456   return alignment;
3457 }
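
/* A minimal sketch of the intent (hypothetical types, not from the sources):
   for  struct s { unsigned char c; unsigned int i; }  the loop above returns
   the largest field alignment, 32 bits, even if the structure itself carried
   a bigger aligned attribute, while an over-aligned scalar typedef falls back
   to the TYPE_ALIGN of its main variant.  */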
3458 
3459 /* Layout a function argument according to the AAPCS64 rules.  The rule
3460    numbers refer to the rule numbers in the AAPCS64.  */
3461 
3462 static void
3463 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3464 		    const_tree type,
3465 		    bool named ATTRIBUTE_UNUSED)
3466 {
3467   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3468   int ncrn, nvrn, nregs;
3469   bool allocate_ncrn, allocate_nvrn;
3470   HOST_WIDE_INT size;
3471 
3472   /* We need to do this once per argument.  */
3473   if (pcum->aapcs_arg_processed)
3474     return;
3475 
3476   pcum->aapcs_arg_processed = true;
3477 
3478   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
3479   if (type)
3480     size = int_size_in_bytes (type);
3481   else
3482     /* No frontends can create types with variable-sized modes, so we
3483        shouldn't be asked to pass or return them.  */
3484     size = GET_MODE_SIZE (mode).to_constant ();
3485   size = ROUND_UP (size, UNITS_PER_WORD);
3486 
3487   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3488   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3489 						 mode,
3490 						 type,
3491 						 &nregs);
3492 
3493   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3494      The following code thus handles passing by SIMD/FP registers first.  */
3495 
3496   nvrn = pcum->aapcs_nvrn;
3497 
3498   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3499      and homogeneous short-vector aggregates (HVA).  */
3500   if (allocate_nvrn)
3501     {
3502       if (!TARGET_FLOAT)
3503 	aarch64_err_no_fpadvsimd (mode, "argument");
3504 
3505       if (nvrn + nregs <= NUM_FP_ARG_REGS)
3506 	{
3507 	  pcum->aapcs_nextnvrn = nvrn + nregs;
3508 	  if (!aarch64_composite_type_p (type, mode))
3509 	    {
3510 	      gcc_assert (nregs == 1);
3511 	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3512 	    }
3513 	  else
3514 	    {
3515 	      rtx par;
3516 	      int i;
3517 	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3518 	      for (i = 0; i < nregs; i++)
3519 		{
3520 		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3521 					 V0_REGNUM + nvrn + i);
3522 		  rtx offset = gen_int_mode
3523 		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3524 		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3525 		  XVECEXP (par, 0, i) = tmp;
3526 		}
3527 	      pcum->aapcs_reg = par;
3528 	    }
3529 	  return;
3530 	}
3531       else
3532 	{
3533 	  /* C.3 NSRN is set to 8.  */
3534 	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3535 	  goto on_stack;
3536 	}
3537     }
3538 
3539   ncrn = pcum->aapcs_ncrn;
3540   nregs = size / UNITS_PER_WORD;
3541 
3542   /* C6 - C9, though the sign and zero extension semantics are
3543      handled elsewhere.  This is the case where the argument fits
3544      entirely in general registers.  */
3545   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3546     {
3547 
3548       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3549 
3550       /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
3551          rounded up to the next even number.  */
3552       if (nregs == 2
3553 	  && ncrn % 2
3554 	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3555 	     comparison is there because for > 16 * BITS_PER_UNIT
3556 	     alignment nregs should be > 2 and therefore it should be
3557 	     passed by reference rather than value.  */
3558 	  && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3559 	{
3560 	  ++ncrn;
3561 	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3562 	}
3563 
3564       /* NREGS can be 0 when e.g. an empty structure is to be passed.
3565          A reg is still generated for it, but the caller should be smart
3566 	 enough not to use it.  */
3567       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3568 	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3569       else
3570 	{
3571 	  rtx par;
3572 	  int i;
3573 
3574 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3575 	  for (i = 0; i < nregs; i++)
3576 	    {
3577 	      rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3578 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3579 				       GEN_INT (i * UNITS_PER_WORD));
3580 	      XVECEXP (par, 0, i) = tmp;
3581 	    }
3582 	  pcum->aapcs_reg = par;
3583 	}
3584 
3585       pcum->aapcs_nextncrn = ncrn + nregs;
3586       return;
3587     }
3588 
3589   /* C.11  */
3590   pcum->aapcs_nextncrn = NUM_ARG_REGS;
3591 
3592   /* The argument is passed on stack; record the needed number of words for
3593      this argument and align the total size if necessary.  */
3594 on_stack:
3595   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3596 
3597   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3598     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3599 				       16 / UNITS_PER_WORD);
3600   return;
3601 }
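
/* Worked example (a sketch under assumed conditions, not part of the
   original code): with no argument registers allocated yet, a homogeneous
   aggregate of two doubles is assigned v0/v1 via a PARALLEL, a
   16-byte-aligned __int128 starts at an even NGRN and takes x0/x1
   (C.8 - C.9), and a 24-byte plain structure, having already been replaced
   by a pointer, occupies a single general register.  */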
3602 
3603 /* Implement TARGET_FUNCTION_ARG.  */
3604 
3605 static rtx
3606 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3607 		      const_tree type, bool named)
3608 {
3609   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3610   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3611 
3612   if (mode == VOIDmode)
3613     return NULL_RTX;
3614 
3615   aarch64_layout_arg (pcum_v, mode, type, named);
3616   return pcum->aapcs_reg;
3617 }
3618 
3619 void
3620 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3621 			   const_tree fntype ATTRIBUTE_UNUSED,
3622 			   rtx libname ATTRIBUTE_UNUSED,
3623 			   const_tree fndecl ATTRIBUTE_UNUSED,
3624 			   unsigned n_named ATTRIBUTE_UNUSED)
3625 {
3626   pcum->aapcs_ncrn = 0;
3627   pcum->aapcs_nvrn = 0;
3628   pcum->aapcs_nextncrn = 0;
3629   pcum->aapcs_nextnvrn = 0;
3630   pcum->pcs_variant = ARM_PCS_AAPCS64;
3631   pcum->aapcs_reg = NULL_RTX;
3632   pcum->aapcs_arg_processed = false;
3633   pcum->aapcs_stack_words = 0;
3634   pcum->aapcs_stack_size = 0;
3635 
3636   if (!TARGET_FLOAT
3637       && fndecl && TREE_PUBLIC (fndecl)
3638       && fntype && fntype != error_mark_node)
3639     {
3640       const_tree type = TREE_TYPE (fntype);
3641       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
3642       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
3643       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3644 						   &mode, &nregs, NULL))
3645 	aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3646     }
3647   return;
3648 }
3649 
3650 static void
3651 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3652 			      machine_mode mode,
3653 			      const_tree type,
3654 			      bool named)
3655 {
3656   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3657   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3658     {
3659       aarch64_layout_arg (pcum_v, mode, type, named);
3660       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3661 		  != (pcum->aapcs_stack_words != 0));
3662       pcum->aapcs_arg_processed = false;
3663       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3664       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3665       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3666       pcum->aapcs_stack_words = 0;
3667       pcum->aapcs_reg = NULL_RTX;
3668     }
3669 }
3670 
3671 bool
3672 aarch64_function_arg_regno_p (unsigned regno)
3673 {
3674   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3675 	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3676 }
3677 
3678 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
3679    PARM_BOUNDARY bits of alignment, but will be given anything up
3680    to STACK_BOUNDARY bits if the type requires it.  This makes sure
3681    that both before and after the layout of each argument, the Next
3682    Stacked Argument Address (NSAA) will have a minimum alignment of
3683    8 bytes.  */
3684 
3685 static unsigned int
3686 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3687 {
3688   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3689   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3690 }
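
/* For instance (assuming the usual AArch64 values PARM_BOUNDARY == 64 and
   STACK_BOUNDARY == 128): an alignment of 8 or 32 bits is raised to 64,
   an alignment of 128 bits is kept as is, and anything larger is clamped
   to 128, which keeps the NSAA alignment between 8 and 16 bytes.  */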
3691 
3692 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
3693 
3694 static fixed_size_mode
3695 aarch64_get_reg_raw_mode (int regno)
3696 {
3697   if (TARGET_SVE && FP_REGNUM_P (regno))
3698     /* Don't use the SVE part of the register for __builtin_apply and
3699        __builtin_return.  The SVE registers aren't used by the normal PCS,
3700        so using them there would be a waste of time.  The PCS extensions
3701        for SVE types are fundamentally incompatible with the
3702        __builtin_return/__builtin_apply interface.  */
3703     return as_a <fixed_size_mode> (V16QImode);
3704   return default_get_reg_raw_mode (regno);
3705 }
3706 
3707 /* Implement TARGET_FUNCTION_ARG_PADDING.
3708 
3709    Small aggregate types are placed in the lowest memory address.
3710 
3711    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
3712 
3713 static pad_direction
3714 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3715 {
3716   /* On little-endian targets, the least significant byte of every stack
3717      argument is passed at the lowest byte address of the stack slot.  */
3718   if (!BYTES_BIG_ENDIAN)
3719     return PAD_UPWARD;
3720 
3721   /* Otherwise, integral, floating-point and pointer types are padded downward:
3722      the least significant byte of a stack argument is passed at the highest
3723      byte address of the stack slot.  */
3724   if (type
3725       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3726 	 || POINTER_TYPE_P (type))
3727       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3728     return PAD_DOWNWARD;
3729 
3730   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
3731   return PAD_UPWARD;
3732 }
3733 
3734 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3735 
3736    It specifies padding for the last (and possibly the only)
3737    element of a block move between registers and memory.  Assuming
3738    the block is in memory, padding upward means that the last
3739    element is padded after its most significant byte, while with
3740    downward padding the last element is padded on its least
3741    significant byte side.
3742 
3743    Small aggregates and small complex types are always padded
3744    upwards.
3745 
3746    We don't need to worry about homogeneous floating-point or
3747    short-vector aggregates; their move is not affected by the
3748    padding direction determined here.  Regardless of endianness,
3749    each element of such an aggregate is put in the least
3750    significant bits of an fp/simd register.
3751 
3752    Return !BYTES_BIG_ENDIAN if the least significant byte of the
3753    register has useful data, and return the opposite if the most
3754    significant byte does.  */
3755 
3756 bool
3757 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3758 		     bool first ATTRIBUTE_UNUSED)
3759 {
3760 
3761   /* Small composite types are always padded upward.  */
3762   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3763     {
3764       HOST_WIDE_INT size;
3765       if (type)
3766 	size = int_size_in_bytes (type);
3767       else
3768 	/* No frontends can create types with variable-sized modes, so we
3769 	   shouldn't be asked to pass or return them.  */
3770 	size = GET_MODE_SIZE (mode).to_constant ();
3771       if (size < 2 * UNITS_PER_WORD)
3772 	return true;
3773     }
3774 
3775   /* Otherwise, use the default padding.  */
3776   return !BYTES_BIG_ENDIAN;
3777 }
3778 
3779 static scalar_int_mode
3780 aarch64_libgcc_cmp_return_mode (void)
3781 {
3782   return SImode;
3783 }
3784 
3785 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3786 
3787 /* We use the 12-bit shifted immediate arithmetic instructions, so values
3788    must be a multiple of (1 << 12), i.e. 4096.  */
3789 #define ARITH_FACTOR 4096
3790 
3791 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3792 #error Cannot use simple address calculation for stack probing
3793 #endif
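
/* With GCC's default STACK_CHECK_PROBE_INTERVAL_EXP of 12 (an assumption;
   targets and options may change it), PROBE_INTERVAL is 4096 and therefore
   a multiple of ARITH_FACTOR, so the check above passes.  */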
3794 
3795 /* The pair of scratch registers used for stack probing.  */
3796 #define PROBE_STACK_FIRST_REG  9
3797 #define PROBE_STACK_SECOND_REG 10
3798 
3799 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3800    inclusive.  These are offsets from the current stack pointer.  */
3801 
3802 static void
3803 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3804 {
3805   HOST_WIDE_INT size;
3806   if (!poly_size.is_constant (&size))
3807     {
3808       sorry ("stack probes for SVE frames");
3809       return;
3810     }
3811 
3812   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3813 
3814   /* See the same assertion on PROBE_INTERVAL above.  */
3815   gcc_assert ((first % ARITH_FACTOR) == 0);
3816 
3817   /* See if we have a constant small number of probes to generate.  If so,
3818      that's the easy case.  */
3819   if (size <= PROBE_INTERVAL)
3820     {
3821       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3822 
3823       emit_set_insn (reg1,
3824 		     plus_constant (Pmode,
3825 				    stack_pointer_rtx, -(first + base)));
3826       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3827     }
3828 
3829   /* The run-time loop is made up of 8 insns in the generic case while the
3830      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
3831   else if (size <= 4 * PROBE_INTERVAL)
3832     {
3833       HOST_WIDE_INT i, rem;
3834 
3835       emit_set_insn (reg1,
3836 		     plus_constant (Pmode,
3837 				    stack_pointer_rtx,
3838 				    -(first + PROBE_INTERVAL)));
3839       emit_stack_probe (reg1);
3840 
3841       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3842 	 it exceeds SIZE.  If only two probes are needed, this will not
3843 	 generate any code.  Then probe at FIRST + SIZE.  */
3844       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3845 	{
3846 	  emit_set_insn (reg1,
3847 			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3848 	  emit_stack_probe (reg1);
3849 	}
3850 
3851       rem = size - (i - PROBE_INTERVAL);
3852       if (rem > 256)
3853 	{
3854 	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3855 
3856 	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3857 	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3858 	}
3859       else
3860 	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3861     }
3862 
3863   /* Otherwise, do the same as above, but in a loop.  Note that we must be
3864      extra careful with variables wrapping around because we might be at
3865      the very top (or the very bottom) of the address space and we have
3866      to be able to handle this case properly; in particular, we use an
3867      equality test for the loop condition.  */
3868   else
3869     {
3870       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3871 
3872       /* Step 1: round SIZE to the previous multiple of the interval.  */
3873 
3874       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3875 
3876 
3877       /* Step 2: compute initial and final value of the loop counter.  */
3878 
3879       /* TEST_ADDR = SP + FIRST.  */
3880       emit_set_insn (reg1,
3881 		     plus_constant (Pmode, stack_pointer_rtx, -first));
3882 
3883       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
3884       HOST_WIDE_INT adjustment = - (first + rounded_size);
3885       if (! aarch64_uimm12_shift (adjustment))
3886 	{
3887 	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3888 					  true, Pmode);
3889 	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3890 	}
3891       else
3892 	emit_set_insn (reg2,
3893 		       plus_constant (Pmode, stack_pointer_rtx, adjustment));
3894 
3895       /* Step 3: the loop
3896 
3897 	 do
3898 	   {
3899 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3900 	     probe at TEST_ADDR
3901 	   }
3902 	 while (TEST_ADDR != LAST_ADDR)
3903 
3904 	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3905 	 until it is equal to ROUNDED_SIZE.  */
3906 
3907       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3908 
3909 
3910       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3911 	 that SIZE is equal to ROUNDED_SIZE.  */
3912 
3913       if (size != rounded_size)
3914 	{
3915 	  HOST_WIDE_INT rem = size - rounded_size;
3916 
3917 	  if (rem > 256)
3918 	    {
3919 	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3920 
3921 	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3922 	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3923 	    }
3924 	  else
3925 	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3926 	}
3927     }
3928 
3929   /* Make sure nothing is scheduled before we are done.  */
3930   emit_insn (gen_blockage ());
3931 }
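
/* Example of the unrolled case above (an illustration only): FIRST == 0 and
   SIZE == 12288 with a 4096-byte PROBE_INTERVAL gives three probes, at
   sp-4096, sp-8192 and sp-12288, using 6 instructions, i.e. 4 + 2*(3 - 2)
   as per the comment on the compile-time loop.  */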
3932 
3933 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
3934    absolute addresses.  */
3935 
3936 const char *
3937 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3938 {
3939   static int labelno = 0;
3940   char loop_lab[32];
3941   rtx xops[2];
3942 
3943   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3944 
3945   /* Loop.  */
3946   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3947 
3948   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
3949   xops[0] = reg1;
3950   xops[1] = GEN_INT (PROBE_INTERVAL);
3951   output_asm_insn ("sub\t%0, %0, %1", xops);
3952 
3953   /* Probe at TEST_ADDR.  */
3954   output_asm_insn ("str\txzr, [%0]", xops);
3955 
3956   /* Test if TEST_ADDR == LAST_ADDR.  */
3957   xops[1] = reg2;
3958   output_asm_insn ("cmp\t%0, %1", xops);
3959 
3960   /* Branch.  */
3961   fputs ("\tb.ne\t", asm_out_file);
3962   assemble_name_raw (asm_out_file, loop_lab);
3963   fputc ('\n', asm_out_file);
3964 
3965   return "";
3966 }
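
/* Assuming the default 4096-byte interval and the x9/x10 scratch pair
   defined above, the loop printed here would look roughly like:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0
*/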
3967 
3968 /* Mark the registers that need to be saved by the callee and calculate
3969    the size of the callee-saved registers area and frame record (both FP
3970    and LR may be omitted).  */
3971 static void
3972 aarch64_layout_frame (void)
3973 {
3974   HOST_WIDE_INT offset = 0;
3975   int regno, last_fp_reg = INVALID_REGNUM;
3976 
3977   if (reload_completed && cfun->machine->frame.laid_out)
3978     return;
3979 
3980   /* Force a frame chain for EH returns so the return address is at FP+8.  */
3981   cfun->machine->frame.emit_frame_chain
3982     = frame_pointer_needed || crtl->calls_eh_return;
3983 
3984   /* Emit a frame chain if the frame pointer is enabled.
3985      If -momit-leaf-frame-pointer is used, do not use a frame chain
3986      in leaf functions which do not use LR.  */
3987   if (flag_omit_frame_pointer == 2
3988       && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3989 	   && !df_regs_ever_live_p (LR_REGNUM)))
3990     cfun->machine->frame.emit_frame_chain = true;
3991 
3992 #define SLOT_NOT_REQUIRED (-2)
3993 #define SLOT_REQUIRED     (-1)
3994 
3995   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3996   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3997 
3998   /* First mark all the registers that really need to be saved...  */
3999   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4000     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4001 
4002   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4003     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4004 
4005   /* ... that includes the eh data registers (if needed)...  */
4006   if (crtl->calls_eh_return)
4007     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4008       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4009 	= SLOT_REQUIRED;
4010 
4011   /* ... and any callee saved register that dataflow says is live.  */
4012   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4013     if (df_regs_ever_live_p (regno)
4014 	&& (regno == R30_REGNUM
4015 	    || !call_used_regs[regno]))
4016       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4017 
4018   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4019     if (df_regs_ever_live_p (regno)
4020 	&& !call_used_regs[regno])
4021       {
4022 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4023 	last_fp_reg = regno;
4024       }
4025 
4026   if (cfun->machine->frame.emit_frame_chain)
4027     {
4028       /* FP and LR are placed in the linkage record.  */
4029       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4030       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4031       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4032       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4033       offset = 2 * UNITS_PER_WORD;
4034     }
4035 
4036   /* Now assign stack slots for them.  */
4037   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4038     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4039       {
4040 	cfun->machine->frame.reg_offset[regno] = offset;
4041 	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4042 	  cfun->machine->frame.wb_candidate1 = regno;
4043 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4044 	  cfun->machine->frame.wb_candidate2 = regno;
4045 	offset += UNITS_PER_WORD;
4046       }
4047 
4048   HOST_WIDE_INT max_int_offset = offset;
4049   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4050   bool has_align_gap = offset != max_int_offset;
4051 
4052   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4053     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4054       {
4055 	/* If there is an alignment gap between integer and fp callee-saves,
4056 	   allocate the last fp register to it if possible.  */
4057 	if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4058 	  {
4059 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
4060 	    break;
4061 	  }
4062 
4063 	cfun->machine->frame.reg_offset[regno] = offset;
4064 	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4065 	  cfun->machine->frame.wb_candidate1 = regno;
4066 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4067 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4068 	  cfun->machine->frame.wb_candidate2 = regno;
4069 	offset += UNITS_PER_WORD;
4070       }
4071 
4072   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4073 
4074   cfun->machine->frame.saved_regs_size = offset;
4075 
4076   HOST_WIDE_INT varargs_and_saved_regs_size
4077     = offset + cfun->machine->frame.saved_varargs_size;
4078 
4079   cfun->machine->frame.hard_fp_offset
4080     = aligned_upper_bound (varargs_and_saved_regs_size
4081 			   + get_frame_size (),
4082 			   STACK_BOUNDARY / BITS_PER_UNIT);
4083 
4084   /* Both these values are already aligned.  */
4085   gcc_assert (multiple_p (crtl->outgoing_args_size,
4086 			  STACK_BOUNDARY / BITS_PER_UNIT));
4087   cfun->machine->frame.frame_size
4088     = (cfun->machine->frame.hard_fp_offset
4089        + crtl->outgoing_args_size);
4090 
4091   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4092 
4093   cfun->machine->frame.initial_adjust = 0;
4094   cfun->machine->frame.final_adjust = 0;
4095   cfun->machine->frame.callee_adjust = 0;
4096   cfun->machine->frame.callee_offset = 0;
4097 
4098   HOST_WIDE_INT max_push_offset = 0;
4099   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4100     max_push_offset = 512;
4101   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4102     max_push_offset = 256;
4103 
4104   HOST_WIDE_INT const_size, const_fp_offset;
4105   if (cfun->machine->frame.frame_size.is_constant (&const_size)
4106       && const_size < max_push_offset
4107       && known_eq (crtl->outgoing_args_size, 0))
4108     {
4109       /* Simple, small frame with no outgoing arguments:
4110 	 stp reg1, reg2, [sp, -frame_size]!
4111 	 stp reg3, reg4, [sp, 16]  */
4112       cfun->machine->frame.callee_adjust = const_size;
4113     }
4114   else if (known_lt (crtl->outgoing_args_size
4115 		     + cfun->machine->frame.saved_regs_size, 512)
4116 	   && !(cfun->calls_alloca
4117 		&& known_lt (cfun->machine->frame.hard_fp_offset,
4118 			     max_push_offset)))
4119     {
4120       /* Frame with small outgoing arguments:
4121 	 sub sp, sp, frame_size
4122 	 stp reg1, reg2, [sp, outgoing_args_size]
4123 	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
4124       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4125       cfun->machine->frame.callee_offset
4126 	= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4127     }
4128   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4129 	   && const_fp_offset < max_push_offset)
4130     {
4131       /* Frame with large outgoing arguments but a small local area:
4132 	 stp reg1, reg2, [sp, -hard_fp_offset]!
4133 	 stp reg3, reg4, [sp, 16]
4134 	 sub sp, sp, outgoing_args_size  */
4135       cfun->machine->frame.callee_adjust = const_fp_offset;
4136       cfun->machine->frame.final_adjust
4137 	= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4138     }
4139   else
4140     {
4141       /* Frame with large local area and outgoing arguments using frame pointer:
4142 	 sub sp, sp, hard_fp_offset
4143 	 stp x29, x30, [sp, 0]
4144 	 add x29, sp, 0
4145 	 stp reg3, reg4, [sp, 16]
4146 	 sub sp, sp, outgoing_args_size  */
4147       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4148       cfun->machine->frame.final_adjust
4149 	= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4150     }
4151 
4152   cfun->machine->frame.laid_out = true;
4153 }
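
/* A small worked example of the first case above (assumed, not taken from a
   real compilation): a function that needs a frame chain, saves only x29/x30
   and has 16 bytes of locals with no outgoing arguments gets frame_size == 32.
   That is below max_push_offset, so callee_adjust == 32 and the prologue can
   open with "stp x29, x30, [sp, -32]!".  */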
4154 
4155 /* Return true if the register REGNO is saved on entry to
4156    the current function.  */
4157 
4158 static bool
4159 aarch64_register_saved_on_entry (int regno)
4160 {
4161   return cfun->machine->frame.reg_offset[regno] >= 0;
4162 }
4163 
4164 /* Return the next register, from REGNO up to LIMIT, that the callee
4165    needs to save.  */
4166 
4167 static unsigned
4168 aarch64_next_callee_save (unsigned regno, unsigned limit)
4169 {
4170   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4171     regno ++;
4172   return regno;
4173 }
4174 
4175 /* Push the register number REGNO of mode MODE to the stack with write-back
4176    adjusting the stack by ADJUSTMENT.  */
4177 
4178 static void
4179 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4180 			   HOST_WIDE_INT adjustment)
4181  {
4182   rtx base_rtx = stack_pointer_rtx;
4183   rtx insn, reg, mem;
4184 
4185   reg = gen_rtx_REG (mode, regno);
4186   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4187 			    plus_constant (Pmode, base_rtx, -adjustment));
4188   mem = gen_frame_mem (mode, mem);
4189 
4190   insn = emit_move_insn (mem, reg);
4191   RTX_FRAME_RELATED_P (insn) = 1;
4192 }
4193 
4194 /* Generate and return an instruction to store the pair of registers
4195    REG and REG2 of mode MODE to location BASE with write-back adjusting
4196    the stack location BASE by ADJUSTMENT.  */
4197 
4198 static rtx
4199 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4200 			  HOST_WIDE_INT adjustment)
4201 {
4202   switch (mode)
4203     {
4204     case E_DImode:
4205       return gen_storewb_pairdi_di (base, base, reg, reg2,
4206 				    GEN_INT (-adjustment),
4207 				    GEN_INT (UNITS_PER_WORD - adjustment));
4208     case E_DFmode:
4209       return gen_storewb_pairdf_di (base, base, reg, reg2,
4210 				    GEN_INT (-adjustment),
4211 				    GEN_INT (UNITS_PER_WORD - adjustment));
4212     default:
4213       gcc_unreachable ();
4214     }
4215 }
4216 
4217 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4218    stack pointer by ADJUSTMENT.  */
4219 
4220 static void
4221 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4222 {
4223   rtx_insn *insn;
4224   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4225 
4226   if (regno2 == INVALID_REGNUM)
4227     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4228 
4229   rtx reg1 = gen_rtx_REG (mode, regno1);
4230   rtx reg2 = gen_rtx_REG (mode, regno2);
4231 
4232   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4233 					      reg2, adjustment));
4234   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4235   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4236   RTX_FRAME_RELATED_P (insn) = 1;
4237 }
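
/* For instance (a sketch only): aarch64_push_regs (R29_REGNUM, R30_REGNUM, 32)
   picks DImode, uses the store-pair-with-writeback pattern and corresponds to
   "stp x29, x30, [sp, -32]!", with both stores and the stack adjustment
   marked as frame related.  */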
4238 
4239 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4240    adjusting it by ADJUSTMENT afterwards.  */
4241 
4242 static rtx
4243 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4244 			 HOST_WIDE_INT adjustment)
4245 {
4246   switch (mode)
4247     {
4248     case E_DImode:
4249       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4250 				   GEN_INT (UNITS_PER_WORD));
4251     case E_DFmode:
4252       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4253 				   GEN_INT (UNITS_PER_WORD));
4254     default:
4255       gcc_unreachable ();
4256     }
4257 }
4258 
4259 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4260    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4261    into CFI_OPS.  */
4262 
4263 static void
4264 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4265 		  rtx *cfi_ops)
4266 {
4267   machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4268   rtx reg1 = gen_rtx_REG (mode, regno1);
4269 
4270   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4271 
4272   if (regno2 == INVALID_REGNUM)
4273     {
4274       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4275       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4276       emit_move_insn (reg1, gen_frame_mem (mode, mem));
4277     }
4278   else
4279     {
4280       rtx reg2 = gen_rtx_REG (mode, regno2);
4281       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4282       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4283 					  reg2, adjustment));
4284     }
4285 }
4286 
4287 /* Generate and return a store pair instruction of mode MODE to store
4288    register REG1 to MEM1 and register REG2 to MEM2.  */
4289 
4290 static rtx
4291 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4292 			rtx reg2)
4293 {
4294   switch (mode)
4295     {
4296     case E_DImode:
4297       return gen_store_pairdi (mem1, reg1, mem2, reg2);
4298 
4299     case E_DFmode:
4300       return gen_store_pairdf (mem1, reg1, mem2, reg2);
4301 
4302     default:
4303       gcc_unreachable ();
4304     }
4305 }
4306 
4307 /* Generate and return a load pair instruction of mode MODE to load register
4308    REG1 from MEM1 and register REG2 from MEM2.  */
4309 
4310 static rtx
4311 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4312 		       rtx mem2)
4313 {
4314   switch (mode)
4315     {
4316     case E_DImode:
4317       return gen_load_pairdi (reg1, mem1, reg2, mem2);
4318 
4319     case E_DFmode:
4320       return gen_load_pairdf (reg1, mem1, reg2, mem2);
4321 
4322     default:
4323       gcc_unreachable ();
4324     }
4325 }
4326 
4327 /* Return TRUE if return address signing should be enabled for the current
4328    function, otherwise return FALSE.  */
4329 
4330 bool
4331 aarch64_return_address_signing_enabled (void)
4332 {
4333   /* This function should only be called after the frame is laid out.  */
4334   gcc_assert (cfun->machine->frame.laid_out);
4335 
4336   /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4337      function if its LR is pushed onto the stack.  */
4338   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4339 	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4340 	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4341 }
4342 
4343 /* Emit code to save the callee-saved registers from register number START
4344    to LIMIT to the stack at the location starting at offset START_OFFSET,
4345    skipping any write-back candidates if SKIP_WB is true.  */
4346 
4347 static void
4348 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4349 			   unsigned start, unsigned limit, bool skip_wb)
4350 {
4351   rtx_insn *insn;
4352   unsigned regno;
4353   unsigned regno2;
4354 
4355   for (regno = aarch64_next_callee_save (start, limit);
4356        regno <= limit;
4357        regno = aarch64_next_callee_save (regno + 1, limit))
4358     {
4359       rtx reg, mem;
4360       poly_int64 offset;
4361 
4362       if (skip_wb
4363 	  && (regno == cfun->machine->frame.wb_candidate1
4364 	      || regno == cfun->machine->frame.wb_candidate2))
4365 	continue;
4366 
4367       if (cfun->machine->reg_is_wrapped_separately[regno])
4368        continue;
4369 
4370       reg = gen_rtx_REG (mode, regno);
4371       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4372       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4373 						offset));
4374 
4375       regno2 = aarch64_next_callee_save (regno + 1, limit);
4376 
4377       if (regno2 <= limit
4378 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
4379 	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4380 	      == cfun->machine->frame.reg_offset[regno2]))
4381 
4382 	{
4383 	  rtx reg2 = gen_rtx_REG (mode, regno2);
4384 	  rtx mem2;
4385 
4386 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4387 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4388 						     offset));
4389 	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4390 						    reg2));
4391 
4392 	  /* The first part of a frame-related parallel insn is
4393 	     always assumed to be relevant to the frame
4394 	     calculations; subsequent parts are only
4395 	     frame-related if explicitly marked.  */
4396 	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4397 	  regno = regno2;
4398 	}
4399       else
4400 	insn = emit_move_insn (mem, reg);
4401 
4402       RTX_FRAME_RELATED_P (insn) = 1;
4403     }
4404 }
4405 
4406 /* Emit code to restore the callee registers of mode MODE from register
4407    number START up to and including LIMIT.  Restore from the stack offset
4408    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4409    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
4410 
4411 static void
4412 aarch64_restore_callee_saves (machine_mode mode,
4413 			      poly_int64 start_offset, unsigned start,
4414 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
4415 {
4416   rtx base_rtx = stack_pointer_rtx;
4417   unsigned regno;
4418   unsigned regno2;
4419   poly_int64 offset;
4420 
4421   for (regno = aarch64_next_callee_save (start, limit);
4422        regno <= limit;
4423        regno = aarch64_next_callee_save (regno + 1, limit))
4424     {
4425       if (cfun->machine->reg_is_wrapped_separately[regno])
4426        continue;
4427 
4428       rtx reg, mem;
4429 
4430       if (skip_wb
4431 	  && (regno == cfun->machine->frame.wb_candidate1
4432 	      || regno == cfun->machine->frame.wb_candidate2))
4433 	continue;
4434 
4435       reg = gen_rtx_REG (mode, regno);
4436       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4437       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4438 
4439       regno2 = aarch64_next_callee_save (regno + 1, limit);
4440 
4441       if (regno2 <= limit
4442 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
4443 	  && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4444 	      == cfun->machine->frame.reg_offset[regno2]))
4445 	{
4446 	  rtx reg2 = gen_rtx_REG (mode, regno2);
4447 	  rtx mem2;
4448 
4449 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4450 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4451 	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4452 
4453 	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4454 	  regno = regno2;
4455 	}
4456       else
4457 	emit_move_insn (reg, mem);
4458       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4459     }
4460 }
4461 
4462 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4463    of MODE.  */
4464 
4465 static inline bool
4466 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4467 {
4468   HOST_WIDE_INT multiple;
4469   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4470 	  && IN_RANGE (multiple, -8, 7));
4471 }
4472 
4473 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4474    of MODE.  */
4475 
4476 static inline bool
4477 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4478 {
4479   HOST_WIDE_INT multiple;
4480   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4481 	  && IN_RANGE (multiple, 0, 63));
4482 }
4483 
4484 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4485    of MODE.  */
4486 
4487 bool
4488 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4489 {
4490   HOST_WIDE_INT multiple;
4491   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4492 	  && IN_RANGE (multiple, -64, 63));
4493 }
4494 
4495 /* Return true if OFFSET is a signed 9-bit value.  */
4496 
4497 static inline bool
4498 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4499 			       poly_int64 offset)
4500 {
4501   HOST_WIDE_INT const_offset;
4502   return (offset.is_constant (&const_offset)
4503 	  && IN_RANGE (const_offset, -256, 255));
4504 }
4505 
4506 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4507    of MODE.  */
4508 
4509 static inline bool
4510 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4511 {
4512   HOST_WIDE_INT multiple;
4513   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4514 	  && IN_RANGE (multiple, -256, 255));
4515 }
4516 
4517 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4518    of MODE.  */
4519 
4520 static inline bool
4521 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4522 {
4523   HOST_WIDE_INT multiple;
4524   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4525 	  && IN_RANGE (multiple, 0, 4095));
4526 }
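
/* As a concrete illustration (assuming an 8-byte DImode access): the scaled
   predicates above accept multiples of 8 in the ranges [-64, 56] (4-bit),
   [0, 504] (6-bit), [-512, 504] (7-bit), [-2048, 2040] (9-bit) and
   [0, 32760] (12-bit), while the unscaled 9-bit form accepts any byte
   offset in [-256, 255].  */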
4527 
4528 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
4529 
4530 static sbitmap
4531 aarch64_get_separate_components (void)
4532 {
4533   aarch64_layout_frame ();
4534 
4535   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4536   bitmap_clear (components);
4537 
4538   /* The registers we need saved to the frame.  */
4539   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4540     if (aarch64_register_saved_on_entry (regno))
4541       {
4542 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4543 	if (!frame_pointer_needed)
4544 	  offset += cfun->machine->frame.frame_size
4545 		    - cfun->machine->frame.hard_fp_offset;
4546 	/* Check that we can access the stack slot of the register with one
4547 	   direct load with no adjustments needed.  */
4548 	if (offset_12bit_unsigned_scaled_p (DImode, offset))
4549 	  bitmap_set_bit (components, regno);
4550       }
4551 
4552   /* Don't mess with the hard frame pointer.  */
4553   if (frame_pointer_needed)
4554     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4555 
4556   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4557   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4558   /* If aarch64_layout_frame has chosen registers to store/restore with
4559      writeback, don't interfere with them to avoid having to output explicit
4560      stack adjustment instructions.  */
4561   if (reg2 != INVALID_REGNUM)
4562     bitmap_clear_bit (components, reg2);
4563   if (reg1 != INVALID_REGNUM)
4564     bitmap_clear_bit (components, reg1);
4565 
4566   bitmap_clear_bit (components, LR_REGNUM);
4567   bitmap_clear_bit (components, SP_REGNUM);
4568 
4569   return components;
4570 }
4571 
4572 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
4573 
4574 static sbitmap
4575 aarch64_components_for_bb (basic_block bb)
4576 {
4577   bitmap in = DF_LIVE_IN (bb);
4578   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4579   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4580 
4581   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4582   bitmap_clear (components);
4583 
4584   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
4585   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4586     if ((!call_used_regs[regno])
4587        && (bitmap_bit_p (in, regno)
4588 	   || bitmap_bit_p (gen, regno)
4589 	   || bitmap_bit_p (kill, regno)))
4590       {
4591 	unsigned regno2, offset, offset2;
4592 	bitmap_set_bit (components, regno);
4593 
4594 	/* If there is a callee-save at an adjacent offset, add it too
4595 	   to increase the use of LDP/STP.  */
4596 	offset = cfun->machine->frame.reg_offset[regno];
4597 	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4598 
4599 	if (regno2 <= LAST_SAVED_REGNUM)
4600 	  {
4601 	    offset2 = cfun->machine->frame.reg_offset[regno2];
4602 	    if ((offset & ~8) == (offset2 & ~8))
4603 	      bitmap_set_bit (components, regno2);
4604 	  }
4605       }
4606 
4607   return components;
4608 }
4609 
4610 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4611    Nothing to do for aarch64.  */
4612 
4613 static void
4614 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4615 {
4616 }
4617 
4618 /* Return the next set bit in BMP from START onwards.  Return the total number
4619    of bits in BMP if no set bit is found at or after START.  */
4620 
4621 static unsigned int
4622 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4623 {
4624   unsigned int nbits = SBITMAP_SIZE (bmp);
4625   if (start == nbits)
4626     return start;
4627 
4628   gcc_assert (start < nbits);
4629   for (unsigned int i = start; i < nbits; i++)
4630     if (bitmap_bit_p (bmp, i))
4631       return i;
4632 
4633   return nbits;
4634 }
4635 
4636 /* Do the work for aarch64_emit_prologue_components and
4637    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
4638    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4639    for these components or the epilogue sequence.  That is, it determines
4640    whether we should emit stores or loads and what kind of CFA notes to attach
4641    to the insns.  Otherwise the logic for the two sequences is very
4642    similar.  */
4643 
4644 static void
4645 aarch64_process_components (sbitmap components, bool prologue_p)
4646 {
4647   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4648 			     ? HARD_FRAME_POINTER_REGNUM
4649 			     : STACK_POINTER_REGNUM);
4650 
4651   unsigned last_regno = SBITMAP_SIZE (components);
4652   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4653   rtx_insn *insn = NULL;
4654 
4655   while (regno != last_regno)
4656     {
4657       /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4658 	 so DFmode for the vector registers is enough.  */
4659       machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4660       rtx reg = gen_rtx_REG (mode, regno);
4661       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4662       if (!frame_pointer_needed)
4663 	offset += cfun->machine->frame.frame_size
4664 		  - cfun->machine->frame.hard_fp_offset;
4665       rtx addr = plus_constant (Pmode, ptr_reg, offset);
4666       rtx mem = gen_frame_mem (mode, addr);
4667 
4668       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4669       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4670       /* No more registers to handle after REGNO.
4671 	 Emit a single save/restore and exit.  */
4672       if (regno2 == last_regno)
4673 	{
4674 	  insn = emit_insn (set);
4675 	  RTX_FRAME_RELATED_P (insn) = 1;
4676 	  if (prologue_p)
4677 	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4678 	  else
4679 	    add_reg_note (insn, REG_CFA_RESTORE, reg);
4680 	  break;
4681 	}
4682 
4683       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4684       /* The next register is not of the same class or its offset is not
4685 	 mergeable with the current one into a pair.  */
4686       if (!satisfies_constraint_Ump (mem)
4687 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4688 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4689 		       GET_MODE_SIZE (mode)))
4690 	{
4691 	  insn = emit_insn (set);
4692 	  RTX_FRAME_RELATED_P (insn) = 1;
4693 	  if (prologue_p)
4694 	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4695 	  else
4696 	    add_reg_note (insn, REG_CFA_RESTORE, reg);
4697 
4698 	  regno = regno2;
4699 	  continue;
4700 	}
4701 
4702       /* REGNO2 can be saved/restored in a pair with REGNO.  */
4703       rtx reg2 = gen_rtx_REG (mode, regno2);
4704       if (!frame_pointer_needed)
4705 	offset2 += cfun->machine->frame.frame_size
4706 		  - cfun->machine->frame.hard_fp_offset;
4707       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4708       rtx mem2 = gen_frame_mem (mode, addr2);
4709       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4710 			     : gen_rtx_SET (reg2, mem2);
4711 
4712       if (prologue_p)
4713 	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4714       else
4715 	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4716 
4717       RTX_FRAME_RELATED_P (insn) = 1;
4718       if (prologue_p)
4719 	{
4720 	  add_reg_note (insn, REG_CFA_OFFSET, set);
4721 	  add_reg_note (insn, REG_CFA_OFFSET, set2);
4722 	}
4723       else
4724 	{
4725 	  add_reg_note (insn, REG_CFA_RESTORE, reg);
4726 	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
4727 	}
4728 
4729       regno = aarch64_get_next_set_bit (components, regno2 + 1);
4730     }
4731 }
4732 
4733 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
4734 
4735 static void
4736 aarch64_emit_prologue_components (sbitmap components)
4737 {
4738   aarch64_process_components (components, true);
4739 }
4740 
4741 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
4742 
4743 static void
4744 aarch64_emit_epilogue_components (sbitmap components)
4745 {
4746   aarch64_process_components (components, false);
4747 }
4748 
4749 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
4750 
4751 static void
4752 aarch64_set_handled_components (sbitmap components)
4753 {
4754   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4755     if (bitmap_bit_p (components, regno))
4756       cfun->machine->reg_is_wrapped_separately[regno] = true;
4757 }
4758 
4759 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4760    is saved at BASE + OFFSET.  */
4761 
4762 static void
4763 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4764 			    rtx base, poly_int64 offset)
4765 {
4766   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4767   add_reg_note (insn, REG_CFA_EXPRESSION,
4768 		gen_rtx_SET (mem, regno_reg_rtx[reg]));
4769 }
4770 
4771 /* AArch64 stack frames generated by this compiler look like:
4772 
4773 	+-------------------------------+
4774 	|                               |
4775 	|  incoming stack arguments     |
4776 	|                               |
4777 	+-------------------------------+
4778 	|                               | <-- incoming stack pointer (aligned)
4779 	|  callee-allocated save area   |
4780 	|  for register varargs         |
4781 	|                               |
4782 	+-------------------------------+
4783 	|  local variables              | <-- frame_pointer_rtx
4784 	|                               |
4785 	+-------------------------------+
4786 	|  padding0                     | \
4787 	+-------------------------------+  |
4788 	|  callee-saved registers       |  | frame.saved_regs_size
4789 	+-------------------------------+  |
4790 	|  LR'                          |  |
4791 	+-------------------------------+  |
4792 	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
4793         +-------------------------------+
4794 	|  dynamic allocation           |
4795 	+-------------------------------+
4796 	|  padding                      |
4797 	+-------------------------------+
4798 	|  outgoing stack arguments     | <-- arg_pointer
4799         |                               |
4800 	+-------------------------------+
4801 	|                               | <-- stack_pointer_rtx (aligned)
4802 
4803    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4804    but leave frame_pointer_rtx and hard_frame_pointer_rtx
4805    unchanged.  */
4806 
4807 /* Generate the prologue instructions for entry into a function.
4808    Establish the stack frame by decreasing the stack pointer with a
4809    properly calculated size and, if necessary, create a frame record
4810    filled with the values of LR and previous frame pointer.  The
4811    current FP is also set up if it is in use.  */
4812 
4813 void
4814 aarch64_expand_prologue (void)
4815 {
4816   aarch64_layout_frame ();
4817 
4818   poly_int64 frame_size = cfun->machine->frame.frame_size;
4819   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4820   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4821   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4822   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4823   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4824   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4825   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4826   rtx_insn *insn;
4827 
4828   /* Sign return address for functions.  */
4829   if (aarch64_return_address_signing_enabled ())
4830     {
4831       insn = emit_insn (gen_pacisp ());
4832       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4833       RTX_FRAME_RELATED_P (insn) = 1;
4834     }
4835 
4836   if (flag_stack_usage_info)
4837     current_function_static_stack_size = constant_lower_bound (frame_size);
4838 
4839   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4840     {
4841       if (crtl->is_leaf && !cfun->calls_alloca)
4842 	{
4843 	  if (maybe_gt (frame_size, PROBE_INTERVAL)
4844 	      && maybe_gt (frame_size, get_stack_check_protect ()))
4845 	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
4846 					    (frame_size
4847 					     - get_stack_check_protect ()));
4848 	}
4849       else if (maybe_gt (frame_size, 0))
4850 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4851     }
4852 
4853   rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4854   rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4855 
4856   aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4857 
4858   if (callee_adjust != 0)
4859     aarch64_push_regs (reg1, reg2, callee_adjust);
4860 
4861   if (emit_frame_chain)
4862     {
4863       poly_int64 reg_offset = callee_adjust;
4864       if (callee_adjust == 0)
4865 	{
4866 	  reg1 = R29_REGNUM;
4867 	  reg2 = R30_REGNUM;
4868 	  reg_offset = callee_offset;
4869 	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4870 	}
4871       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4872 			  stack_pointer_rtx, callee_offset,
4873 			  ip1_rtx, ip0_rtx, frame_pointer_needed);
4874       if (frame_pointer_needed && !frame_size.is_constant ())
4875 	{
4876 	  /* Variable-sized frames need to describe the save slot
4877 	     address using DW_CFA_expression rather than DW_CFA_offset.
4878 	     This means that, without taking further action, the
4879 	     locations of the registers that we've already saved would
4880 	     remain based on the stack pointer even after we redefine
4881 	     the CFA based on the frame pointer.  We therefore need new
4882 	     DW_CFA_expressions to re-express the save slots with addresses
4883 	     based on the frame pointer.  */
4884 	  rtx_insn *insn = get_last_insn ();
4885 	  gcc_assert (RTX_FRAME_RELATED_P (insn));
4886 
4887 	  /* Add an explicit CFA definition if this was previously
4888 	     implicit.  */
4889 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4890 	    {
4891 	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
4892 				       callee_offset);
4893 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
4894 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
4895 	    }
4896 
4897 	  /* Change the save slot expressions for the registers that
4898 	     we've already saved.  */
4899 	  reg_offset -= callee_offset;
4900 	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4901 				      reg_offset + UNITS_PER_WORD);
4902 	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4903 				      reg_offset);
4904 	}
4905       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4906     }
4907 
4908   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4909 			     callee_adjust != 0 || emit_frame_chain);
4910   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4911 			     callee_adjust != 0 || emit_frame_chain);
4912   aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4913 }
4914 
4915 /* Return TRUE if we can use a simple_return insn.
4916 
4917    This function checks whether the callee-saved stack is empty, which
4918    means that no restore actions are needed.  The pro_and_epilogue pass
4919    uses this to check whether shrink-wrapping is feasible.  */
4920 
4921 bool
4922 aarch64_use_return_insn_p (void)
4923 {
4924   if (!reload_completed)
4925     return false;
4926 
4927   if (crtl->profile)
4928     return false;
4929 
4930   aarch64_layout_frame ();
4931 
4932   return known_eq (cfun->machine->frame.frame_size, 0);
4933 }
4934 
4935 /* Generate the epilogue instructions for returning from a function.
4936    This is almost exactly the reverse of the prologue sequence, except
4937    that we need to insert barriers to avoid scheduling loads that read
4938    from a deallocated stack, and we optimize the unwind records by
4939    emitting them all together if possible.  */
4940 void
4941 aarch64_expand_epilogue (bool for_sibcall)
4942 {
4943   aarch64_layout_frame ();
4944 
4945   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4946   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4947   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4948   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4949   unsigned reg1 = cfun->machine->frame.wb_candidate1;
4950   unsigned reg2 = cfun->machine->frame.wb_candidate2;
4951   rtx cfi_ops = NULL;
4952   rtx_insn *insn;
4953   /* A stack clash protection prologue may not have left IP0_REGNUM or
4954      IP1_REGNUM in a usable state.  The same is true for allocations
4955      with an SVE component, since we then need both temporary registers
4956      for each allocation.  */
4957   bool can_inherit_p = (initial_adjust.is_constant ()
4958 			&& final_adjust.is_constant ()
4959 			&& !flag_stack_clash_protection);
4960 
4961   /* We need a memory barrier to prevent reads from the deallocated stack.  */
4962   bool need_barrier_p
4963     = maybe_ne (get_frame_size ()
4964 		+ cfun->machine->frame.saved_varargs_size, 0);
4965 
4966   /* Emit a barrier to prevent loads from a deallocated stack.  */
4967   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4968       || cfun->calls_alloca
4969       || crtl->calls_eh_return)
4970     {
4971       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4972       need_barrier_p = false;
4973     }
4974 
4975   /* Restore the stack pointer from the frame pointer if it may not
4976      be the same as the stack pointer.  */
4977   rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4978   rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4979   if (frame_pointer_needed
4980       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4981     /* If writeback is used when restoring callee-saves, the CFA
4982        is restored on the instruction doing the writeback.  */
4983     aarch64_add_offset (Pmode, stack_pointer_rtx,
4984 			hard_frame_pointer_rtx, -callee_offset,
4985 			ip1_rtx, ip0_rtx, callee_adjust == 0);
4986   else
4987     aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4988 		    !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4989 
4990   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4991 				callee_adjust != 0, &cfi_ops);
4992   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4993 				callee_adjust != 0, &cfi_ops);
4994 
4995   if (need_barrier_p)
4996     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4997 
4998   if (callee_adjust != 0)
4999     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5000 
5001   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5002     {
5003       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
5004       insn = get_last_insn ();
5005       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5006       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5007       RTX_FRAME_RELATED_P (insn) = 1;
5008       cfi_ops = NULL;
5009     }
5010 
5011   aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5012 		  !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5013 
5014   if (cfi_ops)
5015     {
5016       /* Emit delayed restores and reset the CFA to be SP.  */
5017       insn = get_last_insn ();
5018       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5019       REG_NOTES (insn) = cfi_ops;
5020       RTX_FRAME_RELATED_P (insn) = 1;
5021     }
5022 
5023   /* We prefer to emit the combined return/authenticate instruction RETAA;
5024      however, there are three cases in which we must instead emit an explicit
5025      authentication instruction.
5026 
5027 	1) Sibcalls don't return in a normal way, so if we're about to call one
5028 	   we must authenticate.
5029 
5030 	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5031 	   generating code for !TARGET_ARMV8_3 we can't use it and must
5032 	   explicitly authenticate.
5033 
5034 	3) On an eh_return path we make extra stack adjustments to update the
5035 	   canonical frame address to be the exception handler's CFA.  We want
5036 	   to authenticate using the CFA of the function which calls eh_return.
5037     */
5038   if (aarch64_return_address_signing_enabled ()
5039       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5040     {
5041       insn = emit_insn (gen_autisp ());
5042       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5043       RTX_FRAME_RELATED_P (insn) = 1;
5044     }
5045 
5046   /* Stack adjustment for exception handler.  */
5047   if (crtl->calls_eh_return)
5048     {
5049       /* We need to unwind the stack by the offset computed by
5050 	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
5051 	 to be SP; letting the CFA move during this adjustment
5052 	 is just as correct as retaining the CFA from the body
5053 	 of the function.  Therefore, do nothing special.  */
5054       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5055     }
5056 
5057   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5058   if (!for_sibcall)
5059     emit_jump_insn (ret_rtx);
5060 }
5061 
5062 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
5063    normally or return to a previous frame after unwinding.
5064 
5065    An EH return uses a single shared return sequence.  The epilogue is
5066    exactly like a normal epilogue except that it has an extra input
5067    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5068    that must be applied after the frame has been destroyed.  An extra label
5069    is inserted before the epilogue which initializes this register to zero,
5070    and this is the entry point for a normal return.
5071 
5072    An actual EH return updates the return address, initializes the stack
5073    adjustment and jumps directly into the epilogue (bypassing the zeroing
5074    of the adjustment).  Since the return address is typically saved on the
5075    stack when a function makes a call, the saved LR must be updated outside
5076    the epilogue.
5077 
5078    This poses problems as the store is generated well before the epilogue,
5079    so the offset of LR is not known yet.  Also optimizations will remove the
5080    store as it appears dead, even after the epilogue is generated (as the
5081    base or offset for loading LR is different in many cases).
5082 
5083    To avoid these problems this implementation forces the frame pointer
5084    in eh_return functions so that the location of LR is fixed and known early.
5085    It also marks the store volatile, so no optimization is permitted to
5086    remove the store.  */
5087 rtx
5088 aarch64_eh_return_handler_rtx (void)
5089 {
5090   rtx tmp = gen_frame_mem (Pmode,
5091     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5092 
5093   /* Mark the store volatile, so no optimization is permitted to remove it.  */
5094   MEM_VOLATILE_P (tmp) = true;
5095   return tmp;
5096 }
5097 
5098 /* Output code to add DELTA to the first argument, and then jump
5099    to FUNCTION.  Used for C++ multiple inheritance.  */
5100 static void
5101 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5102 			 HOST_WIDE_INT delta,
5103 			 HOST_WIDE_INT vcall_offset,
5104 			 tree function)
5105 {
5106   /* The this pointer is always in x0.  Note that this differs from
5107      Arm where the this pointer may be bumped to r1 if r0 is required
5108      to return a pointer to an aggregate.  On AArch64 a result value
5109      pointer will be in x8.  */
5110   int this_regno = R0_REGNUM;
5111   rtx this_rtx, temp0, temp1, addr, funexp;
5112   rtx_insn *insn;
5113 
5114   reload_completed = 1;
5115   emit_note (NOTE_INSN_PROLOGUE_END);
5116 
5117   this_rtx = gen_rtx_REG (Pmode, this_regno);
5118   temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5119   temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5120 
5121   if (vcall_offset == 0)
5122     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5123   else
5124     {
5125       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5126 
5127       addr = this_rtx;
5128       if (delta != 0)
5129 	{
5130 	  if (delta >= -256 && delta < 256)
5131 	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5132 				       plus_constant (Pmode, this_rtx, delta));
5133 	  else
5134 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5135 				temp1, temp0, false);
5136 	}
5137 
5138       if (Pmode == ptr_mode)
5139 	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5140       else
5141 	aarch64_emit_move (temp0,
5142 			   gen_rtx_ZERO_EXTEND (Pmode,
5143 						gen_rtx_MEM (ptr_mode, addr)));
5144 
5145       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5146 	  addr = plus_constant (Pmode, temp0, vcall_offset);
5147       else
5148 	{
5149 	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5150 					  Pmode);
5151 	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5152 	}
5153 
5154       if (Pmode == ptr_mode)
5155 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5156       else
5157 	aarch64_emit_move (temp1,
5158 			   gen_rtx_SIGN_EXTEND (Pmode,
5159 						gen_rtx_MEM (ptr_mode, addr)));
5160 
5161       emit_insn (gen_add2_insn (this_rtx, temp1));
5162     }
5163 
5164   /* Generate a tail call to the target function.  */
5165   if (!TREE_USED (function))
5166     {
5167       assemble_external (function);
5168       TREE_USED (function) = 1;
5169     }
5170   funexp = XEXP (DECL_RTL (function), 0);
5171   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5172   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5173   SIBLING_CALL_P (insn) = 1;
5174 
5175   insn = get_insns ();
5176   shorten_branches (insn);
5177   final_start_function (insn, file, 1);
5178   final (insn, file, 1);
5179   final_end_function ();
5180 
5181   /* Stop pretending to be a post-reload pass.  */
5182   reload_completed = 0;
5183 }
5184 
5185 static bool
5186 aarch64_tls_referenced_p (rtx x)
5187 {
5188   if (!TARGET_HAVE_TLS)
5189     return false;
5190   subrtx_iterator::array_type array;
5191   FOR_EACH_SUBRTX (iter, array, x, ALL)
5192     {
5193       const_rtx x = *iter;
5194       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5195 	return true;
5196       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5197 	 TLS offsets, not real symbol references.  */
5198       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5199 	iter.skip_subrtxes ();
5200     }
5201   return false;
5202 }
5203 
5204 
5205 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5206    a left shift of 0 or 12 bits.  */
5207 bool
5208 aarch64_uimm12_shift (HOST_WIDE_INT val)
5209 {
5210   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5211 	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5212 	  );
5213 }
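
/* Worked examples for the check above (illustrative only):

     0xabc     -> true   (fits in the low 12 bits)
     0xabc000  -> true   (0xabc shifted left by 12)
     0x1001    -> false  (bits straddle the two 12-bit windows)

   These correspond to the immediate forms accepted by ADD/SUB (immediate),
   i.e. "#imm12" and "#imm12, lsl #12".  */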
5214 
5215 
5216 /* Return true if val is an immediate that can be loaded into a
5217    register by a MOVZ instruction.  */
5218 static bool
5219 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5220 {
5221   if (GET_MODE_SIZE (mode) > 4)
5222     {
5223       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5224 	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5225 	return 1;
5226     }
5227   else
5228     {
5229       /* Ignore sign extension.  */
5230       val &= (HOST_WIDE_INT) 0xffffffff;
5231     }
5232   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5233 	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5234 }
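
/* Worked examples for aarch64_movw_imm (illustrative only), DImode unless
   noted:

     0xabcd          -> true   (MOVZ x, #0xabcd)
     0xabcd0000      -> true   (MOVZ x, #0xabcd, lsl #16)
     0xabcd00000000  -> true   (MOVZ x, #0xabcd, lsl #32)
     0x10001         -> false  (needs a MOVK as well)

   aarch64_move_imm below also tries the complement, so a value such as
   0xffffffffffff1234 is still a single instruction via MOVN.  */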
5235 
5236 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
5237    64-bit (DImode) integer.  */
5238 
5239 static unsigned HOST_WIDE_INT
5240 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5241 {
5242   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5243   while (size < 64)
5244     {
5245       val &= (HOST_WIDE_INT_1U << size) - 1;
5246       val |= val << size;
5247       size *= 2;
5248     }
5249   return val;
5250 }
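
/* For example (illustrative only), a 16-bit element value of 0x00ff becomes
   0x00ff00ff00ff00ff and a 32-bit element value of 0x0000ff00 becomes
   0x0000ff000000ff00, while DImode values are returned unchanged.  */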
5251 
5252 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
5253 
5254 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5255   {
5256     0x0000000100000001ull,
5257     0x0001000100010001ull,
5258     0x0101010101010101ull,
5259     0x1111111111111111ull,
5260     0x5555555555555555ull,
5261   };
5262 
5263 
5264 /* Return true if val is a valid bitmask immediate.  */
5265 
5266 bool
5267 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5268 {
5269   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5270   int bits;
5271 
5272   /* Check for a single sequence of one bits and return quickly if so.
5273      The special cases of all ones and all zeroes return false.  */
5274   val = aarch64_replicate_bitmask_imm (val_in, mode);
5275   tmp = val + (val & -val);
5276 
5277   if (tmp == (tmp & -tmp))
5278     return (val + 1) > 1;
5279 
5280   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
5281   if (mode == SImode)
5282     val = (val << 32) | (val & 0xffffffff);
5283 
5284   /* Invert if the immediate doesn't start with a zero bit - this means we
5285      only need to search for sequences of one bits.  */
5286   if (val & 1)
5287     val = ~val;
5288 
5289   /* Find the first set bit and set tmp to val with the first sequence of one
5290      bits removed.  Return success if there is a single sequence of ones.  */
5291   first_one = val & -val;
5292   tmp = val & (val + first_one);
5293 
5294   if (tmp == 0)
5295     return true;
5296 
5297   /* Find the next set bit and compute the difference in bit position.  */
5298   next_one = tmp & -tmp;
5299   bits = clz_hwi (first_one) - clz_hwi (next_one);
5300   mask = val ^ tmp;
5301 
5302   /* Check the bit position difference is a power of 2, and that the first
5303      sequence of one bits fits within 'bits' bits.  */
5304   if ((mask >> bits) != 0 || bits != (bits & -bits))
5305     return false;
5306 
5307   /* Check the sequence of one bits is repeated 64/bits times.  */
5308   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5309 }
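
/* Worked examples for the test above (DImode, illustrative only):

     0x3f0               -> true   (single run of six ones; fast path)
     0x00ff00ff00ff00ff  -> true   (run of eight ones repeating every 16 bits)
     0x101               -> false  (pattern does not repeat across 64 bits)

   These match the immediates encodable by AND/ORR/EOR (immediate).  */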
5310 
5311 /* Create a mask of ones covering the range from the lowest to the highest
5312    bit set in VAL_IN.  Assumed precondition: VAL_IN is not zero.  */
5313 
5314 unsigned HOST_WIDE_INT
5315 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5316 {
5317   int lowest_bit_set = ctz_hwi (val_in);
5318   int highest_bit_set = floor_log2 (val_in);
5319   gcc_assert (val_in != 0);
5320 
5321   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5322 	  (HOST_WIDE_INT_1U << lowest_bit_set));
5323 }
5324 
5325 /* Create a constant in which all bits outside the range from the lowest
5326    to the highest bit set in VAL_IN are set to 1.  */
5327 
5328 unsigned HOST_WIDE_INT
5329 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5330 {
5331   return val_in | ~aarch64_and_split_imm1 (val_in);
5332 }
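
/* The two helpers above split one AND with an awkward immediate into two
   ANDs with bitmask immediates: by construction
   aarch64_and_split_imm1 (x) & aarch64_and_split_imm2 (x) == x.
   Illustrative example: for VAL_IN == 0x0003000000000001, imm1 is
   0x0003ffffffffffff (a contiguous run of ones) and imm2 is
   0xffff000000000001 (the complement of a contiguous run), and ANDing with
   both reproduces an AND with the original value.  */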
5333 
5334 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
5335 
5336 bool
5337 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5338 {
5339   scalar_int_mode int_mode;
5340   if (!is_a <scalar_int_mode> (mode, &int_mode))
5341     return false;
5342 
5343   if (aarch64_bitmask_imm (val_in, int_mode))
5344     return false;
5345 
5346   if (aarch64_move_imm (val_in, int_mode))
5347     return false;
5348 
5349   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5350 
5351   return aarch64_bitmask_imm (imm2, int_mode);
5352 }
5353 
5354 /* Return true if val is an immediate that can be loaded into a
5355    register in a single instruction.  */
5356 bool
5357 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5358 {
5359   scalar_int_mode int_mode;
5360   if (!is_a <scalar_int_mode> (mode, &int_mode))
5361     return false;
5362 
5363   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5364     return 1;
5365   return aarch64_bitmask_imm (val, int_mode);
5366 }
5367 
5368 static bool
5369 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5370 {
5371   rtx base, offset;
5372 
5373   if (GET_CODE (x) == HIGH)
5374     return true;
5375 
5376   /* There's no way to calculate VL-based values using relocations.  */
5377   subrtx_iterator::array_type array;
5378   FOR_EACH_SUBRTX (iter, array, x, ALL)
5379     if (GET_CODE (*iter) == CONST_POLY_INT)
5380       return true;
5381 
5382   split_const (x, &base, &offset);
5383   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5384     {
5385       if (aarch64_classify_symbol (base, INTVAL (offset))
5386 	  != SYMBOL_FORCE_TO_MEM)
5387 	return true;
5388       else
5389 	/* Avoid generating a 64-bit relocation in ILP32; leave
5390 	   to aarch64_expand_mov_immediate to handle it properly.  */
5391 	return mode != ptr_mode;
5392     }
5393 
5394   return aarch64_tls_referenced_p (x);
5395 }
5396 
5397 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5398    The expansion of a table switch is quite expensive due to the number of
5399    instructions, the table lookup and the hard-to-predict indirect jump.
5400    When optimizing for speed at -O3 and above, use the per-core tuning if
5401    it is set; otherwise use tables for more than 16 cases as a tradeoff
5402    between size and performance.  When optimizing for size, use the default setting.  */
5403 
5404 static unsigned int
5405 aarch64_case_values_threshold (void)
5406 {
5407   /* Use the specified limit for the number of cases before using jump
5408      tables at higher optimization levels.  */
5409   if (optimize > 2
5410       && selected_cpu->tune->max_case_values != 0)
5411     return selected_cpu->tune->max_case_values;
5412   else
5413     return optimize_size ? default_case_values_threshold () : 17;
5414 }
5415 
5416 /* Return true if register REGNO is a valid index register.
5417    STRICT_P is true if REG_OK_STRICT is in effect.  */
5418 
5419 bool
5420 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5421 {
5422   if (!HARD_REGISTER_NUM_P (regno))
5423     {
5424       if (!strict_p)
5425 	return true;
5426 
5427       if (!reg_renumber)
5428 	return false;
5429 
5430       regno = reg_renumber[regno];
5431     }
5432   return GP_REGNUM_P (regno);
5433 }
5434 
5435 /* Return true if register REGNO is a valid base register for mode MODE.
5436    STRICT_P is true if REG_OK_STRICT is in effect.  */
5437 
5438 bool
5439 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5440 {
5441   if (!HARD_REGISTER_NUM_P (regno))
5442     {
5443       if (!strict_p)
5444 	return true;
5445 
5446       if (!reg_renumber)
5447 	return false;
5448 
5449       regno = reg_renumber[regno];
5450     }
5451 
5452   /* The fake registers will be eliminated to either the stack or
5453      hard frame pointer, both of which are usually valid base registers.
5454      Reload deals with the cases where the eliminated form isn't valid.  */
5455   return (GP_REGNUM_P (regno)
5456 	  || regno == SP_REGNUM
5457 	  || regno == FRAME_POINTER_REGNUM
5458 	  || regno == ARG_POINTER_REGNUM);
5459 }
5460 
5461 /* Return true if X is a valid base register for mode MODE.
5462    STRICT_P is true if REG_OK_STRICT is in effect.  */
5463 
5464 static bool
5465 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5466 {
5467   if (!strict_p
5468       && GET_CODE (x) == SUBREG
5469       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5470     x = SUBREG_REG (x);
5471 
5472   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5473 }
5474 
5475 /* Return true if address offset is a valid index.  If it is, fill in INFO
5476    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
5477 
5478 static bool
5479 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5480 			machine_mode mode, bool strict_p)
5481 {
5482   enum aarch64_address_type type;
5483   rtx index;
5484   int shift;
5485 
5486   /* (reg:P) */
5487   if ((REG_P (x) || GET_CODE (x) == SUBREG)
5488       && GET_MODE (x) == Pmode)
5489     {
5490       type = ADDRESS_REG_REG;
5491       index = x;
5492       shift = 0;
5493     }
5494   /* (sign_extend:DI (reg:SI)) */
5495   else if ((GET_CODE (x) == SIGN_EXTEND
5496 	    || GET_CODE (x) == ZERO_EXTEND)
5497 	   && GET_MODE (x) == DImode
5498 	   && GET_MODE (XEXP (x, 0)) == SImode)
5499     {
5500       type = (GET_CODE (x) == SIGN_EXTEND)
5501 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5502       index = XEXP (x, 0);
5503       shift = 0;
5504     }
5505   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5506   else if (GET_CODE (x) == MULT
5507 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5508 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5509 	   && GET_MODE (XEXP (x, 0)) == DImode
5510 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5511 	   && CONST_INT_P (XEXP (x, 1)))
5512     {
5513       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5514 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5515       index = XEXP (XEXP (x, 0), 0);
5516       shift = exact_log2 (INTVAL (XEXP (x, 1)));
5517     }
5518   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5519   else if (GET_CODE (x) == ASHIFT
5520 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5521 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5522 	   && GET_MODE (XEXP (x, 0)) == DImode
5523 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5524 	   && CONST_INT_P (XEXP (x, 1)))
5525     {
5526       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5527 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5528       index = XEXP (XEXP (x, 0), 0);
5529       shift = INTVAL (XEXP (x, 1));
5530     }
5531   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5532   else if ((GET_CODE (x) == SIGN_EXTRACT
5533 	    || GET_CODE (x) == ZERO_EXTRACT)
5534 	   && GET_MODE (x) == DImode
5535 	   && GET_CODE (XEXP (x, 0)) == MULT
5536 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5537 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5538     {
5539       type = (GET_CODE (x) == SIGN_EXTRACT)
5540 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5541       index = XEXP (XEXP (x, 0), 0);
5542       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5543       if (INTVAL (XEXP (x, 1)) != 32 + shift
5544 	  || INTVAL (XEXP (x, 2)) != 0)
5545 	shift = -1;
5546     }
5547   /* (and:DI (mult:DI (reg:DI) (const_int scale))
5548      (const_int 0xffffffff<<shift)) */
5549   else if (GET_CODE (x) == AND
5550 	   && GET_MODE (x) == DImode
5551 	   && GET_CODE (XEXP (x, 0)) == MULT
5552 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5553 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5554 	   && CONST_INT_P (XEXP (x, 1)))
5555     {
5556       type = ADDRESS_REG_UXTW;
5557       index = XEXP (XEXP (x, 0), 0);
5558       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5559       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5560 	shift = -1;
5561     }
5562   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5563   else if ((GET_CODE (x) == SIGN_EXTRACT
5564 	    || GET_CODE (x) == ZERO_EXTRACT)
5565 	   && GET_MODE (x) == DImode
5566 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
5567 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5568 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5569     {
5570       type = (GET_CODE (x) == SIGN_EXTRACT)
5571 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5572       index = XEXP (XEXP (x, 0), 0);
5573       shift = INTVAL (XEXP (XEXP (x, 0), 1));
5574       if (INTVAL (XEXP (x, 1)) != 32 + shift
5575 	  || INTVAL (XEXP (x, 2)) != 0)
5576 	shift = -1;
5577     }
5578   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5579      (const_int 0xffffffff<<shift)) */
5580   else if (GET_CODE (x) == AND
5581 	   && GET_MODE (x) == DImode
5582 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
5583 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5584 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5585 	   && CONST_INT_P (XEXP (x, 1)))
5586     {
5587       type = ADDRESS_REG_UXTW;
5588       index = XEXP (XEXP (x, 0), 0);
5589       shift = INTVAL (XEXP (XEXP (x, 0), 1));
5590       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5591 	shift = -1;
5592     }
5593   /* (mult:P (reg:P) (const_int scale)) */
5594   else if (GET_CODE (x) == MULT
5595 	   && GET_MODE (x) == Pmode
5596 	   && GET_MODE (XEXP (x, 0)) == Pmode
5597 	   && CONST_INT_P (XEXP (x, 1)))
5598     {
5599       type = ADDRESS_REG_REG;
5600       index = XEXP (x, 0);
5601       shift = exact_log2 (INTVAL (XEXP (x, 1)));
5602     }
5603   /* (ashift:P (reg:P) (const_int shift)) */
5604   else if (GET_CODE (x) == ASHIFT
5605 	   && GET_MODE (x) == Pmode
5606 	   && GET_MODE (XEXP (x, 0)) == Pmode
5607 	   && CONST_INT_P (XEXP (x, 1)))
5608     {
5609       type = ADDRESS_REG_REG;
5610       index = XEXP (x, 0);
5611       shift = INTVAL (XEXP (x, 1));
5612     }
5613   else
5614     return false;
5615 
5616   if (!strict_p
5617       && GET_CODE (index) == SUBREG
5618       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5619     index = SUBREG_REG (index);
5620 
5621   if (aarch64_sve_data_mode_p (mode))
5622     {
5623       if (type != ADDRESS_REG_REG
5624 	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5625 	return false;
5626     }
5627   else
5628     {
5629       if (shift != 0
5630 	  && !(IN_RANGE (shift, 1, 3)
5631 	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5632 	return false;
5633     }
5634 
5635   if (REG_P (index)
5636       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5637     {
5638       info->type = type;
5639       info->offset = index;
5640       info->shift = shift;
5641       return true;
5642     }
5643 
5644   return false;
5645 }
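
/* For instance (illustrative only), the index expression

     (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4))

   is classified as ADDRESS_REG_SXTW with a shift of 2, which for a 4-byte
   access combines with a base register into an address of the form
   [x0, w1, sxtw 2].  */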
5646 
5647 /* Return true if MODE is one of the modes for which we
5648    support LDP/STP operations.  */
5649 
5650 static bool
5651 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5652 {
5653   return mode == SImode || mode == DImode
5654 	 || mode == SFmode || mode == DFmode
5655 	 || (aarch64_vector_mode_supported_p (mode)
5656 	     && known_eq (GET_MODE_SIZE (mode), 8));
5657 }
5658 
5659 /* Return true if REGNO is a virtual pointer register, or an eliminable
5660    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
5661    include stack_pointer or hard_frame_pointer.  */
5662 static bool
5663 virt_or_elim_regno_p (unsigned regno)
5664 {
5665   return ((regno >= FIRST_VIRTUAL_REGISTER
5666 	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5667 	  || regno == FRAME_POINTER_REGNUM
5668 	  || regno == ARG_POINTER_REGNUM);
5669 }
5670 
5671 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5672    If it is, fill in INFO appropriately.  STRICT_P is true if
5673    REG_OK_STRICT is in effect.  */
5674 
5675 static bool
5676 aarch64_classify_address (struct aarch64_address_info *info,
5677 			  rtx x, machine_mode mode, bool strict_p,
5678 			  aarch64_addr_query_type type = ADDR_QUERY_M)
5679 {
5680   enum rtx_code code = GET_CODE (x);
5681   rtx op0, op1;
5682   poly_int64 offset;
5683 
5684   HOST_WIDE_INT const_size;
5685 
5686   /* On BE, we use load/store pair for all large int mode load/stores.
5687      TI/TFmode may also use a load/store pair.  */
5688   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5689   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5690   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5691 			    || mode == TImode
5692 			    || mode == TFmode
5693 			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5694 
5695   bool allow_reg_index_p = (!load_store_pair_p
5696 			    && (known_lt (GET_MODE_SIZE (mode), 16)
5697 				|| vec_flags == VEC_ADVSIMD
5698 				|| vec_flags == VEC_SVE_DATA));
5699 
5700   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5701      [Rn, #offset, MUL VL].  */
5702   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5703       && (code != REG && code != PLUS))
5704     return false;
5705 
5706   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5707      REG addressing.  */
5708   if (advsimd_struct_p
5709       && !BYTES_BIG_ENDIAN
5710       && (code != POST_INC && code != REG))
5711     return false;
5712 
5713   gcc_checking_assert (GET_MODE (x) == VOIDmode
5714 		       || SCALAR_INT_MODE_P (GET_MODE (x)));
5715 
5716   switch (code)
5717     {
5718     case REG:
5719     case SUBREG:
5720       info->type = ADDRESS_REG_IMM;
5721       info->base = x;
5722       info->offset = const0_rtx;
5723       info->const_offset = 0;
5724       return aarch64_base_register_rtx_p (x, strict_p);
5725 
5726     case PLUS:
5727       op0 = XEXP (x, 0);
5728       op1 = XEXP (x, 1);
5729 
5730       if (! strict_p
5731 	  && REG_P (op0)
5732 	  && virt_or_elim_regno_p (REGNO (op0))
5733 	  && poly_int_rtx_p (op1, &offset))
5734 	{
5735 	  info->type = ADDRESS_REG_IMM;
5736 	  info->base = op0;
5737 	  info->offset = op1;
5738 	  info->const_offset = offset;
5739 
5740 	  return true;
5741 	}
5742 
5743       if (maybe_ne (GET_MODE_SIZE (mode), 0)
5744 	  && aarch64_base_register_rtx_p (op0, strict_p)
5745 	  && poly_int_rtx_p (op1, &offset))
5746 	{
5747 	  info->type = ADDRESS_REG_IMM;
5748 	  info->base = op0;
5749 	  info->offset = op1;
5750 	  info->const_offset = offset;
5751 
5752 	  /* TImode and TFmode values are allowed in both pairs of X
5753 	     registers and individual Q registers.  The available
5754 	     address modes are:
5755 	     X,X: 7-bit signed scaled offset
5756 	     Q:   9-bit signed offset
5757 	     We conservatively require an offset representable in either mode.
5758 	     When performing the check for pairs of X registers i.e.  LDP/STP
5759 	     pass down DImode since that is the natural size of the LDP/STP
5760 	     instruction memory accesses.  */
5761 	  if (mode == TImode || mode == TFmode)
5762 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5763 		    && (offset_9bit_signed_unscaled_p (mode, offset)
5764 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
5765 
5766 	  /* A 7bit offset check because OImode will emit a ldp/stp
5767 	     instruction (only big endian will get here).
5768 	     For ldp/stp instructions, the offset is scaled for the size of a
5769 	     single element of the pair.  */
5770 	  if (mode == OImode)
5771 	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5772 
5773 	  /* Three 9/12 bit offsets checks because CImode will emit three
5774 	     ldr/str instructions (only big endian will get here).  */
5775 	  if (mode == CImode)
5776 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5777 		    && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5778 			|| offset_12bit_unsigned_scaled_p (V16QImode,
5779 							   offset + 32)));
5780 
5781 	  /* Two 7bit offsets checks because XImode will emit two ldp/stp
5782 	     instructions (only big endian will get here).  */
5783 	  if (mode == XImode)
5784 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5785 		    && aarch64_offset_7bit_signed_scaled_p (TImode,
5786 							    offset + 32));
5787 
5788 	  /* Make "m" use the LD1 offset range for SVE data modes, so
5789 	     that pre-RTL optimizers like ivopts will work to that
5790 	     instead of the wider LDR/STR range.  */
5791 	  if (vec_flags == VEC_SVE_DATA)
5792 	    return (type == ADDR_QUERY_M
5793 		    ? offset_4bit_signed_scaled_p (mode, offset)
5794 		    : offset_9bit_signed_scaled_p (mode, offset));
5795 
5796 	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5797 	    {
5798 	      poly_int64 end_offset = (offset
5799 				       + GET_MODE_SIZE (mode)
5800 				       - BYTES_PER_SVE_VECTOR);
5801 	      return (type == ADDR_QUERY_M
5802 		      ? offset_4bit_signed_scaled_p (mode, offset)
5803 		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5804 			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5805 							 end_offset)));
5806 	    }
5807 
5808 	  if (vec_flags == VEC_SVE_PRED)
5809 	    return offset_9bit_signed_scaled_p (mode, offset);
5810 
5811 	  if (load_store_pair_p)
5812 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
5813 		     || known_eq (GET_MODE_SIZE (mode), 8))
5814 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5815 	  else
5816 	    return (offset_9bit_signed_unscaled_p (mode, offset)
5817 		    || offset_12bit_unsigned_scaled_p (mode, offset));
5818 	}
5819 
5820       if (allow_reg_index_p)
5821 	{
5822 	  /* Look for base + (scaled/extended) index register.  */
5823 	  if (aarch64_base_register_rtx_p (op0, strict_p)
5824 	      && aarch64_classify_index (info, op1, mode, strict_p))
5825 	    {
5826 	      info->base = op0;
5827 	      return true;
5828 	    }
5829 	  if (aarch64_base_register_rtx_p (op1, strict_p)
5830 	      && aarch64_classify_index (info, op0, mode, strict_p))
5831 	    {
5832 	      info->base = op1;
5833 	      return true;
5834 	    }
5835 	}
5836 
5837       return false;
5838 
5839     case POST_INC:
5840     case POST_DEC:
5841     case PRE_INC:
5842     case PRE_DEC:
5843       info->type = ADDRESS_REG_WB;
5844       info->base = XEXP (x, 0);
5845       info->offset = NULL_RTX;
5846       return aarch64_base_register_rtx_p (info->base, strict_p);
5847 
5848     case POST_MODIFY:
5849     case PRE_MODIFY:
5850       info->type = ADDRESS_REG_WB;
5851       info->base = XEXP (x, 0);
5852       if (GET_CODE (XEXP (x, 1)) == PLUS
5853 	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5854 	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5855 	  && aarch64_base_register_rtx_p (info->base, strict_p))
5856 	{
5857 	  info->offset = XEXP (XEXP (x, 1), 1);
5858 	  info->const_offset = offset;
5859 
5860 	  /* TImode and TFmode values are allowed in both pairs of X
5861 	     registers and individual Q registers.  The available
5862 	     address modes are:
5863 	     X,X: 7-bit signed scaled offset
5864 	     Q:   9-bit signed offset
5865 	     We conservatively require an offset representable in either mode.
5866 	   */
5867 	  if (mode == TImode || mode == TFmode)
5868 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5869 		    && offset_9bit_signed_unscaled_p (mode, offset));
5870 
5871 	  if (load_store_pair_p)
5872 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
5873 		     || known_eq (GET_MODE_SIZE (mode), 8))
5874 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5875 	  else
5876 	    return offset_9bit_signed_unscaled_p (mode, offset);
5877 	}
5878       return false;
5879 
5880     case CONST:
5881     case SYMBOL_REF:
5882     case LABEL_REF:
5883       /* load literal: pc-relative constant pool entry.  Only supported
5884          for SI mode or larger.  */
5885       info->type = ADDRESS_SYMBOLIC;
5886 
5887       if (!load_store_pair_p
5888 	  && GET_MODE_SIZE (mode).is_constant (&const_size)
5889 	  && const_size >= 4)
5890 	{
5891 	  rtx sym, addend;
5892 
5893 	  split_const (x, &sym, &addend);
5894 	  return ((GET_CODE (sym) == LABEL_REF
5895 		   || (GET_CODE (sym) == SYMBOL_REF
5896 		       && CONSTANT_POOL_ADDRESS_P (sym)
5897 		       && aarch64_pcrelative_literal_loads)));
5898 	}
5899       return false;
5900 
5901     case LO_SUM:
5902       info->type = ADDRESS_LO_SUM;
5903       info->base = XEXP (x, 0);
5904       info->offset = XEXP (x, 1);
5905       if (allow_reg_index_p
5906 	  && aarch64_base_register_rtx_p (info->base, strict_p))
5907 	{
5908 	  rtx sym, offs;
5909 	  split_const (info->offset, &sym, &offs);
5910 	  if (GET_CODE (sym) == SYMBOL_REF
5911 	      && (aarch64_classify_symbol (sym, INTVAL (offs))
5912 		  == SYMBOL_SMALL_ABSOLUTE))
5913 	    {
5914 	      /* The symbol and offset must be aligned to the access size.  */
5915 	      unsigned int align;
5916 
5917 	      if (CONSTANT_POOL_ADDRESS_P (sym))
5918 		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5919 	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5920 		{
5921 		  tree exp = SYMBOL_REF_DECL (sym);
5922 		  align = TYPE_ALIGN (TREE_TYPE (exp));
5923 		  align = aarch64_constant_alignment (exp, align);
5924 		}
5925 	      else if (SYMBOL_REF_DECL (sym))
5926 		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5927 	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5928 		       && SYMBOL_REF_BLOCK (sym) != NULL)
5929 		align = SYMBOL_REF_BLOCK (sym)->alignment;
5930 	      else
5931 		align = BITS_PER_UNIT;
5932 
5933 	      poly_int64 ref_size = GET_MODE_SIZE (mode);
5934 	      if (known_eq (ref_size, 0))
5935 		ref_size = GET_MODE_SIZE (DImode);
5936 
5937 	      return (multiple_p (INTVAL (offs), ref_size)
5938 		      && multiple_p (align / BITS_PER_UNIT, ref_size));
5939 	    }
5940 	}
5941       return false;
5942 
5943     default:
5944       return false;
5945     }
5946 }
5947 
5948 /* Return true if the address X is valid for a PRFM instruction.
5949    STRICT_P is true if we should do strict checking with
5950    aarch64_classify_address.  */
5951 
5952 bool
5953 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5954 {
5955   struct aarch64_address_info addr;
5956 
5957   /* PRFM accepts the same addresses as DImode...  */
5958   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5959   if (!res)
5960     return false;
5961 
5962   /* ... except writeback forms.  */
5963   return addr.type != ADDRESS_REG_WB;
5964 }
5965 
5966 bool
5967 aarch64_symbolic_address_p (rtx x)
5968 {
5969   rtx offset;
5970 
5971   split_const (x, &x, &offset);
5972   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5973 }
5974 
5975 /* Classify the base of symbolic expression X.  */
5976 
5977 enum aarch64_symbol_type
5978 aarch64_classify_symbolic_expression (rtx x)
5979 {
5980   rtx offset;
5981 
5982   split_const (x, &x, &offset);
5983   return aarch64_classify_symbol (x, INTVAL (offset));
5984 }
5985 
5986 
5987 /* Return TRUE if X is a legitimate address for accessing memory in
5988    mode MODE.  */
5989 static bool
5990 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5991 {
5992   struct aarch64_address_info addr;
5993 
5994   return aarch64_classify_address (&addr, x, mode, strict_p);
5995 }
5996 
5997 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5998    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
5999 bool
6000 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6001 			      aarch64_addr_query_type type)
6002 {
6003   struct aarch64_address_info addr;
6004 
6005   return aarch64_classify_address (&addr, x, mode, strict_p, type);
6006 }
6007 
6008 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
6009 
6010 static bool
6011 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6012 					 poly_int64 orig_offset,
6013 					 machine_mode mode)
6014 {
6015   HOST_WIDE_INT size;
6016   if (GET_MODE_SIZE (mode).is_constant (&size))
6017     {
6018       HOST_WIDE_INT const_offset, second_offset;
6019 
6020       /* A general SVE offset is A * VQ + B.  Remove the A component from
6021 	 coefficient 0 in order to get the constant B.  */
6022       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6023 
6024       /* Split an out-of-range address displacement into a base and
6025 	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
6026 	 range otherwise to increase opportunities for sharing the base
6027 	 address of different sizes.  Unaligned accesses use the signed
6028 	 9-bit range, TImode/TFmode use the intersection of signed
6029 	 scaled 7-bit and signed 9-bit offset.  */
6030       if (mode == TImode || mode == TFmode)
6031 	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6032       else if ((const_offset & (size - 1)) != 0)
6033 	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6034       else
6035 	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6036 
6037       if (second_offset == 0 || known_eq (orig_offset, second_offset))
6038 	return false;
6039 
6040       /* Split the offset into second_offset and the rest.  */
6041       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6042       *offset2 = gen_int_mode (second_offset, Pmode);
6043       return true;
6044     }
6045   else
6046     {
6047       /* Get the mode we should use as the basis of the range.  For structure
6048 	 modes this is the mode of one vector.  */
6049       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6050       machine_mode step_mode
6051 	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6052 
6053       /* Get the "mul vl" multiplier we'd like to use.  */
6054       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6055       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6056       if (vec_flags & VEC_SVE_DATA)
6057 	/* LDR supports a 9-bit range, but the move patterns for
6058 	   structure modes require all vectors to be in range of the
6059 	   same base.  The simplest way of accommodating that while still
6060 	   promoting reuse of anchor points between different modes is
6061 	   to use an 8-bit range unconditionally.  */
6062 	vnum = ((vnum + 128) & 255) - 128;
6063       else
6064 	/* Predicates are only handled singly, so we might as well use
6065 	   the full range.  */
6066 	vnum = ((vnum + 256) & 511) - 256;
6067       if (vnum == 0)
6068 	return false;
6069 
6070       /* Convert the "mul vl" multiplier into a byte offset.  */
6071       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6072       if (known_eq (second_offset, orig_offset))
6073 	return false;
6074 
6075       /* Split the offset into second_offset and the rest.  */
6076       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6077       *offset2 = gen_int_mode (second_offset, Pmode);
6078       return true;
6079     }
6080 }
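
/* Worked example for the constant-size path above (illustrative only):
   an SImode access at offset 0x4004 is out of range for a scaled 12-bit
   LDR offset (maximum 0x3ffc), so the offset is split as

     second_offset = 0x4004 & 0x3ffc = 0x4
     *offset1      = 0x4000   (added to the base to form an anchor)
     *offset2      = 0x4      (folded into the final [reg, #4] access)

   which lets the anchor register be shared by neighbouring accesses.  */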
6081 
6082 /* Return the binary representation of floating point constant VALUE in INTVAL.
6083    If the value cannot be converted, return false without setting INTVAL.
6084    The conversion is done in the mode of VALUE.  */
6085 bool
6086 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6087 {
6088 
6089   /* We make a general exception for 0.  */
6090   if (aarch64_float_const_zero_rtx_p (value))
6091     {
6092       *intval = 0;
6093       return true;
6094     }
6095 
6096   scalar_float_mode mode;
6097   if (GET_CODE (value) != CONST_DOUBLE
6098       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6099       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6100       /* Only support up to DF mode.  */
6101       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6102     return false;
6103 
6104   unsigned HOST_WIDE_INT ival = 0;
6105 
6106   long res[2];
6107   real_to_target (res,
6108 		  CONST_DOUBLE_REAL_VALUE (value),
6109 		  REAL_MODE_FORMAT (mode));
6110 
6111   if (mode == DFmode)
6112     {
6113       int order = BYTES_BIG_ENDIAN ? 1 : 0;
6114       ival = zext_hwi (res[order], 32);
6115       ival |= (zext_hwi (res[1 - order], 32) << 32);
6116     }
6117   else
6118       ival = zext_hwi (res[0], 32);
6119 
6120   *intval = ival;
6121   return true;
6122 }
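
/* For example (illustrative only), the CONST_DOUBLE for 1.0 yields
   0x3f800000 in SFmode and 0x3ff0000000000000 in DFmode, while an SFmode
   -2.0 yields 0xc0000000.  TFmode constants are rejected because they do
   not fit in a single HOST_WIDE_INT.  */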
6123 
6124 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6125    single MOV(+MOVK) followed by an FMOV.  */
6126 bool
6127 aarch64_float_const_rtx_p (rtx x)
6128 {
6129   machine_mode mode = GET_MODE (x);
6130   if (mode == VOIDmode)
6131     return false;
6132 
6133   /* Determine whether it's cheaper to write float constants as
6134      mov/movk pairs over ldr/adrp pairs.  */
6135   unsigned HOST_WIDE_INT ival;
6136 
6137   if (GET_CODE (x) == CONST_DOUBLE
6138       && SCALAR_FLOAT_MODE_P (mode)
6139       && aarch64_reinterpret_float_as_int (x, &ival))
6140     {
6141       scalar_int_mode imode = (mode == HFmode
6142 			       ? SImode
6143 			       : int_mode_for_mode (mode).require ());
6144       int num_instr = aarch64_internal_mov_immediate
6145 			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
6146       return num_instr < 3;
6147     }
6148 
6149   return false;
6150 }
6151 
6152 /* Return TRUE if rtx X is the immediate constant 0.0.  */
6153 bool
6154 aarch64_float_const_zero_rtx_p (rtx x)
6155 {
6156   if (GET_MODE (x) == VOIDmode)
6157     return false;
6158 
6159   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6160     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6161   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6162 }
6163 
6164 /* Return TRUE if rtx X is an immediate constant that fits in a single
6165    MOVI immediate operation.  */
6166 bool
6167 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6168 {
6169   if (!TARGET_SIMD)
6170      return false;
6171 
6172   machine_mode vmode;
6173   scalar_int_mode imode;
6174   unsigned HOST_WIDE_INT ival;
6175 
6176   if (GET_CODE (x) == CONST_DOUBLE
6177       && SCALAR_FLOAT_MODE_P (mode))
6178     {
6179       if (!aarch64_reinterpret_float_as_int (x, &ival))
6180 	return false;
6181 
6182       /* We make a general exception for 0.  */
6183       if (aarch64_float_const_zero_rtx_p (x))
6184 	return true;
6185 
6186       imode = int_mode_for_mode (mode).require ();
6187     }
6188   else if (GET_CODE (x) == CONST_INT
6189 	   && is_a <scalar_int_mode> (mode, &imode))
6190     ival = INTVAL (x);
6191   else
6192     return false;
6193 
6194   /* Use a 64-bit mode for everything except for DI/DF mode, where we
6195      use a 128-bit vector mode.  */
6196   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6197 
6198   vmode = aarch64_simd_container_mode (imode, width);
6199   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6200 
6201   return aarch64_simd_valid_immediate (v_op, NULL);
6202 }
6203 
6204 
6205 /* Return the fixed registers used for condition codes.  */
6206 
6207 static bool
6208 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6209 {
6210   *p1 = CC_REGNUM;
6211   *p2 = INVALID_REGNUM;
6212   return true;
6213 }
6214 
6215 /* This function is used by the call expanders of the machine description.
6216    RESULT is the register in which the result is returned.  It's NULL for
6217    "call" and "sibcall".
6218    MEM is the location of the function call.
6219    SIBCALL indicates whether this is a normal call or a sibling call; a
6220    different pattern is generated accordingly.  */
6221 
6222 void
6223 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6224 {
6225   rtx call, callee, tmp;
6226   rtvec vec;
6227   machine_mode mode;
6228 
6229   gcc_assert (MEM_P (mem));
6230   callee = XEXP (mem, 0);
6231   mode = GET_MODE (callee);
6232   gcc_assert (mode == Pmode);
6233 
6234   /* Decide if we should generate indirect calls by loading the
6235      address of the callee into a register before performing
6236      the branch-and-link.  */
6237   if (SYMBOL_REF_P (callee)
6238       ? (aarch64_is_long_call_p (callee)
6239 	 || aarch64_is_noplt_call_p (callee))
6240       : !REG_P (callee))
6241     XEXP (mem, 0) = force_reg (mode, callee);
6242 
6243   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6244 
6245   if (result != NULL_RTX)
6246     call = gen_rtx_SET (result, call);
6247 
6248   if (sibcall)
6249     tmp = ret_rtx;
6250   else
6251     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6252 
6253   vec = gen_rtvec (2, call, tmp);
6254   call = gen_rtx_PARALLEL (VOIDmode, vec);
6255 
6256   aarch64_emit_call_insn (call);
6257 }
6258 
6259 /* Emit call insn with PAT and do aarch64-specific handling.  */
6260 
6261 void
6262 aarch64_emit_call_insn (rtx pat)
6263 {
6264   rtx insn = emit_call_insn (pat);
6265 
6266   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6267   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6268   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6269 }
6270 
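/* Implement SELECT_CC_MODE: return the CC mode to use when comparing rtxes
   X and Y using comparison code CODE.  */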
6271 machine_mode
6272 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6273 {
6274   /* All floating point compares return CCFP if it is an equality
6275      comparison, and CCFPE otherwise.  */
6276   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6277     {
6278       switch (code)
6279 	{
6280 	case EQ:
6281 	case NE:
6282 	case UNORDERED:
6283 	case ORDERED:
6284 	case UNLT:
6285 	case UNLE:
6286 	case UNGT:
6287 	case UNGE:
6288 	case UNEQ:
6289 	  return CCFPmode;
6290 
6291 	case LT:
6292 	case LE:
6293 	case GT:
6294 	case GE:
6295 	case LTGT:
6296 	  return CCFPEmode;
6297 
6298 	default:
6299 	  gcc_unreachable ();
6300 	}
6301     }
6302 
6303   /* Equality comparisons of short modes against zero can be performed
6304      using the TST instruction with the appropriate bitmask.  */
6305   if (y == const0_rtx && REG_P (x)
6306       && (code == EQ || code == NE)
6307       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6308     return CC_NZmode;
6309 
6310   /* Similarly, comparisons of zero_extends from shorter modes can
6311      be performed using an ANDS with an immediate mask.  */
6312   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6313       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6314       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6315       && (code == EQ || code == NE))
6316     return CC_NZmode;
6317 
6318   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6319       && y == const0_rtx
6320       && (code == EQ || code == NE || code == LT || code == GE)
6321       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6322 	  || GET_CODE (x) == NEG
6323 	  || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6324 	      && CONST_INT_P (XEXP (x, 2)))))
6325     return CC_NZmode;
6326 
6327   /* A compare with a shifted operand.  Because of canonicalization,
6328      the comparison will have to be swapped when we emit the assembly
6329      code.  */
6330   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6331       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6332       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6333 	  || GET_CODE (x) == LSHIFTRT
6334 	  || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6335     return CC_SWPmode;
6336 
6337   /* Similarly for a negated operand, but we can only do this for
6338      equalities.  */
6339   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6340       && (REG_P (y) || GET_CODE (y) == SUBREG)
6341       && (code == EQ || code == NE)
6342       && GET_CODE (x) == NEG)
6343     return CC_Zmode;
6344 
6345   /* A test for unsigned overflow.  */
6346   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6347       && code == NE
6348       && GET_CODE (x) == PLUS
6349       && GET_CODE (y) == ZERO_EXTEND)
6350     return CC_Cmode;
6351 
6352   /* For everything else, return CCmode.  */
6353   return CCmode;
6354 }
6355 
6356 static int
6357 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6358 
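/* Return the AArch64 condition code (AARCH64_EQ etc.) that corresponds to
   comparison rtx X.  */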
6359 int
6360 aarch64_get_condition_code (rtx x)
6361 {
6362   machine_mode mode = GET_MODE (XEXP (x, 0));
6363   enum rtx_code comp_code = GET_CODE (x);
6364 
6365   if (GET_MODE_CLASS (mode) != MODE_CC)
6366     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6367   return aarch64_get_condition_code_1 (mode, comp_code);
6368 }
6369 
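/* Helper for aarch64_get_condition_code: map comparison code COMP_CODE under
   CC mode MODE to an AArch64 condition code, or return -1 if there is no
   valid mapping.  */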
6370 static int
6371 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6372 {
6373   switch (mode)
6374     {
6375     case E_CCFPmode:
6376     case E_CCFPEmode:
6377       switch (comp_code)
6378 	{
6379 	case GE: return AARCH64_GE;
6380 	case GT: return AARCH64_GT;
6381 	case LE: return AARCH64_LS;
6382 	case LT: return AARCH64_MI;
6383 	case NE: return AARCH64_NE;
6384 	case EQ: return AARCH64_EQ;
6385 	case ORDERED: return AARCH64_VC;
6386 	case UNORDERED: return AARCH64_VS;
6387 	case UNLT: return AARCH64_LT;
6388 	case UNLE: return AARCH64_LE;
6389 	case UNGT: return AARCH64_HI;
6390 	case UNGE: return AARCH64_PL;
6391 	default: return -1;
6392 	}
6393       break;
6394 
6395     case E_CCmode:
6396       switch (comp_code)
6397 	{
6398 	case NE: return AARCH64_NE;
6399 	case EQ: return AARCH64_EQ;
6400 	case GE: return AARCH64_GE;
6401 	case GT: return AARCH64_GT;
6402 	case LE: return AARCH64_LE;
6403 	case LT: return AARCH64_LT;
6404 	case GEU: return AARCH64_CS;
6405 	case GTU: return AARCH64_HI;
6406 	case LEU: return AARCH64_LS;
6407 	case LTU: return AARCH64_CC;
6408 	default: return -1;
6409 	}
6410       break;
6411 
6412     case E_CC_SWPmode:
6413       switch (comp_code)
6414 	{
6415 	case NE: return AARCH64_NE;
6416 	case EQ: return AARCH64_EQ;
6417 	case GE: return AARCH64_LE;
6418 	case GT: return AARCH64_LT;
6419 	case LE: return AARCH64_GE;
6420 	case LT: return AARCH64_GT;
6421 	case GEU: return AARCH64_LS;
6422 	case GTU: return AARCH64_CC;
6423 	case LEU: return AARCH64_CS;
6424 	case LTU: return AARCH64_HI;
6425 	default: return -1;
6426 	}
6427       break;
6428 
6429     case E_CC_NZmode:
6430       switch (comp_code)
6431 	{
6432 	case NE: return AARCH64_NE;
6433 	case EQ: return AARCH64_EQ;
6434 	case GE: return AARCH64_PL;
6435 	case LT: return AARCH64_MI;
6436 	default: return -1;
6437 	}
6438       break;
6439 
6440     case E_CC_Zmode:
6441       switch (comp_code)
6442 	{
6443 	case NE: return AARCH64_NE;
6444 	case EQ: return AARCH64_EQ;
6445 	default: return -1;
6446 	}
6447       break;
6448 
6449     case E_CC_Cmode:
6450       switch (comp_code)
6451 	{
6452 	case NE: return AARCH64_CS;
6453 	case EQ: return AARCH64_CC;
6454 	default: return -1;
6455 	}
6456       break;
6457 
6458     default:
6459       return -1;
6460     }
6461 
6462   return -1;
6463 }
6464 
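/* Return true if X is a constant vector that duplicates a single CONST_INT
   element lying in the range [MINVAL, MAXVAL].  */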
6465 bool
6466 aarch64_const_vec_all_same_in_range_p (rtx x,
6467 				       HOST_WIDE_INT minval,
6468 				       HOST_WIDE_INT maxval)
6469 {
6470   rtx elt;
6471   return (const_vec_duplicate_p (x, &elt)
6472 	  && CONST_INT_P (elt)
6473 	  && IN_RANGE (INTVAL (elt), minval, maxval));
6474 }
6475 
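/* Return true if X is a constant vector that duplicates the single
   CONST_INT value VAL.  */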
6476 bool
6477 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6478 {
6479   return aarch64_const_vec_all_same_in_range_p (x, val, val);
6480 }
6481 
6482 /* Return true if VEC is a constant in which every element is in the range
6483    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
6484 
6485 static bool
6486 aarch64_const_vec_all_in_range_p (rtx vec,
6487 				  HOST_WIDE_INT minval,
6488 				  HOST_WIDE_INT maxval)
6489 {
6490   if (GET_CODE (vec) != CONST_VECTOR
6491       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6492     return false;
6493 
6494   int nunits;
6495   if (!CONST_VECTOR_STEPPED_P (vec))
6496     nunits = const_vector_encoded_nelts (vec);
6497   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6498     return false;
6499 
6500   for (int i = 0; i < nunits; i++)
6501     {
6502       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6503       if (!CONST_INT_P (vec_elem)
6504 	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6505 	return false;
6506     }
6507   return true;
6508 }
6509 
6510 /* N Z C V.  */
6511 #define AARCH64_CC_V 1
6512 #define AARCH64_CC_C (1 << 1)
6513 #define AARCH64_CC_Z (1 << 2)
6514 #define AARCH64_CC_N (1 << 3)
6515 
6516 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
6517 static const int aarch64_nzcv_codes[] =
6518 {
6519   0,		/* EQ, Z == 1.  */
6520   AARCH64_CC_Z,	/* NE, Z == 0.  */
6521   0,		/* CS, C == 1.  */
6522   AARCH64_CC_C,	/* CC, C == 0.  */
6523   0,		/* MI, N == 1.  */
6524   AARCH64_CC_N, /* PL, N == 0.  */
6525   0,		/* VS, V == 1.  */
6526   AARCH64_CC_V, /* VC, V == 0.  */
6527   0,		/* HI, C == 1 && Z == 0.  */
6528   AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
6529   AARCH64_CC_V,	/* GE, N == V.  */
6530   0,		/* LT, N != V.  */
6531   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
6532   0,		/* LE, !(Z == 0 && N == V).  */
6533   0,		/* AL, Any.  */
6534   0		/* NV, Any.  */
6535 };
6536 
6537 /* Print floating-point vector immediate operand X to F, negating it
6538    first if NEGATE is true.  Return true on success, false if it isn't
6539    a constant we can handle.  */
6540 
6541 static bool
6542 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6543 {
6544   rtx elt;
6545 
6546   if (!const_vec_duplicate_p (x, &elt))
6547     return false;
6548 
6549   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6550   if (negate)
6551     r = real_value_negate (&r);
6552 
6553   /* We only handle the SVE single-bit immediates here.  */
6554   if (real_equal (&r, &dconst0))
6555     asm_fprintf (f, "0.0");
6556   else if (real_equal (&r, &dconst1))
6557     asm_fprintf (f, "1.0");
6558   else if (real_equal (&r, &dconsthalf))
6559     asm_fprintf (f, "0.5");
6560   else
6561     return false;
6562 
6563   return true;
6564 }
6565 
6566 /* Return the equivalent letter for size.  */
6567 static char
6568 sizetochar (int size)
6569 {
6570   switch (size)
6571     {
6572     case 64: return 'd';
6573     case 32: return 's';
6574     case 16: return 'h';
6575     case 8 : return 'b';
6576     default: gcc_unreachable ();
6577     }
6578 }
6579 
6580 /* Print operand X to file F in a target specific manner according to CODE.
6581    The acceptable formatting commands given by CODE are:
6582      'c':		An integer or symbol address without a preceding #
6583 			sign.
6584      'C':		Take the duplicated element in a vector constant
6585 			and print it in hex.
6586      'D':		Take the duplicated element in a vector constant
6587 			and print it as an unsigned integer, in decimal.
6588      'e':		Print the sign/zero-extend size as a character 8->b,
6589 			16->h, 32->w.
6590      'p':		Prints N such that 2^N == X (X must be a power of 2 and
6591 			a const_int).
6592      'P':		Print the number of non-zero bits in X (a const_int).
6593      'H':		Print the higher numbered register of a pair (TImode)
6594 			of regs.
6595      'm':		Print a condition (eq, ne, etc).
6596      'M':		Same as 'm', but invert condition.
6597      'N':		Take the duplicated element in a vector constant
6598 			and print the negative of it in decimal.
6599      'b/h/s/d/q':	Print a scalar FP/SIMD register name.
6600      'S/T/U/V':		Print a FP/SIMD register name for a register list.
6601 			The register printed is the FP/SIMD register name
6602 			of X + 0/1/2/3 for S/T/U/V.
6603      'R':		Print a scalar FP/SIMD register name + 1.
6604      'X':		Print bottom 16 bits of integer constant in hex.
6605      'w/x':		Print a general register name or the zero register
6606 			(32-bit or 64-bit).
6607      '0':		Print a normal operand; if it's a general register,
6608 			then we assume DImode.
6609      'k':		Print NZCV for conditional compare instructions.
6610      'A':		Output address constant representing the first
6611 			argument of X, specifying a relocation offset
6612 			if appropriate.
6613      'L':		Output constant address specified by X
6614 			with a relocation offset if appropriate.
6615      'G':		Prints address of X, specifying a PC relative
6616 			relocation mode if appropriate.
6617      'y':		Output address of LDP or STP - this is used for
6618 			some LDP/STPs which don't use a PARALLEL in their
6619 			pattern (so the mode needs to be adjusted).
6620      'z':		Output address of a typical LDP or STP.  */
6621 
6622 static void
6623 aarch64_print_operand (FILE *f, rtx x, int code)
6624 {
6625   rtx elt;
6626   switch (code)
6627     {
6628     case 'c':
6629       switch (GET_CODE (x))
6630 	{
6631 	case CONST_INT:
6632 	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6633 	  break;
6634 
6635 	case SYMBOL_REF:
6636 	  output_addr_const (f, x);
6637 	  break;
6638 
6639 	case CONST:
6640 	  if (GET_CODE (XEXP (x, 0)) == PLUS
6641 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6642 	    {
6643 	      output_addr_const (f, x);
6644 	      break;
6645 	    }
6646 	  /* Fall through.  */
6647 
6648 	default:
6649 	  output_operand_lossage ("unsupported operand for code '%c'", code);
6650 	}
6651       break;
6652 
6653     case 'e':
6654       {
6655 	int n;
6656 
6657 	if (!CONST_INT_P (x)
6658 	    || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6659 	  {
6660 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6661 	    return;
6662 	  }
6663 
6664 	switch (n)
6665 	  {
6666 	  case 3:
6667 	    fputc ('b', f);
6668 	    break;
6669 	  case 4:
6670 	    fputc ('h', f);
6671 	    break;
6672 	  case 5:
6673 	    fputc ('w', f);
6674 	    break;
6675 	  default:
6676 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6677 	    return;
6678 	  }
6679       }
6680       break;
6681 
6682     case 'p':
6683       {
6684 	int n;
6685 
6686 	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6687 	  {
6688 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6689 	    return;
6690 	  }
6691 
6692 	asm_fprintf (f, "%d", n);
6693       }
6694       break;
6695 
6696     case 'P':
6697       if (!CONST_INT_P (x))
6698 	{
6699 	  output_operand_lossage ("invalid operand for '%%%c'", code);
6700 	  return;
6701 	}
6702 
6703       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6704       break;
6705 
6706     case 'H':
6707       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6708 	{
6709 	  output_operand_lossage ("invalid operand for '%%%c'", code);
6710 	  return;
6711 	}
6712 
6713       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6714       break;
6715 
6716     case 'M':
6717     case 'm':
6718       {
6719         int cond_code;
6720 	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
6721 	if (x == const_true_rtx)
6722 	  {
6723 	    if (code == 'M')
6724 	      fputs ("nv", f);
6725 	    return;
6726 	  }
6727 
6728         if (!COMPARISON_P (x))
6729 	  {
6730 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6731 	    return;
6732 	  }
6733 
6734         cond_code = aarch64_get_condition_code (x);
6735         gcc_assert (cond_code >= 0);
6736 	if (code == 'M')
6737 	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6738 	fputs (aarch64_condition_codes[cond_code], f);
6739       }
6740       break;
6741 
6742     case 'N':
6743       if (!const_vec_duplicate_p (x, &elt))
6744 	{
6745 	  output_operand_lossage ("invalid vector constant");
6746 	  return;
6747 	}
6748 
6749       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6750 	asm_fprintf (f, "%wd", -INTVAL (elt));
6751       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6752 	       && aarch64_print_vector_float_operand (f, x, true))
6753 	;
6754       else
6755 	{
6756 	  output_operand_lossage ("invalid vector constant");
6757 	  return;
6758 	}
6759       break;
6760 
6761     case 'b':
6762     case 'h':
6763     case 's':
6764     case 'd':
6765     case 'q':
6766       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6767 	{
6768 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6769 	  return;
6770 	}
6771       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6772       break;
6773 
6774     case 'S':
6775     case 'T':
6776     case 'U':
6777     case 'V':
6778       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6779 	{
6780 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6781 	  return;
6782 	}
6783       asm_fprintf (f, "%c%d",
6784 		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6785 		   REGNO (x) - V0_REGNUM + (code - 'S'));
6786       break;
6787 
6788     case 'R':
6789       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6790 	{
6791 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6792 	  return;
6793 	}
6794       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6795       break;
6796 
6797     case 'X':
6798       if (!CONST_INT_P (x))
6799 	{
6800 	  output_operand_lossage ("invalid operand for '%%%c'", code);
6801 	  return;
6802 	}
6803       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6804       break;
6805 
6806     case 'C':
6807       {
6808 	/* Print a replicated constant in hex.  */
6809 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6810 	  {
6811 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6812 	    return;
6813 	  }
6814 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6815 	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6816       }
6817       break;
6818 
6819     case 'D':
6820       {
6821 	/* Print a replicated constant in decimal, treating it as
6822 	   unsigned.  */
6823 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6824 	  {
6825 	    output_operand_lossage ("invalid operand for '%%%c'", code);
6826 	    return;
6827 	  }
6828 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6829 	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6830       }
6831       break;
6832 
6833     case 'w':
6834     case 'x':
6835       if (x == const0_rtx
6836 	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6837 	{
6838 	  asm_fprintf (f, "%czr", code);
6839 	  break;
6840 	}
6841 
6842       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6843 	{
6844 	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6845 	  break;
6846 	}
6847 
6848       if (REG_P (x) && REGNO (x) == SP_REGNUM)
6849 	{
6850 	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6851 	  break;
6852 	}
6853 
6854       /* Fall through */
6855 
6856     case 0:
6857       if (x == NULL)
6858 	{
6859 	  output_operand_lossage ("missing operand");
6860 	  return;
6861 	}
6862 
6863       switch (GET_CODE (x))
6864 	{
6865 	case REG:
6866 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
6867 	    {
6868 	      if (REG_NREGS (x) == 1)
6869 		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6870 	      else
6871 		{
6872 		  char suffix
6873 		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6874 		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
6875 			       REGNO (x) - V0_REGNUM, suffix,
6876 			       END_REGNO (x) - V0_REGNUM - 1, suffix);
6877 		}
6878 	    }
6879 	  else
6880 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6881 	  break;
6882 
6883 	case MEM:
6884 	  output_address (GET_MODE (x), XEXP (x, 0));
6885 	  break;
6886 
6887 	case LABEL_REF:
6888 	case SYMBOL_REF:
6889 	  output_addr_const (asm_out_file, x);
6890 	  break;
6891 
6892 	case CONST_INT:
6893 	  asm_fprintf (f, "%wd", INTVAL (x));
6894 	  break;
6895 
6896 	case CONST:
6897 	  if (!VECTOR_MODE_P (GET_MODE (x)))
6898 	    {
6899 	      output_addr_const (asm_out_file, x);
6900 	      break;
6901 	    }
6902 	  /* fall through */
6903 
6904 	case CONST_VECTOR:
6905 	  if (!const_vec_duplicate_p (x, &elt))
6906 	    {
6907 	      output_operand_lossage ("invalid vector constant");
6908 	      return;
6909 	    }
6910 
6911 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6912 	    asm_fprintf (f, "%wd", INTVAL (elt));
6913 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6914 		   && aarch64_print_vector_float_operand (f, x, false))
6915 	    ;
6916 	  else
6917 	    {
6918 	      output_operand_lossage ("invalid vector constant");
6919 	      return;
6920 	    }
6921 	  break;
6922 
6923 	case CONST_DOUBLE:
6924 	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6925 	     be getting CONST_DOUBLEs holding integers.  */
6926 	  gcc_assert (GET_MODE (x) != VOIDmode);
6927 	  if (aarch64_float_const_zero_rtx_p (x))
6928 	    {
6929 	      fputc ('0', f);
6930 	      break;
6931 	    }
6932 	  else if (aarch64_float_const_representable_p (x))
6933 	    {
6934 #define buf_size 20
6935 	      char float_buf[buf_size] = {'\0'};
6936 	      real_to_decimal_for_mode (float_buf,
6937 					CONST_DOUBLE_REAL_VALUE (x),
6938 					buf_size, buf_size,
6939 					1, GET_MODE (x));
6940 	      asm_fprintf (asm_out_file, "%s", float_buf);
6941 	      break;
6942 #undef buf_size
6943 	    }
6944 	  output_operand_lossage ("invalid constant");
6945 	  return;
6946 	default:
6947 	  output_operand_lossage ("invalid operand");
6948 	  return;
6949 	}
6950       break;
6951 
6952     case 'A':
6953       if (GET_CODE (x) == HIGH)
6954 	x = XEXP (x, 0);
6955 
6956       switch (aarch64_classify_symbolic_expression (x))
6957 	{
6958 	case SYMBOL_SMALL_GOT_4G:
6959 	  asm_fprintf (asm_out_file, ":got:");
6960 	  break;
6961 
6962 	case SYMBOL_SMALL_TLSGD:
6963 	  asm_fprintf (asm_out_file, ":tlsgd:");
6964 	  break;
6965 
6966 	case SYMBOL_SMALL_TLSDESC:
6967 	  asm_fprintf (asm_out_file, ":tlsdesc:");
6968 	  break;
6969 
6970 	case SYMBOL_SMALL_TLSIE:
6971 	  asm_fprintf (asm_out_file, ":gottprel:");
6972 	  break;
6973 
6974 	case SYMBOL_TLSLE24:
6975 	  asm_fprintf (asm_out_file, ":tprel:");
6976 	  break;
6977 
6978 	case SYMBOL_TINY_GOT:
6979 	  gcc_unreachable ();
6980 	  break;
6981 
6982 	default:
6983 	  break;
6984 	}
6985       output_addr_const (asm_out_file, x);
6986       break;
6987 
6988     case 'L':
6989       switch (aarch64_classify_symbolic_expression (x))
6990 	{
6991 	case SYMBOL_SMALL_GOT_4G:
6992 	  asm_fprintf (asm_out_file, ":lo12:");
6993 	  break;
6994 
6995 	case SYMBOL_SMALL_TLSGD:
6996 	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6997 	  break;
6998 
6999 	case SYMBOL_SMALL_TLSDESC:
7000 	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7001 	  break;
7002 
7003 	case SYMBOL_SMALL_TLSIE:
7004 	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
7005 	  break;
7006 
7007 	case SYMBOL_TLSLE12:
7008 	  asm_fprintf (asm_out_file, ":tprel_lo12:");
7009 	  break;
7010 
7011 	case SYMBOL_TLSLE24:
7012 	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7013 	  break;
7014 
7015 	case SYMBOL_TINY_GOT:
7016 	  asm_fprintf (asm_out_file, ":got:");
7017 	  break;
7018 
7019 	case SYMBOL_TINY_TLSIE:
7020 	  asm_fprintf (asm_out_file, ":gottprel:");
7021 	  break;
7022 
7023 	default:
7024 	  break;
7025 	}
7026       output_addr_const (asm_out_file, x);
7027       break;
7028 
7029     case 'G':
7030       switch (aarch64_classify_symbolic_expression (x))
7031 	{
7032 	case SYMBOL_TLSLE24:
7033 	  asm_fprintf (asm_out_file, ":tprel_hi12:");
7034 	  break;
7035 	default:
7036 	  break;
7037 	}
7038       output_addr_const (asm_out_file, x);
7039       break;
7040 
7041     case 'k':
7042       {
7043 	HOST_WIDE_INT cond_code;
7044 
7045 	if (!CONST_INT_P (x))
7046 	  {
7047 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7048 	    return;
7049 	  }
7050 
7051 	cond_code = INTVAL (x);
7052 	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7053 	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7054       }
7055       break;
7056 
7057     case 'y':
7058     case 'z':
7059       {
7060 	machine_mode mode = GET_MODE (x);
7061 
7062 	if (GET_CODE (x) != MEM
7063 	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7064 	  {
7065 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7066 	    return;
7067 	  }
7068 
7069 	if (code == 'y')
7070 	  /* LDP/STP which uses a single double-width memory operand.
7071 	     Adjust the mode to appear like a typical LDP/STP.
7072 	     Currently this is supported for 16-byte accesses only.  */
7073 	  mode = DFmode;
7074 
7075 	if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7076 	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
7077       }
7078       break;
7079 
7080     default:
7081       output_operand_lossage ("invalid operand prefix '%%%c'", code);
7082       return;
7083     }
7084 }
7085 
7086 /* Print address 'x' of a memory access with mode 'mode'.
7087    'type' gives the context required by aarch64_classify_address,
7088    e.g. ADDR_QUERY_LDP_STP for an LDP/STP address.  */
7089 static bool
7090 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7091 				aarch64_addr_query_type type)
7092 {
7093   struct aarch64_address_info addr;
7094   unsigned int size;
7095 
7096   /* Check all addresses are Pmode - including ILP32.  */
7097   if (GET_MODE (x) != Pmode
7098       && (!CONST_INT_P (x)
7099 	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
7100     {
7101       output_operand_lossage ("invalid address mode");
7102       return false;
7103     }
7104 
7105   if (aarch64_classify_address (&addr, x, mode, true, type))
7106     switch (addr.type)
7107       {
7108       case ADDRESS_REG_IMM:
7109 	if (known_eq (addr.const_offset, 0))
7110 	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7111 	else if (aarch64_sve_data_mode_p (mode))
7112 	  {
7113 	    HOST_WIDE_INT vnum
7114 	      = exact_div (addr.const_offset,
7115 			   BYTES_PER_SVE_VECTOR).to_constant ();
7116 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
7117 			 reg_names[REGNO (addr.base)], vnum);
7118 	  }
7119 	else if (aarch64_sve_pred_mode_p (mode))
7120 	  {
7121 	    HOST_WIDE_INT vnum
7122 	      = exact_div (addr.const_offset,
7123 			   BYTES_PER_SVE_PRED).to_constant ();
7124 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
7125 			 reg_names[REGNO (addr.base)], vnum);
7126 	  }
7127 	else
7128 	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7129 		       INTVAL (addr.offset));
7130 	return true;
7131 
7132       case ADDRESS_REG_REG:
7133 	if (addr.shift == 0)
7134 	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7135 		       reg_names [REGNO (addr.offset)]);
7136 	else
7137 	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7138 		       reg_names [REGNO (addr.offset)], addr.shift);
7139 	return true;
7140 
7141       case ADDRESS_REG_UXTW:
7142 	if (addr.shift == 0)
7143 	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7144 		       REGNO (addr.offset) - R0_REGNUM);
7145 	else
7146 	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7147 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
7148 	return true;
7149 
7150       case ADDRESS_REG_SXTW:
7151 	if (addr.shift == 0)
7152 	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7153 		       REGNO (addr.offset) - R0_REGNUM);
7154 	else
7155 	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7156 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
7157 	return true;
7158 
7159       case ADDRESS_REG_WB:
7160 	/* Writeback is only supported for fixed-width modes.  */
7161 	size = GET_MODE_SIZE (mode).to_constant ();
7162 	switch (GET_CODE (x))
7163 	  {
7164 	  case PRE_INC:
7165 	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7166 	    return true;
7167 	  case POST_INC:
7168 	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7169 	    return true;
7170 	  case PRE_DEC:
7171 	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7172 	    return true;
7173 	  case POST_DEC:
7174 	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7175 	    return true;
7176 	  case PRE_MODIFY:
7177 	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7178 			 INTVAL (addr.offset));
7179 	    return true;
7180 	  case POST_MODIFY:
7181 	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7182 			 INTVAL (addr.offset));
7183 	    return true;
7184 	  default:
7185 	    break;
7186 	  }
7187 	break;
7188 
7189       case ADDRESS_LO_SUM:
7190 	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7191 	output_addr_const (f, addr.offset);
7192 	asm_fprintf (f, "]");
7193 	return true;
7194 
7195       case ADDRESS_SYMBOLIC:
7196 	output_addr_const (f, x);
7197 	return true;
7198       }
7199 
7200   return false;
7201 }
7202 
7203 /* Print address 'x' of a LDP/STP with mode 'mode'.  */
7204 static bool
7205 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7206 {
7207   return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7208 }
7209 
7210 /* Print address 'x' of a memory access with mode 'mode'.  */
7211 static void
7212 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7213 {
7214   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7215     output_addr_const (f, x);
7216 }
7217 
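/* Return true if the rtx X mentions a label, looking through nested
   expressions but ignoring UNSPEC_TLS operands.  */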
7218 bool
7219 aarch64_label_mentioned_p (rtx x)
7220 {
7221   const char *fmt;
7222   int i;
7223 
7224   if (GET_CODE (x) == LABEL_REF)
7225     return true;
7226 
7227   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7228      referencing instruction, but they are constant offsets, not
7229      symbols.  */
7230   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7231     return false;
7232 
7233   fmt = GET_RTX_FORMAT (GET_CODE (x));
7234   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7235     {
7236       if (fmt[i] == 'E')
7237 	{
7238 	  int j;
7239 
7240 	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7241 	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7242 	      return 1;
7243 	}
7244       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7245 	return 1;
7246     }
7247 
7248   return 0;
7249 }
7250 
7251 /* Implement REGNO_REG_CLASS.  */
7252 
7253 enum reg_class
7254 aarch64_regno_regclass (unsigned regno)
7255 {
7256   if (GP_REGNUM_P (regno))
7257     return GENERAL_REGS;
7258 
7259   if (regno == SP_REGNUM)
7260     return STACK_REG;
7261 
7262   if (regno == FRAME_POINTER_REGNUM
7263       || regno == ARG_POINTER_REGNUM)
7264     return POINTER_REGS;
7265 
7266   if (FP_REGNUM_P (regno))
7267     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
7268 
7269   if (PR_REGNUM_P (regno))
7270     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7271 
7272   return NO_REGS;
7273 }
7274 
7275 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7276    If OFFSET is out of range, return an offset of an anchor point
7277    that is in range.  Return 0 otherwise.  */
7278 
7279 static HOST_WIDE_INT
7280 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7281 		       machine_mode mode)
7282 {
7283   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
7284   if (size > 16)
7285     return (offset + 0x400) & ~0x7f0;
7286 
7287   /* For offsets that aren't a multiple of the access size, the limit is
7288      -256...255.  */
7289   if (offset & (size - 1))
7290     {
7291       /* BLKmode typically uses LDP of X-registers.  */
7292       if (mode == BLKmode)
7293 	return (offset + 512) & ~0x3ff;
7294       return (offset + 0x100) & ~0x1ff;
7295     }
7296 
7297   /* Small negative offsets are supported.  */
7298   if (IN_RANGE (offset, -256, 0))
7299     return 0;
7300 
7301   if (mode == TImode || mode == TFmode)
7302     return (offset + 0x100) & ~0x1ff;
7303 
7304   /* Otherwise, use an unsigned 12-bit offset scaled by the access size.  */
7305   return offset & (~0xfff * size);
7306 }
7307 
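/* Implement TARGET_LEGITIMIZE_ADDRESS.  */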
7308 static rtx
7309 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
7310 {
7311   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7312      where mask is selected by alignment and size of the offset.
7313      We try to pick as large a range for the offset as possible to
7314      maximize the chance of a CSE.  However, for aligned addresses
7315      we limit the range to 4k so that structures with different sized
7316      elements are likely to use the same base.  We need to be careful
7317      not to split a CONST for some forms of address expression, otherwise
7318      it will generate sub-optimal code.  */
7319 
7320   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7321     {
7322       rtx base = XEXP (x, 0);
7323       rtx offset_rtx = XEXP (x, 1);
7324       HOST_WIDE_INT offset = INTVAL (offset_rtx);
7325 
7326       if (GET_CODE (base) == PLUS)
7327 	{
7328 	  rtx op0 = XEXP (base, 0);
7329 	  rtx op1 = XEXP (base, 1);
7330 
7331 	  /* Force any scaling into a temp for CSE.  */
7332 	  op0 = force_reg (Pmode, op0);
7333 	  op1 = force_reg (Pmode, op1);
7334 
7335 	  /* Let the pointer register be in op0.  */
7336 	  if (REG_POINTER (op1))
7337 	    std::swap (op0, op1);
7338 
7339 	  /* If the pointer is virtual or frame related, then we know that
7340 	     virtual register instantiation or register elimination is going
7341 	     to apply a second constant.  We want the two constants folded
7342 	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
7343 	  if (virt_or_elim_regno_p (REGNO (op0)))
7344 	    {
7345 	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7346 				   NULL_RTX, true, OPTAB_DIRECT);
7347 	      return gen_rtx_PLUS (Pmode, base, op1);
7348 	    }
7349 
7350 	  /* Otherwise, in order to encourage CSE (and thence loop strength
7351 	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
7352 	  base = expand_binop (Pmode, add_optab, op0, op1,
7353 			       NULL_RTX, true, OPTAB_DIRECT);
7354 	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7355 	}
7356 
7357       HOST_WIDE_INT size;
7358       if (GET_MODE_SIZE (mode).is_constant (&size))
7359 	{
7360 	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7361 							     mode);
7362 	  if (base_offset != 0)
7363 	    {
7364 	      base = plus_constant (Pmode, base, base_offset);
7365 	      base = force_operand (base, NULL_RTX);
7366 	      return plus_constant (Pmode, base, offset - base_offset);
7367 	    }
7368 	}
7369     }
7370 
7371   return x;
7372 }
7373 
7374 /* Return the reload icode required for a constant pool in mode.  */
7375 static enum insn_code
7376 aarch64_constant_pool_reload_icode (machine_mode mode)
7377 {
7378   switch (mode)
7379     {
7380     case E_SFmode:
7381       return CODE_FOR_aarch64_reload_movcpsfdi;
7382 
7383     case E_DFmode:
7384       return CODE_FOR_aarch64_reload_movcpdfdi;
7385 
7386     case E_TFmode:
7387       return CODE_FOR_aarch64_reload_movcptfdi;
7388 
7389     case E_V8QImode:
7390       return CODE_FOR_aarch64_reload_movcpv8qidi;
7391 
7392     case E_V16QImode:
7393       return CODE_FOR_aarch64_reload_movcpv16qidi;
7394 
7395     case E_V4HImode:
7396       return CODE_FOR_aarch64_reload_movcpv4hidi;
7397 
7398     case E_V8HImode:
7399       return CODE_FOR_aarch64_reload_movcpv8hidi;
7400 
7401     case E_V2SImode:
7402       return CODE_FOR_aarch64_reload_movcpv2sidi;
7403 
7404     case E_V4SImode:
7405       return CODE_FOR_aarch64_reload_movcpv4sidi;
7406 
7407     case E_V2DImode:
7408       return CODE_FOR_aarch64_reload_movcpv2didi;
7409 
7410     case E_V2DFmode:
7411       return CODE_FOR_aarch64_reload_movcpv2dfdi;
7412 
7413     default:
7414       gcc_unreachable ();
7415     }
7416 
7417   gcc_unreachable ();
7418 }
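
/* Implement TARGET_SECONDARY_RELOAD hook.  */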
7419 static reg_class_t
7420 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7421 			  reg_class_t rclass,
7422 			  machine_mode mode,
7423 			  secondary_reload_info *sri)
7424 {
7425   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7426      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
7427      comment at the head of aarch64-sve.md for more details about the
7428      big-endian handling.  */
7429   if (BYTES_BIG_ENDIAN
7430       && reg_class_subset_p (rclass, FP_REGS)
7431       && !((REG_P (x) && HARD_REGISTER_P (x))
7432 	   || aarch64_simd_valid_immediate (x, NULL))
7433       && aarch64_sve_data_mode_p (mode))
7434     {
7435       sri->icode = CODE_FOR_aarch64_sve_reload_be;
7436       return NO_REGS;
7437     }
7438 
7439   /* If we have to disable direct literal pool loads and stores because the
7440      function is too big, then we need a scratch register.  */
7441   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7442       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7443 	  || targetm.vector_mode_supported_p (GET_MODE (x)))
7444       && !aarch64_pcrelative_literal_loads)
7445     {
7446       sri->icode = aarch64_constant_pool_reload_icode (mode);
7447       return NO_REGS;
7448     }
7449 
7450   /* Without the TARGET_SIMD instructions we cannot move a Q register
7451      to a Q register directly.  We need a scratch.  */
7452   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7453       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7454       && reg_class_subset_p (rclass, FP_REGS))
7455     {
7456       if (mode == TFmode)
7457         sri->icode = CODE_FOR_aarch64_reload_movtf;
7458       else if (mode == TImode)
7459         sri->icode = CODE_FOR_aarch64_reload_movti;
7460       return NO_REGS;
7461     }
7462 
7463   /* A TFmode or TImode memory access should be handled via an FP register,
7464      because AArch64 has richer addressing modes for LDR/STR instructions
7465      than for LDP/STP instructions.  */
7466   if (TARGET_FLOAT && rclass == GENERAL_REGS
7467       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7468     return FP_REGS;
7469 
7470   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7471     return GENERAL_REGS;
7472 
7473   return NO_REGS;
7474 }
7475 
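/* Implement TARGET_CAN_ELIMINATE.  */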
7476 static bool
7477 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7478 {
7479   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7480 
7481   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7482      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
7483   if (frame_pointer_needed)
7484     return to == HARD_FRAME_POINTER_REGNUM;
7485   return true;
7486 }
7487 
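/* Implement INITIAL_ELIMINATION_OFFSET.  */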
7488 poly_int64
7489 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7490 {
7491   aarch64_layout_frame ();
7492 
7493   if (to == HARD_FRAME_POINTER_REGNUM)
7494     {
7495       if (from == ARG_POINTER_REGNUM)
7496 	return cfun->machine->frame.hard_fp_offset;
7497 
7498       if (from == FRAME_POINTER_REGNUM)
7499 	return cfun->machine->frame.hard_fp_offset
7500 	       - cfun->machine->frame.locals_offset;
7501     }
7502 
7503   if (to == STACK_POINTER_REGNUM)
7504     {
7505       if (from == FRAME_POINTER_REGNUM)
7506 	  return cfun->machine->frame.frame_size
7507 		 - cfun->machine->frame.locals_offset;
7508     }
7509 
7510   return cfun->machine->frame.frame_size;
7511 }
7512 
7513 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
7514    previous frame.  */
7515 
7516 rtx
7517 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7518 {
7519   if (count != 0)
7520     return const0_rtx;
7521   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7522 }
7523 
7524 
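/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  */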
7525 static void
7526 aarch64_asm_trampoline_template (FILE *f)
7527 {
7528   if (TARGET_ILP32)
7529     {
7530       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7531       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7532     }
7533   else
7534     {
7535       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7536       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7537     }
7538   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7539   assemble_aligned_integer (4, const0_rtx);
7540   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7541   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7542 }
7543 
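/* Implement TARGET_TRAMPOLINE_INIT.  */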
7544 static void
7545 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7546 {
7547   rtx fnaddr, mem, a_tramp;
7548   const int tramp_code_sz = 16;
7549 
7550   /* We don't need to copy the trailing D-words; we fill those in below.  */
7551   emit_block_move (m_tramp, assemble_trampoline_template (),
7552 		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7553   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7554   fnaddr = XEXP (DECL_RTL (fndecl), 0);
7555   if (GET_MODE (fnaddr) != ptr_mode)
7556     fnaddr = convert_memory_address (ptr_mode, fnaddr);
7557   emit_move_insn (mem, fnaddr);
7558 
7559   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7560   emit_move_insn (mem, chain_value);
7561 
7562   /* XXX We should really define a "clear_cache" pattern and use
7563      gen_clear_cache().  */
7564   a_tramp = XEXP (m_tramp, 0);
7565   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7566 		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7567 		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7568 		     ptr_mode);
7569 }
7570 
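/* Implement TARGET_CLASS_MAX_NREGS.  */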
7571 static unsigned char
7572 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7573 {
7574   /* ??? Logically we should only need to provide a value when
7575      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7576      can hold MODE, but at the moment we need to handle all modes.
7577      Just ignore any runtime parts for registers that can't store them.  */
7578   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7579   unsigned int nregs;
7580   switch (regclass)
7581     {
7582     case TAILCALL_ADDR_REGS:
7583     case POINTER_REGS:
7584     case GENERAL_REGS:
7585     case ALL_REGS:
7586     case POINTER_AND_FP_REGS:
7587     case FP_REGS:
7588     case FP_LO_REGS:
7589       if (aarch64_sve_data_mode_p (mode)
7590 	  && constant_multiple_p (GET_MODE_SIZE (mode),
7591 				  BYTES_PER_SVE_VECTOR, &nregs))
7592 	return nregs;
7593       return (aarch64_vector_data_mode_p (mode)
7594 	      ? CEIL (lowest_size, UNITS_PER_VREG)
7595 	      : CEIL (lowest_size, UNITS_PER_WORD));
7596     case STACK_REG:
7597     case PR_REGS:
7598     case PR_LO_REGS:
7599     case PR_HI_REGS:
7600       return 1;
7601 
7602     case NO_REGS:
7603       return 0;
7604 
7605     default:
7606       break;
7607     }
7608   gcc_unreachable ();
7609 }
7610 
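/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */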
7611 static reg_class_t
7612 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7613 {
7614   if (regclass == POINTER_REGS)
7615     return GENERAL_REGS;
7616 
7617   if (regclass == STACK_REG)
7618     {
7619       if (REG_P (x)
7620 	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7621 	return regclass;
7622 
7623       return NO_REGS;
7624     }
7625 
7626   /* Register elimination can result in a request for
7627      SP+constant->FP_REGS.  We cannot support such operations, which
7628      use SP as source and an FP_REG as destination, so reject them
7629      outright now.  */
7630   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7631     {
7632       rtx lhs = XEXP (x, 0);
7633 
7634       /* Look through a possible SUBREG introduced by ILP32.  */
7635       if (GET_CODE (lhs) == SUBREG)
7636 	lhs = SUBREG_REG (lhs);
7637 
7638       gcc_assert (REG_P (lhs));
7639       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7640 				      POINTER_REGS));
7641       return NO_REGS;
7642     }
7643 
7644   return regclass;
7645 }
7646 
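/* Implement ASM_OUTPUT_LABELREF.  */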
7647 void
7648 aarch64_asm_output_labelref (FILE* f, const char *name)
7649 {
7650   asm_fprintf (f, "%U%s", name);
7651 }
7652 
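/* Implement TARGET_ASM_CONSTRUCTOR.  */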
7653 static void
7654 aarch64_elf_asm_constructor (rtx symbol, int priority)
7655 {
7656   if (priority == DEFAULT_INIT_PRIORITY)
7657     default_ctor_section_asm_out_constructor (symbol, priority);
7658   else
7659     {
7660       section *s;
7661       /* Although PRIORITY is known to be in the range [0, 65535], so 18
7662          bytes would be enough, the compiler might not know that.  To avoid
7663          a -Wformat-truncation false positive, use a larger size.  */
7664       char buf[23];
7665       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7666       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7667       switch_to_section (s);
7668       assemble_align (POINTER_SIZE);
7669       assemble_aligned_integer (POINTER_BYTES, symbol);
7670     }
7671 }
7672 
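/* Implement TARGET_ASM_DESTRUCTOR.  */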
7673 static void
7674 aarch64_elf_asm_destructor (rtx symbol, int priority)
7675 {
7676   if (priority == DEFAULT_INIT_PRIORITY)
7677     default_dtor_section_asm_out_destructor (symbol, priority);
7678   else
7679     {
7680       section *s;
7681       /* Although PRIORITY is known to be in the range [0, 65535], so 18
7682          bytes would be enough, the compiler might not know that.  To avoid
7683          a -Wformat-truncation false positive, use a larger size.  */
7684       char buf[23];
7685       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7686       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7687       switch_to_section (s);
7688       assemble_align (POINTER_SIZE);
7689       assemble_aligned_integer (POINTER_BYTES, symbol);
7690     }
7691 }
7692 
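/* Output the assembly for a casesi dispatch sequence described by OPERANDS:
   load the jump-table entry, add the (scaled) entry to the table anchor and
   branch to the result.  */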
7693 const char*
7694 aarch64_output_casesi (rtx *operands)
7695 {
7696   char buf[100];
7697   char label[100];
7698   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7699   int index;
7700   static const char *const patterns[4][2] =
7701   {
7702     {
7703       "ldrb\t%w3, [%0,%w1,uxtw]",
7704       "add\t%3, %4, %w3, sxtb #2"
7705     },
7706     {
7707       "ldrh\t%w3, [%0,%w1,uxtw #1]",
7708       "add\t%3, %4, %w3, sxth #2"
7709     },
7710     {
7711       "ldr\t%w3, [%0,%w1,uxtw #2]",
7712       "add\t%3, %4, %w3, sxtw #2"
7713     },
7714     /* We assume that DImode is only generated when not optimizing and
7715        that we don't really need 64-bit address offsets.  That would
7716        imply an object file with 8GB of code in a single function!  */
7717     {
7718       "ldr\t%w3, [%0,%w1,uxtw #2]",
7719       "add\t%3, %4, %w3, sxtw #2"
7720     }
7721   };
7722 
7723   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7724 
7725   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7726   index = exact_log2 (GET_MODE_SIZE (mode));
7727 
7728   gcc_assert (index >= 0 && index <= 3);
7729 
7730   /* Need to implement table size reduction, by changing the code below.  */
7731   output_asm_insn (patterns[index][0], operands);
7732   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7733   snprintf (buf, sizeof (buf),
7734 	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
7735   output_asm_insn (buf, operands);
7736   output_asm_insn (patterns[index][1], operands);
7737   output_asm_insn ("br\t%3", operands);
7738   assemble_label (asm_out_file, label);
7739   return "";
7740 }
7741 
7742 
7743 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7744    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7745    operator.  */
7746 
7747 int
7748 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7749 {
7750   if (shift >= 0 && shift <= 3)
7751     {
7752       int size;
7753       for (size = 8; size <= 32; size *= 2)
7754 	{
7755 	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7756 	  if (mask == bits << shift)
7757 	    return size;
7758 	}
7759     }
7760   return 0;
7761 }
7762 
7763 /* Constant pools are per-function only when PC-relative literal
7764    loads are enabled or we are in the large memory
7765    model.  */
7766 
7767 static inline bool
7768 aarch64_can_use_per_function_literal_pools_p (void)
7769 {
7770   return (aarch64_pcrelative_literal_loads
7771 	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7772 }
7773 
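/* Implement TARGET_USE_BLOCKS_FOR_CONSTANT_P.  */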
7774 static bool
7775 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7776 {
7777   /* We can't use blocks for constants when we're using a per-function
7778      constant pool.  */
7779   return !aarch64_can_use_per_function_literal_pools_p ();
7780 }
7781 
7782 /* Select appropriate section for constants depending
7783    on where we place literal pools.  */
7784 
7785 static section *
7786 aarch64_select_rtx_section (machine_mode mode,
7787 			    rtx x,
7788 			    unsigned HOST_WIDE_INT align)
7789 {
7790   if (aarch64_can_use_per_function_literal_pools_p ())
7791     return function_section (current_function_decl);
7792 
7793   return default_elf_select_rtx_section (mode, x, align);
7794 }
7795 
7796 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
7797 void
7798 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7799 				  HOST_WIDE_INT offset)
7800 {
7801   /* When using per-function literal pools, we must ensure that any code
7802      section is aligned to the minimal instruction length, lest we get
7803      errors from the assembler re "unaligned instructions".  */
7804   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7805     ASM_OUTPUT_ALIGN (f, 2);
7806 }
7807 
7808 /* Costs.  */
7809 
7810 /* Helper function for rtx cost calculation.  Strip a shift expression
7811    from X.  Returns the inner operand if successful, or the original
7812    expression on failure.  */
7813 static rtx
7814 aarch64_strip_shift (rtx x)
7815 {
7816   rtx op = x;
7817 
7818   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7819      we can convert both to ROR during final output.  */
7820   if ((GET_CODE (op) == ASHIFT
7821        || GET_CODE (op) == ASHIFTRT
7822        || GET_CODE (op) == LSHIFTRT
7823        || GET_CODE (op) == ROTATERT
7824        || GET_CODE (op) == ROTATE)
7825       && CONST_INT_P (XEXP (op, 1)))
7826     return XEXP (op, 0);
7827 
7828   if (GET_CODE (op) == MULT
7829       && CONST_INT_P (XEXP (op, 1))
7830       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7831     return XEXP (op, 0);
7832 
7833   return x;
7834 }
7835 
7836 /* Helper function for rtx cost calculation.  Strip an extend
7837    expression from X.  Returns the inner operand if successful, or the
7838    original expression on failure.  We deal with a number of possible
7839    canonicalization variations here. If STRIP_SHIFT is true, then
7840    we can strip off a shift also.  */
7841 static rtx
7842 aarch64_strip_extend (rtx x, bool strip_shift)
7843 {
7844   scalar_int_mode mode;
7845   rtx op = x;
7846 
7847   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7848     return op;
7849 
7850   /* Zero and sign extraction of a widened value.  */
7851   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7852       && XEXP (op, 2) == const0_rtx
7853       && GET_CODE (XEXP (op, 0)) == MULT
7854       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7855 					 XEXP (op, 1)))
7856     return XEXP (XEXP (op, 0), 0);
7857 
7858   /* It can also be represented (for zero-extend) as an AND with an
7859      immediate.  */
7860   if (GET_CODE (op) == AND
7861       && GET_CODE (XEXP (op, 0)) == MULT
7862       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7863       && CONST_INT_P (XEXP (op, 1))
7864       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7865 			   INTVAL (XEXP (op, 1))) != 0)
7866     return XEXP (XEXP (op, 0), 0);
7867 
7868   /* Now handle extended register, as this may also have an optional
7869      left shift by 1..4.  */
7870   if (strip_shift
7871       && GET_CODE (op) == ASHIFT
7872       && CONST_INT_P (XEXP (op, 1))
7873       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7874     op = XEXP (op, 0);
7875 
7876   if (GET_CODE (op) == ZERO_EXTEND
7877       || GET_CODE (op) == SIGN_EXTEND)
7878     op = XEXP (op, 0);
7879 
7880   if (op != x)
7881     return op;
7882 
7883   return x;
7884 }
7885 
7886 /* Return true iff CODE is a shift supported in combination
7887    with arithmetic instructions.  */
7888 
7889 static bool
7890 aarch64_shift_p (enum rtx_code code)
7891 {
7892   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7893 }
7894 
7895 
7896 /* Return true iff X is a cheap shift without a sign extend. */
7897 
7898 static bool
7899 aarch64_cheap_mult_shift_p (rtx x)
7900 {
7901   rtx op0, op1;
7902 
7903   op0 = XEXP (x, 0);
7904   op1 = XEXP (x, 1);
7905 
7906   if (!(aarch64_tune_params.extra_tuning_flags
7907                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7908     return false;
7909 
7910   if (GET_CODE (op0) == SIGN_EXTEND)
7911     return false;
7912 
7913   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7914       && UINTVAL (op1) <= 4)
7915     return true;
7916 
7917   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7918     return false;
7919 
7920   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7921 
7922   if (l2 > 0 && l2 <= 4)
7923     return true;
7924 
7925   return false;
7926 }
7927 
7928 /* Helper function for rtx cost calculation.  Calculate the cost of
7929    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7930    Return the calculated cost of the expression, recursing manually into
7931    operands where needed.  */
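/* For example (illustrative operands only),
     (plus (mult (reg x1) (reg x2)) (reg x3))
   is costed as a MADD, while a DImode multiply of two ZERO_EXTENDed
   SImode registers is costed using the widening-multiply (UMULL)
   entries of the cost table.  */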
7932 
7933 static int
7934 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7935 {
7936   rtx op0, op1;
7937   const struct cpu_cost_table *extra_cost
7938     = aarch64_tune_params.insn_extra_cost;
7939   int cost = 0;
7940   bool compound_p = (outer == PLUS || outer == MINUS);
7941   machine_mode mode = GET_MODE (x);
7942 
7943   gcc_checking_assert (code == MULT);
7944 
7945   op0 = XEXP (x, 0);
7946   op1 = XEXP (x, 1);
7947 
7948   if (VECTOR_MODE_P (mode))
7949     mode = GET_MODE_INNER (mode);
7950 
7951   /* Integer multiply/fma.  */
7952   if (GET_MODE_CLASS (mode) == MODE_INT)
7953     {
7954       /* The multiply will be canonicalized as a shift, cost it as such.  */
7955       if (aarch64_shift_p (GET_CODE (x))
7956 	  || (CONST_INT_P (op1)
7957 	      && exact_log2 (INTVAL (op1)) > 0))
7958 	{
7959 	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7960 	                   || GET_CODE (op0) == SIGN_EXTEND;
7961 	  if (speed)
7962 	    {
7963 	      if (compound_p)
7964 	        {
7965 		  /* If the shift is considered cheap,
7966 		     then don't add any cost. */
7967 		  if (aarch64_cheap_mult_shift_p (x))
7968 		    ;
7969 	          else if (REG_P (op1))
7970 		    /* ARITH + shift-by-register.  */
7971 		    cost += extra_cost->alu.arith_shift_reg;
7972 		  else if (is_extend)
7973 		    /* ARITH + extended register.  We don't have a cost field
7974 		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
7975 		    cost += extra_cost->alu.extend_arith;
7976 		  else
7977 		    /* ARITH + shift-by-immediate.  */
7978 		    cost += extra_cost->alu.arith_shift;
7979 		}
7980 	      else
7981 		/* LSL (immediate).  */
7982 	        cost += extra_cost->alu.shift;
7983 
7984 	    }
7985 	  /* Strip extends as we will have costed them in the case above.  */
7986 	  if (is_extend)
7987 	    op0 = aarch64_strip_extend (op0, true);
7988 
7989 	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7990 
7991 	  return cost;
7992 	}
7993 
7994       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
7995 	 compound and let the below cases handle it.  After all, MNEG is a
7996 	 special-case alias of MSUB.  */
7997       if (GET_CODE (op0) == NEG)
7998 	{
7999 	  op0 = XEXP (op0, 0);
8000 	  compound_p = true;
8001 	}
8002 
8003       /* Integer multiplies or FMAs have zero/sign extending variants.  */
8004       if ((GET_CODE (op0) == ZERO_EXTEND
8005 	   && GET_CODE (op1) == ZERO_EXTEND)
8006 	  || (GET_CODE (op0) == SIGN_EXTEND
8007 	      && GET_CODE (op1) == SIGN_EXTEND))
8008 	{
8009 	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8010 	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8011 
8012 	  if (speed)
8013 	    {
8014 	      if (compound_p)
8015 		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
8016 		cost += extra_cost->mult[0].extend_add;
8017 	      else
8018 		/* MUL/SMULL/UMULL.  */
8019 		cost += extra_cost->mult[0].extend;
8020 	    }
8021 
8022 	  return cost;
8023 	}
8024 
8025       /* This is either an integer multiply or a MADD.  In both cases
8026 	 we want to recurse and cost the operands.  */
8027       cost += rtx_cost (op0, mode, MULT, 0, speed);
8028       cost += rtx_cost (op1, mode, MULT, 1, speed);
8029 
8030       if (speed)
8031 	{
8032 	  if (compound_p)
8033 	    /* MADD/MSUB.  */
8034 	    cost += extra_cost->mult[mode == DImode].add;
8035 	  else
8036 	    /* MUL.  */
8037 	    cost += extra_cost->mult[mode == DImode].simple;
8038 	}
8039 
8040       return cost;
8041     }
8042   else
8043     {
8044       if (speed)
8045 	{
8046 	  /* Floating-point FMA/FMUL can also support negations of the
8047 	     operands, unless the rounding mode is upward or downward in
8048 	     which case FNMUL is different from FMUL with operand negation.  */
8049 	  bool neg0 = GET_CODE (op0) == NEG;
8050 	  bool neg1 = GET_CODE (op1) == NEG;
8051 	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
8052 	    {
8053 	      if (neg0)
8054 		op0 = XEXP (op0, 0);
8055 	      if (neg1)
8056 		op1 = XEXP (op1, 0);
8057 	    }
8058 
8059 	  if (compound_p)
8060 	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
8061 	    cost += extra_cost->fp[mode == DFmode].fma;
8062 	  else
8063 	    /* FMUL/FNMUL.  */
8064 	    cost += extra_cost->fp[mode == DFmode].mult;
8065 	}
8066 
8067       cost += rtx_cost (op0, mode, MULT, 0, speed);
8068       cost += rtx_cost (op1, mode, MULT, 1, speed);
8069       return cost;
8070     }
8071 }
8072 
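/* Return the cost of address X when used to access a value of mode MODE.
   For example (illustrative operands only), an address of the form
   [x0, #16] is costed as imm_offset, while [x0, x1, lsl #3] is costed as
   register_offset plus the addr_scale_costs entry selected by the access
   size.  */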
8073 static int
8074 aarch64_address_cost (rtx x,
8075 		      machine_mode mode,
8076 		      addr_space_t as ATTRIBUTE_UNUSED,
8077 		      bool speed)
8078 {
8079   enum rtx_code c = GET_CODE (x);
8080   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8081   struct aarch64_address_info info;
8082   int cost = 0;
8083   info.shift = 0;
8084 
8085   if (!aarch64_classify_address (&info, x, mode, false))
8086     {
8087       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8088 	{
8089 	  /* This is a CONST or SYMBOL ref which will be split
8090 	     in a different way depending on the code model in use.
8091 	     Cost it through the generic infrastructure.  */
8092 	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8093 	  /* Divide through by the cost of one instruction to
8094 	     bring it to the same units as the address costs.  */
8095 	  cost_symbol_ref /= COSTS_N_INSNS (1);
8096 	  /* The cost is then the cost of preparing the address,
8097 	     followed by an immediate (possibly 0) offset.  */
8098 	  return cost_symbol_ref + addr_cost->imm_offset;
8099 	}
8100       else
8101 	{
8102 	  /* This is most likely a jump table from a case
8103 	     statement.  */
8104 	  return addr_cost->register_offset;
8105 	}
8106     }
8107 
8108   switch (info.type)
8109     {
8110       case ADDRESS_LO_SUM:
8111       case ADDRESS_SYMBOLIC:
8112       case ADDRESS_REG_IMM:
8113 	cost += addr_cost->imm_offset;
8114 	break;
8115 
8116       case ADDRESS_REG_WB:
8117 	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8118 	  cost += addr_cost->pre_modify;
8119 	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8120 	  cost += addr_cost->post_modify;
8121 	else
8122 	  gcc_unreachable ();
8123 
8124 	break;
8125 
8126       case ADDRESS_REG_REG:
8127 	cost += addr_cost->register_offset;
8128 	break;
8129 
8130       case ADDRESS_REG_SXTW:
8131 	cost += addr_cost->register_sextend;
8132 	break;
8133 
8134       case ADDRESS_REG_UXTW:
8135 	cost += addr_cost->register_zextend;
8136 	break;
8137 
8138       default:
8139 	gcc_unreachable ();
8140     }
8141 
8142 
8143   if (info.shift > 0)
8144     {
8145       /* For the sake of calculating the cost of the shifted register
8146 	 component, we can treat same sized modes in the same way.  */
8147       if (known_eq (GET_MODE_BITSIZE (mode), 16))
8148 	cost += addr_cost->addr_scale_costs.hi;
8149       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8150 	cost += addr_cost->addr_scale_costs.si;
8151       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8152 	cost += addr_cost->addr_scale_costs.di;
8153       else
8154 	/* We can't tell, or this is a 128-bit vector.  */
8155 	cost += addr_cost->addr_scale_costs.ti;
8156     }
8157 
8158   return cost;
8159 }
8160 
8161 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
8162    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
8163    to be taken.  */
8164 
8165 int
8166 aarch64_branch_cost (bool speed_p, bool predictable_p)
8167 {
8168   /* When optimizing for speed, use the cost of unpredictable branches.  */
8169   const struct cpu_branch_cost *branch_costs =
8170     aarch64_tune_params.branch_costs;
8171 
8172   if (!speed_p || predictable_p)
8173     return branch_costs->predictable;
8174   else
8175     return branch_costs->unpredictable;
8176 }
8177 
8178 /* Return true if the RTX X in mode MODE is a zero or sign extract
8179    usable in an ADD or SUB (extended register) instruction.  */
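/* For example (illustrative operands only), (zero_extend:DI (reg:SI w1))
   used as an operand of a PLUS or MINUS corresponds to the extended
   register form ADD/SUB <Xd>, <Xn>, <Wm>, uxtw.  */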
8180 static bool
8181 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8182 {
8183   /* Catch add with a sign extract.
8184      This is add_<optab><mode>_multp2.  */
8185   if (GET_CODE (x) == SIGN_EXTRACT
8186       || GET_CODE (x) == ZERO_EXTRACT)
8187     {
8188       rtx op0 = XEXP (x, 0);
8189       rtx op1 = XEXP (x, 1);
8190       rtx op2 = XEXP (x, 2);
8191 
8192       if (GET_CODE (op0) == MULT
8193 	  && CONST_INT_P (op1)
8194 	  && op2 == const0_rtx
8195 	  && CONST_INT_P (XEXP (op0, 1))
8196 	  && aarch64_is_extend_from_extract (mode,
8197 					     XEXP (op0, 1),
8198 					     op1))
8199 	{
8200 	  return true;
8201 	}
8202     }
8203   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8204      No shift.  */
8205   else if (GET_CODE (x) == SIGN_EXTEND
8206 	   || GET_CODE (x) == ZERO_EXTEND)
8207     return REG_P (XEXP (x, 0));
8208 
8209   return false;
8210 }
8211 
8212 static bool
8213 aarch64_frint_unspec_p (unsigned int u)
8214 {
8215   switch (u)
8216     {
8217       case UNSPEC_FRINTZ:
8218       case UNSPEC_FRINTP:
8219       case UNSPEC_FRINTM:
8220       case UNSPEC_FRINTA:
8221       case UNSPEC_FRINTN:
8222       case UNSPEC_FRINTX:
8223       case UNSPEC_FRINTI:
8224         return true;
8225 
8226       default:
8227         return false;
8228     }
8229 }
8230 
8231 /* Return true iff X is an rtx that will match an extr instruction
8232    i.e. as described in the *extr<mode>5_insn family of patterns.
8233    OP0 and OP1 will be set to the operands of the shifts involved
8234    on success and will be NULL_RTX otherwise.  */
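/* For example (illustrative operands only), in DImode
     (ior (ashift (reg x0) (const_int 48)) (lshiftrt (reg x1) (const_int 16)))
   matches, since the two shift amounts sum to 64, and corresponds to
   EXTR <Xd>, x0, x1, #16.  */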
8235 
8236 static bool
8237 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8238 {
8239   rtx op0, op1;
8240   scalar_int_mode mode;
8241   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8242     return false;
8243 
8244   *res_op0 = NULL_RTX;
8245   *res_op1 = NULL_RTX;
8246 
8247   if (GET_CODE (x) != IOR)
8248     return false;
8249 
8250   op0 = XEXP (x, 0);
8251   op1 = XEXP (x, 1);
8252 
8253   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8254       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8255     {
8256      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
8257       if (GET_CODE (op1) == ASHIFT)
8258         std::swap (op0, op1);
8259 
8260       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8261         return false;
8262 
8263       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8264       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8265 
8266       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8267           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8268         {
8269           *res_op0 = XEXP (op0, 0);
8270           *res_op1 = XEXP (op1, 0);
8271           return true;
8272         }
8273     }
8274 
8275   return false;
8276 }
8277 
8278 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8279    storing it in *COST.  Result is true if the total cost of the operation
8280    has now been calculated.  */
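/* For example (illustrative operands only), a conditional branch such as
     (if_then_else (ne (reg x0) (const_int 0)) (label_ref ...) (pc))
   is costed as a CBNZ, a branch on a single-bit ZERO_EXTRACT as a
   TBZ/TBNZ, and a non-branch IF_THEN_ELSE on the CC register as some
   flavour of CSEL.  */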
8281 static bool
8282 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8283 {
8284   rtx inner;
8285   rtx comparator;
8286   enum rtx_code cmpcode;
8287 
8288   if (COMPARISON_P (op0))
8289     {
8290       inner = XEXP (op0, 0);
8291       comparator = XEXP (op0, 1);
8292       cmpcode = GET_CODE (op0);
8293     }
8294   else
8295     {
8296       inner = op0;
8297       comparator = const0_rtx;
8298       cmpcode = NE;
8299     }
8300 
8301   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8302     {
8303       /* Conditional branch.  */
8304       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8305 	return true;
8306       else
8307 	{
8308 	  if (cmpcode == NE || cmpcode == EQ)
8309 	    {
8310 	      if (comparator == const0_rtx)
8311 		{
8312 		  /* TBZ/TBNZ/CBZ/CBNZ.  */
8313 		  if (GET_CODE (inner) == ZERO_EXTRACT)
8314 		    /* TBZ/TBNZ.  */
8315 		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8316 				       ZERO_EXTRACT, 0, speed);
8317 		  else
8318 		    /* CBZ/CBNZ.  */
8319 		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8320 
8321 	        return true;
8322 	      }
8323 	    }
8324 	  else if (cmpcode == LT || cmpcode == GE)
8325 	    {
8326 	      /* TBZ/TBNZ.  */
8327 	      if (comparator == const0_rtx)
8328 		return true;
8329 	    }
8330 	}
8331     }
8332   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8333     {
8334       /* CCMP.  */
8335       if (GET_CODE (op1) == COMPARE)
8336 	{
8337 	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
8338 	  if (XEXP (op1, 1) == const0_rtx)
8339 	    *cost += 1;
8340 	  if (speed)
8341 	    {
8342 	      machine_mode mode = GET_MODE (XEXP (op1, 0));
8343 	      const struct cpu_cost_table *extra_cost
8344 		= aarch64_tune_params.insn_extra_cost;
8345 
8346 	      if (GET_MODE_CLASS (mode) == MODE_INT)
8347 		*cost += extra_cost->alu.arith;
8348 	      else
8349 		*cost += extra_cost->fp[mode == DFmode].compare;
8350 	    }
8351 	  return true;
8352 	}
8353 
8354       /* It's a conditional operation based on the status flags,
8355 	 so it must be some flavor of CSEL.  */
8356 
8357       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
8358       if (GET_CODE (op1) == NEG
8359           || GET_CODE (op1) == NOT
8360           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8361 	op1 = XEXP (op1, 0);
8362       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8363 	{
8364 	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
8365 	  op1 = XEXP (op1, 0);
8366 	  op2 = XEXP (op2, 0);
8367 	}
8368 
8369       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8370       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8371       return true;
8372     }
8373 
8374   /* We don't know what this is, cost all operands.  */
8375   return false;
8376 }
8377 
8378 /* Check whether X is a bitfield operation of the form shift + extend that
8379    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
8380    operand to which the bitfield operation is applied.  Otherwise return
8381    NULL_RTX.  */
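/* For example (illustrative operands only),
     (zero_extend:SI (lshiftrt:HI (reg:HI) (const_int 3)))
   maps to a UBFX, while
     (sign_extend:DI (ashift:QI (reg:QI) (const_int 2)))
   maps to an SBFIZ.  */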
8382 
8383 static rtx
8384 aarch64_extend_bitfield_pattern_p (rtx x)
8385 {
8386   rtx_code outer_code = GET_CODE (x);
8387   machine_mode outer_mode = GET_MODE (x);
8388 
8389   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8390       && outer_mode != SImode && outer_mode != DImode)
8391     return NULL_RTX;
8392 
8393   rtx inner = XEXP (x, 0);
8394   rtx_code inner_code = GET_CODE (inner);
8395   machine_mode inner_mode = GET_MODE (inner);
8396   rtx op = NULL_RTX;
8397 
8398   switch (inner_code)
8399     {
8400       case ASHIFT:
8401 	if (CONST_INT_P (XEXP (inner, 1))
8402 	    && (inner_mode == QImode || inner_mode == HImode))
8403 	  op = XEXP (inner, 0);
8404 	break;
8405       case LSHIFTRT:
8406 	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8407 	    && (inner_mode == QImode || inner_mode == HImode))
8408 	  op = XEXP (inner, 0);
8409 	break;
8410       case ASHIFTRT:
8411 	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8412 	    && (inner_mode == QImode || inner_mode == HImode))
8413 	  op = XEXP (inner, 0);
8414 	break;
8415       default:
8416 	break;
8417     }
8418 
8419   return op;
8420 }
8421 
8422 /* Return true if the mask and a shift amount from an RTX of the form
8423    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8424    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
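/* For example (illustrative values only), in SImode a shift amount of 4
   and a mask of 0xf0 qualify: (x << 4) & 0xf0 is equivalent to
   UBFIZ <Wd>, <Wn>, #4, #4.  */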
8425 
8426 bool
8427 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8428 				    rtx shft_amnt)
8429 {
8430   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8431 	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8432 	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8433 	 && (INTVAL (mask)
8434 	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
8435 }
8436 
8437 /* Calculate the cost of calculating X, storing it in *COST.  Result
8438    is true if the total cost of the operation has now been calculated.  */
8439 static bool
8440 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8441 		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8442 {
8443   rtx op0, op1, op2;
8444   const struct cpu_cost_table *extra_cost
8445     = aarch64_tune_params.insn_extra_cost;
8446   int code = GET_CODE (x);
8447   scalar_int_mode int_mode;
8448 
8449   /* By default, assume that everything has equivalent cost to the
8450      cheapest instruction.  Any additional costs are applied as a delta
8451      above this default.  */
8452   *cost = COSTS_N_INSNS (1);
8453 
8454   switch (code)
8455     {
8456     case SET:
8457       /* The cost depends entirely on the operands to SET.  */
8458       *cost = 0;
8459       op0 = SET_DEST (x);
8460       op1 = SET_SRC (x);
8461 
8462       switch (GET_CODE (op0))
8463 	{
8464 	case MEM:
8465 	  if (speed)
8466 	    {
8467 	      rtx address = XEXP (op0, 0);
8468 	      if (VECTOR_MODE_P (mode))
8469 		*cost += extra_cost->ldst.storev;
8470 	      else if (GET_MODE_CLASS (mode) == MODE_INT)
8471 		*cost += extra_cost->ldst.store;
8472 	      else if (mode == SFmode)
8473 		*cost += extra_cost->ldst.storef;
8474 	      else if (mode == DFmode)
8475 		*cost += extra_cost->ldst.stored;
8476 
8477 	      *cost +=
8478 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
8479 						     0, speed));
8480 	    }
8481 
8482 	  *cost += rtx_cost (op1, mode, SET, 1, speed);
8483 	  return true;
8484 
8485 	case SUBREG:
8486 	  if (! REG_P (SUBREG_REG (op0)))
8487 	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8488 
8489 	  /* Fall through.  */
8490 	case REG:
8491 	  /* The cost is one per vector-register copied.  */
8492 	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8493 	    {
8494 	      int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8495 	      *cost = COSTS_N_INSNS (nregs);
8496 	    }
8497 	  /* const0_rtx is in general free, but we will use an
8498 	     instruction to set a register to 0.  */
8499 	  else if (REG_P (op1) || op1 == const0_rtx)
8500 	    {
8501 	      /* The cost is 1 per register copied.  */
8502 	      int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8503 	      *cost = COSTS_N_INSNS (nregs);
8504 	    }
8505           else
8506 	    /* Cost is just the cost of the RHS of the set.  */
8507 	    *cost += rtx_cost (op1, mode, SET, 1, speed);
8508 	  return true;
8509 
8510 	case ZERO_EXTRACT:
8511 	case SIGN_EXTRACT:
8512 	  /* Bit-field insertion.  Strip any redundant widening of
8513 	     the RHS to meet the width of the target.  */
8514 	  if (GET_CODE (op1) == SUBREG)
8515 	    op1 = SUBREG_REG (op1);
8516 	  if ((GET_CODE (op1) == ZERO_EXTEND
8517 	       || GET_CODE (op1) == SIGN_EXTEND)
8518 	      && CONST_INT_P (XEXP (op0, 1))
8519 	      && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8520 	      && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8521 	    op1 = XEXP (op1, 0);
8522 
8523           if (CONST_INT_P (op1))
8524             {
8525               /* MOV immediate is assumed to always be cheap.  */
8526               *cost = COSTS_N_INSNS (1);
8527             }
8528           else
8529             {
8530               /* BFM.  */
8531 	      if (speed)
8532 		*cost += extra_cost->alu.bfi;
8533 	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8534             }
8535 
8536 	  return true;
8537 
8538 	default:
8539 	  /* We can't make sense of this, assume default cost.  */
8540           *cost = COSTS_N_INSNS (1);
8541 	  return false;
8542 	}
8543       return false;
8544 
8545     case CONST_INT:
8546       /* If an instruction can incorporate a constant within the
8547 	 instruction, the instruction's expression avoids calling
8548 	 rtx_cost() on the constant.  If rtx_cost() is called on a
8549 	 constant, then it is usually because the constant must be
8550 	 moved into a register by one or more instructions.
8551 
8552 	 The exception is constant 0, which can be expressed
8553 	 as XZR/WZR and is therefore free.  The exception to this is
8554 	 if we have (set (reg) (const0_rtx)) in which case we must cost
8555 	 the move.  However, we can catch that when we cost the SET, so
8556 	 we don't need to consider that here.  */
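      /* For example (illustrative values only), 0x2345 needs a single
	 MOVZ and so is costed as one instruction, while 0x12345 needs a
	 MOVZ plus a MOVK and so is costed as two.  */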
8557       if (x == const0_rtx)
8558 	*cost = 0;
8559       else
8560 	{
8561 	  /* To an approximation, building any other constant is
8562 	     proportionally expensive to the number of instructions
8563 	     required to build that constant.  This is true whether we
8564 	     are compiling for SPEED or otherwise.  */
8565 	  if (!is_a <scalar_int_mode> (mode, &int_mode))
8566 	    int_mode = word_mode;
8567 	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8568 				 (NULL_RTX, x, false, int_mode));
8569 	}
8570       return true;
8571 
8572     case CONST_DOUBLE:
8573 
8574       /* First determine number of instructions to do the move
8575 	  as an integer constant.  */
8576       if (!aarch64_float_const_representable_p (x)
8577 	   && !aarch64_can_const_movi_rtx_p (x, mode)
8578 	   && aarch64_float_const_rtx_p (x))
8579 	{
8580 	  unsigned HOST_WIDE_INT ival;
8581 	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8582 	  gcc_assert (succeed);
8583 
8584 	  scalar_int_mode imode = (mode == HFmode
8585 				   ? SImode
8586 				   : int_mode_for_mode (mode).require ());
8587 	  int ncost = aarch64_internal_mov_immediate
8588 		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
8589 	  *cost += COSTS_N_INSNS (ncost);
8590 	  return true;
8591 	}
8592 
8593       if (speed)
8594 	{
8595 	  /* mov[df,sf]_aarch64.  */
8596 	  if (aarch64_float_const_representable_p (x))
8597 	    /* FMOV (scalar immediate).  */
8598 	    *cost += extra_cost->fp[mode == DFmode].fpconst;
8599 	  else if (!aarch64_float_const_zero_rtx_p (x))
8600 	    {
8601 	      /* This will be a load from memory.  */
8602 	      if (mode == DFmode)
8603 		*cost += extra_cost->ldst.loadd;
8604 	      else
8605 		*cost += extra_cost->ldst.loadf;
8606 	    }
8607 	  else
8608 	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
8609 	       or MOV v0.s[0], wzr - neither of which are modeled by the
8610 	       cost tables.  Just use the default cost.  */
8611 	    {
8612 	    }
8613 	}
8614 
8615       return true;
8616 
8617     case MEM:
8618       if (speed)
8619 	{
8620 	  /* For loads we want the base cost of a load, plus an
8621 	     approximation for the additional cost of the addressing
8622 	     mode.  */
8623 	  rtx address = XEXP (x, 0);
8624 	  if (VECTOR_MODE_P (mode))
8625 	    *cost += extra_cost->ldst.loadv;
8626 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
8627 	    *cost += extra_cost->ldst.load;
8628 	  else if (mode == SFmode)
8629 	    *cost += extra_cost->ldst.loadf;
8630 	  else if (mode == DFmode)
8631 	    *cost += extra_cost->ldst.loadd;
8632 
8633 	  *cost +=
8634 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
8635 						     0, speed));
8636 	}
8637 
8638       return true;
8639 
8640     case NEG:
8641       op0 = XEXP (x, 0);
8642 
8643       if (VECTOR_MODE_P (mode))
8644 	{
8645 	  if (speed)
8646 	    {
8647 	      /* FNEG.  */
8648 	      *cost += extra_cost->vect.alu;
8649 	    }
8650 	  return false;
8651 	}
8652 
8653       if (GET_MODE_CLASS (mode) == MODE_INT)
8654 	{
8655           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8656               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8657             {
8658               /* CSETM.  */
8659 	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8660               return true;
8661             }
8662 
8663 	  /* Cost this as SUB wzr, X.  */
8664           op0 = CONST0_RTX (mode);
8665           op1 = XEXP (x, 0);
8666           goto cost_minus;
8667         }
8668 
8669       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8670         {
8671           /* Support (neg(fma...)) as a single instruction only if
8672              sign of zeros is unimportant.  This matches the decision
8673              making in aarch64.md.  */
8674           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8675             {
8676 	      /* FNMADD.  */
8677 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
8678               return true;
8679             }
8680 	  if (GET_CODE (op0) == MULT)
8681 	    {
8682 	      /* FNMUL.  */
8683 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
8684 	      return true;
8685 	    }
8686 	  if (speed)
8687 	    /* FNEG.  */
8688 	    *cost += extra_cost->fp[mode == DFmode].neg;
8689           return false;
8690         }
8691 
8692       return false;
8693 
8694     case CLRSB:
8695     case CLZ:
8696       if (speed)
8697 	{
8698 	  if (VECTOR_MODE_P (mode))
8699 	    *cost += extra_cost->vect.alu;
8700 	  else
8701 	    *cost += extra_cost->alu.clz;
8702 	}
8703 
8704       return false;
8705 
8706     case COMPARE:
8707       op0 = XEXP (x, 0);
8708       op1 = XEXP (x, 1);
8709 
8710       if (op1 == const0_rtx
8711 	  && GET_CODE (op0) == AND)
8712 	{
8713 	  x = op0;
8714 	  mode = GET_MODE (op0);
8715 	  goto cost_logic;
8716 	}
8717 
8718       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8719         {
8720           /* TODO: A write to the CC flags possibly costs extra, this
8721 	     needs encoding in the cost tables.  */
8722 
8723 	  mode = GET_MODE (op0);
8724           /* ANDS.  */
8725           if (GET_CODE (op0) == AND)
8726             {
8727               x = op0;
8728               goto cost_logic;
8729             }
8730 
8731           if (GET_CODE (op0) == PLUS)
8732             {
8733 	      /* ADDS (and CMN alias).  */
8734               x = op0;
8735               goto cost_plus;
8736             }
8737 
8738           if (GET_CODE (op0) == MINUS)
8739             {
8740 	      /* SUBS.  */
8741               x = op0;
8742               goto cost_minus;
8743             }
8744 
8745 	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8746 	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8747 	      && CONST_INT_P (XEXP (op0, 2)))
8748 	    {
8749 	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8750 		 Handle it here directly rather than going to cost_logic
8751 		 since we know the immediate generated for the TST is valid
8752 		 so we can avoid creating an intermediate rtx for it only
8753 		 for costing purposes.  */
8754 	      if (speed)
8755 		*cost += extra_cost->alu.logical;
8756 
8757 	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8758 				 ZERO_EXTRACT, 0, speed);
8759 	      return true;
8760 	    }
8761 
8762           if (GET_CODE (op1) == NEG)
8763             {
8764 	      /* CMN.  */
8765 	      if (speed)
8766 		*cost += extra_cost->alu.arith;
8767 
8768 	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8769 	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8770               return true;
8771             }
8772 
8773           /* CMP.
8774 
8775 	     Compare can freely swap the order of operands, and
8776              canonicalization puts the more complex operation first.
8777              But the integer MINUS logic expects the shift/extend
8778              operation in op1.  */
8779           if (! (REG_P (op0)
8780                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8781           {
8782             op0 = XEXP (x, 1);
8783             op1 = XEXP (x, 0);
8784           }
8785           goto cost_minus;
8786         }
8787 
8788       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8789         {
8790 	  /* FCMP.  */
8791 	  if (speed)
8792 	    *cost += extra_cost->fp[mode == DFmode].compare;
8793 
8794           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8795             {
8796 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8797               /* FCMP supports constant 0.0 for no extra cost. */
8798               return true;
8799             }
8800           return false;
8801         }
8802 
8803       if (VECTOR_MODE_P (mode))
8804 	{
8805 	  /* Vector compare.  */
8806 	  if (speed)
8807 	    *cost += extra_cost->vect.alu;
8808 
8809 	  if (aarch64_float_const_zero_rtx_p (op1))
8810 	    {
8811 	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8812 		 cost.  */
8813 	      return true;
8814 	    }
8815 	  return false;
8816 	}
8817       return false;
8818 
8819     case MINUS:
8820       {
8821 	op0 = XEXP (x, 0);
8822 	op1 = XEXP (x, 1);
8823 
8824 cost_minus:
8825 	*cost += rtx_cost (op0, mode, MINUS, 0, speed);
8826 
8827 	/* Detect valid immediates.  */
8828 	if ((GET_MODE_CLASS (mode) == MODE_INT
8829 	     || (GET_MODE_CLASS (mode) == MODE_CC
8830 		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8831 	    && CONST_INT_P (op1)
8832 	    && aarch64_uimm12_shift (INTVAL (op1)))
8833 	  {
8834 	    if (speed)
8835 	      /* SUB(S) (immediate).  */
8836 	      *cost += extra_cost->alu.arith;
8837 	    return true;
8838 	  }
8839 
8840 	/* Look for SUB (extended register).  */
8841 	if (is_a <scalar_int_mode> (mode, &int_mode)
8842 	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8843 	  {
8844 	    if (speed)
8845 	      *cost += extra_cost->alu.extend_arith;
8846 
8847 	    op1 = aarch64_strip_extend (op1, true);
8848 	    *cost += rtx_cost (op1, VOIDmode,
8849 			       (enum rtx_code) GET_CODE (op1), 0, speed);
8850 	    return true;
8851 	  }
8852 
8853 	rtx new_op1 = aarch64_strip_extend (op1, false);
8854 
8855 	/* Cost this as an FMA-alike operation.  */
8856 	if ((GET_CODE (new_op1) == MULT
8857 	     || aarch64_shift_p (GET_CODE (new_op1)))
8858 	    && code != COMPARE)
8859 	  {
8860 	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8861 					    (enum rtx_code) code,
8862 					    speed);
8863 	    return true;
8864 	  }
8865 
8866 	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8867 
8868 	if (speed)
8869 	  {
8870 	    if (VECTOR_MODE_P (mode))
8871 	      {
8872 		/* Vector SUB.  */
8873 		*cost += extra_cost->vect.alu;
8874 	      }
8875 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
8876 	      {
8877 		/* SUB(S).  */
8878 		*cost += extra_cost->alu.arith;
8879 	      }
8880 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8881 	      {
8882 		/* FSUB.  */
8883 		*cost += extra_cost->fp[mode == DFmode].addsub;
8884 	      }
8885 	  }
8886 	return true;
8887       }
8888 
8889     case PLUS:
8890       {
8891 	rtx new_op0;
8892 
8893 	op0 = XEXP (x, 0);
8894 	op1 = XEXP (x, 1);
8895 
8896 cost_plus:
8897 	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8898 	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8899 	  {
8900 	    /* CSINC.  */
8901 	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8902 	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8903 	    return true;
8904 	  }
8905 
8906 	if (GET_MODE_CLASS (mode) == MODE_INT
8907 	    && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8908 		|| aarch64_sve_addvl_addpl_immediate (op1, mode)))
8909 	  {
8910 	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8911 
8912 	    if (speed)
8913 	      /* ADD (immediate).  */
8914 	      *cost += extra_cost->alu.arith;
8915 	    return true;
8916 	  }
8917 
8918 	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
8919 
8920 	/* Look for ADD (extended register).  */
8921 	if (is_a <scalar_int_mode> (mode, &int_mode)
8922 	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8923 	  {
8924 	    if (speed)
8925 	      *cost += extra_cost->alu.extend_arith;
8926 
8927 	    op0 = aarch64_strip_extend (op0, true);
8928 	    *cost += rtx_cost (op0, VOIDmode,
8929 			       (enum rtx_code) GET_CODE (op0), 0, speed);
8930 	    return true;
8931 	  }
8932 
8933 	/* Strip any extend but leave shifts behind, as we will
8934 	   cost them through mult_cost.  */
8935 	new_op0 = aarch64_strip_extend (op0, false);
8936 
8937 	if (GET_CODE (new_op0) == MULT
8938 	    || aarch64_shift_p (GET_CODE (new_op0)))
8939 	  {
8940 	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8941 					    speed);
8942 	    return true;
8943 	  }
8944 
8945 	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8946 
8947 	if (speed)
8948 	  {
8949 	    if (VECTOR_MODE_P (mode))
8950 	      {
8951 		/* Vector ADD.  */
8952 		*cost += extra_cost->vect.alu;
8953 	      }
8954 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
8955 	      {
8956 		/* ADD.  */
8957 		*cost += extra_cost->alu.arith;
8958 	      }
8959 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8960 	      {
8961 		/* FADD.  */
8962 		*cost += extra_cost->fp[mode == DFmode].addsub;
8963 	      }
8964 	  }
8965 	return true;
8966       }
8967 
8968     case BSWAP:
8969       *cost = COSTS_N_INSNS (1);
8970 
8971       if (speed)
8972 	{
8973 	  if (VECTOR_MODE_P (mode))
8974 	    *cost += extra_cost->vect.alu;
8975 	  else
8976 	    *cost += extra_cost->alu.rev;
8977 	}
8978       return false;
8979 
8980     case IOR:
8981       if (aarch_rev16_p (x))
8982         {
8983           *cost = COSTS_N_INSNS (1);
8984 
8985 	  if (speed)
8986 	    {
8987 	      if (VECTOR_MODE_P (mode))
8988 		*cost += extra_cost->vect.alu;
8989 	      else
8990 		*cost += extra_cost->alu.rev;
8991 	    }
8992 	  return true;
8993         }
8994 
8995       if (aarch64_extr_rtx_p (x, &op0, &op1))
8996         {
8997 	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
8998 	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
8999           if (speed)
9000             *cost += extra_cost->alu.shift;
9001 
9002           return true;
9003         }
9004     /* Fall through.  */
9005     case XOR:
9006     case AND:
9007     cost_logic:
9008       op0 = XEXP (x, 0);
9009       op1 = XEXP (x, 1);
9010 
9011       if (VECTOR_MODE_P (mode))
9012 	{
9013 	  if (speed)
9014 	    *cost += extra_cost->vect.alu;
9015 	  return true;
9016 	}
9017 
9018       if (code == AND
9019           && GET_CODE (op0) == MULT
9020           && CONST_INT_P (XEXP (op0, 1))
9021           && CONST_INT_P (op1)
9022           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9023                                INTVAL (op1)) != 0)
9024         {
9025           /* This is a UBFM/SBFM.  */
9026 	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9027 	  if (speed)
9028 	    *cost += extra_cost->alu.bfx;
9029           return true;
9030         }
9031 
9032       if (is_int_mode (mode, &int_mode))
9033 	{
9034 	  if (CONST_INT_P (op1))
9035 	    {
9036 	      /* We have a mask + shift version of a UBFIZ
9037 		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
9038 	      if (GET_CODE (op0) == ASHIFT
9039 		  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9040 							 XEXP (op0, 1)))
9041 		{
9042 		  *cost += rtx_cost (XEXP (op0, 0), int_mode,
9043 				     (enum rtx_code) code, 0, speed);
9044 		  if (speed)
9045 		    *cost += extra_cost->alu.bfx;
9046 
9047 		  return true;
9048 		}
9049 	      else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9050 		{
9051 		/* We possibly get the immediate for free; this is not
9052 		   modelled.  */
9053 		  *cost += rtx_cost (op0, int_mode,
9054 				     (enum rtx_code) code, 0, speed);
9055 		  if (speed)
9056 		    *cost += extra_cost->alu.logical;
9057 
9058 		  return true;
9059 		}
9060 	    }
9061 	  else
9062 	    {
9063 	      rtx new_op0 = op0;
9064 
9065 	      /* Handle ORN, EON, or BIC.  */
9066 	      if (GET_CODE (op0) == NOT)
9067 		op0 = XEXP (op0, 0);
9068 
9069 	      new_op0 = aarch64_strip_shift (op0);
9070 
9071 	      /* If we had a shift on op0 then this is a logical-shift-
9072 		 by-register/immediate operation.  Otherwise, this is just
9073 		 a logical operation.  */
9074 	      if (speed)
9075 		{
9076 		  if (new_op0 != op0)
9077 		    {
9078 		      /* Shift by immediate.  */
9079 		      if (CONST_INT_P (XEXP (op0, 1)))
9080 			*cost += extra_cost->alu.log_shift;
9081 		      else
9082 			*cost += extra_cost->alu.log_shift_reg;
9083 		    }
9084 		  else
9085 		    *cost += extra_cost->alu.logical;
9086 		}
9087 
9088 	      /* In both cases we want to cost both operands.  */
9089 	      *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9090 				 0, speed);
9091 	      *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9092 				 1, speed);
9093 
9094 	      return true;
9095 	    }
9096 	}
9097       return false;
9098 
9099     case NOT:
9100       x = XEXP (x, 0);
9101       op0 = aarch64_strip_shift (x);
9102 
9103       if (VECTOR_MODE_P (mode))
9104 	{
9105 	  /* Vector NOT.  */
9106 	  *cost += extra_cost->vect.alu;
9107 	  return false;
9108 	}
9109 
9110       /* MVN-shifted-reg.  */
9111       if (op0 != x)
9112         {
9113 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9114 
9115           if (speed)
9116             *cost += extra_cost->alu.log_shift;
9117 
9118           return true;
9119         }
9120       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9121          Handle the second form here taking care that 'a' in the above can
9122          be a shift.  */
9123       else if (GET_CODE (op0) == XOR)
9124         {
9125           rtx newop0 = XEXP (op0, 0);
9126           rtx newop1 = XEXP (op0, 1);
9127           rtx op0_stripped = aarch64_strip_shift (newop0);
9128 
9129 	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9130 	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9131 
9132           if (speed)
9133             {
9134               if (op0_stripped != newop0)
9135                 *cost += extra_cost->alu.log_shift;
9136               else
9137                 *cost += extra_cost->alu.logical;
9138             }
9139 
9140           return true;
9141         }
9142       /* MVN.  */
9143       if (speed)
9144 	*cost += extra_cost->alu.logical;
9145 
9146       return false;
9147 
9148     case ZERO_EXTEND:
9149 
9150       op0 = XEXP (x, 0);
9151       /* If a value is written in SI mode, then zero extended to DI
9152 	 mode, the operation will in general be free as a write to
9153 	 a 'w' register implicitly zeroes the upper bits of an 'x'
9154 	 register.  However, if this is
9155 
9156 	   (set (reg) (zero_extend (reg)))
9157 
9158 	 we must cost the explicit register move.  */
9159       if (mode == DImode
9160 	  && GET_MODE (op0) == SImode
9161 	  && outer == SET)
9162 	{
9163 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9164 
9165 	/* If OP_COST is non-zero, then the cost of the zero extend
9166 	   is effectively the cost of the inner operation.  Otherwise
9167 	   we have a MOV instruction and we take the cost from the MOV
9168 	   itself.  This is true independently of whether we are
9169 	   optimizing for space or time.  */
9170 	  if (op_cost)
9171 	    *cost = op_cost;
9172 
9173 	  return true;
9174 	}
9175       else if (MEM_P (op0))
9176 	{
9177 	  /* All loads can zero extend to any size for free.  */
9178 	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9179 	  return true;
9180 	}
9181 
9182       op0 = aarch64_extend_bitfield_pattern_p (x);
9183       if (op0)
9184 	{
9185 	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9186 	  if (speed)
9187 	    *cost += extra_cost->alu.bfx;
9188 	  return true;
9189 	}
9190 
9191       if (speed)
9192 	{
9193 	  if (VECTOR_MODE_P (mode))
9194 	    {
9195 	      /* UMOV.  */
9196 	      *cost += extra_cost->vect.alu;
9197 	    }
9198 	  else
9199 	    {
9200 	      /* We generate an AND instead of UXTB/UXTH.  */
9201 	      *cost += extra_cost->alu.logical;
9202 	    }
9203 	}
9204       return false;
9205 
9206     case SIGN_EXTEND:
9207       if (MEM_P (XEXP (x, 0)))
9208 	{
9209 	  /* LDRSH.  */
9210 	  if (speed)
9211 	    {
9212 	      rtx address = XEXP (XEXP (x, 0), 0);
9213 	      *cost += extra_cost->ldst.load_sign_extend;
9214 
9215 	      *cost +=
9216 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
9217 						     0, speed));
9218 	    }
9219 	  return true;
9220 	}
9221 
9222       op0 = aarch64_extend_bitfield_pattern_p (x);
9223       if (op0)
9224 	{
9225 	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9226 	  if (speed)
9227 	    *cost += extra_cost->alu.bfx;
9228 	  return true;
9229 	}
9230 
9231       if (speed)
9232 	{
9233 	  if (VECTOR_MODE_P (mode))
9234 	    *cost += extra_cost->vect.alu;
9235 	  else
9236 	    *cost += extra_cost->alu.extend;
9237 	}
9238       return false;
9239 
9240     case ASHIFT:
9241       op0 = XEXP (x, 0);
9242       op1 = XEXP (x, 1);
9243 
9244       if (CONST_INT_P (op1))
9245         {
9246 	  if (speed)
9247 	    {
9248 	      if (VECTOR_MODE_P (mode))
9249 		{
9250 		  /* Vector shift (immediate).  */
9251 		  *cost += extra_cost->vect.alu;
9252 		}
9253 	      else
9254 		{
9255 		  /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
9256 		     aliases.  */
9257 		  *cost += extra_cost->alu.shift;
9258 		}
9259 	    }
9260 
9261           /* We can incorporate zero/sign extend for free.  */
9262           if (GET_CODE (op0) == ZERO_EXTEND
9263               || GET_CODE (op0) == SIGN_EXTEND)
9264             op0 = XEXP (op0, 0);
9265 
9266 	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9267           return true;
9268         }
9269       else
9270         {
9271 	  if (VECTOR_MODE_P (mode))
9272 	    {
9273 	      if (speed)
9274 		/* Vector shift (register).  */
9275 		*cost += extra_cost->vect.alu;
9276 	    }
9277 	  else
9278 	    {
9279 	      if (speed)
9280 		/* LSLV.  */
9281 		*cost += extra_cost->alu.shift_reg;
9282 
9283 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9284 		  && CONST_INT_P (XEXP (op1, 1))
9285 		  && known_eq (INTVAL (XEXP (op1, 1)),
9286 			       GET_MODE_BITSIZE (mode) - 1))
9287 		{
9288 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9289 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
9290 		     don't recurse into it.  */
9291 		  return true;
9292 		}
9293 	    }
9294 	  return false;  /* All arguments need to be in registers.  */
9295         }
9296 
9297     case ROTATE:
9298     case ROTATERT:
9299     case LSHIFTRT:
9300     case ASHIFTRT:
9301       op0 = XEXP (x, 0);
9302       op1 = XEXP (x, 1);
9303 
9304       if (CONST_INT_P (op1))
9305 	{
9306 	  /* ASR (immediate) and friends.  */
9307 	  if (speed)
9308 	    {
9309 	      if (VECTOR_MODE_P (mode))
9310 		*cost += extra_cost->vect.alu;
9311 	      else
9312 		*cost += extra_cost->alu.shift;
9313 	    }
9314 
9315 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9316 	  return true;
9317 	}
9318       else
9319 	{
9320 	  if (VECTOR_MODE_P (mode))
9321 	    {
9322 	      if (speed)
9323 		/* Vector shift (register).  */
9324 		*cost += extra_cost->vect.alu;
9325 	    }
9326 	  else
9327 	    {
9328 	      if (speed)
9329 		/* ASR (register) and friends.  */
9330 		*cost += extra_cost->alu.shift_reg;
9331 
9332 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9333 		  && CONST_INT_P (XEXP (op1, 1))
9334 		  && known_eq (INTVAL (XEXP (op1, 1)),
9335 			       GET_MODE_BITSIZE (mode) - 1))
9336 		{
9337 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9338 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
9339 		     don't recurse into it.  */
9340 		  return true;
9341 		}
9342 	    }
9343 	  return false;  /* All arguments need to be in registers.  */
9344 	}
9345 
9346     case SYMBOL_REF:
9347 
9348       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9349 	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9350 	{
9351 	  /* LDR.  */
9352 	  if (speed)
9353 	    *cost += extra_cost->ldst.load;
9354 	}
9355       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9356 	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9357 	{
9358 	  /* ADRP, followed by ADD.  */
9359 	  *cost += COSTS_N_INSNS (1);
9360 	  if (speed)
9361 	    *cost += 2 * extra_cost->alu.arith;
9362 	}
9363       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9364 	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9365 	{
9366 	  /* ADR.  */
9367 	  if (speed)
9368 	    *cost += extra_cost->alu.arith;
9369 	}
9370 
9371       if (flag_pic)
9372 	{
9373 	  /* One extra load instruction, after accessing the GOT.  */
9374 	  *cost += COSTS_N_INSNS (1);
9375 	  if (speed)
9376 	    *cost += extra_cost->ldst.load;
9377 	}
9378       return true;
9379 
9380     case HIGH:
9381     case LO_SUM:
9382       /* ADRP/ADD (immediate).  */
9383       if (speed)
9384 	*cost += extra_cost->alu.arith;
9385       return true;
9386 
9387     case ZERO_EXTRACT:
9388     case SIGN_EXTRACT:
9389       /* UBFX/SBFX.  */
9390       if (speed)
9391 	{
9392 	  if (VECTOR_MODE_P (mode))
9393 	    *cost += extra_cost->vect.alu;
9394 	  else
9395 	    *cost += extra_cost->alu.bfx;
9396 	}
9397 
9398       /* We can trust that the immediates used will be correct (there
9399 	 are no by-register forms), so we need only cost op0.  */
9400       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9401       return true;
9402 
9403     case MULT:
9404       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9405       /* aarch64_rtx_mult_cost always handles recursion to its
9406 	 operands.  */
9407       return true;
9408 
9409     case MOD:
9410     /* We can expand signed mod by power of 2 using a NEGS, two parallel
9411        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
9412        an unconditional negate.  This case should only ever be reached through
9413        the set_smod_pow2_cheap check in expmed.c.  */
9414       if (CONST_INT_P (XEXP (x, 1))
9415 	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9416 	  && (mode == SImode || mode == DImode))
9417 	{
9418 	  /* We expand to 4 instructions.  Reset the baseline.  */
9419 	  *cost = COSTS_N_INSNS (4);
9420 
9421 	  if (speed)
9422 	    *cost += 2 * extra_cost->alu.logical
9423 		     + 2 * extra_cost->alu.arith;
9424 
9425 	  return true;
9426 	}
9427 
9428     /* Fall-through.  */
9429     case UMOD:
9430       if (speed)
9431 	{
9432 	  /* Slightly prefer UMOD over SMOD.  */
9433 	  if (VECTOR_MODE_P (mode))
9434 	    *cost += extra_cost->vect.alu;
9435 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
9436 	    *cost += (extra_cost->mult[mode == DImode].add
9437 		      + extra_cost->mult[mode == DImode].idiv
9438 		      + (code == MOD ? 1 : 0));
9439 	}
9440       return false;  /* All arguments need to be in registers.  */
9441 
9442     case DIV:
9443     case UDIV:
9444     case SQRT:
9445       if (speed)
9446 	{
9447 	  if (VECTOR_MODE_P (mode))
9448 	    *cost += extra_cost->vect.alu;
9449 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
9450 	    /* There is no integer SQRT, so only DIV and UDIV can get
9451 	       here.  */
9452 	    *cost += (extra_cost->mult[mode == DImode].idiv
9453 		     /* Slightly prefer UDIV over SDIV.  */
9454 		     + (code == DIV ? 1 : 0));
9455 	  else
9456 	    *cost += extra_cost->fp[mode == DFmode].div;
9457 	}
9458       return false;  /* All arguments need to be in registers.  */
9459 
9460     case IF_THEN_ELSE:
9461       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9462 					 XEXP (x, 2), cost, speed);
9463 
9464     case EQ:
9465     case NE:
9466     case GT:
9467     case GTU:
9468     case LT:
9469     case LTU:
9470     case GE:
9471     case GEU:
9472     case LE:
9473     case LEU:
9474 
9475       return false; /* All arguments must be in registers.  */
9476 
9477     case FMA:
9478       op0 = XEXP (x, 0);
9479       op1 = XEXP (x, 1);
9480       op2 = XEXP (x, 2);
9481 
9482       if (speed)
9483 	{
9484 	  if (VECTOR_MODE_P (mode))
9485 	    *cost += extra_cost->vect.alu;
9486 	  else
9487 	    *cost += extra_cost->fp[mode == DFmode].fma;
9488 	}
9489 
9490       /* FMSUB, FNMADD, and FNMSUB are free.  */
9491       if (GET_CODE (op0) == NEG)
9492         op0 = XEXP (op0, 0);
9493 
9494       if (GET_CODE (op2) == NEG)
9495         op2 = XEXP (op2, 0);
9496 
9497       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9498 	 and the by-element operand as operand 0.  */
9499       if (GET_CODE (op1) == NEG)
9500         op1 = XEXP (op1, 0);
9501 
9502       /* Catch vector-by-element operations.  The by-element operand can
9503 	 either be (vec_duplicate (vec_select (x))) or just
9504 	 (vec_select (x)), depending on whether we are multiplying by
9505 	 a vector or a scalar.
9506 
9507 	 Canonicalization is not very good in these cases, FMA4 will put the
9508 	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
9509       if (GET_CODE (op0) == VEC_DUPLICATE)
9510 	op0 = XEXP (op0, 0);
9511       else if (GET_CODE (op1) == VEC_DUPLICATE)
9512 	op1 = XEXP (op1, 0);
9513 
9514       if (GET_CODE (op0) == VEC_SELECT)
9515 	op0 = XEXP (op0, 0);
9516       else if (GET_CODE (op1) == VEC_SELECT)
9517 	op1 = XEXP (op1, 0);
9518 
9519       /* If the remaining parameters are not registers,
9520          get the cost to put them into registers.  */
9521       *cost += rtx_cost (op0, mode, FMA, 0, speed);
9522       *cost += rtx_cost (op1, mode, FMA, 1, speed);
9523       *cost += rtx_cost (op2, mode, FMA, 2, speed);
9524       return true;
9525 
9526     case FLOAT:
9527     case UNSIGNED_FLOAT:
9528       if (speed)
9529 	*cost += extra_cost->fp[mode == DFmode].fromint;
9530       return false;
9531 
9532     case FLOAT_EXTEND:
9533       if (speed)
9534 	{
9535 	  if (VECTOR_MODE_P (mode))
9536 	    {
9537 	      /* Vector widening conversion.  */
9538 	      *cost += extra_cost->vect.alu;
9539 	    }
9540 	  else
9541 	    *cost += extra_cost->fp[mode == DFmode].widen;
9542 	}
9543       return false;
9544 
9545     case FLOAT_TRUNCATE:
9546       if (speed)
9547 	{
9548 	  if (VECTOR_MODE_P (mode))
9549 	    {
9550 	      /* Vector narrowing conversion.  */
9551 	      *cost += extra_cost->vect.alu;
9552 	    }
9553 	  else
9554 	    *cost += extra_cost->fp[mode == DFmode].narrow;
9555 	}
9556       return false;
9557 
9558     case FIX:
9559     case UNSIGNED_FIX:
9560       x = XEXP (x, 0);
9561       /* Strip the rounding part.  They will all be implemented
9562          by the fcvt* family of instructions anyway.  */
9563       if (GET_CODE (x) == UNSPEC)
9564         {
9565           unsigned int uns_code = XINT (x, 1);
9566 
9567           if (uns_code == UNSPEC_FRINTA
9568               || uns_code == UNSPEC_FRINTM
9569               || uns_code == UNSPEC_FRINTN
9570               || uns_code == UNSPEC_FRINTP
9571               || uns_code == UNSPEC_FRINTZ)
9572             x = XVECEXP (x, 0, 0);
9573         }
9574 
9575       if (speed)
9576 	{
9577 	  if (VECTOR_MODE_P (mode))
9578 	    *cost += extra_cost->vect.alu;
9579 	  else
9580 	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9581 	}
9582 
9583       /* We can combine fmul by a power of 2 followed by a fcvt into a single
9584 	 fixed-point fcvt.  */
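      /* For example (illustrative operands only),
	 (fix:SI (mult:SF (reg:SF s0) (const_double 65536.0)))
	 can be implemented as a single FCVTZS <Wd>, <Sn>, #16.  */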
9585       if (GET_CODE (x) == MULT
9586 	  && ((VECTOR_MODE_P (mode)
9587 	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9588 	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9589 	{
9590 	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9591 			     0, speed);
9592 	  return true;
9593 	}
9594 
9595       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9596       return true;
9597 
9598     case ABS:
9599       if (VECTOR_MODE_P (mode))
9600 	{
9601 	  /* ABS (vector).  */
9602 	  if (speed)
9603 	    *cost += extra_cost->vect.alu;
9604 	}
9605       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9606 	{
9607 	  op0 = XEXP (x, 0);
9608 
9609 	  /* FABD, which is analogous to FADD.  */
9610 	  if (GET_CODE (op0) == MINUS)
9611 	    {
9612 	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9613 	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9614 	      if (speed)
9615 		*cost += extra_cost->fp[mode == DFmode].addsub;
9616 
9617 	      return true;
9618 	    }
9619 	  /* Simple FABS is analogous to FNEG.  */
9620 	  if (speed)
9621 	    *cost += extra_cost->fp[mode == DFmode].neg;
9622 	}
9623       else
9624 	{
9625 	  /* Integer ABS will either be split to
9626 	     two arithmetic instructions, or will be an ABS
9627 	     (scalar), which we don't model.  */
9628 	  *cost = COSTS_N_INSNS (2);
9629 	  if (speed)
9630 	    *cost += 2 * extra_cost->alu.arith;
9631 	}
9632       return false;
9633 
9634     case SMAX:
9635     case SMIN:
9636       if (speed)
9637 	{
9638 	  if (VECTOR_MODE_P (mode))
9639 	    *cost += extra_cost->vect.alu;
9640 	  else
9641 	    {
9642 	      /* FMAXNM/FMINNM/FMAX/FMIN.
9643 	         TODO: This may not be accurate for all implementations, but
9644 	         we do not model this in the cost tables.  */
9645 	      *cost += extra_cost->fp[mode == DFmode].addsub;
9646 	    }
9647 	}
9648       return false;
9649 
9650     case UNSPEC:
9651       /* The floating point round to integer frint* instructions.  */
9652       if (aarch64_frint_unspec_p (XINT (x, 1)))
9653         {
9654           if (speed)
9655             *cost += extra_cost->fp[mode == DFmode].roundint;
9656 
9657           return false;
9658         }
9659 
9660       if (XINT (x, 1) == UNSPEC_RBIT)
9661         {
9662           if (speed)
9663             *cost += extra_cost->alu.rev;
9664 
9665           return false;
9666         }
9667       break;
9668 
9669     case TRUNCATE:
9670 
9671       /* Decompose <su>muldi3_highpart.  */
9672       if (/* (truncate:DI  */
9673 	  mode == DImode
9674 	  /*   (lshiftrt:TI  */
9675           && GET_MODE (XEXP (x, 0)) == TImode
9676           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9677 	  /*      (mult:TI  */
9678           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9679 	  /*        (ANY_EXTEND:TI (reg:DI))
9680 	            (ANY_EXTEND:TI (reg:DI)))  */
9681           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9682                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9683               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9684                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9685           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9686           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9687 	  /*     (const_int 64)  */
9688           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9689           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9690         {
9691           /* UMULH/SMULH.  */
9692 	  if (speed)
9693 	    *cost += extra_cost->mult[mode == DImode].extend;
9694 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9695 			     mode, MULT, 0, speed);
9696 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9697 			     mode, MULT, 1, speed);
9698           return true;
9699         }
9700 
9701       /* Fall through.  */
9702     default:
9703       break;
9704     }
9705 
9706   if (dump_file
9707       && flag_aarch64_verbose_cost)
9708     fprintf (dump_file,
9709       "\nFailed to cost RTX.  Assuming default cost.\n");
9710 
9711   return true;
9712 }
9713 
9714 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
9715    calculated for X.  This cost is stored in *COST.  Returns true
9716    if the total cost of X was calculated.  */
9717 static bool
9718 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9719 		   int param, int *cost, bool speed)
9720 {
9721   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9722 
9723   if (dump_file
9724       && flag_aarch64_verbose_cost)
9725     {
9726       print_rtl_single (dump_file, x);
9727       fprintf (dump_file, "\n%s cost: %d (%s)\n",
9728 	       speed ? "Hot" : "Cold",
9729 	       *cost, result ? "final" : "partial");
9730     }
9731 
9732   return result;
9733 }
9734 
9735 static int
9736 aarch64_register_move_cost (machine_mode mode,
9737 			    reg_class_t from_i, reg_class_t to_i)
9738 {
9739   enum reg_class from = (enum reg_class) from_i;
9740   enum reg_class to = (enum reg_class) to_i;
9741   const struct cpu_regmove_cost *regmove_cost
9742     = aarch64_tune_params.regmove_cost;
9743 
9744   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
9745   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9746     to = GENERAL_REGS;
9747 
9748   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9749     from = GENERAL_REGS;
9750 
9751   /* Moving between GPR and stack cost is the same as GP2GP.  */
9752   if ((from == GENERAL_REGS && to == STACK_REG)
9753       || (to == GENERAL_REGS && from == STACK_REG))
9754     return regmove_cost->GP2GP;
9755 
9756   /* To/From the stack register, we move via the gprs.  */
9757   if (to == STACK_REG || from == STACK_REG)
9758     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9759             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9760 
9761   if (known_eq (GET_MODE_SIZE (mode), 16))
9762     {
9763       /* 128-bit operations on general registers require 2 instructions.  */
9764       if (from == GENERAL_REGS && to == GENERAL_REGS)
9765 	return regmove_cost->GP2GP * 2;
9766       else if (from == GENERAL_REGS)
9767 	return regmove_cost->GP2FP * 2;
9768       else if (to == GENERAL_REGS)
9769 	return regmove_cost->FP2GP * 2;
9770 
9771       /* When AdvSIMD instructions are disabled it is not possible to move
9772 	 a 128-bit value directly between Q registers.  This is handled in
9773 	 secondary reload.  A general register is used as a scratch to move
9774 	 the upper DI value and the lower DI value is moved directly,
9775 	 hence the cost is the sum of three moves. */
9776       if (! TARGET_SIMD)
9777 	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9778 
9779       return regmove_cost->FP2FP;
9780     }
9781 
9782   if (from == GENERAL_REGS && to == GENERAL_REGS)
9783     return regmove_cost->GP2GP;
9784   else if (from == GENERAL_REGS)
9785     return regmove_cost->GP2FP;
9786   else if (to == GENERAL_REGS)
9787     return regmove_cost->FP2GP;
9788 
9789   return regmove_cost->FP2FP;
9790 }
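
/* As a worked example of the costing above (an illustrative note only;
   the actual per-CPU numbers come from aarch64_tune_params.regmove_cost):
   a 128-bit move from GENERAL_REGS to an FP register class is costed at
   2 * GP2FP because it needs two 64-bit transfers, and any move to or
   from STACK_REG is costed as two legs through GENERAL_REGS by the
   recursive calls above.  */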
9791 
9792 static int
9793 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9794 			  reg_class_t rclass ATTRIBUTE_UNUSED,
9795 			  bool in ATTRIBUTE_UNUSED)
9796 {
9797   return aarch64_tune_params.memmov_cost;
9798 }
9799 
9800 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9801    to optimize 1.0/sqrt.  */
9802 
9803 static bool
9804 use_rsqrt_p (machine_mode mode)
9805 {
9806   return (!flag_trapping_math
9807 	  && flag_unsafe_math_optimizations
9808 	  && ((aarch64_tune_params.approx_modes->recip_sqrt
9809 	       & AARCH64_APPROX_MODE (mode))
9810 	      || flag_mrecip_low_precision_sqrt));
9811 }
9812 
9813 /* Function to decide when to use the approximate reciprocal square root
9814    builtin.  */
9815 
9816 static tree
9817 aarch64_builtin_reciprocal (tree fndecl)
9818 {
9819   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9820 
9821   if (!use_rsqrt_p (mode))
9822     return NULL_TREE;
9823   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9824 }
9825 
9826 typedef rtx (*rsqrte_type) (rtx, rtx);
9827 
9828 /* Select reciprocal square root initial estimate insn depending on machine
9829    mode.  */
9830 
9831 static rsqrte_type
9832 get_rsqrte_type (machine_mode mode)
9833 {
9834   switch (mode)
9835   {
9836     case E_DFmode:   return gen_aarch64_rsqrtedf;
9837     case E_SFmode:   return gen_aarch64_rsqrtesf;
9838     case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9839     case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9840     case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9841     default: gcc_unreachable ();
9842   }
9843 }
9844 
9845 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9846 
9847 /* Select reciprocal square root series step insn depending on machine mode.  */
9848 
9849 static rsqrts_type
9850 get_rsqrts_type (machine_mode mode)
9851 {
9852   switch (mode)
9853   {
9854     case E_DFmode:   return gen_aarch64_rsqrtsdf;
9855     case E_SFmode:   return gen_aarch64_rsqrtssf;
9856     case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9857     case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9858     case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9859     default: gcc_unreachable ();
9860   }
9861 }
9862 
9863 /* Emit instruction sequence to compute either the approximate square root
9864    or its approximate reciprocal, depending on the flag RECP, and return
9865    whether the sequence was emitted or not.  */
9866 
9867 bool
9868 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9869 {
9870   machine_mode mode = GET_MODE (dst);
9871 
9872   if (GET_MODE_INNER (mode) == HFmode)
9873     {
9874       gcc_assert (!recp);
9875       return false;
9876     }
9877 
9878   if (!recp)
9879     {
9880       if (!(flag_mlow_precision_sqrt
9881 	    || (aarch64_tune_params.approx_modes->sqrt
9882 		& AARCH64_APPROX_MODE (mode))))
9883 	return false;
9884 
9885       if (flag_finite_math_only
9886 	  || flag_trapping_math
9887 	  || !flag_unsafe_math_optimizations
9888 	  || optimize_function_for_size_p (cfun))
9889 	return false;
9890     }
9891   else
9892     /* Caller assumes we cannot fail.  */
9893     gcc_assert (use_rsqrt_p (mode));
9894 
9895   machine_mode mmsk = mode_for_int_vector (mode).require ();
9896   rtx xmsk = gen_reg_rtx (mmsk);
9897   if (!recp)
9898     /* When calculating the approximate square root, compare the
9899        argument with 0.0 and create a mask.  */
9900     emit_insn (gen_rtx_SET (xmsk,
9901 			    gen_rtx_NEG (mmsk,
9902 					 gen_rtx_EQ (mmsk, src,
9903 						     CONST0_RTX (mode)))));
9904 
9905   /* Estimate the approximate reciprocal square root.  */
9906   rtx xdst = gen_reg_rtx (mode);
9907   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9908 
9909   /* Iterate over the series twice for SF and thrice for DF.  */
9910   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9911 
9912   /* Optionally do one iteration less for faster performance at the
9913      expense of some accuracy.  */
9914   if ((recp && flag_mrecip_low_precision_sqrt)
9915       || (!recp && flag_mlow_precision_sqrt))
9916     iterations--;
9917 
9918   /* Iterate over the series to calculate the approximate reciprocal square
9919      root.  */
9920   rtx x1 = gen_reg_rtx (mode);
9921   while (iterations--)
9922     {
9923       rtx x2 = gen_reg_rtx (mode);
9924       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9925 
9926       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9927 
9928       if (iterations > 0)
9929 	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9930     }
9931 
9932   if (!recp)
9933     {
9934       /* Qualify the approximate reciprocal square root when the argument is
9935 	 0.0 by squashing the intermediary result to 0.0.  */
9936       rtx xtmp = gen_reg_rtx (mmsk);
9937       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9938 					      gen_rtx_SUBREG (mmsk, xdst, 0)));
9939       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9940 
9941       /* Calculate the approximate square root.  */
9942       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9943     }
9944 
9945   /* Finalize the approximation.  */
9946   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9947 
9948   return true;
9949 }
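
/* For reference, the RTL sequence emitted above corresponds to the
   following scalar sketch (illustrative only, not part of the compiler;
   frsqrte/frsqrts stand for the FRSQRTE/FRSQRTS instructions, where
   FRSQRTS (a, b) computes (3 - a * b) / 2):

     e = frsqrte (x);                  initial estimate of 1/sqrt(x)
     for (i = 0; i < iterations; i++)
       {
         s = frsqrts (x, e * e);
         if (i + 1 < iterations)
           e = e * s;
       }
     reciprocal:  result = e * s
     square root: result = (e * x) * s, with e first forced to 0.0
                  when x == 0.0 via the mask computed above.  */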
9950 
9951 typedef rtx (*recpe_type) (rtx, rtx);
9952 
9953 /* Select reciprocal initial estimate insn depending on machine mode.  */
9954 
9955 static recpe_type
9956 get_recpe_type (machine_mode mode)
9957 {
9958   switch (mode)
9959   {
9960     case E_SFmode:   return (gen_aarch64_frecpesf);
9961     case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9962     case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9963     case E_DFmode:   return (gen_aarch64_frecpedf);
9964     case E_V2DFmode: return (gen_aarch64_frecpev2df);
9965     default:         gcc_unreachable ();
9966   }
9967 }
9968 
9969 typedef rtx (*recps_type) (rtx, rtx, rtx);
9970 
9971 /* Select reciprocal series step insn depending on machine mode.  */
9972 
9973 static recps_type
9974 get_recps_type (machine_mode mode)
9975 {
9976   switch (mode)
9977   {
9978     case E_SFmode:   return (gen_aarch64_frecpssf);
9979     case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9980     case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9981     case E_DFmode:   return (gen_aarch64_frecpsdf);
9982     case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9983     default:         gcc_unreachable ();
9984   }
9985 }
9986 
9987 /* Emit the instruction sequence to compute the approximation for the division
9988    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
9989 
9990 bool
9991 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9992 {
9993   machine_mode mode = GET_MODE (quo);
9994 
9995   if (GET_MODE_INNER (mode) == HFmode)
9996     return false;
9997 
9998   bool use_approx_division_p = (flag_mlow_precision_div
9999 			        || (aarch64_tune_params.approx_modes->division
10000 				    & AARCH64_APPROX_MODE (mode)));
10001 
10002   if (!flag_finite_math_only
10003       || flag_trapping_math
10004       || !flag_unsafe_math_optimizations
10005       || optimize_function_for_size_p (cfun)
10006       || !use_approx_division_p)
10007     return false;
10008 
10009   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10010     return false;
10011 
10012   /* Estimate the approximate reciprocal.  */
10013   rtx xrcp = gen_reg_rtx (mode);
10014   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10015 
10016   /* Iterate over the series twice for SF and thrice for DF.  */
10017   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10018 
10019   /* Optionally do one iteration less for faster performance at the
10020      expense of some accuracy.  */
10021   if (flag_mlow_precision_div)
10022     iterations--;
10023 
10024   /* Iterate over the series to calculate the approximate reciprocal.  */
10025   rtx xtmp = gen_reg_rtx (mode);
10026   while (iterations--)
10027     {
10028       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10029 
10030       if (iterations > 0)
10031 	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10032     }
10033 
10034   if (num != CONST1_RTX (mode))
10035     {
10036       /* As the approximate reciprocal of DEN is already calculated, only
10037 	 calculate the approximate division when NUM is not 1.0.  */
10038       rtx xnum = force_reg (mode, num);
10039       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10040     }
10041 
10042   /* Finalize the approximation.  */
10043   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10044   return true;
10045 }
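
/* For reference, the RTL sequence emitted above corresponds to the
   following scalar sketch (illustrative only, not part of the compiler;
   frecpe/frecps stand for the FRECPE/FRECPS instructions, where
   FRECPS (a, b) computes 2 - a * b):

     r = frecpe (d);                   initial estimate of 1/d
     for (i = 0; i < iterations; i++)
       {
         s = frecps (r, d);
         if (i + 1 < iterations)
           r = r * s;
       }
     if (n != 1.0)
       r = r * n;
     quotient = r * s;                 approximates n / d  */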
10046 
10047 /* Return the number of instructions that can be issued per cycle.  */
10048 static int
10049 aarch64_sched_issue_rate (void)
10050 {
10051   return aarch64_tune_params.issue_rate;
10052 }
10053 
10054 static int
10055 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10056 {
10057   int issue_rate = aarch64_sched_issue_rate ();
10058 
10059   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10060 }
10061 
10062 
10063 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10064    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
10065    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
10066 
10067 static int
10068 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10069 						    int ready_index)
10070 {
10071   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10072 }
10073 
10074 
10075 /* Vectorizer cost model target hooks.  */
10076 
10077 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
10078 static int
10079 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10080 				    tree vectype,
10081 				    int misalign ATTRIBUTE_UNUSED)
10082 {
10083   unsigned elements;
10084   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10085   bool fp = false;
10086 
10087   if (vectype != NULL)
10088     fp = FLOAT_TYPE_P (vectype);
10089 
10090   switch (type_of_cost)
10091     {
10092       case scalar_stmt:
10093 	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10094 
10095       case scalar_load:
10096 	return costs->scalar_load_cost;
10097 
10098       case scalar_store:
10099 	return costs->scalar_store_cost;
10100 
10101       case vector_stmt:
10102 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10103 
10104       case vector_load:
10105 	return costs->vec_align_load_cost;
10106 
10107       case vector_store:
10108 	return costs->vec_store_cost;
10109 
10110       case vec_to_scalar:
10111 	return costs->vec_to_scalar_cost;
10112 
10113       case scalar_to_vec:
10114 	return costs->scalar_to_vec_cost;
10115 
10116       case unaligned_load:
10117       case vector_gather_load:
10118 	return costs->vec_unalign_load_cost;
10119 
10120       case unaligned_store:
10121       case vector_scatter_store:
10122 	return costs->vec_unalign_store_cost;
10123 
10124       case cond_branch_taken:
10125 	return costs->cond_taken_branch_cost;
10126 
10127       case cond_branch_not_taken:
10128 	return costs->cond_not_taken_branch_cost;
10129 
10130       case vec_perm:
10131 	return costs->vec_permute_cost;
10132 
10133       case vec_promote_demote:
10134 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10135 
10136       case vec_construct:
10137 	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10138 	return elements / 2 + 1;
10139 
10140       default:
10141 	gcc_unreachable ();
10142     }
10143 }
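
/* As an illustrative example of the hook above: a vec_construct of a
   four-element vector is costed at 4 / 2 + 1 = 3 regardless of the
   per-CPU tables, whereas the other statement kinds simply return the
   corresponding field of aarch64_tune_params.vec_costs.  */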
10144 
10145 /* Implement targetm.vectorize.add_stmt_cost.  */
10146 static unsigned
10147 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10148 		       struct _stmt_vec_info *stmt_info, int misalign,
10149 		       enum vect_cost_model_location where)
10150 {
10151   unsigned *cost = (unsigned *) data;
10152   unsigned retval = 0;
10153 
10154   if (flag_vect_cost_model)
10155     {
10156       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10157       int stmt_cost =
10158 	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10159 
10160       /* Statements in an inner loop relative to the loop being
10161 	 vectorized are weighted more heavily.  The value here is
10162 	 arbitrary and could potentially be improved with analysis.  */
10163       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10164 	count *= 50; /*  FIXME  */
10165 
10166       retval = (unsigned) (count * stmt_cost);
10167       cost[where] += retval;
10168     }
10169 
10170   return retval;
10171 }
10172 
10173 static void initialize_aarch64_code_model (struct gcc_options *);
10174 
10175 /* Parse the TO_PARSE string and put the architecture struct that it
10176    selects into RES and the architectural features into ISA_FLAGS.
10177    Return an aarch64_parse_opt_result describing the parse result.
10178    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
10179 
10180 static enum aarch64_parse_opt_result
10181 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10182 		    unsigned long *isa_flags)
10183 {
10184   char *ext;
10185   const struct processor *arch;
10186   char *str = (char *) alloca (strlen (to_parse) + 1);
10187   size_t len;
10188 
10189   strcpy (str, to_parse);
10190 
10191   ext = strchr (str, '+');
10192 
10193   if (ext != NULL)
10194     len = ext - str;
10195   else
10196     len = strlen (str);
10197 
10198   if (len == 0)
10199     return AARCH64_PARSE_MISSING_ARG;
10200 
10201 
10202   /* Loop through the list of supported ARCHes to find a match.  */
10203   for (arch = all_architectures; arch->name != NULL; arch++)
10204     {
10205       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10206 	{
10207 	  unsigned long isa_temp = arch->flags;
10208 
10209 	  if (ext != NULL)
10210 	    {
10211 	      /* TO_PARSE string contains at least one extension.  */
10212 	      enum aarch64_parse_opt_result ext_res
10213 		= aarch64_parse_extension (ext, &isa_temp);
10214 
10215 	      if (ext_res != AARCH64_PARSE_OK)
10216 		return ext_res;
10217 	    }
10218 	  /* Extension parsing was successful.  Confirm the result
10219 	     arch and ISA flags.  */
10220 	  *res = arch;
10221 	  *isa_flags = isa_temp;
10222 	  return AARCH64_PARSE_OK;
10223 	}
10224     }
10225 
10226   /* ARCH name not found in list.  */
10227   return AARCH64_PARSE_INVALID_ARG;
10228 }
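
/* As an illustrative example of the parsing above, an -march value such
   as "armv8.2-a+crypto" is split at the first '+': "armv8.2-a" is looked
   up in all_architectures, and the remainder "+crypto" is handed to
   aarch64_parse_extension, which applies the corresponding feature bits
   to the returned ISA flags.  */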
10229 
10230 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
10231    architecture flags into ISA_FLAGS.  Return an aarch64_parse_opt_result
10232    describing the parse result.  If there is an error parsing, RES and
10233    ISA_FLAGS are left unchanged.  */
10234 
10235 static enum aarch64_parse_opt_result
10236 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10237 		   unsigned long *isa_flags)
10238 {
10239   char *ext;
10240   const struct processor *cpu;
10241   char *str = (char *) alloca (strlen (to_parse) + 1);
10242   size_t len;
10243 
10244   strcpy (str, to_parse);
10245 
10246   ext = strchr (str, '+');
10247 
10248   if (ext != NULL)
10249     len = ext - str;
10250   else
10251     len = strlen (str);
10252 
10253   if (len == 0)
10254     return AARCH64_PARSE_MISSING_ARG;
10255 
10256 
10257   /* Loop through the list of supported CPUs to find a match.  */
10258   for (cpu = all_cores; cpu->name != NULL; cpu++)
10259     {
10260       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10261 	{
10262 	  unsigned long isa_temp = cpu->flags;
10263 
10264 
10265 	  if (ext != NULL)
10266 	    {
10267 	      /* TO_PARSE string contains at least one extension.  */
10268 	      enum aarch64_parse_opt_result ext_res
10269 		= aarch64_parse_extension (ext, &isa_temp);
10270 
10271 	      if (ext_res != AARCH64_PARSE_OK)
10272 		return ext_res;
10273 	    }
10274 	  /* Extension parsing was successful.  Confirm the result
10275 	     cpu and ISA flags.  */
10276 	  *res = cpu;
10277 	  *isa_flags = isa_temp;
10278 	  return AARCH64_PARSE_OK;
10279 	}
10280     }
10281 
10282   /* CPU name not found in list.  */
10283   return AARCH64_PARSE_INVALID_ARG;
10284 }
10285 
10286 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10287    Return an aarch64_parse_opt_result describing the parse result.
10288    If the parsing fails, RES does not change.  */
10289 
10290 static enum aarch64_parse_opt_result
10291 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10292 {
10293   const struct processor *cpu;
10294   char *str = (char *) alloca (strlen (to_parse) + 1);
10295 
10296   strcpy (str, to_parse);
10297 
10298   /* Loop through the list of supported CPUs to find a match.  */
10299   for (cpu = all_cores; cpu->name != NULL; cpu++)
10300     {
10301       if (strcmp (cpu->name, str) == 0)
10302 	{
10303 	  *res = cpu;
10304 	  return AARCH64_PARSE_OK;
10305 	}
10306     }
10307 
10308   /* CPU name not found in list.  */
10309   return AARCH64_PARSE_INVALID_ARG;
10310 }
10311 
10312 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10313    described in FLAG.  If it is, return the index bit for that fusion type.
10314    If not, error (printing OPTION_NAME) and return zero.  */
10315 
10316 static unsigned int
10317 aarch64_parse_one_option_token (const char *token,
10318 				size_t length,
10319 				const struct aarch64_flag_desc *flag,
10320 				const char *option_name)
10321 {
10322   for (; flag->name != NULL; flag++)
10323     {
10324       if (length == strlen (flag->name)
10325 	  && !strncmp (flag->name, token, length))
10326 	return flag->flag;
10327     }
10328 
10329   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10330   return 0;
10331 }
10332 
10333 /* Parse OPTION which is a comma-separated list of flags to enable.
10334    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10335    default state we inherit from the CPU tuning structures.  OPTION_NAME
10336    gives the top-level option we are parsing in the -moverride string,
10337    for use in error messages.  */
10338 
10339 static unsigned int
10340 aarch64_parse_boolean_options (const char *option,
10341 			       const struct aarch64_flag_desc *flags,
10342 			       unsigned int initial_state,
10343 			       const char *option_name)
10344 {
10345   const char separator = '.';
10346   const char* specs = option;
10347   const char* ntoken = option;
10348   unsigned int found_flags = initial_state;
10349 
10350   while ((ntoken = strchr (specs, separator)))
10351     {
10352       size_t token_length = ntoken - specs;
10353       unsigned token_ops = aarch64_parse_one_option_token (specs,
10354 							   token_length,
10355 							   flags,
10356 							   option_name);
10357       /* If we find "none" (or, for simplicity's sake, an error) anywhere
10358 	 in the token stream, reset the supported operations.  So:
10359 
10360 	   adrp+add.cmp+branch.none.adrp+add
10361 
10362 	   would have the result of turning on only adrp+add fusion.  */
10363       if (!token_ops)
10364 	found_flags = 0;
10365 
10366       found_flags |= token_ops;
10367       specs = ++ntoken;
10368     }
10369 
10370   /* If the string ended with a separator, report it as ill-formed.  */
10371   if (!(*specs))
10372     {
10373       error ("%s string ill-formed\n", option_name);
10374       return 0;
10375     }
10376 
10377   /* We still have one more token to parse.  */
10378   size_t token_length = strlen (specs);
10379   unsigned token_ops = aarch64_parse_one_option_token (specs,
10380 						       token_length,
10381 						       flags,
10382 						       option_name);
10383   if (!token_ops)
10384     found_flags = 0;
10385 
10386   found_flags |= token_ops;
10387   return found_flags;
10388 }
10389 
10390 /* Support for overriding instruction fusion.  */
10391 
10392 static void
10393 aarch64_parse_fuse_string (const char *fuse_string,
10394 			    struct tune_params *tune)
10395 {
10396   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10397 						     aarch64_fusible_pairs,
10398 						     tune->fusible_ops,
10399 						     "fuse=");
10400 }
10401 
10402 /* Support for overriding other tuning flags.  */
10403 
10404 static void
10405 aarch64_parse_tune_string (const char *tune_string,
10406 			    struct tune_params *tune)
10407 {
10408   tune->extra_tuning_flags
10409     = aarch64_parse_boolean_options (tune_string,
10410 				     aarch64_tuning_flags,
10411 				     tune->extra_tuning_flags,
10412 				     "tune=");
10413 }
10414 
10415 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10416    we understand.  If it is, extract the option string and hand it off to
10417    the appropriate function.  */
10418 
10419 void
10420 aarch64_parse_one_override_token (const char* token,
10421 				  size_t length,
10422 				  struct tune_params *tune)
10423 {
10424   const struct aarch64_tuning_override_function *fn
10425     = aarch64_tuning_override_functions;
10426 
10427   const char *option_part = strchr (token, '=');
10428   if (!option_part)
10429     {
10430       error ("tuning string missing in option (%s)", token);
10431       return;
10432     }
10433 
10434   /* Get the length of the option name.  */
10435   length = option_part - token;
10436   /* Skip the '=' to get to the option string.  */
10437   option_part++;
10438 
10439   for (; fn->name != NULL; fn++)
10440     {
10441       if (!strncmp (fn->name, token, length))
10442 	{
10443 	  fn->parse_override (option_part, tune);
10444 	  return;
10445 	}
10446     }
10447 
10448   error ("unknown tuning option (%s)", token);
10449   return;
10450 }
10451 
10452 /* Validate and clamp the requested TLS size against the code model.  */
10453 
10454 static void
10455 initialize_aarch64_tls_size (struct gcc_options *opts)
10456 {
10457   if (aarch64_tls_size == 0)
10458     aarch64_tls_size = 24;
10459 
10460   switch (opts->x_aarch64_cmodel_var)
10461     {
10462     case AARCH64_CMODEL_TINY:
10463       /* Both the default and maximum TLS size allowed under tiny are 1M, which
10464 	 needs two instructions to address, so we clamp the size to 24 bits.  */
10465       if (aarch64_tls_size > 24)
10466 	aarch64_tls_size = 24;
10467       break;
10468     case AARCH64_CMODEL_SMALL:
10469       /* The maximum TLS size allowed under small is 4G.  */
10470       if (aarch64_tls_size > 32)
10471 	aarch64_tls_size = 32;
10472       break;
10473     case AARCH64_CMODEL_LARGE:
10474       /* The maximum TLS size allowed under large is 16E.
10475 	 FIXME: 16E would need a 64-bit offset; we only support 48 bits for now.  */
10476       if (aarch64_tls_size > 48)
10477 	aarch64_tls_size = 48;
10478       break;
10479     default:
10480       gcc_unreachable ();
10481     }
10482 
10483   return;
10484 }
10485 
10486 /* Parse STRING looking for options in the format:
10487      string	:: option:string
10488      option	:: name=substring
10489      name	:: {a-z}
10490      substring	:: defined by option.  */
10491 
10492 static void
10493 aarch64_parse_override_string (const char* input_string,
10494 			       struct tune_params* tune)
10495 {
10496   const char separator = ':';
10497   size_t string_length = strlen (input_string) + 1;
10498   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10499   char *string = string_root;
10500   strncpy (string, input_string, string_length);
10501   string[string_length - 1] = '\0';
10502 
10503   char* ntoken = string;
10504 
10505   while ((ntoken = strchr (string, separator)))
10506     {
10507       size_t token_length = ntoken - string;
10508       /* Make this substring look like a string.  */
10509       *ntoken = '\0';
10510       aarch64_parse_one_override_token (string, token_length, tune);
10511       string = ++ntoken;
10512     }
10513 
10514   /* One last option to parse.  */
10515   aarch64_parse_one_override_token (string, strlen (string), tune);
10516   free (string_root);
10517 }
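
/* As an illustrative (partly hypothetical) example of the format parsed
   above, a string such as

     -moverride=fuse=adrp+add.cmp+branch:tune=<flag>.<flag>

   is split on ':' into the "fuse=" and "tune=" options, each of which is
   handed to aarch64_parse_one_override_token; the '.'-separated tokens
   are then matched against the aarch64_fusible_pairs and
   aarch64_tuning_flags tables respectively.  */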
10518 
10519 
10520 static void
10521 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10522 {
10523   /* PR 70044: We have to be careful about being called multiple times for the
10524      same function.  This means all changes should be repeatable.  */
10525 
10526   /* If the frame pointer is enabled, set it to a special value that behaves
10527      similar to frame pointer omission.  If we don't do this all leaf functions
10528      will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10529      If flag_omit_frame_pointer has this special value, we must force the
10530      frame pointer if not in a leaf function.  We also need to force it in a
10531      leaf function if flag_omit_frame_pointer is not set or if LR is used.  */
10532   if (opts->x_flag_omit_frame_pointer == 0)
10533     opts->x_flag_omit_frame_pointer = 2;
10534 
10535   /* If not optimizing for size, set the default
10536      alignment to what the target wants.  */
10537   if (!opts->x_optimize_size)
10538     {
10539       if (opts->x_align_loops <= 0)
10540 	opts->x_align_loops = aarch64_tune_params.loop_align;
10541       if (opts->x_align_jumps <= 0)
10542 	opts->x_align_jumps = aarch64_tune_params.jump_align;
10543       if (opts->x_align_functions <= 0)
10544 	opts->x_align_functions = aarch64_tune_params.function_align;
10545     }
10546 
10547   /* We default to no pc-relative literal loads.  */
10548 
10549   aarch64_pcrelative_literal_loads = false;
10550 
10551   /* If -mpc-relative-literal-loads is set on the command line, this
10552      implies that the user asked for PC relative literal loads.  */
10553   if (opts->x_pcrelative_literal_loads == 1)
10554     aarch64_pcrelative_literal_loads = true;
10555 
10556   /* In the tiny memory model it makes no sense to disallow PC relative
10557      literal pool loads.  */
10558   if (aarch64_cmodel == AARCH64_CMODEL_TINY
10559       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10560     aarch64_pcrelative_literal_loads = true;
10561 
10562   /* When enabling the lower precision Newton series for the square root, also
10563      enable it for the reciprocal square root, since the latter is an
10564      intermediary step for the former.  */
10565   if (flag_mlow_precision_sqrt)
10566     flag_mrecip_low_precision_sqrt = true;
10567 }
10568 
10569 /* 'Unpack' the internal tuning structs and update the options
10570     in OPTS.  The caller must have set up selected_tune and selected_arch
10571     as all the other target-specific codegen decisions are
10572     derived from them.  */
10573 
10574 void
10575 aarch64_override_options_internal (struct gcc_options *opts)
10576 {
10577   aarch64_tune_flags = selected_tune->flags;
10578   aarch64_tune = selected_tune->sched_core;
10579   /* Make a copy of the tuning parameters attached to the core, which
10580      we may later overwrite.  */
10581   aarch64_tune_params = *(selected_tune->tune);
10582   aarch64_architecture_version = selected_arch->architecture_version;
10583 
10584   if (opts->x_aarch64_override_tune_string)
10585     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10586 				  &aarch64_tune_params);
10587 
10588   /* This target defaults to strict volatile bitfields.  */
10589   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10590     opts->x_flag_strict_volatile_bitfields = 1;
10591 
10592   initialize_aarch64_code_model (opts);
10593   initialize_aarch64_tls_size (opts);
10594 
10595   int queue_depth = 0;
10596   switch (aarch64_tune_params.autoprefetcher_model)
10597     {
10598       case tune_params::AUTOPREFETCHER_OFF:
10599 	queue_depth = -1;
10600 	break;
10601       case tune_params::AUTOPREFETCHER_WEAK:
10602 	queue_depth = 0;
10603 	break;
10604       case tune_params::AUTOPREFETCHER_STRONG:
10605 	queue_depth = max_insn_queue_index + 1;
10606 	break;
10607       default:
10608 	gcc_unreachable ();
10609     }
10610 
10611   /* We don't mind passing in global_options_set here as we don't use
10612      the *options_set structs anyway.  */
10613   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10614 			 queue_depth,
10615 			 opts->x_param_values,
10616 			 global_options_set.x_param_values);
10617 
10618   /* Set up parameters to be used in prefetching algorithm.  Do not
10619      override the defaults unless we are tuning for a core we have
10620      researched values for.  */
10621   if (aarch64_tune_params.prefetch->num_slots > 0)
10622     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10623 			   aarch64_tune_params.prefetch->num_slots,
10624 			   opts->x_param_values,
10625 			   global_options_set.x_param_values);
10626   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10627     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10628 			   aarch64_tune_params.prefetch->l1_cache_size,
10629 			   opts->x_param_values,
10630 			   global_options_set.x_param_values);
10631   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10632     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10633 			   aarch64_tune_params.prefetch->l1_cache_line_size,
10634 			   opts->x_param_values,
10635 			   global_options_set.x_param_values);
10636   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10637     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10638 			   aarch64_tune_params.prefetch->l2_cache_size,
10639 			   opts->x_param_values,
10640 			   global_options_set.x_param_values);
10641 
10642   /* Use the alternative scheduling-pressure algorithm by default.  */
10643   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10644 			 opts->x_param_values,
10645 			 global_options_set.x_param_values);
10646 
10647   /* Enable software prefetching at the specified optimization level for
10648      CPUs that have prefetch.  Lower the optimization level threshold by 1
10649      when profiling is enabled.  */
10650   if (opts->x_flag_prefetch_loop_arrays < 0
10651       && !opts->x_optimize_size
10652       && aarch64_tune_params.prefetch->default_opt_level >= 0
10653       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10654     opts->x_flag_prefetch_loop_arrays = 1;
10655 
10656   aarch64_override_options_after_change_1 (opts);
10657 }
10658 
10659 /* Print a hint with a suggestion for a core or architecture name that
10660    most closely resembles what the user passed in STR.  ARCH is true if
10661    the user is asking for an architecture name.  ARCH is false if the user
10662    is asking for a core name.  */
10663 
10664 static void
10665 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10666 {
10667   auto_vec<const char *> candidates;
10668   const struct processor *entry = arch ? all_architectures : all_cores;
10669   for (; entry->name != NULL; entry++)
10670     candidates.safe_push (entry->name);
10671 
10672 #ifdef HAVE_LOCAL_CPU_DETECT
10673   /* Also add "native" as a possible value.  */
10674   if (arch)
10675     candidates.safe_push ("native");
10676 #endif
10677 
10678   char *s;
10679   const char *hint = candidates_list_and_hint (str, s, candidates);
10680   if (hint)
10681     inform (input_location, "valid arguments are: %s;"
10682 			     " did you mean %qs?", s, hint);
10683   else
10684     inform (input_location, "valid arguments are: %s", s);
10685 
10686   XDELETEVEC (s);
10687 }
10688 
10689 /* Print a hint with a suggestion for a core name that most closely resembles
10690    what the user passed in STR.  */
10691 
10692 inline static void
10693 aarch64_print_hint_for_core (const char *str)
10694 {
10695   aarch64_print_hint_for_core_or_arch (str, false);
10696 }
10697 
10698 /* Print a hint with a suggestion for an architecture name that most closely
10699    resembles what the user passed in STR.  */
10700 
10701 inline static void
10702 aarch64_print_hint_for_arch (const char *str)
10703 {
10704   aarch64_print_hint_for_core_or_arch (str, true);
10705 }
10706 
10707 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
10708    specified in STR and throw errors if appropriate.  Put the results if
10709    they are valid in RES and ISA_FLAGS.  Return whether the option is
10710    valid.  */
10711 
10712 static bool
10713 aarch64_validate_mcpu (const char *str, const struct processor **res,
10714 		       unsigned long *isa_flags)
10715 {
10716   enum aarch64_parse_opt_result parse_res
10717     = aarch64_parse_cpu (str, res, isa_flags);
10718 
10719   if (parse_res == AARCH64_PARSE_OK)
10720     return true;
10721 
10722   switch (parse_res)
10723     {
10724       case AARCH64_PARSE_MISSING_ARG:
10725 	error ("missing cpu name in %<-mcpu=%s%>", str);
10726 	break;
10727       case AARCH64_PARSE_INVALID_ARG:
10728 	error ("unknown value %qs for -mcpu", str);
10729 	aarch64_print_hint_for_core (str);
10730 	break;
10731       case AARCH64_PARSE_INVALID_FEATURE:
10732 	error ("invalid feature modifier in %<-mcpu=%s%>", str);
10733 	break;
10734       default:
10735 	gcc_unreachable ();
10736     }
10737 
10738   return false;
10739 }
10740 
10741 /* Validate a command-line -march option.  Parse the arch and extensions
10742    (if any) specified in STR and throw errors if appropriate.  Put the
10743    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
10744    option is valid.  */
10745 
10746 static bool
10747 aarch64_validate_march (const char *str, const struct processor **res,
10748 			 unsigned long *isa_flags)
10749 {
10750   enum aarch64_parse_opt_result parse_res
10751     = aarch64_parse_arch (str, res, isa_flags);
10752 
10753   if (parse_res == AARCH64_PARSE_OK)
10754     return true;
10755 
10756   switch (parse_res)
10757     {
10758       case AARCH64_PARSE_MISSING_ARG:
10759 	error ("missing arch name in %<-march=%s%>", str);
10760 	break;
10761       case AARCH64_PARSE_INVALID_ARG:
10762 	error ("unknown value %qs for -march", str);
10763 	aarch64_print_hint_for_arch (str);
10764 	break;
10765       case AARCH64_PARSE_INVALID_FEATURE:
10766 	error ("invalid feature modifier in %<-march=%s%>", str);
10767 	break;
10768       default:
10769 	gcc_unreachable ();
10770     }
10771 
10772   return false;
10773 }
10774 
10775 /* Validate a command-line -mtune option.  Parse the cpu
10776    specified in STR and throw errors if appropriate.  Put the
10777    result, if it is valid, in RES.  Return whether the option is
10778    valid.  */
10779 
10780 static bool
10781 aarch64_validate_mtune (const char *str, const struct processor **res)
10782 {
10783   enum aarch64_parse_opt_result parse_res
10784     = aarch64_parse_tune (str, res);
10785 
10786   if (parse_res == AARCH64_PARSE_OK)
10787     return true;
10788 
10789   switch (parse_res)
10790     {
10791       case AARCH64_PARSE_MISSING_ARG:
10792 	error ("missing cpu name in %<-mtune=%s%>", str);
10793 	break;
10794       case AARCH64_PARSE_INVALID_ARG:
10795 	error ("unknown value %qs for -mtune", str);
10796 	aarch64_print_hint_for_core (str);
10797 	break;
10798       default:
10799 	gcc_unreachable ();
10800     }
10801   return false;
10802 }
10803 
10804 /* Return the CPU corresponding to the enum CPU.
10805    If it doesn't specify a cpu, return the default.  */
10806 
10807 static const struct processor *
10808 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10809 {
10810   if (cpu != aarch64_none)
10811     return &all_cores[cpu];
10812 
10813   /* The & 0x3f is to extract the bottom 6 bits that encode the
10814      default cpu as selected by the --with-cpu GCC configure option
10815      in config.gcc.
10816      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10817      flags mechanism should be reworked to make it more sane.  */
10818   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10819 }
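
/* In other words, TARGET_CPU_DEFAULT packs two fields: the low 6 bits
   hold the index of the configure-time CPU in all_cores (extracted with
   "& 0x3f" above), and the remaining bits hold that CPU's default ISA
   flags (extracted with ">> 6" in aarch64_override_options below).  */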
10820 
10821 /* Return the architecture corresponding to the enum ARCH.
10822    If it doesn't specify a valid architecture, return the default.  */
10823 
10824 static const struct processor *
10825 aarch64_get_arch (enum aarch64_arch arch)
10826 {
10827   if (arch != aarch64_no_arch)
10828     return &all_architectures[arch];
10829 
10830   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10831 
10832   return &all_architectures[cpu->arch];
10833 }
10834 
10835 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
10836 
10837 static poly_uint16
10838 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10839 {
10840   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10841      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10842      deciding which .md file patterns to use and when deciding whether
10843      something is a legitimate address or constant.  */
10844   if (value == SVE_SCALABLE || value == SVE_128)
10845     return poly_uint16 (2, 2);
10846   else
10847     return (int) value / 64;
10848 }
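
/* For example, -msve-vector-bits=256 gives a VG of 256 / 64 = 4 (four
   64-bit granules per vector), while -msve-vector-bits=scalable and, as
   noted above, -msve-vector-bits=128 give the runtime-variable
   poly_uint16 (2, 2), i.e. 2 + 2 * N granules.  */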
10849 
10850 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
10851    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10852    tuning structs.  In particular it must set selected_tune and
10853    aarch64_isa_flags that define the available ISA features and tuning
10854    decisions.  It must also set selected_arch as this will be used to
10855    output the .arch asm tags for each function.  */
10856 
10857 static void
10858 aarch64_override_options (void)
10859 {
10860   unsigned long cpu_isa = 0;
10861   unsigned long arch_isa = 0;
10862   aarch64_isa_flags = 0;
10863 
10864   bool valid_cpu = true;
10865   bool valid_tune = true;
10866   bool valid_arch = true;
10867 
10868   selected_cpu = NULL;
10869   selected_arch = NULL;
10870   selected_tune = NULL;
10871 
10872   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10873      If either of -march or -mtune is given, they override their
10874      respective component of -mcpu.  */
10875   if (aarch64_cpu_string)
10876     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10877 					&cpu_isa);
10878 
10879   if (aarch64_arch_string)
10880     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10881 					  &arch_isa);
10882 
10883   if (aarch64_tune_string)
10884     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10885 
10886   /* If the user did not specify a processor, choose the default
10887      one for them.  This will be the CPU set during configuration using
10888      --with-cpu, otherwise it is "generic".  */
10889   if (!selected_cpu)
10890     {
10891       if (selected_arch)
10892 	{
10893 	  selected_cpu = &all_cores[selected_arch->ident];
10894 	  aarch64_isa_flags = arch_isa;
10895 	  explicit_arch = selected_arch->arch;
10896 	}
10897       else
10898 	{
10899 	  /* Get default configure-time CPU.  */
10900 	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10901 	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10902 	}
10903 
10904       if (selected_tune)
10905 	explicit_tune_core = selected_tune->ident;
10906     }
10907   /* If both -mcpu and -march are specified check that they are architecturally
10908      compatible, warn if they're not and prefer the -march ISA flags.  */
10909   else if (selected_arch)
10910     {
10911       if (selected_arch->arch != selected_cpu->arch)
10912 	{
10913 	  warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10914 		       all_architectures[selected_cpu->arch].name,
10915 		       selected_arch->name);
10916 	}
10917       aarch64_isa_flags = arch_isa;
10918       explicit_arch = selected_arch->arch;
10919       explicit_tune_core = selected_tune ? selected_tune->ident
10920 					  : selected_cpu->ident;
10921     }
10922   else
10923     {
10924       /* -mcpu but no -march.  */
10925       aarch64_isa_flags = cpu_isa;
10926       explicit_tune_core = selected_tune ? selected_tune->ident
10927 					  : selected_cpu->ident;
10928       gcc_assert (selected_cpu);
10929       selected_arch = &all_architectures[selected_cpu->arch];
10930       explicit_arch = selected_arch->arch;
10931     }
10932 
10933   /* Set the arch as well, as we will need it when outputting
10934      the .arch directive in assembly.  */
10935   if (!selected_arch)
10936     {
10937       gcc_assert (selected_cpu);
10938       selected_arch = &all_architectures[selected_cpu->arch];
10939     }
10940 
10941   if (!selected_tune)
10942     selected_tune = selected_cpu;
10943 
10944 #ifndef HAVE_AS_MABI_OPTION
10945   /* The compiler may have been configured with 2.23.* binutils, which does
10946      not have support for ILP32.  */
10947   if (TARGET_ILP32)
10948     error ("assembler does not support -mabi=ilp32");
10949 #endif
10950 
10951   /* Convert -msve-vector-bits to a VG count.  */
10952   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10953 
10954   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10955     sorry ("return address signing is only supported for -mabi=lp64");
10956 
10957   /* Make sure we properly set up the explicit options.  */
10958   if ((aarch64_cpu_string && valid_cpu)
10959        || (aarch64_tune_string && valid_tune))
10960     gcc_assert (explicit_tune_core != aarch64_none);
10961 
10962   if ((aarch64_cpu_string && valid_cpu)
10963        || (aarch64_arch_string && valid_arch))
10964     gcc_assert (explicit_arch != aarch64_no_arch);
10965 
10966   aarch64_override_options_internal (&global_options);
10967 
10968   /* Save these options as the default ones in case we push and pop them later
10969      while processing functions with potential target attributes.  */
10970   target_option_default_node = target_option_current_node
10971       = build_target_option_node (&global_options);
10972 }
10973 
10974 /* Implement targetm.override_options_after_change.  */
10975 
10976 static void
10977 aarch64_override_options_after_change (void)
10978 {
10979   aarch64_override_options_after_change_1 (&global_options);
10980 }
10981 
10982 static struct machine_function *
10983 aarch64_init_machine_status (void)
10984 {
10985   struct machine_function *machine;
10986   machine = ggc_cleared_alloc<machine_function> ();
10987   return machine;
10988 }
10989 
10990 void
10991 aarch64_init_expanders (void)
10992 {
10993   init_machine_status = aarch64_init_machine_status;
10994 }
10995 
10996 /* Select the code model to use, taking the PIC level into account.  */
10997 static void
10998 initialize_aarch64_code_model (struct gcc_options *opts)
10999 {
11000    if (opts->x_flag_pic)
11001      {
11002        switch (opts->x_aarch64_cmodel_var)
11003 	 {
11004 	 case AARCH64_CMODEL_TINY:
11005 	   aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11006 	   break;
11007 	 case AARCH64_CMODEL_SMALL:
11008 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11009 	   aarch64_cmodel = (flag_pic == 2
11010 			     ? AARCH64_CMODEL_SMALL_PIC
11011 			     : AARCH64_CMODEL_SMALL_SPIC);
11012 #else
11013 	   aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11014 #endif
11015 	   break;
11016 	 case AARCH64_CMODEL_LARGE:
11017 	   sorry ("code model %qs with -f%s", "large",
11018 		  opts->x_flag_pic > 1 ? "PIC" : "pic");
11019 	   break;
11020 	 default:
11021 	   gcc_unreachable ();
11022 	 }
11023      }
11024    else
11025      aarch64_cmodel = opts->x_aarch64_cmodel_var;
11026 }
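
/* For example, with the small code model the logic above selects
   AARCH64_CMODEL_SMALL_PIC for -fPIC (flag_pic == 2) and
   AARCH64_CMODEL_SMALL_SPIC for -fpic when the assembler supports the
   small PIC relocations; without -fpic/-fPIC the -mcmodel value is used
   unchanged.  */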
11027 
11028 /* Implement TARGET_OPTION_SAVE.  */
11029 
11030 static void
11031 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11032 {
11033   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11034 }
11035 
11036 /* Implement TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
11037    using the information saved in PTR.  */
11038 
11039 static void
11040 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11041 {
11042   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11043   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11044   opts->x_explicit_arch = ptr->x_explicit_arch;
11045   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11046   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11047 
11048   aarch64_override_options_internal (opts);
11049 }
11050 
11051 /* Implement TARGET_OPTION_PRINT.  */
11052 
11053 static void
11054 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11055 {
11056   const struct processor *cpu
11057     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11058   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11059   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11060   std::string extension
11061     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11062 
11063   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11064   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11065 	   arch->name, extension.c_str ());
11066 }
11067 
11068 static GTY(()) tree aarch64_previous_fndecl;
11069 
11070 void
11071 aarch64_reset_previous_fndecl (void)
11072 {
11073   aarch64_previous_fndecl = NULL;
11074 }
11075 
11076 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11077    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11078    make sure optab availability predicates are recomputed when necessary.  */
11079 
11080 void
11081 aarch64_save_restore_target_globals (tree new_tree)
11082 {
11083   if (TREE_TARGET_GLOBALS (new_tree))
11084     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11085   else if (new_tree == target_option_default_node)
11086     restore_target_globals (&default_target_globals);
11087   else
11088     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11089 }
11090 
11091 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
11092    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11093    of the function, if such exists.  This function may be called multiple
11094    times on a single function so use aarch64_previous_fndecl to avoid
11095    setting up identical state.  */
11096 
11097 static void
11098 aarch64_set_current_function (tree fndecl)
11099 {
11100   if (!fndecl || fndecl == aarch64_previous_fndecl)
11101     return;
11102 
11103   tree old_tree = (aarch64_previous_fndecl
11104 		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11105 		   : NULL_TREE);
11106 
11107   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11108 
11109   /* If current function has no attributes but the previous one did,
11110      use the default node.  */
11111   if (!new_tree && old_tree)
11112     new_tree = target_option_default_node;
11113 
11114   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
11115      the default have been handled by aarch64_save_restore_target_globals from
11116      aarch64_pragma_target_parse.  */
11117   if (old_tree == new_tree)
11118     return;
11119 
11120   aarch64_previous_fndecl = fndecl;
11121 
11122   /* First set the target options.  */
11123   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11124 
11125   aarch64_save_restore_target_globals (new_tree);
11126 }
11127 
11128 /* Enum describing the various ways we can handle attributes.
11129    In many cases we can reuse the generic option handling machinery.  */
11130 
11131 enum aarch64_attr_opt_type
11132 {
11133   aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
11134   aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
11135   aarch64_attr_enum,	/* Attribute sets an enum variable.  */
11136   aarch64_attr_custom	/* Attribute requires a custom handling function.  */
11137 };
11138 
11139 /* All the information needed to handle a target attribute.
11140    NAME is the name of the attribute.
11141    ATTR_TYPE specifies the type of behavior of the attribute as described
11142    in the definition of enum aarch64_attr_opt_type.
11143    ALLOW_NEG is true if the attribute supports a "no-" form.
11144    HANDLER is the function that takes the attribute string as an argument.
11145    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11146    OPT_NUM is the enum specifying the option that the attribute modifies.
11147    This is needed for attributes that mirror the behavior of a command-line
11148    option, that is, those whose ATTR_TYPE is aarch64_attr_mask,
11149    aarch64_attr_bool or aarch64_attr_enum.  */
11150 
11151 struct aarch64_attribute_info
11152 {
11153   const char *name;
11154   enum aarch64_attr_opt_type attr_type;
11155   bool allow_neg;
11156   bool (*handler) (const char *);
11157   enum opt_code opt_num;
11158 };
11159 
11160 /* Handle the ARCH_STR argument to the arch= target attribute.  */
11161 
11162 static bool
11163 aarch64_handle_attr_arch (const char *str)
11164 {
11165   const struct processor *tmp_arch = NULL;
11166   enum aarch64_parse_opt_result parse_res
11167     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11168 
11169   if (parse_res == AARCH64_PARSE_OK)
11170     {
11171       gcc_assert (tmp_arch);
11172       selected_arch = tmp_arch;
11173       explicit_arch = selected_arch->arch;
11174       return true;
11175     }
11176 
11177   switch (parse_res)
11178     {
11179       case AARCH64_PARSE_MISSING_ARG:
11180 	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11181 	break;
11182       case AARCH64_PARSE_INVALID_ARG:
11183 	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11184 	aarch64_print_hint_for_arch (str);
11185 	break;
11186       case AARCH64_PARSE_INVALID_FEATURE:
11187 	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11188 	break;
11189       default:
11190 	gcc_unreachable ();
11191     }
11192 
11193   return false;
11194 }
11195 
11196 /* Handle the argument CPU_STR to the cpu= target attribute.  */
11197 
11198 static bool
11199 aarch64_handle_attr_cpu (const char *str)
11200 {
11201   const struct processor *tmp_cpu = NULL;
11202   enum aarch64_parse_opt_result parse_res
11203     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11204 
11205   if (parse_res == AARCH64_PARSE_OK)
11206     {
11207       gcc_assert (tmp_cpu);
11208       selected_tune = tmp_cpu;
11209       explicit_tune_core = selected_tune->ident;
11210 
11211       selected_arch = &all_architectures[tmp_cpu->arch];
11212       explicit_arch = selected_arch->arch;
11213       return true;
11214     }
11215 
11216   switch (parse_res)
11217     {
11218       case AARCH64_PARSE_MISSING_ARG:
11219 	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11220 	break;
11221       case AARCH64_PARSE_INVALID_ARG:
11222 	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11223 	aarch64_print_hint_for_core (str);
11224 	break;
11225       case AARCH64_PARSE_INVALID_FEATURE:
11226 	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11227 	break;
11228       default:
11229 	gcc_unreachable ();
11230     }
11231 
11232   return false;
11233 }
11234 
11235 /* Handle the argument STR to the tune= target attribute.  */
11236 
11237 static bool
11238 aarch64_handle_attr_tune (const char *str)
11239 {
11240   const struct processor *tmp_tune = NULL;
11241   enum aarch64_parse_opt_result parse_res
11242     = aarch64_parse_tune (str, &tmp_tune);
11243 
11244   if (parse_res == AARCH64_PARSE_OK)
11245     {
11246       gcc_assert (tmp_tune);
11247       selected_tune = tmp_tune;
11248       explicit_tune_core = selected_tune->ident;
11249       return true;
11250     }
11251 
11252   switch (parse_res)
11253     {
11254       case AARCH64_PARSE_INVALID_ARG:
11255 	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11256 	aarch64_print_hint_for_core (str);
11257 	break;
11258       default:
11259 	gcc_unreachable ();
11260     }
11261 
11262   return false;
11263 }
11264 
11265 /* Parse an architecture extensions target attribute string specified in STR.
11266    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
11267    if successful.  Update aarch64_isa_flags to reflect the ISA features
11268    modified.  */
11269 
11270 static bool
11271 aarch64_handle_attr_isa_flags (char *str)
11272 {
11273   enum aarch64_parse_opt_result parse_res;
11274   unsigned long isa_flags = aarch64_isa_flags;
11275 
11276   /* We allow "+nothing" in the beginning to clear out all architectural
11277      features if the user wants to handpick specific features.  */
11278   if (strncmp ("+nothing", str, 8) == 0)
11279     {
11280       isa_flags = 0;
11281       str += 8;
11282     }
11283 
11284   parse_res = aarch64_parse_extension (str, &isa_flags);
11285 
11286   if (parse_res == AARCH64_PARSE_OK)
11287     {
11288       aarch64_isa_flags = isa_flags;
11289       return true;
11290     }
11291 
11292   switch (parse_res)
11293     {
11294       case AARCH64_PARSE_MISSING_ARG:
11295 	error ("missing value in %<target()%> pragma or attribute");
11296 	break;
11297 
11298       case AARCH64_PARSE_INVALID_FEATURE:
11299 	error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11300 	break;
11301 
11302       default:
11303 	gcc_unreachable ();
11304     }
11305 
11306   return false;
11307 }
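
/* A worked example of the flow above (illustrative only): for the attribute
   string "+nothing+simd", the leading "+nothing" clears ISA_FLAGS and STR is
   advanced past it, after which aarch64_parse_extension sees "+simd" and
   turns the SIMD feature (plus whatever features it implies) back on before
   the result is copied into aarch64_isa_flags.  */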
11308 
11309 /* The target attributes that we support.  On top of these we also support just
11310    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
11311    handled explicitly in aarch64_process_one_target_attr.  */
11312 
11313 static const struct aarch64_attribute_info aarch64_attributes[] =
11314 {
11315   { "general-regs-only", aarch64_attr_mask, false, NULL,
11316      OPT_mgeneral_regs_only },
11317   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11318      OPT_mfix_cortex_a53_835769 },
11319   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11320      OPT_mfix_cortex_a53_843419 },
11321   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11322   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11323   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11324      OPT_momit_leaf_frame_pointer },
11325   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11326   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11327      OPT_march_ },
11328   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11329   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11330      OPT_mtune_ },
11331   { "sign-return-address", aarch64_attr_enum, false, NULL,
11332      OPT_msign_return_address_ },
11333   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11334 };
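
/* A few illustrative (not exhaustive) forms that the table above and
   aarch64_process_one_target_attr below accept in user code:

     __attribute__ ((target ("strict-align")))                aarch64_attr_mask
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  negated aarch64_attr_bool
     __attribute__ ((target ("cmodel=small")))                aarch64_attr_enum
     __attribute__ ((target ("arch=armv8-a+crc")))            aarch64_attr_custom
     __attribute__ ((target ("+crc")))                        bare ISA extension  */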
11335 
11336 /* Parse ARG_STR which contains the definition of one target attribute.
11337    Show appropriate errors if any or return true if the attribute is valid.  */
11338 
11339 static bool
11340 aarch64_process_one_target_attr (char *arg_str)
11341 {
11342   bool invert = false;
11343 
11344   size_t len = strlen (arg_str);
11345 
11346   if (len == 0)
11347     {
11348       error ("malformed %<target()%> pragma or attribute");
11349       return false;
11350     }
11351 
11352   char *str_to_check = (char *) alloca (len + 1);
11353   strcpy (str_to_check, arg_str);
11354 
11355   /* Skip leading whitespace.  */
11356   while (*str_to_check == ' ' || *str_to_check == '\t')
11357     str_to_check++;
11358 
11359   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11360      It is easier to detect and handle it explicitly here rather than going
11361      through the machinery for the rest of the target attributes in this
11362      function.  */
11363   if (*str_to_check == '+')
11364     return aarch64_handle_attr_isa_flags (str_to_check);
11365 
11366   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11367     {
11368       invert = true;
11369       str_to_check += 3;
11370     }
11371   char *arg = strchr (str_to_check, '=');
11372 
11373   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11374      and point ARG to "foo".  */
11375   if (arg)
11376     {
11377       *arg = '\0';
11378       arg++;
11379     }
11380   const struct aarch64_attribute_info *p_attr;
11381   bool found = false;
11382   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11383     {
11384       /* If the names don't match up, or the user has given an argument
11385 	 to an attribute that doesn't accept one, or didn't give an argument
11386 	 to an attribute that expects one, fail to match.  */
11387       if (strcmp (str_to_check, p_attr->name) != 0)
11388 	continue;
11389 
11390       found = true;
11391       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11392 			      || p_attr->attr_type == aarch64_attr_enum;
11393 
11394       if (attr_need_arg_p ^ (arg != NULL))
11395 	{
11396 	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11397 	  return false;
11398 	}
11399 
11400       /* If the name matches but the attribute does not allow "no-" versions
11401 	 then we can't match.  */
11402       if (invert && !p_attr->allow_neg)
11403 	{
11404 	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11405 	  return false;
11406 	}
11407 
11408       switch (p_attr->attr_type)
11409 	{
11410 	/* Has a custom handler registered.
11411 	   For example, cpu=, arch=, tune=.  */
11412 	  case aarch64_attr_custom:
11413 	    gcc_assert (p_attr->handler);
11414 	    if (!p_attr->handler (arg))
11415 	      return false;
11416 	    break;
11417 
11418 	  /* Either set or unset a boolean option.  */
11419 	  case aarch64_attr_bool:
11420 	    {
11421 	      struct cl_decoded_option decoded;
11422 
11423 	      generate_option (p_attr->opt_num, NULL, !invert,
11424 			       CL_TARGET, &decoded);
11425 	      aarch64_handle_option (&global_options, &global_options_set,
11426 				      &decoded, input_location);
11427 	      break;
11428 	    }
11429 	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
11430 	     should know what mask to apply given the option number.  */
11431 	  case aarch64_attr_mask:
11432 	    {
11433 	      struct cl_decoded_option decoded;
11434 	      /* We only need to specify the option number.
11435 		 aarch64_handle_option will know which mask to apply.  */
11436 	      decoded.opt_index = p_attr->opt_num;
11437 	      decoded.value = !invert;
11438 	      aarch64_handle_option (&global_options, &global_options_set,
11439 				      &decoded, input_location);
11440 	      break;
11441 	    }
11442 	  /* Use the option setting machinery to set an option to an enum.  */
11443 	  case aarch64_attr_enum:
11444 	    {
11445 	      gcc_assert (arg);
11446 	      bool valid;
11447 	      int value;
11448 	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11449 					      &value, CL_TARGET);
11450 	      if (valid)
11451 		{
11452 		  set_option (&global_options, NULL, p_attr->opt_num, value,
11453 			      NULL, DK_UNSPECIFIED, input_location,
11454 			      global_dc);
11455 		}
11456 	      else
11457 		{
11458 		  error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11459 		}
11460 	      break;
11461 	    }
11462 	  default:
11463 	    gcc_unreachable ();
11464 	}
11465     }
11466 
11467   /* If we reached here we either found and validated an attribute or
11468      didn't match any.  If we matched an attribute but its arguments
11469      were malformed we would have returned false already.  */
11470   return found;
11471 }
11472 
11473 /* Count how many times the character C appears in
11474    NULL-terminated string STR.  */
11475 
11476 static unsigned int
11477 num_occurences_in_str (char c, char *str)
11478 {
11479   unsigned int res = 0;
11480   while (*str != '\0')
11481     {
11482       if (*str == c)
11483 	res++;
11484 
11485       str++;
11486     }
11487 
11488   return res;
11489 }
11490 
11491 /* Parse the tree in ARGS that contains the target attribute information
11492    and update the global target options space.  */
11493 
11494 bool
11495 aarch64_process_target_attr (tree args)
11496 {
11497   if (TREE_CODE (args) == TREE_LIST)
11498     {
11499       do
11500 	{
11501 	  tree head = TREE_VALUE (args);
11502 	  if (head)
11503 	    {
11504 	      if (!aarch64_process_target_attr (head))
11505 		return false;
11506 	    }
11507 	  args = TREE_CHAIN (args);
11508 	} while (args);
11509 
11510       return true;
11511     }
11512 
11513   if (TREE_CODE (args) != STRING_CST)
11514     {
11515       error ("attribute %<target%> argument not a string");
11516       return false;
11517     }
11518 
11519   size_t len = strlen (TREE_STRING_POINTER (args));
11520   char *str_to_check = (char *) alloca (len + 1);
11521   strcpy (str_to_check, TREE_STRING_POINTER (args));
11522 
11523   if (len == 0)
11524     {
11525       error ("malformed %<target()%> pragma or attribute");
11526       return false;
11527     }
11528 
11529   /* Used to catch empty tokens between commas, i.e.
11530      attribute ((target ("attr1,,attr2"))).  */
11531   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11532 
11533   /* Handle multiple target attributes separated by ','.  */
11534   char *token = strtok (str_to_check, ",");
11535 
11536   unsigned int num_attrs = 0;
11537   while (token)
11538     {
11539       num_attrs++;
11540       if (!aarch64_process_one_target_attr (token))
11541 	{
11542 	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11543 	  return false;
11544 	}
11545 
11546       token = strtok (NULL, ",");
11547     }
11548 
11549   if (num_attrs != num_commas + 1)
11550     {
11551       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11552       return false;
11553     }
11554 
11555   return true;
11556 }
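
/* A sketch of the tokenization above (illustrative only): the string
   "arch=armv8-a,no-fix-cortex-a53-835769" contains one comma and is split by
   strtok into two tokens, each handed to aarch64_process_one_target_attr,
   whereas "strict-align,,arch=armv8-a" yields two valid tokens but two
   commas, so it fails the NUM_ATTRS == NUM_COMMAS + 1 check and is rejected
   as malformed.  */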
11557 
11558 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
11559    process attribute ((target ("..."))).  */
11560 
11561 static bool
11562 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11563 {
11564   struct cl_target_option cur_target;
11565   bool ret;
11566   tree old_optimize;
11567   tree new_target, new_optimize;
11568   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11569 
11570   /* If what we're processing is the current pragma string then the
11571      target option node is already stored in target_option_current_node
11572      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
11573      having to re-parse the string.  This is especially useful to keep
11574      arm_neon.h compile times down since that header contains a lot
11575      of intrinsics enclosed in pragmas.  */
11576   if (!existing_target && args == current_target_pragma)
11577     {
11578       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11579       return true;
11580     }
11581   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11582 
11583   old_optimize = build_optimization_node (&global_options);
11584   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11585 
11586   /* If the function changed the optimization levels as well as setting
11587      target options, start with the optimizations specified.  */
11588   if (func_optimize && func_optimize != old_optimize)
11589     cl_optimization_restore (&global_options,
11590 			     TREE_OPTIMIZATION (func_optimize));
11591 
11592   /* Save the current target options to restore at the end.  */
11593   cl_target_option_save (&cur_target, &global_options);
11594 
11595   /* If fndecl already has some target attributes applied to it, unpack
11596      them so that we add this attribute on top of them, rather than
11597      overwriting them.  */
11598   if (existing_target)
11599     {
11600       struct cl_target_option *existing_options
11601 	= TREE_TARGET_OPTION (existing_target);
11602 
11603       if (existing_options)
11604 	cl_target_option_restore (&global_options, existing_options);
11605     }
11606   else
11607     cl_target_option_restore (&global_options,
11608 			TREE_TARGET_OPTION (target_option_current_node));
11609 
11610   ret = aarch64_process_target_attr (args);
11611 
11612   /* Set up any additional state.  */
11613   if (ret)
11614     {
11615       aarch64_override_options_internal (&global_options);
11616       /* Initialize SIMD builtins if we haven't already.
11617 	 Set current_target_pragma to NULL for the duration so that
11618 	 the builtin initialization code doesn't try to tag the functions
11619 	 being built with the attributes specified by any current pragma, thus
11620 	 going into an infinite recursion.  */
11621       if (TARGET_SIMD)
11622 	{
11623 	  tree saved_current_target_pragma = current_target_pragma;
11624 	  current_target_pragma = NULL;
11625 	  aarch64_init_simd_builtins ();
11626 	  current_target_pragma = saved_current_target_pragma;
11627 	}
11628       new_target = build_target_option_node (&global_options);
11629     }
11630   else
11631     new_target = NULL;
11632 
11633   new_optimize = build_optimization_node (&global_options);
11634 
11635   if (fndecl && ret)
11636     {
11637       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11638 
11639       if (old_optimize != new_optimize)
11640 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11641     }
11642 
11643   cl_target_option_restore (&global_options, &cur_target);
11644 
11645   if (old_optimize != new_optimize)
11646     cl_optimization_restore (&global_options,
11647 			     TREE_OPTIMIZATION (old_optimize));
11648   return ret;
11649 }
11650 
11651 /* Helper for aarch64_can_inline_p.  CALLER and CALLEE are the settings of
11652    a tri-bool option (yes, no, don't care), DONT_CARE is the don't-care value
11653    and DEF is the option's default.  Return true if inlining is allowed.  */
11654 
11655 static bool
11656 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11657 				     int dont_care, int def)
11658 {
11659   /* If the callee doesn't care, always allow inlining.  */
11660   if (callee == dont_care)
11661     return true;
11662 
11663   /* If the caller doesn't care, always allow inlining.  */
11664   if (caller == dont_care)
11665     return true;
11666 
11667   /* Otherwise, allow inlining if either the callee and caller values
11668      agree, or if the callee is using the default value.  */
11669   return (callee == caller || callee == def);
11670 }
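
/* For instance, with DONT_CARE == 2 and DEF == 1 (the
   -momit-leaf-frame-pointer case below): (caller 0, callee 2) and
   (caller 2, callee 0) both allow inlining because one side doesn't care;
   (caller 0, callee 1) is allowed because the callee uses the default;
   (caller 1, callee 0) is rejected since the values disagree and the
   callee's setting is not the default.  */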
11671 
11672 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
11673    to inline CALLEE into CALLER based on target-specific info.
11674    Make sure that the caller and callee have compatible architectural
11675    features.  Then go through the other possible target attributes
11676    and see if they can block inlining.  Try not to reject always_inline
11677    callees unless they are incompatible architecturally.  */
11678 
11679 static bool
11680 aarch64_can_inline_p (tree caller, tree callee)
11681 {
11682   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11683   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11684 
11685   /* If callee has no option attributes, then it is ok to inline.  */
11686   if (!callee_tree)
11687     return true;
11688 
11689   struct cl_target_option *caller_opts
11690 	= TREE_TARGET_OPTION (caller_tree ? caller_tree
11691 					   : target_option_default_node);
11692 
11693   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11694 
11696   /* Callee's ISA flags should be a subset of the caller's.  */
11697   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11698        != callee_opts->x_aarch64_isa_flags)
11699     return false;
11700 
11701   /* Allow non-strict aligned functions inlining into strict
11702      aligned ones.  */
11703   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11704        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11705       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11706 	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11707     return false;
11708 
11709   bool always_inline = lookup_attribute ("always_inline",
11710 					  DECL_ATTRIBUTES (callee));
11711 
11712   /* If the architectural features match up and the callee is always_inline
11713      then the other attributes don't matter.  */
11714   if (always_inline)
11715     return true;
11716 
11717   if (caller_opts->x_aarch64_cmodel_var
11718       != callee_opts->x_aarch64_cmodel_var)
11719     return false;
11720 
11721   if (caller_opts->x_aarch64_tls_dialect
11722       != callee_opts->x_aarch64_tls_dialect)
11723     return false;
11724 
11725   /* Honour explicit requests to workaround errata.  */
11726   if (!aarch64_tribools_ok_for_inlining_p (
11727 	  caller_opts->x_aarch64_fix_a53_err835769,
11728 	  callee_opts->x_aarch64_fix_a53_err835769,
11729 	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11730     return false;
11731 
11732   if (!aarch64_tribools_ok_for_inlining_p (
11733 	  caller_opts->x_aarch64_fix_a53_err843419,
11734 	  callee_opts->x_aarch64_fix_a53_err843419,
11735 	  2, TARGET_FIX_ERR_A53_843419))
11736     return false;
11737 
11738   /* If the user explicitly specified -momit-leaf-frame-pointer for the
11739      caller and callee and they don't match up, reject inlining.  */
11740   if (!aarch64_tribools_ok_for_inlining_p (
11741 	  caller_opts->x_flag_omit_leaf_frame_pointer,
11742 	  callee_opts->x_flag_omit_leaf_frame_pointer,
11743 	  2, 1))
11744     return false;
11745 
11746   /* If the callee has specific tuning overrides, respect them.  */
11747   if (callee_opts->x_aarch64_override_tune_string != NULL
11748       && caller_opts->x_aarch64_override_tune_string == NULL)
11749     return false;
11750 
11751   /* If the user specified tuning override strings for the
11752      caller and callee and they don't match up, reject inlining.
11753      We just do a string compare here, we don't analyze the meaning
11754      of the string, as it would be too costly for little gain.  */
11755   if (callee_opts->x_aarch64_override_tune_string
11756       && caller_opts->x_aarch64_override_tune_string
11757       && (strcmp (callee_opts->x_aarch64_override_tune_string,
11758 		  caller_opts->x_aarch64_override_tune_string) != 0))
11759     return false;
11760 
11761   return true;
11762 }
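
/* Two illustrative consequences of the checks above, assuming otherwise
   default options: a callee tagged __attribute__ ((target ("+crypto")))
   cannot be inlined into a caller built without crypto, because the callee's
   ISA flags are not a subset of the caller's; and a callee built without
   -mstrict-align may be inlined into a strict-align caller, but a
   strict-align callee may not be inlined into a non-strict-align caller.  */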
11763 
11764 /* Return true if SYMBOL_REF X binds locally.  */
11765 
11766 static bool
11767 aarch64_symbol_binds_local_p (const_rtx x)
11768 {
11769   return (SYMBOL_REF_DECL (x)
11770 	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11771 	  : SYMBOL_REF_LOCAL_P (x));
11772 }
11773 
11774 /* Return true if SYMBOL_REF X is thread local.  */
11775 static bool
11776 aarch64_tls_symbol_p (rtx x)
11777 {
11778   if (! TARGET_HAVE_TLS)
11779     return false;
11780 
11781   if (GET_CODE (x) != SYMBOL_REF)
11782     return false;
11783 
11784   return SYMBOL_REF_TLS_MODEL (x) != 0;
11785 }
11786 
11787 /* Classify a TLS symbol into one of the TLS kinds.  */
11788 enum aarch64_symbol_type
11789 aarch64_classify_tls_symbol (rtx x)
11790 {
11791   enum tls_model tls_kind = tls_symbolic_operand_type (x);
11792 
11793   switch (tls_kind)
11794     {
11795     case TLS_MODEL_GLOBAL_DYNAMIC:
11796     case TLS_MODEL_LOCAL_DYNAMIC:
11797       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11798 
11799     case TLS_MODEL_INITIAL_EXEC:
11800       switch (aarch64_cmodel)
11801 	{
11802 	case AARCH64_CMODEL_TINY:
11803 	case AARCH64_CMODEL_TINY_PIC:
11804 	  return SYMBOL_TINY_TLSIE;
11805 	default:
11806 	  return SYMBOL_SMALL_TLSIE;
11807 	}
11808 
11809     case TLS_MODEL_LOCAL_EXEC:
11810       if (aarch64_tls_size == 12)
11811 	return SYMBOL_TLSLE12;
11812       else if (aarch64_tls_size == 24)
11813 	return SYMBOL_TLSLE24;
11814       else if (aarch64_tls_size == 32)
11815 	return SYMBOL_TLSLE32;
11816       else if (aarch64_tls_size == 48)
11817 	return SYMBOL_TLSLE48;
11818       else
11819 	gcc_unreachable ();
11820 
11821     case TLS_MODEL_EMULATED:
11822     case TLS_MODEL_NONE:
11823       return SYMBOL_FORCE_TO_MEM;
11824 
11825     default:
11826       gcc_unreachable ();
11827     }
11828 }
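
/* As a concrete illustration of the mapping above: an initial-exec TLS
   symbol classifies as SYMBOL_TINY_TLSIE under the tiny code models and as
   SYMBOL_SMALL_TLSIE otherwise, while a local-exec symbol compiled with
   -mtls-size=24 classifies as SYMBOL_TLSLE24.  */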
11829 
11830 /* Return the correct method for accessing X + OFFSET, where X is either
11831    a SYMBOL_REF or LABEL_REF.  */
11832 
11833 enum aarch64_symbol_type
11834 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11835 {
11836   if (GET_CODE (x) == LABEL_REF)
11837     {
11838       switch (aarch64_cmodel)
11839 	{
11840 	case AARCH64_CMODEL_LARGE:
11841 	  return SYMBOL_FORCE_TO_MEM;
11842 
11843 	case AARCH64_CMODEL_TINY_PIC:
11844 	case AARCH64_CMODEL_TINY:
11845 	  return SYMBOL_TINY_ABSOLUTE;
11846 
11847 	case AARCH64_CMODEL_SMALL_SPIC:
11848 	case AARCH64_CMODEL_SMALL_PIC:
11849 	case AARCH64_CMODEL_SMALL:
11850 	  return SYMBOL_SMALL_ABSOLUTE;
11851 
11852 	default:
11853 	  gcc_unreachable ();
11854 	}
11855     }
11856 
11857   if (GET_CODE (x) == SYMBOL_REF)
11858     {
11859       if (aarch64_tls_symbol_p (x))
11860 	return aarch64_classify_tls_symbol (x);
11861 
11862       switch (aarch64_cmodel)
11863 	{
11864 	case AARCH64_CMODEL_TINY:
11865 	  /* When we retrieve a symbol + offset address, we have to make sure
11866 	     the offset does not cause overflow of the final address.  But
11867 	     we have no way of knowing the address of the symbol at compile
11868 	     time, so we can't accurately say whether the distance between
11869 	     the PC and symbol + offset is outside the addressable range of
11870 	     +/-1MB in the TINY code model.  So we rely on images not being
11871 	     greater than 1MB, cap the offset at 1MB, and load anything
11872 	     beyond 1MB through an alternative mechanism.  Furthermore, if
11873 	     the symbol is a weak reference to something that isn't known to
11874 	     resolve to a symbol in this module, then force it to memory.  */
11875 	  if ((SYMBOL_REF_WEAK (x)
11876 	       && !aarch64_symbol_binds_local_p (x))
11877 	      || !IN_RANGE (offset, -1048575, 1048575))
11878 	    return SYMBOL_FORCE_TO_MEM;
11879 	  return SYMBOL_TINY_ABSOLUTE;
11880 
11881 	case AARCH64_CMODEL_SMALL:
11882 	  /* Same reasoning as the tiny code model, but the offset cap here is
11883 	     4G.  */
11884 	  if ((SYMBOL_REF_WEAK (x)
11885 	       && !aarch64_symbol_binds_local_p (x))
11886 	      || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11887 			    HOST_WIDE_INT_C (4294967264)))
11888 	    return SYMBOL_FORCE_TO_MEM;
11889 	  return SYMBOL_SMALL_ABSOLUTE;
11890 
11891 	case AARCH64_CMODEL_TINY_PIC:
11892 	  if (!aarch64_symbol_binds_local_p (x))
11893 	    return SYMBOL_TINY_GOT;
11894 	  return SYMBOL_TINY_ABSOLUTE;
11895 
11896 	case AARCH64_CMODEL_SMALL_SPIC:
11897 	case AARCH64_CMODEL_SMALL_PIC:
11898 	  if (!aarch64_symbol_binds_local_p (x))
11899 	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11900 		    ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11901 	  return SYMBOL_SMALL_ABSOLUTE;
11902 
11903 	case AARCH64_CMODEL_LARGE:
11904 	  /* This is alright even in PIC code as the constant
11905 	     pool reference is always PC relative and within
11906 	     the same translation unit.  */
11907 	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11908 	    return SYMBOL_SMALL_ABSOLUTE;
11909 	  else
11910 	    return SYMBOL_FORCE_TO_MEM;
11911 
11912 	default:
11913 	  gcc_unreachable ();
11914 	}
11915     }
11916 
11917   /* By default push everything into the constant pool.  */
11918   return SYMBOL_FORCE_TO_MEM;
11919 }
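
/* A few illustrative classifications (not exhaustive): under the small code
   model, a locally-binding symbol whose offset lies inside the +/-4GB window
   is SYMBOL_SMALL_ABSOLUTE; under the small PIC models, a preemptible symbol
   becomes SYMBOL_SMALL_GOT_4G (or SYMBOL_SMALL_GOT_28K for SMALL_SPIC); and
   under the large code model most symbols end up as SYMBOL_FORCE_TO_MEM.  */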
11920 
11921 bool
11922 aarch64_constant_address_p (rtx x)
11923 {
11924   return (CONSTANT_P (x) && memory_address_p (DImode, x));
11925 }
11926 
11927 bool
11928 aarch64_legitimate_pic_operand_p (rtx x)
11929 {
11930   if (GET_CODE (x) == SYMBOL_REF
11931       || (GET_CODE (x) == CONST
11932 	  && GET_CODE (XEXP (x, 0)) == PLUS
11933 	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11934      return false;
11935 
11936   return true;
11937 }
11938 
11939 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
11940    that should be rematerialized rather than spilled.  */
11941 
11942 static bool
11943 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11944 {
11945   /* Support CSE and rematerialization of common constants.  */
11946   if (CONST_INT_P (x)
11947       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11948       || GET_CODE (x) == CONST_VECTOR)
11949     return true;
11950 
11951   /* Do not allow vector struct mode constants for Advanced SIMD.
11952      We could support 0 and -1 easily, but they need support in
11953      aarch64-simd.md.  */
11954   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11955   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11956     return false;
11957 
11958   /* Only accept variable-length vector constants if they can be
11959      handled directly.
11960 
11961      ??? It would be possible to handle rematerialization of other
11962      constants via secondary reloads.  */
11963   if (vec_flags & VEC_ANY_SVE)
11964     return aarch64_simd_valid_immediate (x, NULL);
11965 
11966   if (GET_CODE (x) == HIGH)
11967     x = XEXP (x, 0);
11968 
11969   /* Accept polynomial constants that can be calculated by using the
11970      destination of a move as the sole temporary.  Constants that
11971      require a second temporary cannot be rematerialized (they can't be
11972      forced to memory and also aren't legitimate constants).  */
11973   poly_int64 offset;
11974   if (poly_int_rtx_p (x, &offset))
11975     return aarch64_offset_temporaries (false, offset) <= 1;
11976 
11977   /* If an offset is being added to something else, we need to allow the
11978      base to be moved into the destination register, meaning that there
11979      are no free temporaries for the offset.  */
11980   x = strip_offset (x, &offset);
11981   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11982     return false;
11983 
11984   /* Do not allow const (plus (anchor_symbol, const_int)).  */
11985   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11986     return false;
11987 
11988   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
11989      so spilling them is better than rematerialization.  */
11990   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11991     return true;
11992 
11993   /* Label references are always constant.  */
11994   if (GET_CODE (x) == LABEL_REF)
11995     return true;
11996 
11997   return false;
11998 }
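
/* By way of example, the rules above treat a CONST_INT, a CONST_VECTOR, a
   non-TLS SYMBOL_REF and a LABEL_REF as rematerializable, but reject a TLS
   SYMBOL_REF (spilling is preferable to re-running the TLS sequence) and an
   Advanced SIMD structure-mode constant.  */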
11999 
12000 rtx
12001 aarch64_load_tp (rtx target)
12002 {
12003   if (!target
12004       || GET_MODE (target) != Pmode
12005       || !register_operand (target, Pmode))
12006     target = gen_reg_rtx (Pmode);
12007 
12008   /* Can return in any reg.  */
12009   emit_insn (gen_aarch64_load_tp_hard (target));
12010   return target;
12011 }
12012 
12013 /* On AAPCS systems, this is the "struct __va_list".  */
12014 static GTY(()) tree va_list_type;
12015 
12016 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12017    Return the type to use as __builtin_va_list.
12018 
12019    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12020 
12021    struct __va_list
12022    {
12023      void *__stack;
12024      void *__gr_top;
12025      void *__vr_top;
12026      int   __gr_offs;
12027      int   __vr_offs;
12028    };  */
12029 
12030 static tree
12031 aarch64_build_builtin_va_list (void)
12032 {
12033   tree va_list_name;
12034   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12035 
12036   /* Create the type.  */
12037   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12038   /* Give it the required name.  */
12039   va_list_name = build_decl (BUILTINS_LOCATION,
12040 			     TYPE_DECL,
12041 			     get_identifier ("__va_list"),
12042 			     va_list_type);
12043   DECL_ARTIFICIAL (va_list_name) = 1;
12044   TYPE_NAME (va_list_type) = va_list_name;
12045   TYPE_STUB_DECL (va_list_type) = va_list_name;
12046 
12047   /* Create the fields.  */
12048   f_stack = build_decl (BUILTINS_LOCATION,
12049 			FIELD_DECL, get_identifier ("__stack"),
12050 			ptr_type_node);
12051   f_grtop = build_decl (BUILTINS_LOCATION,
12052 			FIELD_DECL, get_identifier ("__gr_top"),
12053 			ptr_type_node);
12054   f_vrtop = build_decl (BUILTINS_LOCATION,
12055 			FIELD_DECL, get_identifier ("__vr_top"),
12056 			ptr_type_node);
12057   f_groff = build_decl (BUILTINS_LOCATION,
12058 			FIELD_DECL, get_identifier ("__gr_offs"),
12059 			integer_type_node);
12060   f_vroff = build_decl (BUILTINS_LOCATION,
12061 			FIELD_DECL, get_identifier ("__vr_offs"),
12062 			integer_type_node);
12063 
12064   /* Tell the tree-stdarg pass about our internal offset fields.
12065      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12066      purposes, to identify whether the code is updating the va_list internal
12067      offset fields in an irregular way.  */
12068   va_list_gpr_counter_field = f_groff;
12069   va_list_fpr_counter_field = f_vroff;
12070 
12071   DECL_ARTIFICIAL (f_stack) = 1;
12072   DECL_ARTIFICIAL (f_grtop) = 1;
12073   DECL_ARTIFICIAL (f_vrtop) = 1;
12074   DECL_ARTIFICIAL (f_groff) = 1;
12075   DECL_ARTIFICIAL (f_vroff) = 1;
12076 
12077   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12078   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12079   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12080   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12081   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12082 
12083   TYPE_FIELDS (va_list_type) = f_stack;
12084   DECL_CHAIN (f_stack) = f_grtop;
12085   DECL_CHAIN (f_grtop) = f_vrtop;
12086   DECL_CHAIN (f_vrtop) = f_groff;
12087   DECL_CHAIN (f_groff) = f_vroff;
12088 
12089   /* Compute its layout.  */
12090   layout_type (va_list_type);
12091 
12092   return va_list_type;
12093 }
12094 
12095 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
12096 static void
12097 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12098 {
12099   const CUMULATIVE_ARGS *cum;
12100   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12101   tree stack, grtop, vrtop, groff, vroff;
12102   tree t;
12103   int gr_save_area_size = cfun->va_list_gpr_size;
12104   int vr_save_area_size = cfun->va_list_fpr_size;
12105   int vr_offset;
12106 
12107   cum = &crtl->args.info;
12108   if (cfun->va_list_gpr_size)
12109     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12110 			     cfun->va_list_gpr_size);
12111   if (cfun->va_list_fpr_size)
12112     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12113 			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
12114 
12115   if (!TARGET_FLOAT)
12116     {
12117       gcc_assert (cum->aapcs_nvrn == 0);
12118       vr_save_area_size = 0;
12119     }
12120 
12121   f_stack = TYPE_FIELDS (va_list_type_node);
12122   f_grtop = DECL_CHAIN (f_stack);
12123   f_vrtop = DECL_CHAIN (f_grtop);
12124   f_groff = DECL_CHAIN (f_vrtop);
12125   f_vroff = DECL_CHAIN (f_groff);
12126 
12127   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12128 		  NULL_TREE);
12129   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12130 		  NULL_TREE);
12131   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12132 		  NULL_TREE);
12133   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12134 		  NULL_TREE);
12135   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12136 		  NULL_TREE);
12137 
12138   /* Emit code to initialize STACK, which points to the next varargs stack
12139      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
12140      by named arguments.  STACK is 8-byte aligned.  */
12141   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12142   if (cum->aapcs_stack_size > 0)
12143     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12144   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12145   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12146 
12147   /* Emit code to initialize GRTOP, the top of the GR save area.
12148      virtual_incoming_args_rtx should have been 16-byte aligned.  */
12149   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12150   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12151   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12152 
12153   /* Emit code to initialize VRTOP, the top of the VR save area.
12154      This address is gr_save_area_bytes below GRTOP, rounded
12155      down to the next 16-byte boundary.  */
12156   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12157   vr_offset = ROUND_UP (gr_save_area_size,
12158 			STACK_BOUNDARY / BITS_PER_UNIT);
12159 
12160   if (vr_offset)
12161     t = fold_build_pointer_plus_hwi (t, -vr_offset);
12162   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12163   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12164 
12165   /* Emit code to initialize GROFF, the offset from GRTOP of the
12166      next GPR argument.  */
12167   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12168 	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12169   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12170 
12171   /* Likewise emit code to initialize VROFF, the offset from FTOP
12172      of the next VR argument.  */
12173   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12174 	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12175   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12176 }
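
/* A worked example of the layout set up above (a sketch, assuming the usual
   eight X and eight V argument registers and no tree-stdarg clamping): for a
   variadic function whose named arguments consume three GP registers and one
   FP/SIMD register, GR_SAVE_AREA_SIZE is 5 * 8 == 40 and VR_SAVE_AREA_SIZE
   is 7 * 16 == 112, so __gr_offs starts at -40, __vr_offs starts at -112,
   __gr_top points at the incoming argument area and __vr_top sits 48 bytes
   (40 rounded up to 16) below it.  */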
12177 
12178 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
12179 
12180 static tree
12181 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12182 			      gimple_seq *post_p ATTRIBUTE_UNUSED)
12183 {
12184   tree addr;
12185   bool indirect_p;
12186   bool is_ha;		/* is HFA or HVA.  */
12187   bool dw_align;	/* double-word align.  */
12188   machine_mode ag_mode = VOIDmode;
12189   int nregs;
12190   machine_mode mode;
12191 
12192   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12193   tree stack, f_top, f_off, off, arg, roundup, on_stack;
12194   HOST_WIDE_INT size, rsize, adjust, align;
12195   tree t, u, cond1, cond2;
12196 
12197   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12198   if (indirect_p)
12199     type = build_pointer_type (type);
12200 
12201   mode = TYPE_MODE (type);
12202 
12203   f_stack = TYPE_FIELDS (va_list_type_node);
12204   f_grtop = DECL_CHAIN (f_stack);
12205   f_vrtop = DECL_CHAIN (f_grtop);
12206   f_groff = DECL_CHAIN (f_vrtop);
12207   f_vroff = DECL_CHAIN (f_groff);
12208 
12209   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12210 		  f_stack, NULL_TREE);
12211   size = int_size_in_bytes (type);
12212   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12213 
12214   dw_align = false;
12215   adjust = 0;
12216   if (aarch64_vfp_is_call_or_return_candidate (mode,
12217 					       type,
12218 					       &ag_mode,
12219 					       &nregs,
12220 					       &is_ha))
12221     {
12222       /* No frontends can create types with variable-sized modes, so we
12223 	 shouldn't be asked to pass or return them.  */
12224       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12225 
12226       /* TYPE passed in fp/simd registers.  */
12227       if (!TARGET_FLOAT)
12228 	aarch64_err_no_fpadvsimd (mode, "varargs");
12229 
12230       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12231 		      unshare_expr (valist), f_vrtop, NULL_TREE);
12232       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12233 		      unshare_expr (valist), f_vroff, NULL_TREE);
12234 
12235       rsize = nregs * UNITS_PER_VREG;
12236 
12237       if (is_ha)
12238 	{
12239 	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12240 	    adjust = UNITS_PER_VREG - ag_size;
12241 	}
12242       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12243 	       && size < UNITS_PER_VREG)
12244 	{
12245 	  adjust = UNITS_PER_VREG - size;
12246 	}
12247     }
12248   else
12249     {
12250       /* TYPE passed in general registers.  */
12251       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12252 		      unshare_expr (valist), f_grtop, NULL_TREE);
12253       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12254 		      unshare_expr (valist), f_groff, NULL_TREE);
12255       rsize = ROUND_UP (size, UNITS_PER_WORD);
12256       nregs = rsize / UNITS_PER_WORD;
12257 
12258       if (align > 8)
12259 	dw_align = true;
12260 
12261       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12262 	  && size < UNITS_PER_WORD)
12263 	{
12264 	  adjust = UNITS_PER_WORD  - size;
12265 	}
12266     }
12267 
12268   /* Get a local temporary for the field value.  */
12269   off = get_initialized_tmp_var (f_off, pre_p, NULL);
12270 
12271   /* Emit code to branch if off >= 0.  */
12272   t = build2 (GE_EXPR, boolean_type_node, off,
12273 	      build_int_cst (TREE_TYPE (off), 0));
12274   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12275 
12276   if (dw_align)
12277     {
12278       /* Emit: offs = (offs + 15) & -16.  */
12279       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12280 		  build_int_cst (TREE_TYPE (off), 15));
12281       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12282 		  build_int_cst (TREE_TYPE (off), -16));
12283       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12284     }
12285   else
12286     roundup = NULL;
12287 
12288   /* Update ap.__[g|v]r_offs  */
12289   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12290 	      build_int_cst (TREE_TYPE (off), rsize));
12291   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12292 
12293   /* String up.  */
12294   if (roundup)
12295     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12296 
12297   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
12298   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12299 	      build_int_cst (TREE_TYPE (f_off), 0));
12300   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12301 
12302   /* String up: make sure the assignment happens before the use.  */
12303   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12304   COND_EXPR_ELSE (cond1) = t;
12305 
12306   /* Prepare the trees handling the argument that is passed on the stack;
12307      the top-level node will be stored in ON_STACK.  */
12308   arg = get_initialized_tmp_var (stack, pre_p, NULL);
12309   if (align > 8)
12310     {
12311       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
12312       t = fold_convert (intDI_type_node, arg);
12313       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12314 		  build_int_cst (TREE_TYPE (t), 15));
12315       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12316 		  build_int_cst (TREE_TYPE (t), -16));
12317       t = fold_convert (TREE_TYPE (arg), t);
12318       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12319     }
12320   else
12321     roundup = NULL;
12322   /* Advance ap.__stack  */
12323   t = fold_convert (intDI_type_node, arg);
12324   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12325 	      build_int_cst (TREE_TYPE (t), size + 7));
12326   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12327 	      build_int_cst (TREE_TYPE (t), -8));
12328   t = fold_convert (TREE_TYPE (arg), t);
12329   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12330   /* String up roundup and advance.  */
12331   if (roundup)
12332     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12333   /* String up with arg */
12334   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12335   /* Big-endianness related address adjustment.  */
12336   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12337       && size < UNITS_PER_WORD)
12338   {
12339     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12340 		size_int (UNITS_PER_WORD - size));
12341     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12342   }
12343 
12344   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12345   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12346 
12347   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
12348   t = off;
12349   if (adjust)
12350     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12351 		build_int_cst (TREE_TYPE (off), adjust));
12352 
12353   t = fold_convert (sizetype, t);
12354   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12355 
12356   if (is_ha)
12357     {
12358       /* type ha; // treat as "struct {ftype field[n];}"
12359          ... [computing offs]
12360          for (i = 0; i <nregs; ++i, offs += 16)
12361 	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12362 	 return ha;  */
12363       int i;
12364       tree tmp_ha, field_t, field_ptr_t;
12365 
12366       /* Declare a local variable.  */
12367       tmp_ha = create_tmp_var_raw (type, "ha");
12368       gimple_add_tmp_var (tmp_ha);
12369 
12370       /* Establish the base type.  */
12371       switch (ag_mode)
12372 	{
12373 	case E_SFmode:
12374 	  field_t = float_type_node;
12375 	  field_ptr_t = float_ptr_type_node;
12376 	  break;
12377 	case E_DFmode:
12378 	  field_t = double_type_node;
12379 	  field_ptr_t = double_ptr_type_node;
12380 	  break;
12381 	case E_TFmode:
12382 	  field_t = long_double_type_node;
12383 	  field_ptr_t = long_double_ptr_type_node;
12384 	  break;
12385 	case E_HFmode:
12386 	  field_t = aarch64_fp16_type_node;
12387 	  field_ptr_t = aarch64_fp16_ptr_type_node;
12388 	  break;
12389 	case E_V2SImode:
12390 	case E_V4SImode:
12391 	    {
12392 	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12393 	      field_t = build_vector_type_for_mode (innertype, ag_mode);
12394 	      field_ptr_t = build_pointer_type (field_t);
12395 	    }
12396 	  break;
12397 	default:
12398 	  gcc_assert (0);
12399 	}
12400 
12401       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
12402       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12403       addr = t;
12404       t = fold_convert (field_ptr_t, addr);
12405       t = build2 (MODIFY_EXPR, field_t,
12406 		  build1 (INDIRECT_REF, field_t, tmp_ha),
12407 		  build1 (INDIRECT_REF, field_t, t));
12408 
12409       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
12410       for (i = 1; i < nregs; ++i)
12411 	{
12412 	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12413 	  u = fold_convert (field_ptr_t, addr);
12414 	  u = build2 (MODIFY_EXPR, field_t,
12415 		      build2 (MEM_REF, field_t, tmp_ha,
12416 			      build_int_cst (field_ptr_t,
12417 					     (i *
12418 					      int_size_in_bytes (field_t)))),
12419 		      build1 (INDIRECT_REF, field_t, u));
12420 	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12421 	}
12422 
12423       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12424       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12425     }
12426 
12427   COND_EXPR_ELSE (cond2) = t;
12428   addr = fold_convert (build_pointer_type (type), cond1);
12429   addr = build_va_arg_indirect_ref (addr);
12430 
12431   if (indirect_p)
12432     addr = build_va_arg_indirect_ref (addr);
12433 
12434   return addr;
12435 }
12436 
12437 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
12438 
12439 static void
12440 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12441 				tree type, int *pretend_size ATTRIBUTE_UNUSED,
12442 				int no_rtl)
12443 {
12444   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12445   CUMULATIVE_ARGS local_cum;
12446   int gr_saved = cfun->va_list_gpr_size;
12447   int vr_saved = cfun->va_list_fpr_size;
12448 
12449   /* The caller has advanced CUM up to, but not beyond, the last named
12450      argument.  Advance a local copy of CUM past the last "real" named
12451      argument, to find out how many registers are left over.  */
12452   local_cum = *cum;
12453   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12454 
12455   /* Find out how many registers we need to save.
12456      Honor the tree-stdarg analysis results.  */
12457   if (cfun->va_list_gpr_size)
12458     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12459 		    cfun->va_list_gpr_size / UNITS_PER_WORD);
12460   if (cfun->va_list_fpr_size)
12461     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12462 		    cfun->va_list_fpr_size / UNITS_PER_VREG);
12463 
12464   if (!TARGET_FLOAT)
12465     {
12466       gcc_assert (local_cum.aapcs_nvrn == 0);
12467       vr_saved = 0;
12468     }
12469 
12470   if (!no_rtl)
12471     {
12472       if (gr_saved > 0)
12473 	{
12474 	  rtx ptr, mem;
12475 
12476 	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
12477 	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12478 			       - gr_saved * UNITS_PER_WORD);
12479 	  mem = gen_frame_mem (BLKmode, ptr);
12480 	  set_mem_alias_set (mem, get_varargs_alias_set ());
12481 
12482 	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12483 			       mem, gr_saved);
12484 	}
12485       if (vr_saved > 0)
12486 	{
12487 	  /* We can't use move_block_from_reg, because it will use
12488 	     the wrong mode, storing D regs only.  */
12489 	  machine_mode mode = TImode;
12490 	  int off, i, vr_start;
12491 
12492 	  /* Set OFF to the offset from virtual_incoming_args_rtx of
12493 	     the first vector register.  The VR save area lies below
12494 	     the GR one, and is aligned to 16 bytes.  */
12495 	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12496 			   STACK_BOUNDARY / BITS_PER_UNIT);
12497 	  off -= vr_saved * UNITS_PER_VREG;
12498 
12499 	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12500 	  for (i = 0; i < vr_saved; ++i)
12501 	    {
12502 	      rtx ptr, mem;
12503 
12504 	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12505 	      mem = gen_frame_mem (mode, ptr);
12506 	      set_mem_alias_set (mem, get_varargs_alias_set ());
12507 	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12508 	      off += UNITS_PER_VREG;
12509 	    }
12510 	}
12511     }
12512 
12513   /* We don't save the size into *PRETEND_SIZE because we want to avoid
12514      any complication of having crtl->args.pretend_args_size changed.  */
12515   cfun->machine->frame.saved_varargs_size
12516     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12517 		 STACK_BOUNDARY / BITS_PER_UNIT)
12518        + vr_saved * UNITS_PER_VREG);
12519 }
12520 
12521 static void
12522 aarch64_conditional_register_usage (void)
12523 {
12524   int i;
12525   if (!TARGET_FLOAT)
12526     {
12527       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12528 	{
12529 	  fixed_regs[i] = 1;
12530 	  call_used_regs[i] = 1;
12531 	}
12532     }
12533   if (!TARGET_SVE)
12534     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12535       {
12536 	fixed_regs[i] = 1;
12537 	call_used_regs[i] = 1;
12538       }
12539 }
12540 
12541 /* Walk down the type tree of TYPE counting consecutive base elements.
12542    If *MODEP is VOIDmode, then set it to the first valid floating point
12543    type.  If a non-floating point type is found, or if a floating point
12544    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12545    otherwise return the count in the sub-tree.  */
12546 static int
12547 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12548 {
12549   machine_mode mode;
12550   HOST_WIDE_INT size;
12551 
12552   switch (TREE_CODE (type))
12553     {
12554     case REAL_TYPE:
12555       mode = TYPE_MODE (type);
12556       if (mode != DFmode && mode != SFmode
12557 	  && mode != TFmode && mode != HFmode)
12558 	return -1;
12559 
12560       if (*modep == VOIDmode)
12561 	*modep = mode;
12562 
12563       if (*modep == mode)
12564 	return 1;
12565 
12566       break;
12567 
12568     case COMPLEX_TYPE:
12569       mode = TYPE_MODE (TREE_TYPE (type));
12570       if (mode != DFmode && mode != SFmode
12571 	  && mode != TFmode && mode != HFmode)
12572 	return -1;
12573 
12574       if (*modep == VOIDmode)
12575 	*modep = mode;
12576 
12577       if (*modep == mode)
12578 	return 2;
12579 
12580       break;
12581 
12582     case VECTOR_TYPE:
12583       /* Use V2SImode and V4SImode as representatives of all 64-bit
12584 	 and 128-bit vector types.  */
12585       size = int_size_in_bytes (type);
12586       switch (size)
12587 	{
12588 	case 8:
12589 	  mode = V2SImode;
12590 	  break;
12591 	case 16:
12592 	  mode = V4SImode;
12593 	  break;
12594 	default:
12595 	  return -1;
12596 	}
12597 
12598       if (*modep == VOIDmode)
12599 	*modep = mode;
12600 
12601       /* Vector modes are considered to be opaque: two vectors are
12602 	 equivalent for the purposes of being homogeneous aggregates
12603 	 if they are the same size.  */
12604       if (*modep == mode)
12605 	return 1;
12606 
12607       break;
12608 
12609     case ARRAY_TYPE:
12610       {
12611 	int count;
12612 	tree index = TYPE_DOMAIN (type);
12613 
12614 	/* Can't handle incomplete types nor sizes that are not
12615 	   fixed.  */
12616 	if (!COMPLETE_TYPE_P (type)
12617 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12618 	  return -1;
12619 
12620 	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12621 	if (count == -1
12622 	    || !index
12623 	    || !TYPE_MAX_VALUE (index)
12624 	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12625 	    || !TYPE_MIN_VALUE (index)
12626 	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12627 	    || count < 0)
12628 	  return -1;
12629 
12630 	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12631 		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12632 
12633 	/* There must be no padding.  */
12634 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12635 		      count * GET_MODE_BITSIZE (*modep)))
12636 	  return -1;
12637 
12638 	return count;
12639       }
12640 
12641     case RECORD_TYPE:
12642       {
12643 	int count = 0;
12644 	int sub_count;
12645 	tree field;
12646 
12647 	/* Can't handle incomplete types nor sizes that are not
12648 	   fixed.  */
12649 	if (!COMPLETE_TYPE_P (type)
12650 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12651 	  return -1;
12652 
12653 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12654 	  {
12655 	    if (TREE_CODE (field) != FIELD_DECL)
12656 	      continue;
12657 
12658 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12659 	    if (sub_count < 0)
12660 	      return -1;
12661 	    count += sub_count;
12662 	  }
12663 
12664 	/* There must be no padding.  */
12665 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12666 		      count * GET_MODE_BITSIZE (*modep)))
12667 	  return -1;
12668 
12669 	return count;
12670       }
12671 
12672     case UNION_TYPE:
12673     case QUAL_UNION_TYPE:
12674       {
12675 	/* These aren't very interesting except in a degenerate case.  */
12676 	int count = 0;
12677 	int sub_count;
12678 	tree field;
12679 
12680 	/* Can't handle incomplete types nor sizes that are not
12681 	   fixed.  */
12682 	if (!COMPLETE_TYPE_P (type)
12683 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12684 	  return -1;
12685 
12686 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12687 	  {
12688 	    if (TREE_CODE (field) != FIELD_DECL)
12689 	      continue;
12690 
12691 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12692 	    if (sub_count < 0)
12693 	      return -1;
12694 	    count = count > sub_count ? count : sub_count;
12695 	  }
12696 
12697 	/* There must be no padding.  */
12698 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12699 		      count * GET_MODE_BITSIZE (*modep)))
12700 	  return -1;
12701 
12702 	return count;
12703       }
12704 
12705     default:
12706       break;
12707     }
12708 
12709   return -1;
12710 }
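
/* Some illustrative results of the walk above: struct { double x, y, z; }
   yields 3 with *MODEP == DFmode; float a[4] yields 4 with *MODEP == SFmode;
   an array of two 128-bit vectors (e.g. float32x4_t v[2]) yields 2 with the
   representative mode V4SImode; and struct { float f; double d; } yields -1
   because the element modes disagree.  */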
12711 
12712 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12713    type as described in AAPCS64 \S 4.1.2.
12714 
12715    See the comment above aarch64_composite_type_p for the notes on MODE.  */
12716 
12717 static bool
12718 aarch64_short_vector_p (const_tree type,
12719 			machine_mode mode)
12720 {
12721   poly_int64 size = -1;
12722 
12723   if (type && TREE_CODE (type) == VECTOR_TYPE)
12724     size = int_size_in_bytes (type);
12725   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12726 	    || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12727     size = GET_MODE_SIZE (mode);
12728 
12729   return known_eq (size, 8) || known_eq (size, 16);
12730 }
12731 
12732 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12733    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
12734    array types.  The C99 floating-point complex types are also considered
12735    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
12736    types, which are GCC extensions and out of the scope of AAPCS64, are
12737    treated as composite types here as well.
12738 
12739    Note that MODE itself is not sufficient in determining whether a type
12740    is such a composite type or not.  This is because
12741    stor-layout.c:compute_record_mode may have already changed the MODE
12742    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
12743    structure with only one field may have its MODE set to the mode of the
12744    field.  Also an integer mode whose size matches the size of the
12745    RECORD_TYPE type may be used to substitute the original mode
12746    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
12747    solely relied on.  */
12748 
12749 static bool
12750 aarch64_composite_type_p (const_tree type,
12751 			  machine_mode mode)
12752 {
12753   if (aarch64_short_vector_p (type, mode))
12754     return false;
12755 
12756   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12757     return true;
12758 
12759   if (mode == BLKmode
12760       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12761       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12762     return true;
12763 
12764   return false;
12765 }
12766 
12767 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12768    shall be passed or returned in simd/fp register(s) (providing these
12769    parameter passing registers are available).
12770 
12771    Upon successful return, *COUNT returns the number of needed registers,
12772    *BASE_MODE returns the mode of the individual register and when IS_HA
12773    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12774    floating-point aggregate or a homogeneous short-vector aggregate.  */
12775 
12776 static bool
12777 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12778 					 const_tree type,
12779 					 machine_mode *base_mode,
12780 					 int *count,
12781 					 bool *is_ha)
12782 {
12783   machine_mode new_mode = VOIDmode;
12784   bool composite_p = aarch64_composite_type_p (type, mode);
12785 
12786   if (is_ha != NULL) *is_ha = false;
12787 
12788   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12789       || aarch64_short_vector_p (type, mode))
12790     {
12791       *count = 1;
12792       new_mode = mode;
12793     }
12794   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12795     {
12796       if (is_ha != NULL) *is_ha = true;
12797       *count = 2;
12798       new_mode = GET_MODE_INNER (mode);
12799     }
12800   else if (type && composite_p)
12801     {
12802       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12803 
12804       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12805 	{
12806 	  if (is_ha != NULL) *is_ha = true;
12807 	  *count = ag_count;
12808 	}
12809       else
12810 	return false;
12811     }
12812   else
12813     return false;
12814 
12815   *base_mode = new_mode;
12816   return true;
12817 }
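
/* Illustrative examples: a C struct containing four floats is a homogeneous
   floating-point aggregate, giving *COUNT == 4 and *BASE_MODE == SFmode,
   while a C99 double _Complex argument takes the MODE_COMPLEX_FLOAT path
   above and gives *COUNT == 2 and *BASE_MODE == DFmode.  */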
12818 
12819 /* Implement TARGET_STRUCT_VALUE_RTX.  */
12820 
12821 static rtx
12822 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12823 			  int incoming ATTRIBUTE_UNUSED)
12824 {
12825   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12826 }
12827 
12828 /* Implements target hook vector_mode_supported_p.  */
12829 static bool
12830 aarch64_vector_mode_supported_p (machine_mode mode)
12831 {
12832   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12833   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12834 }
12835 
12836 /* Return appropriate SIMD container
12837    for MODE within a vector of WIDTH bits.  */
12838 static machine_mode
12839 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12840 {
12841   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12842     switch (mode)
12843       {
12844       case E_DFmode:
12845 	return VNx2DFmode;
12846       case E_SFmode:
12847 	return VNx4SFmode;
12848       case E_HFmode:
12849 	return VNx8HFmode;
12850       case E_DImode:
12851 	return VNx2DImode;
12852       case E_SImode:
12853 	return VNx4SImode;
12854       case E_HImode:
12855 	return VNx8HImode;
12856       case E_QImode:
12857 	return VNx16QImode;
12858       default:
12859 	return word_mode;
12860       }
12861 
12862   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12863   if (TARGET_SIMD)
12864     {
12865       if (known_eq (width, 128))
12866 	switch (mode)
12867 	  {
12868 	  case E_DFmode:
12869 	    return V2DFmode;
12870 	  case E_SFmode:
12871 	    return V4SFmode;
12872 	  case E_HFmode:
12873 	    return V8HFmode;
12874 	  case E_SImode:
12875 	    return V4SImode;
12876 	  case E_HImode:
12877 	    return V8HImode;
12878 	  case E_QImode:
12879 	    return V16QImode;
12880 	  case E_DImode:
12881 	    return V2DImode;
12882 	  default:
12883 	    break;
12884 	  }
12885       else
12886 	switch (mode)
12887 	  {
12888 	  case E_SFmode:
12889 	    return V2SFmode;
12890 	  case E_HFmode:
12891 	    return V4HFmode;
12892 	  case E_SImode:
12893 	    return V2SImode;
12894 	  case E_HImode:
12895 	    return V4HImode;
12896 	  case E_QImode:
12897 	    return V8QImode;
12898 	  default:
12899 	    break;
12900 	  }
12901     }
12902   return word_mode;
12903 }
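
/* For instance, SImode with a 128-bit WIDTH yields V4SImode, a 64-bit WIDTH
   yields V2SImode, and an SVE WIDTH of BITS_PER_SVE_VECTOR yields
   VNx4SImode; unsupported combinations fall back to word_mode.  */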
12904 
12905 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
12906 static machine_mode
12907 aarch64_preferred_simd_mode (scalar_mode mode)
12908 {
12909   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12910   return aarch64_simd_container_mode (mode, bits);
12911 }
12912 
12913 /* Return a list of possible vector sizes for the vectorizer
12914    to iterate over.  */
12915 static void
12916 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12917 {
12918   if (TARGET_SVE)
12919     sizes->safe_push (BYTES_PER_SVE_VECTOR);
12920   sizes->safe_push (16);
12921   sizes->safe_push (8);
12922 }
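
/* The push order above places SVE vectors (when available) ahead of 128-bit
   and then 64-bit Advanced SIMD vectors in the list that the vectorizer
   iterates over.  */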
12923 
12924 /* Implement TARGET_MANGLE_TYPE.  */
12925 
12926 static const char *
12927 aarch64_mangle_type (const_tree type)
12928 {
12929   /* The AArch64 ABI documents say that "__va_list" has to be
12930      mangled as if it is in the "std" namespace.  */
12931   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12932     return "St9__va_list";
12933 
12934   /* Half-precision float.  */
12935   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12936     return "Dh";
12937 
12938   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
12939      builtin types.  */
12940   if (TYPE_NAME (type) != NULL)
12941     return aarch64_mangle_builtin_type (type);
12942 
12943   /* Use the default mangling.  */
12944   return NULL;
12945 }
12946 
12947 /* Find the first rtx_insn before insn that will generate an assembly
12948    instruction.  */
12949 
12950 static rtx_insn *
12951 aarch64_prev_real_insn (rtx_insn *insn)
12952 {
12953   if (!insn)
12954     return NULL;
12955 
12956   do
12957     {
12958       insn = prev_real_insn (insn);
12959     }
12960   while (insn && recog_memoized (insn) < 0);
12961 
12962   return insn;
12963 }
12964 
12965 static bool
12966 is_madd_op (enum attr_type t1)
12967 {
12968   unsigned int i;
12969   /* A number of these may be AArch32 only.  */
12970   enum attr_type mlatypes[] = {
12971     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12972     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12973     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12974   };
12975 
12976   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12977     {
12978       if (t1 == mlatypes[i])
12979 	return true;
12980     }
12981 
12982   return false;
12983 }
12984 
12985 /* Check if there is a register dependency between a load and the insn
12986    for which we hold recog_data.  */
12987 
12988 static bool
12989 dep_between_memop_and_curr (rtx memop)
12990 {
12991   rtx load_reg;
12992   int opno;
12993 
12994   gcc_assert (GET_CODE (memop) == SET);
12995 
12996   if (!REG_P (SET_DEST (memop)))
12997     return false;
12998 
12999   load_reg = SET_DEST (memop);
13000   for (opno = 1; opno < recog_data.n_operands; opno++)
13001     {
13002       rtx operand = recog_data.operand[opno];
13003       if (REG_P (operand)
13004           && reg_overlap_mentioned_p (load_reg, operand))
13005         return true;
13006 
13007     }
13008   return false;
13009 }
13010 
13011 
13012 /* When working around the Cortex-A53 erratum 835769,
13013    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13014    instruction and has a preceding memory instruction such that a NOP
13015    should be inserted between them.  */
13016 
13017 bool
13018 aarch64_madd_needs_nop (rtx_insn* insn)
13019 {
13020   enum attr_type attr_type;
13021   rtx_insn *prev;
13022   rtx body;
13023 
13024   if (!TARGET_FIX_ERR_A53_835769)
13025     return false;
13026 
13027   if (!INSN_P (insn) || recog_memoized (insn) < 0)
13028     return false;
13029 
13030   attr_type = get_attr_type (insn);
13031   if (!is_madd_op (attr_type))
13032     return false;
13033 
13034   prev = aarch64_prev_real_insn (insn);
13035   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13036      Restore recog state to INSN to avoid state corruption.  */
13037   extract_constrain_insn_cached (insn);
13038 
13039   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13040     return false;
13041 
13042   body = single_set (prev);
13043 
13044   /* If the previous insn is a memory op and there is no dependency between
13045      it and the DImode madd, emit a NOP between them.  If body is NULL then we
13046      have a complex memory operation, probably a load/store pair.
13047      Be conservative for now and emit a NOP.  */
13048   if (GET_MODE (recog_data.operand[0]) == DImode
13049       && (!body || !dep_between_memop_and_curr (body)))
13050     return true;
13051 
13052   return false;
13053 
13054 }
13055 
13056 
13057 /* Implement FINAL_PRESCAN_INSN.  */
13058 
13059 void
13060 aarch64_final_prescan_insn (rtx_insn *insn)
13061 {
13062   if (aarch64_madd_needs_nop (insn))
13063     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13064 }
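
/* For example (illustrative), when -mfix-cortex-a53-835769 is enabled, a
   64-bit multiply-accumulate such as "madd x0, x1, x2, x3" that directly
   follows a memory operation gets the "nop" above emitted between the two
   instructions.  */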
13065 
13066 
13067 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13068    instruction.  */
13069 
13070 bool
13071 aarch64_sve_index_immediate_p (rtx base_or_step)
13072 {
13073   return (CONST_INT_P (base_or_step)
13074 	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
13075 }
13076 
13077 /* Return true if X is a valid immediate for the SVE ADD and SUB
13078    instructions.  Negate X first if NEGATE_P is true.  */
13079 
13080 bool
13081 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13082 {
13083   rtx elt;
13084 
13085   if (!const_vec_duplicate_p (x, &elt)
13086       || !CONST_INT_P (elt))
13087     return false;
13088 
13089   HOST_WIDE_INT val = INTVAL (elt);
13090   if (negate_p)
13091     val = -val;
13092   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13093 
13094   if (val & 0xff)
13095     return IN_RANGE (val, 0, 0xff);
13096   return IN_RANGE (val, 0, 0xff00);
13097 }
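
/* Worked examples: a duplicated value of 200 is accepted as an 8-bit
   immediate, 0x2000 is accepted as 0x20 shifted left by 8, and 0x101 is
   rejected because its low byte is nonzero yet the value does not fit in
   8 bits.  */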
13098 
13099 /* Return true if X is a valid immediate operand for an SVE logical
13100    instruction such as AND.  */
13101 
13102 bool
13103 aarch64_sve_bitmask_immediate_p (rtx x)
13104 {
13105   rtx elt;
13106 
13107   return (const_vec_duplicate_p (x, &elt)
13108 	  && CONST_INT_P (elt)
13109 	  && aarch64_bitmask_imm (INTVAL (elt),
13110 				  GET_MODE_INNER (GET_MODE (x))));
13111 }
13112 
13113 /* Return true if X is a valid immediate for the SVE DUP and CPY
13114    instructions.  */
13115 
13116 bool
13117 aarch64_sve_dup_immediate_p (rtx x)
13118 {
13119   rtx elt;
13120 
13121   if (!const_vec_duplicate_p (x, &elt)
13122       || !CONST_INT_P (elt))
13123     return false;
13124 
13125   HOST_WIDE_INT val = INTVAL (elt);
13126   if (val & 0xff)
13127     return IN_RANGE (val, -0x80, 0x7f);
13128   return IN_RANGE (val, -0x8000, 0x7f00);
13129 }
13130 
13131 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13132    SIGNED_P says whether the operand is signed rather than unsigned.  */
13133 
13134 bool
13135 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13136 {
13137   rtx elt;
13138 
13139   return (const_vec_duplicate_p (x, &elt)
13140 	  && CONST_INT_P (elt)
13141 	  && (signed_p
13142 	      ? IN_RANGE (INTVAL (elt), -16, 15)
13143 	      : IN_RANGE (INTVAL (elt), 0, 127)));
13144 }
13145 
13146 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13147    instruction.  Negate X first if NEGATE_P is true.  */
13148 
13149 bool
13150 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13151 {
13152   rtx elt;
13153   REAL_VALUE_TYPE r;
13154 
13155   if (!const_vec_duplicate_p (x, &elt)
13156       || GET_CODE (elt) != CONST_DOUBLE)
13157     return false;
13158 
13159   r = *CONST_DOUBLE_REAL_VALUE (elt);
13160 
13161   if (negate_p)
13162     r = real_value_negate (&r);
13163 
13164   if (real_equal (&r, &dconst1))
13165     return true;
13166   if (real_equal (&r, &dconsthalf))
13167     return true;
13168   return false;
13169 }
13170 
13171 /* Return true if X is a valid immediate operand for an SVE FMUL
13172    instruction.  */
13173 
13174 bool
13175 aarch64_sve_float_mul_immediate_p (rtx x)
13176 {
13177   rtx elt;
13178 
13179   /* GCC will never generate a multiply with an immediate of 2, so there is no
13180      point testing for it (even though it is a valid constant).  */
13181   return (const_vec_duplicate_p (x, &elt)
13182 	  && GET_CODE (elt) == CONST_DOUBLE
13183 	  && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13184 }
13185 
13186 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13187    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
13188    is nonnull, use it to describe valid immediates.  */
13189 static bool
13190 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13191 				    simd_immediate_info *info,
13192 				    enum simd_immediate_check which,
13193 				    simd_immediate_info::insn_type insn)
13194 {
13195   /* Try a 4-byte immediate with LSL.  */
13196   for (unsigned int shift = 0; shift < 32; shift += 8)
13197     if ((val32 & (0xff << shift)) == val32)
13198       {
13199 	if (info)
13200 	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
13201 				       simd_immediate_info::LSL, shift);
13202 	return true;
13203       }
13204 
13205   /* Try a 2-byte immediate with LSL.  */
13206   unsigned int imm16 = val32 & 0xffff;
13207   if (imm16 == (val32 >> 16))
13208     for (unsigned int shift = 0; shift < 16; shift += 8)
13209       if ((imm16 & (0xff << shift)) == imm16)
13210 	{
13211 	  if (info)
13212 	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13213 					 simd_immediate_info::LSL, shift);
13214 	  return true;
13215 	}
13216 
13217   /* Try a 4-byte immediate with MSL, except for cases that MVN
13218      can handle.  */
13219   if (which == AARCH64_CHECK_MOV)
13220     for (unsigned int shift = 8; shift < 24; shift += 8)
13221       {
13222 	unsigned int low = (1 << shift) - 1;
13223 	if (((val32 & (0xff << shift)) | low) == val32)
13224 	  {
13225 	    if (info)
13226 	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
13227 					   simd_immediate_info::MSL, shift);
13228 	    return true;
13229 	  }
13230       }
13231 
13232   return false;
13233 }
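
/* Illustrative encodings: VAL32 == 0x00ab0000 matches the first loop as the
   8-bit value 0xab with LSL #16, and VAL32 == 0x0012ffff matches the MSL
   case as 0x12 with MSL #16 (only tried for AARCH64_CHECK_MOV).  */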
13234 
13235 /* Return true if replicating VAL64 is a valid immediate for the
13236    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
13237    use it to describe valid immediates.  */
13238 static bool
13239 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13240 				 simd_immediate_info *info,
13241 				 enum simd_immediate_check which)
13242 {
13243   unsigned int val32 = val64 & 0xffffffff;
13244   unsigned int val16 = val64 & 0xffff;
13245   unsigned int val8 = val64 & 0xff;
13246 
13247   if (val32 == (val64 >> 32))
13248     {
13249       if ((which & AARCH64_CHECK_ORR) != 0
13250 	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13251 						 simd_immediate_info::MOV))
13252 	return true;
13253 
13254       if ((which & AARCH64_CHECK_BIC) != 0
13255 	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13256 						 simd_immediate_info::MVN))
13257 	return true;
13258 
13259       /* Try using a replicated byte.  */
13260       if (which == AARCH64_CHECK_MOV
13261 	  && val16 == (val32 >> 16)
13262 	  && val8 == (val16 >> 8))
13263 	{
13264 	  if (info)
13265 	    *info = simd_immediate_info (QImode, val8);
13266 	  return true;
13267 	}
13268     }
13269 
13270   /* Try using a bit-to-bytemask.  */
13271   if (which == AARCH64_CHECK_MOV)
13272     {
13273       unsigned int i;
13274       for (i = 0; i < 64; i += 8)
13275 	{
13276 	  unsigned char byte = (val64 >> i) & 0xff;
13277 	  if (byte != 0 && byte != 0xff)
13278 	    break;
13279 	}
13280       if (i == 64)
13281 	{
13282 	  if (info)
13283 	    *info = simd_immediate_info (DImode, val64);
13284 	  return true;
13285 	}
13286     }
13287   return false;
13288 }
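
/* As an example of the bit-to-bytemask form, VAL64 == 0xff0000ff00ffff00 is
   accepted for AARCH64_CHECK_MOV: its two 32-bit halves differ, but every
   byte is either 0x00 or 0xff, so it can be materialised as a 64-bit MOVI
   immediate.  */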
13289 
13290 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13291    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
13292 
13293 static bool
13294 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13295 			     simd_immediate_info *info)
13296 {
13297   scalar_int_mode mode = DImode;
13298   unsigned int val32 = val64 & 0xffffffff;
13299   if (val32 == (val64 >> 32))
13300     {
13301       mode = SImode;
13302       unsigned int val16 = val32 & 0xffff;
13303       if (val16 == (val32 >> 16))
13304 	{
13305 	  mode = HImode;
13306 	  unsigned int val8 = val16 & 0xff;
13307 	  if (val8 == (val16 >> 8))
13308 	    mode = QImode;
13309 	}
13310     }
13311   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13312   if (IN_RANGE (val, -0x80, 0x7f))
13313     {
13314       /* DUP with no shift.  */
13315       if (info)
13316 	*info = simd_immediate_info (mode, val);
13317       return true;
13318     }
13319   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13320     {
13321       /* DUP with LSL #8.  */
13322       if (info)
13323 	*info = simd_immediate_info (mode, val);
13324       return true;
13325     }
13326   if (aarch64_bitmask_imm (val64, mode))
13327     {
13328       /* DUPM.  */
13329       if (info)
13330 	*info = simd_immediate_info (mode, val);
13331       return true;
13332     }
13333   return false;
13334 }
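
/* For instance, VAL64 == 0x0101010101010101 narrows to QImode with value 1
   and is handled as an unshifted DUP, while 0x2000200020002000 narrows to
   HImode and is handled as DUP with LSL #8.  */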
13335 
13336 /* Return true if OP is a valid SIMD immediate for the operation
13337    described by WHICH.  If INFO is nonnull, use it to describe valid
13338    immediates.  */
13339 bool
13340 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13341 			      enum simd_immediate_check which)
13342 {
13343   machine_mode mode = GET_MODE (op);
13344   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13345   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13346     return false;
13347 
13348   scalar_mode elt_mode = GET_MODE_INNER (mode);
13349   rtx base, step;
13350   unsigned int n_elts;
13351   if (GET_CODE (op) == CONST_VECTOR
13352       && CONST_VECTOR_DUPLICATE_P (op))
13353     n_elts = CONST_VECTOR_NPATTERNS (op);
13354   else if ((vec_flags & VEC_SVE_DATA)
13355 	   && const_vec_series_p (op, &base, &step))
13356     {
13357       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13358       if (!aarch64_sve_index_immediate_p (base)
13359 	  || !aarch64_sve_index_immediate_p (step))
13360 	return false;
13361 
13362       if (info)
13363 	*info = simd_immediate_info (elt_mode, base, step);
13364       return true;
13365     }
13366   else if (GET_CODE (op) == CONST_VECTOR
13367 	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13368     /* N_ELTS set above.  */;
13369   else
13370     return false;
13371 
13372   /* Handle PFALSE and PTRUE.  */
13373   if (vec_flags & VEC_SVE_PRED)
13374     return (op == CONST0_RTX (mode)
13375 	    || op == CONSTM1_RTX (mode));
13376 
13377   scalar_float_mode elt_float_mode;
13378   if (n_elts == 1
13379       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13380     {
13381       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13382       if (aarch64_float_const_zero_rtx_p (elt)
13383 	  || aarch64_float_const_representable_p (elt))
13384 	{
13385 	  if (info)
13386 	    *info = simd_immediate_info (elt_float_mode, elt);
13387 	  return true;
13388 	}
13389     }
13390 
13391   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13392   if (elt_size > 8)
13393     return false;
13394 
13395   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13396 
13397   /* Expand the vector constant out into a byte vector, with the least
13398      significant byte of the register first.  */
13399   auto_vec<unsigned char, 16> bytes;
13400   bytes.reserve (n_elts * elt_size);
13401   for (unsigned int i = 0; i < n_elts; i++)
13402     {
13403       /* The vector is provided in gcc endian-neutral fashion.
13404 	 For aarch64_be Advanced SIMD, it must be laid out in the vector
13405 	 register in reverse order.  */
13406       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13407       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13408 
13409       if (elt_mode != elt_int_mode)
13410 	elt = gen_lowpart (elt_int_mode, elt);
13411 
13412       if (!CONST_INT_P (elt))
13413 	return false;
13414 
13415       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13416       for (unsigned int byte = 0; byte < elt_size; byte++)
13417 	{
13418 	  bytes.quick_push (elt_val & 0xff);
13419 	  elt_val >>= BITS_PER_UNIT;
13420 	}
13421     }
13422 
13423   /* The immediate must repeat every eight bytes.  */
13424   unsigned int nbytes = bytes.length ();
13425   for (unsigned i = 8; i < nbytes; ++i)
13426     if (bytes[i] != bytes[i - 8])
13427       return false;
13428 
13429   /* Get the repeating 8-byte value as an integer.  No endian correction
13430      is needed here because bytes is already in lsb-first order.  */
13431   unsigned HOST_WIDE_INT val64 = 0;
13432   for (unsigned int i = 0; i < 8; i++)
13433     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13434 	      << (i * BITS_PER_UNIT));
13435 
13436   if (vec_flags & VEC_SVE_DATA)
13437     return aarch64_sve_valid_immediate (val64, info);
13438   else
13439     return aarch64_advsimd_valid_immediate (val64, info, which);
13440 }
13441 
13442 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13443    has a step in the range of INDEX.  Return the index expression if so,
13444    otherwise return null.  */
13445 rtx
13446 aarch64_check_zero_based_sve_index_immediate (rtx x)
13447 {
13448   rtx base, step;
13449   if (const_vec_series_p (x, &base, &step)
13450       && base == const0_rtx
13451       && aarch64_sve_index_immediate_p (step))
13452     return step;
13453   return NULL_RTX;
13454 }
13455 
13456 /* Check if immediate shift constants are within range.  */
13457 bool
13458 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13459 {
13460   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13461   if (left)
13462     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13463   else
13464     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13465 }
13466 
13467 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13468    operation of width WIDTH at bit position POS.  */
13469 
13470 rtx
13471 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13472 {
13473   gcc_assert (CONST_INT_P (width));
13474   gcc_assert (CONST_INT_P (pos));
13475 
13476   unsigned HOST_WIDE_INT mask
13477     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13478   return GEN_INT (mask << UINTVAL (pos));
13479 }
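
/* For example, WIDTH == 8 and POS == 16 gives the CONST_INT 0x00ff0000.  */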
13480 
13481 bool
13482 aarch64_mov_operand_p (rtx x, machine_mode mode)
13483 {
13484   if (GET_CODE (x) == HIGH
13485       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13486     return true;
13487 
13488   if (CONST_INT_P (x))
13489     return true;
13490 
13491   if (VECTOR_MODE_P (GET_MODE (x)))
13492     return aarch64_simd_valid_immediate (x, NULL);
13493 
13494   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13495     return true;
13496 
13497   if (aarch64_sve_cnt_immediate_p (x))
13498     return true;
13499 
13500   return aarch64_classify_symbolic_expression (x)
13501     == SYMBOL_TINY_ABSOLUTE;
13502 }
13503 
13504 /* Return a const_int vector of VAL.  */
13505 rtx
13506 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13507 {
13508   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13509   return gen_const_vec_duplicate (mode, c);
13510 }
13511 
13512 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
13513 
13514 bool
13515 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13516 {
13517   machine_mode vmode;
13518 
13519   vmode = aarch64_simd_container_mode (mode, 64);
13520   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13521   return aarch64_simd_valid_immediate (op_v, NULL);
13522 }
13523 
13524 /* Construct and return a PARALLEL RTX vector with elements numbering the
13525    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13526    the vector - from the perspective of the architecture.  This does not
13527    line up with GCC's perspective on lane numbers, so we end up with
13528    different masks depending on our target endian-ness.  The diagram
13529    below may help.  We must draw the distinction when building masks
13530    which select one half of the vector.  An instruction selecting
13531    architectural low-lanes for a big-endian target must be described using
13532    a mask selecting GCC high-lanes.
13533 
13534                  Big-Endian             Little-Endian
13535 
13536 GCC             0   1   2   3           3   2   1   0
13537               | x | x | x | x |       | x | x | x | x |
13538 Architecture    3   2   1   0           3   2   1   0
13539 
13540 Low Mask:         { 2, 3 }                { 0, 1 }
13541 High Mask:        { 0, 1 }                { 2, 3 }
13542 
13543    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
13544 
13545 rtx
13546 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13547 {
13548   rtvec v = rtvec_alloc (nunits / 2);
13549   int high_base = nunits / 2;
13550   int low_base = 0;
13551   int base;
13552   rtx t1;
13553   int i;
13554 
13555   if (BYTES_BIG_ENDIAN)
13556     base = high ? low_base : high_base;
13557   else
13558     base = high ? high_base : low_base;
13559 
13560   for (i = 0; i < nunits / 2; i++)
13561     RTVEC_ELT (v, i) = GEN_INT (base + i);
13562 
13563   t1 = gen_rtx_PARALLEL (mode, v);
13564   return t1;
13565 }
13566 
13567 /* Check OP for validity as a PARALLEL RTX vector with elements
13568    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13569    from the perspective of the architecture.  See the diagram above
13570    aarch64_simd_vect_par_cnst_half for more details.  */
13571 
13572 bool
13573 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13574 				       bool high)
13575 {
13576   int nelts;
13577   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13578     return false;
13579 
13580   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13581   HOST_WIDE_INT count_op = XVECLEN (op, 0);
13582   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13583   int i = 0;
13584 
13585   if (count_op != count_ideal)
13586     return false;
13587 
13588   for (i = 0; i < count_ideal; i++)
13589     {
13590       rtx elt_op = XVECEXP (op, 0, i);
13591       rtx elt_ideal = XVECEXP (ideal, 0, i);
13592 
13593       if (!CONST_INT_P (elt_op)
13594 	  || INTVAL (elt_ideal) != INTVAL (elt_op))
13595 	return false;
13596     }
13597   return true;
13598 }
13599 
13600 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
13601    HIGH (exclusive).  */
13602 void
13603 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13604 			  const_tree exp)
13605 {
13606   HOST_WIDE_INT lane;
13607   gcc_assert (CONST_INT_P (operand));
13608   lane = INTVAL (operand);
13609 
13610   if (lane < low || lane >= high)
13611   {
13612     if (exp)
13613       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13614     else
13615       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13616   }
13617 }
13618 
13619 /* Perform endian correction on lane number N, which indexes a vector
13620    of mode MODE, and return the result as an SImode rtx.  */
13621 
13622 rtx
13623 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13624 {
13625   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13626 }
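
/* On big-endian targets the architectural and GCC lane numberings run in
   opposite directions, so for V4SImode an N of 0 maps to 3 here; on
   little-endian targets N is returned unchanged.  */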
13627 
13628 /* Return TRUE if OP is a valid vector addressing mode.  */
13629 
13630 bool
13631 aarch64_simd_mem_operand_p (rtx op)
13632 {
13633   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13634 			|| REG_P (XEXP (op, 0)));
13635 }
13636 
13637 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
13638 
13639 bool
13640 aarch64_sve_ld1r_operand_p (rtx op)
13641 {
13642   struct aarch64_address_info addr;
13643   scalar_mode mode;
13644 
13645   return (MEM_P (op)
13646 	  && is_a <scalar_mode> (GET_MODE (op), &mode)
13647 	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13648 	  && addr.type == ADDRESS_REG_IMM
13649 	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13650 }
13651 
13652 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13653    The conditions for STR are the same.  */
13654 bool
13655 aarch64_sve_ldr_operand_p (rtx op)
13656 {
13657   struct aarch64_address_info addr;
13658 
13659   return (MEM_P (op)
13660 	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13661 				       false, ADDR_QUERY_ANY)
13662 	  && addr.type == ADDRESS_REG_IMM);
13663 }
13664 
13665 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13666    We need to be able to access the individual pieces, so the range
13667    is different from LD[234] and ST[234].  */
13668 bool
13669 aarch64_sve_struct_memory_operand_p (rtx op)
13670 {
13671   if (!MEM_P (op))
13672     return false;
13673 
13674   machine_mode mode = GET_MODE (op);
13675   struct aarch64_address_info addr;
13676   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13677 				 ADDR_QUERY_ANY)
13678       || addr.type != ADDRESS_REG_IMM)
13679     return false;
13680 
13681   poly_int64 first = addr.const_offset;
13682   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13683   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13684 	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13685 }
13686 
13687 /* Emit a register copy from operand to operand, taking care not to
13688    early-clobber source registers in the process.
13689 
13690    COUNT is the number of components into which the copy needs to be
13691    decomposed.  */
13692 void
13693 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13694 				unsigned int count)
13695 {
13696   unsigned int i;
13697   int rdest = REGNO (operands[0]);
13698   int rsrc = REGNO (operands[1]);
13699 
13700   if (!reg_overlap_mentioned_p (operands[0], operands[1])
13701       || rdest < rsrc)
13702     for (i = 0; i < count; i++)
13703       emit_move_insn (gen_rtx_REG (mode, rdest + i),
13704 		      gen_rtx_REG (mode, rsrc + i));
13705   else
13706     for (i = 0; i < count; i++)
13707       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13708 		      gen_rtx_REG (mode, rsrc + count - i - 1));
13709 }
13710 
13711 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13712    one of VSTRUCT modes: OI, CI, or XI.  */
13713 int
13714 aarch64_simd_attr_length_rglist (machine_mode mode)
13715 {
13716   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
13717   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13718 }
13719 
13720 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
13721    alignment of a vector to 128 bits.  SVE predicates have an alignment of
13722    16 bits.  */
13723 static HOST_WIDE_INT
13724 aarch64_simd_vector_alignment (const_tree type)
13725 {
13726   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13727     /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13728        be set for non-predicate vectors of booleans.  Modes are the most
13729        direct way we have of identifying real SVE predicate types.  */
13730     return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13731   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13732   return MIN (align, 128);
13733 }
13734 
13735 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
13736 static HOST_WIDE_INT
13737 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13738 {
13739   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13740     {
13741       /* If the length of the vector is fixed, try to align to that length,
13742 	 otherwise don't try to align at all.  */
13743       HOST_WIDE_INT result;
13744       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13745 	result = TYPE_ALIGN (TREE_TYPE (type));
13746       return result;
13747     }
13748   return TYPE_ALIGN (type);
13749 }
13750 
13751 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
13752 static bool
13753 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13754 {
13755   if (is_packed)
13756     return false;
13757 
13758   /* For fixed-length vectors, check that the vectorizer will aim for
13759      full-vector alignment.  This isn't true for generic GCC vectors
13760      that are wider than the ABI maximum of 128 bits.  */
13761   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13762       && (wi::to_widest (TYPE_SIZE (type))
13763 	  != aarch64_vectorize_preferred_vector_alignment (type)))
13764     return false;
13765 
13766   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
13767   return true;
13768 }
13769 
13770 /* Return true if the vector misalignment factor is supported by the
13771    target.  */
13772 static bool
13773 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13774 					     const_tree type, int misalignment,
13775 					     bool is_packed)
13776 {
13777   if (TARGET_SIMD && STRICT_ALIGNMENT)
13778     {
13779       /* Return if movmisalign pattern is not supported for this mode.  */
13780       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13781         return false;
13782 
13783       /* Misalignment factor is unknown at compile time.  */
13784       if (misalignment == -1)
13785 	return false;
13786     }
13787   return default_builtin_support_vector_misalignment (mode, type, misalignment,
13788 						      is_packed);
13789 }
13790 
13791 /* If VALS is a vector constant that can be loaded into a register
13792    using DUP, generate instructions to do so and return an RTX to
13793    assign to the register.  Otherwise return NULL_RTX.  */
13794 static rtx
13795 aarch64_simd_dup_constant (rtx vals)
13796 {
13797   machine_mode mode = GET_MODE (vals);
13798   machine_mode inner_mode = GET_MODE_INNER (mode);
13799   rtx x;
13800 
13801   if (!const_vec_duplicate_p (vals, &x))
13802     return NULL_RTX;
13803 
13804   /* We can load this constant by using DUP and a constant in a
13805      single ARM register.  This will be cheaper than a vector
13806      load.  */
13807   x = copy_to_mode_reg (inner_mode, x);
13808   return gen_vec_duplicate (mode, x);
13809 }
13810 
13811 
13812 /* Generate code to load VALS, which is a PARALLEL containing only
13813    constants (for vec_init) or CONST_VECTOR, efficiently into a
13814    register.  Returns an RTX to copy into the register, or NULL_RTX
13815    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
13816 static rtx
13817 aarch64_simd_make_constant (rtx vals)
13818 {
13819   machine_mode mode = GET_MODE (vals);
13820   rtx const_dup;
13821   rtx const_vec = NULL_RTX;
13822   int n_const = 0;
13823   int i;
13824 
13825   if (GET_CODE (vals) == CONST_VECTOR)
13826     const_vec = vals;
13827   else if (GET_CODE (vals) == PARALLEL)
13828     {
13829       /* A CONST_VECTOR must contain only CONST_INTs and
13830 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13831 	 Only store valid constants in a CONST_VECTOR.  */
13832       int n_elts = XVECLEN (vals, 0);
13833       for (i = 0; i < n_elts; ++i)
13834 	{
13835 	  rtx x = XVECEXP (vals, 0, i);
13836 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13837 	    n_const++;
13838 	}
13839       if (n_const == n_elts)
13840 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13841     }
13842   else
13843     gcc_unreachable ();
13844 
13845   if (const_vec != NULL_RTX
13846       && aarch64_simd_valid_immediate (const_vec, NULL))
13847     /* Load using MOVI/MVNI.  */
13848     return const_vec;
13849   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13850     /* Loaded using DUP.  */
13851     return const_dup;
13852   else if (const_vec != NULL_RTX)
13853     /* Load from constant pool. We can not take advantage of single-cycle
13854        LD1 because we need a PC-relative addressing mode.  */
13855     return const_vec;
13856   else
13857     /* A PARALLEL containing something not valid inside CONST_VECTOR.
13858        We can not construct an initializer.  */
13859     return NULL_RTX;
13860 }
13861 
13862 /* Expand a vector initialisation sequence, such that TARGET is
13863    initialised to contain VALS.  */
13864 
13865 void
13866 aarch64_expand_vector_init (rtx target, rtx vals)
13867 {
13868   machine_mode mode = GET_MODE (target);
13869   scalar_mode inner_mode = GET_MODE_INNER (mode);
13870   /* The number of vector elements.  */
13871   int n_elts = XVECLEN (vals, 0);
13872   /* The number of vector elements which are not constant.  */
13873   int n_var = 0;
13874   rtx any_const = NULL_RTX;
13875   /* The first element of vals.  */
13876   rtx v0 = XVECEXP (vals, 0, 0);
13877   bool all_same = true;
13878 
13879   /* Count the number of variable elements to initialise.  */
13880   for (int i = 0; i < n_elts; ++i)
13881     {
13882       rtx x = XVECEXP (vals, 0, i);
13883       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13884 	++n_var;
13885       else
13886 	any_const = x;
13887 
13888       all_same &= rtx_equal_p (x, v0);
13889     }
13890 
13891   /* No variable elements, hand off to aarch64_simd_make_constant which knows
13892      how best to handle this.  */
13893   if (n_var == 0)
13894     {
13895       rtx constant = aarch64_simd_make_constant (vals);
13896       if (constant != NULL_RTX)
13897 	{
13898 	  emit_move_insn (target, constant);
13899 	  return;
13900 	}
13901     }
13902 
13903   /* Splat a single non-constant element if we can.  */
13904   if (all_same)
13905     {
13906       rtx x = copy_to_mode_reg (inner_mode, v0);
13907       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13908       return;
13909     }
13910 
13911   enum insn_code icode = optab_handler (vec_set_optab, mode);
13912   gcc_assert (icode != CODE_FOR_nothing);
13913 
13914   /* If there are only variable elements, try to optimize
13915      the insertion using dup for the most common element
13916      followed by insertions.  */
13917 
13918   /* The algorithm will fill matches[*][0] with the earliest matching element,
13919      and matches[X][1] with the count of duplicate elements (if X is the
13920      earliest element which has duplicates).  */
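
  /* For example, for the lanes (a, b, a, a) this gives matches[0] == {0, 3},
     matches[1] == {1, 1} and matches[2] == matches[3] == {0, 0}, so lane 0
     is duplicated into every lane first and only lane 1 needs an explicit
     insert.  */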
13921 
13922   if (n_var == n_elts && n_elts <= 16)
13923     {
13924       int matches[16][2] = {0};
13925       for (int i = 0; i < n_elts; i++)
13926 	{
13927 	  for (int j = 0; j <= i; j++)
13928 	    {
13929 	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13930 		{
13931 		  matches[i][0] = j;
13932 		  matches[j][1]++;
13933 		  break;
13934 		}
13935 	    }
13936 	}
13937       int maxelement = 0;
13938       int maxv = 0;
13939       for (int i = 0; i < n_elts; i++)
13940 	if (matches[i][1] > maxv)
13941 	  {
13942 	    maxelement = i;
13943 	    maxv = matches[i][1];
13944 	  }
13945 
13946       /* Create a duplicate of the most common element.  */
13947       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13948       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13949 
13950       /* Insert the rest.  */
13951       for (int i = 0; i < n_elts; i++)
13952 	{
13953 	  rtx x = XVECEXP (vals, 0, i);
13954 	  if (matches[i][0] == maxelement)
13955 	    continue;
13956 	  x = copy_to_mode_reg (inner_mode, x);
13957 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13958 	}
13959       return;
13960     }
13961 
13962   /* Initialise a vector which is part-variable.  We want to first try
13963      to build those lanes which are constant in the most efficient way we
13964      can.  */
13965   if (n_var != n_elts)
13966     {
13967       rtx copy = copy_rtx (vals);
13968 
13969       /* Load constant part of vector.  We really don't care what goes into the
13970 	 parts we will overwrite, but we're more likely to be able to load the
13971 	 constant efficiently if it has fewer, larger, repeating parts
13972 	 (see aarch64_simd_valid_immediate).  */
13973       for (int i = 0; i < n_elts; i++)
13974 	{
13975 	  rtx x = XVECEXP (vals, 0, i);
13976 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13977 	    continue;
13978 	  rtx subst = any_const;
13979 	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
13980 	    {
13981 	      /* Look in the copied vector, as more elements are const.  */
13982 	      rtx test = XVECEXP (copy, 0, i ^ bit);
13983 	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13984 		{
13985 		  subst = test;
13986 		  break;
13987 		}
13988 	    }
13989 	  XVECEXP (copy, 0, i) = subst;
13990 	}
13991       aarch64_expand_vector_init (target, copy);
13992     }
13993 
13994   /* Insert the variable lanes directly.  */
13995   for (int i = 0; i < n_elts; i++)
13996     {
13997       rtx x = XVECEXP (vals, 0, i);
13998       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13999 	continue;
14000       x = copy_to_mode_reg (inner_mode, x);
14001       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14002     }
14003 }
14004 
14005 static unsigned HOST_WIDE_INT
14006 aarch64_shift_truncation_mask (machine_mode mode)
14007 {
14008   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14009     return 0;
14010   return GET_MODE_UNIT_BITSIZE (mode) - 1;
14011 }
14012 
14013 /* Select a format to encode pointers in exception handling data.  */
14014 int
14015 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14016 {
14017    int type;
14018    switch (aarch64_cmodel)
14019      {
14020      case AARCH64_CMODEL_TINY:
14021      case AARCH64_CMODEL_TINY_PIC:
14022      case AARCH64_CMODEL_SMALL:
14023      case AARCH64_CMODEL_SMALL_PIC:
14024      case AARCH64_CMODEL_SMALL_SPIC:
14025        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
14026 	  for everything.  */
14027        type = DW_EH_PE_sdata4;
14028        break;
14029      default:
14030        /* No assumptions here.  8-byte relocs required.  */
14031        type = DW_EH_PE_sdata8;
14032        break;
14033      }
14034    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14035 }
14036 
14037 /* The last .arch and .tune assembly strings that we printed.  */
14038 static std::string aarch64_last_printed_arch_string;
14039 static std::string aarch64_last_printed_tune_string;
14040 
14041 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
14042    by the function fndecl.  */
14043 
14044 void
14045 aarch64_declare_function_name (FILE *stream, const char* name,
14046 				tree fndecl)
14047 {
14048   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14049 
14050   struct cl_target_option *targ_options;
14051   if (target_parts)
14052     targ_options = TREE_TARGET_OPTION (target_parts);
14053   else
14054     targ_options = TREE_TARGET_OPTION (target_option_current_node);
14055   gcc_assert (targ_options);
14056 
14057   const struct processor *this_arch
14058     = aarch64_get_arch (targ_options->x_explicit_arch);
14059 
14060   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14061   std::string extension
14062     = aarch64_get_extension_string_for_isa_flags (isa_flags,
14063 						  this_arch->flags);
14064   /* Only update the assembler .arch string if it is distinct from the last
14065      such string we printed.  */
14066   std::string to_print = this_arch->name + extension;
14067   if (to_print != aarch64_last_printed_arch_string)
14068     {
14069       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14070       aarch64_last_printed_arch_string = to_print;
14071     }
14072 
14073   /* Print the cpu name we're tuning for in the comments; it might be
14074      useful to readers of the generated asm.  Do it only when it changes
14075      from function to function and verbose assembly is requested.  */
14076   const struct processor *this_tune
14077     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14078 
14079   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14080     {
14081       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14082 		   this_tune->name);
14083       aarch64_last_printed_tune_string = this_tune->name;
14084     }
14085 
14086   /* Don't forget the type directive for ELF.  */
14087   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14088   ASM_OUTPUT_LABEL (stream, name);
14089 }
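
/* A typical (illustrative) result is a ".arch armv8-a+crc" line before the
   first function that needs it, followed by the usual .type directive and
   label; the .arch line is only reprinted when the target string changes
   between functions.  */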
14090 
14091 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
14092 
14093 static void
14094 aarch64_start_file (void)
14095 {
14096   struct cl_target_option *default_options
14097     = TREE_TARGET_OPTION (target_option_default_node);
14098 
14099   const struct processor *default_arch
14100     = aarch64_get_arch (default_options->x_explicit_arch);
14101   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14102   std::string extension
14103     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14104 						  default_arch->flags);
14105 
14106    aarch64_last_printed_arch_string = default_arch->name + extension;
14107    aarch64_last_printed_tune_string = "";
14108    asm_fprintf (asm_out_file, "\t.arch %s\n",
14109 		aarch64_last_printed_arch_string.c_str ());
14110 
14111    default_file_start ();
14112 }
14113 
14114 /* Emit load exclusive.  */
14115 
14116 static void
14117 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14118 			     rtx mem, rtx model_rtx)
14119 {
14120   rtx (*gen) (rtx, rtx, rtx);
14121 
14122   switch (mode)
14123     {
14124     case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14125     case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14126     case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14127     case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14128     default:
14129       gcc_unreachable ();
14130     }
14131 
14132   emit_insn (gen (rval, mem, model_rtx));
14133 }
14134 
14135 /* Emit store exclusive.  */
14136 
14137 static void
14138 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14139 			      rtx rval, rtx mem, rtx model_rtx)
14140 {
14141   rtx (*gen) (rtx, rtx, rtx, rtx);
14142 
14143   switch (mode)
14144     {
14145     case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14146     case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14147     case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14148     case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14149     default:
14150       gcc_unreachable ();
14151     }
14152 
14153   emit_insn (gen (bval, rval, mem, model_rtx));
14154 }
14155 
14156 /* Mark the previous jump instruction as unlikely.  */
14157 
14158 static void
14159 aarch64_emit_unlikely_jump (rtx insn)
14160 {
14161   rtx_insn *jump = emit_jump_insn (insn);
14162   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14163 }
14164 
14165 /* Expand a compare and swap pattern.  */
14166 
14167 void
14168 aarch64_expand_compare_and_swap (rtx operands[])
14169 {
14170   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14171   machine_mode mode, cmp_mode;
14172   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14173   int idx;
14174   gen_cas_fn gen;
14175   const gen_cas_fn split_cas[] =
14176   {
14177     gen_aarch64_compare_and_swapqi,
14178     gen_aarch64_compare_and_swaphi,
14179     gen_aarch64_compare_and_swapsi,
14180     gen_aarch64_compare_and_swapdi
14181   };
14182   const gen_cas_fn atomic_cas[] =
14183   {
14184     gen_aarch64_compare_and_swapqi_lse,
14185     gen_aarch64_compare_and_swaphi_lse,
14186     gen_aarch64_compare_and_swapsi_lse,
14187     gen_aarch64_compare_and_swapdi_lse
14188   };
14189 
14190   bval = operands[0];
14191   rval = operands[1];
14192   mem = operands[2];
14193   oldval = operands[3];
14194   newval = operands[4];
14195   is_weak = operands[5];
14196   mod_s = operands[6];
14197   mod_f = operands[7];
14198   mode = GET_MODE (mem);
14199   cmp_mode = mode;
14200 
14201   /* Normally the succ memory model must be stronger than fail, but in the
14202      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14203      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
14204 
14205   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14206       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14207     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14208 
14209   switch (mode)
14210     {
14211     case E_QImode:
14212     case E_HImode:
14213       /* For short modes, we're going to perform the comparison in SImode,
14214 	 so do the zero-extension now.  */
14215       cmp_mode = SImode;
14216       rval = gen_reg_rtx (SImode);
14217       oldval = convert_modes (SImode, mode, oldval, true);
14218       /* Fall through.  */
14219 
14220     case E_SImode:
14221     case E_DImode:
14222       /* Force the value into a register if needed.  */
14223       if (!aarch64_plus_operand (oldval, mode))
14224 	oldval = force_reg (cmp_mode, oldval);
14225       break;
14226 
14227     default:
14228       gcc_unreachable ();
14229     }
14230 
14231   switch (mode)
14232     {
14233     case E_QImode: idx = 0; break;
14234     case E_HImode: idx = 1; break;
14235     case E_SImode: idx = 2; break;
14236     case E_DImode: idx = 3; break;
14237     default:
14238       gcc_unreachable ();
14239     }
14240   if (TARGET_LSE)
14241     gen = atomic_cas[idx];
14242   else
14243     gen = split_cas[idx];
14244 
14245   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14246 
14247   if (mode == QImode || mode == HImode)
14248     emit_move_insn (operands[1], gen_lowpart (mode, rval));
14249 
14250   x = gen_rtx_REG (CCmode, CC_REGNUM);
14251   x = gen_rtx_EQ (SImode, x, const0_rtx);
14252   emit_insn (gen_rtx_SET (bval, x));
14253 }
14254 
14255 /* Test whether the target supports using an atomic load-operate instruction.
14256    CODE is the operation and AFTER is TRUE if the data in memory after the
14257    operation should be returned and FALSE if the data before the operation
14258    should be returned.  Returns FALSE if the operation isn't supported by the
14259    architecture.  */
14260 
14261 bool
14262 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14263 {
14264   if (!TARGET_LSE)
14265     return false;
14266 
14267   switch (code)
14268     {
14269     case SET:
14270     case AND:
14271     case IOR:
14272     case XOR:
14273     case MINUS:
14274     case PLUS:
14275       return true;
14276     default:
14277       return false;
14278     }
14279 }
14280 
14281 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14282    sequence implementing an atomic operation.  */
14283 
14284 static void
14285 aarch64_emit_post_barrier (enum memmodel model)
14286 {
14287   const enum memmodel base_model = memmodel_base (model);
14288 
14289   if (is_mm_sync (model)
14290       && (base_model == MEMMODEL_ACQUIRE
14291 	  || base_model == MEMMODEL_ACQ_REL
14292 	  || base_model == MEMMODEL_SEQ_CST))
14293     {
14294       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14295     }
14296 }
14297 
14298 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
14299    for the data in memory.  EXPECTED is the value expected to be in memory.
14300    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
14301    is the memory ordering to use.  */
14302 
14303 void
14304 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14305 			rtx expected, rtx desired,
14306 			rtx model)
14307 {
14308   rtx (*gen) (rtx, rtx, rtx, rtx);
14309   machine_mode mode;
14310 
14311   mode = GET_MODE (mem);
14312 
14313   switch (mode)
14314     {
14315     case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14316     case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14317     case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14318     case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14319     default:
14320       gcc_unreachable ();
14321     }
14322 
14323   /* Move the expected value into the CAS destination register.  */
14324   emit_insn (gen_rtx_SET (rval, expected));
14325 
14326   /* Emit the CAS.  */
14327   emit_insn (gen (rval, mem, desired, model));
14328 
14329   /* Compare the expected value with the value loaded by the CAS, to establish
14330      whether the swap was made.  */
14331   aarch64_gen_compare_reg (EQ, rval, expected);
14332 }
14333 
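/* As a rough illustration (assuming TARGET_LSE, an SImode location and
   MEMMODEL_ACQ_REL; register names are placeholders), the sequence emitted
   above boils down to something like:

	mov	w_rval, w_expected
	casal	w_rval, w_desired, [x_mem]
	cmp	w_rval, w_expected

   i.e. the CAS instruction both performs the swap and returns the old value,
   and the final compare recreates the success flag expected by the caller.  */
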
14334 /* Split a compare and swap pattern.  */
14335 
14336 void
14337 aarch64_split_compare_and_swap (rtx operands[])
14338 {
14339   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
14340   gcc_assert (epilogue_completed);
14341 
14342   rtx rval, mem, oldval, newval, scratch;
14343   machine_mode mode;
14344   bool is_weak;
14345   rtx_code_label *label1, *label2;
14346   rtx x, cond;
14347   enum memmodel model;
14348   rtx model_rtx;
14349 
14350   rval = operands[0];
14351   mem = operands[1];
14352   oldval = operands[2];
14353   newval = operands[3];
14354   is_weak = (operands[4] != const0_rtx);
14355   model_rtx = operands[5];
14356   scratch = operands[7];
14357   mode = GET_MODE (mem);
14358   model = memmodel_from_int (INTVAL (model_rtx));
14359 
14360   /* When OLDVAL is zero and we want the strong version, we can emit a tighter
14361     loop:
14362     .label1:
14363 	LD[A]XR	rval, [mem]
14364 	CBNZ	rval, .label2
14365 	ST[L]XR	scratch, newval, [mem]
14366 	CBNZ	scratch, .label1
14367     .label2:
14368 	CMP	rval, 0.  */
14369   bool strong_zero_p = !is_weak && oldval == const0_rtx;
14370 
14371   label1 = NULL;
14372   if (!is_weak)
14373     {
14374       label1 = gen_label_rtx ();
14375       emit_label (label1);
14376     }
14377   label2 = gen_label_rtx ();
14378 
14379   /* The initial load can be relaxed for a __sync operation since a final
14380      barrier will be emitted to stop code hoisting.  */
14381   if (is_mm_sync (model))
14382     aarch64_emit_load_exclusive (mode, rval, mem,
14383 				 GEN_INT (MEMMODEL_RELAXED));
14384   else
14385     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14386 
14387   if (strong_zero_p)
14388     {
14389       x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14390       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14391 				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14392       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14393     }
14394   else
14395     {
14396       cond = aarch64_gen_compare_reg (NE, rval, oldval);
14397       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14398       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14399 				 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14400       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14401     }
14402 
14403   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14404 
14405   if (!is_weak)
14406     {
14407       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14408       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14409 				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14410       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14411     }
14412   else
14413     {
14414       cond = gen_rtx_REG (CCmode, CC_REGNUM);
14415       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14416       emit_insn (gen_rtx_SET (cond, x));
14417     }
14418 
14419   emit_label (label2);
14420   /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14421      to set the condition flags.  If this is not used it will be removed by
14422      later passes.  */
14423   if (strong_zero_p)
14424     {
14425       cond = gen_rtx_REG (CCmode, CC_REGNUM);
14426       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14427       emit_insn (gen_rtx_SET (cond, x));
14428     }
14429   /* Emit any final barrier needed for a __sync operation.  */
14430   if (is_mm_sync (model))
14431     aarch64_emit_post_barrier (model);
14432 }
14433 
14434 /* Emit a BIC instruction.  */
14435 
14436 static void
14437 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14438 {
14439   rtx shift_rtx = GEN_INT (shift);
14440   rtx (*gen) (rtx, rtx, rtx, rtx);
14441 
14442   switch (mode)
14443     {
14444     case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14445     case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14446     default:
14447       gcc_unreachable ();
14448     }
14449 
14450   emit_insn (gen (dst, s2, shift_rtx, s1));
14451 }
14452 
14453 /* Emit an atomic swap.  */
14454 
14455 static void
14456 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14457 			  rtx mem, rtx model)
14458 {
14459   rtx (*gen) (rtx, rtx, rtx, rtx);
14460 
14461   switch (mode)
14462     {
14463     case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14464     case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14465     case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14466     case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14467     default:
14468       gcc_unreachable ();
14469     }
14470 
14471   emit_insn (gen (dst, mem, value, model));
14472 }
14473 
14474 /* Operations supported by aarch64_emit_atomic_load_op.  */
14475 
14476 enum aarch64_atomic_load_op_code
14477 {
14478   AARCH64_LDOP_PLUS,	/* A + B  */
14479   AARCH64_LDOP_XOR,	/* A ^ B  */
14480   AARCH64_LDOP_OR,	/* A | B  */
14481   AARCH64_LDOP_BIC	/* A & ~B  */
14482 };
14483 
14484 /* Emit an atomic load-operate.  */
14485 
14486 static void
14487 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14488 			     machine_mode mode, rtx dst, rtx src,
14489 			     rtx mem, rtx model)
14490 {
14491   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14492   const aarch64_atomic_load_op_fn plus[] =
14493   {
14494     gen_aarch64_atomic_loadaddqi,
14495     gen_aarch64_atomic_loadaddhi,
14496     gen_aarch64_atomic_loadaddsi,
14497     gen_aarch64_atomic_loadadddi
14498   };
14499   const aarch64_atomic_load_op_fn eor[] =
14500   {
14501     gen_aarch64_atomic_loadeorqi,
14502     gen_aarch64_atomic_loadeorhi,
14503     gen_aarch64_atomic_loadeorsi,
14504     gen_aarch64_atomic_loadeordi
14505   };
14506   const aarch64_atomic_load_op_fn ior[] =
14507   {
14508     gen_aarch64_atomic_loadsetqi,
14509     gen_aarch64_atomic_loadsethi,
14510     gen_aarch64_atomic_loadsetsi,
14511     gen_aarch64_atomic_loadsetdi
14512   };
14513   const aarch64_atomic_load_op_fn bic[] =
14514   {
14515     gen_aarch64_atomic_loadclrqi,
14516     gen_aarch64_atomic_loadclrhi,
14517     gen_aarch64_atomic_loadclrsi,
14518     gen_aarch64_atomic_loadclrdi
14519   };
14520   aarch64_atomic_load_op_fn gen;
14521   int idx = 0;
14522 
14523   switch (mode)
14524     {
14525     case E_QImode: idx = 0; break;
14526     case E_HImode: idx = 1; break;
14527     case E_SImode: idx = 2; break;
14528     case E_DImode: idx = 3; break;
14529     default:
14530       gcc_unreachable ();
14531     }
14532 
14533   switch (code)
14534     {
14535     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14536     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14537     case AARCH64_LDOP_OR: gen = ior[idx]; break;
14538     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14539     default:
14540       gcc_unreachable ();
14541     }
14542 
14543   emit_insn (gen (dst, mem, src, model));
14544 }
14545 
14546 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
14547    location to store the data read from memory.  OUT_RESULT is the location to
14548    store the result of the operation.  MEM is the memory location to read and
14549    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
14550    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
14551    be NULL.  */
14552 
14553 void
14554 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14555 			 rtx mem, rtx value, rtx model_rtx)
14556 {
14557   machine_mode mode = GET_MODE (mem);
14558   machine_mode wmode = (mode == DImode ? DImode : SImode);
14559   const bool short_mode = (mode < SImode);
14560   aarch64_atomic_load_op_code ldop_code;
14561   rtx src;
14562   rtx x;
14563 
14564   if (out_data)
14565     out_data = gen_lowpart (mode, out_data);
14566 
14567   if (out_result)
14568     out_result = gen_lowpart (mode, out_result);
14569 
14570   /* Make sure the value is in a register, putting it into a destination
14571      register if it needs to be manipulated.  */
14572   if (!register_operand (value, mode)
14573       || code == AND || code == MINUS)
14574     {
14575       src = out_result ? out_result : out_data;
14576       emit_move_insn (src, gen_lowpart (mode, value));
14577     }
14578   else
14579     src = value;
14580   gcc_assert (register_operand (src, mode));
14581 
14582   /* Preprocess the data for the operation as necessary.  If the operation is
14583      a SET then emit a swap instruction and finish.  */
14584   switch (code)
14585     {
14586     case SET:
14587       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14588       return;
14589 
14590     case MINUS:
14591       /* Negate the value and treat it as a PLUS.  */
14592       {
14593 	rtx neg_src;
14594 
14595 	/* Resize the value if necessary.  */
14596 	if (short_mode)
14597 	  src = gen_lowpart (wmode, src);
14598 
14599 	neg_src = gen_rtx_NEG (wmode, src);
14600 	emit_insn (gen_rtx_SET (src, neg_src));
14601 
14602 	if (short_mode)
14603 	  src = gen_lowpart (mode, src);
14604       }
14605       /* Fall-through.  */
14606     case PLUS:
14607       ldop_code = AARCH64_LDOP_PLUS;
14608       break;
14609 
14610     case IOR:
14611       ldop_code = AARCH64_LDOP_OR;
14612       break;
14613 
14614     case XOR:
14615       ldop_code = AARCH64_LDOP_XOR;
14616       break;
14617 
14618     case AND:
14619       {
14620 	rtx not_src;
14621 
14622 	/* Resize the value if necessary.  */
14623 	if (short_mode)
14624 	  src = gen_lowpart (wmode, src);
14625 
14626 	not_src = gen_rtx_NOT (wmode, src);
14627 	emit_insn (gen_rtx_SET (src, not_src));
14628 
14629 	if (short_mode)
14630 	  src = gen_lowpart (mode, src);
14631       }
14632       ldop_code = AARCH64_LDOP_BIC;
14633       break;
14634 
14635     default:
14636       /* The operation can't be done with atomic instructions.  */
14637       gcc_unreachable ();
14638     }
14639 
14640   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14641 
14642   /* If necessary, calculate the data in memory after the update by redoing the
14643      operation from values in registers.  */
14644   if (!out_result)
14645     return;
14646 
14647   if (short_mode)
14648     {
14649       src = gen_lowpart (wmode, src);
14650       out_data = gen_lowpart (wmode, out_data);
14651       out_result = gen_lowpart (wmode, out_result);
14652     }
14653 
14654   x = NULL_RTX;
14655 
14656   switch (code)
14657     {
14658     case MINUS:
14659     case PLUS:
14660       x = gen_rtx_PLUS (wmode, out_data, src);
14661       break;
14662     case IOR:
14663       x = gen_rtx_IOR (wmode, out_data, src);
14664       break;
14665     case XOR:
14666       x = gen_rtx_XOR (wmode, out_data, src);
14667       break;
14668     case AND:
14669       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14670       return;
14671     default:
14672       gcc_unreachable ();
14673     }
14674 
14675   emit_set_insn (out_result, x);
14676 
14677   return;
14678 }
14679 
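/* For example, with TARGET_LSE an SImode __atomic_fetch_and is handled above
   with CODE == AND: the operand is first inverted in a register and LDCLR is
   used, since A & B == A & ~(~B); if the post-operation value is also needed,
   it is recomputed afterwards with a BIC of the inverted operand from the
   data loaded from memory.  */
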
14680 /* Split an atomic operation.  */
14681 
14682 void
14683 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14684 			 rtx value, rtx model_rtx, rtx cond)
14685 {
14686   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
14687   gcc_assert (epilogue_completed);
14688 
14689   machine_mode mode = GET_MODE (mem);
14690   machine_mode wmode = (mode == DImode ? DImode : SImode);
14691   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14692   const bool is_sync = is_mm_sync (model);
14693   rtx_code_label *label;
14694   rtx x;
14695 
14696   /* Split the atomic operation into a sequence.  */
14697   label = gen_label_rtx ();
14698   emit_label (label);
14699 
14700   if (new_out)
14701     new_out = gen_lowpart (wmode, new_out);
14702   if (old_out)
14703     old_out = gen_lowpart (wmode, old_out);
14704   else
14705     old_out = new_out;
14706   value = simplify_gen_subreg (wmode, value, mode, 0);
14707 
14708   /* The initial load can be relaxed for a __sync operation since a final
14709      barrier will be emitted to stop code hoisting.  */
14710   if (is_sync)
14711     aarch64_emit_load_exclusive (mode, old_out, mem,
14712 				 GEN_INT (MEMMODEL_RELAXED));
14713   else
14714     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14715 
14716   switch (code)
14717     {
14718     case SET:
14719       new_out = value;
14720       break;
14721 
14722     case NOT:
14723       x = gen_rtx_AND (wmode, old_out, value);
14724       emit_insn (gen_rtx_SET (new_out, x));
14725       x = gen_rtx_NOT (wmode, new_out);
14726       emit_insn (gen_rtx_SET (new_out, x));
14727       break;
14728 
14729     case MINUS:
14730       if (CONST_INT_P (value))
14731 	{
14732 	  value = GEN_INT (-INTVAL (value));
14733 	  code = PLUS;
14734 	}
14735       /* Fall through.  */
14736 
14737     default:
14738       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14739       emit_insn (gen_rtx_SET (new_out, x));
14740       break;
14741     }
14742 
14743   aarch64_emit_store_exclusive (mode, cond, mem,
14744 				gen_lowpart (mode, new_out), model_rtx);
14745 
14746   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14747   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14748 			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14749   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14750 
14751   /* Emit any final barrier needed for a __sync operation.  */
14752   if (is_sync)
14753     aarch64_emit_post_barrier (model);
14754 }
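
/* Illustrative shape of the sequence emitted above (a load-exclusive/
   store-exclusive loop; register names and exact opcodes depend on MODE,
   CODE and MODEL):
   .label:
	ld[a]xr	old, [mem]
	<op>	new, old, value
	st[l]xr	scratch, new, [mem]
	cbnz	scratch, .label  */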
14755 
14756 static void
14757 aarch64_init_libfuncs (void)
14758 {
14759   /* Half-precision float operations.  The compiler handles all operations
14760      with NULL libfuncs by converting to SFmode.  */
14761 
14762   /* Conversions.  */
14763   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14764   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14765 
14766   /* Arithmetic.  */
14767   set_optab_libfunc (add_optab, HFmode, NULL);
14768   set_optab_libfunc (sdiv_optab, HFmode, NULL);
14769   set_optab_libfunc (smul_optab, HFmode, NULL);
14770   set_optab_libfunc (neg_optab, HFmode, NULL);
14771   set_optab_libfunc (sub_optab, HFmode, NULL);
14772 
14773   /* Comparisons.  */
14774   set_optab_libfunc (eq_optab, HFmode, NULL);
14775   set_optab_libfunc (ne_optab, HFmode, NULL);
14776   set_optab_libfunc (lt_optab, HFmode, NULL);
14777   set_optab_libfunc (le_optab, HFmode, NULL);
14778   set_optab_libfunc (ge_optab, HFmode, NULL);
14779   set_optab_libfunc (gt_optab, HFmode, NULL);
14780   set_optab_libfunc (unord_optab, HFmode, NULL);
14781 }
14782 
14783 /* Target hook for c_mode_for_suffix.  */
14784 static machine_mode
14785 aarch64_c_mode_for_suffix (char suffix)
14786 {
14787   if (suffix == 'q')
14788     return TFmode;
14789 
14790   return VOIDmode;
14791 }
14792 
14793 /* We can only represent floating point constants which will fit in
14794    "quarter-precision" values.  These values are characterised by
14795    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
14796    by:
14797 
14798    (-1)^s * (n/16) * 2^r
14799 
14800    Where:
14801      's' is the sign bit.
14802      'n' is an integer in the range 16 <= n <= 31.
14803      'r' is an integer in the range -3 <= r <= 4.  */
14804 
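/* As a worked example (for illustration only): 1.0 is (16/16) * 2^0,
   0.125 is (16/16) * 2^-3 and 31.0 is (31/16) * 2^4, so the representable
   magnitudes run from 0.125 up to 31.0.  */
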
14805 /* Return true iff X can be represented by a quarter-precision
14806    floating point immediate operand.  Note, we cannot represent 0.0.  */
14807 bool
14808 aarch64_float_const_representable_p (rtx x)
14809 {
14810   /* This represents our current view of how many bits
14811      make up the mantissa.  */
14812   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14813   int exponent;
14814   unsigned HOST_WIDE_INT mantissa, mask;
14815   REAL_VALUE_TYPE r, m;
14816   bool fail;
14817 
14818   if (!CONST_DOUBLE_P (x))
14819     return false;
14820 
14821   /* We don't support HFmode constants yet.  */
14822   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14823     return false;
14824 
14825   r = *CONST_DOUBLE_REAL_VALUE (x);
14826 
14827   /* We cannot represent infinities, NaNs or +/-zero.  We won't
14828      know if we have +zero until we analyse the mantissa, but we
14829      can reject the other invalid values.  */
14830   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14831       || REAL_VALUE_MINUS_ZERO (r))
14832     return false;
14833 
14834   /* Extract exponent.  */
14835   r = real_value_abs (&r);
14836   exponent = REAL_EXP (&r);
14837 
14838   /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
14839      highest (sign) bit, with a fixed binary point at bit point_pos.
14840      The low half of W holds the low part of the mantissa, the high half
14841      the high part.  WARNING: if the mantissa ever needs more than
14842      2 * H_W_I - 1 bits, this can fail (low bits will be lost).  */
14843   real_ldexp (&m, &r, point_pos - exponent);
14844   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14845 
14846   /* If the low part of the mantissa has bits set we cannot represent
14847      the value.  */
14848   if (w.ulow () != 0)
14849     return false;
14850   /* We have rejected the lower HOST_WIDE_INT, so update our
14851      understanding of how many bits lie in the mantissa and
14852      look only at the high HOST_WIDE_INT.  */
14853   mantissa = w.elt (1);
14854   point_pos -= HOST_BITS_PER_WIDE_INT;
14855 
14856   /* We can only represent values with a mantissa of the form 1.xxxx.  */
14857   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14858   if ((mantissa & mask) != 0)
14859     return false;
14860 
14861   /* Having filtered unrepresentable values, we may now remove all
14862      but the highest 5 bits.  */
14863   mantissa >>= point_pos - 5;
14864 
14865   /* We cannot represent the value 0.0, so reject it.  This is handled
14866      elsewhere.  */
14867   if (mantissa == 0)
14868     return false;
14869 
14870   /* Then, as bit 4 is always set, we can mask it off, leaving
14871      the mantissa in the range [0, 15].  */
14872   mantissa &= ~(1 << 4);
14873   gcc_assert (mantissa <= 15);
14874 
14875   /* GCC internally does not use IEEE754-like encoding (where normalized
14876      significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
14877      Our mantissa values are shifted 4 places to the left relative to
14878      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14879      by 5 places to correct for GCC's representation.  */
14880   exponent = 5 - exponent;
14881 
14882   return (exponent >= 0 && exponent <= 7);
14883 }
14884 
14885 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14886    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
14887    output MOVI/MVNI, ORR or BIC immediate.  */
14888 char*
14889 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14890 				   enum simd_immediate_check which)
14891 {
14892   bool is_valid;
14893   static char templ[40];
14894   const char *mnemonic;
14895   const char *shift_op;
14896   unsigned int lane_count = 0;
14897   char element_char;
14898 
14899   struct simd_immediate_info info;
14900 
14901   /* This will return true to show const_vector is legal for use as either
14902      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14903      It will also update INFO to show how the immediate should be generated.
14904      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
14905   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14906   gcc_assert (is_valid);
14907 
14908   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14909   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14910 
14911   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14912     {
14913       gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14914       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14915 	 move immediate path.  */
14916       if (aarch64_float_const_zero_rtx_p (info.value))
14917         info.value = GEN_INT (0);
14918       else
14919 	{
14920 	  const unsigned int buf_size = 20;
14921 	  char float_buf[buf_size] = {'\0'};
14922 	  real_to_decimal_for_mode (float_buf,
14923 				    CONST_DOUBLE_REAL_VALUE (info.value),
14924 				    buf_size, buf_size, 1, info.elt_mode);
14925 
14926 	  if (lane_count == 1)
14927 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14928 	  else
14929 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14930 		      lane_count, element_char, float_buf);
14931 	  return templ;
14932 	}
14933     }
14934 
14935   gcc_assert (CONST_INT_P (info.value));
14936 
14937   if (which == AARCH64_CHECK_MOV)
14938     {
14939       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14940       shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14941       if (lane_count == 1)
14942 	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14943 		  mnemonic, UINTVAL (info.value));
14944       else if (info.shift)
14945 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14946 		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14947 		  element_char, UINTVAL (info.value), shift_op, info.shift);
14948       else
14949 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14950 		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14951 		  element_char, UINTVAL (info.value));
14952     }
14953   else
14954     {
14955       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
14956       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14957       if (info.shift)
14958 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14959 		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14960 		  element_char, UINTVAL (info.value), "lsl", info.shift);
14961       else
14962 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14963 		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14964 		  element_char, UINTVAL (info.value));
14965     }
14966   return templ;
14967 }
14968 
14969 char*
14970 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14971 {
14972 
14973   /* If a floating point number was passed and we want to use it in an
14974      integer mode, do the conversion to integer.  */
14975   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14976     {
14977       unsigned HOST_WIDE_INT ival;
14978       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14979 	  gcc_unreachable ();
14980       immediate = gen_int_mode (ival, mode);
14981     }
14982 
14983   machine_mode vmode;
14984   /* Use a 64-bit mode for everything except DI/DF mode, where we use
14985      a 128-bit vector mode.  */
14986   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14987 
14988   vmode = aarch64_simd_container_mode (mode, width);
14989   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14990   return aarch64_output_simd_mov_immediate (v_op, width);
14991 }
14992 
14993 /* Return the output string to use for moving immediate CONST_VECTOR
14994    into an SVE register.  */
14995 
14996 char *
14997 aarch64_output_sve_mov_immediate (rtx const_vector)
14998 {
14999   static char templ[40];
15000   struct simd_immediate_info info;
15001   char element_char;
15002 
15003   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15004   gcc_assert (is_valid);
15005 
15006   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15007 
15008   if (info.step)
15009     {
15010       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15011 		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15012 		element_char, INTVAL (info.value), INTVAL (info.step));
15013       return templ;
15014     }
15015 
15016   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15017     {
15018       if (aarch64_float_const_zero_rtx_p (info.value))
15019 	info.value = GEN_INT (0);
15020       else
15021 	{
15022 	  const int buf_size = 20;
15023 	  char float_buf[buf_size] = {};
15024 	  real_to_decimal_for_mode (float_buf,
15025 				    CONST_DOUBLE_REAL_VALUE (info.value),
15026 				    buf_size, buf_size, 1, info.elt_mode);
15027 
15028 	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15029 		    element_char, float_buf);
15030 	  return templ;
15031 	}
15032     }
15033 
15034   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15035 	    element_char, INTVAL (info.value));
15036   return templ;
15037 }
15038 
15039 /* Return the asm format for a PTRUE instruction whose destination has
15040    mode MODE.  SUFFIX is the element size suffix.  */
15041 
15042 char *
15043 aarch64_output_ptrue (machine_mode mode, char suffix)
15044 {
15045   unsigned int nunits;
15046   static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15047   if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15048     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15049   else
15050     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15051   return buf;
15052 }
15053 
15054 /* Split operands into moves from op[1] + op[2] into op[0].  */
15055 
15056 void
15057 aarch64_split_combinev16qi (rtx operands[3])
15058 {
15059   unsigned int dest = REGNO (operands[0]);
15060   unsigned int src1 = REGNO (operands[1]);
15061   unsigned int src2 = REGNO (operands[2]);
15062   machine_mode halfmode = GET_MODE (operands[1]);
15063   unsigned int halfregs = REG_NREGS (operands[1]);
15064   rtx destlo, desthi;
15065 
15066   gcc_assert (halfmode == V16QImode);
15067 
15068   if (src1 == dest && src2 == dest + halfregs)
15069     {
15070       /* No-op move.  Can't split to nothing; emit something.  */
15071       emit_note (NOTE_INSN_DELETED);
15072       return;
15073     }
15074 
15075   /* Preserve register attributes for variable tracking.  */
15076   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15077   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15078 			       GET_MODE_SIZE (halfmode));
15079 
15080   /* Special case of reversed high/low parts.  */
15081   if (reg_overlap_mentioned_p (operands[2], destlo)
15082       && reg_overlap_mentioned_p (operands[1], desthi))
15083     {
15084       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15085       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15086       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15087     }
15088   else if (!reg_overlap_mentioned_p (operands[2], destlo))
15089     {
15090       /* Try to avoid unnecessary moves if part of the result
15091 	 is in the right place already.  */
15092       if (src1 != dest)
15093 	emit_move_insn (destlo, operands[1]);
15094       if (src2 != dest + halfregs)
15095 	emit_move_insn (desthi, operands[2]);
15096     }
15097   else
15098     {
15099       if (src2 != dest + halfregs)
15100 	emit_move_insn (desthi, operands[2]);
15101       if (src1 != dest)
15102 	emit_move_insn (destlo, operands[1]);
15103     }
15104 }
15105 
15106 /* vec_perm support.  */
15107 
15108 struct expand_vec_perm_d
15109 {
15110   rtx target, op0, op1;
15111   vec_perm_indices perm;
15112   machine_mode vmode;
15113   unsigned int vec_flags;
15114   bool one_vector_p;
15115   bool testing_p;
15116 };
15117 
15118 /* Generate a variable permutation.  */
15119 
15120 static void
15121 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15122 {
15123   machine_mode vmode = GET_MODE (target);
15124   bool one_vector_p = rtx_equal_p (op0, op1);
15125 
15126   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15127   gcc_checking_assert (GET_MODE (op0) == vmode);
15128   gcc_checking_assert (GET_MODE (op1) == vmode);
15129   gcc_checking_assert (GET_MODE (sel) == vmode);
15130   gcc_checking_assert (TARGET_SIMD);
15131 
15132   if (one_vector_p)
15133     {
15134       if (vmode == V8QImode)
15135 	{
15136 	  /* Expand the argument to a V16QI mode by duplicating it.  */
15137 	  rtx pair = gen_reg_rtx (V16QImode);
15138 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15139 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15140 	}
15141       else
15142 	{
15143 	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15144 	}
15145     }
15146   else
15147     {
15148       rtx pair;
15149 
15150       if (vmode == V8QImode)
15151 	{
15152 	  pair = gen_reg_rtx (V16QImode);
15153 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15154 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15155 	}
15156       else
15157 	{
15158 	  pair = gen_reg_rtx (OImode);
15159 	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15160 	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15161 	}
15162     }
15163 }
15164 
15165 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15166    NELT is the number of elements in the vector.  */
15167 
15168 void
15169 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15170 			 unsigned int nelt)
15171 {
15172   machine_mode vmode = GET_MODE (target);
15173   bool one_vector_p = rtx_equal_p (op0, op1);
15174   rtx mask;
15175 
15176   /* The TBL instruction does not use a modulo index, so we must take care
15177      of that ourselves.  */
15178   mask = aarch64_simd_gen_const_vector_dup (vmode,
15179       one_vector_p ? nelt - 1 : 2 * nelt - 1);
15180   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15181 
15182   /* For big-endian, we also need to reverse the index within the vector
15183      (but not which vector).  */
15184   if (BYTES_BIG_ENDIAN)
15185     {
15186       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
15187       if (!one_vector_p)
15188         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15189       sel = expand_simple_binop (vmode, XOR, sel, mask,
15190 				 NULL, 0, OPTAB_LIB_WIDEN);
15191     }
15192   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15193 }
15194 
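/* For instance (illustrative values), with V8QImode and a single input
   vector the mask built above is a vector of 7s, so an out-of-range index
   such as 9 selects lane 9 & 7 == 1, giving the wrapping behaviour that
   vec_perm requires but TBL itself does not provide.  */
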
15195 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
15196 
15197 static void
15198 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15199 {
15200   emit_insn (gen_rtx_SET (target,
15201 			  gen_rtx_UNSPEC (GET_MODE (target),
15202 					  gen_rtvec (2, op0, op1), code)));
15203 }
15204 
15205 /* Expand an SVE vec_perm with the given operands.  */
15206 
15207 void
15208 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15209 {
15210   machine_mode data_mode = GET_MODE (target);
15211   machine_mode sel_mode = GET_MODE (sel);
15212   /* Enforced by the pattern condition.  */
15213   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15214 
15215   /* Note: vec_perm indices are supposed to wrap when they go beyond the
15216      size of the two value vectors, i.e. the upper bits of the indices
15217      are effectively ignored.  SVE TBL instead produces 0 for any
15218      out-of-range indices, so we need to modulo all the vec_perm indices
15219      to ensure they are all in range.  */
15220   rtx sel_reg = force_reg (sel_mode, sel);
15221 
15222   /* Check if the sel only references the first values vector.  */
15223   if (GET_CODE (sel) == CONST_VECTOR
15224       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15225     {
15226       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15227       return;
15228     }
15229 
15230   /* Check if the two values vectors are the same.  */
15231   if (rtx_equal_p (op0, op1))
15232     {
15233       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15234       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15235 					 NULL, 0, OPTAB_DIRECT);
15236       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15237       return;
15238     }
15239 
15240   /* Run TBL on each value vector and combine the results.  */
15241 
15242   rtx res0 = gen_reg_rtx (data_mode);
15243   rtx res1 = gen_reg_rtx (data_mode);
15244   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15245   if (GET_CODE (sel) != CONST_VECTOR
15246       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15247     {
15248       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15249 						       2 * nunits - 1);
15250       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15251 				     NULL, 0, OPTAB_DIRECT);
15252     }
15253   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15254   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15255 				     NULL, 0, OPTAB_DIRECT);
15256   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15257   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15258     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15259   else
15260     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15261 }
15262 
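/* Summary of the general two-input case handled above: the first TBL yields
   zero for any (masked) index that refers to OP1; the second TBL, run after
   subtracting the element count from the indices, yields zero for indices
   that referred to OP0; the final OR (or UNSPEC_IORF for floating-point
   modes) then merges the two partial results into the full permutation.  */
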
15263 /* Recognize patterns suitable for the TRN instructions.  */
15264 static bool
15265 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15266 {
15267   HOST_WIDE_INT odd;
15268   poly_uint64 nelt = d->perm.length ();
15269   rtx out, in0, in1, x;
15270   machine_mode vmode = d->vmode;
15271 
15272   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15273     return false;
15274 
15275   /* Note that these are little-endian tests.
15276      We correct for big-endian later.  */
15277   if (!d->perm[0].is_constant (&odd)
15278       || (odd != 0 && odd != 1)
15279       || !d->perm.series_p (0, 2, odd, 2)
15280       || !d->perm.series_p (1, 2, nelt + odd, 2))
15281     return false;
15282 
15283   /* Success!  */
15284   if (d->testing_p)
15285     return true;
15286 
15287   in0 = d->op0;
15288   in1 = d->op1;
15289   /* We don't need a big-endian lane correction for SVE; see the comment
15290      at the head of aarch64-sve.md for details.  */
15291   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15292     {
15293       x = in0, in0 = in1, in1 = x;
15294       odd = !odd;
15295     }
15296   out = d->target;
15297 
15298   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15299 				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15300   return true;
15301 }
15302 
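/* For example, in V4SImode the permutation {0, 4, 2, 6} is matched above as
   TRN1 and {1, 5, 3, 7} as TRN2 (before any big-endian correction).  */
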
15303 /* Recognize patterns suitable for the UZP instructions.  */
15304 static bool
15305 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15306 {
15307   HOST_WIDE_INT odd;
15308   rtx out, in0, in1, x;
15309   machine_mode vmode = d->vmode;
15310 
15311   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15312     return false;
15313 
15314   /* Note that these are little-endian tests.
15315      We correct for big-endian later.  */
15316   if (!d->perm[0].is_constant (&odd)
15317       || (odd != 0 && odd != 1)
15318       || !d->perm.series_p (0, 1, odd, 2))
15319     return false;
15320 
15321   /* Success!  */
15322   if (d->testing_p)
15323     return true;
15324 
15325   in0 = d->op0;
15326   in1 = d->op1;
15327   /* We don't need a big-endian lane correction for SVE; see the comment
15328      at the head of aarch64-sve.md for details.  */
15329   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15330     {
15331       x = in0, in0 = in1, in1 = x;
15332       odd = !odd;
15333     }
15334   out = d->target;
15335 
15336   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15337 				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15338   return true;
15339 }
15340 
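/* For example, in V4SImode the permutation {0, 2, 4, 6} is matched above as
   UZP1 and {1, 3, 5, 7} as UZP2 (before any big-endian correction).  */
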
15341 /* Recognize patterns suitable for the ZIP instructions.  */
15342 static bool
15343 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15344 {
15345   unsigned int high;
15346   poly_uint64 nelt = d->perm.length ();
15347   rtx out, in0, in1, x;
15348   machine_mode vmode = d->vmode;
15349 
15350   if (GET_MODE_UNIT_SIZE (vmode) > 8)
15351     return false;
15352 
15353   /* Note that these are little-endian tests.
15354      We correct for big-endian later.  */
15355   poly_uint64 first = d->perm[0];
15356   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15357       || !d->perm.series_p (0, 2, first, 1)
15358       || !d->perm.series_p (1, 2, first + nelt, 1))
15359     return false;
15360   high = maybe_ne (first, 0U);
15361 
15362   /* Success!  */
15363   if (d->testing_p)
15364     return true;
15365 
15366   in0 = d->op0;
15367   in1 = d->op1;
15368   /* We don't need a big-endian lane correction for SVE; see the comment
15369      at the head of aarch64-sve.md for details.  */
15370   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15371     {
15372       x = in0, in0 = in1, in1 = x;
15373       high = !high;
15374     }
15375   out = d->target;
15376 
15377   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15378 				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15379   return true;
15380 }
15381 
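/* For example, in V4SImode the permutation {0, 4, 1, 5} is matched above as
   ZIP1 and {2, 6, 3, 7} as ZIP2 (before any big-endian correction).  */
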
15382 /* Recognize patterns for the EXT insn.  */
15383 
15384 static bool
15385 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15386 {
15387   HOST_WIDE_INT location;
15388   rtx offset;
15389 
15390   /* The first element always refers to the first vector.
15391      Check if the extracted indices are increasing by one.  */
15392   if (d->vec_flags == VEC_SVE_PRED
15393       || !d->perm[0].is_constant (&location)
15394       || !d->perm.series_p (0, 1, location, 1))
15395     return false;
15396 
15397   /* Success! */
15398   if (d->testing_p)
15399     return true;
15400 
15401   /* The case where (location == 0) is a no-op for both big- and little-endian,
15402      and is removed by the mid-end at optimization levels -O1 and higher.
15403 
15404      We don't need a big-endian lane correction for SVE; see the comment
15405      at the head of aarch64-sve.md for details.  */
15406   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15407     {
15408       /* After setup, we want the high elements of the first vector (stored
15409          at the LSB end of the register), and the low elements of the second
15410          vector (stored at the MSB end of the register). So swap.  */
15411       std::swap (d->op0, d->op1);
15412       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15413 	 to_constant () is safe since this is restricted to Advanced SIMD
15414 	 vectors.  */
15415       location = d->perm.length ().to_constant () - location;
15416     }
15417 
15418   offset = GEN_INT (location);
15419   emit_set_insn (d->target,
15420 		 gen_rtx_UNSPEC (d->vmode,
15421 				 gen_rtvec (3, d->op0, d->op1, offset),
15422 				 UNSPEC_EXT));
15423   return true;
15424 }
15425 
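/* For example, in V4SImode the permutation {1, 2, 3, 4} is matched above as
   an EXT with an offset of one element (location == 1).  */
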
15426 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15427    within each 64-bit, 32-bit or 16-bit granule.  */
15428 
15429 static bool
15430 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15431 {
15432   HOST_WIDE_INT diff;
15433   unsigned int i, size, unspec;
15434   machine_mode pred_mode;
15435 
15436   if (d->vec_flags == VEC_SVE_PRED
15437       || !d->one_vector_p
15438       || !d->perm[0].is_constant (&diff))
15439     return false;
15440 
15441   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15442   if (size == 8)
15443     {
15444       unspec = UNSPEC_REV64;
15445       pred_mode = VNx2BImode;
15446     }
15447   else if (size == 4)
15448     {
15449       unspec = UNSPEC_REV32;
15450       pred_mode = VNx4BImode;
15451     }
15452   else if (size == 2)
15453     {
15454       unspec = UNSPEC_REV16;
15455       pred_mode = VNx8BImode;
15456     }
15457   else
15458     return false;
15459 
15460   unsigned int step = diff + 1;
15461   for (i = 0; i < step; ++i)
15462     if (!d->perm.series_p (i, step, diff - i, step))
15463       return false;
15464 
15465   /* Success! */
15466   if (d->testing_p)
15467     return true;
15468 
15469   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15470   if (d->vec_flags == VEC_SVE_DATA)
15471     {
15472       rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15473       src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15474 			    UNSPEC_MERGE_PTRUE);
15475     }
15476   emit_set_insn (d->target, src);
15477   return true;
15478 }
15479 
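/* For example, in V8HImode the permutation {3, 2, 1, 0, 7, 6, 5, 4} reverses
   the 16-bit elements within each 64-bit granule and is matched above as
   REV64.  */
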
15480 /* Recognize patterns for the REV insn, which reverses elements within
15481    a full vector.  */
15482 
15483 static bool
15484 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15485 {
15486   poly_uint64 nelt = d->perm.length ();
15487 
15488   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15489     return false;
15490 
15491   if (!d->perm.series_p (0, 1, nelt - 1, -1))
15492     return false;
15493 
15494   /* Success! */
15495   if (d->testing_p)
15496     return true;
15497 
15498   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15499   emit_set_insn (d->target, src);
15500   return true;
15501 }
15502 
15503 static bool
15504 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15505 {
15506   rtx out = d->target;
15507   rtx in0;
15508   HOST_WIDE_INT elt;
15509   machine_mode vmode = d->vmode;
15510   rtx lane;
15511 
15512   if (d->vec_flags == VEC_SVE_PRED
15513       || d->perm.encoding ().encoded_nelts () != 1
15514       || !d->perm[0].is_constant (&elt))
15515     return false;
15516 
15517   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15518     return false;
15519 
15520   /* Success! */
15521   if (d->testing_p)
15522     return true;
15523 
15524   /* The generic preparation in aarch64_expand_vec_perm_const_1
15525      swaps the operand order and the permute indices if it finds
15526      d->perm[0] to be in the second operand.  Thus, we can always
15527      use d->op0 and need not do any extra arithmetic to get the
15528      correct lane number.  */
15529   in0 = d->op0;
15530   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
15531 
15532   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15533   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15534   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15535   return true;
15536 }
15537 
15538 static bool
15539 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15540 {
15541   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15542   machine_mode vmode = d->vmode;
15543 
15544   /* Make sure that the indices are constant.  */
15545   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15546   for (unsigned int i = 0; i < encoded_nelts; ++i)
15547     if (!d->perm[i].is_constant ())
15548       return false;
15549 
15550   if (d->testing_p)
15551     return true;
15552 
15553   /* Generic code will try constant permutation twice: once with the
15554      original mode and again with the elements lowered to QImode.
15555      So wait and don't do the selector expansion ourselves.  */
15556   if (vmode != V8QImode && vmode != V16QImode)
15557     return false;
15558 
15559   /* to_constant is safe since this routine is specific to Advanced SIMD
15560      vectors.  */
15561   unsigned int nelt = d->perm.length ().to_constant ();
15562   for (unsigned int i = 0; i < nelt; ++i)
15563     /* If big-endian and two vectors we end up with a weird mixed-endian
15564        mode on NEON.  Reverse the index within each word but not the word
15565        itself.  to_constant is safe because we checked is_constant above.  */
15566     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15567 			? d->perm[i].to_constant () ^ (nelt - 1)
15568 			: d->perm[i].to_constant ());
15569 
15570   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15571   sel = force_reg (vmode, sel);
15572 
15573   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15574   return true;
15575 }
15576 
15577 /* Try to implement D using an SVE TBL instruction.  */
15578 
15579 static bool
15580 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15581 {
15582   unsigned HOST_WIDE_INT nelt;
15583 
15584   /* Permuting two variable-length vectors could overflow the
15585      index range.  */
15586   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15587     return false;
15588 
15589   if (d->testing_p)
15590     return true;
15591 
15592   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15593   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15594   aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15595   return true;
15596 }
15597 
15598 static bool
15599 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15600 {
15601   /* The pattern matching functions above are written to look for a small
15602      number to begin the sequence (0, 1, N/2).  If we begin with an index
15603      from the second operand, we can swap the operands.  */
15604   poly_int64 nelt = d->perm.length ();
15605   if (known_ge (d->perm[0], nelt))
15606     {
15607       d->perm.rotate_inputs (1);
15608       std::swap (d->op0, d->op1);
15609     }
15610 
15611   if ((d->vec_flags == VEC_ADVSIMD
15612        || d->vec_flags == VEC_SVE_DATA
15613        || d->vec_flags == VEC_SVE_PRED)
15614       && known_gt (nelt, 1))
15615     {
15616       if (aarch64_evpc_rev_local (d))
15617 	return true;
15618       else if (aarch64_evpc_rev_global (d))
15619 	return true;
15620       else if (aarch64_evpc_ext (d))
15621 	return true;
15622       else if (aarch64_evpc_dup (d))
15623 	return true;
15624       else if (aarch64_evpc_zip (d))
15625 	return true;
15626       else if (aarch64_evpc_uzp (d))
15627 	return true;
15628       else if (aarch64_evpc_trn (d))
15629 	return true;
15630       if (d->vec_flags == VEC_SVE_DATA)
15631 	return aarch64_evpc_sve_tbl (d);
15632       else if (d->vec_flags == VEC_ADVSIMD)
15633 	return aarch64_evpc_tbl (d);
15634     }
15635   return false;
15636 }
15637 
15638 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
15639 
15640 static bool
15641 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15642 				  rtx op1, const vec_perm_indices &sel)
15643 {
15644   struct expand_vec_perm_d d;
15645 
15646   /* Check whether the mask can be applied to a single vector.  */
15647   if (op0 && rtx_equal_p (op0, op1))
15648     d.one_vector_p = true;
15649   else if (sel.all_from_input_p (0))
15650     {
15651       d.one_vector_p = true;
15652       op1 = op0;
15653     }
15654   else if (sel.all_from_input_p (1))
15655     {
15656       d.one_vector_p = true;
15657       op0 = op1;
15658     }
15659   else
15660     d.one_vector_p = false;
15661 
15662   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15663 		     sel.nelts_per_input ());
15664   d.vmode = vmode;
15665   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15666   d.target = target;
15667   d.op0 = op0;
15668   d.op1 = op1;
15669   d.testing_p = !target;
15670 
15671   if (!d.testing_p)
15672     return aarch64_expand_vec_perm_const_1 (&d);
15673 
15674   rtx_insn *last = get_last_insn ();
15675   bool ret = aarch64_expand_vec_perm_const_1 (&d);
15676   gcc_assert (last == get_last_insn ());
15677 
15678   return ret;
15679 }
15680 
15681 /* Generate a byte permute mask for a register of mode MODE,
15682    which has NUNITS units.  */
15683 
15684 rtx
15685 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15686 {
15687   /* We have to reverse each vector because we don't have
15688      a permuted load that can reverse-load according to ABI rules.  */
15689   rtx mask;
15690   rtvec v = rtvec_alloc (16);
15691   unsigned int i, j;
15692   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15693 
15694   gcc_assert (BYTES_BIG_ENDIAN);
15695   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15696 
15697   for (i = 0; i < nunits; i++)
15698     for (j = 0; j < usize; j++)
15699       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15700   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15701   return force_reg (V16QImode, mask);
15702 }
15703 
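/* For example, for V8HImode (nunits == 8, unit size 2) the mask built above
   is the byte permutation {1, 0, 3, 2, ..., 15, 14}, which swaps the two
   bytes within each 16-bit element.  */
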
15704 /* Return true if X is a valid second operand for the SVE instruction
15705    that implements integer comparison OP_CODE.  */
15706 
15707 static bool
15708 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15709 {
15710   if (register_operand (x, VOIDmode))
15711     return true;
15712 
15713   switch (op_code)
15714     {
15715     case LTU:
15716     case LEU:
15717     case GEU:
15718     case GTU:
15719       return aarch64_sve_cmp_immediate_p (x, false);
15720     case LT:
15721     case LE:
15722     case GE:
15723     case GT:
15724     case NE:
15725     case EQ:
15726       return aarch64_sve_cmp_immediate_p (x, true);
15727     default:
15728       gcc_unreachable ();
15729     }
15730 }
15731 
15732 /* Return the UNSPEC_COND_* code for comparison CODE.  */
15733 
15734 static unsigned int
15735 aarch64_unspec_cond_code (rtx_code code)
15736 {
15737   switch (code)
15738     {
15739     case NE:
15740       return UNSPEC_COND_NE;
15741     case EQ:
15742       return UNSPEC_COND_EQ;
15743     case LT:
15744       return UNSPEC_COND_LT;
15745     case GT:
15746       return UNSPEC_COND_GT;
15747     case LE:
15748       return UNSPEC_COND_LE;
15749     case GE:
15750       return UNSPEC_COND_GE;
15751     case LTU:
15752       return UNSPEC_COND_LO;
15753     case GTU:
15754       return UNSPEC_COND_HI;
15755     case LEU:
15756       return UNSPEC_COND_LS;
15757     case GEU:
15758       return UNSPEC_COND_HS;
15759     case UNORDERED:
15760       return UNSPEC_COND_UO;
15761     default:
15762       gcc_unreachable ();
15763     }
15764 }
15765 
15766 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15767    where <X> is the operation associated with comparison CODE.  */
15768 
15769 static rtx
15770 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15771 			 rtx pred, rtx op0, rtx op1)
15772 {
15773   rtvec vec = gen_rtvec (3, pred, op0, op1);
15774   return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15775 }
15776 
15777 /* Expand an SVE integer comparison:
15778 
15779      TARGET = CODE (OP0, OP1).  */
15780 
15781 void
15782 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15783 {
15784   machine_mode pred_mode = GET_MODE (target);
15785   machine_mode data_mode = GET_MODE (op0);
15786 
15787   if (!aarch64_sve_cmp_operand_p (code, op1))
15788     op1 = force_reg (data_mode, op1);
15789 
15790   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15791   rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15792   emit_insn (gen_set_clobber_cc (target, unspec));
15793 }
15794 
15795 /* Emit an instruction:
15796 
15797       (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15798 
15799    where <X> is the operation associated with comparison CODE.  */
15800 
15801 static void
15802 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15803 			  rtx pred, rtx op0, rtx op1)
15804 {
15805   rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15806   emit_set_insn (target, unspec);
15807 }
15808 
15809 /* Emit:
15810 
15811       (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15812       (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15813       (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15814 
15815    where <Xi> is the operation associated with comparison CODEi.  */
15816 
15817 static void
15818 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15819 			     machine_mode pred_mode, rtx ptrue,
15820 			     rtx op0, rtx op1)
15821 {
15822   rtx tmp1 = gen_reg_rtx (pred_mode);
15823   aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15824   rtx tmp2 = gen_reg_rtx (pred_mode);
15825   aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15826   emit_set_insn (target, gen_rtx_AND (pred_mode,
15827 				      gen_rtx_IOR (pred_mode, tmp1, tmp2),
15828 				      ptrue));
15829 }
15830 
15831 /* If CAN_INVERT_P, emit an instruction:
15832 
15833       (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15834 
15835    where <X> is the operation associated with comparison CODE.  Otherwise
15836    emit:
15837 
15838       (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15839       (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15840 
15841    where the second instruction sets TARGET to the inverse of TMP.  */
15842 
15843 static void
15844 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15845 				   machine_mode pred_mode, rtx ptrue, rtx pred,
15846 				   rtx op0, rtx op1, bool can_invert_p)
15847 {
15848   if (can_invert_p)
15849     aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15850   else
15851     {
15852       rtx tmp = gen_reg_rtx (pred_mode);
15853       aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15854       emit_set_insn (target, gen_rtx_AND (pred_mode,
15855 					  gen_rtx_NOT (pred_mode, tmp),
15856 					  ptrue));
15857     }
15858 }
15859 
15860 /* Expand an SVE floating-point comparison:
15861 
15862      TARGET = CODE (OP0, OP1)
15863 
15864    If CAN_INVERT_P is true, the caller can also handle inverted results;
15865    return true if the result is in fact inverted.  */
15866 
15867 bool
15868 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15869 				  rtx op0, rtx op1, bool can_invert_p)
15870 {
15871   machine_mode pred_mode = GET_MODE (target);
15872   machine_mode data_mode = GET_MODE (op0);
15873 
15874   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15875   switch (code)
15876     {
15877     case UNORDERED:
15878       /* UNORDERED has no immediate form.  */
15879       op1 = force_reg (data_mode, op1);
15880       aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15881       return false;
15882 
15883     case LT:
15884     case LE:
15885     case GT:
15886     case GE:
15887     case EQ:
15888     case NE:
15889       /* There is native support for the comparison.  */
15890       aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15891       return false;
15892 
15893     case ORDERED:
15894       /* There is native support for the inverse comparison.  */
15895       op1 = force_reg (data_mode, op1);
15896       aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15897 					 pred_mode, ptrue, ptrue, op0, op1,
15898 					 can_invert_p);
15899       return can_invert_p;
15900 
15901     case LTGT:
15902       /* This is a trapping operation (LT or GT).  */
15903       aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15904       return false;
15905 
15906     case UNEQ:
15907       if (!flag_trapping_math)
15908 	{
15909 	  /* This would trap for signaling NaNs.  */
15910 	  op1 = force_reg (data_mode, op1);
15911 	  aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15912 				       pred_mode, ptrue, op0, op1);
15913 	  return false;
15914 	}
15915       /* fall through */
15916 
15917     case UNLT:
15918     case UNLE:
15919     case UNGT:
15920     case UNGE:
15921       {
15922 	rtx ordered = ptrue;
15923 	if (flag_trapping_math)
15924 	  {
15925 	    /* Only compare the elements that are known to be ordered.  */
15926 	    ordered = gen_reg_rtx (pred_mode);
15927 	    op1 = force_reg (data_mode, op1);
15928 	    aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15929 					       ptrue, ptrue, op0, op1, false);
15930 	  }
15931 	if (code == UNEQ)
15932 	  code = NE;
15933 	else
15934 	  code = reverse_condition_maybe_unordered (code);
15935 	aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15936 					   ordered, op0, op1, can_invert_p);
15937 	return can_invert_p;
15938       }
15939 
15940     default:
15941       gcc_unreachable ();
15942     }
15943 }
15944 
15945 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
15946    of the data being selected and CMP_MODE is the mode of the values being
15947    compared.  */
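/* As used below: OPS[0] is the destination, OPS[1] and OPS[2] are the
   values to select between, OPS[3] is the comparison rtx and OPS[4] and
   OPS[5] are its operands.  */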
15948 
15949 void
15950 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15951 			  rtx *ops)
15952 {
15953   machine_mode pred_mode
15954     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15955 			     GET_MODE_SIZE (cmp_mode)).require ();
15956   rtx pred = gen_reg_rtx (pred_mode);
15957   if (FLOAT_MODE_P (cmp_mode))
15958     {
15959       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15960 					    ops[4], ops[5], true))
15961 	std::swap (ops[1], ops[2]);
15962     }
15963   else
15964     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15965 
15966   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15967   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15968 }
15969 
15970 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
15971    true.  However due to issues with register allocation it is preferable
15972    to avoid tying integer scalar and FP scalar modes.  Executing integer
15973    operations in general registers is better than treating them as scalar
15974    vector operations.  This reduces latency and avoids redundant int<->FP
15975    moves.  So tie modes if they are either the same class, or vector modes
15976    with other vector modes, vector structs or any scalar mode.  */
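/* For example, under these rules DImode does not tie with DFmode, while
   (with SIMD support) V2DImode ties with both V4SImode and DImode.  */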
15977 
15978 static bool
15979 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15980 {
15981   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15982     return true;
15983 
15984   /* We specifically want to allow elements of "structure" modes to
15985      be tieable to the structure.  This more general condition allows
15986      other rarer situations too.  The reason we don't extend this to
15987      predicate modes is that there are no predicate structure modes
15988      nor any specific instructions for extracting part of a predicate
15989      register.  */
15990   if (aarch64_vector_data_mode_p (mode1)
15991       && aarch64_vector_data_mode_p (mode2))
15992     return true;
15993 
15994   /* Also allow any scalar modes with vectors.  */
15995   if (aarch64_vector_mode_supported_p (mode1)
15996       || aarch64_vector_mode_supported_p (mode2))
15997     return true;
15998 
15999   return false;
16000 }
16001 
16002 /* Return a new RTX holding the result of moving POINTER forward by
16003    AMOUNT bytes.  */
16004 
16005 static rtx
16006 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16007 {
16008   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16009 
16010   return adjust_automodify_address (pointer, GET_MODE (pointer),
16011 				    next, amount);
16012 }
16013 
16014 /* Return a new RTX holding the result of moving POINTER forward by the
16015    size of the mode it points to.  */
16016 
16017 static rtx
16018 aarch64_progress_pointer (rtx pointer)
16019 {
16020   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16021 }
16022 
16023 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16024    MODE bytes.  */
16025 
16026 static void
16027 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16028 					      machine_mode mode)
16029 {
16030   rtx reg = gen_reg_rtx (mode);
16031 
16032   /* "Cast" the pointers to the correct mode.  */
16033   *src = adjust_address (*src, mode, 0);
16034   *dst = adjust_address (*dst, mode, 0);
16035   /* Emit the memcpy.  */
16036   emit_move_insn (reg, *src);
16037   emit_move_insn (*dst, reg);
16038   /* Move the pointers forward.  */
16039   *src = aarch64_progress_pointer (*src);
16040   *dst = aarch64_progress_pointer (*dst);
16041 }
16042 
16043 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
16044    we succeed, otherwise return false.  */
16045 
16046 bool
16047 aarch64_expand_movmem (rtx *operands)
16048 {
16049   unsigned int n;
16050   rtx dst = operands[0];
16051   rtx src = operands[1];
16052   rtx base;
16053   bool speed_p = !optimize_function_for_size_p (cfun);
16054 
16055   /* When optimizing for size, give a better estimate of the length of a
16056      memcpy call, but use the default otherwise.  */
16057   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16058 
16059   /* We can't do anything smart if the amount to copy is not constant.  */
16060   if (!CONST_INT_P (operands[2]))
16061     return false;
16062 
16063   n = UINTVAL (operands[2]);
16064 
16065   /* Try to keep the number of instructions low.  For cases below 16 bytes we
16066      need to make at most two moves.  For cases above 16 bytes it will be one
16067      move for each 16 byte chunk, then at most two additional moves.  */
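  /* For example, a 35-byte copy needs two 16-byte chunks plus a tail,
     so the formula below gives 2 + 2 = 4 moves.  */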
16068   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16069     return false;
16070 
16071   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16072   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16073 
16074   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16075   src = adjust_automodify_address (src, VOIDmode, base, 0);
16076 
16077   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16078      1-byte chunk.  */
16079   if (n < 4)
16080     {
16081       if (n >= 2)
16082 	{
16083 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16084 	  n -= 2;
16085 	}
16086 
16087       if (n == 1)
16088 	aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16089 
16090       return true;
16091     }
16092 
16093   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
16094      4-byte chunk, partially overlapping with the previously copied chunk.  */
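  /* For example, a 7-byte copy moves bytes 0-3 and then bytes 3-6,
     overlapping on byte 3.  */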
16095   if (n < 8)
16096     {
16097       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16098       n -= 4;
16099       if (n > 0)
16100 	{
16101 	  int move = n - 4;
16102 
16103 	  src = aarch64_move_pointer (src, move);
16104 	  dst = aarch64_move_pointer (dst, move);
16105 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16106 	}
16107       return true;
16108     }
16109 
16110   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
16111      them, then (if applicable) an 8-byte chunk.  */
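  /* For example, a 23-byte copy uses one 16-byte chunk here and then
     finishes with an 8-byte copy of bytes 15-22 below.  */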
16112   while (n >= 8)
16113     {
16114       if (n / 16)
16115 	{
16116 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16117 	  n -= 16;
16118 	}
16119       else
16120 	{
16121 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16122 	  n -= 8;
16123 	}
16124     }
16125 
16126   /* Finish the final bytes of the copy.  We can always do this in one
16127      instruction.  We either copy the exact amount we need, or partially
16128      overlap with the previous chunk we copied and copy 8 bytes.  */
16129   if (n == 0)
16130     return true;
16131   else if (n == 1)
16132     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16133   else if (n == 2)
16134     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16135   else if (n == 4)
16136     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16137   else
16138     {
16139       if (n == 3)
16140 	{
16141 	  src = aarch64_move_pointer (src, -1);
16142 	  dst = aarch64_move_pointer (dst, -1);
16143 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16144 	}
16145       else
16146 	{
16147 	  int move = n - 8;
16148 
16149 	  src = aarch64_move_pointer (src, move);
16150 	  dst = aarch64_move_pointer (dst, move);
16151 	  aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16152 	}
16153     }
16154 
16155   return true;
16156 }
16157 
16158 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16159    SImode stores.  Handle the case when the constant has identical
16160    bottom and top halves.  This is beneficial when the two stores can be
16161    merged into an STP and we avoid synthesising potentially expensive
16162    immediates twice.  Return true if such a split is possible.  */
16163 
16164 bool
16165 aarch64_split_dimode_const_store (rtx dst, rtx src)
16166 {
16167   rtx lo = gen_lowpart (SImode, src);
16168   rtx hi = gen_highpart_mode (SImode, DImode, src);
16169 
16170   bool size_p = optimize_function_for_size_p (cfun);
16171 
16172   if (!rtx_equal_p (lo, hi))
16173     return false;
16174 
16175   unsigned int orig_cost
16176     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16177   unsigned int lo_cost
16178     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16179 
16180   /* We want to transform:
16181      MOV	x1, 49370
16182      MOVK	x1, 0x140, lsl 16
16183      MOVK	x1, 0xc0da, lsl 32
16184      MOVK	x1, 0x140, lsl 48
16185      STR	x1, [x0]
16186    into:
16187      MOV	w1, 49370
16188      MOVK	w1, 0x140, lsl 16
16189      STP	w1, w1, [x0]
16190    So we want to perform this only when we save two instructions
16191    or more.  When optimizing for size, however, accept any code size
16192    savings we can.  */
16193   if (size_p && orig_cost <= lo_cost)
16194     return false;
16195 
16196   if (!size_p
16197       && (orig_cost <= lo_cost + 1))
16198     return false;
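  /* With the example constant above, orig_cost is 4 and lo_cost is 2,
     so the split is performed.  */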
16199 
16200   rtx mem_lo = adjust_address (dst, SImode, 0);
16201   if (!aarch64_mem_pair_operand (mem_lo, SImode))
16202     return false;
16203 
16204   rtx tmp_reg = gen_reg_rtx (SImode);
16205   aarch64_expand_mov_immediate (tmp_reg, lo);
16206   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16207   /* Don't emit an explicit store pair as this may not be always profitable.
16208      Let the sched-fusion logic decide whether to merge them.  */
16209   emit_move_insn (mem_lo, tmp_reg);
16210   emit_move_insn (mem_hi, tmp_reg);
16211 
16212   return true;
16213 }
16214 
16215 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
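/* By default AddressSanitizer forms shadow addresses as (addr >> 3) plus
   this offset, so the AArch64 shadow region is based at 1 << 36.  */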
16216 
16217 static unsigned HOST_WIDE_INT
16218 aarch64_asan_shadow_offset (void)
16219 {
16220   return (HOST_WIDE_INT_1 << 36);
16221 }
16222 
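/* Implement TARGET_GEN_CCMP_FIRST.  Emit the compare insn that starts a
   conditional-compare sequence for CODE applied to TREEOP0 and TREEOP1,
   recording the preparation and compare insns in *PREP_SEQ and *GEN_SEQ.
   Return the comparison rtx to be used by aarch64_gen_ccmp_next, or
   NULL_RTX on failure.  */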
16223 static rtx
16224 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16225 			int code, tree treeop0, tree treeop1)
16226 {
16227   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16228   rtx op0, op1;
16229   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16230   insn_code icode;
16231   struct expand_operand ops[4];
16232 
16233   start_sequence ();
16234   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16235 
16236   op_mode = GET_MODE (op0);
16237   if (op_mode == VOIDmode)
16238     op_mode = GET_MODE (op1);
16239 
16240   switch (op_mode)
16241     {
16242     case E_QImode:
16243     case E_HImode:
16244     case E_SImode:
16245       cmp_mode = SImode;
16246       icode = CODE_FOR_cmpsi;
16247       break;
16248 
16249     case E_DImode:
16250       cmp_mode = DImode;
16251       icode = CODE_FOR_cmpdi;
16252       break;
16253 
16254     case E_SFmode:
16255       cmp_mode = SFmode;
16256       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16257       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16258       break;
16259 
16260     case E_DFmode:
16261       cmp_mode = DFmode;
16262       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16263       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16264       break;
16265 
16266     default:
16267       end_sequence ();
16268       return NULL_RTX;
16269     }
16270 
16271   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16272   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16273   if (!op0 || !op1)
16274     {
16275       end_sequence ();
16276       return NULL_RTX;
16277     }
16278   *prep_seq = get_insns ();
16279   end_sequence ();
16280 
16281   create_fixed_operand (&ops[0], op0);
16282   create_fixed_operand (&ops[1], op1);
16283 
16284   start_sequence ();
16285   if (!maybe_expand_insn (icode, 2, ops))
16286     {
16287       end_sequence ();
16288       return NULL_RTX;
16289     }
16290   *gen_seq = get_insns ();
16291   end_sequence ();
16292 
16293   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16294 			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16295 }
16296 
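/* Implement TARGET_GEN_CCMP_NEXT.  Emit a conditional compare for CMP_CODE
   applied to TREEOP0 and TREEOP1, predicated on PREV, the result of the
   previous compare in the chain.  BIT_CODE is AND or IOR and selects how
   the two conditions are combined.  Return the comparison rtx describing
   the combined result, or NULL_RTX on failure.  */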
16297 static rtx
16298 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16299 		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
16300 {
16301   rtx op0, op1, target;
16302   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16303   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16304   insn_code icode;
16305   struct expand_operand ops[6];
16306   int aarch64_cond;
16307 
16308   push_to_sequence (*prep_seq);
16309   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16310 
16311   op_mode = GET_MODE (op0);
16312   if (op_mode == VOIDmode)
16313     op_mode = GET_MODE (op1);
16314 
16315   switch (op_mode)
16316     {
16317     case E_QImode:
16318     case E_HImode:
16319     case E_SImode:
16320       cmp_mode = SImode;
16321       icode = CODE_FOR_ccmpsi;
16322       break;
16323 
16324     case E_DImode:
16325       cmp_mode = DImode;
16326       icode = CODE_FOR_ccmpdi;
16327       break;
16328 
16329     case E_SFmode:
16330       cmp_mode = SFmode;
16331       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16332       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16333       break;
16334 
16335     case E_DFmode:
16336       cmp_mode = DFmode;
16337       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16338       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16339       break;
16340 
16341     default:
16342       end_sequence ();
16343       return NULL_RTX;
16344     }
16345 
16346   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16347   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16348   if (!op0 || !op1)
16349     {
16350       end_sequence ();
16351       return NULL_RTX;
16352     }
16353   *prep_seq = get_insns ();
16354   end_sequence ();
16355 
16356   target = gen_rtx_REG (cc_mode, CC_REGNUM);
16357   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16358 
16359   if (bit_code != AND)
16360     {
16361       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16362 						GET_MODE (XEXP (prev, 0))),
16363 			     VOIDmode, XEXP (prev, 0), const0_rtx);
16364       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16365     }
16366 
16367   create_fixed_operand (&ops[0], XEXP (prev, 0));
16368   create_fixed_operand (&ops[1], target);
16369   create_fixed_operand (&ops[2], op0);
16370   create_fixed_operand (&ops[3], op1);
16371   create_fixed_operand (&ops[4], prev);
16372   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16373 
16374   push_to_sequence (*gen_seq);
16375   if (!maybe_expand_insn (icode, 6, ops))
16376     {
16377       end_sequence ();
16378       return NULL_RTX;
16379     }
16380 
16381   *gen_seq = get_insns ();
16382   end_sequence ();
16383 
16384   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16385 }
16386 
16387 #undef TARGET_GEN_CCMP_FIRST
16388 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16389 
16390 #undef TARGET_GEN_CCMP_NEXT
16391 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16392 
16393 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
16394    instruction fusion of some sort.  */
16395 
16396 static bool
16397 aarch64_macro_fusion_p (void)
16398 {
16399   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16400 }
16401 
16402 
16403 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
16404    should be kept together during scheduling.  */
16405 
16406 static bool
16407 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16408 {
16409   rtx set_dest;
16410   rtx prev_set = single_set (prev);
16411   rtx curr_set = single_set (curr);
16412   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
16413   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16414 
16415   if (!aarch64_macro_fusion_p ())
16416     return false;
16417 
16418   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16419     {
16420       /* We are trying to match:
16421          prev (mov)  == (set (reg r0) (const_int imm16))
16422          curr (movk) == (set (zero_extract (reg r0)
16423                                            (const_int 16)
16424                                            (const_int 16))
16425                              (const_int imm16_1))  */
16426 
16427       set_dest = SET_DEST (curr_set);
16428 
16429       if (GET_CODE (set_dest) == ZERO_EXTRACT
16430           && CONST_INT_P (SET_SRC (curr_set))
16431           && CONST_INT_P (SET_SRC (prev_set))
16432           && CONST_INT_P (XEXP (set_dest, 2))
16433           && INTVAL (XEXP (set_dest, 2)) == 16
16434           && REG_P (XEXP (set_dest, 0))
16435           && REG_P (SET_DEST (prev_set))
16436           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16437         {
16438           return true;
16439         }
16440     }
16441 
16442   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16443     {
16444 
16445       /*  We're trying to match:
16446           prev (adrp) == (set (reg r1)
16447                               (high (symbol_ref ("SYM"))))
16448           curr (add) == (set (reg r0)
16449                              (lo_sum (reg r1)
16450                                      (symbol_ref ("SYM"))))
16451           Note that r0 need not necessarily be the same as r1, especially
16452           during pre-regalloc scheduling.  */
16453 
16454       if (satisfies_constraint_Ush (SET_SRC (prev_set))
16455           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16456         {
16457           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16458               && REG_P (XEXP (SET_SRC (curr_set), 0))
16459               && REGNO (XEXP (SET_SRC (curr_set), 0))
16460                  == REGNO (SET_DEST (prev_set))
16461               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16462                               XEXP (SET_SRC (curr_set), 1)))
16463             return true;
16464         }
16465     }
16466 
16467   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16468     {
16469 
16470       /* We're trying to match:
16471          prev (movk) == (set (zero_extract (reg r0)
16472                                            (const_int 16)
16473                                            (const_int 32))
16474                              (const_int imm16_1))
16475          curr (movk) == (set (zero_extract (reg r0)
16476                                            (const_int 16)
16477                                            (const_int 48))
16478                              (const_int imm16_2))  */
16479 
16480       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16481           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16482           && REG_P (XEXP (SET_DEST (prev_set), 0))
16483           && REG_P (XEXP (SET_DEST (curr_set), 0))
16484           && REGNO (XEXP (SET_DEST (prev_set), 0))
16485              == REGNO (XEXP (SET_DEST (curr_set), 0))
16486           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16487           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16488           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16489           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16490           && CONST_INT_P (SET_SRC (prev_set))
16491           && CONST_INT_P (SET_SRC (curr_set)))
16492         return true;
16493 
16494     }
16495   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16496     {
16497       /* We're trying to match:
16498           prev (adrp) == (set (reg r0)
16499                               (high (symbol_ref ("SYM"))))
16500           curr (ldr) == (set (reg r1)
16501                              (mem (lo_sum (reg r0)
16502                                              (symbol_ref ("SYM")))))
16503                  or
16504           curr (ldr) == (set (reg r1)
16505                              (zero_extend (mem
16506                                            (lo_sum (reg r0)
16507                                                    (symbol_ref ("SYM"))))))  */
16508       if (satisfies_constraint_Ush (SET_SRC (prev_set))
16509           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16510         {
16511           rtx curr_src = SET_SRC (curr_set);
16512 
16513           if (GET_CODE (curr_src) == ZERO_EXTEND)
16514             curr_src = XEXP (curr_src, 0);
16515 
16516           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16517               && REG_P (XEXP (XEXP (curr_src, 0), 0))
16518               && REGNO (XEXP (XEXP (curr_src, 0), 0))
16519                  == REGNO (SET_DEST (prev_set))
16520               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16521                               XEXP (SET_SRC (prev_set), 0)))
16522             return true;
16523         }
16524     }
16525 
16526   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16527        && aarch_crypto_can_dual_issue (prev, curr))
16528     return true;
16529 
16530   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16531       && any_condjump_p (curr))
16532     {
16533       unsigned int condreg1, condreg2;
16534       rtx cc_reg_1;
16535       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16536       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16537 
16538       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16539 	  && prev
16540 	  && modified_in_p (cc_reg_1, prev))
16541 	{
16542 	  enum attr_type prev_type = get_attr_type (prev);
16543 
16544 	  /* FIXME: this misses some instructions which are considered simple
16545 	     arithmetic for ThunderX.  Simple shifts are missed here.  */
16546 	  if (prev_type == TYPE_ALUS_SREG
16547 	      || prev_type == TYPE_ALUS_IMM
16548 	      || prev_type == TYPE_LOGICS_REG
16549 	      || prev_type == TYPE_LOGICS_IMM)
16550 	    return true;
16551 	}
16552     }
16553 
16554   if (prev_set
16555       && curr_set
16556       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16557       && any_condjump_p (curr))
16558     {
16559       /* We're trying to match:
16560 	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16561 	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
16562 							 (const_int 0))
16563 						 (label_ref ("SYM"))
16564 						 (pc))  */
16565       if (SET_DEST (curr_set) == (pc_rtx)
16566 	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16567 	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16568 	  && REG_P (SET_DEST (prev_set))
16569 	  && REGNO (SET_DEST (prev_set))
16570 	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16571 	{
16572 	  /* Fuse ALU operations followed by conditional branch instruction.  */
16573 	  switch (get_attr_type (prev))
16574 	    {
16575 	    case TYPE_ALU_IMM:
16576 	    case TYPE_ALU_SREG:
16577 	    case TYPE_ADC_REG:
16578 	    case TYPE_ADC_IMM:
16579 	    case TYPE_ADCS_REG:
16580 	    case TYPE_ADCS_IMM:
16581 	    case TYPE_LOGIC_REG:
16582 	    case TYPE_LOGIC_IMM:
16583 	    case TYPE_CSEL:
16584 	    case TYPE_ADR:
16585 	    case TYPE_MOV_IMM:
16586 	    case TYPE_SHIFT_REG:
16587 	    case TYPE_SHIFT_IMM:
16588 	    case TYPE_BFM:
16589 	    case TYPE_RBIT:
16590 	    case TYPE_REV:
16591 	    case TYPE_EXTEND:
16592 	      return true;
16593 
16594 	    default:;
16595 	    }
16596 	}
16597     }
16598 
16599   return false;
16600 }
16601 
16602 /* Return true iff the instruction fusion described by OP is enabled.  */
16603 
16604 bool
16605 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16606 {
16607   return (aarch64_tune_params.fusible_ops & op) != 0;
16608 }
16609 
16610 /* If MEM is in the form of [base+offset], extract the two parts
16611    of the address and store them in BASE and OFFSET; otherwise return
16612    false after clearing BASE and OFFSET.  */
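/* For example, (mem (plus (reg x1) (const_int 16))) gives BASE = x1 and
   OFFSET = 16, while a plain (mem (reg x1)) gives OFFSET = const0_rtx.  */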
16613 
16614 bool
16615 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16616 {
16617   rtx addr;
16618 
16619   gcc_assert (MEM_P (mem));
16620 
16621   addr = XEXP (mem, 0);
16622 
16623   if (REG_P (addr))
16624     {
16625       *base = addr;
16626       *offset = const0_rtx;
16627       return true;
16628     }
16629 
16630   if (GET_CODE (addr) == PLUS
16631       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16632     {
16633       *base = XEXP (addr, 0);
16634       *offset = XEXP (addr, 1);
16635       return true;
16636     }
16637 
16638   *base = NULL_RTX;
16639   *offset = NULL_RTX;
16640 
16641   return false;
16642 }
16643 
16644 /* Types for scheduling fusion.  */
16645 enum sched_fusion_type
16646 {
16647   SCHED_FUSION_NONE = 0,
16648   SCHED_FUSION_LD_SIGN_EXTEND,
16649   SCHED_FUSION_LD_ZERO_EXTEND,
16650   SCHED_FUSION_LD,
16651   SCHED_FUSION_ST,
16652   SCHED_FUSION_NUM
16653 };
16654 
16655 /* If INSN is a load or store whose address is in the form [base+offset],
16656    extract the two parts into BASE and OFFSET.  Return the scheduling
16657    fusion type of this INSN.  */
16658 
16659 static enum sched_fusion_type
16660 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16661 {
16662   rtx x, dest, src;
16663   enum sched_fusion_type fusion = SCHED_FUSION_LD;
16664 
16665   gcc_assert (INSN_P (insn));
16666   x = PATTERN (insn);
16667   if (GET_CODE (x) != SET)
16668     return SCHED_FUSION_NONE;
16669 
16670   src = SET_SRC (x);
16671   dest = SET_DEST (x);
16672 
16673   machine_mode dest_mode = GET_MODE (dest);
16674 
16675   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16676     return SCHED_FUSION_NONE;
16677 
16678   if (GET_CODE (src) == SIGN_EXTEND)
16679     {
16680       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16681       src = XEXP (src, 0);
16682       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16683 	return SCHED_FUSION_NONE;
16684     }
16685   else if (GET_CODE (src) == ZERO_EXTEND)
16686     {
16687       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16688       src = XEXP (src, 0);
16689       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16690 	return SCHED_FUSION_NONE;
16691     }
16692 
16693   if (GET_CODE (src) == MEM && REG_P (dest))
16694     extract_base_offset_in_addr (src, base, offset);
16695   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16696     {
16697       fusion = SCHED_FUSION_ST;
16698       extract_base_offset_in_addr (dest, base, offset);
16699     }
16700   else
16701     return SCHED_FUSION_NONE;
16702 
16703   if (*base == NULL_RTX || *offset == NULL_RTX)
16704     fusion = SCHED_FUSION_NONE;
16705 
16706   return fusion;
16707 }
16708 
16709 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16710 
16711    Currently we only support fusing ldr and str instructions, so FUSION_PRI
16712    and PRI are only calculated for these instructions.  For other instructions,
16713    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16714    types of instruction fusion can be added by returning different priorities.
16715 
16716    It's important that irrelevant instructions get the largest FUSION_PRI.  */
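/* For example, two SCHED_FUSION_LD loads from the same base register get
   the same FUSION_PRI, and the one with the smaller offset gets the larger
   PRI, so it is scheduled first.  */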
16717 
16718 static void
16719 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16720 			       int *fusion_pri, int *pri)
16721 {
16722   int tmp, off_val;
16723   rtx base, offset;
16724   enum sched_fusion_type fusion;
16725 
16726   gcc_assert (INSN_P (insn));
16727 
16728   tmp = max_pri - 1;
16729   fusion = fusion_load_store (insn, &base, &offset);
16730   if (fusion == SCHED_FUSION_NONE)
16731     {
16732       *pri = tmp;
16733       *fusion_pri = tmp;
16734       return;
16735     }
16736 
16737   /* Set FUSION_PRI according to fusion type and base register.  */
16738   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16739 
16740   /* Calculate PRI.  */
16741   tmp /= 2;
16742 
16743   /* INSN with smaller offset goes first.  */
16744   off_val = (int)(INTVAL (offset));
16745   if (off_val >= 0)
16746     tmp -= (off_val & 0xfffff);
16747   else
16748     tmp += ((- off_val) & 0xfffff);
16749 
16750   *pri = tmp;
16751   return;
16752 }
16753 
16754 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16755    Adjust priority of sha1h instructions so they are scheduled before
16756    other SHA1 instructions.  */
16757 
16758 static int
16759 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16760 {
16761   rtx x = PATTERN (insn);
16762 
16763   if (GET_CODE (x) == SET)
16764     {
16765       x = SET_SRC (x);
16766 
16767       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16768 	return priority + 10;
16769     }
16770 
16771   return priority;
16772 }
16773 
16774 /* Given OPERANDS of consecutive load/store, check if we can merge
16775    them into ldp/stp.  LOAD is true if they are load instructions.
16776    MODE is the mode of memory operands.  */
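/* For example, the consecutive loads

     ldr  w0, [x2]
     ldr  w1, [x2, 4]

   can be merged into "ldp w0, w1, [x2]" if the checks below pass.  */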
16777 
16778 bool
16779 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16780 				machine_mode mode)
16781 {
16782   HOST_WIDE_INT offval_1, offval_2, msize;
16783   enum reg_class rclass_1, rclass_2;
16784   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16785 
16786   if (load)
16787     {
16788       mem_1 = operands[1];
16789       mem_2 = operands[3];
16790       reg_1 = operands[0];
16791       reg_2 = operands[2];
16792       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16793       if (REGNO (reg_1) == REGNO (reg_2))
16794 	return false;
16795     }
16796   else
16797     {
16798       mem_1 = operands[0];
16799       mem_2 = operands[2];
16800       reg_1 = operands[1];
16801       reg_2 = operands[3];
16802     }
16803 
16804   /* The mems cannot be volatile.  */
16805   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16806     return false;
16807 
16808   /* If we have SImode and slow unaligned ldp,
16809      check that the alignment is at least 8 bytes.  */
16810   if (mode == SImode
16811       && (aarch64_tune_params.extra_tuning_flags
16812           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16813       && !optimize_size
16814       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16815     return false;
16816 
16817   /* Check if the addresses are in the form of [base+offset].  */
16818   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16819   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16820     return false;
16821   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16822   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16823     return false;
16824 
16825   /* Check if the bases are same.  */
16826   if (!rtx_equal_p (base_1, base_2))
16827     return false;
16828 
16829   offval_1 = INTVAL (offset_1);
16830   offval_2 = INTVAL (offset_2);
16831   /* We should only be trying this for fixed-sized modes.  There is no
16832      SVE LDP/STP instruction.  */
16833   msize = GET_MODE_SIZE (mode).to_constant ();
16834   /* Check if the offsets are consecutive.  */
16835   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16836     return false;
16837 
16838   /* Check if the addresses are clobbered by load.  */
16839   if (load)
16840     {
16841       if (reg_mentioned_p (reg_1, mem_1))
16842 	return false;
16843 
16844       /* In increasing order, the last load can clobber the address.  */
16845       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16846 	return false;
16847     }
16848 
16849   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16850     rclass_1 = FP_REGS;
16851   else
16852     rclass_1 = GENERAL_REGS;
16853 
16854   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16855     rclass_2 = FP_REGS;
16856   else
16857     rclass_2 = GENERAL_REGS;
16858 
16859   /* Check if the registers are of same class.  */
16860   if (rclass_1 != rclass_2)
16861     return false;
16862 
16863   return true;
16864 }
16865 
16866 /* Given OPERANDS of consecutive load/store, check if we can merge
16867    them into ldp/stp by adjusting the offset.  LOAD is true if they
16868    are load instructions.  MODE is the mode of memory operands.
16869 
16870    Given below consecutive stores:
16871 
16872      str  w1, [xb, 0x100]
16873      str  w1, [xb, 0x104]
16874      str  w1, [xb, 0x108]
16875      str  w1, [xb, 0x10c]
16876 
16877    Though the offsets are out of the range supported by stp, we can
16878    still pair them after adjusting the offset, like:
16879 
16880      add  scratch, xb, 0x100
16881      stp  w1, w1, [scratch]
16882      stp  w1, w1, [scratch, 0x8]
16883 
16884    The peephole patterns detecting this opportunity should guarantee
16885    the scratch register is available.  */
16886 
16887 bool
16888 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16889 				       scalar_mode mode)
16890 {
16891   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16892   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16893   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16894   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16895 
16896   if (load)
16897     {
16898       reg_1 = operands[0];
16899       mem_1 = operands[1];
16900       reg_2 = operands[2];
16901       mem_2 = operands[3];
16902       reg_3 = operands[4];
16903       mem_3 = operands[5];
16904       reg_4 = operands[6];
16905       mem_4 = operands[7];
16906       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16907 		  && REG_P (reg_3) && REG_P (reg_4));
16908       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16909 	return false;
16910     }
16911   else
16912     {
16913       mem_1 = operands[0];
16914       reg_1 = operands[1];
16915       mem_2 = operands[2];
16916       reg_2 = operands[3];
16917       mem_3 = operands[4];
16918       reg_3 = operands[5];
16919       mem_4 = operands[6];
16920       reg_4 = operands[7];
16921     }
16922   /* Skip if the memory operand is by itself valid for ldp/stp.  */
16923   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16924     return false;
16925 
16926   /* The mems cannot be volatile.  */
16927   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16928       || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16929     return false;
16930 
16931   /* Check if the addresses are in the form of [base+offset].  */
16932   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16933   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16934     return false;
16935   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16936   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16937     return false;
16938   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16939   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16940     return false;
16941   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16942   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16943     return false;
16944 
16945   /* Check if the bases are same.  */
16946   if (!rtx_equal_p (base_1, base_2)
16947       || !rtx_equal_p (base_2, base_3)
16948       || !rtx_equal_p (base_3, base_4))
16949     return false;
16950 
16951   offval_1 = INTVAL (offset_1);
16952   offval_2 = INTVAL (offset_2);
16953   offval_3 = INTVAL (offset_3);
16954   offval_4 = INTVAL (offset_4);
16955   msize = GET_MODE_SIZE (mode);
16956   /* Check if the offsets are consecutive.  */
16957   if ((offval_1 != (offval_2 + msize)
16958        || offval_1 != (offval_3 + msize * 2)
16959        || offval_1 != (offval_4 + msize * 3))
16960       && (offval_4 != (offval_3 + msize)
16961 	  || offval_4 != (offval_2 + msize * 2)
16962 	  || offval_4 != (offval_1 + msize * 3)))
16963     return false;
16964 
16965   /* Check if the addresses are clobbered by load.  */
16966   if (load)
16967     {
16968       if (reg_mentioned_p (reg_1, mem_1)
16969 	  || reg_mentioned_p (reg_2, mem_2)
16970 	  || reg_mentioned_p (reg_3, mem_3))
16971 	return false;
16972 
16973       /* In increasing order, the last load can clobber the address.  */
16974       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16975 	return false;
16976     }
16977 
16978   /* If we have SImode and slow unaligned ldp,
16979      check that the alignment is at least 8 bytes.  */
16980   if (mode == SImode
16981       && (aarch64_tune_params.extra_tuning_flags
16982           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16983       && !optimize_size
16984       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16985     return false;
16986 
16987   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16988     rclass_1 = FP_REGS;
16989   else
16990     rclass_1 = GENERAL_REGS;
16991 
16992   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16993     rclass_2 = FP_REGS;
16994   else
16995     rclass_2 = GENERAL_REGS;
16996 
16997   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16998     rclass_3 = FP_REGS;
16999   else
17000     rclass_3 = GENERAL_REGS;
17001 
17002   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17003     rclass_4 = FP_REGS;
17004   else
17005     rclass_4 = GENERAL_REGS;
17006 
17007   /* Check if the registers are of same class.  */
17008   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17009     return false;
17010 
17011   return true;
17012 }
17013 
17014 /* Given OPERANDS of consecutive load/store, this function pairs them
17015    into ldp/stp after adjusting the offset.  It depends on the fact
17016    that addresses of load/store instructions are in increasing order.
17017    MODE is the mode of memory operands.  CODE is the rtl operator
17018    which should be applied to all memory operands, it's SIGN_EXTEND,
17019    ZERO_EXTEND or UNKNOWN.  */
17020 
17021 bool
17022 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17023 			     scalar_mode mode, RTX_CODE code)
17024 {
17025   rtx base, offset, t1, t2;
17026   rtx mem_1, mem_2, mem_3, mem_4;
17027   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
17028 
17029   if (load)
17030     {
17031       mem_1 = operands[1];
17032       mem_2 = operands[3];
17033       mem_3 = operands[5];
17034       mem_4 = operands[7];
17035     }
17036   else
17037     {
17038       mem_1 = operands[0];
17039       mem_2 = operands[2];
17040       mem_3 = operands[4];
17041       mem_4 = operands[6];
17042       gcc_assert (code == UNKNOWN);
17043     }
17044 
17045   extract_base_offset_in_addr (mem_1, &base, &offset);
17046   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
17047 
17048   /* Adjust the offset so that it fits in an ldp/stp instruction.  */
17049   msize = GET_MODE_SIZE (mode);
17050   stp_off_limit = msize * 0x40;
17051   off_val = INTVAL (offset);
17052   abs_off = (off_val < 0) ? -off_val : off_val;
17053   new_off = abs_off % stp_off_limit;
17054   adj_off = abs_off - new_off;
17055 
17056   /* Further adjust to make sure all offsets are OK.  */
17057   if ((new_off + msize * 2) >= stp_off_limit)
17058     {
17059       adj_off += stp_off_limit;
17060       new_off -= stp_off_limit;
17061     }
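  /* For the SImode stores at 0x100..0x10c in the example further up,
     stp_off_limit is 0x100, so adj_off becomes 0x100 and new_off becomes 0,
     giving the "add scratch, xb, 0x100" sequence shown there.  */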
17062 
17063   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
17064   if (adj_off >= 0x1000)
17065     return false;
17066 
17067   if (off_val < 0)
17068     {
17069       adj_off = -adj_off;
17070       new_off = -new_off;
17071     }
17072 
17073   /* Create new memory references.  */
17074   mem_1 = change_address (mem_1, VOIDmode,
17075 			  plus_constant (DImode, operands[8], new_off));
17076 
17077   /* Check if the adjusted address is OK for ldp/stp.  */
17078   if (!aarch64_mem_pair_operand (mem_1, mode))
17079     return false;
17080 
17081   msize = GET_MODE_SIZE (mode);
17082   mem_2 = change_address (mem_2, VOIDmode,
17083 			  plus_constant (DImode,
17084 					 operands[8],
17085 					 new_off + msize));
17086   mem_3 = change_address (mem_3, VOIDmode,
17087 			  plus_constant (DImode,
17088 					 operands[8],
17089 					 new_off + msize * 2));
17090   mem_4 = change_address (mem_4, VOIDmode,
17091 			  plus_constant (DImode,
17092 					 operands[8],
17093 					 new_off + msize * 3));
17094 
17095   if (code == ZERO_EXTEND)
17096     {
17097       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17098       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17099       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17100       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17101     }
17102   else if (code == SIGN_EXTEND)
17103     {
17104       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17105       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17106       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17107       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17108     }
17109 
17110   if (load)
17111     {
17112       operands[1] = mem_1;
17113       operands[3] = mem_2;
17114       operands[5] = mem_3;
17115       operands[7] = mem_4;
17116     }
17117   else
17118     {
17119       operands[0] = mem_1;
17120       operands[2] = mem_2;
17121       operands[4] = mem_3;
17122       operands[6] = mem_4;
17123     }
17124 
17125   /* Emit adjusting instruction.  */
17126   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17127   /* Emit ldp/stp instructions.  */
17128   t1 = gen_rtx_SET (operands[0], operands[1]);
17129   t2 = gen_rtx_SET (operands[2], operands[3]);
17130   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17131   t1 = gen_rtx_SET (operands[4], operands[5]);
17132   t2 = gen_rtx_SET (operands[6], operands[7]);
17133   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17134   return true;
17135 }
17136 
17137 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
17138    it isn't worth branching around empty masked ops (including masked
17139    stores).  */
17140 
17141 static bool
17142 aarch64_empty_mask_is_expensive (unsigned)
17143 {
17144   return false;
17145 }
17146 
17147 /* Return true if a pseudo register should be created and used to hold
17148    the GOT address for PIC code.  */
17149 
17150 bool
17151 aarch64_use_pseudo_pic_reg (void)
17152 {
17153   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17154 }
17155 
17156 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
17157 
17158 static int
17159 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17160 {
17161   switch (XINT (x, 1))
17162     {
17163     case UNSPEC_GOTSMALLPIC:
17164     case UNSPEC_GOTSMALLPIC28K:
17165     case UNSPEC_GOTTINYPIC:
17166       return 0;
17167     default:
17168       break;
17169     }
17170 
17171   return default_unspec_may_trap_p (x, flags);
17172 }
17173 
17174 
17175 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17176    return the log2 of that value.  Otherwise return -1.  */
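/* For example, 4.0 gives 2 and 1.0 gives 0, while 0.5, -2.0 and 3.0
   all give -1.  */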
17177 
17178 int
17179 aarch64_fpconst_pow_of_2 (rtx x)
17180 {
17181   const REAL_VALUE_TYPE *r;
17182 
17183   if (!CONST_DOUBLE_P (x))
17184     return -1;
17185 
17186   r = CONST_DOUBLE_REAL_VALUE (x);
17187 
17188   if (REAL_VALUE_NEGATIVE (*r)
17189       || REAL_VALUE_ISNAN (*r)
17190       || REAL_VALUE_ISINF (*r)
17191       || !real_isinteger (r, DFmode))
17192     return -1;
17193 
17194   return exact_log2 (real_to_integer (r));
17195 }
17196 
17197 /* If X is a vector of equal CONST_DOUBLE values and that value is
17198    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
17199 
17200 int
17201 aarch64_vec_fpconst_pow_of_2 (rtx x)
17202 {
17203   int nelts;
17204   if (GET_CODE (x) != CONST_VECTOR
17205       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17206     return -1;
17207 
17208   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17209     return -1;
17210 
17211   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17212   if (firstval <= 0)
17213     return -1;
17214 
17215   for (int i = 1; i < nelts; i++)
17216     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17217       return -1;
17218 
17219   return firstval;
17220 }
17221 
17222 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17223    to float.
17224 
17225    __fp16 always promotes through this hook.
17226    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17227    through the generic excess precision logic rather than here.  */
17228 
17229 static tree
17230 aarch64_promoted_type (const_tree t)
17231 {
17232   if (SCALAR_FLOAT_TYPE_P (t)
17233       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17234     return float_type_node;
17235 
17236   return NULL_TREE;
17237 }
17238 
17239 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
17240 
17241 static bool
17242 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17243 			   optimization_type opt_type)
17244 {
17245   switch (op)
17246     {
17247     case rsqrt_optab:
17248       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17249 
17250     default:
17251       return true;
17252     }
17253 }
17254 
17255 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
17256 
17257 static unsigned int
17258 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17259 					int *offset)
17260 {
17261   /* Polynomial invariant 1 == (VG / 2) - 1.  */
17262   gcc_assert (i == 1);
17263   *factor = 2;
17264   *offset = 1;
17265   return AARCH64_DWARF_VG;
17266 }
17267 
17268 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17269    if MODE is HFmode, and punt to the generic implementation otherwise.  */
17270 
17271 static bool
17272 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17273 {
17274   return (mode == HFmode
17275 	  ? true
17276 	  : default_libgcc_floating_mode_supported_p (mode));
17277 }
17278 
17279 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17280    if MODE is HFmode, and punt to the generic implementation otherwise.  */
17281 
17282 static bool
17283 aarch64_scalar_mode_supported_p (scalar_mode mode)
17284 {
17285   return (mode == HFmode
17286 	  ? true
17287 	  : default_scalar_mode_supported_p (mode));
17288 }
17289 
17290 /* Set the value of FLT_EVAL_METHOD.
17291    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17292 
17293     0: evaluate all operations and constants, whose semantic type has at
17294        most the range and precision of type float, to the range and
17295        precision of float; evaluate all other operations and constants to
17296        the range and precision of the semantic type;
17297 
17298     N, where _FloatN is a supported interchange floating type
17299        evaluate all operations and constants, whose semantic type has at
17300        most the range and precision of _FloatN type, to the range and
17301        precision of the _FloatN type; evaluate all other operations and
17302        constants to the range and precision of the semantic type;
17303 
17304    If we have the ARMv8.2-A extensions then we support _Float16 in native
17305    precision, so we should set this to 16.  Otherwise, we support the type,
17306    but want to evaluate expressions in float precision, so set this to
17307    0.  */
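/* For example, with -march=armv8.2-a+fp16 an expression such as x * y on
   _Float16 operands is evaluated directly in _Float16, while without the
   extension it is evaluated in float.  */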
17308 
17309 static enum flt_eval_method
17310 aarch64_excess_precision (enum excess_precision_type type)
17311 {
17312   switch (type)
17313     {
17314       case EXCESS_PRECISION_TYPE_FAST:
17315       case EXCESS_PRECISION_TYPE_STANDARD:
17316 	/* We can calculate either in 16-bit range and precision or
17317 	   32-bit range and precision.  Make that decision based on whether
17318 	   we have native support for the ARMv8.2-A 16-bit floating-point
17319 	   instructions or not.  */
17320 	return (TARGET_FP_F16INST
17321 		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17322 		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17323       case EXCESS_PRECISION_TYPE_IMPLICIT:
17324 	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17325       default:
17326 	gcc_unreachable ();
17327     }
17328   return FLT_EVAL_METHOD_UNPREDICTABLE;
17329 }
17330 
17331 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
17332    scheduled for speculative execution.  Reject the long-running division
17333    and square-root instructions.  */
17334 
17335 static bool
17336 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17337 {
17338   switch (get_attr_type (insn))
17339     {
17340       case TYPE_SDIV:
17341       case TYPE_UDIV:
17342       case TYPE_FDIVS:
17343       case TYPE_FDIVD:
17344       case TYPE_FSQRTS:
17345       case TYPE_FSQRTD:
17346       case TYPE_NEON_FP_SQRT_S:
17347       case TYPE_NEON_FP_SQRT_D:
17348       case TYPE_NEON_FP_SQRT_S_Q:
17349       case TYPE_NEON_FP_SQRT_D_Q:
17350       case TYPE_NEON_FP_DIV_S:
17351       case TYPE_NEON_FP_DIV_D:
17352       case TYPE_NEON_FP_DIV_S_Q:
17353       case TYPE_NEON_FP_DIV_D_Q:
17354 	return false;
17355       default:
17356 	return true;
17357     }
17358 }
17359 
17360 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
17361 
17362 static int
17363 aarch64_compute_pressure_classes (reg_class *classes)
17364 {
17365   int i = 0;
17366   classes[i++] = GENERAL_REGS;
17367   classes[i++] = FP_REGS;
17368   /* PR_REGS isn't a useful pressure class because many predicate pseudo
17369      registers need to go in PR_LO_REGS at some point during their
17370      lifetime.  Splitting it into two halves has the effect of making
17371      all predicates count against PR_LO_REGS, so that we try whenever
17372      possible to restrict the number of live predicates to 8.  This
17373      greatly reduces the amount of spilling in certain loops.  */
17374   classes[i++] = PR_LO_REGS;
17375   classes[i++] = PR_HI_REGS;
17376   return i;
17377 }
17378 
17379 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
17380 
17381 static bool
17382 aarch64_can_change_mode_class (machine_mode from,
17383 			       machine_mode to, reg_class_t)
17384 {
17385   if (BYTES_BIG_ENDIAN)
17386     {
17387       bool from_sve_p = aarch64_sve_data_mode_p (from);
17388       bool to_sve_p = aarch64_sve_data_mode_p (to);
17389 
17390       /* Don't allow changes between SVE data modes and non-SVE modes.
17391 	 See the comment at the head of aarch64-sve.md for details.  */
17392       if (from_sve_p != to_sve_p)
17393 	return false;
17394 
17395       /* Don't allow changes in element size: lane 0 of the new vector
17396 	 would not then be lane 0 of the old vector.  See the comment
17397 	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17398 	 description.
17399 
17400 	 In the worst case, this forces a register to be spilled in
17401 	 one mode and reloaded in the other, which handles the
17402 	 endianness correctly.  */
17403       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17404 	return false;
17405     }
17406   return true;
17407 }
17408 
17409 /* Implement TARGET_EARLY_REMAT_MODES.  */
17410 
17411 static void
17412 aarch64_select_early_remat_modes (sbitmap modes)
17413 {
17414   /* SVE values are not normally live across a call, so it should be
17415      worth doing early rematerialization even in VL-specific mode.  */
17416   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17417     {
17418       machine_mode mode = (machine_mode) i;
17419       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17420       if (vec_flags & VEC_ANY_SVE)
17421 	bitmap_set_bit (modes, i);
17422     }
17423 }
17424 
17425 /* Target-specific selftests.  */
17426 
17427 #if CHECKING_P
17428 
17429 namespace selftest {
17430 
17431 /* Selftest for the RTL loader.
17432    Verify that the RTL loader copes with a dump from
17433    print_rtx_function.  This is essentially just a test that class
17434    function_reader can handle a real dump, but it also verifies
17435    that lookup_reg_by_dump_name correctly handles hard regs.
17436    The presence of hard reg names in the dump means that the test is
17437    target-specific, hence it is in this file.  */
17438 
17439 static void
17440 aarch64_test_loading_full_dump ()
17441 {
17442   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17443 
17444   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17445 
17446   rtx_insn *insn_1 = get_insn_by_uid (1);
17447   ASSERT_EQ (NOTE, GET_CODE (insn_1));
17448 
17449   rtx_insn *insn_15 = get_insn_by_uid (15);
17450   ASSERT_EQ (INSN, GET_CODE (insn_15));
17451   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17452 
17453   /* Verify crtl->return_rtx.  */
17454   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17455   ASSERT_EQ (0, REGNO (crtl->return_rtx));
17456   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17457 }
17458 
17459 /* Run all target-specific selftests.  */
17460 
17461 static void
17462 aarch64_run_selftests (void)
17463 {
17464   aarch64_test_loading_full_dump ();
17465 }
17466 
17467 } // namespace selftest
17468 
17469 #endif /* #if CHECKING_P */
17470 
17471 #undef TARGET_ADDRESS_COST
17472 #define TARGET_ADDRESS_COST aarch64_address_cost
17473 
17474 /* This hook determines whether unnamed bitfields affect the alignment
17475    of the containing structure.  The hook returns true if the structure
17476    should inherit the alignment requirements of an unnamed bitfield's
17477    type.  */
17478 #undef TARGET_ALIGN_ANON_BITFIELD
17479 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
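/* A rough illustration, assuming AAPCS64 layout rules: because this hook
   returns true, an unnamed bit-field contributes its declared type's
   alignment to the enclosing aggregate, so for example

     struct example { char c; int : 4; };

   takes the 4-byte alignment of int rather than the 1-byte alignment of
   its only named member.  */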
17480 
17481 #undef TARGET_ASM_ALIGNED_DI_OP
17482 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17483 
17484 #undef TARGET_ASM_ALIGNED_HI_OP
17485 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17486 
17487 #undef TARGET_ASM_ALIGNED_SI_OP
17488 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17489 
17490 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17491 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17492   hook_bool_const_tree_hwi_hwi_const_tree_true
17493 
17494 #undef TARGET_ASM_FILE_START
17495 #define TARGET_ASM_FILE_START aarch64_start_file
17496 
17497 #undef TARGET_ASM_OUTPUT_MI_THUNK
17498 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17499 
17500 #undef TARGET_ASM_SELECT_RTX_SECTION
17501 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17502 
17503 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17504 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17505 
17506 #undef TARGET_BUILD_BUILTIN_VA_LIST
17507 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17508 
17509 #undef TARGET_CALLEE_COPIES
17510 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17511 
17512 #undef TARGET_CAN_ELIMINATE
17513 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17514 
17515 #undef TARGET_CAN_INLINE_P
17516 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17517 
17518 #undef TARGET_CANNOT_FORCE_CONST_MEM
17519 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17520 
17521 #undef TARGET_CASE_VALUES_THRESHOLD
17522 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17523 
17524 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17525 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17526 
17527 /* Only the least significant bit is used for initialization guard
17528    variables.  */
17529 #undef TARGET_CXX_GUARD_MASK_BIT
17530 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
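/* A minimal sketch of the resulting guard check, assuming the Itanium
   C++ ABI guard functions: the inline fast path tests only bit 0,
   roughly

     if ((guard & 1) == 0 && __cxa_guard_acquire (&guard))
       {
         ... run the one-time initializer ...
         __cxa_guard_release (&guard);
       }

   rather than comparing the whole first byte of the guard with zero.  */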
17531 
17532 #undef TARGET_C_MODE_FOR_SUFFIX
17533 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17534 
17535 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17536 #undef  TARGET_DEFAULT_TARGET_FLAGS
17537 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17538 #endif
17539 
17540 #undef TARGET_CLASS_MAX_NREGS
17541 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17542 
17543 #undef TARGET_BUILTIN_DECL
17544 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17545 
17546 #undef TARGET_BUILTIN_RECIPROCAL
17547 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17548 
17549 #undef TARGET_C_EXCESS_PRECISION
17550 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17551 
17552 #undef  TARGET_EXPAND_BUILTIN
17553 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17554 
17555 #undef TARGET_EXPAND_BUILTIN_VA_START
17556 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17557 
17558 #undef TARGET_FOLD_BUILTIN
17559 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17560 
17561 #undef TARGET_FUNCTION_ARG
17562 #define TARGET_FUNCTION_ARG aarch64_function_arg
17563 
17564 #undef TARGET_FUNCTION_ARG_ADVANCE
17565 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17566 
17567 #undef TARGET_FUNCTION_ARG_BOUNDARY
17568 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17569 
17570 #undef TARGET_FUNCTION_ARG_PADDING
17571 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17572 
17573 #undef TARGET_GET_RAW_RESULT_MODE
17574 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17575 #undef TARGET_GET_RAW_ARG_MODE
17576 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17577 
17578 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17579 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17580 
17581 #undef TARGET_FUNCTION_VALUE
17582 #define TARGET_FUNCTION_VALUE aarch64_function_value
17583 
17584 #undef TARGET_FUNCTION_VALUE_REGNO_P
17585 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17586 
17587 #undef TARGET_GIMPLE_FOLD_BUILTIN
17588 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17589 
17590 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17591 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17592 
17593 #undef  TARGET_INIT_BUILTINS
17594 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
17595 
17596 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17597 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17598   aarch64_ira_change_pseudo_allocno_class
17599 
17600 #undef TARGET_LEGITIMATE_ADDRESS_P
17601 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17602 
17603 #undef TARGET_LEGITIMATE_CONSTANT_P
17604 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17605 
17606 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17607 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17608   aarch64_legitimize_address_displacement
17609 
17610 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17611 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17612 
17613 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17614 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17615   aarch64_libgcc_floating_mode_supported_p
17616 
17617 #undef TARGET_MANGLE_TYPE
17618 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17619 
17620 #undef TARGET_MEMORY_MOVE_COST
17621 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17622 
17623 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17624 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17625 
17626 #undef TARGET_MUST_PASS_IN_STACK
17627 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17628 
17629 /* This target hook should return true if accesses to volatile bitfields
17630    should use the narrowest mode possible.  It should return false if these
17631    accesses should use the bitfield container type.  */
17632 #undef TARGET_NARROW_VOLATILE_BITFIELD
17633 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
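/* Illustrative consequence, assuming a 32-bit container type: given

     struct status { volatile unsigned int flag : 1; };

   a read of the FLAG bit-field is done with a load of the whole
   unsigned int container rather than a single byte, matching the
   AAPCS64 treatment of volatile bit-fields.  */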
17634 
17635 #undef  TARGET_OPTION_OVERRIDE
17636 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17637 
17638 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17639 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17640   aarch64_override_options_after_change
17641 
17642 #undef TARGET_OPTION_SAVE
17643 #define TARGET_OPTION_SAVE aarch64_option_save
17644 
17645 #undef TARGET_OPTION_RESTORE
17646 #define TARGET_OPTION_RESTORE aarch64_option_restore
17647 
17648 #undef TARGET_OPTION_PRINT
17649 #define TARGET_OPTION_PRINT aarch64_option_print
17650 
17651 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17652 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17653 
17654 #undef TARGET_SET_CURRENT_FUNCTION
17655 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17656 
17657 #undef TARGET_PASS_BY_REFERENCE
17658 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17659 
17660 #undef TARGET_PREFERRED_RELOAD_CLASS
17661 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17662 
17663 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17664 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17665 
17666 #undef TARGET_PROMOTED_TYPE
17667 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17668 
17669 #undef TARGET_SECONDARY_RELOAD
17670 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17671 
17672 #undef TARGET_SHIFT_TRUNCATION_MASK
17673 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17674 
17675 #undef TARGET_SETUP_INCOMING_VARARGS
17676 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17677 
17678 #undef TARGET_STRUCT_VALUE_RTX
17679 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
17680 
17681 #undef TARGET_REGISTER_MOVE_COST
17682 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17683 
17684 #undef TARGET_RETURN_IN_MEMORY
17685 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17686 
17687 #undef TARGET_RETURN_IN_MSB
17688 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17689 
17690 #undef TARGET_RTX_COSTS
17691 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17692 
17693 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17694 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17695 
17696 #undef TARGET_SCHED_ISSUE_RATE
17697 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17698 
17699 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17700 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17701   aarch64_sched_first_cycle_multipass_dfa_lookahead
17702 
17703 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17704 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17705   aarch64_first_cycle_multipass_dfa_lookahead_guard
17706 
17707 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17708 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17709   aarch64_get_separate_components
17710 
17711 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17712 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17713   aarch64_components_for_bb
17714 
17715 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17716 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17717   aarch64_disqualify_components
17718 
17719 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17720 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17721   aarch64_emit_prologue_components
17722 
17723 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17724 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17725   aarch64_emit_epilogue_components
17726 
17727 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17728 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17729   aarch64_set_handled_components
17730 
17731 #undef TARGET_TRAMPOLINE_INIT
17732 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17733 
17734 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17735 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17736 
17737 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17738 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17739 
17740 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17741 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17742   aarch64_builtin_support_vector_misalignment
17743 
17744 #undef TARGET_ARRAY_MODE
17745 #define TARGET_ARRAY_MODE aarch64_array_mode
17746 
17747 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17748 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17749 
17750 #undef TARGET_VECTORIZE_ADD_STMT_COST
17751 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17752 
17753 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17754 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17755   aarch64_builtin_vectorization_cost
17756 
17757 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17758 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17759 
17760 #undef TARGET_VECTORIZE_BUILTINS
17761 #define TARGET_VECTORIZE_BUILTINS
17762 
17763 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17764 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17765   aarch64_builtin_vectorized_function
17766 
17767 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17768 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17769   aarch64_autovectorize_vector_sizes
17770 
17771 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17772 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17773   aarch64_atomic_assign_expand_fenv
17774 
17775 /* Section anchor support.  */
17776 
17777 #undef TARGET_MIN_ANCHOR_OFFSET
17778 #define TARGET_MIN_ANCHOR_OFFSET -256
17779 
17780 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17781    byte offset; we can do much more for larger data types, but have no way
17782    to determine the size of the access.  We assume accesses are aligned.  */
17783 #undef TARGET_MAX_ANCHOR_OFFSET
17784 #define TARGET_MAX_ANCHOR_OFFSET 4095
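/* For illustration, assuming the unsigned-immediate form of a byte load:

     ldrb  w0, [x1, #4095]

   encodes the largest offset available for single-byte accesses, which
   is where the 4k-1 limit above comes from.  */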
17785 
17786 #undef TARGET_VECTOR_ALIGNMENT
17787 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17788 
17789 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17790 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17791   aarch64_vectorize_preferred_vector_alignment
17792 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17793 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17794   aarch64_simd_vector_alignment_reachable
17795 
17796 /* vec_perm support.  */
17797 
17798 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17799 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17800   aarch64_vectorize_vec_perm_const
17801 
17802 #undef TARGET_VECTORIZE_GET_MASK_MODE
17803 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17804 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17805 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17806   aarch64_empty_mask_is_expensive
17807 
17808 #undef TARGET_INIT_LIBFUNCS
17809 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17810 
17811 #undef TARGET_FIXED_CONDITION_CODE_REGS
17812 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17813 
17814 #undef TARGET_FLAGS_REGNUM
17815 #define TARGET_FLAGS_REGNUM CC_REGNUM
17816 
17817 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17818 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17819 
17820 #undef TARGET_ASAN_SHADOW_OFFSET
17821 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17822 
17823 #undef TARGET_LEGITIMIZE_ADDRESS
17824 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17825 
17826 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17827 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17828 
17829 #undef TARGET_CAN_USE_DOLOOP_P
17830 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17831 
17832 #undef TARGET_SCHED_ADJUST_PRIORITY
17833 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17834 
17835 #undef TARGET_SCHED_MACRO_FUSION_P
17836 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17837 
17838 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17839 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17840 
17841 #undef TARGET_SCHED_FUSION_PRIORITY
17842 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17843 
17844 #undef TARGET_UNSPEC_MAY_TRAP_P
17845 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17846 
17847 #undef TARGET_USE_PSEUDO_PIC_REG
17848 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17849 
17850 #undef TARGET_PRINT_OPERAND
17851 #define TARGET_PRINT_OPERAND aarch64_print_operand
17852 
17853 #undef TARGET_PRINT_OPERAND_ADDRESS
17854 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17855 
17856 #undef TARGET_OPTAB_SUPPORTED_P
17857 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17858 
17859 #undef TARGET_OMIT_STRUCT_RETURN_REG
17860 #define TARGET_OMIT_STRUCT_RETURN_REG true
17861 
17862 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17863 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17864   aarch64_dwarf_poly_indeterminate_value
17865 
17866 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
17867 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17868 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
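/* The value 4 is bit 2, i.e. 1 << 2, the lowest address bit left free
   once bits 0 and 1 are reserved.  */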
17869 
17870 #undef TARGET_HARD_REGNO_NREGS
17871 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17872 #undef TARGET_HARD_REGNO_MODE_OK
17873 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17874 
17875 #undef TARGET_MODES_TIEABLE_P
17876 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17877 
17878 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17879 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17880   aarch64_hard_regno_call_part_clobbered
17881 
17882 #undef TARGET_CONSTANT_ALIGNMENT
17883 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17884 
17885 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17886 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17887 
17888 #undef TARGET_CAN_CHANGE_MODE_CLASS
17889 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17890 
17891 #undef TARGET_SELECT_EARLY_REMAT_MODES
17892 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17893 
17894 #if CHECKING_P
17895 #undef TARGET_RUN_TARGET_SELFTESTS
17896 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17897 #endif /* #if CHECKING_P */
17898 
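/* The global vector of target hooks; TARGET_INITIALIZER (defined in
   target-def.h) picks up the TARGET_* overrides above and supplies
   defaults for the remaining hooks.  */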
17899 struct gcc_target targetm = TARGET_INITIALIZER;
17900 
17901 #include "gt-aarch64.h"
17902