1 /* Machine description for AArch64 architecture.
2    Copyright (C) 2009-2021 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 
82 /* This file should be included last.  */
83 #include "target-def.h"
84 
85 /* Defined for convenience.  */
86 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
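/* A worked example of the arithmetic above, under the usual settings: for
   the LP64 ABI, POINTER_SIZE is 64 and BITS_PER_UNIT is 8, so POINTER_BYTES
   is 8; for -mabi=ilp32, POINTER_SIZE is 32 and POINTER_BYTES is 4.  */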
87 
88 /* Information about a legitimate vector immediate operand.  */
89 struct simd_immediate_info
90 {
91   enum insn_type { MOV, MVN, INDEX, PTRUE };
92   enum modifier_type { LSL, MSL };
93 
94   simd_immediate_info () {}
95   simd_immediate_info (scalar_float_mode, rtx);
96   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
97 		       insn_type = MOV, modifier_type = LSL,
98 		       unsigned int = 0);
99   simd_immediate_info (scalar_mode, rtx, rtx);
100   simd_immediate_info (scalar_int_mode, aarch64_svpattern);
101 
102   /* The mode of the elements.  */
103   scalar_mode elt_mode;
104 
105   /* The instruction to use to move the immediate into a vector.  */
106   insn_type insn;
107 
108   union
109   {
110     /* For MOV and MVN.  */
111     struct
112     {
113       /* The value of each element.  */
114       rtx value;
115 
116       /* The kind of shift modifier to use, and the number of bits to shift.
117 	 This is (LSL, 0) if no shift is needed.  */
118       modifier_type modifier;
119       unsigned int shift;
120     } mov;
121 
122     /* For INDEX.  */
123     struct
124     {
125       /* The value of the first element and the step to be added for each
126 	 subsequent element.  */
127       rtx base, step;
128     } index;
129 
130     /* For PTRUE.  */
131     aarch64_svpattern pattern;
132   } u;
133 };
134 
135 /* Construct a floating-point immediate in which each element has mode
136    ELT_MODE_IN and value VALUE_IN.  */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
139   : elt_mode (elt_mode_in), insn (MOV)
140 {
141   u.mov.value = value_in;
142   u.mov.modifier = LSL;
143   u.mov.shift = 0;
144 }
145 
146 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
147    and value VALUE_IN.  The other parameters are as for the structure
148    fields.  */
149 inline simd_immediate_info
150 ::simd_immediate_info (scalar_int_mode elt_mode_in,
151 		       unsigned HOST_WIDE_INT value_in,
152 		       insn_type insn_in, modifier_type modifier_in,
153 		       unsigned int shift_in)
154   : elt_mode (elt_mode_in), insn (insn_in)
155 {
156   u.mov.value = gen_int_mode (value_in, elt_mode_in);
157   u.mov.modifier = modifier_in;
158   u.mov.shift = shift_in;
159 }
160 
161 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
162    and where element I is equal to BASE_IN + I * STEP_IN.  */
163 inline simd_immediate_info
164 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
165   : elt_mode (elt_mode_in), insn (INDEX)
166 {
167   u.index.base = base_in;
168   u.index.step = step_in;
169 }
170 
171 /* Construct a predicate that controls elements of mode ELT_MODE_IN
172    and has PTRUE pattern PATTERN_IN.  */
173 inline simd_immediate_info
174 ::simd_immediate_info (scalar_int_mode elt_mode_in,
175 		       aarch64_svpattern pattern_in)
176   : elt_mode (elt_mode_in), insn (PTRUE)
177 {
178   u.pattern = pattern_in;
179 }
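/* An illustrative sketch (not a quotation of later code) of how the
   constructors above are meant to be used; the element modes and values
   are invented for the example:

     // A vector of 16-bit elements, each equal to 0x12 << 8, described
     // as a MOV with an LSL #8 modifier.
     simd_immediate_info mov_info (HImode, 0x12, simd_immediate_info::MOV,
                                   simd_immediate_info::LSL, 8);

     // An SVE INDEX immediate whose elements are 0, 1, 2, ...
     simd_immediate_info index_info (QImode, const0_rtx, const1_rtx);

   The real producers of these descriptions are the immediate-validation
   routines later in this file, such as aarch64_simd_valid_immediate.  */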
180 
181 namespace {
182 
183 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
184 class pure_scalable_type_info
185 {
186 public:
187   /* Represents the result of analyzing a type.  All values are nonzero,
188      in the possibly forlorn hope that accidental conversions to bool
189      trigger a warning.  */
190   enum analysis_result
191   {
192     /* The type does not have an ABI identity; i.e. it doesn't contain
193        at least one object whose type is a Fundamental Data Type.  */
194     NO_ABI_IDENTITY = 1,
195 
196     /* The type is definitely a Pure Scalable Type.  */
197     IS_PST,
198 
199     /* The type is definitely not a Pure Scalable Type.  */
200     ISNT_PST,
201 
202     /* It doesn't matter for PCS purposes whether the type is a Pure
203        Scalable Type or not, since the type will be handled the same
204        way regardless.
205 
206        Specifically, this means that if the type is a Pure Scalable Type,
207        there aren't enough argument registers to hold it, and so it will
208        need to be passed or returned in memory.  If the type isn't a
209        Pure Scalable Type, it's too big to be passed or returned in core
210        or SIMD&FP registers, and so again will need to go in memory.  */
211     DOESNT_MATTER
212   };
213 
214   /* Aggregates of 17 bytes or more are normally passed and returned
215      in memory, so aggregates of that size can safely be analyzed as
216      DOESNT_MATTER.  We need to be able to collect enough pieces to
217      represent a PST that is smaller than that.  Since predicates are
218      2 bytes in size for -msve-vector-bits=128, that means we need to be
219      able to store at least 8 pieces.
220 
221      We also need to be able to store enough pieces to represent
222      a single vector in each vector argument register and a single
223      predicate in each predicate argument register.  This means that
224      we need at least 12 pieces.  */
225   static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
226 #if __cplusplus >= 201103L
227   static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
228 #endif
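  /* For concreteness, with the argument-register counts defined in aarch64.h
     at the time of writing: NUM_FP_ARG_REGS is 8 (V0-V7) and NUM_PR_ARG_REGS
     is 4 (P0-P3), so MAX_PIECES is 8 + 4 == 12.  That covers both the
     8-piece requirement for small PSTs built from 2-byte predicates and the
     one-piece-per-argument-register requirement described above.  */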
229 
230   /* Describes one piece of a PST.  Each piece is one of:
231 
232      - a single Scalable Vector Type (SVT)
233      - a single Scalable Predicate Type (SPT)
234      - a PST containing 2, 3 or 4 SVTs, with no padding
235 
236      It either represents a single built-in type or a PST formed from
237      multiple homogeneous built-in types.  */
238   struct piece
239   {
240     rtx get_rtx (unsigned int, unsigned int) const;
241 
242     /* The number of vector and predicate registers that the piece
243        occupies.  One of the two is always zero.  */
244     unsigned int num_zr;
245     unsigned int num_pr;
246 
247     /* The mode of the registers described above.  */
248     machine_mode mode;
249 
250     /* If this piece is formed from multiple homogeneous built-in types,
251        this is the mode of the built-in types, otherwise it is MODE.  */
252     machine_mode orig_mode;
253 
254     /* The offset in bytes of the piece from the start of the type.  */
255     poly_uint64_pod offset;
256   };
257 
258   /* Divides types analyzed as IS_PST into individual pieces.  The pieces
259      are in memory order.  */
260   auto_vec<piece, MAX_PIECES> pieces;
261 
262   unsigned int num_zr () const;
263   unsigned int num_pr () const;
264 
265   rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
266 
267   analysis_result analyze (const_tree);
268   bool analyze_registers (const_tree);
269 
270 private:
271   analysis_result analyze_array (const_tree);
272   analysis_result analyze_record (const_tree);
273   void add_piece (const piece &);
274 };
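/* A hedged sketch of how the class above is typically driven when
   classifying an argument or return type; the variable names are invented
   for the example and the surrounding details are omitted:

     pure_scalable_type_info pst_info;
     if (pst_info.analyze_registers (type))
       {
         // TYPE is a Pure Scalable Type that fits in the argument
         // registers: it needs pst_info.num_zr () vector registers and
         // pst_info.num_pr () predicate registers, and get_rtx builds
         // the rtx used to pass or return it.
       }

   The real callers are the argument-passing and return-value code later
   in this file.  */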
275 }
276 
277 /* The current code model.  */
278 enum aarch64_code_model aarch64_cmodel;
279 
280 /* The number of 64-bit elements in an SVE vector.  */
281 poly_uint16 aarch64_sve_vg;
282 
283 #ifdef HAVE_AS_TLS
284 #undef TARGET_HAVE_TLS
285 #define TARGET_HAVE_TLS 1
286 #endif
287 
288 static bool aarch64_composite_type_p (const_tree, machine_mode);
289 static bool aarch64_return_in_memory_1 (const_tree);
290 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
291 						     const_tree,
292 						     machine_mode *, int *,
293 						     bool *, bool);
294 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
295 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
296 static void aarch64_override_options_after_change (void);
297 static bool aarch64_vector_mode_supported_p (machine_mode);
298 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
299 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
300 							 const_tree type,
301 							 int misalignment,
302 							 bool is_packed);
303 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
304 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
305 					    aarch64_addr_query_type);
306 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
307 
308 /* Major revision number of the ARM Architecture implemented by the target.  */
309 unsigned aarch64_architecture_version;
310 
311 /* The processor for which instructions should be scheduled.  */
312 enum aarch64_processor aarch64_tune = cortexa53;
313 
314 /* Mask to specify which instruction scheduling options should be used.  */
315 uint64_t aarch64_tune_flags = 0;
316 
317 /* Global flag for PC-relative literal loads.  */
318 bool aarch64_pcrelative_literal_loads;
319 
320 /* Global flag for whether the frame pointer is enabled.  */
321 bool aarch64_use_frame_pointer;
322 
323 #define BRANCH_PROTECT_STR_MAX 255
324 char *accepted_branch_protection_string = NULL;
325 
326 static enum aarch64_parse_opt_result
327 aarch64_parse_branch_protection (const char*, char**);
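/* For reference, the strings parsed here follow the documented
   -mbranch-protection syntax, e.g. "none", "standard", "bti" or
   "pac-ret+leaf".  */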
328 
329 /* Support for command line parsing of boolean flags in the tuning
330    structures.  */
331 struct aarch64_flag_desc
332 {
333   const char* name;
334   unsigned int flag;
335 };
336 
337 #define AARCH64_FUSION_PAIR(name, internal_name) \
338   { name, AARCH64_FUSE_##internal_name },
339 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
340 {
341   { "none", AARCH64_FUSE_NOTHING },
342 #include "aarch64-fusion-pairs.def"
343   { "all", AARCH64_FUSE_ALL },
344   { NULL, AARCH64_FUSE_NOTHING }
345 };
346 
347 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
348   { name, AARCH64_EXTRA_TUNE_##internal_name },
349 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
350 {
351   { "none", AARCH64_EXTRA_TUNE_NONE },
352 #include "aarch64-tuning-flags.def"
353   { "all", AARCH64_EXTRA_TUNE_ALL },
354   { NULL, AARCH64_EXTRA_TUNE_NONE }
355 };
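/* To make the tables above concrete: given an entry in
   aarch64-fusion-pairs.def such as AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD),
   aarch64_fusible_pairs contains

     { "adrp+add", AARCH64_FUSE_ADRP_ADD },

   and each AARCH64_EXTRA_TUNING_OPTION entry likewise becomes a
   name/AARCH64_EXTRA_TUNE_* pair in aarch64_tuning_flags.  These names are
   what the -moverride string parsing later in this file matches against.  */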
356 
357 /* Tuning parameters.  */
358 
359 static const struct cpu_addrcost_table generic_addrcost_table =
360 {
361     {
362       1, /* hi  */
363       0, /* si  */
364       0, /* di  */
365       1, /* ti  */
366     },
367   0, /* pre_modify  */
368   0, /* post_modify  */
369   0, /* post_modify_ld3_st3  */
370   0, /* post_modify_ld4_st4  */
371   0, /* register_offset  */
372   0, /* register_sextend  */
373   0, /* register_zextend  */
374   0 /* imm_offset  */
375 };
376 
377 static const struct cpu_addrcost_table exynosm1_addrcost_table =
378 {
379     {
380       0, /* hi  */
381       0, /* si  */
382       0, /* di  */
383       2, /* ti  */
384     },
385   0, /* pre_modify  */
386   0, /* post_modify  */
387   0, /* post_modify_ld3_st3  */
388   0, /* post_modify_ld4_st4  */
389   1, /* register_offset  */
390   1, /* register_sextend  */
391   2, /* register_zextend  */
392   0, /* imm_offset  */
393 };
394 
395 static const struct cpu_addrcost_table xgene1_addrcost_table =
396 {
397     {
398       1, /* hi  */
399       0, /* si  */
400       0, /* di  */
401       1, /* ti  */
402     },
403   1, /* pre_modify  */
404   1, /* post_modify  */
405   1, /* post_modify_ld3_st3  */
406   1, /* post_modify_ld4_st4  */
407   0, /* register_offset  */
408   1, /* register_sextend  */
409   1, /* register_zextend  */
410   0, /* imm_offset  */
411 };
412 
413 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
414 {
415     {
416       1, /* hi  */
417       1, /* si  */
418       1, /* di  */
419       2, /* ti  */
420     },
421   0, /* pre_modify  */
422   0, /* post_modify  */
423   0, /* post_modify_ld3_st3  */
424   0, /* post_modify_ld4_st4  */
425   2, /* register_offset  */
426   3, /* register_sextend  */
427   3, /* register_zextend  */
428   0, /* imm_offset  */
429 };
430 
431 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
432 {
433     {
434       1, /* hi  */
435       1, /* si  */
436       1, /* di  */
437       2, /* ti  */
438     },
439   0, /* pre_modify  */
440   0, /* post_modify  */
441   0, /* post_modify_ld3_st3  */
442   0, /* post_modify_ld4_st4  */
443   2, /* register_offset  */
444   3, /* register_sextend  */
445   3, /* register_zextend  */
446   0, /* imm_offset  */
447 };
448 
449 static const struct cpu_addrcost_table tsv110_addrcost_table =
450 {
451     {
452       1, /* hi  */
453       0, /* si  */
454       0, /* di  */
455       1, /* ti  */
456     },
457   0, /* pre_modify  */
458   0, /* post_modify  */
459   0, /* post_modify_ld3_st3  */
460   0, /* post_modify_ld4_st4  */
461   0, /* register_offset  */
462   1, /* register_sextend  */
463   1, /* register_zextend  */
464   0, /* imm_offset  */
465 };
466 
467 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
468 {
469     {
470       1, /* hi  */
471       1, /* si  */
472       1, /* di  */
473       2, /* ti  */
474     },
475   1, /* pre_modify  */
476   1, /* post_modify  */
477   1, /* post_modify_ld3_st3  */
478   1, /* post_modify_ld4_st4  */
479   3, /* register_offset  */
480   3, /* register_sextend  */
481   3, /* register_zextend  */
482   2, /* imm_offset  */
483 };
484 
485 static const struct cpu_addrcost_table a64fx_addrcost_table =
486 {
487     {
488       1, /* hi  */
489       1, /* si  */
490       1, /* di  */
491       2, /* ti  */
492     },
493   0, /* pre_modify  */
494   0, /* post_modify  */
495   0, /* post_modify_ld3_st3  */
496   0, /* post_modify_ld4_st4  */
497   2, /* register_offset  */
498   3, /* register_sextend  */
499   3, /* register_zextend  */
500   0, /* imm_offset  */
501 };
502 
503 static const struct cpu_addrcost_table neoversev1_addrcost_table =
504 {
505     {
506       1, /* hi  */
507       0, /* si  */
508       0, /* di  */
509       1, /* ti  */
510     },
511   0, /* pre_modify  */
512   0, /* post_modify  */
513   3, /* post_modify_ld3_st3  */
514   3, /* post_modify_ld4_st4  */
515   0, /* register_offset  */
516   0, /* register_sextend  */
517   0, /* register_zextend  */
518   0 /* imm_offset  */
519 };
520 
521 static const struct cpu_regmove_cost generic_regmove_cost =
522 {
523   1, /* GP2GP  */
524   /* Avoid the use of slow int<->fp moves for spilling by setting
525      their cost higher than memmov_cost.  */
526   5, /* GP2FP  */
527   5, /* FP2GP  */
528   2 /* FP2FP  */
529 };
530 
531 static const struct cpu_regmove_cost cortexa57_regmove_cost =
532 {
533   1, /* GP2GP  */
534   /* Avoid the use of slow int<->fp moves for spilling by setting
535      their cost higher than memmov_cost.  */
536   5, /* GP2FP  */
537   5, /* FP2GP  */
538   2 /* FP2FP  */
539 };
540 
541 static const struct cpu_regmove_cost cortexa53_regmove_cost =
542 {
543   1, /* GP2GP  */
544   /* Avoid the use of slow int<->fp moves for spilling by setting
545      their cost higher than memmov_cost.  */
546   5, /* GP2FP  */
547   5, /* FP2GP  */
548   2 /* FP2FP  */
549 };
550 
551 static const struct cpu_regmove_cost exynosm1_regmove_cost =
552 {
553   1, /* GP2GP  */
554   /* Avoid the use of slow int<->fp moves for spilling by setting
555      their cost higher than memmov_cost (the actual costs are 4 and 9).  */
556   9, /* GP2FP  */
557   9, /* FP2GP  */
558   1 /* FP2FP  */
559 };
560 
561 static const struct cpu_regmove_cost thunderx_regmove_cost =
562 {
563   2, /* GP2GP  */
564   2, /* GP2FP  */
565   6, /* FP2GP  */
566   4 /* FP2FP  */
567 };
568 
569 static const struct cpu_regmove_cost xgene1_regmove_cost =
570 {
571   1, /* GP2GP  */
572   /* Avoid the use of slow int<->fp moves for spilling by setting
573      their cost higher than memmov_cost.  */
574   8, /* GP2FP  */
575   8, /* FP2GP  */
576   2 /* FP2FP  */
577 };
578 
579 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
580 {
581   2, /* GP2GP  */
582   /* Avoid the use of int<->fp moves for spilling.  */
583   6, /* GP2FP  */
584   6, /* FP2GP  */
585   4 /* FP2FP  */
586 };
587 
588 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
589 {
590   1, /* GP2GP  */
591   /* Avoid the use of int<->fp moves for spilling.  */
592   5, /* GP2FP  */
593   6, /* FP2GP  */
594   3, /* FP2FP  */
595 };
596 
597 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
598 {
599   1, /* GP2GP  */
600   /* Avoid the use of int<->fp moves for spilling.  */
601   4, /* GP2FP  */
602   5, /* FP2GP  */
603   4  /* FP2FP  */
604 };
605 
606 static const struct cpu_regmove_cost tsv110_regmove_cost =
607 {
608   1, /* GP2GP  */
609   /* Avoid the use of slow int<->fp moves for spilling by setting
610      their cost higher than memmov_cost.  */
611   2, /* GP2FP  */
612   3, /* FP2GP  */
613   2  /* FP2FP  */
614 };
615 
616 static const struct cpu_regmove_cost a64fx_regmove_cost =
617 {
618   1, /* GP2GP  */
619   /* Avoid the use of slow int<->fp moves for spilling by setting
620      their cost higher than memmov_cost.  */
621   5, /* GP2FP  */
622   7, /* FP2GP  */
623   2 /* FP2FP  */
624 };
625 
626 /* Generic costs for Advanced SIMD vector operations.   */
627 static const advsimd_vec_cost generic_advsimd_vector_cost =
628 {
629   1, /* int_stmt_cost  */
630   1, /* fp_stmt_cost  */
631   0, /* ld2_st2_permute_cost  */
632   0, /* ld3_st3_permute_cost  */
633   0, /* ld4_st4_permute_cost  */
634   2, /* permute_cost  */
635   2, /* reduc_i8_cost  */
636   2, /* reduc_i16_cost  */
637   2, /* reduc_i32_cost  */
638   2, /* reduc_i64_cost  */
639   2, /* reduc_f16_cost  */
640   2, /* reduc_f32_cost  */
641   2, /* reduc_f64_cost  */
642   2, /* store_elt_extra_cost  */
643   2, /* vec_to_scalar_cost  */
644   1, /* scalar_to_vec_cost  */
645   1, /* align_load_cost  */
646   1, /* unalign_load_cost  */
647   1, /* unalign_store_cost  */
648   1  /* store_cost  */
649 };
650 
651 /* Generic costs for SVE vector operations.  */
652 static const sve_vec_cost generic_sve_vector_cost =
653 {
654   {
655     1, /* int_stmt_cost  */
656     1, /* fp_stmt_cost  */
657     0, /* ld2_st2_permute_cost  */
658     0, /* ld3_st3_permute_cost  */
659     0, /* ld4_st4_permute_cost  */
660     2, /* permute_cost  */
661     2, /* reduc_i8_cost  */
662     2, /* reduc_i16_cost  */
663     2, /* reduc_i32_cost  */
664     2, /* reduc_i64_cost  */
665     2, /* reduc_f16_cost  */
666     2, /* reduc_f32_cost  */
667     2, /* reduc_f64_cost  */
668     2, /* store_elt_extra_cost  */
669     2, /* vec_to_scalar_cost  */
670     1, /* scalar_to_vec_cost  */
671     1, /* align_load_cost  */
672     1, /* unalign_load_cost  */
673     1, /* unalign_store_cost  */
674     1  /* store_cost  */
675   },
676   2, /* clast_cost  */
677   2, /* fadda_f16_cost  */
678   2, /* fadda_f32_cost  */
679   2, /* fadda_f64_cost  */
680   4, /* gather_load_x32_cost  */
681   2, /* gather_load_x64_cost  */
682   1 /* scatter_store_elt_cost  */
683 };
684 
685 /* Generic costs for vector insn classes.  */
686 static const struct cpu_vector_cost generic_vector_cost =
687 {
688   1, /* scalar_int_stmt_cost  */
689   1, /* scalar_fp_stmt_cost  */
690   1, /* scalar_load_cost  */
691   1, /* scalar_store_cost  */
692   3, /* cond_taken_branch_cost  */
693   1, /* cond_not_taken_branch_cost  */
694   &generic_advsimd_vector_cost, /* advsimd  */
695   &generic_sve_vector_cost, /* sve */
696   nullptr /* issue_info  */
697 };
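/* A rough sketch of how tables like generic_vector_cost are consumed
   (simplified; the authoritative logic is in the vectorizer cost hooks later
   in this file): the scalar_*_cost fields are returned for scalar statement
   kinds, while the advsimd or sve sub-table is chosen according to the mode
   being costed, so that e.g. a vector permute is charged permute_cost and an
   unaligned vector load unalign_load_cost.  The optional issue_info adds
   throughput modelling on top of these latency-style costs.  */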
698 
699 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
700 {
701   2, /* int_stmt_cost  */
702   5, /* fp_stmt_cost  */
703   0, /* ld2_st2_permute_cost  */
704   0, /* ld3_st3_permute_cost  */
705   0, /* ld4_st4_permute_cost  */
706   3, /* permute_cost  */
707   13, /* reduc_i8_cost  */
708   13, /* reduc_i16_cost  */
709   13, /* reduc_i32_cost  */
710   13, /* reduc_i64_cost  */
711   13, /* reduc_f16_cost  */
712   13, /* reduc_f32_cost  */
713   13, /* reduc_f64_cost  */
714   13, /* store_elt_extra_cost  */
715   13, /* vec_to_scalar_cost  */
716   4, /* scalar_to_vec_cost  */
717   6, /* align_load_cost  */
718   6, /* unalign_load_cost  */
719   1, /* unalign_store_cost  */
720   1  /* store_cost  */
721 };
722 
723 static const sve_vec_cost a64fx_sve_vector_cost =
724 {
725   {
726     2, /* int_stmt_cost  */
727     5, /* fp_stmt_cost  */
728     0, /* ld2_st2_permute_cost  */
729     0, /* ld3_st3_permute_cost  */
730     0, /* ld4_st4_permute_cost  */
731     3, /* permute_cost  */
732     13, /* reduc_i8_cost  */
733     13, /* reduc_i16_cost  */
734     13, /* reduc_i32_cost  */
735     13, /* reduc_i64_cost  */
736     13, /* reduc_f16_cost  */
737     13, /* reduc_f32_cost  */
738     13, /* reduc_f64_cost  */
739     13, /* store_elt_extra_cost  */
740     13, /* vec_to_scalar_cost  */
741     4, /* scalar_to_vec_cost  */
742     6, /* align_load_cost  */
743     6, /* unalign_load_cost  */
744     1, /* unalign_store_cost  */
745     1  /* store_cost  */
746   },
747   13, /* clast_cost  */
748   13, /* fadda_f16_cost  */
749   13, /* fadda_f32_cost  */
750   13, /* fadda_f64_cost  */
751   64, /* gather_load_x32_cost  */
752   32, /* gather_load_x64_cost  */
753   1 /* scatter_store_elt_cost  */
754 };
755 
756 static const struct cpu_vector_cost a64fx_vector_cost =
757 {
758   1, /* scalar_int_stmt_cost  */
759   5, /* scalar_fp_stmt_cost  */
760   4, /* scalar_load_cost  */
761   1, /* scalar_store_cost  */
762   3, /* cond_taken_branch_cost  */
763   1, /* cond_not_taken_branch_cost  */
764   &a64fx_advsimd_vector_cost, /* advsimd  */
765   &a64fx_sve_vector_cost, /* sve  */
766   nullptr /* issue_info  */
767 };
768 
769 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
770 {
771   1, /* int_stmt_cost  */
772   3, /* fp_stmt_cost  */
773   0, /* ld2_st2_permute_cost  */
774   0, /* ld3_st3_permute_cost  */
775   0, /* ld4_st4_permute_cost  */
776   2, /* permute_cost  */
777   1, /* reduc_i8_cost  */
778   1, /* reduc_i16_cost  */
779   1, /* reduc_i32_cost  */
780   1, /* reduc_i64_cost  */
781   1, /* reduc_f16_cost  */
782   1, /* reduc_f32_cost  */
783   1, /* reduc_f64_cost  */
784   1, /* store_elt_extra_cost  */
785   1, /* vec_to_scalar_cost  */
786   1, /* scalar_to_vec_cost  */
787   1, /* align_load_cost  */
788   1, /* unalign_load_cost  */
789   1, /* unalign_store_cost  */
790   1  /* store_cost  */
791 };
792 
793 /* QDF24XX costs for vector insn classes.  */
794 static const struct cpu_vector_cost qdf24xx_vector_cost =
795 {
796   1, /* scalar_int_stmt_cost  */
797   1, /* scalar_fp_stmt_cost  */
798   1, /* scalar_load_cost  */
799   1, /* scalar_store_cost  */
800   3, /* cond_taken_branch_cost  */
801   1, /* cond_not_taken_branch_cost  */
802   &qdf24xx_advsimd_vector_cost, /* advsimd  */
803   nullptr, /* sve  */
804   nullptr /* issue_info  */
805 };
806 
807 
808 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
809 {
810   4, /* int_stmt_cost  */
811   1, /* fp_stmt_cost  */
812   0, /* ld2_st2_permute_cost  */
813   0, /* ld3_st3_permute_cost  */
814   0, /* ld4_st4_permute_cost  */
815   4, /* permute_cost  */
816   2, /* reduc_i8_cost  */
817   2, /* reduc_i16_cost  */
818   2, /* reduc_i32_cost  */
819   2, /* reduc_i64_cost  */
820   2, /* reduc_f16_cost  */
821   2, /* reduc_f32_cost  */
822   2, /* reduc_f64_cost  */
823   2, /* store_elt_extra_cost  */
824   2, /* vec_to_scalar_cost  */
825   2, /* scalar_to_vec_cost  */
826   3, /* align_load_cost  */
827   5, /* unalign_load_cost  */
828   5, /* unalign_store_cost  */
829   1  /* store_cost  */
830 };
831 
832 /* ThunderX costs for vector insn classes.  */
833 static const struct cpu_vector_cost thunderx_vector_cost =
834 {
835   1, /* scalar_int_stmt_cost  */
836   1, /* scalar_fp_stmt_cost  */
837   3, /* scalar_load_cost  */
838   1, /* scalar_store_cost  */
839   3, /* cond_taken_branch_cost  */
840   3, /* cond_not_taken_branch_cost  */
841   &thunderx_advsimd_vector_cost, /* advsimd  */
842   nullptr, /* sve  */
843   nullptr /* issue_info  */
844 };
845 
846 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
847 {
848   2, /* int_stmt_cost  */
849   2, /* fp_stmt_cost  */
850   0, /* ld2_st2_permute_cost  */
851   0, /* ld3_st3_permute_cost  */
852   0, /* ld4_st4_permute_cost  */
853   2, /* permute_cost  */
854   3, /* reduc_i8_cost  */
855   3, /* reduc_i16_cost  */
856   3, /* reduc_i32_cost  */
857   3, /* reduc_i64_cost  */
858   3, /* reduc_f16_cost  */
859   3, /* reduc_f32_cost  */
860   3, /* reduc_f64_cost  */
861   3, /* store_elt_extra_cost  */
862   3, /* vec_to_scalar_cost  */
863   2, /* scalar_to_vec_cost  */
864   5, /* align_load_cost  */
865   5, /* unalign_load_cost  */
866   1, /* unalign_store_cost  */
867   1  /* store_cost  */
868 };
869 
870 static const struct cpu_vector_cost tsv110_vector_cost =
871 {
872   1, /* scalar_int_stmt_cost  */
873   1, /* scalar_fp_stmt_cost  */
874   5, /* scalar_load_cost  */
875   1, /* scalar_store_cost  */
876   1, /* cond_taken_branch_cost  */
877   1, /* cond_not_taken_branch_cost  */
878   &tsv110_advsimd_vector_cost, /* advsimd  */
879   nullptr, /* sve  */
880   nullptr /* issue_info  */
881 };
882 
883 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
884 {
885   2, /* int_stmt_cost  */
886   2, /* fp_stmt_cost  */
887   0, /* ld2_st2_permute_cost  */
888   0, /* ld3_st3_permute_cost  */
889   0, /* ld4_st4_permute_cost  */
890   3, /* permute_cost  */
891   8, /* reduc_i8_cost  */
892   8, /* reduc_i16_cost  */
893   8, /* reduc_i32_cost  */
894   8, /* reduc_i64_cost  */
895   8, /* reduc_f16_cost  */
896   8, /* reduc_f32_cost  */
897   8, /* reduc_f64_cost  */
898   8, /* store_elt_extra_cost  */
899   8, /* vec_to_scalar_cost  */
900   8, /* scalar_to_vec_cost  */
901   4, /* align_load_cost  */
902   4, /* unalign_load_cost  */
903   1, /* unalign_store_cost  */
904   1  /* store_cost  */
905 };
906 
907 /* Cortex-A57 costs for vector insn classes.  */
908 static const struct cpu_vector_cost cortexa57_vector_cost =
909 {
910   1, /* scalar_int_stmt_cost  */
911   1, /* scalar_fp_stmt_cost  */
912   4, /* scalar_load_cost  */
913   1, /* scalar_store_cost  */
914   1, /* cond_taken_branch_cost  */
915   1, /* cond_not_taken_branch_cost  */
916   &cortexa57_advsimd_vector_cost, /* advsimd  */
917   nullptr, /* sve  */
918   nullptr /* issue_info  */
919 };
920 
921 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
922 {
923   3, /* int_stmt_cost  */
924   3, /* fp_stmt_cost  */
925   0, /* ld2_st2_permute_cost  */
926   0, /* ld3_st3_permute_cost  */
927   0, /* ld4_st4_permute_cost  */
928   3, /* permute_cost  */
929   3, /* reduc_i8_cost  */
930   3, /* reduc_i16_cost  */
931   3, /* reduc_i32_cost  */
932   3, /* reduc_i64_cost  */
933   3, /* reduc_f16_cost  */
934   3, /* reduc_f32_cost  */
935   3, /* reduc_f64_cost  */
936   3, /* store_elt_extra_cost  */
937   3, /* vec_to_scalar_cost  */
938   3, /* scalar_to_vec_cost  */
939   5, /* align_load_cost  */
940   5, /* unalign_load_cost  */
941   1, /* unalign_store_cost  */
942   1  /* store_cost  */
943 };
944 
945 static const struct cpu_vector_cost exynosm1_vector_cost =
946 {
947   1, /* scalar_int_stmt_cost  */
948   1, /* scalar_fp_stmt_cost  */
949   5, /* scalar_load_cost  */
950   1, /* scalar_store_cost  */
951   1, /* cond_taken_branch_cost  */
952   1, /* cond_not_taken_branch_cost  */
953   &exynosm1_advsimd_vector_cost, /* advsimd  */
954   nullptr, /* sve  */
955   nullptr /* issue_info  */
956 };
957 
958 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
959 {
960   2, /* int_stmt_cost  */
961   2, /* fp_stmt_cost  */
962   0, /* ld2_st2_permute_cost  */
963   0, /* ld3_st3_permute_cost  */
964   0, /* ld4_st4_permute_cost  */
965   2, /* permute_cost  */
966   4, /* reduc_i8_cost  */
967   4, /* reduc_i16_cost  */
968   4, /* reduc_i32_cost  */
969   4, /* reduc_i64_cost  */
970   4, /* reduc_f16_cost  */
971   4, /* reduc_f32_cost  */
972   4, /* reduc_f64_cost  */
973   4, /* store_elt_extra_cost  */
974   4, /* vec_to_scalar_cost  */
975   4, /* scalar_to_vec_cost  */
976   10, /* align_load_cost  */
977   10, /* unalign_load_cost  */
978   2, /* unalign_store_cost  */
979   2  /* store_cost  */
980 };
981 
982 /* X-Gene 1 costs for vector insn classes.  */
983 static const struct cpu_vector_cost xgene1_vector_cost =
984 {
985   1, /* scalar_int_stmt_cost  */
986   1, /* scalar_fp_stmt_cost  */
987   5, /* scalar_load_cost  */
988   1, /* scalar_store_cost  */
989   2, /* cond_taken_branch_cost  */
990   1, /* cond_not_taken_branch_cost  */
991   &xgene1_advsimd_vector_cost, /* advsimd  */
992   nullptr, /* sve  */
993   nullptr /* issue_info  */
994 };
995 
996 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
997 {
998   4, /* int_stmt_cost  */
999   5, /* fp_stmt_cost  */
1000   0, /* ld2_st2_permute_cost  */
1001   0, /* ld3_st3_permute_cost  */
1002   0, /* ld4_st4_permute_cost  */
1003   10, /* permute_cost  */
1004   6, /* reduc_i8_cost  */
1005   6, /* reduc_i16_cost  */
1006   6, /* reduc_i32_cost  */
1007   6, /* reduc_i64_cost  */
1008   6, /* reduc_f16_cost  */
1009   6, /* reduc_f32_cost  */
1010   6, /* reduc_f64_cost  */
1011   6, /* store_elt_extra_cost  */
1012   6, /* vec_to_scalar_cost  */
1013   5, /* scalar_to_vec_cost  */
1014   4, /* align_load_cost  */
1015   4, /* unalign_load_cost  */
1016   1, /* unalign_store_cost  */
1017   1  /* store_cost  */
1018 };
1019 
1020 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan).  */
1021 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1022 {
1023   1, /* scalar_int_stmt_cost  */
1024   6, /* scalar_fp_stmt_cost  */
1025   4, /* scalar_load_cost  */
1026   1, /* scalar_store_cost  */
1027   2, /* cond_taken_branch_cost  */
1028   1,  /* cond_not_taken_branch_cost  */
1029   &thunderx2t99_advsimd_vector_cost, /* advsimd  */
1030   nullptr, /* sve  */
1031   nullptr /* issue_info  */
1032 };
1033 
1034 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1035 {
1036   5, /* int_stmt_cost  */
1037   5, /* fp_stmt_cost  */
1038   0, /* ld2_st2_permute_cost  */
1039   0, /* ld3_st3_permute_cost  */
1040   0, /* ld4_st4_permute_cost  */
1041   10, /* permute_cost  */
1042   5, /* reduc_i8_cost  */
1043   5, /* reduc_i16_cost  */
1044   5, /* reduc_i32_cost  */
1045   5, /* reduc_i64_cost  */
1046   5, /* reduc_f16_cost  */
1047   5, /* reduc_f32_cost  */
1048   5, /* reduc_f64_cost  */
1049   5, /* store_elt_extra_cost  */
1050   5, /* vec_to_scalar_cost  */
1051   5, /* scalar_to_vec_cost  */
1052   4, /* align_load_cost  */
1053   4, /* unalign_load_cost  */
1054   4, /* unalign_store_cost  */
1055   4  /* store_cost  */
1056 };
1057 
1058 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1059 {
1060   1, /* scalar_int_stmt_cost  */
1061   5, /* scalar_fp_stmt_cost  */
1062   4, /* scalar_load_cost  */
1063   1, /* scalar_store_cost  */
1064   2, /* cond_taken_branch_cost  */
1065   1,  /* cond_not_taken_branch_cost  */
1066   &thunderx3t110_advsimd_vector_cost, /* advsimd  */
1067   nullptr, /* sve  */
1068   nullptr /* issue_info  */
1069 };
1070 
1071 
1072 /* Generic costs for branch instructions.  */
1073 static const struct cpu_branch_cost generic_branch_cost =
1074 {
1075   1,  /* Predictable.  */
1076   3   /* Unpredictable.  */
1077 };
1078 
1079 /* Generic approximation modes.  */
1080 static const cpu_approx_modes generic_approx_modes =
1081 {
1082   AARCH64_APPROX_NONE,	/* division  */
1083   AARCH64_APPROX_NONE,	/* sqrt  */
1084   AARCH64_APPROX_NONE	/* recip_sqrt  */
1085 };
1086 
1087 /* Approximation modes for Exynos M1.  */
1088 static const cpu_approx_modes exynosm1_approx_modes =
1089 {
1090   AARCH64_APPROX_NONE,	/* division  */
1091   AARCH64_APPROX_ALL,	/* sqrt  */
1092   AARCH64_APPROX_ALL	/* recip_sqrt  */
1093 };
1094 
1095 /* Approximation modes for X-Gene 1.  */
1096 static const cpu_approx_modes xgene1_approx_modes =
1097 {
1098   AARCH64_APPROX_NONE,	/* division  */
1099   AARCH64_APPROX_NONE,	/* sqrt  */
1100   AARCH64_APPROX_ALL	/* recip_sqrt  */
1101 };
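/* The per-CPU approximation tables above record which approximate
   (Newton-Raphson style) expansions of division, square root and reciprocal
   square root are enabled by default for the CPU, subject to the usual
   fast-math-style conditions; users can still force individual expansions
   with the -mlow-precision-* options.  AARCH64_APPROX_NONE keeps the exact
   instructions, so e.g. exynosm1_approx_modes enables the square-root and
   reciprocal-square-root approximations by default but not division.  */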
1102 
1103 /* Generic prefetch settings (which disable prefetch).  */
1104 static const cpu_prefetch_tune generic_prefetch_tune =
1105 {
1106   0,			/* num_slots  */
1107   -1,			/* l1_cache_size  */
1108   -1,			/* l1_cache_line_size  */
1109   -1,			/* l2_cache_size  */
1110   true,			/* prefetch_dynamic_strides */
1111   -1,			/* minimum_stride */
1112   -1			/* default_opt_level  */
1113 };
1114 
1115 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1116 {
1117   0,			/* num_slots  */
1118   -1,			/* l1_cache_size  */
1119   64,			/* l1_cache_line_size  */
1120   -1,			/* l2_cache_size  */
1121   true,			/* prefetch_dynamic_strides */
1122   -1,			/* minimum_stride */
1123   -1			/* default_opt_level  */
1124 };
1125 
1126 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1127 {
1128   4,			/* num_slots  */
1129   32,			/* l1_cache_size  */
1130   64,			/* l1_cache_line_size  */
1131   512,			/* l2_cache_size  */
1132   false,		/* prefetch_dynamic_strides */
1133   2048,			/* minimum_stride */
1134   3			/* default_opt_level  */
1135 };
1136 
1137 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1138 {
1139   8,			/* num_slots  */
1140   32,			/* l1_cache_size  */
1141   128,			/* l1_cache_line_size  */
1142   16*1024,		/* l2_cache_size  */
1143   true,			/* prefetch_dynamic_strides */
1144   -1,			/* minimum_stride */
1145   3			/* default_opt_level  */
1146 };
1147 
1148 static const cpu_prefetch_tune thunderx_prefetch_tune =
1149 {
1150   8,			/* num_slots  */
1151   32,			/* l1_cache_size  */
1152   128,			/* l1_cache_line_size  */
1153   -1,			/* l2_cache_size  */
1154   true,			/* prefetch_dynamic_strides */
1155   -1,			/* minimum_stride */
1156   -1			/* default_opt_level  */
1157 };
1158 
1159 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1160 {
1161   8,			/* num_slots  */
1162   32,			/* l1_cache_size  */
1163   64,			/* l1_cache_line_size  */
1164   256,			/* l2_cache_size  */
1165   true,			/* prefetch_dynamic_strides */
1166   -1,			/* minimum_stride */
1167   -1			/* default_opt_level  */
1168 };
1169 
1170 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1171 {
1172   8,			/* num_slots  */
1173   32,			/* l1_cache_size  */
1174   64,			/* l1_cache_line_size  */
1175   256,			/* l2_cache_size  */
1176   true,			/* prefetch_dynamic_strides */
1177   -1,			/* minimum_stride */
1178   -1			/* default_opt_level  */
1179 };
1180 
1181 static const cpu_prefetch_tune tsv110_prefetch_tune =
1182 {
1183   0,                    /* num_slots  */
1184   64,                   /* l1_cache_size  */
1185   64,                   /* l1_cache_line_size  */
1186   512,                  /* l2_cache_size  */
1187   true,                 /* prefetch_dynamic_strides */
1188   -1,                   /* minimum_stride */
1189   -1                    /* default_opt_level  */
1190 };
1191 
1192 static const cpu_prefetch_tune xgene1_prefetch_tune =
1193 {
1194   8,			/* num_slots  */
1195   32,			/* l1_cache_size  */
1196   64,			/* l1_cache_line_size  */
1197   256,			/* l2_cache_size  */
1198   true,                 /* prefetch_dynamic_strides */
1199   -1,                   /* minimum_stride */
1200   -1			/* default_opt_level  */
1201 };
1202 
1203 static const cpu_prefetch_tune a64fx_prefetch_tune =
1204 {
1205   8,			/* num_slots  */
1206   64,			/* l1_cache_size  */
1207   256,			/* l1_cache_line_size  */
1208   32768,		/* l2_cache_size  */
1209   true,			/* prefetch_dynamic_strides */
1210   -1,			/* minimum_stride */
1211   -1			/* default_opt_level  */
1212 };
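/* Roughly speaking, the prefetch tables above work as follows: the cache
   sizes are in kilobytes and the line sizes in bytes, and they seed the
   corresponding --param values (l1-cache-size, l1-cache-line-size,
   l2-cache-size) unless the user has set those explicitly, with -1 meaning
   "leave the default alone".  default_opt_level gives the first -O level at
   which -fprefetch-loop-arrays is enabled by default, so e.g.
   qdf24xx_prefetch_tune turns it on from -O3 upwards, while -1 leaves it
   off unless requested.  */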
1213 
1214 static const struct tune_params generic_tunings =
1215 {
1216   &cortexa57_extra_costs,
1217   &generic_addrcost_table,
1218   &generic_regmove_cost,
1219   &generic_vector_cost,
1220   &generic_branch_cost,
1221   &generic_approx_modes,
1222   SVE_NOT_IMPLEMENTED, /* sve_width  */
1223   4, /* memmov_cost  */
1224   2, /* issue_rate  */
1225   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1226   "16:12",	/* function_align.  */
1227   "4",	/* jump_align.  */
1228   "8",	/* loop_align.  */
1229   2,	/* int_reassoc_width.  */
1230   4,	/* fp_reassoc_width.  */
1231   1,	/* vec_reassoc_width.  */
1232   2,	/* min_div_recip_mul_sf.  */
1233   2,	/* min_div_recip_mul_df.  */
1234   0,	/* max_case_values.  */
1235   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1236   /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1237      Neoverse V1.  It does not have a noticeable effect on A64FX and should
1238      have at most a very minor effect on SVE2 cores.  */
1239   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
1240   &generic_prefetch_tune
1241 };
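/* A note on the alignment strings in the tune_params tables: they use the
   same syntax as the -falign-functions/-falign-loops family, so the "16:12"
   function_align above asks for a 16-byte boundary only when that can be
   reached by skipping at most 12 bytes, while a plain "4" simply requests
   4-byte alignment.  */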
1242 
1243 static const struct tune_params cortexa35_tunings =
1244 {
1245   &cortexa53_extra_costs,
1246   &generic_addrcost_table,
1247   &cortexa53_regmove_cost,
1248   &generic_vector_cost,
1249   &generic_branch_cost,
1250   &generic_approx_modes,
1251   SVE_NOT_IMPLEMENTED, /* sve_width  */
1252   4, /* memmov_cost  */
1253   1, /* issue_rate  */
1254   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1255    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1256   "16",	/* function_align.  */
1257   "4",	/* jump_align.  */
1258   "8",	/* loop_align.  */
1259   2,	/* int_reassoc_width.  */
1260   4,	/* fp_reassoc_width.  */
1261   1,	/* vec_reassoc_width.  */
1262   2,	/* min_div_recip_mul_sf.  */
1263   2,	/* min_div_recip_mul_df.  */
1264   0,	/* max_case_values.  */
1265   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1266   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1267   &generic_prefetch_tune
1268 };
1269 
1270 static const struct tune_params cortexa53_tunings =
1271 {
1272   &cortexa53_extra_costs,
1273   &generic_addrcost_table,
1274   &cortexa53_regmove_cost,
1275   &generic_vector_cost,
1276   &generic_branch_cost,
1277   &generic_approx_modes,
1278   SVE_NOT_IMPLEMENTED, /* sve_width  */
1279   4, /* memmov_cost  */
1280   2, /* issue_rate  */
1281   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1282    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1283   "16",	/* function_align.  */
1284   "4",	/* jump_align.  */
1285   "8",	/* loop_align.  */
1286   2,	/* int_reassoc_width.  */
1287   4,	/* fp_reassoc_width.  */
1288   1,	/* vec_reassoc_width.  */
1289   2,	/* min_div_recip_mul_sf.  */
1290   2,	/* min_div_recip_mul_df.  */
1291   0,	/* max_case_values.  */
1292   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1293   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1294   &generic_prefetch_tune
1295 };
1296 
1297 static const struct tune_params cortexa57_tunings =
1298 {
1299   &cortexa57_extra_costs,
1300   &generic_addrcost_table,
1301   &cortexa57_regmove_cost,
1302   &cortexa57_vector_cost,
1303   &generic_branch_cost,
1304   &generic_approx_modes,
1305   SVE_NOT_IMPLEMENTED, /* sve_width  */
1306   4, /* memmov_cost  */
1307   3, /* issue_rate  */
1308   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1309    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1310   "16",	/* function_align.  */
1311   "4",	/* jump_align.  */
1312   "8",	/* loop_align.  */
1313   2,	/* int_reassoc_width.  */
1314   4,	/* fp_reassoc_width.  */
1315   1,	/* vec_reassoc_width.  */
1316   2,	/* min_div_recip_mul_sf.  */
1317   2,	/* min_div_recip_mul_df.  */
1318   0,	/* max_case_values.  */
1319   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1320   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
1321   &generic_prefetch_tune
1322 };
1323 
1324 static const struct tune_params cortexa72_tunings =
1325 {
1326   &cortexa57_extra_costs,
1327   &generic_addrcost_table,
1328   &cortexa57_regmove_cost,
1329   &cortexa57_vector_cost,
1330   &generic_branch_cost,
1331   &generic_approx_modes,
1332   SVE_NOT_IMPLEMENTED, /* sve_width  */
1333   4, /* memmov_cost  */
1334   3, /* issue_rate  */
1335   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1336    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1337   "16",	/* function_align.  */
1338   "4",	/* jump_align.  */
1339   "8",	/* loop_align.  */
1340   2,	/* int_reassoc_width.  */
1341   4,	/* fp_reassoc_width.  */
1342   1,	/* vec_reassoc_width.  */
1343   2,	/* min_div_recip_mul_sf.  */
1344   2,	/* min_div_recip_mul_df.  */
1345   0,	/* max_case_values.  */
1346   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1347   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1348   &generic_prefetch_tune
1349 };
1350 
1351 static const struct tune_params cortexa73_tunings =
1352 {
1353   &cortexa57_extra_costs,
1354   &generic_addrcost_table,
1355   &cortexa57_regmove_cost,
1356   &cortexa57_vector_cost,
1357   &generic_branch_cost,
1358   &generic_approx_modes,
1359   SVE_NOT_IMPLEMENTED, /* sve_width  */
1360   4, /* memmov_cost.  */
1361   2, /* issue_rate.  */
1362   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1363    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1364   "16",	/* function_align.  */
1365   "4",	/* jump_align.  */
1366   "8",	/* loop_align.  */
1367   2,	/* int_reassoc_width.  */
1368   4,	/* fp_reassoc_width.  */
1369   1,	/* vec_reassoc_width.  */
1370   2,	/* min_div_recip_mul_sf.  */
1371   2,	/* min_div_recip_mul_df.  */
1372   0,	/* max_case_values.  */
1373   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1374   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1375   &generic_prefetch_tune
1376 };
1377 
1378 
1379 
1380 static const struct tune_params exynosm1_tunings =
1381 {
1382   &exynosm1_extra_costs,
1383   &exynosm1_addrcost_table,
1384   &exynosm1_regmove_cost,
1385   &exynosm1_vector_cost,
1386   &generic_branch_cost,
1387   &exynosm1_approx_modes,
1388   SVE_NOT_IMPLEMENTED, /* sve_width  */
1389   4,	/* memmov_cost  */
1390   3,	/* issue_rate  */
1391   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
1392   "4",	/* function_align.  */
1393   "4",	/* jump_align.  */
1394   "4",	/* loop_align.  */
1395   2,	/* int_reassoc_width.  */
1396   4,	/* fp_reassoc_width.  */
1397   1,	/* vec_reassoc_width.  */
1398   2,	/* min_div_recip_mul_sf.  */
1399   2,	/* min_div_recip_mul_df.  */
1400   48,	/* max_case_values.  */
1401   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
1402   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
1403   &exynosm1_prefetch_tune
1404 };
1405 
1406 static const struct tune_params thunderxt88_tunings =
1407 {
1408   &thunderx_extra_costs,
1409   &generic_addrcost_table,
1410   &thunderx_regmove_cost,
1411   &thunderx_vector_cost,
1412   &generic_branch_cost,
1413   &generic_approx_modes,
1414   SVE_NOT_IMPLEMENTED, /* sve_width  */
1415   6, /* memmov_cost  */
1416   2, /* issue_rate  */
1417   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1418   "8",	/* function_align.  */
1419   "8",	/* jump_align.  */
1420   "8",	/* loop_align.  */
1421   2,	/* int_reassoc_width.  */
1422   4,	/* fp_reassoc_width.  */
1423   1,	/* vec_reassoc_width.  */
1424   2,	/* min_div_recip_mul_sf.  */
1425   2,	/* min_div_recip_mul_df.  */
1426   0,	/* max_case_values.  */
1427   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1428   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
1429   &thunderxt88_prefetch_tune
1430 };
1431 
1432 static const struct tune_params thunderx_tunings =
1433 {
1434   &thunderx_extra_costs,
1435   &generic_addrcost_table,
1436   &thunderx_regmove_cost,
1437   &thunderx_vector_cost,
1438   &generic_branch_cost,
1439   &generic_approx_modes,
1440   SVE_NOT_IMPLEMENTED, /* sve_width  */
1441   6, /* memmov_cost  */
1442   2, /* issue_rate  */
1443   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1444   "8",	/* function_align.  */
1445   "8",	/* jump_align.  */
1446   "8",	/* loop_align.  */
1447   2,	/* int_reassoc_width.  */
1448   4,	/* fp_reassoc_width.  */
1449   1,	/* vec_reassoc_width.  */
1450   2,	/* min_div_recip_mul_sf.  */
1451   2,	/* min_div_recip_mul_df.  */
1452   0,	/* max_case_values.  */
1453   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1454   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1455    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
1456   &thunderx_prefetch_tune
1457 };
1458 
1459 static const struct tune_params tsv110_tunings =
1460 {
1461   &tsv110_extra_costs,
1462   &tsv110_addrcost_table,
1463   &tsv110_regmove_cost,
1464   &tsv110_vector_cost,
1465   &generic_branch_cost,
1466   &generic_approx_modes,
1467   SVE_NOT_IMPLEMENTED, /* sve_width  */
1468   4,    /* memmov_cost  */
1469   4,    /* issue_rate  */
1470   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1471    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1472   "16", /* function_align.  */
1473   "4",  /* jump_align.  */
1474   "8",  /* loop_align.  */
1475   2,    /* int_reassoc_width.  */
1476   4,    /* fp_reassoc_width.  */
1477   1,    /* vec_reassoc_width.  */
1478   2,    /* min_div_recip_mul_sf.  */
1479   2,    /* min_div_recip_mul_df.  */
1480   0,    /* max_case_values.  */
1481   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1482   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
1483   &tsv110_prefetch_tune
1484 };
1485 
1486 static const struct tune_params xgene1_tunings =
1487 {
1488   &xgene1_extra_costs,
1489   &xgene1_addrcost_table,
1490   &xgene1_regmove_cost,
1491   &xgene1_vector_cost,
1492   &generic_branch_cost,
1493   &xgene1_approx_modes,
1494   SVE_NOT_IMPLEMENTED, /* sve_width  */
1495   6, /* memmov_cost  */
1496   4, /* issue_rate  */
1497   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1498   "16",	/* function_align.  */
1499   "16",	/* jump_align.  */
1500   "16",	/* loop_align.  */
1501   2,	/* int_reassoc_width.  */
1502   4,	/* fp_reassoc_width.  */
1503   1,	/* vec_reassoc_width.  */
1504   2,	/* min_div_recip_mul_sf.  */
1505   2,	/* min_div_recip_mul_df.  */
1506   17,	/* max_case_values.  */
1507   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1508   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1509   &xgene1_prefetch_tune
1510 };
1511 
1512 static const struct tune_params emag_tunings =
1513 {
1514   &xgene1_extra_costs,
1515   &xgene1_addrcost_table,
1516   &xgene1_regmove_cost,
1517   &xgene1_vector_cost,
1518   &generic_branch_cost,
1519   &xgene1_approx_modes,
1520   SVE_NOT_IMPLEMENTED,
1521   6, /* memmov_cost  */
1522   4, /* issue_rate  */
1523   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1524   "16",	/* function_align.  */
1525   "16",	/* jump_align.  */
1526   "16",	/* loop_align.  */
1527   2,	/* int_reassoc_width.  */
1528   4,	/* fp_reassoc_width.  */
1529   1,	/* vec_reassoc_width.  */
1530   2,	/* min_div_recip_mul_sf.  */
1531   2,	/* min_div_recip_mul_df.  */
1532   17,	/* max_case_values.  */
1533   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1534   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1535   &xgene1_prefetch_tune
1536 };
1537 
1538 static const struct tune_params qdf24xx_tunings =
1539 {
1540   &qdf24xx_extra_costs,
1541   &qdf24xx_addrcost_table,
1542   &qdf24xx_regmove_cost,
1543   &qdf24xx_vector_cost,
1544   &generic_branch_cost,
1545   &generic_approx_modes,
1546   SVE_NOT_IMPLEMENTED, /* sve_width  */
1547   4, /* memmov_cost  */
1548   4, /* issue_rate  */
1549   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1550    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1551   "16",	/* function_align.  */
1552   "8",	/* jump_align.  */
1553   "16",	/* loop_align.  */
1554   2,	/* int_reassoc_width.  */
1555   4,	/* fp_reassoc_width.  */
1556   1,	/* vec_reassoc_width.  */
1557   2,	/* min_div_recip_mul_sf.  */
1558   2,	/* min_div_recip_mul_df.  */
1559   0,	/* max_case_values.  */
1560   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1561   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1562   &qdf24xx_prefetch_tune
1563 };
1564 
1565 /* Tuning structure for the Qualcomm Saphira core.  For now this mostly
1566    uses the generic cost tables rather than Saphira-specific values.  */
1567 static const struct tune_params saphira_tunings =
1568 {
1569   &generic_extra_costs,
1570   &generic_addrcost_table,
1571   &generic_regmove_cost,
1572   &generic_vector_cost,
1573   &generic_branch_cost,
1574   &generic_approx_modes,
1575   SVE_NOT_IMPLEMENTED, /* sve_width  */
1576   4, /* memmov_cost  */
1577   4, /* issue_rate  */
1578   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1579    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1580   "16",	/* function_align.  */
1581   "8",	/* jump_align.  */
1582   "16",	/* loop_align.  */
1583   2,	/* int_reassoc_width.  */
1584   4,	/* fp_reassoc_width.  */
1585   1,	/* vec_reassoc_width.  */
1586   2,	/* min_div_recip_mul_sf.  */
1587   2,	/* min_div_recip_mul_df.  */
1588   0,	/* max_case_values.  */
1589   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1590   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
1591   &generic_prefetch_tune
1592 };
1593 
1594 static const struct tune_params thunderx2t99_tunings =
1595 {
1596   &thunderx2t99_extra_costs,
1597   &thunderx2t99_addrcost_table,
1598   &thunderx2t99_regmove_cost,
1599   &thunderx2t99_vector_cost,
1600   &generic_branch_cost,
1601   &generic_approx_modes,
1602   SVE_NOT_IMPLEMENTED, /* sve_width  */
1603   4, /* memmov_cost.  */
1604   4, /* issue_rate.  */
1605   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1606    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1607   "16",	/* function_align.  */
1608   "8",	/* jump_align.  */
1609   "16",	/* loop_align.  */
1610   3,	/* int_reassoc_width.  */
1611   2,	/* fp_reassoc_width.  */
1612   2,	/* vec_reassoc_width.  */
1613   2,	/* min_div_recip_mul_sf.  */
1614   2,	/* min_div_recip_mul_df.  */
1615   0,	/* max_case_values.  */
1616   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1617   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1618   &thunderx2t99_prefetch_tune
1619 };
1620 
1621 static const struct tune_params thunderx3t110_tunings =
1622 {
1623   &thunderx3t110_extra_costs,
1624   &thunderx3t110_addrcost_table,
1625   &thunderx3t110_regmove_cost,
1626   &thunderx3t110_vector_cost,
1627   &generic_branch_cost,
1628   &generic_approx_modes,
1629   SVE_NOT_IMPLEMENTED, /* sve_width  */
1630   4, /* memmov_cost.  */
1631   6, /* issue_rate.  */
1632   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1633    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1634   "16",	/* function_align.  */
1635   "8",	/* jump_align.  */
1636   "16",	/* loop_align.  */
1637   3,	/* int_reassoc_width.  */
1638   2,	/* fp_reassoc_width.  */
1639   2,	/* vec_reassoc_width.  */
1640   2,	/* min_div_recip_mul_sf.  */
1641   2,	/* min_div_recip_mul_df.  */
1642   0,	/* max_case_values.  */
1643   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1644   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1645   &thunderx3t110_prefetch_tune
1646 };
1647 
1648 static const struct tune_params neoversen1_tunings =
1649 {
1650   &cortexa76_extra_costs,
1651   &generic_addrcost_table,
1652   &generic_regmove_cost,
1653   &cortexa57_vector_cost,
1654   &generic_branch_cost,
1655   &generic_approx_modes,
1656   SVE_NOT_IMPLEMENTED, /* sve_width  */
1657   4, /* memmov_cost  */
1658   3, /* issue_rate  */
1659   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1660   "32:16",	/* function_align.  */
1661   "4",		/* jump_align.  */
1662   "32:16",	/* loop_align.  */
1663   2,	/* int_reassoc_width.  */
1664   4,	/* fp_reassoc_width.  */
1665   2,	/* vec_reassoc_width.  */
1666   2,	/* min_div_recip_mul_sf.  */
1667   2,	/* min_div_recip_mul_df.  */
1668   0,	/* max_case_values.  */
1669   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1670   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1671   &generic_prefetch_tune
1672 };
1673 
1674 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1675 {
1676   2, /* int_stmt_cost  */
1677   2, /* fp_stmt_cost  */
1678   4, /* ld2_st2_permute_cost */
1679   4, /* ld3_st3_permute_cost  */
1680   5, /* ld4_st4_permute_cost  */
1681   3, /* permute_cost  */
1682   4, /* reduc_i8_cost  */
1683   4, /* reduc_i16_cost  */
1684   2, /* reduc_i32_cost  */
1685   2, /* reduc_i64_cost  */
1686   6, /* reduc_f16_cost  */
1687   3, /* reduc_f32_cost  */
1688   2, /* reduc_f64_cost  */
1689   2, /* store_elt_extra_cost  */
1690   /* This value is just inherited from the Cortex-A57 table.  */
1691   8, /* vec_to_scalar_cost  */
1692   /* This depends very much on what the scalar value is and
1693      where it comes from.  E.g. some constants take two dependent
1694      instructions or a load, while others might be moved from a GPR.
1695      4 seems to be a reasonable compromise in practice.  */
1696   4, /* scalar_to_vec_cost  */
1697   4, /* align_load_cost  */
1698   4, /* unalign_load_cost  */
1699   /* Although stores have a latency of 2 and compete for the
1700      vector pipes, in practice it's better not to model that.  */
1701   1, /* unalign_store_cost  */
1702   1  /* store_cost  */
1703 };
1704 
1705 static const sve_vec_cost neoversev1_sve_vector_cost =
1706 {
1707   {
1708     2, /* int_stmt_cost  */
1709     2, /* fp_stmt_cost  */
1710     4, /* ld2_st2_permute_cost  */
1711     7, /* ld3_st3_permute_cost  */
1712     8, /* ld4_st4_permute_cost  */
1713     3, /* permute_cost  */
1714     /* Theoretically, a reduction involving 31 scalar ADDs could
1715        complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
1716        completes in 14 cycles, so give it a cost of 31 + 5.  */
1717     36, /* reduc_i8_cost  */
1718     /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
1719     22, /* reduc_i16_cost  */
1720     /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
1721     14, /* reduc_i32_cost  */
1722     /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
1723     11, /* reduc_i64_cost  */
1724     /* Theoretically, a reduction involving 15 scalar FADDs could
1725        complete in ~9 cycles and would have a cost of 30.  FADDV
1726        completes in 13 cycles, so give it a cost of 30 + 4.  */
1727     34, /* reduc_f16_cost  */
1728     /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
1729     19, /* reduc_f32_cost  */
1730     /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
1731     11, /* reduc_f64_cost  */
1732     2, /* store_elt_extra_cost  */
1733     /* This value is just inherited from the Cortex-A57 table.  */
1734     8, /* vec_to_scalar_cost  */
1735     /* See the comment above the Advanced SIMD versions.  */
1736     4, /* scalar_to_vec_cost  */
1737     4, /* align_load_cost  */
1738     4, /* unalign_load_cost  */
1739     /* Although stores have a latency of 2 and compete for the
1740        vector pipes, in practice it's better not to model that.  */
1741     1, /* unalign_store_cost  */
1742     1  /* store_cost  */
1743   },
1744   3, /* clast_cost  */
1745   19, /* fadda_f16_cost  */
1746   11, /* fadda_f32_cost  */
1747   8, /* fadda_f64_cost  */
1748   32, /* gather_load_x32_cost  */
1749   16, /* gather_load_x64_cost  */
1750   3 /* scatter_store_elt_cost  */
1751 };
1752 
1753 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
1754 {
1755   3, /* loads_stores_per_cycle  */
1756   2, /* stores_per_cycle  */
1757   4, /* general_ops_per_cycle  */
1758   0, /* fp_simd_load_general_ops  */
1759   1 /* fp_simd_store_general_ops  */
1760 };
1761 
1762 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
1763 {
1764   {
1765     3, /* loads_stores_per_cycle  */
1766     2, /* stores_per_cycle  */
1767     4, /* general_ops_per_cycle  */
1768     0, /* fp_simd_load_general_ops  */
1769     1 /* fp_simd_store_general_ops  */
1770   },
1771   2, /* ld2_st2_general_ops  */
1772   2, /* ld3_st3_general_ops  */
1773   3 /* ld4_st4_general_ops  */
1774 };
1775 
1776 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
1777 {
1778   {
1779     {
1780       2, /* loads_per_cycle  */
1781       2, /* stores_per_cycle  */
1782       2, /* general_ops_per_cycle  */
1783       0, /* fp_simd_load_general_ops  */
1784       1 /* fp_simd_store_general_ops  */
1785     },
1786     2, /* ld2_st2_general_ops  */
1787     2, /* ld3_st3_general_ops  */
1788     3 /* ld4_st4_general_ops  */
1789   },
1790   1, /* pred_ops_per_cycle  */
1791   2, /* while_pred_ops  */
1792   2, /* int_cmp_pred_ops  */
1793   1, /* fp_cmp_pred_ops  */
1794   1, /* gather_scatter_pair_general_ops  */
1795   1 /* gather_scatter_pair_pred_ops  */
1796 };
1797 
1798 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
1799 {
1800   &neoversev1_scalar_issue_info,
1801   &neoversev1_advsimd_issue_info,
1802   &neoversev1_sve_issue_info
1803 };
1804 
1805 /* Neoverse V1 costs for vector insn classes.  */
1806 static const struct cpu_vector_cost neoversev1_vector_cost =
1807 {
1808   1, /* scalar_int_stmt_cost  */
1809   2, /* scalar_fp_stmt_cost  */
1810   4, /* scalar_load_cost  */
1811   1, /* scalar_store_cost  */
1812   1, /* cond_taken_branch_cost  */
1813   1, /* cond_not_taken_branch_cost  */
1814   &neoversev1_advsimd_vector_cost, /* advsimd  */
1815   &neoversev1_sve_vector_cost, /* sve  */
1816   &neoversev1_vec_issue_info /* issue_info  */
1817 };
1818 
1819 static const struct tune_params neoversev1_tunings =
1820 {
1821   &cortexa76_extra_costs,
1822   &neoversev1_addrcost_table,
1823   &generic_regmove_cost,
1824   &neoversev1_vector_cost,
1825   &generic_branch_cost,
1826   &generic_approx_modes,
1827   SVE_256, /* sve_width  */
1828   4, /* memmov_cost  */
1829   3, /* issue_rate  */
1830   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1831   "32:16",	/* function_align.  */
1832   "4",		/* jump_align.  */
1833   "32:16",	/* loop_align.  */
1834   2,	/* int_reassoc_width.  */
1835   4,	/* fp_reassoc_width.  */
1836   2,	/* vec_reassoc_width.  */
1837   2,	/* min_div_recip_mul_sf.  */
1838   2,	/* min_div_recip_mul_df.  */
1839   0,	/* max_case_values.  */
1840   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1841   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
1842    | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
1843    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
1844   &generic_prefetch_tune
1845 };
1846 
1847 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
1848 {
1849   {
1850     2, /* int_stmt_cost  */
1851     2, /* fp_stmt_cost  */
1852     4, /* ld2_st2_permute_cost  */
1853     5, /* ld3_st3_permute_cost  */
1854     5, /* ld4_st4_permute_cost  */
1855     3, /* permute_cost  */
1856     /* Theoretically, a reduction involving 15 scalar ADDs could
1857        complete in ~5 cycles and would have a cost of 15.  Assume that
1858        [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
1859     21, /* reduc_i8_cost  */
1860     /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
1861     13, /* reduc_i16_cost  */
1862     /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
1863     9, /* reduc_i32_cost  */
1864     /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
1865     8, /* reduc_i64_cost  */
1866     /* Theoretically, a reduction involving 7 scalar FADDs could
1867        complete in ~6 cycles and would have a cost of 14.  Assume that
1868        FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
1869     16, /* reduc_f16_cost  */
1870     /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
1871     8, /* reduc_f32_cost  */
1872     /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
1873     4, /* reduc_f64_cost  */
1874     2, /* store_elt_extra_cost  */
1875     /* This value is just inherited from the Cortex-A57 table.  */
1876     8, /* vec_to_scalar_cost  */
1877     /* This depends very much on what the scalar value is and
1878        where it comes from.  E.g. some constants take two dependent
1879        instructions or a load, while others might be moved from a GPR.
1880        4 seems to be a reasonable compromise in practice.  */
1881     4, /* scalar_to_vec_cost  */
1882     4, /* align_load_cost  */
1883     4, /* unalign_load_cost  */
1884     /* Although stores generally have a latency of 2 and compete for the
1885        vector pipes, in practice it's better not to model that.  */
1886     1, /* unalign_store_cost  */
1887     1  /* store_cost  */
1888   },
1889   3, /* clast_cost  */
1890   10, /* fadda_f16_cost  */
1891   6, /* fadda_f32_cost  */
1892   4, /* fadda_f64_cost  */
1893   /* A strided Advanced SIMD x64 load would take two parallel FP loads
1894      (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
1895      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
1896      (cost 8) and a vec_construct (cost 2).  Add a full vector operation
1897      (cost 2) to that, to avoid the difference being lost in rounding.
1898 
1899      There is no easy comparison between a strided Advanced SIMD x32 load
1900      and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
1901      operation more than a 64-bit gather.  */
1902   14, /* gather_load_x32_cost  */
1903   12, /* gather_load_x64_cost  */
1904   3 /* scatter_store_elt_cost  */
1905 };
1906 
1907 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
1908 {
1909   {
1910     {
1911       3, /* loads_per_cycle  */
1912       2, /* stores_per_cycle  */
1913       4, /* general_ops_per_cycle  */
1914       0, /* fp_simd_load_general_ops  */
1915       1 /* fp_simd_store_general_ops  */
1916     },
1917     2, /* ld2_st2_general_ops  */
1918     2, /* ld3_st3_general_ops  */
1919     3 /* ld4_st4_general_ops  */
1920   },
1921   2, /* pred_ops_per_cycle  */
1922   2, /* while_pred_ops  */
1923   2, /* int_cmp_pred_ops  */
1924   1, /* fp_cmp_pred_ops  */
1925   1, /* gather_scatter_pair_general_ops  */
1926   1 /* gather_scatter_pair_pred_ops  */
1927 };
1928 
1929 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
1930 {
1931   &neoversev1_scalar_issue_info,
1932   &neoversev1_advsimd_issue_info,
1933   &neoverse512tvb_sve_issue_info
1934 };
1935 
1936 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
1937 {
1938   1, /* scalar_int_stmt_cost  */
1939   2, /* scalar_fp_stmt_cost  */
1940   4, /* scalar_load_cost  */
1941   1, /* scalar_store_cost  */
1942   1, /* cond_taken_branch_cost  */
1943   1, /* cond_not_taken_branch_cost  */
1944   &neoversev1_advsimd_vector_cost, /* advsimd  */
1945   &neoverse512tvb_sve_vector_cost, /* sve  */
1946   &neoverse512tvb_vec_issue_info /* issue_info  */
1947 };
1948 
1949 static const struct tune_params neoverse512tvb_tunings =
1950 {
1951   &cortexa76_extra_costs,
1952   &neoversev1_addrcost_table,
1953   &generic_regmove_cost,
1954   &neoverse512tvb_vector_cost,
1955   &generic_branch_cost,
1956   &generic_approx_modes,
1957   SVE_128 | SVE_256, /* sve_width  */
1958   4, /* memmov_cost  */
1959   3, /* issue_rate  */
1960   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1961   "32:16",	/* function_align.  */
1962   "4",		/* jump_align.  */
1963   "32:16",	/* loop_align.  */
1964   2,	/* int_reassoc_width.  */
1965   4,	/* fp_reassoc_width.  */
1966   2,	/* vec_reassoc_width.  */
1967   2,	/* min_div_recip_mul_sf.  */
1968   2,	/* min_div_recip_mul_df.  */
1969   0,	/* max_case_values.  */
1970   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1971   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
1972    | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
1973    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
1974   &generic_prefetch_tune
1975 };
1976 
1977 static const struct tune_params neoversen2_tunings =
1978 {
1979   &cortexa76_extra_costs,
1980   &generic_addrcost_table,
1981   &generic_regmove_cost,
1982   &cortexa57_vector_cost,
1983   &generic_branch_cost,
1984   &generic_approx_modes,
1985   SVE_128, /* sve_width  */
1986   4, /* memmov_cost  */
1987   3, /* issue_rate  */
1988   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1989   "32:16",	/* function_align.  */
1990   "4",		/* jump_align.  */
1991   "32:16",	/* loop_align.  */
1992   2,	/* int_reassoc_width.  */
1993   4,	/* fp_reassoc_width.  */
1994   2,	/* vec_reassoc_width.  */
1995   2,	/* min_div_recip_mul_sf.  */
1996   2,	/* min_div_recip_mul_df.  */
1997   0,	/* max_case_values.  */
1998   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1999   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
2000   &generic_prefetch_tune
2001 };
2002 
2003 static const struct tune_params a64fx_tunings =
2004 {
2005   &a64fx_extra_costs,
2006   &a64fx_addrcost_table,
2007   &a64fx_regmove_cost,
2008   &a64fx_vector_cost,
2009   &generic_branch_cost,
2010   &generic_approx_modes,
2011   SVE_512, /* sve_width  */
2012   4, /* memmov_cost  */
2013   7, /* issue_rate  */
2014   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
2015   "32",	/* function_align.  */
2016   "16",	/* jump_align.  */
2017   "32",	/* loop_align.  */
2018   4,	/* int_reassoc_width.  */
2019   2,	/* fp_reassoc_width.  */
2020   2,	/* vec_reassoc_width.  */
2021   2,	/* min_div_recip_mul_sf.  */
2022   2,	/* min_div_recip_mul_df.  */
2023   0,	/* max_case_values.  */
2024   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
2025   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
2026   &a64fx_prefetch_tune
2027 };
2028 
2029 /* Support for fine-grained override of the tuning structures.  */
2030 struct aarch64_tuning_override_function
2031 {
2032   const char* name;
2033   void (*parse_override)(const char*, struct tune_params*);
2034 };
2035 
2036 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2037 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2038 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2039 
2040 static const struct aarch64_tuning_override_function
2041 aarch64_tuning_override_functions[] =
2042 {
2043   { "fuse", aarch64_parse_fuse_string },
2044   { "tune", aarch64_parse_tune_string },
2045   { "sve_width", aarch64_parse_sve_width_string },
2046   { NULL, NULL }
2047 };
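
/* Illustrative example (not part of GCC itself): these entries back the
   -moverride debugging option, e.g.

     -moverride=sve_width=256

   hands the value "256" to aarch64_parse_sve_width_string via the
   "sve_width" entry above, and tune= / fuse= strings are dispatched the
   same way through the other two entries.  */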
2048 
2049 /* A processor implementing AArch64.  */
2050 struct processor
2051 {
2052   const char *const name;
2053   enum aarch64_processor ident;
2054   enum aarch64_processor sched_core;
2055   enum aarch64_arch arch;
2056   unsigned architecture_version;
2057   const uint64_t flags;
2058   const struct tune_params *const tune;
2059 };
2060 
2061 /* Architectures implementing AArch64.  */
2062 static const struct processor all_architectures[] =
2063 {
2064 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
2065   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
2066 #include "aarch64-arches.def"
2067   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
2068 };
2069 
2070 /* Processor cores implementing AArch64.  */
2071 static const struct processor all_cores[] =
2072 {
2073 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
2074   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
2075   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
2076   FLAGS, &COSTS##_tunings},
2077 #include "aarch64-cores.def"
2078   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
2079     AARCH64_FL_FOR_ARCH8, &generic_tunings},
2080   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
2081 };
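
/* Illustrative sketch (argument values elided): an entry in
   aarch64-cores.def such as

     AARCH64_CORE ("neoverse-v1", neoversev1, <sched>, <arch>,
		   <flags>, neoversev1, <imp>, <part>, <variant>)

   expands through the AARCH64_CORE macro above into a processor entry
   whose tune field points at neoversev1_tunings and whose
   architecture_version is copied from the matching all_architectures
   entry.  */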
2082 
2083 
2084 /* Target specification.  These are populated by the -march, -mtune, -mcpu
2085    handling code or by target attributes.  */
2086 static const struct processor *selected_arch;
2087 static const struct processor *selected_cpu;
2088 static const struct processor *selected_tune;
2089 
2090 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
2091 
2092 /* The current tuning set.  */
2093 struct tune_params aarch64_tune_params = generic_tunings;
2094 
2095 /* Check whether an 'aarch64_vector_pcs' attribute is valid.  */
2096 
2097 static tree
handle_aarch64_vector_pcs_attribute(tree * node,tree name,tree,int,bool * no_add_attrs)2098 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2099 				     int, bool *no_add_attrs)
2100 {
2101   /* Since we set fn_type_req to true, the caller should have checked
2102      this for us.  */
2103   gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2104   switch ((arm_pcs) fntype_abi (*node).id ())
2105     {
2106     case ARM_PCS_AAPCS64:
2107     case ARM_PCS_SIMD:
2108       return NULL_TREE;
2109 
2110     case ARM_PCS_SVE:
2111       error ("the %qE attribute cannot be applied to an SVE function type",
2112 	     name);
2113       *no_add_attrs = true;
2114       return NULL_TREE;
2115 
2116     case ARM_PCS_TLSDESC:
2117     case ARM_PCS_UNKNOWN:
2118       break;
2119     }
2120   gcc_unreachable ();
2121 }
2122 
2123 /* Table of machine attributes.  */
2124 static const struct attribute_spec aarch64_attribute_table[] =
2125 {
2126   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2127        affects_type_identity, handler, exclude } */
2128   { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,
2129 			  handle_aarch64_vector_pcs_attribute, NULL },
2130   { "arm_sve_vector_bits", 1, 1, false, true,  false, true,
2131 			  aarch64_sve::handle_arm_sve_vector_bits_attribute,
2132 			  NULL },
2133   { "Advanced SIMD type", 1, 1, false, true,  false, true,  NULL, NULL },
2134   { "SVE type",		  3, 3, false, true,  false, true,  NULL, NULL },
2135   { "SVE sizeless type",  0, 0, false, true,  false, true,  NULL, NULL },
2136   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
2137 };
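
/* Illustrative example (user-level code, not part of GCC): the first two
   entries above are what make declarations like these work:

     __attribute__ ((aarch64_vector_pcs))
     void vector_callee (void);

     #include <arm_sve.h>
     #if __ARM_FEATURE_SVE_BITS == 256
     typedef svint32_t fixed_int32_t
       __attribute__ ((arm_sve_vector_bits (256)));
     #endif

   The remaining entries ("Advanced SIMD type", "SVE type" and
   "SVE sizeless type") are internal attributes that the front ends
   attach to the built-in vector types rather than attributes that users
   write themselves.  */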
2138 
2139 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
2140 
2141 /* An ISA extension in the co-processor and main instruction set space.  */
2142 struct aarch64_option_extension
2143 {
2144   const char *const name;
2145   const unsigned long flags_on;
2146   const unsigned long flags_off;
2147 };
2148 
2149 typedef enum aarch64_cond_code
2150 {
2151   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2152   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2153   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2154 }
2155 aarch64_cc;
2156 
2157 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
2158 
2159 struct aarch64_branch_protect_type
2160 {
2161   /* The type's name that the user passes to the branch-protection option
2162     string.  */
2163   const char* name;
2164   /* Function to handle the protection type and set global variables.
2165     The first argument is the string token corresponding to this type and
2166     the second argument is the next token in the option string.
2167     Return values:
2168     * AARCH64_PARSE_OK: Handling was successful.
2169     * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and
2170       the caller should print an error.
2171     * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
2172       prints its own error.  */
2173   enum aarch64_parse_opt_result (*handler)(char*, char*);
2174   /* A list of types that can follow this type in the option string.  */
2175   const aarch64_branch_protect_type* subtypes;
2176   unsigned int num_subtypes;
2177 };
2178 
2179 static enum aarch64_parse_opt_result
2180 aarch64_handle_no_branch_protection (char* str, char* rest)
2181 {
2182   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2183   aarch64_enable_bti = 0;
2184   if (rest)
2185     {
2186       error ("unexpected %<%s%> after %<%s%>", rest, str);
2187       return AARCH64_PARSE_INVALID_FEATURE;
2188     }
2189   return AARCH64_PARSE_OK;
2190 }
2191 
2192 static enum aarch64_parse_opt_result
2193 aarch64_handle_standard_branch_protection (char* str, char* rest)
2194 {
2195   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2196   aarch64_ra_sign_key = AARCH64_KEY_A;
2197   aarch64_enable_bti = 1;
2198   if (rest)
2199     {
2200       error ("unexpected %<%s%> after %<%s%>", rest, str);
2201       return AARCH64_PARSE_INVALID_FEATURE;
2202     }
2203   return AARCH64_PARSE_OK;
2204 }
2205 
2206 static enum aarch64_parse_opt_result
2207 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2208 				    char* rest ATTRIBUTE_UNUSED)
2209 {
2210   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2211   aarch64_ra_sign_key = AARCH64_KEY_A;
2212   return AARCH64_PARSE_OK;
2213 }
2214 
2215 static enum aarch64_parse_opt_result
2216 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2217 			      char* rest ATTRIBUTE_UNUSED)
2218 {
2219   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2220   return AARCH64_PARSE_OK;
2221 }
2222 
2223 static enum aarch64_parse_opt_result
2224 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2225 			      char* rest ATTRIBUTE_UNUSED)
2226 {
2227   aarch64_ra_sign_key = AARCH64_KEY_B;
2228   return AARCH64_PARSE_OK;
2229 }
2230 
2231 static enum aarch64_parse_opt_result
2232 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2233 				    char* rest ATTRIBUTE_UNUSED)
2234 {
2235   aarch64_enable_bti = 1;
2236   return AARCH64_PARSE_OK;
2237 }
2238 
2239 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2240   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2241   { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2242   { NULL, NULL, NULL, 0 }
2243 };
2244 
2245 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2246   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2247   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2248   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2249     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2250   { "bti", aarch64_handle_bti_protection, NULL, 0 },
2251   { NULL, NULL, NULL, 0 }
2252 };
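
/* Illustrative walk-through (not part of GCC): parsing
   "-mbranch-protection=pac-ret+leaf+b-key" visits the tables above as

     "pac-ret" -> aarch64_handle_pac_ret_protection
		  (sign return addresses in non-leaf functions, key A)
     "leaf"    -> aarch64_handle_pac_ret_leaf  (extend signing to all
		  functions, including leaf functions)
     "b-key"   -> aarch64_handle_pac_ret_b_key (use key B instead)

   while "standard" enables both pac-ret (key A, non-leaf) and BTI in a
   single step.  */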
2253 
2254 /* The condition codes of the processor, and the inverse function.  */
2255 static const char * const aarch64_condition_codes[] =
2256 {
2257   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2258   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2259 };
2260 
2261 /* The preferred condition codes for SVE conditions.  */
2262 static const char *const aarch64_sve_condition_codes[] =
2263 {
2264   "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2265   "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2266 };
2267 
2268 /* Return the assembly token for svpattern value PATTERN.  */
2269 
2270 static const char *
2271 svpattern_token (enum aarch64_svpattern pattern)
2272 {
2273   switch (pattern)
2274     {
2275 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2276     AARCH64_FOR_SVPATTERN (CASE)
2277 #undef CASE
2278     case AARCH64_NUM_SVPATTERNS:
2279       break;
2280     }
2281   gcc_unreachable ();
2282 }
2283 
2284 /* Return the location of a piece that is known to be passed or returned
2285    in registers.  FIRST_ZR is the first unused vector argument register
2286    and FIRST_PR is the first unused predicate argument register.  */
2287 
2288 rtx
2289 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2290 					 unsigned int first_pr) const
2291 {
2292   gcc_assert (VECTOR_MODE_P (mode)
2293 	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2294 	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2295 
2296   if (num_zr > 0 && num_pr == 0)
2297     return gen_rtx_REG (mode, first_zr);
2298 
2299   if (num_zr == 0 && num_pr == 1)
2300     return gen_rtx_REG (mode, first_pr);
2301 
2302   gcc_unreachable ();
2303 }
2304 
2305 /* Return the total number of vector registers required by the PST.  */
2306 
2307 unsigned int
2308 pure_scalable_type_info::num_zr () const
2309 {
2310   unsigned int res = 0;
2311   for (unsigned int i = 0; i < pieces.length (); ++i)
2312     res += pieces[i].num_zr;
2313   return res;
2314 }
2315 
2316 /* Return the total number of predicate registers required by the PST.  */
2317 
2318 unsigned int
2319 pure_scalable_type_info::num_pr () const
2320 {
2321   unsigned int res = 0;
2322   for (unsigned int i = 0; i < pieces.length (); ++i)
2323     res += pieces[i].num_pr;
2324   return res;
2325 }
2326 
2327 /* Return the location of a PST that is known to be passed or returned
2328    in registers.  FIRST_ZR is the first unused vector argument register
2329    and FIRST_PR is the first unused predicate argument register.  */
2330 
2331 rtx
2332 pure_scalable_type_info::get_rtx (machine_mode mode,
2333 				  unsigned int first_zr,
2334 				  unsigned int first_pr) const
2335 {
2336   /* Try to return a single REG if possible.  This leads to better
2337      code generation; it isn't required for correctness.  */
2338   if (mode == pieces[0].mode)
2339     {
2340       gcc_assert (pieces.length () == 1);
2341       return pieces[0].get_rtx (first_zr, first_pr);
2342     }
2343 
2344   /* Build up a PARALLEL that contains the individual pieces.  */
2345   rtvec rtxes = rtvec_alloc (pieces.length ());
2346   for (unsigned int i = 0; i < pieces.length (); ++i)
2347     {
2348       rtx reg = pieces[i].get_rtx (first_zr, first_pr);
2349       rtx offset = gen_int_mode (pieces[i].offset, Pmode);
2350       RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
2351       first_zr += pieces[i].num_zr;
2352       first_pr += pieces[i].num_pr;
2353     }
2354   return gen_rtx_PARALLEL (mode, rtxes);
2355 }
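
/* Illustrative sketch (hypothetical user type): for

     struct pst { svfloat32_t vec; svbool_t pred; };

   analyze_record later in this file records two pieces, one vector and
   one predicate, which cannot be folded into a single mode.  With
   FIRST_ZR == V0_REGNUM and FIRST_PR == P0_REGNUM the PARALLEL built
   above therefore pairs (reg:VNx4SF v0) with byte offset 0 and
   (reg:VNx16BI p0) with the predicate field's byte offset.  */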
2356 
2357 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
2358    in the AAPCS64.  */
2359 
2360 pure_scalable_type_info::analysis_result
2361 pure_scalable_type_info::analyze (const_tree type)
2362 {
2363   /* Prevent accidental reuse.  */
2364   gcc_assert (pieces.is_empty ());
2365 
2366   /* No code will be generated for erroneous types, so we won't establish
2367      an ABI mapping.  */
2368   if (type == error_mark_node)
2369     return NO_ABI_IDENTITY;
2370 
2371   /* Zero-sized types disappear in the language->ABI mapping.  */
2372   if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2373     return NO_ABI_IDENTITY;
2374 
2375   /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
2376   piece p = {};
2377   if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
2378     {
2379       machine_mode mode = TYPE_MODE_RAW (type);
2380       gcc_assert (VECTOR_MODE_P (mode)
2381 		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
2382 
2383       p.mode = p.orig_mode = mode;
2384       add_piece (p);
2385       return IS_PST;
2386     }
2387 
2388   /* Check for user-defined PSTs.  */
2389   if (TREE_CODE (type) == ARRAY_TYPE)
2390     return analyze_array (type);
2391   if (TREE_CODE (type) == RECORD_TYPE)
2392     return analyze_record (type);
2393 
2394   return ISNT_PST;
2395 }
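
/* For example (assuming SVE is available): svint32_t analyzes as IS_PST
   with a single VNx4SI piece, a plain double analyzes as ISNT_PST, a
   zero-sized type has NO_ABI_IDENTITY, and an over-long array of SVE
   vectors is DOESNT_MATTER because it is passed by reference anyway.  */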
2396 
2397 /* Analyze a type that is known not to be passed or returned in memory.
2398    Return true if it has an ABI identity and is a Pure Scalable Type.  */
2399 
2400 bool
2401 pure_scalable_type_info::analyze_registers (const_tree type)
2402 {
2403   analysis_result result = analyze (type);
2404   gcc_assert (result != DOESNT_MATTER);
2405   return result == IS_PST;
2406 }
2407 
2408 /* Subroutine of analyze for handling ARRAY_TYPEs.  */
2409 
2410 pure_scalable_type_info::analysis_result
2411 pure_scalable_type_info::analyze_array (const_tree type)
2412 {
2413   /* Analyze the element type.  */
2414   pure_scalable_type_info element_info;
2415   analysis_result result = element_info.analyze (TREE_TYPE (type));
2416   if (result != IS_PST)
2417     return result;
2418 
2419   /* An array of unknown, flexible or variable length will be passed and
2420      returned by reference whatever we do.  */
2421   tree nelts_minus_one = array_type_nelts (type);
2422   if (!tree_fits_uhwi_p (nelts_minus_one))
2423     return DOESNT_MATTER;
2424 
2425   /* Likewise if the array is constant-sized but too big to be interesting.
2426      The double checks against MAX_PIECES are to protect against overflow.  */
2427   unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
2428   if (count > MAX_PIECES)
2429     return DOESNT_MATTER;
2430   count += 1;
2431   if (count * element_info.pieces.length () > MAX_PIECES)
2432     return DOESNT_MATTER;
2433 
2434   /* The above checks should have weeded out elements of unknown size.  */
2435   poly_uint64 element_bytes;
2436   if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
2437     gcc_unreachable ();
2438 
2439   /* Build up the list of individual vectors and predicates.  */
2440   gcc_assert (!element_info.pieces.is_empty ());
2441   for (unsigned int i = 0; i < count; ++i)
2442     for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
2443       {
2444 	piece p = element_info.pieces[j];
2445 	p.offset += i * element_bytes;
2446 	add_piece (p);
2447       }
2448   return IS_PST;
2449 }
2450 
2451 /* Subroutine of analyze for handling RECORD_TYPEs.  */
2452 
2453 pure_scalable_type_info::analysis_result
2454 pure_scalable_type_info::analyze_record (const_tree type)
2455 {
2456   for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2457     {
2458       if (TREE_CODE (field) != FIELD_DECL)
2459 	continue;
2460 
2461       /* Zero-sized fields disappear in the language->ABI mapping.  */
2462       if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
2463 	continue;
2464 
2465       /* All fields with an ABI identity must be PSTs for the record as
2466 	 a whole to be a PST.  If any individual field is too big to be
2467 	 interesting then the record is too.  */
2468       pure_scalable_type_info field_info;
2469       analysis_result subresult = field_info.analyze (TREE_TYPE (field));
2470       if (subresult == NO_ABI_IDENTITY)
2471 	continue;
2472       if (subresult != IS_PST)
2473 	return subresult;
2474 
2475       /* Since all previous fields are PSTs, we ought to be able to track
2476 	 the field offset using poly_ints.  */
2477       tree bitpos = bit_position (field);
2478       gcc_assert (poly_int_tree_p (bitpos));
2479 
2480       /* For the same reason, it shouldn't be possible to create a PST field
2481 	 whose offset isn't byte-aligned.  */
2482       poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
2483 						BITS_PER_UNIT);
2484 
2485       /* Punt if the record is too big to be interesting.  */
2486       poly_uint64 bytepos;
2487       if (!wide_bytepos.to_uhwi (&bytepos)
2488 	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
2489 	return DOESNT_MATTER;
2490 
2491       /* Add the individual vectors and predicates in the field to the
2492 	 record's list.  */
2493       gcc_assert (!field_info.pieces.is_empty ());
2494       for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
2495 	{
2496 	  piece p = field_info.pieces[i];
2497 	  p.offset += bytepos;
2498 	  add_piece (p);
2499 	}
2500     }
2501   /* Empty structures disappear in the language->ABI mapping.  */
2502   return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
2503 }
2504 
2505 /* Add P to the list of pieces in the type.  */
2506 
2507 void
2508 pure_scalable_type_info::add_piece (const piece &p)
2509 {
2510   /* Try to fold the new piece into the previous one to form a
2511      single-mode PST.  For example, if we see three consecutive vectors
2512      of the same mode, we can represent them using the corresponding
2513      3-tuple mode.
2514 
2515      This is purely an optimization.  */
2516   if (!pieces.is_empty ())
2517     {
2518       piece &prev = pieces.last ();
2519       gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
2520       unsigned int nelems1, nelems2;
2521       if (prev.orig_mode == p.orig_mode
2522 	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
2523 	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
2524 				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
2525 	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
2526 				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
2527 	  && targetm.array_mode (p.orig_mode,
2528 				 nelems1 + nelems2).exists (&prev.mode))
2529 	{
2530 	  prev.num_zr += p.num_zr;
2531 	  prev.num_pr += p.num_pr;
2532 	  return;
2533 	}
2534     }
2535   pieces.quick_push (p);
2536 }
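
/* Illustrative example: when analyze_array processes an ARRAY_TYPE of
   three svfloat32_t elements, it feeds three consecutive VNx4SF pieces
   into this function.  The first two fold into a VNx8SF piece and the
   third then folds again, so the array ends up as a single piece whose
   mode is VNx12SFmode (the x3 tuple mode returned by
   aarch64_array_mode).  */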
2537 
2538 /* Return true if at least one possible value of type TYPE includes at
2539    least one object of Pure Scalable Type, in the sense of the AAPCS64.
2540 
2541    This is a relatively expensive test for some types, so it should
2542    generally be made as late as possible.  */
2543 
2544 static bool
2545 aarch64_some_values_include_pst_objects_p (const_tree type)
2546 {
2547   if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2548     return false;
2549 
2550   if (aarch64_sve::builtin_type_p (type))
2551     return true;
2552 
2553   if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
2554     return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
2555 
2556   if (RECORD_OR_UNION_TYPE_P (type))
2557     for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2558       if (TREE_CODE (field) == FIELD_DECL
2559 	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
2560 	return true;
2561 
2562   return false;
2563 }
2564 
2565 /* Return the descriptor of the SIMD ABI.  */
2566 
2567 static const predefined_function_abi &
2568 aarch64_simd_abi (void)
2569 {
2570   predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
2571   if (!simd_abi.initialized_p ())
2572     {
2573       HARD_REG_SET full_reg_clobbers
2574 	= default_function_abi.full_reg_clobbers ();
2575       for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2576 	if (FP_SIMD_SAVED_REGNUM_P (regno))
2577 	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2578       simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
2579     }
2580   return simd_abi;
2581 }
2582 
2583 /* Return the descriptor of the SVE PCS.  */
2584 
2585 static const predefined_function_abi &
2586 aarch64_sve_abi (void)
2587 {
2588   predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
2589   if (!sve_abi.initialized_p ())
2590     {
2591       HARD_REG_SET full_reg_clobbers
2592 	= default_function_abi.full_reg_clobbers ();
2593       for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
2594 	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2595       for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
2596 	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2597       sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
2598     }
2599   return sve_abi;
2600 }
2601 
2602 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
2603    wraps, otherwise return X itself.  */
2604 
2605 static rtx
2606 strip_salt (rtx x)
2607 {
2608   rtx search = x;
2609   if (GET_CODE (search) == CONST)
2610     search = XEXP (search, 0);
2611   if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
2612     x = XVECEXP (search, 0, 0);
2613   return x;
2614 }
2615 
2616 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
2617    expression.  */
2618 
2619 static rtx
2620 strip_offset_and_salt (rtx addr, poly_int64 *offset)
2621 {
2622   return strip_salt (strip_offset (addr, offset));
2623 }
2624 
2625 /* Generate code to enable conditional branches in functions over 1 MiB.  */
2626 const char *
2627 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
2628 			const char * branch_format)
2629 {
2630     rtx_code_label * tmp_label = gen_label_rtx ();
2631     char label_buf[256];
2632     char buffer[128];
2633     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
2634 				 CODE_LABEL_NUMBER (tmp_label));
2635     const char *label_ptr = targetm.strip_name_encoding (label_buf);
2636     rtx dest_label = operands[pos_label];
2637     operands[pos_label] = tmp_label;
2638 
2639     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
2640     output_asm_insn (buffer, operands);
2641 
2642     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
2643     operands[pos_label] = dest_label;
2644     output_asm_insn (buffer, operands);
2645     return "";
2646 }
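
/* Illustrative sketch (the assembly below is only an example): callers
   pass BRANCH_FORMAT with the condition already inverted, so for a
   compare-and-branch whose target lies outside the +/-1 MiB short range
   the emitted sequence looks roughly like

	cbnz	x0, .Lcb4	// inverted short-range branch
	b	.Lfar_target	// unconditional, +/-128 MiB range
     .Lcb4:

   where .Lcb4 stands for the internal label generated above and
   .Lfar_target for the original destination operand.  */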
2647 
2648 void
2649 aarch64_err_no_fpadvsimd (machine_mode mode)
2650 {
2651   if (TARGET_GENERAL_REGS_ONLY)
2652     if (FLOAT_MODE_P (mode))
2653       error ("%qs is incompatible with the use of floating-point types",
2654 	     "-mgeneral-regs-only");
2655     else
2656       error ("%qs is incompatible with the use of vector types",
2657 	     "-mgeneral-regs-only");
2658   else
2659     if (FLOAT_MODE_P (mode))
2660       error ("%qs feature modifier is incompatible with the use of"
2661 	     " floating-point types", "+nofp");
2662     else
2663       error ("%qs feature modifier is incompatible with the use of"
2664 	     " vector types", "+nosimd");
2665 }
2666 
2667 /* Report when we try to do something that requires SVE when SVE is disabled.
2668    This is an error of last resort and isn't very high-quality.  It usually
2669    involves attempts to measure the vector length in some way.  */
2670 static void
2671 aarch64_report_sve_required (void)
2672 {
2673   static bool reported_p = false;
2674 
2675   /* Avoid reporting a slew of messages for a single oversight.  */
2676   if (reported_p)
2677     return;
2678 
2679   error ("this operation requires the SVE ISA extension");
2680   inform (input_location, "you can enable SVE using the command-line"
2681 	  " option %<-march%>, or by using the %<target%>"
2682 	  " attribute or pragma");
2683   reported_p = true;
2684 }
2685 
2686 /* Return true if REGNO is P0-P15 or one of the special FFR-related
2687    registers.  */
2688 inline bool
2689 pr_or_ffr_regnum_p (unsigned int regno)
2690 {
2691   return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
2692 }
2693 
2694 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2695    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
2696    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
2697    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
2698    and GENERAL_REGS is lower than the memory cost (in this case the best class
2699    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
2700    cost results in bad allocations with many redundant int<->FP moves which
2701    are expensive on various cores.
2702    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
2703    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
2704    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
2705    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
2706    The result of this is that it is no longer inefficient to have a higher
2707    memory move cost than the register move cost.
2708 */
2709 
2710 static reg_class_t
2711 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
2712 					 reg_class_t best_class)
2713 {
2714   machine_mode mode;
2715 
2716   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
2717       || !reg_class_subset_p (FP_REGS, allocno_class))
2718     return allocno_class;
2719 
2720   if (!reg_class_subset_p (GENERAL_REGS, best_class)
2721       || !reg_class_subset_p (FP_REGS, best_class))
2722     return best_class;
2723 
2724   mode = PSEUDO_REGNO_MODE (regno);
2725   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2726 }
2727 
2728 static unsigned int
2729 aarch64_min_divisions_for_recip_mul (machine_mode mode)
2730 {
2731   if (GET_MODE_UNIT_SIZE (mode) == 4)
2732     return aarch64_tune_params.min_div_recip_mul_sf;
2733   return aarch64_tune_params.min_div_recip_mul_df;
2734 }
2735 
2736 /* Return the reassociation width of treeop OPC with mode MODE.  */
2737 static int
2738 aarch64_reassociation_width (unsigned opc, machine_mode mode)
2739 {
2740   if (VECTOR_MODE_P (mode))
2741     return aarch64_tune_params.vec_reassoc_width;
2742   if (INTEGRAL_MODE_P (mode))
2743     return aarch64_tune_params.int_reassoc_width;
2744   /* Avoid reassociating floating point addition so we emit more FMAs.  */
2745   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
2746     return aarch64_tune_params.fp_reassoc_width;
2747   return 1;
2748 }
2749 
2750 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
2751 unsigned
2752 aarch64_dbx_register_number (unsigned regno)
2753 {
2754    if (GP_REGNUM_P (regno))
2755      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2756    else if (regno == SP_REGNUM)
2757      return AARCH64_DWARF_SP;
2758    else if (FP_REGNUM_P (regno))
2759      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
2760    else if (PR_REGNUM_P (regno))
2761      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2762    else if (regno == VG_REGNUM)
2763      return AARCH64_DWARF_VG;
2764 
2765    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2766       equivalent DWARF register.  */
2767    return DWARF_FRAME_REGISTERS;
2768 }
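
/* For example: x3 maps to DWARF register 3, sp to 31, the SVE vector
   granule register VG to 46, p2 to 50 and v5 to 69, following the
   AArch64 DWARF register numbering; cc and the other unnumbered
   registers fall through to the "no equivalent" return value.  */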
2769 
2770 /* If X is a CONST_DOUBLE, return its bit representation as a constant
2771    integer, otherwise return X unmodified.  */
2772 static rtx
2773 aarch64_bit_representation (rtx x)
2774 {
2775   if (CONST_DOUBLE_P (x))
2776     x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2777   return x;
2778 }
2779 
2780 /* Return an estimate for the number of quadwords in an SVE vector.  This is
2781    equivalent to the number of Advanced SIMD vectors in an SVE vector.  */
2782 static unsigned int
2783 aarch64_estimated_sve_vq ()
2784 {
2785   return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
2786 }
2787 
2788 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
2789 static bool
2790 aarch64_advsimd_struct_mode_p (machine_mode mode)
2791 {
2792   return (TARGET_SIMD
2793 	  && (mode == OImode || mode == CImode || mode == XImode));
2794 }
2795 
2796 /* Return true if MODE is an SVE predicate mode.  */
2797 static bool
2798 aarch64_sve_pred_mode_p (machine_mode mode)
2799 {
2800   return (TARGET_SVE
2801 	  && (mode == VNx16BImode
2802 	      || mode == VNx8BImode
2803 	      || mode == VNx4BImode
2804 	      || mode == VNx2BImode));
2805 }
2806 
2807 /* Three mutually-exclusive flags describing a vector or predicate type.  */
2808 const unsigned int VEC_ADVSIMD  = 1;
2809 const unsigned int VEC_SVE_DATA = 2;
2810 const unsigned int VEC_SVE_PRED = 4;
2811 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2812    a structure of 2, 3 or 4 vectors.  */
2813 const unsigned int VEC_STRUCT   = 8;
2814 /* Can be used in combination with VEC_SVE_DATA to indicate that the
2815    vector has fewer significant bytes than a full SVE vector.  */
2816 const unsigned int VEC_PARTIAL  = 16;
2817 /* Useful combinations of the above.  */
2818 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
2819 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2820 
2821 /* Return a set of flags describing the vector properties of mode MODE.
2822    Ignore modes that are not supported by the current target.  */
2823 static unsigned int
2824 aarch64_classify_vector_mode (machine_mode mode)
2825 {
2826   if (aarch64_advsimd_struct_mode_p (mode))
2827     return VEC_ADVSIMD | VEC_STRUCT;
2828 
2829   if (aarch64_sve_pred_mode_p (mode))
2830     return VEC_SVE_PRED;
2831 
2832   /* Make the decision based on the mode's enum value rather than its
2833      properties, so that we keep the correct classification regardless
2834      of -msve-vector-bits.  */
2835   switch (mode)
2836     {
2837     /* Partial SVE QI vectors.  */
2838     case E_VNx2QImode:
2839     case E_VNx4QImode:
2840     case E_VNx8QImode:
2841     /* Partial SVE HI vectors.  */
2842     case E_VNx2HImode:
2843     case E_VNx4HImode:
2844     /* Partial SVE SI vector.  */
2845     case E_VNx2SImode:
2846     /* Partial SVE HF vectors.  */
2847     case E_VNx2HFmode:
2848     case E_VNx4HFmode:
2849     /* Partial SVE BF vectors.  */
2850     case E_VNx2BFmode:
2851     case E_VNx4BFmode:
2852     /* Partial SVE SF vector.  */
2853     case E_VNx2SFmode:
2854       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2855 
2856     case E_VNx16QImode:
2857     case E_VNx8HImode:
2858     case E_VNx4SImode:
2859     case E_VNx2DImode:
2860     case E_VNx8BFmode:
2861     case E_VNx8HFmode:
2862     case E_VNx4SFmode:
2863     case E_VNx2DFmode:
2864       return TARGET_SVE ? VEC_SVE_DATA : 0;
2865 
2866     /* x2 SVE vectors.  */
2867     case E_VNx32QImode:
2868     case E_VNx16HImode:
2869     case E_VNx8SImode:
2870     case E_VNx4DImode:
2871     case E_VNx16BFmode:
2872     case E_VNx16HFmode:
2873     case E_VNx8SFmode:
2874     case E_VNx4DFmode:
2875     /* x3 SVE vectors.  */
2876     case E_VNx48QImode:
2877     case E_VNx24HImode:
2878     case E_VNx12SImode:
2879     case E_VNx6DImode:
2880     case E_VNx24BFmode:
2881     case E_VNx24HFmode:
2882     case E_VNx12SFmode:
2883     case E_VNx6DFmode:
2884     /* x4 SVE vectors.  */
2885     case E_VNx64QImode:
2886     case E_VNx32HImode:
2887     case E_VNx16SImode:
2888     case E_VNx8DImode:
2889     case E_VNx32BFmode:
2890     case E_VNx32HFmode:
2891     case E_VNx16SFmode:
2892     case E_VNx8DFmode:
2893       return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2894 
2895     /* 64-bit Advanced SIMD vectors.  */
2896     case E_V8QImode:
2897     case E_V4HImode:
2898     case E_V2SImode:
2899     /* ...E_V1DImode doesn't exist.  */
2900     case E_V4HFmode:
2901     case E_V4BFmode:
2902     case E_V2SFmode:
2903     case E_V1DFmode:
2904     /* 128-bit Advanced SIMD vectors.  */
2905     case E_V16QImode:
2906     case E_V8HImode:
2907     case E_V4SImode:
2908     case E_V2DImode:
2909     case E_V8HFmode:
2910     case E_V8BFmode:
2911     case E_V4SFmode:
2912     case E_V2DFmode:
2913       return TARGET_SIMD ? VEC_ADVSIMD : 0;
2914 
2915     default:
2916       return 0;
2917     }
2918 }
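
/* For example, assuming the relevant target features are enabled:

     V4SImode    -> VEC_ADVSIMD
     OImode      -> VEC_ADVSIMD | VEC_STRUCT   (a pair of 128-bit vectors)
     VNx4SImode  -> VEC_SVE_DATA
     VNx2SImode  -> VEC_SVE_DATA | VEC_PARTIAL
     VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT  (an x2 SVE tuple)
     VNx4BImode  -> VEC_SVE_PRED
     DImode      -> 0                          (not a vector mode)  */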
2919 
2920 /* Return true if MODE is any of the data vector modes, including
2921    structure modes.  */
2922 static bool
2923 aarch64_vector_data_mode_p (machine_mode mode)
2924 {
2925   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2926 }
2927 
2928 /* Return true if MODE is any form of SVE mode, including predicates,
2929    vectors and structures.  */
2930 bool
2931 aarch64_sve_mode_p (machine_mode mode)
2932 {
2933   return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2934 }
2935 
2936 /* Return true if MODE is an SVE data vector mode; either a single vector
2937    or a structure of vectors.  */
2938 static bool
2939 aarch64_sve_data_mode_p (machine_mode mode)
2940 {
2941   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2942 }
2943 
2944 /* Return the number of defined bytes in one constituent vector of
2945    SVE mode MODE, which has vector flags VEC_FLAGS.  */
2946 static poly_int64
2947 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2948 {
2949   if (vec_flags & VEC_PARTIAL)
2950     /* A single partial vector.  */
2951     return GET_MODE_SIZE (mode);
2952 
2953   if (vec_flags & VEC_SVE_DATA)
2954     /* A single vector or a tuple.  */
2955     return BYTES_PER_SVE_VECTOR;
2956 
2957   /* A single predicate.  */
2958   gcc_assert (vec_flags & VEC_SVE_PRED);
2959   return BYTES_PER_SVE_PRED;
2960 }
2961 
2962 /* Implement target hook TARGET_ARRAY_MODE.  */
2963 static opt_machine_mode
2964 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2965 {
2966   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2967       && IN_RANGE (nelems, 2, 4))
2968     return mode_for_vector (GET_MODE_INNER (mode),
2969 			    GET_MODE_NUNITS (mode) * nelems);
2970 
2971   return opt_machine_mode ();
2972 }
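
/* For example, aarch64_array_mode (VNx4SImode, 3) yields VNx12SImode,
   the mode used above for x3 SVE tuples; non-SVE modes and element
   counts outside 2-4 fall back to the generic array handling.  */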
2973 
2974 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
2975 static bool
2976 aarch64_array_mode_supported_p (machine_mode mode,
2977 				unsigned HOST_WIDE_INT nelems)
2978 {
2979   if (TARGET_SIMD
2980       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2981 	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
2982       && (nelems >= 2 && nelems <= 4))
2983     return true;
2984 
2985   return false;
2986 }
2987 
2988 /* MODE is some form of SVE vector mode.  For data modes, return the number
2989    of vector register bits that each element of MODE occupies, such as 64
2990    for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2991    in a 64-bit container).  For predicate modes, return the number of
2992    data bits controlled by each significant predicate bit.  */
2993 
2994 static unsigned int
2995 aarch64_sve_container_bits (machine_mode mode)
2996 {
2997   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2998   poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2999 			     ? BITS_PER_SVE_VECTOR
3000 			     : GET_MODE_BITSIZE (mode));
3001   return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3002 }
3003 
3004 /* Return the SVE predicate mode to use for elements that have
3005    ELEM_NBYTES bytes, if such a mode exists.  */
3006 
3007 opt_machine_mode
3008 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3009 {
3010   if (TARGET_SVE)
3011     {
3012       if (elem_nbytes == 1)
3013 	return VNx16BImode;
3014       if (elem_nbytes == 2)
3015 	return VNx8BImode;
3016       if (elem_nbytes == 4)
3017 	return VNx4BImode;
3018       if (elem_nbytes == 8)
3019 	return VNx2BImode;
3020     }
3021   return opt_machine_mode ();
3022 }
3023 
3024 /* Return the SVE predicate mode that should be used to control
3025    SVE mode MODE.  */
3026 
3027 machine_mode
3028 aarch64_sve_pred_mode (machine_mode mode)
3029 {
3030   unsigned int bits = aarch64_sve_container_bits (mode);
3031   return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3032 }
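
/* For example, aarch64_sve_pred_mode (VNx4SImode) is VNx4BImode, while
   aarch64_sve_pred_mode (VNx2SImode) is VNx2BImode: each 32-bit element
   of the partial vector lives in a 64-bit container, so one significant
   predicate bit controls 64 data bits.  */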
3033 
3034 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
3035 
3036 static opt_machine_mode
3037 aarch64_get_mask_mode (machine_mode mode)
3038 {
3039   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3040   if (vec_flags & VEC_SVE_DATA)
3041     return aarch64_sve_pred_mode (mode);
3042 
3043   return default_get_mask_mode (mode);
3044 }
3045 
3046 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
3047 
3048 opt_machine_mode
3049 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3050 {
3051   enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3052 			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3053   machine_mode mode;
3054   FOR_EACH_MODE_IN_CLASS (mode, mclass)
3055     if (inner_mode == GET_MODE_INNER (mode)
3056 	&& known_eq (nunits, GET_MODE_NUNITS (mode))
3057 	&& aarch64_sve_data_mode_p (mode))
3058       return mode;
3059   return opt_machine_mode ();
3060 }
3061 
3062 /* Return the integer element mode associated with SVE mode MODE.  */
3063 
3064 static scalar_int_mode
3065 aarch64_sve_element_int_mode (machine_mode mode)
3066 {
3067   poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3068 			     ? BITS_PER_SVE_VECTOR
3069 			     : GET_MODE_BITSIZE (mode));
3070   unsigned int elt_bits = vector_element_size (vector_bits,
3071 					       GET_MODE_NUNITS (mode));
3072   return int_mode_for_size (elt_bits, 0).require ();
3073 }
3074 
3075 /* Return an integer element mode that contains exactly
3076    aarch64_sve_container_bits (MODE) bits.  This is wider than
3077    aarch64_sve_element_int_mode if MODE is a partial vector,
3078    otherwise it's the same.  */
3079 
3080 static scalar_int_mode
3081 aarch64_sve_container_int_mode (machine_mode mode)
3082 {
3083   return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3084 }
3085 
3086 /* Return the integer vector mode associated with SVE mode MODE.
3087    Unlike related_int_vector_mode, this can handle the case in which
3088    MODE is a predicate (and thus has a different total size).  */
3089 
3090 machine_mode
3091 aarch64_sve_int_mode (machine_mode mode)
3092 {
3093   scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3094   return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3095 }
3096 
3097 /* Implement TARGET_VECTORIZE_RELATED_MODE.  */
3098 
3099 static opt_machine_mode
3100 aarch64_vectorize_related_mode (machine_mode vector_mode,
3101 				scalar_mode element_mode,
3102 				poly_uint64 nunits)
3103 {
3104   unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3105 
3106   /* If we're operating on SVE vectors, try to return an SVE mode.  */
3107   poly_uint64 sve_nunits;
3108   if ((vec_flags & VEC_SVE_DATA)
3109       && multiple_p (BYTES_PER_SVE_VECTOR,
3110 		     GET_MODE_SIZE (element_mode), &sve_nunits))
3111     {
3112       machine_mode sve_mode;
3113       if (maybe_ne (nunits, 0U))
3114 	{
3115 	  /* Try to find a full or partial SVE mode with exactly
3116 	     NUNITS units.  */
3117 	  if (multiple_p (sve_nunits, nunits)
3118 	      && aarch64_sve_data_mode (element_mode,
3119 					nunits).exists (&sve_mode))
3120 	    return sve_mode;
3121 	}
3122       else
3123 	{
3124 	  /* Take the preferred number of units from the number of bytes
3125 	     that fit in VECTOR_MODE.  We always start by "autodetecting"
3126 	     a full vector mode with preferred_simd_mode, so vectors
3127 	     chosen here will also be full vector modes.  Then
3128 	     autovectorize_vector_modes tries smaller starting modes
3129 	     and thus smaller preferred numbers of units.  */
3130 	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3131 	  if (aarch64_sve_data_mode (element_mode,
3132 				     sve_nunits).exists (&sve_mode))
3133 	    return sve_mode;
3134 	}
3135     }
3136 
3137   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
3138   if ((vec_flags & VEC_ADVSIMD)
3139       && known_eq (nunits, 0U)
3140       && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3141       && maybe_ge (GET_MODE_BITSIZE (element_mode)
3142 		   * GET_MODE_NUNITS (vector_mode), 128U))
3143     {
3144       machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3145       if (VECTOR_MODE_P (res))
3146 	return res;
3147     }
3148 
3149   return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3150 }
3151 
3152 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
3153    prefer to use the first arithmetic operand as the else value if
3154    the else value doesn't matter, since that exactly matches the SVE
3155    destructive merging form.  For ternary operations we could either
3156    pick the first operand and use FMAD-like instructions or the last
3157    operand and use FMLA-like instructions; the latter seems more
3158    natural.  */
3159 
3160 static tree
3161 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3162 {
3163   return nops == 3 ? ops[2] : ops[0];
3164 }
3165 
3166 /* Implement TARGET_HARD_REGNO_NREGS.  */
3167 
3168 static unsigned int
3169 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3170 {
3171   /* ??? Logically we should only need to provide a value when
3172      HARD_REGNO_MODE_OK says that the combination is valid,
3173      but at the moment we need to handle all modes.  Just ignore
3174      any runtime parts for registers that can't store them.  */
3175   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3176   switch (aarch64_regno_regclass (regno))
3177     {
3178     case FP_REGS:
3179     case FP_LO_REGS:
3180     case FP_LO8_REGS:
3181       {
3182 	unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3183 	if (vec_flags & VEC_SVE_DATA)
3184 	  return exact_div (GET_MODE_SIZE (mode),
3185 			    aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3186 	return CEIL (lowest_size, UNITS_PER_VREG);
3187       }
3188     case PR_REGS:
3189     case PR_LO_REGS:
3190     case PR_HI_REGS:
3191     case FFR_REGS:
3192     case PR_AND_FFR_REGS:
3193       return 1;
3194     default:
3195       return CEIL (lowest_size, UNITS_PER_WORD);
3196     }
3197   gcc_unreachable ();
3198 }
3199 
3200 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
3201 
3202 static bool
3203 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3204 {
3205   if (GET_MODE_CLASS (mode) == MODE_CC)
3206     return regno == CC_REGNUM;
3207 
3208   if (regno == VG_REGNUM)
3209     /* This must have the same size as _Unwind_Word.  */
3210     return mode == DImode;
3211 
3212   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3213   if (vec_flags & VEC_SVE_PRED)
3214     return pr_or_ffr_regnum_p (regno);
3215 
3216   if (pr_or_ffr_regnum_p (regno))
3217     return false;
3218 
3219   if (regno == SP_REGNUM)
3220     /* The purpose of comparing with ptr_mode is to support the
3221        global register variable associated with the stack pointer
3222        register via the syntax of asm ("wsp") in ILP32.  */
3223     return mode == Pmode || mode == ptr_mode;
3224 
3225   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
3226     return mode == Pmode;
3227 
3228   if (GP_REGNUM_P (regno))
3229     {
3230       if (vec_flags & VEC_ANY_SVE)
3231 	return false;
3232       if (known_le (GET_MODE_SIZE (mode), 8))
3233 	return true;
3234       if (known_le (GET_MODE_SIZE (mode), 16))
3235 	return (regno & 1) == 0;
3236     }
3237   else if (FP_REGNUM_P (regno))
3238     {
3239       if (vec_flags & VEC_STRUCT)
3240 	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
3241       else
3242 	return !VECTOR_MODE_P (mode) || vec_flags != 0;
3243     }
3244 
3245   return false;
3246 }
3247 
3248 /* Return true if a function with type FNTYPE returns its value in
3249    SVE vector or predicate registers.  */
3250 
3251 static bool
3252 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
3253 {
3254   tree return_type = TREE_TYPE (fntype);
3255 
3256   pure_scalable_type_info pst_info;
3257   switch (pst_info.analyze (return_type))
3258     {
3259     case pure_scalable_type_info::IS_PST:
3260       return (pst_info.num_zr () <= NUM_FP_ARG_REGS
3261 	      && pst_info.num_pr () <= NUM_PR_ARG_REGS);
3262 
3263     case pure_scalable_type_info::DOESNT_MATTER:
3264       gcc_assert (aarch64_return_in_memory_1 (return_type));
3265       return false;
3266 
3267     case pure_scalable_type_info::NO_ABI_IDENTITY:
3268     case pure_scalable_type_info::ISNT_PST:
3269       return false;
3270     }
3271   gcc_unreachable ();
3272 }
3273 
3274 /* Return true if a function with type FNTYPE takes arguments in
3275    SVE vector or predicate registers.  */
3276 
3277 static bool
3278 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
3279 {
3280   CUMULATIVE_ARGS args_so_far_v;
3281   aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
3282 				NULL_TREE, 0, true);
3283   cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
3284 
3285   for (tree chain = TYPE_ARG_TYPES (fntype);
3286        chain && chain != void_list_node;
3287        chain = TREE_CHAIN (chain))
3288     {
3289       tree arg_type = TREE_VALUE (chain);
3290       if (arg_type == error_mark_node)
3291 	return false;
3292 
3293       function_arg_info arg (arg_type, /*named=*/true);
3294       apply_pass_by_reference_rules (&args_so_far_v, arg);
3295       pure_scalable_type_info pst_info;
3296       if (pst_info.analyze_registers (arg.type))
3297 	{
3298 	  unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
3299 	  unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
3300 	  gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
3301 	  return true;
3302 	}
3303 
3304       targetm.calls.function_arg_advance (args_so_far, arg);
3305     }
3306   return false;
3307 }
3308 
3309 /* Implement TARGET_FNTYPE_ABI.  */
3310 
3311 static const predefined_function_abi &
3312 aarch64_fntype_abi (const_tree fntype)
3313 {
3314   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
3315     return aarch64_simd_abi ();
3316 
3317   if (aarch64_returns_value_in_sve_regs_p (fntype)
3318       || aarch64_takes_arguments_in_sve_regs_p (fntype))
3319     return aarch64_sve_abi ();
3320 
3321   return default_function_abi;
3322 }
3323 
3324 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */
3325 
3326 static bool
3327 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
3328 {
3329   return (aarch64_sve::builtin_type_p (type1)
3330 	  == aarch64_sve::builtin_type_p (type2));
3331 }
3332 
3333 /* Return true if we should emit CFI for register REGNO.  */
3334 
3335 static bool
3336 aarch64_emit_cfi_for_reg_p (unsigned int regno)
3337 {
3338   return (GP_REGNUM_P (regno)
3339 	  || !default_function_abi.clobbers_full_reg_p (regno));
3340 }
3341 
3342 /* Return the mode we should use to save and restore register REGNO.  */
3343 
3344 static machine_mode
3345 aarch64_reg_save_mode (unsigned int regno)
3346 {
3347   if (GP_REGNUM_P (regno))
3348     return DImode;
3349 
3350   if (FP_REGNUM_P (regno))
3351     switch (crtl->abi->id ())
3352       {
3353       case ARM_PCS_AAPCS64:
3354 	/* Only the low 64 bits are saved by the base PCS.  */
3355 	return DFmode;
3356 
3357       case ARM_PCS_SIMD:
3358 	/* The vector PCS saves the low 128 bits (which is the full
3359 	   register on non-SVE targets).  */
3360 	return TFmode;
3361 
3362       case ARM_PCS_SVE:
3363 	/* Use vectors of DImode for registers that need frame
3364 	   information, so that the first 64 bits of the save slot
3365 	   are always the equivalent of what storing D<n> would give.  */
3366 	if (aarch64_emit_cfi_for_reg_p (regno))
3367 	  return VNx2DImode;
3368 
3369 	/* Use vectors of bytes otherwise, so that the layout is
3370 	   endian-agnostic, and so that we can use LDR and STR for
3371 	   big-endian targets.  */
3372 	return VNx16QImode;
3373 
3374       case ARM_PCS_TLSDESC:
3375       case ARM_PCS_UNKNOWN:
3376 	break;
3377       }
3378 
3379   if (PR_REGNUM_P (regno))
3380     /* Save the full predicate register.  */
3381     return VNx16BImode;
3382 
3383   gcc_unreachable ();
3384 }
3385 
3386 /* Implement TARGET_INSN_CALLEE_ABI.  */
3387 
3388 const predefined_function_abi &
3389 aarch64_insn_callee_abi (const rtx_insn *insn)
3390 {
3391   rtx pat = PATTERN (insn);
3392   gcc_assert (GET_CODE (pat) == PARALLEL);
3393   rtx unspec = XVECEXP (pat, 0, 1);
3394   gcc_assert (GET_CODE (unspec) == UNSPEC
3395 	      && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
3396   return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
3397 }
3398 
3399 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
3400    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
3401    clobbers the top 64 bits when restoring the bottom 64 bits.  */
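/* For example (illustrative): under the base PCS, a 16-byte vector held in
   V8 is part-clobbered by a call because only the low 8 bytes of the
   register are preserved, whereas under the vector PCS all 16 bytes
   survive.  */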
3402 
3403 static bool
3404 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
3405 					unsigned int regno,
3406 					machine_mode mode)
3407 {
3408   if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
3409     {
3410       poly_int64 per_register_size = GET_MODE_SIZE (mode);
3411       unsigned int nregs = hard_regno_nregs (regno, mode);
3412       if (nregs > 1)
3413 	per_register_size = exact_div (per_register_size, nregs);
3414       if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
3415 	return maybe_gt (per_register_size, 16);
3416       return maybe_gt (per_register_size, 8);
3417     }
3418   return false;
3419 }
3420 
3421 /* Implement REGMODE_NATURAL_SIZE.  */
3422 poly_uint64
3423 aarch64_regmode_natural_size (machine_mode mode)
3424 {
3425   /* The natural size for SVE data modes is one SVE data vector,
3426      and similarly for predicates.  We can't independently modify
3427      anything smaller than that.  */
3428   /* ??? For now, only do this for variable-width SVE registers.
3429      Doing it for constant-sized registers breaks lower-subreg.c.  */
3430   /* ??? And once that's fixed, we should probably have similar
3431      code for Advanced SIMD.  */
3432   if (!aarch64_sve_vg.is_constant ())
3433     {
3434       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3435       if (vec_flags & VEC_SVE_PRED)
3436 	return BYTES_PER_SVE_PRED;
3437       if (vec_flags & VEC_SVE_DATA)
3438 	return BYTES_PER_SVE_VECTOR;
3439     }
3440   return UNITS_PER_WORD;
3441 }
3442 
3443 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
3444 machine_mode
3445 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
3446 				     machine_mode mode)
3447 {
3448   /* The predicate mode determines which bits are significant and
3449      which are "don't care".  Decreasing the number of lanes would
3450      lose data while increasing the number of lanes would make bits
3451      unnecessarily significant.  */
3452   if (PR_REGNUM_P (regno))
3453     return mode;
3454   if (known_ge (GET_MODE_SIZE (mode), 4))
3455     return mode;
3456   else
3457     return SImode;
3458 }
3459 
3460 /* Return true if I's bits are consecutive ones from the MSB.  */
3461 bool
3462 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
3463 {
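  /* Illustrative note: such values are the two's complement of a power of
     two (e.g. 0xffffffffffff0000 == -0x10000), so -I is a power of two
     exactly when the set bits of I form a single run starting at the MSB;
     exact_log2 returns HOST_WIDE_INT_M1 for everything else, including 0.  */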
3464   return exact_log2 (-i) != HOST_WIDE_INT_M1;
3465 }
3466 
3467 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
3468    that strcpy from constants will be faster.  */
3469 
3470 static HOST_WIDE_INT
3471 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
3472 {
3473   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
3474     return MAX (align, BITS_PER_WORD);
3475   return align;
3476 }
3477 
3478 /* Return true if calls to DECL should be treated as
3479    long-calls (i.e. called via a register).  */
3480 static bool
3481 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
3482 {
3483   return false;
3484 }
3485 
3486 /* Return true if calls to symbol-ref SYM should be treated as
3487    long-calls (i.e. called via a register).  */
3488 bool
3489 aarch64_is_long_call_p (rtx sym)
3490 {
3491   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
3492 }
3493 
3494 /* Return true if calls to symbol-ref SYM should not go through
3495    plt stubs.  */
3496 
3497 bool
3498 aarch64_is_noplt_call_p (rtx sym)
3499 {
3500   const_tree decl = SYMBOL_REF_DECL (sym);
3501 
3502   if (flag_pic
3503       && decl
3504       && (!flag_plt
3505 	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
3506       && !targetm.binds_local_p (decl))
3507     return true;
3508 
3509   return false;
3510 }
3511 
3512 /* Emit an insn that's a simple single-set.  Both the operands must be
3513    known to be valid.  */
3514 inline static rtx_insn *
3515 emit_set_insn (rtx x, rtx y)
3516 {
3517   return emit_insn (gen_rtx_SET (x, y));
3518 }
3519 
3520 /* X and Y are two things to compare using CODE.  Emit the compare insn and
3521    return the rtx for register 0 in the proper mode.  */
3522 rtx
3523 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
3524 {
3525   machine_mode cmp_mode = GET_MODE (x);
3526   machine_mode cc_mode;
3527   rtx cc_reg;
3528 
3529   if (cmp_mode == TImode)
3530     {
3531       gcc_assert (code == NE);
3532 
3533       cc_mode = CCmode;
3534       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3535 
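      /* A sketch of the intent here: compare the low halves first, then
	 conditionally compare the high halves when the low halves were
	 equal, so that the final flags read as EQ only if both halves
	 match.  */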
3536       rtx x_lo = operand_subword (x, 0, 0, TImode);
3537       rtx y_lo = operand_subword (y, 0, 0, TImode);
3538       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
3539 
3540       rtx x_hi = operand_subword (x, 1, 0, TImode);
3541       rtx y_hi = operand_subword (y, 1, 0, TImode);
3542       emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
3543 			       gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
3544 			       GEN_INT (AARCH64_EQ)));
3545     }
3546   else
3547     {
3548       cc_mode = SELECT_CC_MODE (code, x, y);
3549       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3550       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
3551     }
3552   return cc_reg;
3553 }
3554 
3555 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
3556 
3557 static rtx
3558 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
3559                                   machine_mode y_mode)
3560 {
3561   if (y_mode == E_QImode || y_mode == E_HImode)
3562     {
3563       if (CONST_INT_P (y))
3564 	{
3565 	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
3566 	  y_mode = SImode;
3567 	}
3568       else
3569 	{
3570 	  rtx t, cc_reg;
3571 	  machine_mode cc_mode;
3572 
3573 	  t = gen_rtx_ZERO_EXTEND (SImode, y);
3574 	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
3575 	  cc_mode = CC_SWPmode;
3576 	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3577 	  emit_set_insn (cc_reg, t);
3578 	  return cc_reg;
3579 	}
3580     }
3581 
3582   if (!aarch64_plus_operand (y, y_mode))
3583     y = force_reg (y_mode, y);
3584 
3585   return aarch64_gen_compare_reg (code, x, y);
3586 }
3587 
3588 /* Build the SYMBOL_REF for __tls_get_addr.  */
3589 
3590 static GTY(()) rtx tls_get_addr_libfunc;
3591 
3592 rtx
3593 aarch64_tls_get_addr (void)
3594 {
3595   if (!tls_get_addr_libfunc)
3596     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3597   return tls_get_addr_libfunc;
3598 }
3599 
3600 /* Return the TLS model to use for ADDR.  */
3601 
3602 static enum tls_model
3603 tls_symbolic_operand_type (rtx addr)
3604 {
3605   enum tls_model tls_kind = TLS_MODEL_NONE;
3606   poly_int64 offset;
3607   addr = strip_offset_and_salt (addr, &offset);
3608   if (SYMBOL_REF_P (addr))
3609     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3610 
3611   return tls_kind;
3612 }
3613 
3614 /* We allow lo_sum's in our legitimate addresses so that combine can
3615    take care of combining addresses where necessary, but for
3616    generation purposes we generate the address
3617    as:
3618    RTL                               Absolute
3619    tmp = hi (symbol_ref);            adrp  x1, foo
3620    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
3621                                      nop
3622 
3623    PIC                               TLS
3624    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
3625    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
3626                                      bl   __tls_get_addr
3627                                      nop
3628 
3629    Load TLS symbol, depending on TLS mechanism and TLS access model.
3630 
3631    Global Dynamic - Traditional TLS:
3632    adrp tmp, :tlsgd:imm
3633    add  dest, tmp, #:tlsgd_lo12:imm
3634    bl   __tls_get_addr
3635 
3636    Global Dynamic - TLS Descriptors:
3637    adrp dest, :tlsdesc:imm
3638    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
3639    add  dest, dest, #:tlsdesc_lo12:imm
3640    blr  tmp
3641    mrs  tp, tpidr_el0
3642    add  dest, dest, tp
3643 
3644    Initial Exec:
3645    mrs  tp, tpidr_el0
3646    adrp tmp, :gottprel:imm
3647    ldr  dest, [tmp, #:gottprel_lo12:imm]
3648    add  dest, dest, tp
3649 
3650    Local Exec:
3651    mrs  tp, tpidr_el0
3652    add  t0, tp, #:tprel_hi12:imm, lsl #12
3653    add  t0, t0, #:tprel_lo12_nc:imm
3654 */
3655 
3656 static void
3657 aarch64_load_symref_appropriately (rtx dest, rtx imm,
3658 				   enum aarch64_symbol_type type)
3659 {
3660   switch (type)
3661     {
3662     case SYMBOL_SMALL_ABSOLUTE:
3663       {
3664 	/* In ILP32, the mode of dest can be either SImode or DImode.  */
3665 	rtx tmp_reg = dest;
3666 	machine_mode mode = GET_MODE (dest);
3667 
3668 	gcc_assert (mode == Pmode || mode == ptr_mode);
3669 
3670 	if (can_create_pseudo_p ())
3671 	  tmp_reg = gen_reg_rtx (mode);
3672 
3673 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3674 	emit_insn (gen_add_losym (dest, tmp_reg, imm));
3675 	return;
3676       }
3677 
3678     case SYMBOL_TINY_ABSOLUTE:
3679       emit_insn (gen_rtx_SET (dest, imm));
3680       return;
3681 
3682     case SYMBOL_SMALL_GOT_28K:
3683       {
3684 	machine_mode mode = GET_MODE (dest);
3685 	rtx gp_rtx = pic_offset_table_rtx;
3686 	rtx insn;
3687 	rtx mem;
3688 
3689 	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
3690 	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
3691 	   decide rtx costs, in which case pic_offset_table_rtx is not
3692 	   initialized.  For that case no need to generate the first adrp
3693 	   instruction as the final cost for global variable access is
3694 	   one instruction.  */
3695 	if (gp_rtx != NULL)
3696 	  {
3697 	    /* -fpic with -mcmodel=small allows a 32K GOT table size (but
3698 	       since we use the page base as the GOT base, the first page may
3699 	       be wasted; in the worst case only 28K of GOT space remains).
3700 
3701 	       The generated instruction sequence for accessing a global
3702 	       variable is:
3703 
3704 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3705 
3706 	       Only one instruction is needed, but we must initialize
3707 	       pic_offset_table_rtx properly.  We generate an initialization
3708 	       insn for every global access and let CSE remove the redundant ones.
3709 
3710 	       The final instruction sequence will look like the following
3711 	       for multiple global variable accesses:
3712 
3713 		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3714 
3715 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3716 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3717 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3718 		 ...  */
3719 
3720 	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3721 	    crtl->uses_pic_offset_table = 1;
3722 	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3723 
3724 	    if (mode != GET_MODE (gp_rtx))
3725 	      gp_rtx = gen_lowpart (mode, gp_rtx);
3726 
3727 	  }
3728 
3729 	if (mode == ptr_mode)
3730 	  {
3731 	    if (mode == DImode)
3732 	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3733 	    else
3734 	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3735 
3736 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
3737 	  }
3738 	else
3739 	  {
3740 	    gcc_assert (mode == Pmode);
3741 
3742 	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3743 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3744 	  }
3745 
3746 	/* The operand is expected to be a MEM.  Whenever the related insn
3747 	   pattern changes, the code above that calculates MEM should be
3748 	   updated.  */
3749 	gcc_assert (MEM_P (mem));
3750 	MEM_READONLY_P (mem) = 1;
3751 	MEM_NOTRAP_P (mem) = 1;
3752 	emit_insn (insn);
3753 	return;
3754       }
3755 
3756     case SYMBOL_SMALL_GOT_4G:
3757       {
3758 	/* In ILP32, the mode of dest can be either SImode or DImode,
3759 	   while the got entry is always of SImode size.  The mode of
3760 	   dest depends on how dest is used: if dest is assigned to a
3761 	   pointer (e.g. in the memory), it has SImode; it may have
3762 	   DImode if dest is dereferenced to access the memory.
3763 	   This is why we have to handle three different ldr_got_small
3764 	   patterns here (two patterns for ILP32).  */
3765 
3766 	rtx insn;
3767 	rtx mem;
3768 	rtx tmp_reg = dest;
3769 	machine_mode mode = GET_MODE (dest);
3770 
3771 	if (can_create_pseudo_p ())
3772 	  tmp_reg = gen_reg_rtx (mode);
3773 
3774 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3775 	if (mode == ptr_mode)
3776 	  {
3777 	    if (mode == DImode)
3778 	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3779 	    else
3780 	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3781 
3782 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
3783 	  }
3784 	else
3785 	  {
3786 	    gcc_assert (mode == Pmode);
3787 
3788 	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3789 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3790 	  }
3791 
3792 	gcc_assert (MEM_P (mem));
3793 	MEM_READONLY_P (mem) = 1;
3794 	MEM_NOTRAP_P (mem) = 1;
3795 	emit_insn (insn);
3796 	return;
3797       }
3798 
3799     case SYMBOL_SMALL_TLSGD:
3800       {
3801 	rtx_insn *insns;
3802 	/* The return type of __tls_get_addr is the C pointer type
3803 	   so use ptr_mode.  */
3804 	rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3805 	rtx tmp_reg = dest;
3806 
3807 	if (GET_MODE (dest) != ptr_mode)
3808 	  tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3809 
3810 	start_sequence ();
3811 	if (ptr_mode == SImode)
3812 	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3813 	else
3814 	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3815 	insns = get_insns ();
3816 	end_sequence ();
3817 
3818 	RTL_CONST_CALL_P (insns) = 1;
3819 	emit_libcall_block (insns, tmp_reg, result, imm);
3820 	/* Convert back to the mode of the dest adding a zero_extend
3821 	   from SImode (ptr_mode) to DImode (Pmode). */
3822 	if (dest != tmp_reg)
3823 	  convert_move (dest, tmp_reg, true);
3824 	return;
3825       }
3826 
3827     case SYMBOL_SMALL_TLSDESC:
3828       {
3829 	machine_mode mode = GET_MODE (dest);
3830 	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3831 	rtx tp;
3832 
3833 	gcc_assert (mode == Pmode || mode == ptr_mode);
3834 
3835 	/* In ILP32, the got entry is always of SImode size.  Unlike
3836 	   small GOT, the dest is fixed at reg 0.  */
3837 	if (TARGET_ILP32)
3838 	  emit_insn (gen_tlsdesc_small_si (imm));
3839 	else
3840 	  emit_insn (gen_tlsdesc_small_di (imm));
3841 	tp = aarch64_load_tp (NULL);
3842 
3843 	if (mode != Pmode)
3844 	  tp = gen_lowpart (mode, tp);
3845 
3846 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3847 	if (REG_P (dest))
3848 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3849 	return;
3850       }
3851 
3852     case SYMBOL_SMALL_TLSIE:
3853       {
3854 	/* In ILP32, the mode of dest can be either SImode or DImode,
3855 	   while the got entry is always of SImode size.  The mode of
3856 	   dest depends on how dest is used: if dest is assigned to a
3857 	   pointer (e.g. in the memory), it has SImode; it may have
3858 	   DImode if dest is dereferenced to access the memory.
3859 	   This is why we have to handle three different tlsie_small
3860 	   patterns here (two patterns for ILP32).  */
3861 	machine_mode mode = GET_MODE (dest);
3862 	rtx tmp_reg = gen_reg_rtx (mode);
3863 	rtx tp = aarch64_load_tp (NULL);
3864 
3865 	if (mode == ptr_mode)
3866 	  {
3867 	    if (mode == DImode)
3868 	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3869 	    else
3870 	      {
3871 		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3872 		tp = gen_lowpart (mode, tp);
3873 	      }
3874 	  }
3875 	else
3876 	  {
3877 	    gcc_assert (mode == Pmode);
3878 	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3879 	  }
3880 
3881 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3882 	if (REG_P (dest))
3883 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3884 	return;
3885       }
3886 
3887     case SYMBOL_TLSLE12:
3888     case SYMBOL_TLSLE24:
3889     case SYMBOL_TLSLE32:
3890     case SYMBOL_TLSLE48:
3891       {
3892 	machine_mode mode = GET_MODE (dest);
3893 	rtx tp = aarch64_load_tp (NULL);
3894 
3895 	if (mode != Pmode)
3896 	  tp = gen_lowpart (mode, tp);
3897 
3898 	switch (type)
3899 	  {
3900 	  case SYMBOL_TLSLE12:
3901 	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3902 			(dest, tp, imm));
3903 	    break;
3904 	  case SYMBOL_TLSLE24:
3905 	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3906 			(dest, tp, imm));
3907 	    break;
3908 	  case SYMBOL_TLSLE32:
3909 	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3910 			(dest, imm));
3911 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3912 			(dest, dest, tp));
3913 	    break;
3914 	  case SYMBOL_TLSLE48:
3915 	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3916 			(dest, imm));
3917 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3918 			(dest, dest, tp));
3919 	    break;
3920 	  default:
3921 	    gcc_unreachable ();
3922 	  }
3923 
3924 	if (REG_P (dest))
3925 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3926 	return;
3927       }
3928 
3929     case SYMBOL_TINY_GOT:
3930       {
3931 	rtx insn;
3932 	machine_mode mode = GET_MODE (dest);
3933 
3934 	if (mode == ptr_mode)
3935 	  insn = gen_ldr_got_tiny (mode, dest, imm);
3936 	else
3937 	  {
3938 	    gcc_assert (mode == Pmode);
3939 	    insn = gen_ldr_got_tiny_sidi (dest, imm);
3940 	  }
3941 
3942 	emit_insn (insn);
3943 	return;
3944       }
3945 
3946     case SYMBOL_TINY_TLSIE:
3947       {
3948 	machine_mode mode = GET_MODE (dest);
3949 	rtx tp = aarch64_load_tp (NULL);
3950 
3951 	if (mode == ptr_mode)
3952 	  {
3953 	    if (mode == DImode)
3954 	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3955 	    else
3956 	      {
3957 		tp = gen_lowpart (mode, tp);
3958 		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3959 	      }
3960 	  }
3961 	else
3962 	  {
3963 	    gcc_assert (mode == Pmode);
3964 	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3965 	  }
3966 
3967 	if (REG_P (dest))
3968 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3969 	return;
3970       }
3971 
3972     default:
3973       gcc_unreachable ();
3974     }
3975 }
3976 
3977 /* Emit a move from SRC to DEST.  Assume that the move expanders can
3978    handle all moves if !can_create_pseudo_p ().  The distinction is
3979    important because, unlike emit_move_insn, the move expanders know
3980    how to force Pmode objects into the constant pool even when the
3981    constant pool address is not itself legitimate.  */
3982 static rtx
3983 aarch64_emit_move (rtx dest, rtx src)
3984 {
3985   return (can_create_pseudo_p ()
3986 	  ? emit_move_insn (dest, src)
3987 	  : emit_move_insn_1 (dest, src));
3988 }
3989 
3990 /* Apply UNOPTAB to OP and store the result in DEST.  */
3991 
3992 static void
3993 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3994 {
3995   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3996   if (dest != tmp)
3997     emit_move_insn (dest, tmp);
3998 }
3999 
4000 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
4001 
4002 static void
4003 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4004 {
4005   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4006 			  OPTAB_DIRECT);
4007   if (dest != tmp)
4008     emit_move_insn (dest, tmp);
4009 }
4010 
4011 /* Split a 128-bit move operation into two 64-bit move operations,
4012    taking care to handle partial overlap of register to register
4013    copies.  Special cases are needed when moving between GP regs and
4014    FP regs.  SRC can be a register, constant or memory; DST a register
4015    or memory.  If either operand is memory it must not have any side
4016    effects.  */
4017 void
4018 aarch64_split_128bit_move (rtx dst, rtx src)
4019 {
4020   rtx dst_lo, dst_hi;
4021   rtx src_lo, src_hi;
4022 
4023   machine_mode mode = GET_MODE (dst);
4024 
4025   gcc_assert (mode == TImode || mode == TFmode);
4026   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4027   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4028 
4029   if (REG_P (dst) && REG_P (src))
4030     {
4031       int src_regno = REGNO (src);
4032       int dst_regno = REGNO (dst);
4033 
4034       /* Handle FP <-> GP regs.  */
4035       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4036 	{
4037 	  src_lo = gen_lowpart (word_mode, src);
4038 	  src_hi = gen_highpart (word_mode, src);
4039 
4040 	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4041 	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4042 	  return;
4043 	}
4044       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4045 	{
4046 	  dst_lo = gen_lowpart (word_mode, dst);
4047 	  dst_hi = gen_highpart (word_mode, dst);
4048 
4049 	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4050 	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4051 	  return;
4052 	}
4053     }
4054 
4055   dst_lo = gen_lowpart (word_mode, dst);
4056   dst_hi = gen_highpart (word_mode, dst);
4057   src_lo = gen_lowpart (word_mode, src);
4058   src_hi = gen_highpart_mode (word_mode, mode, src);
4059 
4060   /* At most one pairing may overlap.  */
4061   if (reg_overlap_mentioned_p (dst_lo, src_hi))
4062     {
4063       aarch64_emit_move (dst_hi, src_hi);
4064       aarch64_emit_move (dst_lo, src_lo);
4065     }
4066   else
4067     {
4068       aarch64_emit_move (dst_lo, src_lo);
4069       aarch64_emit_move (dst_hi, src_hi);
4070     }
4071 }
4072 
4073 /* Return true if we should split a move from 128-bit value SRC
4074    to 128-bit register DEST.  */
4075 
4076 bool
4077 aarch64_split_128bit_move_p (rtx dst, rtx src)
4078 {
4079   if (FP_REGNUM_P (REGNO (dst)))
4080     return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4081   /* All moves to GPRs need to be split.  */
4082   return true;
4083 }
4084 
4085 /* Split a complex SIMD combine.  */
4086 
4087 void
4088 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
4089 {
4090   machine_mode src_mode = GET_MODE (src1);
4091   machine_mode dst_mode = GET_MODE (dst);
4092 
4093   gcc_assert (VECTOR_MODE_P (dst_mode));
4094   gcc_assert (register_operand (dst, dst_mode)
4095 	      && register_operand (src1, src_mode)
4096 	      && register_operand (src2, src_mode));
4097 
4098   emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
4099   return;
4100 }
4101 
4102 /* Split a complex SIMD move.  */
4103 
4104 void
4105 aarch64_split_simd_move (rtx dst, rtx src)
4106 {
4107   machine_mode src_mode = GET_MODE (src);
4108   machine_mode dst_mode = GET_MODE (dst);
4109 
4110   gcc_assert (VECTOR_MODE_P (dst_mode));
4111 
4112   if (REG_P (dst) && REG_P (src))
4113     {
4114       gcc_assert (VECTOR_MODE_P (src_mode));
4115       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
4116     }
4117 }
4118 
4119 bool
4120 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4121 			      machine_mode ymode, rtx y)
4122 {
4123   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4124   gcc_assert (r != NULL);
4125   return rtx_equal_p (x, r);
4126 }
4127 
4128 /* Return TARGET if it is nonnull and a register of mode MODE.
4129    Otherwise, return a fresh register of mode MODE if we can,
4130    or TARGET reinterpreted as MODE if we can't.  */
4131 
4132 static rtx
4133 aarch64_target_reg (rtx target, machine_mode mode)
4134 {
4135   if (target && REG_P (target) && GET_MODE (target) == mode)
4136     return target;
4137   if (!can_create_pseudo_p ())
4138     {
4139       gcc_assert (target);
4140       return gen_lowpart (mode, target);
4141     }
4142   return gen_reg_rtx (mode);
4143 }
4144 
4145 /* Return a register that contains the constant in BUILDER, given that
4146    the constant is a legitimate move operand.  Use TARGET as the register
4147    if it is nonnull and convenient.  */
4148 
4149 static rtx
4150 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4151 {
4152   rtx src = builder.build ();
4153   target = aarch64_target_reg (target, GET_MODE (src));
4154   emit_insn (gen_rtx_SET (target, src));
4155   return target;
4156 }
4157 
4158 static rtx
4159 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
4160 {
4161   if (can_create_pseudo_p ())
4162     return force_reg (mode, value);
4163   else
4164     {
4165       gcc_assert (x);
4166       aarch64_emit_move (x, value);
4167       return x;
4168     }
4169 }
4170 
4171 /* Return true if predicate value X is a constant in which every element
4172    is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
4173    value, i.e. as a predicate in which all bits are significant.  */
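/* For example (illustrative): a VNx4BI constant is expanded element by
   element into groups of four VNx16BI bits, { b, 0, 0, 0 }, since each
   .S predicate element occupies four bits of the byte-level predicate.  */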
4174 
4175 static bool
4176 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
4177 {
4178   if (GET_CODE (x) != CONST_VECTOR)
4179     return false;
4180 
4181   unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
4182 					     GET_MODE_NUNITS (GET_MODE (x)));
4183   unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
4184   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
4185   builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
4186 
4187   unsigned int nelts = const_vector_encoded_nelts (x);
4188   for (unsigned int i = 0; i < nelts; ++i)
4189     {
4190       rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
4191       if (!CONST_INT_P (elt))
4192 	return false;
4193 
4194       builder.quick_push (elt);
4195       for (unsigned int j = 1; j < factor; ++j)
4196 	builder.quick_push (const0_rtx);
4197     }
4198   builder.finalize ();
4199   return true;
4200 }
4201 
4202 /* BUILDER contains a predicate constant of mode VNx16BI.  Return the
4203    widest predicate element size it can have (that is, the largest size
4204    for which each element would still be 0 or 1).  */
4205 
4206 unsigned int
4207 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
4208 {
4209   /* Start with the most optimistic assumption: that we only need
4210      one bit per pattern.  This is what we will use if only the first
4211      bit in each pattern is ever set.  */
4212   unsigned int mask = GET_MODE_SIZE (DImode);
4213   mask |= builder.npatterns ();
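  /* Illustrative note: after the loop below, the lowest set bit of MASK is
     the largest power of two (capped at GET_MODE_SIZE (DImode)) that
     divides both the number of patterns and the index of every set bit,
     which is exactly the widest element size for which each element is
     still 0 or 1.  */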
4214 
4215   /* Look for set bits.  */
4216   unsigned int nelts = builder.encoded_nelts ();
4217   for (unsigned int i = 1; i < nelts; ++i)
4218     if (INTVAL (builder.elt (i)) != 0)
4219       {
4220 	if (i & 1)
4221 	  return 1;
4222 	mask |= i;
4223       }
4224   return mask & -mask;
4225 }
4226 
4227 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
4228    return that predicate mode, otherwise return opt_machine_mode ().  */
4229 
4230 opt_machine_mode
4231 aarch64_ptrue_all_mode (rtx x)
4232 {
4233   gcc_assert (GET_MODE (x) == VNx16BImode);
4234   if (GET_CODE (x) != CONST_VECTOR
4235       || !CONST_VECTOR_DUPLICATE_P (x)
4236       || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
4237       || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
4238     return opt_machine_mode ();
4239 
4240   unsigned int nelts = const_vector_encoded_nelts (x);
4241   for (unsigned int i = 1; i < nelts; ++i)
4242     if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
4243       return opt_machine_mode ();
4244 
4245   return aarch64_sve_pred_mode (nelts);
4246 }
4247 
4248 /* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
4249    that the constant would have with predicate element size ELT_SIZE
4250    (ignoring the upper bits in each element) and return:
4251 
4252    * -1 if all bits are set
4253    * N if the predicate has N leading set bits followed by all clear bits
4254    * 0 if the predicate does not have any of these forms.  */
4255 
4256 int
4257 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
4258 			      unsigned int elt_size)
4259 {
4260   /* If nelts_per_pattern is 3, we have set bits followed by clear bits
4261      followed by set bits.  */
4262   if (builder.nelts_per_pattern () == 3)
4263     return 0;
4264 
4265   /* Skip over leading set bits.  */
4266   unsigned int nelts = builder.encoded_nelts ();
4267   unsigned int i = 0;
4268   for (; i < nelts; i += elt_size)
4269     if (INTVAL (builder.elt (i)) == 0)
4270       break;
4271   unsigned int vl = i / elt_size;
4272 
4273   /* Check for the all-true case.  */
4274   if (i == nelts)
4275     return -1;
4276 
4277   /* If nelts_per_pattern is 1, then either VL is zero, or we have a
4278      repeating pattern of set bits followed by clear bits.  */
4279   if (builder.nelts_per_pattern () != 2)
4280     return 0;
4281 
4282   /* We have a "foreground" value and a duplicated "background" value.
4283      If the background might repeat and the last set bit belongs to it,
4284      we might have set bits followed by clear bits followed by set bits.  */
4285   if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
4286     return 0;
4287 
4288   /* Make sure that the rest are all clear.  */
4289   for (; i < nelts; i += elt_size)
4290     if (INTVAL (builder.elt (i)) != 0)
4291       return 0;
4292 
4293   return vl;
4294 }
4295 
4296 /* See if there is an svpattern that encodes an SVE predicate of mode
4297    PRED_MODE in which the first VL bits are set and the rest are clear.
4298    Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
4299    A VL of -1 indicates an all-true vector.  */
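/* For example (illustrative): VL values 1..8 map directly to SV_VL1..SV_VL8
   and power-of-two values 16..256 map to SV_VL16 onwards; other lengths are
   representable only via MUL3, MUL4, POW2 or ALL, and then only when the
   number of elements in PRED_MODE is known at compile time.  */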
4300 
4301 aarch64_svpattern
4302 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
4303 {
4304   if (vl < 0)
4305     return AARCH64_SV_ALL;
4306 
4307   if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
4308     return AARCH64_NUM_SVPATTERNS;
4309 
4310   if (vl >= 1 && vl <= 8)
4311     return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
4312 
4313   if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
4314     return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
4315 
4316   int max_vl;
4317   if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
4318     {
4319       if (vl == (max_vl / 3) * 3)
4320 	return AARCH64_SV_MUL3;
4321       /* These would only trigger for non-power-of-2 lengths.  */
4322       if (vl == (max_vl & -4))
4323 	return AARCH64_SV_MUL4;
4324       if (vl == (1 << floor_log2 (max_vl)))
4325 	return AARCH64_SV_POW2;
4326       if (vl == max_vl)
4327 	return AARCH64_SV_ALL;
4328     }
4329   return AARCH64_NUM_SVPATTERNS;
4330 }
4331 
4332 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
4333    bits has the lowest bit set and the upper bits clear.  This is the
4334    VNx16BImode equivalent of a PTRUE for controlling elements of
4335    ELT_SIZE bytes.  However, because the constant is VNx16BImode,
4336    all bits are significant, even the upper zeros.  */
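/* For example (illustrative): aarch64_ptrue_all (2) builds the constant
   { 1, 0, 1, 0, ... }, which is the VNx16BI view of a PTRUE that controls
   2-byte (.H) elements.  */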
4337 
4338 rtx
4339 aarch64_ptrue_all (unsigned int elt_size)
4340 {
4341   rtx_vector_builder builder (VNx16BImode, elt_size, 1);
4342   builder.quick_push (const1_rtx);
4343   for (unsigned int i = 1; i < elt_size; ++i)
4344     builder.quick_push (const0_rtx);
4345   return builder.build ();
4346 }
4347 
4348 /* Return an all-true predicate register of mode MODE.  */
4349 
4350 rtx
4351 aarch64_ptrue_reg (machine_mode mode)
4352 {
4353   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
4354   rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
4355   return gen_lowpart (mode, reg);
4356 }
4357 
4358 /* Return an all-false predicate register of mode MODE.  */
4359 
4360 rtx
4361 aarch64_pfalse_reg (machine_mode mode)
4362 {
4363   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
4364   rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
4365   return gen_lowpart (mode, reg);
4366 }
4367 
4368 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
4369    for it.  PRED2[0] is the predicate for the instruction whose result
4370    is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
4371    for it.  Return true if we can prove that the two predicates are
4372    equivalent for PTEST purposes; that is, if we can replace PRED2[0]
4373    with PRED1[0] without changing behavior.  */
4374 
4375 bool
4376 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
4377 {
4378   machine_mode mode = GET_MODE (pred1[0]);
4379   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
4380 	      && mode == GET_MODE (pred2[0])
4381 	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
4382 	      && aarch64_sve_ptrue_flag (pred2[1], SImode));
4383 
4384   bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
4385 		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
4386   bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
4387 		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
4388   return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
4389 }
4390 
4391 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
4392    DATA_MODE, and return the result in a predicate of mode PRED_MODE.
4393    Use TARGET as the target register if nonnull and convenient.  */
4394 
4395 static rtx
4396 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
4397 			  machine_mode data_mode, rtx op1, rtx op2)
4398 {
4399   insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
4400   expand_operand ops[5];
4401   create_output_operand (&ops[0], target, pred_mode);
4402   create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
4403   create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
4404   create_input_operand (&ops[3], op1, data_mode);
4405   create_input_operand (&ops[4], op2, data_mode);
4406   expand_insn (icode, 5, ops);
4407   return ops[0].value;
4408 }
4409 
4410 /* Use a comparison to convert integer vector SRC into MODE, which is
4411    the corresponding SVE predicate mode.  Use TARGET for the result
4412    if it's nonnull and convenient.  */
4413 
4414 rtx
4415 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
4416 {
4417   machine_mode src_mode = GET_MODE (src);
4418   return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
4419 				   src, CONST0_RTX (src_mode));
4420 }
4421 
4422 /* Return the assembly token for svprfop value PRFOP.  */
4423 
4424 static const char *
4425 svprfop_token (enum aarch64_svprfop prfop)
4426 {
4427   switch (prfop)
4428     {
4429 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
4430     AARCH64_FOR_SVPRFOP (CASE)
4431 #undef CASE
4432     case AARCH64_NUM_SVPRFOPS:
4433       break;
4434     }
4435   gcc_unreachable ();
4436 }
4437 
4438 /* Return the assembly string for an SVE prefetch operation with
4439    mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
4440    and that SUFFIX is the format for the remaining operands.  */
4441 
4442 char *
4443 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
4444 			     const char *suffix)
4445 {
4446   static char buffer[128];
4447   aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
4448   unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
4449 				   mnemonic, svprfop_token (prfop), suffix);
4450   gcc_assert (written < sizeof (buffer));
4451   return buffer;
4452 }
4453 
4454 /* Check whether we can calculate the number of elements in PATTERN
4455    at compile time, given that there are NELTS_PER_VQ elements per
4456    128-bit block.  Return the value if so, otherwise return -1.  */
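/* For example (illustrative): CNTH with pattern VL16 folds to 16 when at
   least 16 halfword elements are known to be available, folds to 0 when
   the vector is known to be too short, and returns -1 when the answer
   depends on the runtime vector length.  */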
4457 
4458 HOST_WIDE_INT
4459 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
4460 {
4461   unsigned int vl, const_vg;
4462   if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
4463     vl = 1 + (pattern - AARCH64_SV_VL1);
4464   else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
4465     vl = 16 << (pattern - AARCH64_SV_VL16);
4466   else if (aarch64_sve_vg.is_constant (&const_vg))
4467     {
4468       /* There are two vector granules per quadword.  */
4469       unsigned int nelts = (const_vg / 2) * nelts_per_vq;
4470       switch (pattern)
4471 	{
4472 	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
4473 	case AARCH64_SV_MUL4: return nelts & -4;
4474 	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
4475 	case AARCH64_SV_ALL: return nelts;
4476 	default: gcc_unreachable ();
4477 	}
4478     }
4479   else
4480     return -1;
4481 
4482   /* There are two vector granules per quadword.  */
4483   poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
4484   if (known_le (vl, nelts_all))
4485     return vl;
4486 
4487   /* Requesting more elements than are available results in a PFALSE.  */
4488   if (known_gt (vl, nelts_all))
4489     return 0;
4490 
4491   return -1;
4492 }
4493 
4494 /* Return true if we can move VALUE into a register using a single
4495    CNT[BHWD] instruction.  */
4496 
4497 static bool
4498 aarch64_sve_cnt_immediate_p (poly_int64 value)
4499 {
4500   HOST_WIDE_INT factor = value.coeffs[0];
4501   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
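  /* For example (illustrative): a factor of 2 corresponds to CNTD with
     mul #1 and a factor of 16 to CNTB with mul #1 (or CNTD with mul #8),
     whereas 34 is rejected because it would need CNTD with mul #17.  */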
4502   return (value.coeffs[1] == factor
4503 	  && IN_RANGE (factor, 2, 16 * 16)
4504 	  && (factor & 1) == 0
4505 	  && factor <= 16 * (factor & -factor));
4506 }
4507 
4508 /* Likewise for rtx X.  */
4509 
4510 bool
4511 aarch64_sve_cnt_immediate_p (rtx x)
4512 {
4513   poly_int64 value;
4514   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
4515 }
4516 
4517 /* Return the asm string for an instruction with a CNT-like vector size
4518    operand (a vector pattern followed by a multiplier in the range [1, 16]).
4519    PREFIX is the mnemonic without the size suffix and OPERANDS is the
4520    first part of the operands template (the part that comes before the
4521    vector size itself).  PATTERN is the pattern to use.  FACTOR is the
4522    number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
4523    in each quadword.  If it is zero, we can use any element size.  */
4524 
4525 static char *
4526 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4527 				  aarch64_svpattern pattern,
4528 				  unsigned int factor,
4529 				  unsigned int nelts_per_vq)
4530 {
4531   static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
4532 
4533   if (nelts_per_vq == 0)
4534     /* There is some overlap in the ranges of the four CNT instructions.
4535        Here we always use the smallest possible element size, so that the
4536        multiplier is 1 wherever possible.  */
4537     nelts_per_vq = factor & -factor;
4538   int shift = std::min (exact_log2 (nelts_per_vq), 4);
4539   gcc_assert (IN_RANGE (shift, 1, 4));
4540   char suffix = "dwhb"[shift - 1];
4541 
4542   factor >>= shift;
4543   unsigned int written;
4544   if (pattern == AARCH64_SV_ALL && factor == 1)
4545     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
4546 			prefix, suffix, operands);
4547   else if (factor == 1)
4548     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
4549 			prefix, suffix, operands, svpattern_token (pattern));
4550   else
4551     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
4552 			prefix, suffix, operands, svpattern_token (pattern),
4553 			factor);
4554   gcc_assert (written < sizeof (buffer));
4555   return buffer;
4556 }
4557 
4558 /* Return the asm string for an instruction with a CNT-like vector size
4559    operand (a vector pattern followed by a multiplier in the range [1, 16]).
4560    PREFIX is the mnemonic without the size suffix and OPERANDS is the
4561    first part of the operands template (the part that comes before the
4562    vector size itself).  X is the value of the vector size operand,
4563    as a polynomial integer rtx; we need to convert this into an "all"
4564    pattern with a multiplier.  */
4565 
4566 char *
4567 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4568 				  rtx x)
4569 {
4570   poly_int64 value = rtx_to_poly_int64 (x);
4571   gcc_assert (aarch64_sve_cnt_immediate_p (value));
4572   return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
4573 					   value.coeffs[1], 0);
4574 }
4575 
4576 /* Return the asm string for an instruction with a CNT-like vector size
4577    operand (a vector pattern followed by a multiplier in the range [1, 16]).
4578    PREFIX is the mnemonic without the size suffix and OPERANDS is the
4579    first part of the operands template (the part that comes before the
4580    vector size itself).  CNT_PAT[0..2] are the operands of the
4581    UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */
4582 
4583 char *
4584 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4585 				      const char *operands, rtx *cnt_pat)
4586 {
4587   aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4588   unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4589   unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4590   return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4591 					   factor, nelts_per_vq);
4592 }
4593 
4594 /* Return true if we can add X using a single SVE INC or DEC instruction.  */
4595 
4596 bool
4597 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4598 {
4599   poly_int64 value;
4600   return (poly_int_rtx_p (x, &value)
4601 	  && (aarch64_sve_cnt_immediate_p (value)
4602 	      || aarch64_sve_cnt_immediate_p (-value)));
4603 }
4604 
4605 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4606    operand 0.  */
4607 
4608 char *
4609 aarch64_output_sve_scalar_inc_dec (rtx offset)
4610 {
4611   poly_int64 offset_value = rtx_to_poly_int64 (offset);
4612   gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4613   if (offset_value.coeffs[1] > 0)
4614     return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
4615 					     offset_value.coeffs[1], 0);
4616   else
4617     return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
4618 					     -offset_value.coeffs[1], 0);
4619 }
4620 
4621 /* Return true if we can add VALUE to a register using a single ADDVL
4622    or ADDPL instruction.  */
4623 
4624 static bool
4625 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4626 {
4627   HOST_WIDE_INT factor = value.coeffs[0];
4628   if (factor == 0 || value.coeffs[1] != factor)
4629     return false;
4630   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
4631      and a value of 16 is one vector width.  */
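  /* For example (illustrative): ADDVL accepts factors that are multiples of
     16 in [-512, 496] (-32 to +31 vectors) and ADDPL accepts multiples of 2
     in [-64, 62] (-32 to +31 predicate widths).  */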
4632   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
4633 	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
4634 }
4635 
4636 /* Likewise for rtx X.  */
4637 
4638 bool
4639 aarch64_sve_addvl_addpl_immediate_p (rtx x)
4640 {
4641   poly_int64 value;
4642   return (poly_int_rtx_p (x, &value)
4643 	  && aarch64_sve_addvl_addpl_immediate_p (value));
4644 }
4645 
4646 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4647    to operand 1 and storing the result in operand 0.  */
4648 
4649 char *
4650 aarch64_output_sve_addvl_addpl (rtx offset)
4651 {
4652   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4653   poly_int64 offset_value = rtx_to_poly_int64 (offset);
4654   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4655 
4656   int factor = offset_value.coeffs[1];
4657   if ((factor & 15) == 0)
4658     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4659   else
4660     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4661   return buffer;
4662 }
4663 
4664 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4665    instruction.  If it is, store the number of elements in each vector
4666    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4667    factor in *FACTOR_OUT (if nonnull).  */
4668 
4669 bool
4670 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4671 					unsigned int *nelts_per_vq_out)
4672 {
4673   rtx elt;
4674   poly_int64 value;
4675 
4676   if (!const_vec_duplicate_p (x, &elt)
4677       || !poly_int_rtx_p (elt, &value))
4678     return false;
4679 
4680   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4681   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4682     /* There's no vector INCB.  */
4683     return false;
4684 
4685   HOST_WIDE_INT factor = value.coeffs[0];
4686   if (value.coeffs[1] != factor)
4687     return false;
4688 
4689   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
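  /* For example (illustrative): for VNx8HImode, NELTS_PER_VQ is 8, so a
     duplicated poly_int element of {8, 8} matches INCH with mul #1, while
     {136, 136} would need mul #17 and is rejected.  */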
4690   if ((factor % nelts_per_vq) != 0
4691       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4692     return false;
4693 
4694   if (factor_out)
4695     *factor_out = factor;
4696   if (nelts_per_vq_out)
4697     *nelts_per_vq_out = nelts_per_vq;
4698   return true;
4699 }
4700 
4701 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4702    instruction.  */
4703 
4704 bool
4705 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4706 {
4707   return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4708 }
4709 
4710 /* Return the asm template for an SVE vector INC or DEC instruction.
4711    OPERANDS gives the operands before the vector count and X is the
4712    value of the vector count operand itself.  */
4713 
4714 char *
4715 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4716 {
4717   int factor;
4718   unsigned int nelts_per_vq;
4719   if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4720     gcc_unreachable ();
4721   if (factor < 0)
4722     return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4723 					     -factor, nelts_per_vq);
4724   else
4725     return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4726 					     factor, nelts_per_vq);
4727 }
4728 
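/* A brief summary of the helper below: move integer constant IMM into DEST
   of mode MODE when GENERATE is true, and in all cases return the number of
   instructions needed to synthesise the value (an initial MOV/MOVN/bitmask
   immediate plus MOVK insertions).  For example (illustrative),
   0x1234567812345678 needs a MOV plus three MOVKs, i.e. 4 instructions.  */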
4729 static int
4730 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4731 				scalar_int_mode mode)
4732 {
4733   int i;
4734   unsigned HOST_WIDE_INT val, val2, mask;
4735   int one_match, zero_match;
4736   int num_insns;
4737 
4738   val = INTVAL (imm);
4739 
4740   if (aarch64_move_imm (val, mode))
4741     {
4742       if (generate)
4743 	emit_insn (gen_rtx_SET (dest, imm));
4744       return 1;
4745     }
4746 
4747   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4748      (with XXXX non-zero). In that case check to see if the move can be done in
4749      a smaller mode.  */
4750   val2 = val & 0xffffffff;
4751   if (mode == DImode
4752       && aarch64_move_imm (val2, SImode)
4753       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4754     {
4755       if (generate)
4756 	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4757 
4758       /* Check if we have to emit a second instruction by checking to see
4759          if any of the upper 32 bits of the original DI mode value is set.  */
4760       if (val == val2)
4761 	return 1;
4762 
4763       i = (val >> 48) ? 48 : 32;
4764 
4765       if (generate)
4766 	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4767 				    GEN_INT ((val >> i) & 0xffff)));
4768 
4769       return 2;
4770     }
4771 
4772   if ((val >> 32) == 0 || mode == SImode)
4773     {
4774       if (generate)
4775 	{
4776 	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4777 	  if (mode == SImode)
4778 	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4779 				       GEN_INT ((val >> 16) & 0xffff)));
4780 	  else
4781 	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4782 				       GEN_INT ((val >> 16) & 0xffff)));
4783 	}
4784       return 2;
4785     }
4786 
4787   /* Remaining cases are all for DImode.  */
4788 
4789   mask = 0xffff;
4790   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4791     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4792   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4793     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4794 
4795   if (zero_match != 2 && one_match != 2)
4796     {
4797       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4798 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
4799 	 zeroes creates a valid bitmask.  To check any repeated bitmask,
4800 	 try using 16 bits from the other 32-bit half of val.  */
4801 
4802       for (i = 0; i < 64; i += 16, mask <<= 16)
4803 	{
4804 	  val2 = val & ~mask;
4805 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4806 	    break;
4807 	  val2 = val | mask;
4808 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4809 	    break;
4810 	  val2 = val2 & ~mask;
4811 	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4812 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4813 	    break;
4814 	}
4815       if (i != 64)
4816 	{
4817 	  if (generate)
4818 	    {
4819 	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4820 	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4821 					 GEN_INT ((val >> i) & 0xffff)));
4822 	    }
4823 	  return 2;
4824 	}
4825     }
4826 
4827   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4828      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
4829      otherwise skip zero bits.  */
4830 
4831   num_insns = 1;
4832   mask = 0xffff;
4833   val2 = one_match > zero_match ? ~val : val;
4834   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4835 
4836   if (generate)
4837     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4838 					   ? (val | ~(mask << i))
4839 					   : (val & (mask << i)))));
4840   for (i += 16; i < 64; i += 16)
4841     {
4842       if ((val2 & (mask << i)) == 0)
4843 	continue;
4844       if (generate)
4845 	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4846 				   GEN_INT ((val >> i) & 0xffff)));
4847       num_insns++;
4848     }
4849 
4850   return num_insns;
4851 }
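
/* Two illustrative cases (register numbers arbitrary):

     0x0000000012345678 has a zero upper half and is synthesized as
	mov	x0, #0x5678
	movk	x0, #0x1234, lsl #16
     i.e. 2 instructions, whereas a constant such as 0x1234567890abcdef,
     in which no 16-bit chunk is all-zeros or all-ones and no bitmask
     substitution helps, falls through to the final loop and needs the
     full MOV + 3 * MOVK sequence (4 instructions).  */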
4852 
4853 /* Return whether imm is a 128-bit immediate which is simple enough to
4854    expand inline.  */
4855 bool
4856 aarch64_mov128_immediate (rtx imm)
4857 {
4858   if (CONST_INT_P (imm))
4859     return true;
4860 
4861   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4862 
4863   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4864   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4865 
4866   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4867 	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4868 }
4869 
4870 
4871 /* Return the number of temporary registers that aarch64_add_offset_1
4872    would need to add OFFSET to a register.  */
4873 
4874 static unsigned int
4875 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4876 {
4877   return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4878 }
4879 
4880 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
4881    a non-polynomial OFFSET.  MODE is the mode of the addition.
4882    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4883    be set and CFA adjustments added to the generated instructions.
4884 
4885    TEMP1, if nonnull, is a register of mode MODE that can be used as a
4886    temporary if register allocation is already complete.  This temporary
4887    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
4888    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4889    the immediate again.
4890 
4891    Since this function may be used to adjust the stack pointer, we must
4892    ensure that it cannot cause transient stack deallocation (for example
4893    by first incrementing SP and then decrementing when adjusting by a
4894    large immediate).  */
4895 
4896 static void
4897 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4898 		      rtx src, HOST_WIDE_INT offset, rtx temp1,
4899 		      bool frame_related_p, bool emit_move_imm)
4900 {
4901   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4902   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4903 
4904   unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4905   rtx_insn *insn;
4906 
4907   if (!moffset)
4908     {
4909       if (!rtx_equal_p (dest, src))
4910 	{
4911 	  insn = emit_insn (gen_rtx_SET (dest, src));
4912 	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
4913 	}
4914       return;
4915     }
4916 
4917   /* Single instruction adjustment.  */
4918   if (aarch64_uimm12_shift (moffset))
4919     {
4920       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4921       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4922       return;
4923     }
4924 
4925   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4926      and either:
4927 
4928      a) the offset cannot be loaded by a 16-bit move or
4929      b) there is no spare register into which we can move it.  */
4930   if (moffset < 0x1000000
4931       && ((!temp1 && !can_create_pseudo_p ())
4932 	  || !aarch64_move_imm (moffset, mode)))
4933     {
4934       HOST_WIDE_INT low_off = moffset & 0xfff;
4935 
4936       low_off = offset < 0 ? -low_off : low_off;
4937       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4938       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4939       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4940       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4941       return;
4942     }
4943 
4944   /* Emit a move immediate if required and an addition/subtraction.  */
4945   if (emit_move_imm)
4946     {
4947       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4948       temp1 = aarch64_force_temporary (mode, temp1,
4949 				       gen_int_mode (moffset, mode));
4950     }
4951   insn = emit_insn (offset < 0
4952 		    ? gen_sub3_insn (dest, src, temp1)
4953 		    : gen_add3_insn (dest, src, temp1));
4954   if (frame_related_p)
4955     {
4956       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4957       rtx adj = plus_constant (mode, src, offset);
4958       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4959     }
4960 }
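
/* For instance, adding 0x123456 when the offset cannot be loaded with a
   single move immediate (and so hits the double-addition case above)
   produces roughly:
	add	dest, src, #0x456
	add	dest, dest, #0x123000
   both of which are valid 12-bit (optionally shifted) ADD immediates.  */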
4961 
4962 /* Return the number of temporary registers that aarch64_add_offset
4963    would need to move OFFSET into a register or add OFFSET to a register;
4964    ADD_P is true if we want the latter rather than the former.  */
4965 
4966 static unsigned int
4967 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4968 {
4969   /* This follows the same structure as aarch64_add_offset.  */
4970   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4971     return 0;
4972 
4973   unsigned int count = 0;
4974   HOST_WIDE_INT factor = offset.coeffs[1];
4975   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4976   poly_int64 poly_offset (factor, factor);
4977   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4978     /* Need one register for the ADDVL/ADDPL result.  */
4979     count += 1;
4980   else if (factor != 0)
4981     {
4982       factor = abs (factor);
4983       if (factor > 16 * (factor & -factor))
4984 	/* Need one register for the CNT result and one for the multiplication
4985 	   factor.  If necessary, the second temporary can be reused for the
4986 	   constant part of the offset.  */
4987 	return 2;
4988       /* Need one register for the CNT result (which might then
4989 	 be shifted).  */
4990       count += 1;
4991     }
4992   return count + aarch64_add_offset_1_temporaries (constant);
4993 }
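
/* Example: with ADD_P set, an offset of poly_int64 (0, 16) -- that is,
   16 * (VQ - 1) bytes -- needs one temporary: the VG-based part (16, 16)
   can be added with a single ADDVL into a temporary result, and the
   residual constant -16 needs no further register, so the total is 1.  */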
4994 
4995 /* If X can be represented as a poly_int64, return the number
4996    of temporaries that are required to add it to a register.
4997    Return -1 otherwise.  */
4998 
4999 int
5000 aarch64_add_offset_temporaries (rtx x)
5001 {
5002   poly_int64 offset;
5003   if (!poly_int_rtx_p (x, &offset))
5004     return -1;
5005   return aarch64_offset_temporaries (true, offset);
5006 }
5007 
5008 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
5009    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5010    be set and CFA adjustments added to the generated instructions.
5011 
5012    TEMP1, if nonnull, is a register of mode MODE that can be used as a
5013    temporary if register allocation is already complete.  This temporary
5014    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
5015    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
5016    false to avoid emitting the immediate again.
5017 
5018    TEMP2, if nonnull, is a second temporary register that doesn't
5019    overlap either DEST or SRC.
5020 
5021    Since this function may be used to adjust the stack pointer, we must
5022    ensure that it cannot cause transient stack deallocation (for example
5023    by first incrementing SP and then decrementing when adjusting by a
5024    large immediate).  */
5025 
5026 static void
5027 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5028 		    poly_int64 offset, rtx temp1, rtx temp2,
5029 		    bool frame_related_p, bool emit_move_imm = true)
5030 {
5031   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5032   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5033   gcc_assert (temp1 == NULL_RTX
5034 	      || !frame_related_p
5035 	      || !reg_overlap_mentioned_p (temp1, dest));
5036   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
5037 
5038   /* Try using ADDVL or ADDPL to add the whole value.  */
5039   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
5040     {
5041       rtx offset_rtx = gen_int_mode (offset, mode);
5042       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5043       RTX_FRAME_RELATED_P (insn) = frame_related_p;
5044       return;
5045     }
5046 
5047   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
5048      SVE vector register, over and above the minimum size of 128 bits.
5049      This is equivalent to half the value returned by CNTD with a
5050      vector shape of ALL.  */
5051   HOST_WIDE_INT factor = offset.coeffs[1];
5052   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5053 
5054   /* Try using ADDVL or ADDPL to add the VG-based part.  */
5055   poly_int64 poly_offset (factor, factor);
5056   if (src != const0_rtx
5057       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5058     {
5059       rtx offset_rtx = gen_int_mode (poly_offset, mode);
5060       if (frame_related_p)
5061 	{
5062 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5063 	  RTX_FRAME_RELATED_P (insn) = true;
5064 	  src = dest;
5065 	}
5066       else
5067 	{
5068 	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
5069 	  src = aarch64_force_temporary (mode, temp1, addr);
5070 	  temp1 = temp2;
5071 	  temp2 = NULL_RTX;
5072 	}
5073     }
5074   /* Otherwise use a CNT-based sequence.  */
5075   else if (factor != 0)
5076     {
5077       /* Use a subtraction if we have a negative factor.  */
5078       rtx_code code = PLUS;
5079       if (factor < 0)
5080 	{
5081 	  factor = -factor;
5082 	  code = MINUS;
5083 	}
5084 
5085       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
5086 	 into the multiplication.  */
5087       rtx val;
5088       int shift = 0;
5089       if (factor & 1)
5090 	/* Use a right shift by 1.  */
5091 	shift = -1;
5092       else
5093 	factor /= 2;
5094       HOST_WIDE_INT low_bit = factor & -factor;
5095       if (factor <= 16 * low_bit)
5096 	{
5097 	  if (factor > 16 * 8)
5098 	    {
5099 	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
5100 		 the value with the minimum multiplier and shift it into
5101 		 position.  */
5102 	      int extra_shift = exact_log2 (low_bit);
5103 	      shift += extra_shift;
5104 	      factor >>= extra_shift;
5105 	    }
5106 	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
5107 	}
5108       else
5109 	{
5110 	  /* Base the factor on LOW_BIT if we can calculate LOW_BIT
5111 	     directly, since that should increase the chances of being
5112 	     able to use a shift and add sequence.  If LOW_BIT itself
5113 	     is out of range, just use CNTD.  */
5114 	  if (low_bit <= 16 * 8)
5115 	    factor /= low_bit;
5116 	  else
5117 	    low_bit = 1;
5118 
5119 	  val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
5120 	  val = aarch64_force_temporary (mode, temp1, val);
5121 
5122 	  if (can_create_pseudo_p ())
5123 	    {
5124 	      rtx coeff1 = gen_int_mode (factor, mode);
5125 	      val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
5126 	    }
5127 	  else
5128 	    {
5129 	      /* Go back to using a negative multiplication factor if we have
5130 		 no register from which to subtract.  */
5131 	      if (code == MINUS && src == const0_rtx)
5132 		{
5133 		  factor = -factor;
5134 		  code = PLUS;
5135 		}
5136 	      rtx coeff1 = gen_int_mode (factor, mode);
5137 	      coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
5138 	      val = gen_rtx_MULT (mode, val, coeff1);
5139 	    }
5140 	}
5141 
5142       if (shift > 0)
5143 	{
5144 	  /* Multiply by 1 << SHIFT.  */
5145 	  val = aarch64_force_temporary (mode, temp1, val);
5146 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
5147 	}
5148       else if (shift == -1)
5149 	{
5150 	  /* Divide by 2.  */
5151 	  val = aarch64_force_temporary (mode, temp1, val);
5152 	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
5153 	}
5154 
5155       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
5156       if (src != const0_rtx)
5157 	{
5158 	  val = aarch64_force_temporary (mode, temp1, val);
5159 	  val = gen_rtx_fmt_ee (code, mode, src, val);
5160 	}
5161       else if (code == MINUS)
5162 	{
5163 	  val = aarch64_force_temporary (mode, temp1, val);
5164 	  val = gen_rtx_NEG (mode, val);
5165 	}
5166 
5167       if (constant == 0 || frame_related_p)
5168 	{
5169 	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
5170 	  if (frame_related_p)
5171 	    {
5172 	      RTX_FRAME_RELATED_P (insn) = true;
5173 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
5174 			    gen_rtx_SET (dest, plus_constant (Pmode, src,
5175 							      poly_offset)));
5176 	    }
5177 	  src = dest;
5178 	  if (constant == 0)
5179 	    return;
5180 	}
5181       else
5182 	{
5183 	  src = aarch64_force_temporary (mode, temp1, val);
5184 	  temp1 = temp2;
5185 	  temp2 = NULL_RTX;
5186 	}
5187 
5188       emit_move_imm = true;
5189     }
5190 
5191   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
5192 			frame_related_p, emit_move_imm);
5193 }
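
/* A rough example of the CNT-based path (register numbers arbitrary and
   assuming a temporary is available): adding poly_int64 (7, 7) -- 7 bytes
   per 128-bit quadword, which no ADDVL/ADDPL immediate can express --
   comes out as something like:
	cntd	tmp, all, mul #7
	asr	tmp, tmp, #1
	add	dest, src, tmp
   i.e. SRC + CNTD * 7 / 2, with no residual constant part.  */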
5194 
5195 /* Like aarch64_add_offset, but the offset is given as an rtx rather
5196    than a poly_int64.  */
5197 
5198 void
5199 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5200 			  rtx offset_rtx, rtx temp1, rtx temp2)
5201 {
5202   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
5203 		      temp1, temp2, false);
5204 }
5205 
5206 /* Add DELTA to the stack pointer, marking the instructions frame-related.
5207    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
5208    if TEMP1 already contains abs (DELTA).  */
5209 
5210 static inline void
5211 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
5212 {
5213   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
5214 		      temp1, temp2, true, emit_move_imm);
5215 }
5216 
5217 /* Subtract DELTA from the stack pointer, marking the instructions
5218    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
5219    if nonnull.  */
5220 
5221 static inline void
5222 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
5223 		bool emit_move_imm = true)
5224 {
5225   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
5226 		      temp1, temp2, frame_related_p, emit_move_imm);
5227 }
5228 
5229 /* Set DEST to (vec_series BASE STEP).  */
5230 
5231 static void
5232 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5233 {
5234   machine_mode mode = GET_MODE (dest);
5235   scalar_mode inner = GET_MODE_INNER (mode);
5236 
5237   /* Each operand can be a register or an immediate in the range [-16, 15].  */
5238   if (!aarch64_sve_index_immediate_p (base))
5239     base = force_reg (inner, base);
5240   if (!aarch64_sve_index_immediate_p (step))
5241     step = force_reg (inner, step);
5242 
5243   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
5244 }
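
/* E.g. a base of 0 and a step of 1 in VNx4SImode becomes
   "index z0.s, #0, #1"; operands outside [-16, 15] are first forced into
   scalar registers, giving forms such as "index z0.s, w1, w2"
   (register numbers arbitrary).  */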
5245 
5246 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5247    register of mode MODE.  Use TARGET for the result if it's nonnull
5248    and convenient.
5249 
5250    The two vector modes must have the same element mode.  The behavior
5251    is to duplicate architectural lane N of SRC into architectural lanes
5252    N + I * STEP of the result.  On big-endian targets, architectural
5253    lane 0 of an Advanced SIMD vector is the last element of the vector
5254    in memory layout, so for big-endian targets this operation has the
5255    effect of reversing SRC before duplicating it.  Callers need to
5256    account for this.  */
5257 
5258 rtx
5259 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5260 {
5261   machine_mode src_mode = GET_MODE (src);
5262   gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5263   insn_code icode = (BYTES_BIG_ENDIAN
5264 		     ? code_for_aarch64_vec_duplicate_vq_be (mode)
5265 		     : code_for_aarch64_vec_duplicate_vq_le (mode));
5266 
5267   unsigned int i = 0;
5268   expand_operand ops[3];
5269   create_output_operand (&ops[i++], target, mode);
5270   create_output_operand (&ops[i++], src, src_mode);
5271   if (BYTES_BIG_ENDIAN)
5272     {
5273       /* Create a PARALLEL describing the reversal of SRC.  */
5274       unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5275       rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5276 						  nelts_per_vq - 1, -1);
5277       create_fixed_operand (&ops[i++], sel);
5278     }
5279   expand_insn (icode, i, ops);
5280   return ops[0].value;
5281 }
5282 
5283 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5284    the memory image into DEST.  Return true on success.  */
5285 
5286 static bool
5287 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5288 {
5289   src = force_const_mem (GET_MODE (src), src);
5290   if (!src)
5291     return false;
5292 
5293   /* Make sure that the address is legitimate.  */
5294   if (!aarch64_sve_ld1rq_operand_p (src))
5295     {
5296       rtx addr = force_reg (Pmode, XEXP (src, 0));
5297       src = replace_equiv_address (src, addr);
5298     }
5299 
5300   machine_mode mode = GET_MODE (dest);
5301   machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5302   rtx ptrue = aarch64_ptrue_reg (pred_mode);
5303   emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5304   return true;
5305 }
5306 
5307 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5308    by N "background" values.  Try to move it into TARGET using:
5309 
5310       PTRUE PRED.<T>, VL<N>
5311       MOV TRUE.<T>, #<foreground>
5312       MOV FALSE.<T>, #<background>
5313       SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5314 
5315    The PTRUE is always a single instruction but the MOVs might need a
5316    longer sequence.  If the background value is zero (as it often is),
5317    the sequence can sometimes collapse to a PTRUE followed by a
5318    zero-predicated move.
5319 
5320    Return the target on success, otherwise return null.  */
5321 
5322 static rtx
5323 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5324 {
5325   gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5326 
5327   /* Make sure that the PTRUE is valid.  */
5328   machine_mode mode = GET_MODE (src);
5329   machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5330   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5331   if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5332       == AARCH64_NUM_SVPATTERNS)
5333     return NULL_RTX;
5334 
5335   rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5336   rtx_vector_builder true_builder (mode, npatterns, 1);
5337   rtx_vector_builder false_builder (mode, npatterns, 1);
5338   for (unsigned int i = 0; i < npatterns; ++i)
5339     {
5340       true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5341       pred_builder.quick_push (CONST1_RTX (BImode));
5342     }
5343   for (unsigned int i = 0; i < npatterns; ++i)
5344     {
5345       false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5346       pred_builder.quick_push (CONST0_RTX (BImode));
5347     }
5348   expand_operand ops[4];
5349   create_output_operand (&ops[0], target, mode);
5350   create_input_operand (&ops[1], true_builder.build (), mode);
5351   create_input_operand (&ops[2], false_builder.build (), mode);
5352   create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5353   expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5354   return target;
5355 }
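
/* As an illustration: a VNx4SI constant with a single foreground value of
   9 and a zero background -- { 9, 0, 0, ... } -- can end up as roughly
	ptrue	p0.s, vl1
	mov	z0.s, p0/z, #9
   using the zero-background collapse described above (register numbers
   arbitrary).  */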
5356 
5357 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5358    SVE data mode and isn't a legitimate constant.  Use TARGET for the
5359    result if convenient.
5360 
5361    The returned register can have whatever mode seems most natural
5362    given the contents of SRC.  */
5363 
5364 static rtx
5365 aarch64_expand_sve_const_vector (rtx target, rtx src)
5366 {
5367   machine_mode mode = GET_MODE (src);
5368   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5369   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5370   scalar_mode elt_mode = GET_MODE_INNER (mode);
5371   unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5372   unsigned int container_bits = aarch64_sve_container_bits (mode);
5373   unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5374 
5375   if (nelts_per_pattern == 1
5376       && encoded_bits <= 128
5377       && container_bits != elt_bits)
5378     {
5379       /* We have a partial vector mode and a constant whose full-vector
5380 	 equivalent would occupy a repeating 128-bit sequence.  Build that
5381 	 full-vector equivalent instead, so that we have the option of
5382 	 using LD1RQ and Advanced SIMD operations.  */
5383       unsigned int repeat = container_bits / elt_bits;
5384       machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5385       rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5386       for (unsigned int i = 0; i < npatterns; ++i)
5387 	for (unsigned int j = 0; j < repeat; ++j)
5388 	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5389       target = aarch64_target_reg (target, full_mode);
5390       return aarch64_expand_sve_const_vector (target, builder.build ());
5391     }
5392 
5393   if (nelts_per_pattern == 1 && encoded_bits == 128)
5394     {
5395       /* The constant is a duplicated quadword but can't be narrowed
5396 	 beyond a quadword.  Get the memory image of the first quadword
5397 	 as a 128-bit vector and try using LD1RQ to load it from memory.
5398 
5399 	 The effect for both endiannesses is to load memory lane N into
5400 	 architectural lanes N + I * STEP of the result.  On big-endian
5401 	 targets, the layout of the 128-bit vector in an Advanced SIMD
5402 	 register would be different from its layout in an SVE register,
5403 	 but this 128-bit vector is a memory value only.  */
5404       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5405       rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5406       if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5407 	return target;
5408     }
5409 
5410   if (nelts_per_pattern == 1 && encoded_bits < 128)
5411     {
5412       /* The vector is a repeating sequence of 64 bits or fewer.
5413 	 See if we can load them using an Advanced SIMD move and then
5414 	 duplicate it to fill a vector.  This is better than using a GPR
5415 	 move because it keeps everything in the same register file.  */
5416       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5417       rtx_vector_builder builder (vq_mode, npatterns, 1);
5418       for (unsigned int i = 0; i < npatterns; ++i)
5419 	{
5420 	  /* We want memory lane N to go into architectural lane N,
5421 	     so reverse for big-endian targets.  The DUP .Q pattern
5422 	     has a compensating reverse built-in.  */
5423 	  unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5424 	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5425 	}
5426       rtx vq_src = builder.build ();
5427       if (aarch64_simd_valid_immediate (vq_src, NULL))
5428 	{
5429 	  vq_src = force_reg (vq_mode, vq_src);
5430 	  return aarch64_expand_sve_dupq (target, mode, vq_src);
5431 	}
5432 
5433       /* Get an integer representation of the repeating part of Advanced
5434 	 SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
5435 	 which for big-endian targets is lane-swapped wrt a normal
5436 	 Advanced SIMD vector.  This means that for both endiannesses,
5437 	 memory lane N of SVE vector SRC corresponds to architectural
5438 	 lane N of a register holding VQ_SRC.  This in turn means that
5439 	 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5440 	 as a single 128-bit value) and thus that memory lane 0 of SRC is
5441 	 in the lsb of the integer.  Duplicating the integer therefore
5442 	 ensures that memory lane N of SRC goes into architectural lane
5443 	 N + I * STEP of the SVE register.  */
5444       scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5445       rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5446       if (elt_value)
5447 	{
5448 	  /* Pretend that we had a vector of INT_MODE to start with.  */
5449 	  elt_mode = int_mode;
5450 	  mode = aarch64_full_sve_mode (int_mode).require ();
5451 
5452 	  /* If the integer can be moved into a general register by a
5453 	     single instruction, do that and duplicate the result.  */
5454 	  if (CONST_INT_P (elt_value)
5455 	      && aarch64_move_imm (INTVAL (elt_value), elt_mode))
5456 	    {
5457 	      elt_value = force_reg (elt_mode, elt_value);
5458 	      return expand_vector_broadcast (mode, elt_value);
5459 	    }
5460 	}
5461       else if (npatterns == 1)
5462 	/* We're duplicating a single value, but can't do better than
5463 	   force it to memory and load from there.  This handles things
5464 	   like symbolic constants.  */
5465 	elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5466 
5467       if (elt_value)
5468 	{
5469 	  /* Load the element from memory if we can, otherwise move it into
5470 	     a register and use a DUP.  */
5471 	  rtx op = force_const_mem (elt_mode, elt_value);
5472 	  if (!op)
5473 	    op = force_reg (elt_mode, elt_value);
5474 	  return expand_vector_broadcast (mode, op);
5475 	}
5476     }
5477 
5478   /* Try using INDEX.  */
5479   rtx base, step;
5480   if (const_vec_series_p (src, &base, &step))
5481     {
5482       aarch64_expand_vec_series (target, base, step);
5483       return target;
5484     }
5485 
5486   /* From here on, it's better to force the whole constant to memory
5487      if we can.  */
5488   if (GET_MODE_NUNITS (mode).is_constant ())
5489     return NULL_RTX;
5490 
5491   if (nelts_per_pattern == 2)
5492     if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5493       return res;
5494 
5495   /* Expand each pattern individually.  */
5496   gcc_assert (npatterns > 1);
5497   rtx_vector_builder builder;
5498   auto_vec<rtx, 16> vectors (npatterns);
5499   for (unsigned int i = 0; i < npatterns; ++i)
5500     {
5501       builder.new_vector (mode, 1, nelts_per_pattern);
5502       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5503 	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5504       vectors.quick_push (force_reg (mode, builder.build ()));
5505     }
5506 
5507   /* Use permutes to interleave the separate vectors.  */
5508   while (npatterns > 1)
5509     {
5510       npatterns /= 2;
5511       for (unsigned int i = 0; i < npatterns; ++i)
5512 	{
5513 	  rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5514 	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5515 	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5516 	  vectors[i] = tmp;
5517 	}
5518     }
5519   gcc_assert (vectors[0] == target);
5520   return target;
5521 }
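
/* For example, with four patterns P0..P3 (element order P0[0], P1[0],
   P2[0], P3[0], P0[1], ...), the loop above first forms ZIP1 (P0, P2)
   and ZIP1 (P1, P3), then interleaves those two results with a final
   ZIP1 into TARGET, recreating the original element order.  */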
5522 
5523 /* Use WHILE to set a predicate register of mode MODE in which the first
5524    VL bits are set and the rest are clear.  Use TARGET for the register
5525    if it's nonnull and convenient.  */
5526 
5527 static rtx
5528 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5529 				 unsigned int vl)
5530 {
5531   rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5532   target = aarch64_target_reg (target, mode);
5533   emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5534 			target, const0_rtx, limit));
5535   return target;
5536 }
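
/* E.g. for MODE == VNx4BImode and VL == 3 this emits something like
   "mov x0, #3" followed by "whilelo p0.s, xzr, x0" (register numbers
   arbitrary), leaving the first three .S predicate elements set.  */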
5537 
5538 static rtx
5539 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5540 
5541 /* BUILDER is a constant predicate in which the index of every set bit
5542    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
5543    by inverting every element at a multiple of ELT_SIZE and EORing the
5544    result with an ELT_SIZE PTRUE.
5545 
5546    Return a register that contains the constant on success, otherwise
5547    return null.  Use TARGET as the register if it is nonnull and
5548    convenient.  */
5549 
5550 static rtx
5551 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5552 				   unsigned int elt_size)
5553 {
5554   /* Invert every element at a multiple of ELT_SIZE, keeping the
5555      other bits zero.  */
5556   rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5557 				  builder.nelts_per_pattern ());
5558   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5559     if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5560       inv_builder.quick_push (const1_rtx);
5561     else
5562       inv_builder.quick_push (const0_rtx);
5563   inv_builder.finalize ();
5564 
5565   /* See if we can load the constant cheaply.  */
5566   rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5567   if (!inv)
5568     return NULL_RTX;
5569 
5570   /* EOR the result with an ELT_SIZE PTRUE.  */
5571   rtx mask = aarch64_ptrue_all (elt_size);
5572   mask = force_reg (VNx16BImode, mask);
5573   inv = gen_lowpart (VNx16BImode, inv);
5574   target = aarch64_target_reg (target, VNx16BImode);
5575   emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5576   return target;
5577 }
5578 
5579 /* BUILDER is a constant predicate in which the index of every set bit
5580    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
5581    using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
5582    register on success, otherwise return null.  Use TARGET as the register
5583    if nonnull and convenient.  */
5584 
5585 static rtx
5586 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5587 				   unsigned int elt_size,
5588 				   unsigned int permute_size)
5589 {
5590   /* We're going to split the constant into two new constants A and B,
5591      with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5592      and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5593 
5594      A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5595      B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5596 
5597      where _ indicates elements that will be discarded by the permute.
5598 
5599      First calculate the ELT_SIZEs for A and B.  */
5600   unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5601   unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5602   for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5603     if (INTVAL (builder.elt (i)) != 0)
5604       {
5605 	if (i & permute_size)
5606 	  b_elt_size |= i - permute_size;
5607 	else
5608 	  a_elt_size |= i;
5609       }
5610   a_elt_size &= -a_elt_size;
5611   b_elt_size &= -b_elt_size;
5612 
5613   /* Now construct the vectors themselves.  */
5614   rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5615 				builder.nelts_per_pattern ());
5616   rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5617 				builder.nelts_per_pattern ());
5618   unsigned int nelts = builder.encoded_nelts ();
5619   for (unsigned int i = 0; i < nelts; ++i)
5620     if (i & (elt_size - 1))
5621       {
5622 	a_builder.quick_push (const0_rtx);
5623 	b_builder.quick_push (const0_rtx);
5624       }
5625     else if ((i & permute_size) == 0)
5626       {
5627 	/* The A and B elements are significant.  */
5628 	a_builder.quick_push (builder.elt (i));
5629 	b_builder.quick_push (builder.elt (i + permute_size));
5630       }
5631     else
5632       {
5633 	/* The A and B elements are going to be discarded, so pick whatever
5634 	   is likely to give a nice constant.  We are targeting element
5635 	   sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5636 	   with the aim of each being a sequence of ones followed by
5637 	   a sequence of zeros.  So:
5638 
5639 	   * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5640 	     duplicate the last X_ELT_SIZE element, to extend the
5641 	     current sequence of ones or zeros.
5642 
5643 	   * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5644 	     zero, so that the constant really does have X_ELT_SIZE and
5645 	     not a smaller size.  */
5646 	if (a_elt_size > permute_size)
5647 	  a_builder.quick_push (const0_rtx);
5648 	else
5649 	  a_builder.quick_push (a_builder.elt (i - a_elt_size));
5650 	if (b_elt_size > permute_size)
5651 	  b_builder.quick_push (const0_rtx);
5652 	else
5653 	  b_builder.quick_push (b_builder.elt (i - b_elt_size));
5654       }
5655   a_builder.finalize ();
5656   b_builder.finalize ();
5657 
5658   /* Try loading A into a register.  */
5659   rtx_insn *last = get_last_insn ();
5660   rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5661   if (!a)
5662     return NULL_RTX;
5663 
5664   /* Try loading B into a register.  */
5665   rtx b = a;
5666   if (a_builder != b_builder)
5667     {
5668       b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5669       if (!b)
5670 	{
5671 	  delete_insns_since (last);
5672 	  return NULL_RTX;
5673 	}
5674     }
5675 
5676   /* Emit the TRN1 itself.  We emit a TRN that operates on VNx16BI
5677      operands but permutes them as though they had mode MODE.  */
5678   machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5679   target = aarch64_target_reg (target, GET_MODE (a));
5680   rtx type_reg = CONST0_RTX (mode);
5681   emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5682   return target;
5683 }
5684 
5685 /* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
5686    constant in BUILDER into an SVE predicate register.  Return the register
5687    on success, otherwise return null.  Use TARGET for the register if
5688    nonnull and convenient.
5689 
5690    ALLOW_RECURSE_P is true if we can use methods that would call this
5691    function recursively.  */
5692 
5693 static rtx
5694 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5695 				 bool allow_recurse_p)
5696 {
5697   if (builder.encoded_nelts () == 1)
5698     /* A PFALSE or a PTRUE .B ALL.  */
5699     return aarch64_emit_set_immediate (target, builder);
5700 
5701   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5702   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5703     {
5704       /* If we can load the constant using PTRUE, use it as-is.  */
5705       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5706       if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5707 	return aarch64_emit_set_immediate (target, builder);
5708 
5709       /* Otherwise use WHILE to set the first VL bits.  */
5710       return aarch64_sve_move_pred_via_while (target, mode, vl);
5711     }
5712 
5713   if (!allow_recurse_p)
5714     return NULL_RTX;
5715 
5716   /* Try inverting the vector in element size ELT_SIZE and then EORing
5717      the result with an ELT_SIZE PTRUE.  */
5718   if (INTVAL (builder.elt (0)) == 0)
5719     if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5720 						     elt_size))
5721       return res;
5722 
5723   /* Try using TRN1 to permute two simpler constants.  */
5724   for (unsigned int i = elt_size; i <= 8; i *= 2)
5725     if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5726 						     elt_size, i))
5727       return res;
5728 
5729   return NULL_RTX;
5730 }
5731 
5732 /* Return an SVE predicate register that contains the VNx16BImode
5733    constant in BUILDER, without going through the move expanders.
5734 
5735    The returned register can have whatever mode seems most natural
5736    given the contents of BUILDER.  Use TARGET for the result if
5737    convenient.  */
5738 
5739 static rtx
5740 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5741 {
5742   /* Try loading the constant using pure predicate operations.  */
5743   if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5744     return res;
5745 
5746   /* Try forcing the constant to memory.  */
5747   if (builder.full_nelts ().is_constant ())
5748     if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5749       {
5750 	target = aarch64_target_reg (target, VNx16BImode);
5751 	emit_move_insn (target, mem);
5752 	return target;
5753       }
5754 
5755   /* The last resort is to load the constant as an integer and then
5756      compare it against zero.  Use -1 for set bits in order to increase
5757      the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
5758   rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5759 				  builder.nelts_per_pattern ());
5760   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5761     int_builder.quick_push (INTVAL (builder.elt (i))
5762 			    ? constm1_rtx : const0_rtx);
5763   return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5764 					   int_builder.build ());
5765 }
5766 
5767 /* Set DEST to immediate IMM.  */
5768 
5769 void
5770 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5771 {
5772   machine_mode mode = GET_MODE (dest);
5773 
5774   /* Check on what type of symbol it is.  */
5775   scalar_int_mode int_mode;
5776   if ((SYMBOL_REF_P (imm)
5777        || LABEL_REF_P (imm)
5778        || GET_CODE (imm) == CONST
5779        || GET_CODE (imm) == CONST_POLY_INT)
5780       && is_a <scalar_int_mode> (mode, &int_mode))
5781     {
5782       rtx mem;
5783       poly_int64 offset;
5784       HOST_WIDE_INT const_offset;
5785       enum aarch64_symbol_type sty;
5786 
5787       /* If we have (const (plus symbol offset)), separate out the offset
5788 	 before we start classifying the symbol.  */
5789       rtx base = strip_offset (imm, &offset);
5790 
5791       /* We must always add an offset involving VL separately, rather than
5792 	 folding it into the relocation.  */
5793       if (!offset.is_constant (&const_offset))
5794 	{
5795 	  if (!TARGET_SVE)
5796 	    {
5797 	      aarch64_report_sve_required ();
5798 	      return;
5799 	    }
5800 	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5801 	    emit_insn (gen_rtx_SET (dest, imm));
5802 	  else
5803 	    {
5804 	      /* Do arithmetic on 32-bit values if the result is smaller
5805 		 than that.  */
5806 	      if (partial_subreg_p (int_mode, SImode))
5807 		{
5808 		  /* It is invalid to do symbol calculations in modes
5809 		     narrower than SImode.  */
5810 		  gcc_assert (base == const0_rtx);
5811 		  dest = gen_lowpart (SImode, dest);
5812 		  int_mode = SImode;
5813 		}
5814 	      if (base != const0_rtx)
5815 		{
5816 		  base = aarch64_force_temporary (int_mode, dest, base);
5817 		  aarch64_add_offset (int_mode, dest, base, offset,
5818 				      NULL_RTX, NULL_RTX, false);
5819 		}
5820 	      else
5821 		aarch64_add_offset (int_mode, dest, base, offset,
5822 				    dest, NULL_RTX, false);
5823 	    }
5824 	  return;
5825 	}
5826 
5827       sty = aarch64_classify_symbol (base, const_offset);
5828       switch (sty)
5829 	{
5830 	case SYMBOL_FORCE_TO_MEM:
5831 	  if (int_mode != ptr_mode)
5832 	    imm = convert_memory_address (ptr_mode, imm);
5833 
5834 	  if (const_offset != 0
5835 	      && targetm.cannot_force_const_mem (ptr_mode, imm))
5836 	    {
5837 	      gcc_assert (can_create_pseudo_p ());
5838 	      base = aarch64_force_temporary (int_mode, dest, base);
5839 	      aarch64_add_offset (int_mode, dest, base, const_offset,
5840 				  NULL_RTX, NULL_RTX, false);
5841 	      return;
5842 	    }
5843 
5844 	  mem = force_const_mem (ptr_mode, imm);
5845 	  gcc_assert (mem);
5846 
5847 	  /* If we aren't generating PC relative literals, then
5848 	     we need to expand the literal pool access carefully.
5849 	     This is something that needs to be done in a number
5850 	     of places, so could well live as a separate function.  */
5851 	  if (!aarch64_pcrelative_literal_loads)
5852 	    {
5853 	      gcc_assert (can_create_pseudo_p ());
5854 	      base = gen_reg_rtx (ptr_mode);
5855 	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5856 	      if (ptr_mode != Pmode)
5857 		base = convert_memory_address (Pmode, base);
5858 	      mem = gen_rtx_MEM (ptr_mode, base);
5859 	    }
5860 
5861 	  if (int_mode != ptr_mode)
5862 	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5863 
5864 	  emit_insn (gen_rtx_SET (dest, mem));
5865 
5866 	  return;
5867 
5868         case SYMBOL_SMALL_TLSGD:
5869         case SYMBOL_SMALL_TLSDESC:
5870 	case SYMBOL_SMALL_TLSIE:
5871 	case SYMBOL_SMALL_GOT_28K:
5872 	case SYMBOL_SMALL_GOT_4G:
5873 	case SYMBOL_TINY_GOT:
5874 	case SYMBOL_TINY_TLSIE:
5875 	  if (const_offset != 0)
5876 	    {
5877 	      gcc_assert (can_create_pseudo_p ());
5878 	      base = aarch64_force_temporary (int_mode, dest, base);
5879 	      aarch64_add_offset (int_mode, dest, base, const_offset,
5880 				  NULL_RTX, NULL_RTX, false);
5881 	      return;
5882 	    }
5883 	  /* FALLTHRU */
5884 
5885 	case SYMBOL_SMALL_ABSOLUTE:
5886 	case SYMBOL_TINY_ABSOLUTE:
5887 	case SYMBOL_TLSLE12:
5888 	case SYMBOL_TLSLE24:
5889 	case SYMBOL_TLSLE32:
5890 	case SYMBOL_TLSLE48:
5891 	  aarch64_load_symref_appropriately (dest, imm, sty);
5892 	  return;
5893 
5894 	default:
5895 	  gcc_unreachable ();
5896 	}
5897     }
5898 
5899   if (!CONST_INT_P (imm))
5900     {
5901       if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5902 	{
5903 	  /* Only the low bit of each .H, .S and .D element is defined,
5904 	     so we can set the upper bits to whatever we like.  If the
5905 	     predicate is all-true in MODE, prefer to set all the undefined
5906 	     bits as well, so that we can share a single .B predicate for
5907 	     all modes.  */
5908 	  if (imm == CONSTM1_RTX (mode))
5909 	    imm = CONSTM1_RTX (VNx16BImode);
5910 
5911 	  /* All methods for constructing predicate modes wider than VNx16BI
5912 	     will set the upper bits of each element to zero.  Expose this
5913 	     by moving such constants as a VNx16BI, so that all bits are
5914 	     significant and so that constants for different modes can be
5915 	     shared.  The wider constant will still be available as a
5916 	     REG_EQUAL note.  */
5917 	  rtx_vector_builder builder;
5918 	  if (aarch64_get_sve_pred_bits (builder, imm))
5919 	    {
5920 	      rtx res = aarch64_expand_sve_const_pred (dest, builder);
5921 	      if (dest != res)
5922 		emit_move_insn (dest, gen_lowpart (mode, res));
5923 	      return;
5924 	    }
5925 	}
5926 
5927       if (GET_CODE (imm) == HIGH
5928 	  || aarch64_simd_valid_immediate (imm, NULL))
5929 	{
5930 	  emit_insn (gen_rtx_SET (dest, imm));
5931 	  return;
5932 	}
5933 
5934       if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5935 	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5936 	  {
5937 	    if (dest != res)
5938 	      emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5939 	    return;
5940 	  }
5941 
5942       rtx mem = force_const_mem (mode, imm);
5943       gcc_assert (mem);
5944       emit_move_insn (dest, mem);
5945       return;
5946     }
5947 
5948   aarch64_internal_mov_immediate (dest, imm, true,
5949 				  as_a <scalar_int_mode> (mode));
5950 }
5951 
5952 /* Return the MEM rtx that provides the canary value that should be used
5953    for stack-smashing protection.  MODE is the mode of the memory.
5954    For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
5955    (__stack_chk_guard), otherwise it has no useful value.  SALT_TYPE
5956    indicates whether the caller is performing a SET or a TEST operation.  */
5957 
5958 rtx
5959 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
5960 				  aarch64_salt_type salt_type)
5961 {
5962   rtx addr;
5963   if (aarch64_stack_protector_guard == SSP_GLOBAL)
5964     {
5965       gcc_assert (MEM_P (decl_rtl));
5966       addr = XEXP (decl_rtl, 0);
5967       poly_int64 offset;
5968       rtx base = strip_offset_and_salt (addr, &offset);
5969       if (!SYMBOL_REF_P (base))
5970 	return decl_rtl;
5971 
5972       rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
5973       addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
5974       addr = gen_rtx_CONST (Pmode, addr);
5975       addr = plus_constant (Pmode, addr, offset);
5976     }
5977   else
5978     {
5979       /* Calculate the address from the system register.  */
5980       rtx salt = GEN_INT (salt_type);
5981       addr = gen_reg_rtx (mode);
5982       if (mode == DImode)
5983 	emit_insn (gen_reg_stack_protect_address_di (addr, salt));
5984       else
5985 	{
5986 	  emit_insn (gen_reg_stack_protect_address_si (addr, salt));
5987 	  addr = convert_memory_address (Pmode, addr);
5988 	}
5989       addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
5990     }
5991   return gen_rtx_MEM (mode, force_reg (Pmode, addr));
5992 }
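
/* The SSP_SYSREG case corresponds to options such as
   -mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0
   -mstack-protector-guard-offset=16, where the canary lives at
   <sp_el0> + 16 and its address is recomputed from the system register
   on each use.  */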
5993 
5994 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
5995    that is known to contain PTRUE.  */
5996 
5997 void
5998 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5999 {
6000   expand_operand ops[3];
6001   machine_mode mode = GET_MODE (dest);
6002   create_output_operand (&ops[0], dest, mode);
6003   create_input_operand (&ops[1], pred, GET_MODE (pred));
6004   create_input_operand (&ops[2], src, mode);
6005   temporary_volatile_ok v (true);
6006   expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6007 }
6008 
6009 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6010    operand is in memory.  In this case we need to use the predicated LD1
6011    and ST1 instead of LDR and STR, both for correctness on big-endian
6012    targets and because LD1 and ST1 support a wider range of addressing modes.
6013    PRED_MODE is the mode of the predicate.
6014 
6015    See the comment at the head of aarch64-sve.md for details about the
6016    big-endian handling.  */
6017 
6018 void
6019 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6020 {
6021   machine_mode mode = GET_MODE (dest);
6022   rtx ptrue = aarch64_ptrue_reg (pred_mode);
6023   if (!register_operand (src, mode)
6024       && !register_operand (dest, mode))
6025     {
6026       rtx tmp = gen_reg_rtx (mode);
6027       if (MEM_P (src))
6028 	aarch64_emit_sve_pred_move (tmp, ptrue, src);
6029       else
6030 	emit_move_insn (tmp, src);
6031       src = tmp;
6032     }
6033   aarch64_emit_sve_pred_move (dest, ptrue, src);
6034 }
6035 
6036 /* Called only on big-endian targets.  See whether an SVE vector move
6037    from SRC to DEST is effectively a REV[BHW] instruction, because at
6038    least one operand is a subreg of an SVE vector that has wider or
6039    narrower elements.  Return true and emit the instruction if so.
6040 
6041    For example:
6042 
6043      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6044 
6045    represents a VIEW_CONVERT between the following vectors, viewed
6046    in memory order:
6047 
6048      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
6049      R1: { [0],      [1],      [2],      [3],     ... }
6050 
6051    The high part of lane X in R2 should therefore correspond to lane X*2
6052    of R1, but the register representations are:
6053 
6054          msb                                      lsb
6055      R2: ...... [1].high  [1].low   [0].high  [0].low
6056      R1: ...... [3]       [2]       [1]       [0]
6057 
6058    where the low part of lane X in R2 corresponds to lane X*2 in R1.
6059    We therefore need a reverse operation to swap the high and low values
6060    around.
6061 
6062    This is purely an optimization.  Without it we would spill the
6063    subreg operand to the stack in one mode and reload it in the
6064    other mode, which has the same effect as the REV.  */
6065 
6066 bool
6067 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6068 {
6069   gcc_assert (BYTES_BIG_ENDIAN);
6070 
6071   /* Do not try to optimize subregs that LRA has created for matched
6072      reloads.  These subregs only exist as a temporary measure to make
6073      the RTL well-formed, but they are exempt from the usual
6074      TARGET_CAN_CHANGE_MODE_CLASS rules.
6075 
6076      For example, if we have:
6077 
6078        (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6079 
6080      and the constraints require R1 and R2 to be in the same register,
6081      LRA may need to create RTL such as:
6082 
6083        (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6084        (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6085        (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6086 
6087      which forces both the input and output of the original instruction
6088      to use the same hard register.  But for this to work, the normal
6089      rules have to be suppressed on the subreg input, otherwise LRA
6090      would need to reload that input too, meaning that the process
6091      would never terminate.  To compensate for this, the normal rules
6092      are also suppressed for the subreg output of the first move.
6093      Ignoring the special case and handling the first move normally
6094      would therefore generate wrong code: we would reverse the elements
6095      for the first subreg but not reverse them back for the second subreg.  */
6096   if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6097     dest = SUBREG_REG (dest);
6098   if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6099     src = SUBREG_REG (src);
6100 
6101   /* The optimization handles two single SVE REGs with different element
6102      sizes.  */
6103   if (!REG_P (dest)
6104       || !REG_P (src)
6105       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6106       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6107       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6108 	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6109     return false;
6110 
6111   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
6112   rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6113   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6114 			       UNSPEC_REV_SUBREG);
6115   emit_insn (gen_rtx_SET (dest, unspec));
6116   return true;
6117 }
6118 
6119 /* Return a copy of X with mode MODE, without changing its other
6120    attributes.  Unlike gen_lowpart, this doesn't care whether the
6121    mode change is valid.  */
6122 
6123 rtx
6124 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6125 {
6126   if (GET_MODE (x) == mode)
6127     return x;
6128 
6129   x = shallow_copy_rtx (x);
6130   set_mode_and_regno (x, mode, REGNO (x));
6131   return x;
6132 }
6133 
6134 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6135    stored in wider integer containers.  */
6136 
6137 static unsigned int
6138 aarch64_sve_rev_unspec (machine_mode mode)
6139 {
6140   switch (GET_MODE_UNIT_SIZE (mode))
6141     {
6142     case 1: return UNSPEC_REVB;
6143     case 2: return UNSPEC_REVH;
6144     case 4: return UNSPEC_REVW;
6145     }
6146   gcc_unreachable ();
6147 }
6148 
6149 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6150    operands.  */
6151 
6152 void
6153 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6154 {
6155   /* Decide which REV operation we need.  The mode with wider elements
6156      determines the mode of the operands and the mode with the narrower
6157      elements determines the reverse width.  */
6158   machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6159   machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6160   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6161       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6162     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6163 
6164   unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6165   machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6166 
6167   /* Get the operands in the appropriate modes and emit the instruction.  */
6168   ptrue = gen_lowpart (pred_mode, ptrue);
6169   dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6170   src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6171   emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6172 			       dest, ptrue, src));
6173 }
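
/* Purely illustrative sketch (not used by the implementation): for a
   big-endian bitcast between an SVE vector of 32-bit elements and one of
   64-bit elements, the split above conceptually emits

	ptrue	p0.b
	revw	z0.d, p0/m, z1.d

   with p0, z0 and z1 standing in for whatever registers are chosen.  The
   mode with the wider elements supplies the operand size (.d) and the mode
   with the narrower elements selects the REVW form.  */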
6174 
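/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */
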
6175 static bool
6176 aarch64_function_ok_for_sibcall (tree, tree exp)
6177 {
6178   if (crtl->abi->id () != expr_callee_abi (exp).id ())
6179     return false;
6180 
6181   return true;
6182 }
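
/* For example (illustrative only): a caller that uses the base PCS cannot
   sibcall a callee declared with __attribute__((aarch64_vector_pcs)),
   because the two functions preserve different sets of registers and so
   have different ABI identifiers.  */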
6183 
6184 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6185    passed in SVE registers.  */
6186 
6187 static bool
6188 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6189 			     const function_arg_info &arg)
6190 {
6191   HOST_WIDE_INT size;
6192   machine_mode dummymode;
6193   int nregs;
6194 
6195   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
6196   if (arg.mode == BLKmode && arg.type)
6197     size = int_size_in_bytes (arg.type);
6198   else
6199     /* No frontends can create types with variable-sized modes, so we
6200        shouldn't be asked to pass or return them.  */
6201     size = GET_MODE_SIZE (arg.mode).to_constant ();
6202 
6203   /* Aggregates are passed by reference based on their size.  */
6204   if (arg.aggregate_type_p ())
6205     size = int_size_in_bytes (arg.type);
6206 
6207   /* Variable sized arguments are always passed by reference.  */
6208   if (size < 0)
6209     return true;
6210 
6211   /* Can this be a candidate to be passed in fp/simd register(s)?  */
6212   if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6213 					       &dummymode, &nregs, NULL,
6214 					       !pcum || pcum->silent_p))
6215     return false;
6216 
6217   /* Arguments which are variable sized or larger than 2 registers are
6218      passed by reference unless they are a homogeneous floating point
6219      aggregate.  */
6220   return size > 2 * UNITS_PER_WORD;
6221 }
6222 
6223 /* Implement TARGET_PASS_BY_REFERENCE.  */
6224 
6225 static bool
6226 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6227 			   const function_arg_info &arg)
6228 {
6229   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6230 
6231   if (!arg.type)
6232     return aarch64_pass_by_reference_1 (pcum, arg);
6233 
6234   pure_scalable_type_info pst_info;
6235   switch (pst_info.analyze (arg.type))
6236     {
6237     case pure_scalable_type_info::IS_PST:
6238       if (pcum && !pcum->silent_p && !TARGET_SVE)
6239 	/* We can't gracefully recover at this point, so make this a
6240 	   fatal error.  */
6241 	fatal_error (input_location, "arguments of type %qT require"
6242 		     " the SVE ISA extension", arg.type);
6243 
6244       /* Variadic SVE types are passed by reference.  Normal non-variadic
6245 	 arguments are too if we've run out of registers.  */
6246       return (!arg.named
6247 	      || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6248 	      || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6249 
6250     case pure_scalable_type_info::DOESNT_MATTER:
6251       gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6252       return true;
6253 
6254     case pure_scalable_type_info::NO_ABI_IDENTITY:
6255     case pure_scalable_type_info::ISNT_PST:
6256       return aarch64_pass_by_reference_1 (pcum, arg);
6257     }
6258   gcc_unreachable ();
6259 }
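
/* Illustrative examples of the rules above (hypothetical types, standard
   AAPCS64 behaviour):

     struct two_longs   { long x, y; };         // 16 bytes: passed in GPRs
     struct three_longs { long x, y, z; };      // 24 bytes: passed by reference
     struct hfa4        { double a, b, c, d; }; // HFA: passed by value in V regs

   three_longs is larger than two registers and is not an HFA/HVA, so
   aarch64_pass_by_reference_1 returns true for it; the other two are
   passed by value.  */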
6260 
6261 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
6262 static bool
6263 aarch64_return_in_msb (const_tree valtype)
6264 {
6265   machine_mode dummy_mode;
6266   int dummy_int;
6267 
6268   /* Never happens in little-endian mode.  */
6269   if (!BYTES_BIG_ENDIAN)
6270     return false;
6271 
6272   /* Only composite types smaller than or equal to 16 bytes can
6273      be potentially returned in registers.  */
6274   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6275       || int_size_in_bytes (valtype) <= 0
6276       || int_size_in_bytes (valtype) > 16)
6277     return false;
6278 
6279   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6280      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6281      is always passed/returned in the least significant bits of fp/simd
6282      register(s).  */
6283   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6284 					       &dummy_mode, &dummy_int, NULL,
6285 					       false))
6286     return false;
6287 
6288   /* Likewise pure scalable types for SVE vector and predicate registers.  */
6289   pure_scalable_type_info pst_info;
6290   if (pst_info.analyze_registers (valtype))
6291     return false;
6292 
6293   return true;
6294 }
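
/* As an illustration (big-endian only, hypothetical types):

     struct s3 { char c[3]; };

   is a composite of 3 bytes that is not an HFA/HVA or a pure scalable
   type, so it is returned in the most significant bits of X0 and this
   function returns true for it.  An HFA such as

     struct hfa2 { float a, b; };

   stays in the least significant bits of S0/S1, so the function returns
   false.  */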
6295 
6296 /* Implement TARGET_FUNCTION_VALUE.
6297    Define how to find the value returned by a function.  */
6298 
6299 static rtx
6300 aarch64_function_value (const_tree type, const_tree func,
6301 			bool outgoing ATTRIBUTE_UNUSED)
6302 {
6303   machine_mode mode;
6304   int unsignedp;
6305 
6306   mode = TYPE_MODE (type);
6307   if (INTEGRAL_TYPE_P (type))
6308     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6309 
6310   pure_scalable_type_info pst_info;
6311   if (type && pst_info.analyze_registers (type))
6312     return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6313 
6314   /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6315      are returned in memory, not by value.  */
6316   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6317   bool sve_p = (vec_flags & VEC_ANY_SVE);
6318 
6319   if (aarch64_return_in_msb (type))
6320     {
6321       HOST_WIDE_INT size = int_size_in_bytes (type);
6322 
6323       if (size % UNITS_PER_WORD != 0)
6324 	{
6325 	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6326 	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6327 	}
6328     }
6329 
6330   int count;
6331   machine_mode ag_mode;
6332   if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6333 					       NULL, false))
6334     {
6335       gcc_assert (!sve_p);
6336       if (!aarch64_composite_type_p (type, mode))
6337 	{
6338 	  gcc_assert (count == 1 && mode == ag_mode);
6339 	  return gen_rtx_REG (mode, V0_REGNUM);
6340 	}
6341       else
6342 	{
6343 	  int i;
6344 	  rtx par;
6345 
6346 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6347 	  for (i = 0; i < count; i++)
6348 	    {
6349 	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6350 	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6351 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6352 	      XVECEXP (par, 0, i) = tmp;
6353 	    }
6354 	  return par;
6355 	}
6356     }
6357   else
6358     {
6359       if (sve_p)
6360 	{
6361 	  /* Vector types can acquire a partial SVE mode using things like
6362 	     __attribute__((vector_size(N))), and this is potentially useful.
6363 	     However, the choice of mode doesn't affect the type's ABI
6364 	     identity, so we should treat the types as though they had
6365 	     the associated integer mode, just like they did before SVE
6366 	     was introduced.
6367 
6368 	     We know that the vector must be 128 bits or smaller,
6369 	     otherwise we'd have returned it in memory instead.  */
6370 	  gcc_assert (type
6371 		      && (aarch64_some_values_include_pst_objects_p (type)
6372 			  || (vec_flags & VEC_PARTIAL)));
6373 
6374 	  scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6375 	  rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6376 	  rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6377 	  return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6378 	}
6379       return gen_rtx_REG (mode, R0_REGNUM);
6380     }
6381 }
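
/* Illustrative examples of the mapping above (hypothetical prototypes):

     int f (void);                  // value returned in W0
     __int128 g (void);             // value returned in X0/X1
     struct hfa { double a, b, c; } h (void);
                                    // HFA: (parallel [D0 D1 D2])

   For the HFA case, the PARALLEL built above contains one EXPR_LIST per
   member, each at an offset that is a multiple of the element size.  */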
6382 
6383 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6384    Return true if REGNO is the number of a hard register in which the values
6385    of called function may come back.  */
6386 
6387 static bool
6388 aarch64_function_value_regno_p (const unsigned int regno)
6389 {
6390   /* Maximum of 16 bytes can be returned in the general registers.  Examples
6391      of 16-byte return values are: 128-bit integers and 16-byte small
6392      structures (excluding homogeneous floating-point aggregates).  */
6393   if (regno == R0_REGNUM || regno == R1_REGNUM)
6394     return true;
6395 
6396   /* Up to four fp/simd registers can return a function value, e.g. a
6397      homogeneous floating-point aggregate having four members.  */
6398   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6399     return TARGET_FLOAT;
6400 
6401   return false;
6402 }
6403 
6404 /* Subroutine for aarch64_return_in_memory for types that are not returned
6405    in SVE registers.  */
6406 
6407 static bool
6408 aarch64_return_in_memory_1 (const_tree type)
6409 {
6410   HOST_WIDE_INT size;
6411   machine_mode ag_mode;
6412   int count;
6413 
6414   if (!AGGREGATE_TYPE_P (type)
6415       && TREE_CODE (type) != COMPLEX_TYPE
6416       && TREE_CODE (type) != VECTOR_TYPE)
6417     /* Simple scalar types are always returned in registers.  */
6418     return false;
6419 
6420   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6421 					       &ag_mode, &count, NULL, false))
6422     return false;
6423 
6424   /* Types larger than 2 registers are returned in memory.  */
6425   size = int_size_in_bytes (type);
6426   return (size < 0 || size > 2 * UNITS_PER_WORD);
6427 }
6428 
6429 /* Implement TARGET_RETURN_IN_MEMORY.
6430 
6431    If the type T of the result of a function is such that
6432      void func (T arg)
6433    would require that arg be passed as a value in a register (or set of
6434    registers) according to the parameter passing rules, then the result
6435    is returned in the same registers as would be used for such an
6436    argument.  */
6437 
6438 static bool
6439 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6440 {
6441   pure_scalable_type_info pst_info;
6442   switch (pst_info.analyze (type))
6443     {
6444     case pure_scalable_type_info::IS_PST:
6445       return (pst_info.num_zr () > NUM_FP_ARG_REGS
6446 	      || pst_info.num_pr () > NUM_PR_ARG_REGS);
6447 
6448     case pure_scalable_type_info::DOESNT_MATTER:
6449       gcc_assert (aarch64_return_in_memory_1 (type));
6450       return true;
6451 
6452     case pure_scalable_type_info::NO_ABI_IDENTITY:
6453     case pure_scalable_type_info::ISNT_PST:
6454       return aarch64_return_in_memory_1 (type);
6455     }
6456   gcc_unreachable ();
6457 }
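
/* For example (hypothetical types, base PCS): a 16-byte

     struct pair { long x, y; };

   is returned in X0/X1, whereas a 24-byte

     struct triple { long x, y, z; };

   exceeds two registers and is returned in memory through the address
   passed in X8.  */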
6458 
6459 static bool
6460 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6461 			       const_tree type, int *nregs)
6462 {
6463   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6464   return aarch64_vfp_is_call_or_return_candidate (mode, type,
6465 						  &pcum->aapcs_vfp_rmode,
6466 						  nregs, NULL, pcum->silent_p);
6467 }
6468 
6469 /* Given MODE and TYPE of a function argument, return the alignment in
6470    bits.  The idea is to suppress any stronger alignment requested by
6471    the user and opt for the natural alignment (specified in AAPCS64 \S
6472    4.1).  ABI_BREAK is set to the alignment that was used in GCC versions
6473    prior to GCC 9.1 whenever that value differs from the alignment returned
6474    here, and to zero otherwise.  This is a helper function for local use only.  */
6475 
6476 static unsigned int
6477 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6478 				unsigned int *abi_break)
6479 {
6480   *abi_break = 0;
6481   if (!type)
6482     return GET_MODE_ALIGNMENT (mode);
6483 
6484   if (integer_zerop (TYPE_SIZE (type)))
6485     return 0;
6486 
6487   gcc_assert (TYPE_MODE (type) == mode);
6488 
6489   if (!AGGREGATE_TYPE_P (type))
6490     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
6491 
6492   if (TREE_CODE (type) == ARRAY_TYPE)
6493     return TYPE_ALIGN (TREE_TYPE (type));
6494 
6495   unsigned int alignment = 0;
6496   unsigned int bitfield_alignment = 0;
6497   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6498     if (TREE_CODE (field) == FIELD_DECL)
6499       {
6500 	/* Note that we explicitly consider zero-sized fields here,
6501 	   even though they don't map to AAPCS64 machine types.
6502 	   For example, in:
6503 
6504 	       struct __attribute__((aligned(8))) empty {};
6505 
6506 	       struct s {
6507 		 [[no_unique_address]] empty e;
6508 		 int x;
6509 	       };
6510 
6511 	   "s" contains only one Fundamental Data Type (the int field)
6512 	   but gains 8-byte alignment and size thanks to "e".  */
6513 	alignment = std::max (alignment, DECL_ALIGN (field));
6514 	if (DECL_BIT_FIELD_TYPE (field))
6515 	  bitfield_alignment
6516 	    = std::max (bitfield_alignment,
6517 			TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6518       }
6519 
6520   if (bitfield_alignment > alignment)
6521     {
6522       *abi_break = alignment;
6523       return bitfield_alignment;
6524     }
6525 
6526   return alignment;
6527 }
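
/* Illustrative examples of the alignment rules above (hypothetical types):

     typedef int aligned_int __attribute__ ((aligned (16)));

   has the alignment of its main variant (32 bits) for argument-passing
   purposes; the user-requested over-alignment is ignored.  By contrast,

     struct m128 { __int128 x; };

   is naturally 16-byte aligned, which rule C.8 below uses to round the
   general register number up to an even value.  */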
6528 
6529 /* Layout a function argument according to the AAPCS64 rules.  The rule
6530    numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
6531    mode that was originally given to us by the target hook, whereas the
6532    mode in ARG might be the result of replacing partial SVE modes with
6533    the equivalent integer mode.  */
6534 
6535 static void
6536 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6537 {
6538   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6539   tree type = arg.type;
6540   machine_mode mode = arg.mode;
6541   int ncrn, nvrn, nregs;
6542   bool allocate_ncrn, allocate_nvrn;
6543   HOST_WIDE_INT size;
6544   unsigned int abi_break;
6545 
6546   /* We need to do this once per argument.  */
6547   if (pcum->aapcs_arg_processed)
6548     return;
6549 
6550   pcum->aapcs_arg_processed = true;
6551 
6552   pure_scalable_type_info pst_info;
6553   if (type && pst_info.analyze_registers (type))
6554     {
6555       /* The PCS says that it is invalid to pass an SVE value to an
6556 	 unprototyped function.  There is no ABI-defined location we
6557 	 can return in this case, so we have no real choice but to raise
6558 	 an error immediately, even though this is only a query function.  */
6559       if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6560 	{
6561 	  gcc_assert (!pcum->silent_p);
6562 	  error ("SVE type %qT cannot be passed to an unprototyped function",
6563 		 arg.type);
6564 	  /* Avoid repeating the message, and avoid tripping the assert
6565 	     below.  */
6566 	  pcum->pcs_variant = ARM_PCS_SVE;
6567 	}
6568 
6569       /* We would have converted the argument into pass-by-reference
6570 	 form if it didn't fit in registers.  */
6571       pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6572       pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6573       gcc_assert (arg.named
6574 		  && pcum->pcs_variant == ARM_PCS_SVE
6575 		  && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6576 		  && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6577       pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6578 					  P0_REGNUM + pcum->aapcs_nprn);
6579       return;
6580     }
6581 
6582   /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6583      are passed by reference, not by value.  */
6584   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6585   bool sve_p = (vec_flags & VEC_ANY_SVE);
6586   if (sve_p)
6587     /* Vector types can acquire a partial SVE mode using things like
6588        __attribute__((vector_size(N))), and this is potentially useful.
6589        However, the choice of mode doesn't affect the type's ABI
6590        identity, so we should treat the types as though they had
6591        the associated integer mode, just like they did before SVE
6592        was introduced.
6593 
6594        We know that the vector must be 128 bits or smaller,
6595        otherwise we'd have passed it in memory instead.  */
6596     gcc_assert (type
6597 		&& (aarch64_some_values_include_pst_objects_p (type)
6598 		    || (vec_flags & VEC_PARTIAL)));
6599 
6600   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
6601   if (type)
6602     size = int_size_in_bytes (type);
6603   else
6604     /* No frontends can create types with variable-sized modes, so we
6605        shouldn't be asked to pass or return them.  */
6606     size = GET_MODE_SIZE (mode).to_constant ();
6607   size = ROUND_UP (size, UNITS_PER_WORD);
6608 
6609   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6610   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6611 						 mode,
6612 						 type,
6613 						 &nregs);
6614   gcc_assert (!sve_p || !allocate_nvrn);
6615 
6616   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6617      The following code thus handles passing by SIMD/FP registers first.  */
6618 
6619   nvrn = pcum->aapcs_nvrn;
6620 
6621   /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
6622      and homogeneous short-vector aggregates (HVA).  */
6623   if (allocate_nvrn)
6624     {
6625       if (!pcum->silent_p && !TARGET_FLOAT)
6626 	aarch64_err_no_fpadvsimd (mode);
6627 
6628       if (nvrn + nregs <= NUM_FP_ARG_REGS)
6629 	{
6630 	  pcum->aapcs_nextnvrn = nvrn + nregs;
6631 	  if (!aarch64_composite_type_p (type, mode))
6632 	    {
6633 	      gcc_assert (nregs == 1);
6634 	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6635 	    }
6636 	  else
6637 	    {
6638 	      rtx par;
6639 	      int i;
6640 	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6641 	      for (i = 0; i < nregs; i++)
6642 		{
6643 		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6644 					 V0_REGNUM + nvrn + i);
6645 		  rtx offset = gen_int_mode
6646 		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6647 		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6648 		  XVECEXP (par, 0, i) = tmp;
6649 		}
6650 	      pcum->aapcs_reg = par;
6651 	    }
6652 	  return;
6653 	}
6654       else
6655 	{
6656 	  /* C.3 NSRN is set to 8.  */
6657 	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6658 	  goto on_stack;
6659 	}
6660     }
6661 
6662   ncrn = pcum->aapcs_ncrn;
6663   nregs = size / UNITS_PER_WORD;
6664 
6665   /* C6 - C9, though the sign and zero extension semantics are
6666      handled elsewhere.  This is the case where the argument fits
6667      entirely in general registers.  */
6668   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6669     {
6670       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6671 
6672       /* C.8 if the argument has an alignment of 16 then the NGRN is
6673 	 rounded up to the next even number.  */
6674       if (nregs == 2
6675 	  && ncrn % 2
6676 	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
6677 	     comparison is there because for > 16 * BITS_PER_UNIT
6678 	     alignment nregs should be > 2 and therefore it should be
6679 	     passed by reference rather than value.  */
6680 	  && (aarch64_function_arg_alignment (mode, type, &abi_break)
6681 	      == 16 * BITS_PER_UNIT))
6682 	{
6683 	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6684 	    inform (input_location, "parameter passing for argument of type "
6685 		    "%qT changed in GCC 9.1", type);
6686 	  ++ncrn;
6687 	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
6688 	}
6689 
6690       /* If an argument with an SVE mode needs to be shifted up to the
6691 	 high part of the register, treat it as though it had an integer mode.
6692 	 Using the normal (parallel [...]) would suppress the shifting.  */
6693       if (sve_p
6694 	  && BYTES_BIG_ENDIAN
6695 	  && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6696 	  && aarch64_pad_reg_upward (mode, type, false))
6697 	{
6698 	  mode = int_mode_for_mode (mode).require ();
6699 	  sve_p = false;
6700 	}
6701 
6702       /* NREGS can be 0 when e.g. an empty structure is to be passed.
6703 	 A reg is still generated for it, but the caller should be smart
6704 	 enough not to use it.  */
6705       if (nregs == 0
6706 	  || (nregs == 1 && !sve_p)
6707 	  || GET_MODE_CLASS (mode) == MODE_INT)
6708 	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
6709       else
6710 	{
6711 	  rtx par;
6712 	  int i;
6713 
6714 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6715 	  for (i = 0; i < nregs; i++)
6716 	    {
6717 	      scalar_int_mode reg_mode = word_mode;
6718 	      if (nregs == 1)
6719 		reg_mode = int_mode_for_mode (mode).require ();
6720 	      rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
6721 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6722 				       GEN_INT (i * UNITS_PER_WORD));
6723 	      XVECEXP (par, 0, i) = tmp;
6724 	    }
6725 	  pcum->aapcs_reg = par;
6726 	}
6727 
6728       pcum->aapcs_nextncrn = ncrn + nregs;
6729       return;
6730     }
6731 
6732   /* C.11  */
6733   pcum->aapcs_nextncrn = NUM_ARG_REGS;
6734 
6735   /* The argument is passed on stack; record the needed number of words for
6736      this argument and align the total size if necessary.  */
6737 on_stack:
6738   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
6739 
6740   if (aarch64_function_arg_alignment (mode, type, &abi_break)
6741       == 16 * BITS_PER_UNIT)
6742     {
6743       int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
6744       if (pcum->aapcs_stack_size != new_size)
6745 	{
6746 	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6747 	    inform (input_location, "parameter passing for argument of type "
6748 		    "%qT changed in GCC 9.1", type);
6749 	  pcum->aapcs_stack_size = new_size;
6750 	}
6751     }
6752   return;
6753 }
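
/* A worked (purely illustrative) example of the rules above, for the
   hypothetical prototype

     void f (int a, double b, struct { double x, y; } c, __int128 d);

   under the base PCS: a is allocated to W0, b to D0 and the two-member
   HFA c to D1/D2 (C.1 - C.5), and d, being 16-byte aligned, has its
   general register number rounded up to an even value by C.8, so it
   occupies X2/X3.  */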
6754 
6755 /* Implement TARGET_FUNCTION_ARG.  */
6756 
6757 static rtx
6758 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6759 {
6760   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6761   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
6762 	      || pcum->pcs_variant == ARM_PCS_SIMD
6763 	      || pcum->pcs_variant == ARM_PCS_SVE);
6764 
6765   if (arg.end_marker_p ())
6766     return gen_int_mode (pcum->pcs_variant, DImode);
6767 
6768   aarch64_layout_arg (pcum_v, arg);
6769   return pcum->aapcs_reg;
6770 }
6771 
6772 void
6773 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
6774 			      const_tree fntype,
6775 			      rtx libname ATTRIBUTE_UNUSED,
6776 			      const_tree fndecl ATTRIBUTE_UNUSED,
6777 			      unsigned n_named ATTRIBUTE_UNUSED,
6778 			      bool silent_p)
6779 {
6780   pcum->aapcs_ncrn = 0;
6781   pcum->aapcs_nvrn = 0;
6782   pcum->aapcs_nprn = 0;
6783   pcum->aapcs_nextncrn = 0;
6784   pcum->aapcs_nextnvrn = 0;
6785   pcum->aapcs_nextnprn = 0;
6786   if (fntype)
6787     pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
6788   else
6789     pcum->pcs_variant = ARM_PCS_AAPCS64;
6790   pcum->aapcs_reg = NULL_RTX;
6791   pcum->aapcs_arg_processed = false;
6792   pcum->aapcs_stack_words = 0;
6793   pcum->aapcs_stack_size = 0;
6794   pcum->silent_p = silent_p;
6795 
6796   if (!silent_p
6797       && !TARGET_FLOAT
6798       && fntype && fntype != error_mark_node)
6799     {
6800       const_tree type = TREE_TYPE (fntype);
6801       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
6802       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
6803       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6804 						   &mode, &nregs, NULL, false))
6805 	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
6806     }
6807 
6808   if (!silent_p
6809       && !TARGET_SVE
6810       && pcum->pcs_variant == ARM_PCS_SVE)
6811     {
6812       /* We can't gracefully recover at this point, so make this a
6813 	 fatal error.  */
6814       if (fndecl)
6815 	fatal_error (input_location, "%qE requires the SVE ISA extension",
6816 		     fndecl);
6817       else
6818 	fatal_error (input_location, "calls to functions of type %qT require"
6819 		     " the SVE ISA extension", fntype);
6820     }
6821 }
6822 
6823 static void
6824 aarch64_function_arg_advance (cumulative_args_t pcum_v,
6825 			      const function_arg_info &arg)
6826 {
6827   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6828   if (pcum->pcs_variant == ARM_PCS_AAPCS64
6829       || pcum->pcs_variant == ARM_PCS_SIMD
6830       || pcum->pcs_variant == ARM_PCS_SVE)
6831     {
6832       aarch64_layout_arg (pcum_v, arg);
6833       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6834 		  != (pcum->aapcs_stack_words != 0));
6835       pcum->aapcs_arg_processed = false;
6836       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6837       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
6838       pcum->aapcs_nprn = pcum->aapcs_nextnprn;
6839       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6840       pcum->aapcs_stack_words = 0;
6841       pcum->aapcs_reg = NULL_RTX;
6842     }
6843 }
6844 
6845 bool
6846 aarch64_function_arg_regno_p (unsigned regno)
6847 {
6848   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6849 	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6850 }
6851 
6852 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
6853    PARM_BOUNDARY bits of alignment, but will be given anything up
6854    to STACK_BOUNDARY bits if the type requires it.  This makes sure
6855    that both before and after the layout of each argument, the Next
6856    Stacked Argument Address (NSAA) will have a minimum alignment of
6857    8 bytes.  */
6858 
6859 static unsigned int
6860 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
6861 {
6862   unsigned int abi_break;
6863   unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6864 							   &abi_break);
6865   alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
6866   if (abi_break && warn_psabi)
6867     {
6868       abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
6869       if (alignment != abi_break)
6870 	inform (input_location, "parameter passing for argument of type "
6871 		"%qT changed in GCC 9.1", type);
6872     }
6873 
6874   return alignment;
6875 }
6876 
6877 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
6878 
6879 static fixed_size_mode
6880 aarch64_get_reg_raw_mode (int regno)
6881 {
6882   if (TARGET_SVE && FP_REGNUM_P (regno))
6883     /* Don't use the SVE part of the register for __builtin_apply and
6884        __builtin_return.  The SVE registers aren't used by the normal PCS,
6885        so using them there would be a waste of time.  The PCS extensions
6886        for SVE types are fundamentally incompatible with the
6887        __builtin_return/__builtin_apply interface.  */
6888     return as_a <fixed_size_mode> (V16QImode);
6889   return default_get_reg_raw_mode (regno);
6890 }
6891 
6892 /* Implement TARGET_FUNCTION_ARG_PADDING.
6893 
6894    Small aggregate types are placed in the lowest memory address.
6895 
6896    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
6897 
6898 static pad_direction
6899 aarch64_function_arg_padding (machine_mode mode, const_tree type)
6900 {
6901   /* On little-endian targets, the least significant byte of every stack
6902      argument is passed at the lowest byte address of the stack slot.  */
6903   if (!BYTES_BIG_ENDIAN)
6904     return PAD_UPWARD;
6905 
6906   /* Otherwise, integral, floating-point and pointer types are padded downward:
6907      the least significant byte of a stack argument is passed at the highest
6908      byte address of the stack slot.  */
6909   if (type
6910       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6911 	 || POINTER_TYPE_P (type))
6912       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
6913     return PAD_DOWNWARD;
6914 
6915   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
6916   return PAD_UPWARD;
6917 }
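
/* For illustration, on big-endian targets a stack-passed

     char c;

   argument is a scalar and so is padded downward: its byte sits at the
   highest address of its 8-byte slot.  A small

     struct s2 { char a, b; };

   is a composite and is padded upward, with its data at the lowest
   address of the slot.  */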
6918 
6919 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6920 
6921    It specifies padding for the last (may also be the only)
6922    element of a block move between registers and memory.  If
6923    assuming the block is in the memory, padding upward means that
6924    the last element is padded after its highest significant byte,
6925    while in downward padding, the last element is padded at its
6926    least significant byte side.
6927 
6928    Small aggregates and small complex types are always padded
6929    upwards.
6930 
6931    We don't need to worry about homogeneous floating-point or
6932    short-vector aggregates; their move is not affected by the
6933    padding direction determined here.  Regardless of endianness,
6934    each element of such an aggregate is put in the least
6935    significant bits of a fp/simd register.
6936 
6937    Return !BYTES_BIG_ENDIAN if the least significant byte of the
6938    register has useful data, and return the opposite if the most
6939    significant byte does.  */
6940 
6941 bool
6942 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6943 		     bool first ATTRIBUTE_UNUSED)
6944 {
6945 
6946   /* Aside from pure scalable types, small composite types are always
6947      padded upward.  */
6948   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6949     {
6950       HOST_WIDE_INT size;
6951       if (type)
6952 	size = int_size_in_bytes (type);
6953       else
6954 	/* No frontends can create types with variable-sized modes, so we
6955 	   shouldn't be asked to pass or return them.  */
6956 	size = GET_MODE_SIZE (mode).to_constant ();
6957       if (size < 2 * UNITS_PER_WORD)
6958 	{
6959 	  pure_scalable_type_info pst_info;
6960 	  if (pst_info.analyze_registers (type))
6961 	    return false;
6962 	  return true;
6963 	}
6964     }
6965 
6966   /* Otherwise, use the default padding.  */
6967   return !BYTES_BIG_ENDIAN;
6968 }
6969 
6970 static scalar_int_mode
6971 aarch64_libgcc_cmp_return_mode (void)
6972 {
6973   return SImode;
6974 }
6975 
6976 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6977 
6978 /* We use the 12-bit shifted immediate arithmetic instructions so values
6979    must be multiple of (1 << 12), i.e. 4096.  */
6980 #define ARITH_FACTOR 4096
6981 
6982 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6983 #error Cannot use simple address calculation for stack probing
6984 #endif
6985 
6986 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6987    inclusive.  These are offsets from the current stack pointer.  */
6988 
6989 static void
6990 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6991 {
6992   HOST_WIDE_INT size;
6993   if (!poly_size.is_constant (&size))
6994     {
6995       sorry ("stack probes for SVE frames");
6996       return;
6997     }
6998 
6999   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7000 
7001   /* See the same assertion on PROBE_INTERVAL above.  */
7002   gcc_assert ((first % ARITH_FACTOR) == 0);
7003 
7004   /* See if we have a constant small number of probes to generate.  If so,
7005      that's the easy case.  */
7006   if (size <= PROBE_INTERVAL)
7007     {
7008       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7009 
7010       emit_set_insn (reg1,
7011 		     plus_constant (Pmode,
7012 				    stack_pointer_rtx, -(first + base)));
7013       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7014     }
7015 
7016   /* The run-time loop is made up of 8 insns in the generic case while the
7017      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
7018   else if (size <= 4 * PROBE_INTERVAL)
7019     {
7020       HOST_WIDE_INT i, rem;
7021 
7022       emit_set_insn (reg1,
7023 		     plus_constant (Pmode,
7024 				    stack_pointer_rtx,
7025 				    -(first + PROBE_INTERVAL)));
7026       emit_stack_probe (reg1);
7027 
7028       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7029 	 it exceeds SIZE.  If only two probes are needed, this will not
7030 	 generate any code.  Then probe at FIRST + SIZE.  */
7031       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7032 	{
7033 	  emit_set_insn (reg1,
7034 			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7035 	  emit_stack_probe (reg1);
7036 	}
7037 
7038       rem = size - (i - PROBE_INTERVAL);
7039       if (rem > 256)
7040 	{
7041 	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7042 
7043 	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7044 	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7045 	}
7046       else
7047 	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7048     }
7049 
7050   /* Otherwise, do the same as above, but in a loop.  Note that we must be
7051      extra careful with variables wrapping around because we might be at
7052      the very top (or the very bottom) of the address space and we have
7053      to be able to handle this case properly; in particular, we use an
7054      equality test for the loop condition.  */
7055   else
7056     {
7057       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7058 
7059       /* Step 1: round SIZE to the previous multiple of the interval.  */
7060 
7061       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7062 
7063 
7064       /* Step 2: compute initial and final value of the loop counter.  */
7065 
7066       /* TEST_ADDR = SP + FIRST.  */
7067       emit_set_insn (reg1,
7068 		     plus_constant (Pmode, stack_pointer_rtx, -first));
7069 
7070       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
7071       HOST_WIDE_INT adjustment = - (first + rounded_size);
7072       if (! aarch64_uimm12_shift (adjustment))
7073 	{
7074 	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7075 					  true, Pmode);
7076 	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7077 	}
7078       else
7079 	emit_set_insn (reg2,
7080 		       plus_constant (Pmode, stack_pointer_rtx, adjustment));
7081 
7082       /* Step 3: the loop
7083 
7084 	 do
7085 	   {
7086 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7087 	     probe at TEST_ADDR
7088 	   }
7089 	 while (TEST_ADDR != LAST_ADDR)
7090 
7091 	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7092 	 until it is equal to ROUNDED_SIZE.  */
7093 
7094       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7095 
7096 
7097       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7098 	 that SIZE is equal to ROUNDED_SIZE.  */
7099 
7100       if (size != rounded_size)
7101 	{
7102 	  HOST_WIDE_INT rem = size - rounded_size;
7103 
7104 	  if (rem > 256)
7105 	    {
7106 	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7107 
7108 	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7109 	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7110 	    }
7111 	  else
7112 	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7113 	}
7114     }
7115 
7116   /* Make sure nothing is scheduled before we are done.  */
7117   emit_insn (gen_blockage ());
7118 }
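
/* A rough sketch (for illustration only) of the code emitted by the middle
   case above, for FIRST == 0 and a SIZE just over two probe intervals,
   with x9 standing in for PROBE_STACK_FIRST_REGNUM:

	sub	x9, sp, #PROBE_INTERVAL
	str	xzr, [x9]
	sub	x9, x9, #PROBE_INTERVAL
	str	xzr, [x9]
	str	xzr, [x9, #-rem]

   where rem is the residual (at most 256 bytes here; larger residuals are
   first rounded up via ARITH_FACTOR).  */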
7119 
7120 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
7121    absolute addresses.  */
7122 
7123 const char *
7124 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7125 {
7126   static int labelno = 0;
7127   char loop_lab[32];
7128   rtx xops[2];
7129 
7130   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7131 
7132   /* Loop.  */
7133   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7134 
7135   HOST_WIDE_INT stack_clash_probe_interval
7136     = 1 << param_stack_clash_protection_guard_size;
7137 
7138   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
7139   xops[0] = reg1;
7140   HOST_WIDE_INT interval;
7141   if (flag_stack_clash_protection)
7142     interval = stack_clash_probe_interval;
7143   else
7144     interval = PROBE_INTERVAL;
7145 
7146   gcc_assert (aarch64_uimm12_shift (interval));
7147   xops[1] = GEN_INT (interval);
7148 
7149   output_asm_insn ("sub\t%0, %0, %1", xops);
7150 
7151   /* If doing stack clash protection then we probe up by the ABI specified
7152      amount.  We do this because we're dropping full pages at a time in the
7153      loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
7154   if (flag_stack_clash_protection)
7155     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7156   else
7157     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7158 
7159   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
7160      by this amount for each iteration.  */
7161   output_asm_insn ("str\txzr, [%0, %1]", xops);
7162 
7163   /* Test if TEST_ADDR == LAST_ADDR.  */
7164   xops[1] = reg2;
7165   output_asm_insn ("cmp\t%0, %1", xops);
7166 
7167   /* Branch.  */
7168   fputs ("\tb.ne\t", asm_out_file);
7169   assemble_name_raw (asm_out_file, loop_lab);
7170   fputc ('\n', asm_out_file);
7171 
7172   return "";
7173 }
7174 
7175 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7176    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7177    of GUARD_SIZE.  When a probe is emitted it is done at most
7178    MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7179    at most MIN_PROBE_THRESHOLD.  By the end of this function
7180    BASE = BASE - ADJUSTMENT.  */
7181 
7182 const char *
7183 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7184 				      rtx min_probe_threshold, rtx guard_size)
7185 {
7186   /* This function is not allowed to use any instruction generation function
7187      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
7188      so instead emit the code you want using output_asm_insn.  */
7189   gcc_assert (flag_stack_clash_protection);
7190   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7191   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7192 
7193   /* The minimum required allocation before the residual requires probing.  */
7194   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7195 
7196   /* Clamp the value down to the nearest value that can be used with a cmp.  */
7197   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7198   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7199 
7200   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7201   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7202 
7203   static int labelno = 0;
7204   char loop_start_lab[32];
7205   char loop_end_lab[32];
7206   rtx xops[2];
7207 
7208   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7209   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7210 
7211   /* Emit loop start label.  */
7212   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7213 
7214   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
7215   xops[0] = adjustment;
7216   xops[1] = probe_offset_value_rtx;
7217   output_asm_insn ("cmp\t%0, %1", xops);
7218 
7219   /* Branch to end if not enough adjustment to probe.  */
7220   fputs ("\tb.lt\t", asm_out_file);
7221   assemble_name_raw (asm_out_file, loop_end_lab);
7222   fputc ('\n', asm_out_file);
7223 
7224   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
7225   xops[0] = base;
7226   xops[1] = probe_offset_value_rtx;
7227   output_asm_insn ("sub\t%0, %0, %1", xops);
7228 
7229   /* Probe at BASE.  */
7230   xops[1] = const0_rtx;
7231   output_asm_insn ("str\txzr, [%0, %1]", xops);
7232 
7233   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
7234   xops[0] = adjustment;
7235   xops[1] = probe_offset_value_rtx;
7236   output_asm_insn ("sub\t%0, %0, %1", xops);
7237 
7238   /* Branch to start if still more bytes to allocate.  */
7239   fputs ("\tb\t", asm_out_file);
7240   assemble_name_raw (asm_out_file, loop_start_lab);
7241   fputc ('\n', asm_out_file);
7242 
7243   /* No probe is needed; leave the loop.  */
7244   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7245 
7246   /* BASE = BASE - ADJUSTMENT.  */
7247   xops[0] = base;
7248   xops[1] = adjustment;
7249   output_asm_insn ("sub\t%0, %0, %1", xops);
7250   return "";
7251 }
7252 
7253 /* Determine whether a frame chain needs to be generated.  */
7254 static bool
7255 aarch64_needs_frame_chain (void)
7256 {
7257   /* Force a frame chain for EH returns so the return address is at FP+8.  */
7258   if (frame_pointer_needed || crtl->calls_eh_return)
7259     return true;
7260 
7261   /* A leaf function cannot have calls or write LR.  */
7262   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7263 
7264   /* Don't use a frame chain in leaf functions if leaf frame pointers
7265      are disabled.  */
7266   if (flag_omit_leaf_frame_pointer && is_leaf)
7267     return false;
7268 
7269   return aarch64_use_frame_pointer;
7270 }
7271 
7272 /* Mark the registers that need to be saved by the callee and calculate
7273    the size of the callee-saved registers area and frame record (both FP
7274    and LR may be omitted).  */
7275 static void
7276 aarch64_layout_frame (void)
7277 {
7278   poly_int64 offset = 0;
7279   int regno, last_fp_reg = INVALID_REGNUM;
7280   machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7281   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7282   bool frame_related_fp_reg_p = false;
7283   aarch64_frame &frame = cfun->machine->frame;
7284 
7285   frame.emit_frame_chain = aarch64_needs_frame_chain ();
7286 
7287   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
7288      the mid-end is doing.  */
7289   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7290 
7291 #define SLOT_NOT_REQUIRED (-2)
7292 #define SLOT_REQUIRED     (-1)
7293 
7294   frame.wb_candidate1 = INVALID_REGNUM;
7295   frame.wb_candidate2 = INVALID_REGNUM;
7296   frame.spare_pred_reg = INVALID_REGNUM;
7297 
7298   /* First mark all the registers that really need to be saved...  */
7299   for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7300     frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
7301 
7302   /* ... that includes the eh data registers (if needed)...  */
7303   if (crtl->calls_eh_return)
7304     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
7305       frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
7306 
7307   /* ... and any callee saved register that dataflow says is live.  */
7308   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7309     if (df_regs_ever_live_p (regno)
7310 	&& !fixed_regs[regno]
7311 	&& (regno == R30_REGNUM
7312 	    || !crtl->abi->clobbers_full_reg_p (regno)))
7313       frame.reg_offset[regno] = SLOT_REQUIRED;
7314 
7315   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7316     if (df_regs_ever_live_p (regno)
7317 	&& !fixed_regs[regno]
7318 	&& !crtl->abi->clobbers_full_reg_p (regno))
7319       {
7320 	frame.reg_offset[regno] = SLOT_REQUIRED;
7321 	last_fp_reg = regno;
7322 	if (aarch64_emit_cfi_for_reg_p (regno))
7323 	  frame_related_fp_reg_p = true;
7324       }
7325 
7326   /* Big-endian SVE frames need a spare predicate register in order
7327      to save Z8-Z15.  Decide which register they should use.  Prefer
7328      an unused argument register if possible, so that we don't force P4
7329      to be saved unnecessarily.  */
7330   if (frame_related_fp_reg_p
7331       && crtl->abi->id () == ARM_PCS_SVE
7332       && BYTES_BIG_ENDIAN)
7333     {
7334       bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7335       bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7336       for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7337 	if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7338 	  break;
7339       gcc_assert (regno <= P7_REGNUM);
7340       frame.spare_pred_reg = regno;
7341       df_set_regs_ever_live (regno, true);
7342     }
7343 
7344   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7345     if (df_regs_ever_live_p (regno)
7346 	&& !fixed_regs[regno]
7347 	&& !crtl->abi->clobbers_full_reg_p (regno))
7348       frame.reg_offset[regno] = SLOT_REQUIRED;
7349 
7350   /* With stack-clash, LR must be saved in non-leaf functions.  The saving of
7351      LR counts as an implicit probe which allows us to maintain the invariant
7352      described in the comment at expand_prologue.  */
7353   gcc_assert (crtl->is_leaf
7354 	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
7355 
7356   /* Now assign stack slots for the registers.  Start with the predicate
7357      registers, since predicate LDR and STR have a relatively small
7358      offset range.  These saves happen below the hard frame pointer.  */
7359   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7360     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7361       {
7362 	frame.reg_offset[regno] = offset;
7363 	offset += BYTES_PER_SVE_PRED;
7364       }
7365 
7366   if (maybe_ne (offset, 0))
7367     {
7368       /* If we have any vector registers to save above the predicate registers,
7369 	 the offset of the vector register save slots needs to be a multiple
7370 	 of the vector size.  This lets us use the immediate forms of LDR/STR
7371 	 (or LD1/ST1 for big-endian).
7372 
7373 	 A vector register is 8 times the size of a predicate register,
7374 	 and we need to save a maximum of 12 predicate registers, so the
7375 	 first vector register will be at either #1, MUL VL or #2, MUL VL.
7376 
7377 	 If we don't have any vector registers to save, and we know how
7378 	 big the predicate save area is, we can just round it up to the
7379 	 next 16-byte boundary.  */
7380       if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
7381 	offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7382       else
7383 	{
7384 	  if (known_le (offset, vector_save_size))
7385 	    offset = vector_save_size;
7386 	  else if (known_le (offset, vector_save_size * 2))
7387 	    offset = vector_save_size * 2;
7388 	  else
7389 	    gcc_unreachable ();
7390 	}
7391     }
7392 
7393   /* If we need to save any SVE vector registers, add them next.  */
7394   if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7395     for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7396       if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7397 	{
7398 	  frame.reg_offset[regno] = offset;
7399 	  offset += vector_save_size;
7400 	}
7401 
7402   /* OFFSET is now the offset of the hard frame pointer from the bottom
7403      of the callee save area.  */
7404   bool saves_below_hard_fp_p = maybe_ne (offset, 0);
7405   frame.below_hard_fp_saved_regs_size = offset;
7406   if (frame.emit_frame_chain)
7407     {
7408       /* FP and LR are placed in the linkage record.  */
7409       frame.reg_offset[R29_REGNUM] = offset;
7410       frame.wb_candidate1 = R29_REGNUM;
7411       frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
7412       frame.wb_candidate2 = R30_REGNUM;
7413       offset += 2 * UNITS_PER_WORD;
7414     }
7415 
7416   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7417     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7418       {
7419 	frame.reg_offset[regno] = offset;
7420 	if (frame.wb_candidate1 == INVALID_REGNUM)
7421 	  frame.wb_candidate1 = regno;
7422 	else if (frame.wb_candidate2 == INVALID_REGNUM)
7423 	  frame.wb_candidate2 = regno;
7424 	offset += UNITS_PER_WORD;
7425       }
7426 
7427   poly_int64 max_int_offset = offset;
7428   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7429   bool has_align_gap = maybe_ne (offset, max_int_offset);
7430 
7431   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7432     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7433       {
7434 	/* If there is an alignment gap between integer and fp callee-saves,
7435 	   allocate the last fp register to it if possible.  */
7436 	if (regno == last_fp_reg
7437 	    && has_align_gap
7438 	    && known_eq (vector_save_size, 8)
7439 	    && multiple_p (offset, 16))
7440 	  {
7441 	    frame.reg_offset[regno] = max_int_offset;
7442 	    break;
7443 	  }
7444 
7445 	frame.reg_offset[regno] = offset;
7446 	if (frame.wb_candidate1 == INVALID_REGNUM)
7447 	  frame.wb_candidate1 = regno;
7448 	else if (frame.wb_candidate2 == INVALID_REGNUM
7449 		 && frame.wb_candidate1 >= V0_REGNUM)
7450 	  frame.wb_candidate2 = regno;
7451 	offset += vector_save_size;
7452       }
7453 
7454   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7455 
7456   frame.saved_regs_size = offset;
7457 
7458   poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
7459 
7460   poly_int64 above_outgoing_args
7461     = aligned_upper_bound (varargs_and_saved_regs_size
7462 			   + get_frame_size (),
7463 			   STACK_BOUNDARY / BITS_PER_UNIT);
7464 
7465   frame.hard_fp_offset
7466     = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
7467 
7468   /* Both these values are already aligned.  */
7469   gcc_assert (multiple_p (crtl->outgoing_args_size,
7470 			  STACK_BOUNDARY / BITS_PER_UNIT));
7471   frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
7472 
7473   frame.locals_offset = frame.saved_varargs_size;
7474 
7475   frame.initial_adjust = 0;
7476   frame.final_adjust = 0;
7477   frame.callee_adjust = 0;
7478   frame.sve_callee_adjust = 0;
7479   frame.callee_offset = 0;
7480 
7481   HOST_WIDE_INT max_push_offset = 0;
7482   if (frame.wb_candidate2 != INVALID_REGNUM)
7483     max_push_offset = 512;
7484   else if (frame.wb_candidate1 != INVALID_REGNUM)
7485     max_push_offset = 256;
7486 
7487   HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
7488   HOST_WIDE_INT const_saved_regs_size;
7489   if (frame.frame_size.is_constant (&const_size)
7490       && const_size < max_push_offset
7491       && known_eq (frame.hard_fp_offset, const_size))
7492     {
7493       /* Simple, small frame with no outgoing arguments:
7494 
7495 	 stp reg1, reg2, [sp, -frame_size]!
7496 	 stp reg3, reg4, [sp, 16]  */
7497       frame.callee_adjust = const_size;
7498     }
7499   else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
7500 	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
7501 	   && const_outgoing_args_size + const_saved_regs_size < 512
7502 	   /* We could handle this case even with outgoing args, provided
7503 	      that the number of args left us with valid offsets for all
7504 	      predicate and vector save slots.  It's such a rare case that
7505 	      it hardly seems worth the effort though.  */
7506 	   && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
7507 	   && !(cfun->calls_alloca
7508 		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
7509 		&& const_fp_offset < max_push_offset))
7510     {
7511       /* Frame with small outgoing arguments:
7512 
7513 	 sub sp, sp, frame_size
7514 	 stp reg1, reg2, [sp, outgoing_args_size]
7515 	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
7516       frame.initial_adjust = frame.frame_size;
7517       frame.callee_offset = const_outgoing_args_size;
7518     }
7519   else if (saves_below_hard_fp_p
7520 	   && known_eq (frame.saved_regs_size,
7521 			frame.below_hard_fp_saved_regs_size))
7522     {
7523       /* Frame in which all saves are SVE saves:
7524 
7525 	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
7526 	 save SVE registers relative to SP
7527 	 sub sp, sp, outgoing_args_size  */
7528       frame.initial_adjust = (frame.hard_fp_offset
7529 			      + frame.below_hard_fp_saved_regs_size);
7530       frame.final_adjust = crtl->outgoing_args_size;
7531     }
7532   else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
7533 	   && const_fp_offset < max_push_offset)
7534     {
7535       /* Frame with large outgoing arguments or SVE saves, but with
7536 	 a small local area:
7537 
7538 	 stp reg1, reg2, [sp, -hard_fp_offset]!
7539 	 stp reg3, reg4, [sp, 16]
7540 	 [sub sp, sp, below_hard_fp_saved_regs_size]
7541 	 [save SVE registers relative to SP]
7542 	 sub sp, sp, outgoing_args_size  */
7543       frame.callee_adjust = const_fp_offset;
7544       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
7545       frame.final_adjust = crtl->outgoing_args_size;
7546     }
7547   else
7548     {
7549       /* Frame with large local area and outgoing arguments or SVE saves,
7550 	 using frame pointer:
7551 
7552 	 sub sp, sp, hard_fp_offset
7553 	 stp x29, x30, [sp, 0]
7554 	 add x29, sp, 0
7555 	 stp reg3, reg4, [sp, 16]
7556 	 [sub sp, sp, below_hard_fp_saved_regs_size]
7557 	 [save SVE registers relative to SP]
7558 	 sub sp, sp, outgoing_args_size  */
7559       frame.initial_adjust = frame.hard_fp_offset;
7560       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
7561       frame.final_adjust = crtl->outgoing_args_size;
7562     }
7563 
7564   /* Make sure the individual adjustments add up to the full frame size.  */
7565   gcc_assert (known_eq (frame.initial_adjust
7566 			+ frame.callee_adjust
7567 			+ frame.sve_callee_adjust
7568 			+ frame.final_adjust, frame.frame_size));
7569 
7570   if (!frame.emit_frame_chain && frame.callee_adjust == 0)
7571     {
7572       /* We've decided not to associate any register saves with the initial
7573 	 stack allocation.  */
7574       frame.wb_candidate1 = INVALID_REGNUM;
7575       frame.wb_candidate2 = INVALID_REGNUM;
7576     }
7577 
7578   frame.laid_out = true;
7579 }
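
/* A worked (purely illustrative) example of the layout above: a non-leaf,
   non-SVE function that needs a frame chain, saves x19, and has 32 bytes
   of locals and no outgoing arguments ends up with

     reg_offset[x29] = 0, reg_offset[x30] = 8, reg_offset[x19] = 16,
     saved_regs_size = 32, hard_fp_offset = frame_size = 64,

   which satisfies the "simple, small frame" case above, so the whole
   allocation is folded into callee_adjust:

	stp	x29, x30, [sp, -64]!
	mov	x29, sp
	str	x19, [sp, 16]  */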
7580 
7581 /* Return true if the register REGNO is saved on entry to
7582    the current function.  */
7583 
7584 static bool
7585 aarch64_register_saved_on_entry (int regno)
7586 {
7587   return known_ge (cfun->machine->frame.reg_offset[regno], 0);
7588 }
7589 
7590 /* Return the next register, from REGNO up to LIMIT, that the callee
7591    needs to save.  */
7592 
7593 static unsigned
7594 aarch64_next_callee_save (unsigned regno, unsigned limit)
7595 {
7596   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
7597     regno ++;
7598   return regno;
7599 }
7600 
7601 /* Push the register number REGNO of mode MODE to the stack with write-back
7602    adjusting the stack by ADJUSTMENT.  */
7603 
7604 static void
7605 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
7606 			   HOST_WIDE_INT adjustment)
7607 {
7608   rtx base_rtx = stack_pointer_rtx;
7609   rtx insn, reg, mem;
7610 
7611   reg = gen_rtx_REG (mode, regno);
7612   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
7613 			    plus_constant (Pmode, base_rtx, -adjustment));
7614   mem = gen_frame_mem (mode, mem);
7615 
7616   insn = emit_move_insn (mem, reg);
7617   RTX_FRAME_RELATED_P (insn) = 1;
7618 }
7619 
7620 /* Generate and return an instruction to store the pair of registers
7621    REG and REG2 of mode MODE to location BASE with write-back adjusting
7622    the stack location BASE by ADJUSTMENT.  */
7623 
7624 static rtx
7625 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7626 			  HOST_WIDE_INT adjustment)
7627 {
7628   switch (mode)
7629     {
7630     case E_DImode:
7631       return gen_storewb_pairdi_di (base, base, reg, reg2,
7632 				    GEN_INT (-adjustment),
7633 				    GEN_INT (UNITS_PER_WORD - adjustment));
7634     case E_DFmode:
7635       return gen_storewb_pairdf_di (base, base, reg, reg2,
7636 				    GEN_INT (-adjustment),
7637 				    GEN_INT (UNITS_PER_WORD - adjustment));
7638     case E_TFmode:
7639       return gen_storewb_pairtf_di (base, base, reg, reg2,
7640 				    GEN_INT (-adjustment),
7641 				    GEN_INT (UNITS_PER_VREG - adjustment));
7642     default:
7643       gcc_unreachable ();
7644     }
7645 }
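
/* For example, the E_DImode case above, with BASE being the stack pointer
   and an ADJUSTMENT of 32, corresponds to an instruction of the form

     stp reg, reg2, [sp, #-32]!

   i.e. a store pair with pre-index writeback.  The operands shown are
   illustrative only; the exact encoding comes from the corresponding
   storewb_pair* patterns.  */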
7646 
7647 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
7648    stack pointer by ADJUSTMENT.  */
7649 
7650 static void
7651 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
7652 {
7653   rtx_insn *insn;
7654   machine_mode mode = aarch64_reg_save_mode (regno1);
7655 
7656   if (regno2 == INVALID_REGNUM)
7657     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
7658 
7659   rtx reg1 = gen_rtx_REG (mode, regno1);
7660   rtx reg2 = gen_rtx_REG (mode, regno2);
7661 
7662   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
7663 					      reg2, adjustment));
7664   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
7665   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7666   RTX_FRAME_RELATED_P (insn) = 1;
7667 }
7668 
7669 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
7670    adjusting it by ADJUSTMENT afterwards.  */
7671 
7672 static rtx
7673 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7674 			 HOST_WIDE_INT adjustment)
7675 {
7676   switch (mode)
7677     {
7678     case E_DImode:
7679       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
7680 				   GEN_INT (UNITS_PER_WORD));
7681     case E_DFmode:
7682       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
7683 				   GEN_INT (UNITS_PER_WORD));
7684     case E_TFmode:
7685       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
7686 				   GEN_INT (UNITS_PER_VREG));
7687     default:
7688       gcc_unreachable ();
7689     }
7690 }
7691 
7692 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
7693    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
7694    into CFI_OPS.  */
7695 
7696 static void
7697 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
7698 		  rtx *cfi_ops)
7699 {
7700   machine_mode mode = aarch64_reg_save_mode (regno1);
7701   rtx reg1 = gen_rtx_REG (mode, regno1);
7702 
7703   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
7704 
7705   if (regno2 == INVALID_REGNUM)
7706     {
7707       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
7708       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
7709       emit_move_insn (reg1, gen_frame_mem (mode, mem));
7710     }
7711   else
7712     {
7713       rtx reg2 = gen_rtx_REG (mode, regno2);
7714       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7715       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
7716 					  reg2, adjustment));
7717     }
7718 }
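
/* As an illustration, popping x19 and x20 with an ADJUSTMENT of 32 emits a
   post-indexed load pair of the form

     ldp x19, x20, [sp], #32

   while the REG_CFA_RESTORE notes for both registers are accumulated in
   *CFI_OPS; the caller decides which instruction the notes are attached to
   (see aarch64_expand_epilogue below).  */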
7719 
7720 /* Generate and return a store pair instruction of mode MODE to store
7721    register REG1 to MEM1 and register REG2 to MEM2.  */
7722 
7723 static rtx
7724 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
7725 			rtx reg2)
7726 {
7727   switch (mode)
7728     {
7729     case E_DImode:
7730       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
7731 
7732     case E_DFmode:
7733       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
7734 
7735     case E_TFmode:
7736       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
7737 
7738     case E_V4SImode:
7739       return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
7740 
7741     case E_V16QImode:
7742       return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
7743 
7744     default:
7745       gcc_unreachable ();
7746     }
7747 }
7748 
7749 /* Generate and return a load pair instruction of mode MODE to load register
7750    REG1 from MEM1 and register REG2 from MEM2.  */
7751 
7752 static rtx
7753 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
7754 		       rtx mem2)
7755 {
7756   switch (mode)
7757     {
7758     case E_DImode:
7759       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
7760 
7761     case E_DFmode:
7762       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
7763 
7764     case E_TFmode:
7765       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
7766 
7767     case E_V4SImode:
7768       return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
7769 
7770     default:
7771       gcc_unreachable ();
7772     }
7773 }
7774 
7775 /* Return TRUE if return address signing should be enabled for the current
7776    function, otherwise return FALSE.  */
7777 
7778 bool
7779 aarch64_return_address_signing_enabled (void)
7780 {
7781   /* This function should only be called after the frame has been laid out.  */
7782   gcc_assert (cfun->machine->frame.laid_out);
7783 
7784   /* Turn return address signing off in any function that uses
7785      __builtin_eh_return.  The address passed to __builtin_eh_return
7786      is not signed, so either it has to be signed (with the original sp)
7787      or the code path that uses it has to avoid authenticating it.
7788      Currently eh_return introduces a return-to-anywhere gadget, no
7789      matter what we do here, since it uses RET with a user-provided
7790      address.  An ideal fix for that is to use an indirect branch, which
7791      can be protected with BTI j (to some extent).  */
7792   if (crtl->calls_eh_return)
7793     return false;
7794 
7795   /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
7796      function if its LR is pushed onto the stack.  */
7797   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
7798 	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
7799 	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
7800 }
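
/* When this returns true, aarch64_expand_prologue below signs LR on entry
   (PACIASP or PACIBSP depending on aarch64_ra_sign_key) and the epilogue
   authenticates it before returning.  A sketch of the resulting code for a
   simple non-leaf function (illustrative only; the epilogue instructions
   are emitted elsewhere):

     paciasp                     // sign LR using SP as the modifier
     stp x29, x30, [sp, #-16]!
     ...
     ldp x29, x30, [sp], #16
     autiasp                     // authenticate LR before the return
     ret                                                                 */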
7801 
7802 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
7803 bool
7804 aarch64_bti_enabled (void)
7805 {
7806   return (aarch64_enable_bti == 1);
7807 }
7808 
7809 /* The caller is going to use ST1D or LD1D to save or restore an SVE
7810    register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
7811    the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:
7812 
7813      (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
7814 	 or LD1D address
7815 
7816      (2) setting PRED to a valid predicate register for the ST1D or LD1D,
7817 	 if the variable isn't already nonnull
7818 
7819    (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
7820    Handle this case using a temporary base register that is suitable for
7821    all offsets in that range.  Use ANCHOR_REG as this base register if it
7822    is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */
7823 
7824 static inline void
7825 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7826 				     rtx &anchor_reg, poly_int64 &offset,
7827 				     rtx &ptrue)
7828 {
7829   if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7830     {
7831       /* This is the maximum valid offset of the anchor from the base.
7832 	 Lower values would be valid too.  */
7833       poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7834       if (!anchor_reg)
7835 	{
7836 	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7837 	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7838 				    gen_int_mode (anchor_offset, Pmode)));
7839 	}
7840       base_rtx = anchor_reg;
7841       offset -= anchor_offset;
7842     }
7843   if (!ptrue)
7844     {
7845       int pred_reg = cfun->machine->frame.spare_pred_reg;
7846       emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7847 		      CONSTM1_RTX (VNx16BImode));
7848       ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7849     }
7850 }
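
/* Worked example (illustrative values only): if OFFSET on entry is
   10 * GET_MODE_SIZE (MODE), it is outside the [-8, 7] * VL immediate range
   of ST1D/LD1D, so the code above sets ANCHOR_REG to
   BASE_RTX + 16 * GET_MODE_SIZE (MODE) and rewrites the access as
   anchor_reg - 6 * GET_MODE_SIZE (MODE), i.e. something like

     addvl  x11, sp, #16                    // anchor = sp + 16 * VL
     st1d   { z8.d }, p4, [x11, #-6, mul vl]

   where x11 stands for STACK_CLASH_SVE_CFA_REGNUM and p4 for the spare
   predicate register set to PTRUE above.  */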
7851 
7852 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7853    is saved at BASE + OFFSET.  */
7854 
7855 static void
7856 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7857 			    rtx base, poly_int64 offset)
7858 {
7859   rtx mem = gen_frame_mem (GET_MODE (reg),
7860 			   plus_constant (Pmode, base, offset));
7861   add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7862 }
7863 
7864 /* Emit code to save the callee-saved registers from register number START
7865    to LIMIT to the stack at the location starting at offset START_OFFSET,
7866    skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
7867    is true if the hard frame pointer has been set up.  */
7868 
7869 static void
7870 aarch64_save_callee_saves (poly_int64 start_offset,
7871 			   unsigned start, unsigned limit, bool skip_wb,
7872 			   bool hard_fp_valid_p)
7873 {
7874   rtx_insn *insn;
7875   unsigned regno;
7876   unsigned regno2;
7877   rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7878 
7879   for (regno = aarch64_next_callee_save (start, limit);
7880        regno <= limit;
7881        regno = aarch64_next_callee_save (regno + 1, limit))
7882     {
7883       rtx reg, mem;
7884       poly_int64 offset;
7885       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7886 
7887       if (skip_wb
7888 	  && (regno == cfun->machine->frame.wb_candidate1
7889 	      || regno == cfun->machine->frame.wb_candidate2))
7890 	continue;
7891 
7892       if (cfun->machine->reg_is_wrapped_separately[regno])
7893 	continue;
7894 
7895       machine_mode mode = aarch64_reg_save_mode (regno);
7896       reg = gen_rtx_REG (mode, regno);
7897       offset = start_offset + cfun->machine->frame.reg_offset[regno];
7898       rtx base_rtx = stack_pointer_rtx;
7899       poly_int64 sp_offset = offset;
7900 
7901       HOST_WIDE_INT const_offset;
7902       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7903 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7904 					     offset, ptrue);
7905       else if (GP_REGNUM_P (regno)
7906 	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
7907 	{
7908 	  gcc_assert (known_eq (start_offset, 0));
7909 	  poly_int64 fp_offset
7910 	    = cfun->machine->frame.below_hard_fp_saved_regs_size;
7911 	  if (hard_fp_valid_p)
7912 	    base_rtx = hard_frame_pointer_rtx;
7913 	  else
7914 	    {
7915 	      if (!anchor_reg)
7916 		{
7917 		  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7918 		  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7919 					    gen_int_mode (fp_offset, Pmode)));
7920 		}
7921 	      base_rtx = anchor_reg;
7922 	    }
7923 	  offset -= fp_offset;
7924 	}
7925       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7926       bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
7927 
7928       if (!aarch64_sve_mode_p (mode)
7929 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7930 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
7931 	  && known_eq (GET_MODE_SIZE (mode),
7932 		       cfun->machine->frame.reg_offset[regno2]
7933 		       - cfun->machine->frame.reg_offset[regno]))
7934 	{
7935 	  rtx reg2 = gen_rtx_REG (mode, regno2);
7936 	  rtx mem2;
7937 
7938 	  offset += GET_MODE_SIZE (mode);
7939 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7940 	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7941 						    reg2));
7942 
7943 	  /* The first part of a frame-related parallel insn is
7944 	     always assumed to be relevant to the frame
7945 	     calculations; subsequent parts are only
7946 	     frame-related if explicitly marked.  */
7947 	  if (aarch64_emit_cfi_for_reg_p (regno2))
7948 	    {
7949 	      if (need_cfa_note_p)
7950 		aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7951 					    sp_offset + GET_MODE_SIZE (mode));
7952 	      else
7953 		RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7954 	    }
7955 
7956 	  regno = regno2;
7957 	}
7958       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7959 	{
7960 	  insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7961 	  need_cfa_note_p = true;
7962 	}
7963       else if (aarch64_sve_mode_p (mode))
7964 	insn = emit_insn (gen_rtx_SET (mem, reg));
7965       else
7966 	insn = emit_move_insn (mem, reg);
7967 
7968       RTX_FRAME_RELATED_P (insn) = frame_related_p;
7969       if (frame_related_p && need_cfa_note_p)
7970 	aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7971     }
7972 }
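
/* For instance (illustrative offsets), if x19 is at reg_offset 16 and x20 at
   reg_offset 24, both saved in DImode, the pairing test above succeeds
   (24 - 16 == GET_MODE_SIZE (DImode)) and a single

     stp x19, x20, [sp, #16]

   is emitted instead of two str instructions, with the second half of the
   PARALLEL marked frame-related explicitly as described in the comment.  */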
7973 
7974 /* Emit code to restore the callee-saved registers from register number START
7975    up to and including LIMIT.  Restore from the stack offset START_OFFSET,
7976    skipping any write-back candidates if SKIP_WB is true.  Write the
7977    appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
7978 
7979 static void
7980 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7981 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
7982 {
7983   unsigned regno;
7984   unsigned regno2;
7985   poly_int64 offset;
7986   rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7987 
7988   for (regno = aarch64_next_callee_save (start, limit);
7989        regno <= limit;
7990        regno = aarch64_next_callee_save (regno + 1, limit))
7991     {
7992       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7993       if (cfun->machine->reg_is_wrapped_separately[regno])
7994 	continue;
7995 
7996       rtx reg, mem;
7997 
7998       if (skip_wb
7999 	  && (regno == cfun->machine->frame.wb_candidate1
8000 	      || regno == cfun->machine->frame.wb_candidate2))
8001 	continue;
8002 
8003       machine_mode mode = aarch64_reg_save_mode (regno);
8004       reg = gen_rtx_REG (mode, regno);
8005       offset = start_offset + cfun->machine->frame.reg_offset[regno];
8006       rtx base_rtx = stack_pointer_rtx;
8007       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8008 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8009 					     offset, ptrue);
8010       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8011 
8012       if (!aarch64_sve_mode_p (mode)
8013 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
8014 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
8015 	  && known_eq (GET_MODE_SIZE (mode),
8016 		       cfun->machine->frame.reg_offset[regno2]
8017 		       - cfun->machine->frame.reg_offset[regno]))
8018 	{
8019 	  rtx reg2 = gen_rtx_REG (mode, regno2);
8020 	  rtx mem2;
8021 
8022 	  offset += GET_MODE_SIZE (mode);
8023 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8024 	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8025 
8026 	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8027 	  regno = regno2;
8028 	}
8029       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8030 	emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8031       else if (aarch64_sve_mode_p (mode))
8032 	emit_insn (gen_rtx_SET (reg, mem));
8033       else
8034 	emit_move_insn (reg, mem);
8035       if (frame_related_p)
8036 	*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8037     }
8038 }
8039 
8040 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8041    of MODE.  */
8042 
8043 static inline bool
8044 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8045 {
8046   HOST_WIDE_INT multiple;
8047   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8048 	  && IN_RANGE (multiple, -8, 7));
8049 }
8050 
8051 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8052    of MODE.  */
8053 
8054 static inline bool
8055 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8056 {
8057   HOST_WIDE_INT multiple;
8058   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8059 	  && IN_RANGE (multiple, -32, 31));
8060 }
8061 
8062 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8063    of MODE.  */
8064 
8065 static inline bool
8066 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8067 {
8068   HOST_WIDE_INT multiple;
8069   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8070 	  && IN_RANGE (multiple, 0, 63));
8071 }
8072 
8073 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8074    of MODE.  */
8075 
8076 bool
8077 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8078 {
8079   HOST_WIDE_INT multiple;
8080   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8081 	  && IN_RANGE (multiple, -64, 63));
8082 }
8083 
8084 /* Return true if OFFSET is a signed 9-bit value.  */
8085 
8086 bool
8087 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8088 				       poly_int64 offset)
8089 {
8090   HOST_WIDE_INT const_offset;
8091   return (offset.is_constant (&const_offset)
8092 	  && IN_RANGE (const_offset, -256, 255));
8093 }
8094 
8095 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8096    of MODE.  */
8097 
8098 static inline bool
8099 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8100 {
8101   HOST_WIDE_INT multiple;
8102   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8103 	  && IN_RANGE (multiple, -256, 255));
8104 }
8105 
8106 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8107    of MODE.  */
8108 
8109 static inline bool
8110 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8111 {
8112   HOST_WIDE_INT multiple;
8113   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8114 	  && IN_RANGE (multiple, 0, 4095));
8115 }
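
/* Illustrative values for the predicates above, taking DImode (8-byte)
   accesses and constant offsets:

     offset_4bit_signed_scaled_p    (DImode, -64)        true   (-64 = -8 * 8)
     offset_6bit_unsigned_scaled_p  (DImode, 504)        true   (504 = 63 * 8)
     offset_6bit_unsigned_scaled_p  (DImode, 512)        false  (multiple 64 > 63)
     aarch64_offset_7bit_signed_scaled_p (DImode, -512)  true   (-512 = -64 * 8)
     offset_12bit_unsigned_scaled_p (DImode, 32760)      true   (32760 = 4095 * 8)
     offset_12bit_unsigned_scaled_p (DImode, 4)          false  (not a multiple of 8)

   These are example values only; any multiple in the stated range behaves
   the same way.  */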
8116 
8117 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
8118 
8119 static sbitmap
8120 aarch64_get_separate_components (void)
8121 {
8122   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8123   bitmap_clear (components);
8124 
8125   /* The registers we need saved to the frame.  */
8126   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8127     if (aarch64_register_saved_on_entry (regno))
8128       {
8129 	/* Punt on saves and restores that use ST1D and LD1D.  We could
8130 	   try to be smarter, but it would involve making sure that the
8131 	   spare predicate register itself is safe to use at the save
8132 	   and restore points.  Also, when a frame pointer is being used,
8133 	   the slots are often out of reach of ST1D and LD1D anyway.  */
8134 	machine_mode mode = aarch64_reg_save_mode (regno);
8135 	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8136 	  continue;
8137 
8138 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8139 
8140 	/* If the register is saved in the first SVE save slot, we use
8141 	   it as a stack probe for -fstack-clash-protection.  */
8142 	if (flag_stack_clash_protection
8143 	    && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
8144 	    && known_eq (offset, 0))
8145 	  continue;
8146 
8147 	/* Get the offset relative to the register we'll use.  */
8148 	if (frame_pointer_needed)
8149 	  offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8150 	else
8151 	  offset += crtl->outgoing_args_size;
8152 
8153 	/* Check that we can access the stack slot of the register with one
8154 	   direct load with no adjustments needed.  */
8155 	if (aarch64_sve_mode_p (mode)
8156 	    ? offset_9bit_signed_scaled_p (mode, offset)
8157 	    : offset_12bit_unsigned_scaled_p (mode, offset))
8158 	  bitmap_set_bit (components, regno);
8159       }
8160 
8161   /* Don't mess with the hard frame pointer.  */
8162   if (frame_pointer_needed)
8163     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8164 
8165   /* If the spare predicate register used by big-endian SVE code
8166      is call-preserved, it must be saved in the main prologue
8167      before any saves that use it.  */
8168   if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
8169     bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
8170 
8171   unsigned reg1 = cfun->machine->frame.wb_candidate1;
8172   unsigned reg2 = cfun->machine->frame.wb_candidate2;
8173   /* If registers have been chosen to be stored/restored with
8174      writeback, don't interfere with them, to avoid having to output explicit
8175      stack adjustment instructions.  */
8176   if (reg2 != INVALID_REGNUM)
8177     bitmap_clear_bit (components, reg2);
8178   if (reg1 != INVALID_REGNUM)
8179     bitmap_clear_bit (components, reg1);
8180 
8181   bitmap_clear_bit (components, LR_REGNUM);
8182   bitmap_clear_bit (components, SP_REGNUM);
8183 
8184   return components;
8185 }
8186 
8187 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
8188 
8189 static sbitmap
8190 aarch64_components_for_bb (basic_block bb)
8191 {
8192   bitmap in = DF_LIVE_IN (bb);
8193   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8194   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8195 
8196   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8197   bitmap_clear (components);
8198 
8199   /* Clobbered registers don't generate values in any meaningful sense,
8200      since nothing after the clobber can rely on their value.  And we can't
8201      say that partially-clobbered registers are unconditionally killed,
8202      because whether they're killed or not depends on the mode of the
8203      value they're holding.  Thus partially call-clobbered registers
8204      appear in neither the kill set nor the gen set.
8205 
8206      Check manually for any calls that clobber more of a register than the
8207      current function can.  */
8208   function_abi_aggregator callee_abis;
8209   rtx_insn *insn;
8210   FOR_BB_INSNS (bb, insn)
8211     if (CALL_P (insn))
8212       callee_abis.note_callee_abi (insn_callee_abi (insn));
8213   HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8214 
8215   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
8216   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8217     if (!fixed_regs[regno]
8218 	&& !crtl->abi->clobbers_full_reg_p (regno)
8219 	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8220 	    || bitmap_bit_p (in, regno)
8221 	    || bitmap_bit_p (gen, regno)
8222 	    || bitmap_bit_p (kill, regno)))
8223       {
8224 	bitmap_set_bit (components, regno);
8225 
8226 	/* If there is a callee-save register at an adjacent offset, add it
8227 	   as well, to increase the use of LDP/STP.  */
8228 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8229 	unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
8230 
8231 	if (regno2 <= LAST_SAVED_REGNUM)
8232 	  {
8233 	    poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8234 	    if (regno < regno2
8235 		? known_eq (offset + 8, offset2)
8236 		: multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
8237 	      bitmap_set_bit (components, regno2);
8238 	  }
8239       }
8240 
8241   return components;
8242 }
8243 
8244 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8245    Nothing to do for aarch64.  */
8246 
8247 static void
8248 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8249 {
8250 }
8251 
8252 /* Return the next set bit in BMP from START onwards.  Return the total number
8253    of bits in BMP if no set bit is found at or after START.  */
8254 
8255 static unsigned int
8256 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8257 {
8258   unsigned int nbits = SBITMAP_SIZE (bmp);
8259   if (start == nbits)
8260     return start;
8261 
8262   gcc_assert (start < nbits);
8263   for (unsigned int i = start; i < nbits; i++)
8264     if (bitmap_bit_p (bmp, i))
8265       return i;
8266 
8267   return nbits;
8268 }
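
/* For example, if only bits 19 and 20 of BMP are set, then
   aarch64_get_next_set_bit (bmp, 0) and aarch64_get_next_set_bit (bmp, 19)
   both return 19, aarch64_get_next_set_bit (bmp, 20) returns 20, and
   aarch64_get_next_set_bit (bmp, 21) returns SBITMAP_SIZE (bmp).  */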
8269 
8270 /* Do the work for aarch64_emit_prologue_components and
8271    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
8272    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
8273    for these components or the epilogue sequence.  That is, it determines
8274    whether we should emit stores or loads and what kind of CFA notes to attach
8275    to the insns.  Otherwise the logic for the two sequences is very
8276    similar.  */
8277 
8278 static void
8279 aarch64_process_components (sbitmap components, bool prologue_p)
8280 {
8281   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8282 			     ? HARD_FRAME_POINTER_REGNUM
8283 			     : STACK_POINTER_REGNUM);
8284 
8285   unsigned last_regno = SBITMAP_SIZE (components);
8286   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
8287   rtx_insn *insn = NULL;
8288 
8289   while (regno != last_regno)
8290     {
8291       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8292       machine_mode mode = aarch64_reg_save_mode (regno);
8293 
8294       rtx reg = gen_rtx_REG (mode, regno);
8295       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8296       if (frame_pointer_needed)
8297 	offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8298       else
8299 	offset += crtl->outgoing_args_size;
8300 
8301       rtx addr = plus_constant (Pmode, ptr_reg, offset);
8302       rtx mem = gen_frame_mem (mode, addr);
8303 
8304       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
8305       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
8306       /* No more registers to handle after REGNO.
8307 	 Emit a single save/restore and exit.  */
8308       if (regno2 == last_regno)
8309 	{
8310 	  insn = emit_insn (set);
8311 	  if (frame_related_p)
8312 	    {
8313 	      RTX_FRAME_RELATED_P (insn) = 1;
8314 	      if (prologue_p)
8315 		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8316 	      else
8317 		add_reg_note (insn, REG_CFA_RESTORE, reg);
8318 	    }
8319 	  break;
8320 	}
8321 
8322       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8323       /* The next register is not of the same class or its offset is not
8324 	 mergeable with the current one into a pair.  */
8325       if (aarch64_sve_mode_p (mode)
8326 	  || !satisfies_constraint_Ump (mem)
8327 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
8328 	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
8329 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
8330 		       GET_MODE_SIZE (mode)))
8331 	{
8332 	  insn = emit_insn (set);
8333 	  if (frame_related_p)
8334 	    {
8335 	      RTX_FRAME_RELATED_P (insn) = 1;
8336 	      if (prologue_p)
8337 		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8338 	      else
8339 		add_reg_note (insn, REG_CFA_RESTORE, reg);
8340 	    }
8341 
8342 	  regno = regno2;
8343 	  continue;
8344 	}
8345 
8346       bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
8347 
8348       /* REGNO2 can be saved/restored in a pair with REGNO.  */
8349       rtx reg2 = gen_rtx_REG (mode, regno2);
8350       if (frame_pointer_needed)
8351 	offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8352       else
8353 	offset2 += crtl->outgoing_args_size;
8354       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
8355       rtx mem2 = gen_frame_mem (mode, addr2);
8356       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
8357 			     : gen_rtx_SET (reg2, mem2);
8358 
8359       if (prologue_p)
8360 	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
8361       else
8362 	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8363 
8364       if (frame_related_p || frame_related2_p)
8365 	{
8366 	  RTX_FRAME_RELATED_P (insn) = 1;
8367 	  if (prologue_p)
8368 	    {
8369 	      if (frame_related_p)
8370 		add_reg_note (insn, REG_CFA_OFFSET, set);
8371 	      if (frame_related2_p)
8372 		add_reg_note (insn, REG_CFA_OFFSET, set2);
8373 	    }
8374 	  else
8375 	    {
8376 	      if (frame_related_p)
8377 		add_reg_note (insn, REG_CFA_RESTORE, reg);
8378 	      if (frame_related2_p)
8379 		add_reg_note (insn, REG_CFA_RESTORE, reg2);
8380 	    }
8381 	}
8382 
8383       regno = aarch64_get_next_set_bit (components, regno2 + 1);
8384     }
8385 }
8386 
8387 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
8388 
8389 static void
8390 aarch64_emit_prologue_components (sbitmap components)
8391 {
8392   aarch64_process_components (components, true);
8393 }
8394 
8395 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
8396 
8397 static void
8398 aarch64_emit_epilogue_components (sbitmap components)
8399 {
8400   aarch64_process_components (components, false);
8401 }
8402 
8403 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
8404 
8405 static void
8406 aarch64_set_handled_components (sbitmap components)
8407 {
8408   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8409     if (bitmap_bit_p (components, regno))
8410       cfun->machine->reg_is_wrapped_separately[regno] = true;
8411 }
8412 
8413 /* On AArch64 we have an ABI-defined safe buffer.  This constant is used to
8414    determine the probe offset for alloca.  */
8415 
8416 static HOST_WIDE_INT
8417 aarch64_stack_clash_protection_alloca_probe_range (void)
8418 {
8419   return STACK_CLASH_CALLER_GUARD;
8420 }
8421 
8422 
8423 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
8424    registers.  If POLY_SIZE is not large enough to require a probe, this
8425    function only adjusts the stack.  When allocating the stack space,
8426    FRAME_RELATED_P indicates whether the allocation is frame related.
8427    FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
8428    arguments.  If we are, then we ensure that any allocation larger than the
8429    ABI-defined buffer gets a probe, so that the invariant of having a 1KB
8430    buffer is maintained.
8431 
8432    We emit barriers after each stack adjustment to prevent optimizations from
8433    breaking the invariant that we never drop the stack more than a page.  This
8434    invariant is needed to make it easier to correctly handle asynchronous
8435    events: if we were to allow the stack to be dropped by more than a page
8436    and then emit multiple probes for it, and a signal arrived somewhere in
8437    between, the signal handler would not know the state of the stack and
8438    could make no assumptions about which pages have been probed.  */
8439 
8440 static void
8441 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
8442 					poly_int64 poly_size,
8443 					bool frame_related_p,
8444 					bool final_adjustment_p)
8445 {
8446   HOST_WIDE_INT guard_size
8447     = 1 << param_stack_clash_protection_guard_size;
8448   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8449   HOST_WIDE_INT min_probe_threshold
8450     = (final_adjustment_p
8451        ? guard_used_by_caller
8452        : guard_size - guard_used_by_caller);
8453   /* When doing the final adjustment for the outgoing arguments, take into
8454      account any unprobed space there is above the current SP.  There are
8455      two cases:
8456 
8457      - When saving SVE registers below the hard frame pointer, we force
8458        the lowest save to take place in the prologue before doing the final
8459        adjustment (i.e. we don't allow the save to be shrink-wrapped).
8460        This acts as a probe at SP, so there is no unprobed space.
8461 
8462      - When there are no SVE register saves, we use the store of the link
8463        register as a probe.  We can't assume that LR was saved at position 0
8464        though, so treat any space below it as unprobed.  */
8465   if (final_adjustment_p
8466       && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
8467     {
8468       poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
8469       if (known_ge (lr_offset, 0))
8470 	min_probe_threshold -= lr_offset.to_constant ();
8471       else
8472 	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
8473     }
8474 
8475   poly_int64 frame_size = cfun->machine->frame.frame_size;
8476 
8477   /* We should always have a positive probe threshold.  */
8478   gcc_assert (min_probe_threshold > 0);
8479 
8480   if (flag_stack_clash_protection && !final_adjustment_p)
8481     {
8482       poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8483       poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8484       poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8485 
8486       if (known_eq (frame_size, 0))
8487 	{
8488 	  dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
8489 	}
8490       else if (known_lt (initial_adjust + sve_callee_adjust,
8491 			 guard_size - guard_used_by_caller)
8492 	       && known_lt (final_adjust, guard_used_by_caller))
8493 	{
8494 	  dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
8495 	}
8496     }
8497 
8498   /* If SIZE is not large enough to require probing, just adjust the stack and
8499      exit.  */
8500   if (known_lt (poly_size, min_probe_threshold)
8501       || !flag_stack_clash_protection)
8502     {
8503       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
8504       return;
8505     }
8506 
8507   HOST_WIDE_INT size;
8508   /* Handle the SVE non-constant case first.  */
8509   if (!poly_size.is_constant (&size))
8510     {
8511      if (dump_file)
8512       {
8513 	fprintf (dump_file, "Stack clash SVE prologue: ");
8514 	print_dec (poly_size, dump_file);
8515 	fprintf (dump_file, " bytes, dynamic probing will be required.\n");
8516       }
8517 
8518       /* First calculate the amount of bytes we're actually spilling.  */
8519       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
8520 			  poly_size, temp1, temp2, false, true);
8521 
8522       rtx_insn *insn = get_last_insn ();
8523 
8524       if (frame_related_p)
8525 	{
8526 	  /* This is done to provide unwinding information for the stack
8527 	     adjustments we're about to do; however, to prevent the optimizers
8528 	     from removing the R11 move and leaving the CFA note (which would be
8529 	     very wrong), we tie the old and new stack pointers together.
8530 	     The tie will expand to nothing, but the optimizers will not touch
8531 	     the instruction.  */
8532 	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8533 	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
8534 	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
8535 
8536 	  /* We want the CFA independent of the stack pointer for the
8537 	     duration of the loop.  */
8538 	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
8539 	  RTX_FRAME_RELATED_P (insn) = 1;
8540 	}
8541 
8542       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
8543       rtx guard_const = gen_int_mode (guard_size, Pmode);
8544 
8545       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
8546 						   stack_pointer_rtx, temp1,
8547 						   probe_const, guard_const));
8548 
8549       /* Now reset the CFA register if needed.  */
8550       if (frame_related_p)
8551 	{
8552 	  add_reg_note (insn, REG_CFA_DEF_CFA,
8553 			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
8554 				      gen_int_mode (poly_size, Pmode)));
8555 	  RTX_FRAME_RELATED_P (insn) = 1;
8556 	}
8557 
8558       return;
8559     }
8560 
8561   if (dump_file)
8562     fprintf (dump_file,
8563 	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
8564 	     " bytes, probing will be required.\n", size);
8565 
8566   /* Round size to the nearest multiple of guard_size, and calculate the
8567      residual as the difference between the original size and the rounded
8568      size.  */
8569   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
8570   HOST_WIDE_INT residual = size - rounded_size;
8571 
8572   /* We can handle a small number of allocations/probes inline.  Otherwise
8573      punt to a loop.  */
8574   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
8575     {
8576       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
8577 	{
8578 	  aarch64_sub_sp (NULL, temp2, guard_size, true);
8579 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8580 					   guard_used_by_caller));
8581 	  emit_insn (gen_blockage ());
8582 	}
8583       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
8584     }
8585   else
8586     {
8587       /* Compute the ending address.  */
8588       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
8589 			  temp1, NULL, false, true);
8590       rtx_insn *insn = get_last_insn ();
8591 
8592       /* For the initial allocation, we don't have a frame pointer
8593 	 set up, so we always need CFI notes.  If we're doing the
8594 	 final allocation, then we may have a frame pointer, in which
8595 	 case it is the CFA, otherwise we need CFI notes.
8596 
8597 	 We can determine which allocation we are doing by looking at
8598 	 the value of FRAME_RELATED_P since the final allocations are not
8599 	 frame related.  */
8600       if (frame_related_p)
8601 	{
8602 	  /* We want the CFA independent of the stack pointer for the
8603 	     duration of the loop.  */
8604 	  add_reg_note (insn, REG_CFA_DEF_CFA,
8605 			plus_constant (Pmode, temp1, rounded_size));
8606 	  RTX_FRAME_RELATED_P (insn) = 1;
8607 	}
8608 
8609       /* This allocates and probes the stack.  Note that this re-uses some of
8610 	 the existing Ada stack protection code.  However, we are guaranteed not
8611 	 to enter the non-loop or residual branches of that code.
8612 
8613 	 The non-loop part won't be entered because if our allocation amount
8614 	 doesn't require a loop, the case above would handle it.
8615 
8616 	 The residual branch won't be entered because TEMP1 is a multiple of
8617 	 the allocation size.  The residual will always be 0.  As such, the only
8618 	 part we are actually using from that code is the loop setup.  The
8619 	 actual probing is done in aarch64_output_probe_stack_range.  */
8620       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
8621 					       stack_pointer_rtx, temp1));
8622 
8623       /* Now reset the CFA register if needed.  */
8624       if (frame_related_p)
8625 	{
8626 	  add_reg_note (insn, REG_CFA_DEF_CFA,
8627 			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
8628 	  RTX_FRAME_RELATED_P (insn) = 1;
8629 	}
8630 
8631       emit_insn (gen_blockage ());
8632       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
8633     }
8634 
8635   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
8636      be probed.  This maintains the requirement that each page is probed at
8637      least once.  For initial probing we probe only if the allocation is
8638      more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
8639      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
8640      GUARD_SIZE.  This means that for any allocation large enough to trigger
8641      a probe here, we'll have at least one, and for allocations that are not
8642      large enough for this code to emit anything, the page would have been
8643      probed by the saving of FP/LR, either by this function or any callees.  If
8644      we don't have any callees then we won't have more stack adjustments and so
8645      are still safe.  */
8646   if (residual)
8647     {
8648       HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
8649       /* If we're doing final adjustments, and we've done any full page
8650 	 allocations then any residual needs to be probed.  */
8651       if (final_adjustment_p && rounded_size != 0)
8652 	min_probe_threshold = 0;
8653       /* If doing a small final adjustment, we always probe at offset 0.
8654 	 This is done to avoid issues when LR is not at position 0 or when
8655 	 the final adjustment is smaller than the probing offset.  */
8656       else if (final_adjustment_p && rounded_size == 0)
8657 	residual_probe_offset = 0;
8658 
8659       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
8660       if (residual >= min_probe_threshold)
8661 	{
8662 	  if (dump_file)
8663 	    fprintf (dump_file,
8664 		     "Stack clash AArch64 prologue residuals: "
8665 		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
8666 		     "\n", residual);
8667 
8668 	    emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8669 					     residual_probe_offset));
8670 	  emit_insn (gen_blockage ());
8671 	}
8672     }
8673 }
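
/* Illustrative example of the constant-size path above, assuming the default
   64KB guard and the 1KB caller-reserved buffer: an initial (non-final)
   allocation of 150000 bytes gives

     rounded_size = ROUND_DOWN (150000, 65536) = 131072
     residual     = 150000 - 131072            = 18928

   131072 is two guard-size pages, so the allocation is unrolled inline: each
   iteration drops SP by 65536 and probes at [sp, #1024].  The 18928-byte
   residual is below the probing threshold for initial allocations
   (guard_size - 1KB), so it is simply allocated; the subsequent FP/LR saves
   act as the implicit probe for that page, as described above.  */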
8674 
8675 /* Return 1 if the register is used by the epilogue.  We need to say the
8676    return register is used, but only after epilogue generation is complete.
8677    Note that in the case of sibcalls, the values "used by the epilogue" are
8678    considered live at the start of the called function.
8679 
8680    For SIMD functions we need to return 1 for FP registers that are saved and
8681    restored by a function but are not zero in call_used_regs.  If we do not do
8682    this, optimizations may remove the restore of the register.  */
8683 
8684 int
8685 aarch64_epilogue_uses (int regno)
8686 {
8687   if (epilogue_completed)
8688     {
8689       if (regno == LR_REGNUM)
8690 	return 1;
8691     }
8692   return 0;
8693 }
8694 
8695 /* AArch64 stack frames generated by this compiler look like:
8696 
8697 	+-------------------------------+
8698 	|                               |
8699 	|  incoming stack arguments     |
8700 	|                               |
8701 	+-------------------------------+
8702 	|                               | <-- incoming stack pointer (aligned)
8703 	|  callee-allocated save area   |
8704 	|  for register varargs         |
8705 	|                               |
8706 	+-------------------------------+
8707 	|  local variables              | <-- frame_pointer_rtx
8708 	|                               |
8709 	+-------------------------------+
8710 	|  padding                      | \
8711 	+-------------------------------+  |
8712 	|  callee-saved registers       |  | frame.saved_regs_size
8713 	+-------------------------------+  |
8714 	|  LR'                          |  |
8715 	+-------------------------------+  |
8716 	|  FP'                          |  |
8717 	+-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
8718 	|  SVE vector registers         |  | \
8719 	+-------------------------------+  |  | below_hard_fp_saved_regs_size
8720 	|  SVE predicate registers      | /  /
8721 	+-------------------------------+
8722 	|  dynamic allocation           |
8723 	+-------------------------------+
8724 	|  padding                      |
8725 	+-------------------------------+
8726 	|  outgoing stack arguments     | <-- arg_pointer
8727         |                               |
8728 	+-------------------------------+
8729 	|                               | <-- stack_pointer_rtx (aligned)
8730 
8731    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
8732    but leave frame_pointer_rtx and hard_frame_pointer_rtx
8733    unchanged.
8734 
8735    By default for stack-clash we assume the guard is at least 64KB, but this
8736    value is configurable to either 4KB or 64KB.  We also force the guard size to
8737    be the same as the probing interval and both values are kept in sync.
8738 
8739    With those assumptions the callee can allocate up to 63KB (or 3KB depending
8740    on the guard size) of stack space without probing.
8741 
8742    When probing is needed, we emit a probe at the start of the prologue
8743    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
8744 
8745    We have to track how much space has been allocated and the only stores
8746    to the stack we track as implicit probes are the FP/LR stores.
8747 
8748    For outgoing arguments we probe if the size is larger than 1KB, such that
8749    the ABI specified buffer is maintained for the next callee.
8750 
8751    The following registers are reserved during frame layout and should not be
8752    used for any other purpose:
8753 
8754    - r11: Used by stack clash protection when SVE is enabled, and also
8755 	  as an anchor register when saving and restoring registers
8756    - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
8757    - r14 and r15: Used for speculation tracking.
8758    - r16(IP0), r17(IP1): Used by indirect tailcalls.
8759    - r30(LR), r29(FP): Used by standard frame layout.
8760 
8761    These registers must be avoided in frame layout related code unless the
8762    explicit intention is to interact with one of the features listed above.  */
8763 
8764 /* Generate the prologue instructions for entry into a function.
8765    Establish the stack frame by decreasing the stack pointer with a
8766    properly calculated size and, if necessary, create a frame record
8767    filled with the values of LR and previous frame pointer.  The
8768    current FP is also set up if it is in use.  */
8769 
8770 void
8771 aarch64_expand_prologue (void)
8772 {
8773   poly_int64 frame_size = cfun->machine->frame.frame_size;
8774   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8775   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8776   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8777   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8778   poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8779   poly_int64 below_hard_fp_saved_regs_size
8780     = cfun->machine->frame.below_hard_fp_saved_regs_size;
8781   unsigned reg1 = cfun->machine->frame.wb_candidate1;
8782   unsigned reg2 = cfun->machine->frame.wb_candidate2;
8783   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
8784   rtx_insn *insn;
8785 
8786   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
8787     {
8788       /* Fold the SVE allocation into the initial allocation.
8789 	 We don't do this in aarch64_layout_frame to avoid pessimizing
8790 	 the epilogue code.  */
8791       initial_adjust += sve_callee_adjust;
8792       sve_callee_adjust = 0;
8793     }
8794 
8795   /* Sign return address for functions.  */
8796   if (aarch64_return_address_signing_enabled ())
8797     {
8798       switch (aarch64_ra_sign_key)
8799 	{
8800 	  case AARCH64_KEY_A:
8801 	    insn = emit_insn (gen_paciasp ());
8802 	    break;
8803 	  case AARCH64_KEY_B:
8804 	    insn = emit_insn (gen_pacibsp ());
8805 	    break;
8806 	  default:
8807 	    gcc_unreachable ();
8808 	}
8809       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8810       RTX_FRAME_RELATED_P (insn) = 1;
8811     }
8812 
8813   if (flag_stack_usage_info)
8814     current_function_static_stack_size = constant_lower_bound (frame_size);
8815 
8816   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
8817     {
8818       if (crtl->is_leaf && !cfun->calls_alloca)
8819 	{
8820 	  if (maybe_gt (frame_size, PROBE_INTERVAL)
8821 	      && maybe_gt (frame_size, get_stack_check_protect ()))
8822 	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
8823 					    (frame_size
8824 					     - get_stack_check_protect ()));
8825 	}
8826       else if (maybe_gt (frame_size, 0))
8827 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
8828     }
8829 
8830   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8831   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8832 
8833   /* In theory we should never have both an initial adjustment
8834      and a callee save adjustment.  Verify that is the case since the
8835      code below does not handle it for -fstack-clash-protection.  */
8836   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8837 
8838   /* Will only probe if the initial adjustment is larger than the guard
8839      less the amount of the guard reserved for use by the caller's
8840      outgoing args.  */
8841   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
8842 					  true, false);
8843 
8844   if (callee_adjust != 0)
8845     aarch64_push_regs (reg1, reg2, callee_adjust);
8846 
8847   /* The offset of the frame chain record (if any) from the current SP.  */
8848   poly_int64 chain_offset = (initial_adjust + callee_adjust
8849 			     - cfun->machine->frame.hard_fp_offset);
8850   gcc_assert (known_ge (chain_offset, 0));
8851 
8852   /* The offset of the bottom of the save area from the current SP.  */
8853   poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8854 
8855   if (emit_frame_chain)
8856     {
8857       if (callee_adjust == 0)
8858 	{
8859 	  reg1 = R29_REGNUM;
8860 	  reg2 = R30_REGNUM;
8861 	  aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8862 				     false, false);
8863 	}
8864       else
8865 	gcc_assert (known_eq (chain_offset, 0));
8866       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
8867 			  stack_pointer_rtx, chain_offset,
8868 			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
8869       if (frame_pointer_needed && !frame_size.is_constant ())
8870 	{
8871 	  /* Variable-sized frames need to describe the save slot
8872 	     address using DW_CFA_expression rather than DW_CFA_offset.
8873 	     This means that, without taking further action, the
8874 	     locations of the registers that we've already saved would
8875 	     remain based on the stack pointer even after we redefine
8876 	     the CFA based on the frame pointer.  We therefore need new
8877 	     DW_CFA_expressions to re-express the save slots with addresses
8878 	     based on the frame pointer.  */
8879 	  rtx_insn *insn = get_last_insn ();
8880 	  gcc_assert (RTX_FRAME_RELATED_P (insn));
8881 
8882 	  /* Add an explicit CFA definition if this was previously
8883 	     implicit.  */
8884 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8885 	    {
8886 	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
8887 				       callee_offset);
8888 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
8889 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
8890 	    }
8891 
8892 	  /* Change the save slot expressions for the registers that
8893 	     we've already saved.  */
8894 	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8895 				      hard_frame_pointer_rtx, UNITS_PER_WORD);
8896 	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8897 				      hard_frame_pointer_rtx, 0);
8898 	}
8899       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
8900     }
8901 
8902   aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8903 			     callee_adjust != 0 || emit_frame_chain,
8904 			     emit_frame_chain);
8905   if (maybe_ne (sve_callee_adjust, 0))
8906     {
8907       gcc_assert (!flag_stack_clash_protection
8908 		  || known_eq (initial_adjust, 0));
8909       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8910 					      sve_callee_adjust,
8911 					      !frame_pointer_needed, false);
8912       saved_regs_offset += sve_callee_adjust;
8913     }
8914   aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8915 			     false, emit_frame_chain);
8916   aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8917 			     callee_adjust != 0 || emit_frame_chain,
8918 			     emit_frame_chain);
8919 
8920   /* We may need to probe the final adjustment if it is larger than the guard
8921      that is assumed by the callee.  */
8922   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
8923 					  !frame_pointer_needed, true);
8924 }
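
/* A purely illustrative prologue for a function with a frame chain, a 96-byte
   save-and-locals area above the hard frame pointer, no SVE state and 32
   bytes of outgoing arguments (register choices and offsets are made up):

     paciasp                          // only with return-address signing
     stp    x29, x30, [sp, #-96]!     // callee_adjust
     mov    x29, sp                   // establish the frame chain
     stp    x19, x20, [sp, #16]       // remaining callee saves
     sub    sp, sp, #32               // final_adjust (outgoing args)

   The corresponding epilogue (below) undoes these steps in reverse.  */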
8925 
8926 /* Return TRUE if we can use a simple_return insn.
8927 
8928    This function checks whether the callee-saved stack is empty, which
8929    means no restore actions are needed.  The pro_and_epilogue pass will use
8930    this to check whether the shrink-wrapping optimization is feasible.  */
8931 
8932 bool
8933 aarch64_use_return_insn_p (void)
8934 {
8935   if (!reload_completed)
8936     return false;
8937 
8938   if (crtl->profile)
8939     return false;
8940 
8941   return known_eq (cfun->machine->frame.frame_size, 0);
8942 }
8943 
8944 /* Generate the epilogue instructions for returning from a function.
8945    This is almost exactly the reverse of the prologue sequence, except
8946    that we need to insert barriers to avoid scheduling loads that read
8947    from a deallocated stack, and we optimize the unwind records by
8948    emitting them all together if possible.  */
8949 void
8950 aarch64_expand_epilogue (bool for_sibcall)
8951 {
8952   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8953   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8954   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8955   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8956   poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8957   poly_int64 below_hard_fp_saved_regs_size
8958     = cfun->machine->frame.below_hard_fp_saved_regs_size;
8959   unsigned reg1 = cfun->machine->frame.wb_candidate1;
8960   unsigned reg2 = cfun->machine->frame.wb_candidate2;
8961   rtx cfi_ops = NULL;
8962   rtx_insn *insn;
8963   /* A stack clash protection prologue may not have left EP0_REGNUM or
8964      EP1_REGNUM in a usable state.  The same is true for allocations
8965      with an SVE component, since we then need both temporary registers
8966      for each allocation.  For stack clash we are in a usable state if
8967      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
8968   HOST_WIDE_INT guard_size
8969     = 1 << param_stack_clash_protection_guard_size;
8970   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8971 
8972   /* We can re-use the registers when:
8973 
8974      (a) the deallocation amount is the same as the corresponding
8975 	 allocation amount (which is false if we combine the initial
8976 	 and SVE callee save allocations in the prologue); and
8977 
8978      (b) the allocation amount doesn't need a probe (which is false
8979 	 if the amount is guard_size - guard_used_by_caller or greater).
8980 
8981      In such situations the register should remain live with the correct
8982      value.  */
8983   bool can_inherit_p = (initial_adjust.is_constant ()
8984 			&& final_adjust.is_constant ()
8985 			&& (!flag_stack_clash_protection
8986 			    || (known_lt (initial_adjust,
8987 					  guard_size - guard_used_by_caller)
8988 				&& known_eq (sve_callee_adjust, 0))));
8989 
8990   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
8991   bool need_barrier_p
8992     = maybe_ne (get_frame_size ()
8993 		+ cfun->machine->frame.saved_varargs_size, 0);
8994 
8995   /* Emit a barrier to prevent loads from a deallocated stack.  */
8996   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8997       || cfun->calls_alloca
8998       || crtl->calls_eh_return)
8999     {
9000       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9001       need_barrier_p = false;
9002     }
9003 
9004   /* Restore the stack pointer from the frame pointer if it may not
9005      be the same as the stack pointer.  */
9006   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9007   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9008   if (frame_pointer_needed
9009       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9010     /* If writeback is used when restoring callee-saves, the CFA
9011        is restored on the instruction doing the writeback.  */
9012     aarch64_add_offset (Pmode, stack_pointer_rtx,
9013 			hard_frame_pointer_rtx,
9014 			-callee_offset - below_hard_fp_saved_regs_size,
9015 			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
9016   else
9017      /* The case where we need to re-use the register here is very rare, so
9018 	avoid the complicated condition and just always emit a move if the
9019 	immediate doesn't fit.  */
9020      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
9021 
9022   /* Restore the vector registers before the predicate registers,
9023      so that we can use P4 as a temporary for big-endian SVE frames.  */
9024   aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
9025 				callee_adjust != 0, &cfi_ops);
9026   aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
9027 				false, &cfi_ops);
9028   if (maybe_ne (sve_callee_adjust, 0))
9029     aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
9030   aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
9031 				R0_REGNUM, R30_REGNUM,
9032 				callee_adjust != 0, &cfi_ops);
9033 
9034   if (need_barrier_p)
9035     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9036 
9037   if (callee_adjust != 0)
9038     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9039 
9040   /* If we have no register restore information, the CFA must have been
9041      defined in terms of the stack pointer since the end of the prologue.  */
9042   gcc_assert (cfi_ops || !frame_pointer_needed);
9043 
9044   if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9045     {
9046       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
9047       insn = get_last_insn ();
9048       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9049       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9050       RTX_FRAME_RELATED_P (insn) = 1;
9051       cfi_ops = NULL;
9052     }
9053 
9054   /* The liveness of EP0_REGNUM cannot be trusted across function calls either,
9055      so restrict the emit_move optimization to leaf functions.  */
9056   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
9057 		  (!can_inherit_p || !crtl->is_leaf
9058 		   || df_regs_ever_live_p (EP0_REGNUM)));
9059 
9060   if (cfi_ops)
9061     {
9062       /* Emit delayed restores and reset the CFA to be SP.  */
9063       insn = get_last_insn ();
9064       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9065       REG_NOTES (insn) = cfi_ops;
9066       RTX_FRAME_RELATED_P (insn) = 1;
9067     }
9068 
9069   /* We prefer to emit the combined return/authenticate instruction RETAA,
9070      however there are three cases in which we must instead emit an explicit
9071      authentication instruction.
9072 
9073 	1) Sibcalls don't return in a normal way, so if we're about to call one
9074 	   we must authenticate.
9075 
9076 	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
9077 	   generating code for !TARGET_ARMV8_3 we can't use it and must
9078 	   explicitly authenticate.
9079 
9080 	3) On an eh_return path we make extra stack adjustments to update the
9081 	   canonical frame address to be the exception handler's CFA.  We want
9082 	   to authenticate using the CFA of the function which calls eh_return.
9083     */
9084   if (aarch64_return_address_signing_enabled ()
9085       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
9086     {
9087       switch (aarch64_ra_sign_key)
9088 	{
9089 	  case AARCH64_KEY_A:
9090 	    insn = emit_insn (gen_autiasp ());
9091 	    break;
9092 	  case AARCH64_KEY_B:
9093 	    insn = emit_insn (gen_autibsp ());
9094 	    break;
9095 	  default:
9096 	    gcc_unreachable ();
9097 	}
9098       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9099       RTX_FRAME_RELATED_P (insn) = 1;
9100     }
9101 
9102   /* Stack adjustment for exception handler.  */
9103   if (crtl->calls_eh_return && !for_sibcall)
9104     {
9105       /* We need to unwind the stack by the offset computed by
9106 	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
9107 	 to be SP; letting the CFA move during this adjustment
9108 	 is just as correct as retaining the CFA from the body
9109 	 of the function.  Therefore, do nothing special.  */
9110       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
9111     }
9112 
9113   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
9114   if (!for_sibcall)
9115     emit_jump_insn (ret_rtx);
9116 }
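
/* For illustration only (not exhaustive): for a simple non-SVE frame with a
   frame pointer and no stack-clash probing, the sequence emitted above is
   roughly:

	add	sp, sp, #final_adjust		// or a frame-pointer-based restore
	ldp	x19, x20, [sp, #16]		// delayed callee-save restores
	ldp	x29, x30, [sp], #callee_adjust	// pop the frame record
	add	sp, sp, #initial_adjust
	ret

   The exact registers and offsets depend on the layout chosen by
   aarch64_layout_frame.  */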
9117 
9118 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
9119    normally or return to a previous frame after unwinding.
9120 
9121    An EH return uses a single shared return sequence.  The epilogue is
9122    exactly like a normal epilogue except that it has an extra input
9123    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
9124    that must be applied after the frame has been destroyed.  An extra label
9125    is inserted before the epilogue which initializes this register to zero,
9126    and this is the entry point for a normal return.
9127 
9128    An actual EH return updates the return address, initializes the stack
9129    adjustment and jumps directly into the epilogue (bypassing the zeroing
9130    of the adjustment).  Since the return address is typically saved on the
9131    stack when a function makes a call, the saved LR must be updated outside
9132    the epilogue.
9133 
9134    This poses problems as the store is generated well before the epilogue,
9135    so the offset of LR is not known yet.  Also optimizations will remove the
9136    store as it appears dead, even after the epilogue is generated (as the
9137    base or offset for loading LR is different in many cases).
9138 
9139    To avoid these problems this implementation forces the frame pointer
9140    in eh_return functions so that the location of LR is fixed and known early.
9141    It also marks the store volatile, so no optimization is permitted to
9142    remove the store.  */
9143 rtx
9144 aarch64_eh_return_handler_rtx (void)
9145 {
9146   rtx tmp = gen_frame_mem (Pmode,
9147     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
9148 
9149   /* Mark the store volatile, so no optimization is permitted to remove it.  */
9150   MEM_VOLATILE_P (tmp) = true;
9151   return tmp;
9152 }
9153 
9154 /* Output code to add DELTA to the first argument, and then jump
9155    to FUNCTION.  Used for C++ multiple inheritance.  */
9156 static void
9157 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
9158 			 HOST_WIDE_INT delta,
9159 			 HOST_WIDE_INT vcall_offset,
9160 			 tree function)
9161 {
9162   /* The this pointer is always in x0.  Note that this differs from
9163      Arm where the this pointer may be bumped to r1 if r0 is required
9164      to return a pointer to an aggregate.  On AArch64 a result value
9165      pointer will be in x8.  */
9166   int this_regno = R0_REGNUM;
9167   rtx this_rtx, temp0, temp1, addr, funexp;
9168   rtx_insn *insn;
9169   const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
9170 
9171   if (aarch64_bti_enabled ())
9172     emit_insn (gen_bti_c());
9173 
9174   reload_completed = 1;
9175   emit_note (NOTE_INSN_PROLOGUE_END);
9176 
9177   this_rtx = gen_rtx_REG (Pmode, this_regno);
9178   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
9179   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
9180 
9181   if (vcall_offset == 0)
9182     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
9183   else
9184     {
9185       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
9186 
9187       addr = this_rtx;
9188       if (delta != 0)
9189 	{
9190 	  if (delta >= -256 && delta < 256)
9191 	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
9192 				       plus_constant (Pmode, this_rtx, delta));
9193 	  else
9194 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
9195 				temp1, temp0, false);
9196 	}
9197 
9198       if (Pmode == ptr_mode)
9199 	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
9200       else
9201 	aarch64_emit_move (temp0,
9202 			   gen_rtx_ZERO_EXTEND (Pmode,
9203 						gen_rtx_MEM (ptr_mode, addr)));
9204 
9205       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
9206 	  addr = plus_constant (Pmode, temp0, vcall_offset);
9207       else
9208 	{
9209 	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
9210 					  Pmode);
9211 	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
9212 	}
9213 
9214       if (Pmode == ptr_mode)
9215 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
9216       else
9217 	aarch64_emit_move (temp1,
9218 			   gen_rtx_SIGN_EXTEND (Pmode,
9219 						gen_rtx_MEM (ptr_mode, addr)));
9220 
9221       emit_insn (gen_add2_insn (this_rtx, temp1));
9222     }
9223 
9224   /* Generate a tail call to the target function.  */
9225   if (!TREE_USED (function))
9226     {
9227       assemble_external (function);
9228       TREE_USED (function) = 1;
9229     }
9230   funexp = XEXP (DECL_RTL (function), 0);
9231   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
9232   rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
9233   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
9234   SIBLING_CALL_P (insn) = 1;
9235 
9236   insn = get_insns ();
9237   shorten_branches (insn);
9238 
9239   assemble_start_function (thunk, fnname);
9240   final_start_function (insn, file, 1);
9241   final (insn, file, 1);
9242   final_end_function ();
9243   assemble_end_function (thunk, fnname);
9244 
9245   /* Stop pretending to be a post-reload pass.  */
9246   reload_completed = 0;
9247 }
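
/* For example, a thunk with DELTA == 8 and VCALL_OFFSET == 0 assembles to
   something like:

	add	x0, x0, #8
	b	<function>

   i.e. the "this" pointer in x0 is adjusted and control transfers to the
   target via a tail call.  */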
9248 
9249 static bool
9250 aarch64_tls_referenced_p (rtx x)
9251 {
9252   if (!TARGET_HAVE_TLS)
9253     return false;
9254   subrtx_iterator::array_type array;
9255   FOR_EACH_SUBRTX (iter, array, x, ALL)
9256     {
9257       const_rtx x = *iter;
9258       if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
9259 	return true;
9260       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
9261 	 TLS offsets, not real symbol references.  */
9262       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9263 	iter.skip_subrtxes ();
9264     }
9265   return false;
9266 }
9267 
9268 
9269 /* Return true if val can be encoded as a 12-bit unsigned immediate with
9270    a left shift of 0 or 12 bits.  */
9271 bool
9272 aarch64_uimm12_shift (HOST_WIDE_INT val)
9273 {
9274   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
9275 	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
9276 	  );
9277 }
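
/* For example, 0x123 and 0x123000 are both accepted (ADD/SUB immediates
   with LSL #0 and LSL #12 respectively), whereas 0x123456 is rejected
   because it has significant bits in both halves.  */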
9278 
9279 /* Return the largest value that does not exceed VAL and fits as a 12-bit
9280    unsigned immediate with a left shift of 0 or 12.  */
9281 static HOST_WIDE_INT
9282 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
9283 {
9284   /* Check to see if the value fits in 24 bits, as that is the maximum we can
9285      handle correctly.  */
9286   gcc_assert ((val & 0xffffff) == val);
9287 
9288   if (((val & 0xfff) << 0) == val)
9289     return val;
9290 
9291   return val & (0xfff << 12);
9292 }
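
/* For example, 0x123456 is clamped to 0x123000, which a single ADD/SUB
   immediate can then apply; the remainder must be handled separately.  */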
9293 
9294 /* Return true if val is an immediate that can be loaded into a
9295    register by a MOVZ instruction.  */
9296 static bool
9297 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
9298 {
9299   if (GET_MODE_SIZE (mode) > 4)
9300     {
9301       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
9302 	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
9303 	return 1;
9304     }
9305   else
9306     {
9307       /* Ignore sign extension.  */
9308       val &= (HOST_WIDE_INT) 0xffffffff;
9309     }
9310   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
9311 	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
9312 }
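
/* For example, 0x1234 and 0x12340000 are MOVZ immediates, and so is
   0x0123000000000000 for 64-bit modes, whereas 0x00010001 is not because
   it has non-zero bits in two different 16-bit chunks.  */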
9313 
9314 /* Test whether:
9315 
9316      X = (X & AND_VAL) | IOR_VAL;
9317 
9318    can be implemented using:
9319 
9320      MOVK X, #(IOR_VAL >> shift), LSL #shift
9321 
9322    Return the shift if so, otherwise return -1.  */
9323 int
9324 aarch64_movk_shift (const wide_int_ref &and_val,
9325 		    const wide_int_ref &ior_val)
9326 {
9327   unsigned int precision = and_val.get_precision ();
9328   unsigned HOST_WIDE_INT mask = 0xffff;
9329   for (unsigned int shift = 0; shift < precision; shift += 16)
9330     {
9331       if (and_val == ~mask && (ior_val & mask) == ior_val)
9332 	return shift;
9333       mask <<= 16;
9334     }
9335   return -1;
9336 }
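
/* For example, with AND_VAL == 0xffffffff0000ffff and IOR_VAL == 0x12340000
   the function returns 16, corresponding to MOVK Xd, #0x1234, LSL #16.  */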
9337 
9338 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
9339    64-bit (DImode) integer.  */
9340 
9341 static unsigned HOST_WIDE_INT
9342 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
9343 {
9344   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
9345   while (size < 64)
9346     {
9347       val &= (HOST_WIDE_INT_1U << size) - 1;
9348       val |= val << size;
9349       size *= 2;
9350     }
9351   return val;
9352 }
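
/* For example, VAL == 0xab with a QImode inner mode is replicated to
   0xabababababababab, and VAL == 0x1234 with an HImode inner mode becomes
   0x1234123412341234.  */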
9353 
9354 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
9355 
9356 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
9357   {
9358     0x0000000100000001ull,
9359     0x0001000100010001ull,
9360     0x0101010101010101ull,
9361     0x1111111111111111ull,
9362     0x5555555555555555ull,
9363   };
9364 
9365 
9366 /* Return true if val is a valid bitmask immediate.  */
9367 
9368 bool
9369 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
9370 {
9371   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
9372   int bits;
9373 
9374   /* Check for a single sequence of one bits and return quickly if so.
9375      The special cases of all ones and all zeroes return false.  */
9376   val = aarch64_replicate_bitmask_imm (val_in, mode);
9377   tmp = val + (val & -val);
9378 
9379   if (tmp == (tmp & -tmp))
9380     return (val + 1) > 1;
9381 
9382   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
9383   if (mode == SImode)
9384     val = (val << 32) | (val & 0xffffffff);
9385 
9386   /* Invert if the immediate doesn't start with a zero bit - this means we
9387      only need to search for sequences of one bits.  */
9388   if (val & 1)
9389     val = ~val;
9390 
9391   /* Find the first set bit and set tmp to val with the first sequence of one
9392      bits removed.  Return success if there is a single sequence of ones.  */
9393   first_one = val & -val;
9394   tmp = val & (val + first_one);
9395 
9396   if (tmp == 0)
9397     return true;
9398 
9399   /* Find the next set bit and compute the difference in bit position.  */
9400   next_one = tmp & -tmp;
9401   bits = clz_hwi (first_one) - clz_hwi (next_one);
9402   mask = val ^ tmp;
9403 
9404   /* Check the bit position difference is a power of 2, and that the first
9405      sequence of one bits fits within 'bits' bits.  */
9406   if ((mask >> bits) != 0 || bits != (bits & -bits))
9407     return false;
9408 
9409   /* Check the sequence of one bits is repeated 64/bits times.  */
9410   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
9411 }
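
/* For example, 0x0000000003ffffc0 (a single run of ones) and
   0x00ff00ff00ff00ff (a repeating 16-bit element) are valid bitmask
   immediates, whereas 0, all-ones and 0x1234 are not.  */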
9412 
9413 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
9414    Assumed precondition: VAL_IN is not zero.  */
9415 
9416 unsigned HOST_WIDE_INT
9417 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
9418 {
9419   int lowest_bit_set = ctz_hwi (val_in);
9420   int highest_bit_set = floor_log2 (val_in);
9421   gcc_assert (val_in != 0);
9422 
9423   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
9424 	  (HOST_WIDE_INT_1U << lowest_bit_set));
9425 }
9426 
9427 /* Create a constant in which all bits outside the range from the lowest set
9428    bit to the highest set bit of VAL_IN are set to 1.  */
9429 
9430 unsigned HOST_WIDE_INT
9431 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
9432 {
9433   return val_in | ~aarch64_and_split_imm1 (val_in);
9434 }
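
/* For example, for VAL_IN == 0x210 (bits 4 and 9 set),
   aarch64_and_split_imm1 returns 0x3f0 (ones covering bits 4-9) and
   aarch64_and_split_imm2 returns 0xfffffffffffffe1f; the AND of the two
   masks gives back 0x210.  */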
9435 
9436 /* Return true if an AND with immediate VAL_IN can be split into two ANDs
   with valid bitmask immediates (VAL_IN itself being neither a bitmask nor
   a move immediate).  */
9437 
9438 bool
9439 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
9440 {
9441   scalar_int_mode int_mode;
9442   if (!is_a <scalar_int_mode> (mode, &int_mode))
9443     return false;
9444 
9445   if (aarch64_bitmask_imm (val_in, int_mode))
9446     return false;
9447 
9448   if (aarch64_move_imm (val_in, int_mode))
9449     return false;
9450 
9451   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
9452 
9453   return aarch64_bitmask_imm (imm2, int_mode);
9454 }
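
/* For example, VAL_IN == 0xff00000000000ff0 is neither a bitmask immediate
   nor a MOV immediate, but it is the AND of the two bitmask immediates
   0xfffffffffffffff0 and 0xff00000000000fff, so the function returns true
   and the AND can be split into two AND-immediate instructions.  */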
9455 
9456 /* Return true if val is an immediate that can be loaded into a
9457    register in a single instruction.  */
9458 bool
9459 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
9460 {
9461   scalar_int_mode int_mode;
9462   if (!is_a <scalar_int_mode> (mode, &int_mode))
9463     return false;
9464 
9465   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
9466     return 1;
9467   return aarch64_bitmask_imm (val, int_mode);
9468 }
9469 
9470 static bool
9471 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
9472 {
9473   if (GET_CODE (x) == HIGH)
9474     return true;
9475 
9476   /* There's no way to calculate VL-based values using relocations.  */
9477   subrtx_iterator::array_type array;
9478   FOR_EACH_SUBRTX (iter, array, x, ALL)
9479     if (GET_CODE (*iter) == CONST_POLY_INT)
9480       return true;
9481 
9482   poly_int64 offset;
9483   rtx base = strip_offset_and_salt (x, &offset);
9484   if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
9485     {
9486       /* We checked for POLY_INT_CST offsets above.  */
9487       if (aarch64_classify_symbol (base, offset.to_constant ())
9488 	  != SYMBOL_FORCE_TO_MEM)
9489 	return true;
9490       else
9491 	/* Avoid generating a 64-bit relocation in ILP32; leave
9492 	   to aarch64_expand_mov_immediate to handle it properly.  */
9493 	return mode != ptr_mode;
9494     }
9495 
9496   return aarch64_tls_referenced_p (x);
9497 }
9498 
9499 /* Implement TARGET_CASE_VALUES_THRESHOLD.
9500    The expansion for a table switch is quite expensive due to the number
9501    of instructions, the table lookup and the hard-to-predict indirect jump.
9502    When optimizing for speed with -O3 enabled, use the per-core tuning if
9503    set; otherwise use tables for > 16 cases as a tradeoff between size and
9504    performance.  When optimizing for size, use the default setting.  */
9505 
9506 static unsigned int
9507 aarch64_case_values_threshold (void)
9508 {
9509   /* Use the specified limit for the number of cases before using jump
9510      tables at higher optimization levels.  */
9511   if (optimize > 2
9512       && selected_cpu->tune->max_case_values != 0)
9513     return selected_cpu->tune->max_case_values;
9514   else
9515     return optimize_size ? default_case_values_threshold () : 17;
9516 }
9517 
9518 /* Return true if register REGNO is a valid index register.
9519    STRICT_P is true if REG_OK_STRICT is in effect.  */
9520 
9521 bool
9522 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
9523 {
9524   if (!HARD_REGISTER_NUM_P (regno))
9525     {
9526       if (!strict_p)
9527 	return true;
9528 
9529       if (!reg_renumber)
9530 	return false;
9531 
9532       regno = reg_renumber[regno];
9533     }
9534   return GP_REGNUM_P (regno);
9535 }
9536 
9537 /* Return true if register REGNO is a valid base register.
9538    STRICT_P is true if REG_OK_STRICT is in effect.  */
9539 
9540 bool
9541 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
9542 {
9543   if (!HARD_REGISTER_NUM_P (regno))
9544     {
9545       if (!strict_p)
9546 	return true;
9547 
9548       if (!reg_renumber)
9549 	return false;
9550 
9551       regno = reg_renumber[regno];
9552     }
9553 
9554   /* The fake registers will be eliminated to either the stack or
9555      hard frame pointer, both of which are usually valid base registers.
9556      Reload deals with the cases where the eliminated form isn't valid.  */
9557   return (GP_REGNUM_P (regno)
9558 	  || regno == SP_REGNUM
9559 	  || regno == FRAME_POINTER_REGNUM
9560 	  || regno == ARG_POINTER_REGNUM);
9561 }
9562 
9563 /* Return true if X is a valid base register.
9564    STRICT_P is true if REG_OK_STRICT is in effect.  */
9565 
9566 static bool
9567 aarch64_base_register_rtx_p (rtx x, bool strict_p)
9568 {
9569   if (!strict_p
9570       && SUBREG_P (x)
9571       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
9572     x = SUBREG_REG (x);
9573 
9574   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
9575 }
9576 
9577 /* Return true if address offset is a valid index.  If it is, fill in INFO
9578    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
9579 
9580 static bool
9581 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
9582 			machine_mode mode, bool strict_p)
9583 {
9584   enum aarch64_address_type type;
9585   rtx index;
9586   int shift;
9587 
9588   /* (reg:P) */
9589   if ((REG_P (x) || SUBREG_P (x))
9590       && GET_MODE (x) == Pmode)
9591     {
9592       type = ADDRESS_REG_REG;
9593       index = x;
9594       shift = 0;
9595     }
9596   /* (sign_extend:DI (reg:SI)) */
9597   else if ((GET_CODE (x) == SIGN_EXTEND
9598 	    || GET_CODE (x) == ZERO_EXTEND)
9599 	   && GET_MODE (x) == DImode
9600 	   && GET_MODE (XEXP (x, 0)) == SImode)
9601     {
9602       type = (GET_CODE (x) == SIGN_EXTEND)
9603 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9604       index = XEXP (x, 0);
9605       shift = 0;
9606     }
9607   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
9608   else if (GET_CODE (x) == MULT
9609 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9610 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9611 	   && GET_MODE (XEXP (x, 0)) == DImode
9612 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9613 	   && CONST_INT_P (XEXP (x, 1)))
9614     {
9615       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9616 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9617       index = XEXP (XEXP (x, 0), 0);
9618       shift = exact_log2 (INTVAL (XEXP (x, 1)));
9619     }
9620   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
9621   else if (GET_CODE (x) == ASHIFT
9622 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9623 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9624 	   && GET_MODE (XEXP (x, 0)) == DImode
9625 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9626 	   && CONST_INT_P (XEXP (x, 1)))
9627     {
9628       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9629 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9630       index = XEXP (XEXP (x, 0), 0);
9631       shift = INTVAL (XEXP (x, 1));
9632     }
9633   /* (and:DI (mult:DI (reg:DI) (const_int scale))
9634      (const_int 0xffffffff<<shift)) */
9635   else if (GET_CODE (x) == AND
9636 	   && GET_MODE (x) == DImode
9637 	   && GET_CODE (XEXP (x, 0)) == MULT
9638 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9639 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9640 	   && CONST_INT_P (XEXP (x, 1)))
9641     {
9642       type = ADDRESS_REG_UXTW;
9643       index = XEXP (XEXP (x, 0), 0);
9644       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
9645       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9646 	shift = -1;
9647     }
9648   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
9649      (const_int 0xffffffff<<shift)) */
9650   else if (GET_CODE (x) == AND
9651 	   && GET_MODE (x) == DImode
9652 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
9653 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9654 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9655 	   && CONST_INT_P (XEXP (x, 1)))
9656     {
9657       type = ADDRESS_REG_UXTW;
9658       index = XEXP (XEXP (x, 0), 0);
9659       shift = INTVAL (XEXP (XEXP (x, 0), 1));
9660       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9661 	shift = -1;
9662     }
9663   /* (mult:P (reg:P) (const_int scale)) */
9664   else if (GET_CODE (x) == MULT
9665 	   && GET_MODE (x) == Pmode
9666 	   && GET_MODE (XEXP (x, 0)) == Pmode
9667 	   && CONST_INT_P (XEXP (x, 1)))
9668     {
9669       type = ADDRESS_REG_REG;
9670       index = XEXP (x, 0);
9671       shift = exact_log2 (INTVAL (XEXP (x, 1)));
9672     }
9673   /* (ashift:P (reg:P) (const_int shift)) */
9674   else if (GET_CODE (x) == ASHIFT
9675 	   && GET_MODE (x) == Pmode
9676 	   && GET_MODE (XEXP (x, 0)) == Pmode
9677 	   && CONST_INT_P (XEXP (x, 1)))
9678     {
9679       type = ADDRESS_REG_REG;
9680       index = XEXP (x, 0);
9681       shift = INTVAL (XEXP (x, 1));
9682     }
9683   else
9684     return false;
9685 
9686   if (!strict_p
9687       && SUBREG_P (index)
9688       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
9689     index = SUBREG_REG (index);
9690 
9691   if (aarch64_sve_data_mode_p (mode))
9692     {
9693       if (type != ADDRESS_REG_REG
9694 	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
9695 	return false;
9696     }
9697   else
9698     {
9699       if (shift != 0
9700 	  && !(IN_RANGE (shift, 1, 3)
9701 	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
9702 	return false;
9703     }
9704 
9705   if (REG_P (index)
9706       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
9707     {
9708       info->type = type;
9709       info->offset = index;
9710       info->shift = shift;
9711       return true;
9712     }
9713 
9714   return false;
9715 }
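
/* As an illustration, for a DImode access the following index expressions
   are among those accepted (register names purely illustrative):

     (reg:DI x1)                                       -> [Xn, Xm]
     (mult:DI (reg:DI x1) (const_int 8))               -> [Xn, Xm, LSL #3]
     (ashift:DI (reg:DI x1) (const_int 3))             -> [Xn, Xm, LSL #3]
     (sign_extend:DI (reg:SI w1))                      -> [Xn, Wm, SXTW]
     (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 8))
                                                       -> [Xn, Wm, SXTW #3]

   A non-zero shift must match the access size (LSL #3 for 8-byte accesses),
   or the element size for SVE data modes.  */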
9716 
9717 /* Return true if MODE is one of the modes for which we
9718    support LDP/STP operations.  */
9719 
9720 static bool
9721 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
9722 {
9723   return mode == SImode || mode == DImode
9724 	 || mode == SFmode || mode == DFmode
9725 	 || (aarch64_vector_mode_supported_p (mode)
9726 	     && (known_eq (GET_MODE_SIZE (mode), 8)
9727 		 || (known_eq (GET_MODE_SIZE (mode), 16)
9728 		    && (aarch64_tune_params.extra_tuning_flags
9729 			& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
9730 }
9731 
9732 /* Return true if REGNO is a virtual pointer register, or an eliminable
9733    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
9734    include stack_pointer or hard_frame_pointer.  */
9735 static bool
9736 virt_or_elim_regno_p (unsigned regno)
9737 {
9738   return ((regno >= FIRST_VIRTUAL_REGISTER
9739 	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
9740 	  || regno == FRAME_POINTER_REGNUM
9741 	  || regno == ARG_POINTER_REGNUM);
9742 }
9743 
9744 /* Return true if X is a valid address of type TYPE for machine mode MODE.
9745    If it is, fill in INFO appropriately.  STRICT_P is true if
9746    REG_OK_STRICT is in effect.  */
9747 
9748 bool
9749 aarch64_classify_address (struct aarch64_address_info *info,
9750 			  rtx x, machine_mode mode, bool strict_p,
9751 			  aarch64_addr_query_type type)
9752 {
9753   enum rtx_code code = GET_CODE (x);
9754   rtx op0, op1;
9755   poly_int64 offset;
9756 
9757   HOST_WIDE_INT const_size;
9758 
9759   /* Whether a vector mode is partial doesn't affect address legitimacy.
9760      Partial vectors like VNx8QImode allow the same indexed addressing
9761      mode and MUL VL addressing mode as full vectors like VNx16QImode;
9762      in both cases, MUL VL counts multiples of GET_MODE_SIZE.  */
9763   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9764   vec_flags &= ~VEC_PARTIAL;
9765 
9766   /* On BE, we use load/store pair for all large int mode load/stores.
9767      TI/TFmode may also use a load/store pair.  */
9768   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
9769   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
9770 			    || type == ADDR_QUERY_LDP_STP_N
9771 			    || mode == TImode
9772 			    || mode == TFmode
9773 			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
9774 
9775   /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
9776      size of the memory being loaded/stored and the mode used for the
9777      addressing calculation is half of that.  */
9778   if (type == ADDR_QUERY_LDP_STP_N
9779       && known_eq (GET_MODE_SIZE (mode), 16))
9780     mode = DFmode;
9781 
9782   bool allow_reg_index_p = (!load_store_pair_p
9783 			    && (known_lt (GET_MODE_SIZE (mode), 16)
9784 				|| vec_flags == VEC_ADVSIMD
9785 				|| vec_flags & VEC_SVE_DATA));
9786 
9787   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
9788      [Rn, #offset, MUL VL].  */
9789   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
9790       && (code != REG && code != PLUS))
9791     return false;
9792 
9793   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
9794      REG addressing.  */
9795   if (advsimd_struct_p
9796       && !BYTES_BIG_ENDIAN
9797       && (code != POST_INC && code != REG))
9798     return false;
9799 
9800   gcc_checking_assert (GET_MODE (x) == VOIDmode
9801 		       || SCALAR_INT_MODE_P (GET_MODE (x)));
9802 
9803   switch (code)
9804     {
9805     case REG:
9806     case SUBREG:
9807       info->type = ADDRESS_REG_IMM;
9808       info->base = x;
9809       info->offset = const0_rtx;
9810       info->const_offset = 0;
9811       return aarch64_base_register_rtx_p (x, strict_p);
9812 
9813     case PLUS:
9814       op0 = XEXP (x, 0);
9815       op1 = XEXP (x, 1);
9816 
9817       if (! strict_p
9818 	  && REG_P (op0)
9819 	  && virt_or_elim_regno_p (REGNO (op0))
9820 	  && poly_int_rtx_p (op1, &offset))
9821 	{
9822 	  info->type = ADDRESS_REG_IMM;
9823 	  info->base = op0;
9824 	  info->offset = op1;
9825 	  info->const_offset = offset;
9826 
9827 	  return true;
9828 	}
9829 
9830       if (maybe_ne (GET_MODE_SIZE (mode), 0)
9831 	  && aarch64_base_register_rtx_p (op0, strict_p)
9832 	  && poly_int_rtx_p (op1, &offset))
9833 	{
9834 	  info->type = ADDRESS_REG_IMM;
9835 	  info->base = op0;
9836 	  info->offset = op1;
9837 	  info->const_offset = offset;
9838 
9839 	  /* TImode and TFmode values are allowed in both pairs of X
9840 	     registers and individual Q registers.  The available
9841 	     address modes are:
9842 	     X,X: 7-bit signed scaled offset
9843 	     Q:   9-bit signed offset
9844 	     We conservatively require an offset representable in either mode.
9845 	     When performing the check for pairs of X registers i.e.  LDP/STP
9846 	     pass down DImode since that is the natural size of the LDP/STP
9847 	     instruction memory accesses.  */
9848 	  if (mode == TImode || mode == TFmode)
9849 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9850 		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9851 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
9852 
9853 	  /* A 7-bit offset check because OImode will emit an ldp/stp
9854 	     instruction (only big endian will get here).
9855 	     For ldp/stp instructions, the offset is scaled for the size of a
9856 	     single element of the pair.  */
9857 	  if (mode == OImode)
9858 	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9859 
9860 	  /* Three 9/12-bit offset checks because CImode will emit three
9861 	     ldr/str instructions (only big endian will get here).  */
9862 	  if (mode == CImode)
9863 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9864 		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9865 							       offset + 32)
9866 			|| offset_12bit_unsigned_scaled_p (V16QImode,
9867 							   offset + 32)));
9868 
9869 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
9870 	     instructions (only big endian will get here).  */
9871 	  if (mode == XImode)
9872 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9873 		    && aarch64_offset_7bit_signed_scaled_p (TImode,
9874 							    offset + 32));
9875 
9876 	  /* Make "m" use the LD1 offset range for SVE data modes, so
9877 	     that pre-RTL optimizers like ivopts will work to that
9878 	     instead of the wider LDR/STR range.  */
9879 	  if (vec_flags == VEC_SVE_DATA)
9880 	    return (type == ADDR_QUERY_M
9881 		    ? offset_4bit_signed_scaled_p (mode, offset)
9882 		    : offset_9bit_signed_scaled_p (mode, offset));
9883 
9884 	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9885 	    {
9886 	      poly_int64 end_offset = (offset
9887 				       + GET_MODE_SIZE (mode)
9888 				       - BYTES_PER_SVE_VECTOR);
9889 	      return (type == ADDR_QUERY_M
9890 		      ? offset_4bit_signed_scaled_p (mode, offset)
9891 		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9892 			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9893 							 end_offset)));
9894 	    }
9895 
9896 	  if (vec_flags == VEC_SVE_PRED)
9897 	    return offset_9bit_signed_scaled_p (mode, offset);
9898 
9899 	  if (load_store_pair_p)
9900 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
9901 		     || known_eq (GET_MODE_SIZE (mode), 8)
9902 		     || known_eq (GET_MODE_SIZE (mode), 16))
9903 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9904 	  else
9905 	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9906 		    || offset_12bit_unsigned_scaled_p (mode, offset));
9907 	}
9908 
9909       if (allow_reg_index_p)
9910 	{
9911 	  /* Look for base + (scaled/extended) index register.  */
9912 	  if (aarch64_base_register_rtx_p (op0, strict_p)
9913 	      && aarch64_classify_index (info, op1, mode, strict_p))
9914 	    {
9915 	      info->base = op0;
9916 	      return true;
9917 	    }
9918 	  if (aarch64_base_register_rtx_p (op1, strict_p)
9919 	      && aarch64_classify_index (info, op0, mode, strict_p))
9920 	    {
9921 	      info->base = op1;
9922 	      return true;
9923 	    }
9924 	}
9925 
9926       return false;
9927 
9928     case POST_INC:
9929     case POST_DEC:
9930     case PRE_INC:
9931     case PRE_DEC:
9932       info->type = ADDRESS_REG_WB;
9933       info->base = XEXP (x, 0);
9934       info->offset = NULL_RTX;
9935       return aarch64_base_register_rtx_p (info->base, strict_p);
9936 
9937     case POST_MODIFY:
9938     case PRE_MODIFY:
9939       info->type = ADDRESS_REG_WB;
9940       info->base = XEXP (x, 0);
9941       if (GET_CODE (XEXP (x, 1)) == PLUS
9942 	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9943 	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9944 	  && aarch64_base_register_rtx_p (info->base, strict_p))
9945 	{
9946 	  info->offset = XEXP (XEXP (x, 1), 1);
9947 	  info->const_offset = offset;
9948 
9949 	  /* TImode and TFmode values are allowed in both pairs of X
9950 	     registers and individual Q registers.  The available
9951 	     address modes are:
9952 	     X,X: 7-bit signed scaled offset
9953 	     Q:   9-bit signed offset
9954 	     We conservatively require an offset representable in either mode.
9955 	   */
9956 	  if (mode == TImode || mode == TFmode)
9957 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9958 		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9959 
9960 	  if (load_store_pair_p)
9961 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
9962 		     || known_eq (GET_MODE_SIZE (mode), 8)
9963 		     || known_eq (GET_MODE_SIZE (mode), 16))
9964 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9965 	  else
9966 	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9967 	}
9968       return false;
9969 
9970     case CONST:
9971     case SYMBOL_REF:
9972     case LABEL_REF:
9973       /* load literal: pc-relative constant pool entry.  Only supported
9974          for SI mode or larger.  */
9975       info->type = ADDRESS_SYMBOLIC;
9976 
9977       if (!load_store_pair_p
9978 	  && GET_MODE_SIZE (mode).is_constant (&const_size)
9979 	  && const_size >= 4)
9980 	{
9981 	  poly_int64 offset;
9982 	  rtx sym = strip_offset_and_salt (x, &offset);
9983 	  return ((LABEL_REF_P (sym)
9984 		   || (SYMBOL_REF_P (sym)
9985 		       && CONSTANT_POOL_ADDRESS_P (sym)
9986 		       && aarch64_pcrelative_literal_loads)));
9987 	}
9988       return false;
9989 
9990     case LO_SUM:
9991       info->type = ADDRESS_LO_SUM;
9992       info->base = XEXP (x, 0);
9993       info->offset = XEXP (x, 1);
9994       if (allow_reg_index_p
9995 	  && aarch64_base_register_rtx_p (info->base, strict_p))
9996 	{
9997 	  poly_int64 offset;
9998 	  HOST_WIDE_INT const_offset;
9999 	  rtx sym = strip_offset_and_salt (info->offset, &offset);
10000 	  if (SYMBOL_REF_P (sym)
10001 	      && offset.is_constant (&const_offset)
10002 	      && (aarch64_classify_symbol (sym, const_offset)
10003 		  == SYMBOL_SMALL_ABSOLUTE))
10004 	    {
10005 	      /* The symbol and offset must be aligned to the access size.  */
10006 	      unsigned int align;
10007 
10008 	      if (CONSTANT_POOL_ADDRESS_P (sym))
10009 		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10010 	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10011 		{
10012 		  tree exp = SYMBOL_REF_DECL (sym);
10013 		  align = TYPE_ALIGN (TREE_TYPE (exp));
10014 		  align = aarch64_constant_alignment (exp, align);
10015 		}
10016 	      else if (SYMBOL_REF_DECL (sym))
10017 		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10018 	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10019 		       && SYMBOL_REF_BLOCK (sym) != NULL)
10020 		align = SYMBOL_REF_BLOCK (sym)->alignment;
10021 	      else
10022 		align = BITS_PER_UNIT;
10023 
10024 	      poly_int64 ref_size = GET_MODE_SIZE (mode);
10025 	      if (known_eq (ref_size, 0))
10026 		ref_size = GET_MODE_SIZE (DImode);
10027 
10028 	      return (multiple_p (const_offset, ref_size)
10029 		      && multiple_p (align / BITS_PER_UNIT, ref_size));
10030 	    }
10031 	}
10032       return false;
10033 
10034     default:
10035       return false;
10036     }
10037 }
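
/* Some illustrative examples of the classification above for a DImode
   access (register numbers and symbol names are arbitrary; the LO_SUM case
   assumes a suitably aligned small-absolute symbol):

     (reg:DI x0)                                    ADDRESS_REG_IMM   [x0]
     (plus:DI (reg:DI x0) (const_int 16))           ADDRESS_REG_IMM   [x0, #16]
     (plus:DI (reg:DI x0)
	      (mult:DI (reg:DI x1) (const_int 8)))  ADDRESS_REG_REG   [x0, x1, LSL #3]
     (post_inc:DI (reg:DI x0))                      ADDRESS_REG_WB    [x0], #8
     (lo_sum:DI (reg:DI x0) (symbol_ref "sym"))     ADDRESS_LO_SUM    [x0, #:lo12:sym]
     (symbol_ref in the constant pool)              ADDRESS_SYMBOLIC  LDR (literal)  */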
10038 
10039 /* Return true if the address X is valid for a PRFM instruction.
10040    STRICT_P is true if we should do strict checking with
10041    aarch64_classify_address.  */
10042 
10043 bool
10044 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10045 {
10046   struct aarch64_address_info addr;
10047 
10048   /* PRFM accepts the same addresses as DImode...  */
10049   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10050   if (!res)
10051     return false;
10052 
10053   /* ... except writeback forms.  */
10054   return addr.type != ADDRESS_REG_WB;
10055 }
10056 
10057 bool
10058 aarch64_symbolic_address_p (rtx x)
10059 {
10060   poly_int64 offset;
10061   x = strip_offset_and_salt (x, &offset);
10062   return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10063 }
10064 
10065 /* Classify the base of symbolic expression X.  */
10066 
10067 enum aarch64_symbol_type
10068 aarch64_classify_symbolic_expression (rtx x)
10069 {
10070   rtx offset;
10071 
10072   split_const (x, &x, &offset);
10073   return aarch64_classify_symbol (x, INTVAL (offset));
10074 }
10075 
10076 
10077 /* Return TRUE if X is a legitimate address for accessing memory in
10078    mode MODE.  */
10079 static bool
10080 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
10081 {
10082   struct aarch64_address_info addr;
10083 
10084   return aarch64_classify_address (&addr, x, mode, strict_p);
10085 }
10086 
10087 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10088    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
10089 bool
10090 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10091 			      aarch64_addr_query_type type)
10092 {
10093   struct aarch64_address_info addr;
10094 
10095   return aarch64_classify_address (&addr, x, mode, strict_p, type);
10096 }
10097 
10098 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
10099 
10100 static bool
10101 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10102 					 poly_int64 orig_offset,
10103 					 machine_mode mode)
10104 {
10105   HOST_WIDE_INT size;
10106   if (GET_MODE_SIZE (mode).is_constant (&size))
10107     {
10108       HOST_WIDE_INT const_offset, second_offset;
10109 
10110       /* A general SVE offset is A * VQ + B.  Remove the A component from
10111 	 coefficient 0 in order to get the constant B.  */
10112       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10113 
10114       /* Split an out-of-range address displacement into a base and
10115 	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
10116 	 range otherwise to increase opportunities for sharing the base
10117 	 address of different sizes.  Unaligned accesses use the signed
10118 	 9-bit range, TImode/TFmode use the intersection of signed
10119 	 scaled 7-bit and signed 9-bit offset.  */
10120       if (mode == TImode || mode == TFmode)
10121 	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10122       else if ((const_offset & (size - 1)) != 0)
10123 	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10124       else
10125 	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10126 
10127       if (second_offset == 0 || known_eq (orig_offset, second_offset))
10128 	return false;
10129 
10130       /* Split the offset into second_offset and the rest.  */
10131       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10132       *offset2 = gen_int_mode (second_offset, Pmode);
10133       return true;
10134     }
10135   else
10136     {
10137       /* Get the mode we should use as the basis of the range.  For structure
10138 	 modes this is the mode of one vector.  */
10139       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10140       machine_mode step_mode
10141 	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10142 
10143       /* Get the "mul vl" multiplier we'd like to use.  */
10144       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10145       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10146       if (vec_flags & VEC_SVE_DATA)
10147 	/* LDR supports a 9-bit range, but the move patterns for
10148 	   structure modes require all vectors to be in range of the
10149 	   same base.  The simplest way of accommodating that while still
10150 	   promoting reuse of anchor points between different modes is
10151 	   to use an 8-bit range unconditionally.  */
10152 	vnum = ((vnum + 128) & 255) - 128;
10153       else
10154 	/* Predicates are only handled singly, so we might as well use
10155 	   the full range.  */
10156 	vnum = ((vnum + 256) & 511) - 256;
10157       if (vnum == 0)
10158 	return false;
10159 
10160       /* Convert the "mul vl" multiplier into a byte offset.  */
10161       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10162       if (known_eq (second_offset, orig_offset))
10163 	return false;
10164 
10165       /* Split the offset into second_offset and the rest.  */
10166       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10167       *offset2 = gen_int_mode (second_offset, Pmode);
10168       return true;
10169     }
10170 }
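
/* A worked example for the constant-size case: for an SImode access at
   constant offset 0x12344, the offset is split into 0x10000 (returned in
   *OFFSET1 and added to the base to form a shareable anchor) and 0x2344
   (returned in *OFFSET2), the latter being within the scaled unsigned
   12-bit range of a 32-bit LDR/STR.  */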
10171 
10172 /* Return the binary representation of floating point constant VALUE in INTVAL.
10173    If the value cannot be converted, return false without setting INTVAL.
10174    The conversion is done in the mode of VALUE.  */
10175 bool
10176 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10177 {
10178 
10179   /* We make a general exception for 0.  */
10180   if (aarch64_float_const_zero_rtx_p (value))
10181     {
10182       *intval = 0;
10183       return true;
10184     }
10185 
10186   scalar_float_mode mode;
10187   if (!CONST_DOUBLE_P (value)
10188       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
10189       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10190       /* Only support up to DF mode.  */
10191       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10192     return false;
10193 
10194   unsigned HOST_WIDE_INT ival = 0;
10195 
10196   long res[2];
10197   real_to_target (res,
10198 		  CONST_DOUBLE_REAL_VALUE (value),
10199 		  REAL_MODE_FORMAT (mode));
10200 
10201   if (mode == DFmode)
10202     {
10203       int order = BYTES_BIG_ENDIAN ? 1 : 0;
10204       ival = zext_hwi (res[order], 32);
10205       ival |= (zext_hwi (res[1 - order], 32) << 32);
10206     }
10207   else
10208       ival = zext_hwi (res[0], 32);
10209 
10210   *intval = ival;
10211   return true;
10212 }
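
/* For example, a DFmode constant 1.0 yields 0x3ff0000000000000 and an
   SFmode constant 1.0 yields 0x3f800000; 0.0 yields 0 in any mode.  */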
10213 
10214 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10215    single MOV(+MOVK) followed by an FMOV.  */
10216 bool
10217 aarch64_float_const_rtx_p (rtx x)
10218 {
10219   machine_mode mode = GET_MODE (x);
10220   if (mode == VOIDmode)
10221     return false;
10222 
10223   /* Determine whether it's cheaper to write float constants as
10224      mov/movk pairs over ldr/adrp pairs.  */
10225   unsigned HOST_WIDE_INT ival;
10226 
10227   if (CONST_DOUBLE_P (x)
10228       && SCALAR_FLOAT_MODE_P (mode)
10229       && aarch64_reinterpret_float_as_int (x, &ival))
10230     {
10231       scalar_int_mode imode = (mode == HFmode
10232 			       ? SImode
10233 			       : int_mode_for_mode (mode).require ());
10234       int num_instr = aarch64_internal_mov_immediate
10235 			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
10236       return num_instr < 3;
10237     }
10238 
10239   return false;
10240 }
10241 
10242 /* Return TRUE if rtx X is the immediate constant 0.0.  */
10243 bool
10244 aarch64_float_const_zero_rtx_p (rtx x)
10245 {
10246   if (GET_MODE (x) == VOIDmode)
10247     return false;
10248 
10249   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
10250     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
10251   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
10252 }
10253 
10254 /* Return TRUE if rtx X is an immediate constant that fits in a single
10255    MOVI immediate operation.  */
10256 bool
10257 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
10258 {
10259   if (!TARGET_SIMD)
10260      return false;
10261 
10262   machine_mode vmode;
10263   scalar_int_mode imode;
10264   unsigned HOST_WIDE_INT ival;
10265 
10266   if (CONST_DOUBLE_P (x)
10267       && SCALAR_FLOAT_MODE_P (mode))
10268     {
10269       if (!aarch64_reinterpret_float_as_int (x, &ival))
10270 	return false;
10271 
10272       /* We make a general exception for 0.  */
10273       if (aarch64_float_const_zero_rtx_p (x))
10274 	return true;
10275 
10276       imode = int_mode_for_mode (mode).require ();
10277     }
10278   else if (CONST_INT_P (x)
10279 	   && is_a <scalar_int_mode> (mode, &imode))
10280     ival = INTVAL (x);
10281   else
10282     return false;
10283 
10284    /* Use a 64-bit mode for everything except DI/DF mode, where we use
10285      a 128-bit vector mode.  */
10286   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
10287 
10288   vmode = aarch64_simd_container_mode (imode, width);
10289   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
10290 
10291   return aarch64_simd_valid_immediate (v_op, NULL);
10292 }
10293 
10294 
10295 /* Return the fixed registers used for condition codes.  */
10296 
10297 static bool
10298 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
10299 {
10300   *p1 = CC_REGNUM;
10301   *p2 = INVALID_REGNUM;
10302   return true;
10303 }
10304 
10305 /* This function is used by the call expanders of the machine description.
10306    RESULT is the register in which the result is returned.  It's NULL for
10307    "call" and "sibcall".
10308    MEM is the location of the function call.
10309    CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
10310    SIBCALL indicates whether this function call is a normal call or a sibling
10311    call; a different pattern is generated accordingly.  */
10312 
10313 void
10314 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
10315 {
10316   rtx call, callee, tmp;
10317   rtvec vec;
10318   machine_mode mode;
10319 
10320   gcc_assert (MEM_P (mem));
10321   callee = XEXP (mem, 0);
10322   mode = GET_MODE (callee);
10323   gcc_assert (mode == Pmode);
10324 
10325   /* Decide if we should generate indirect calls by loading the
10326      address of the callee into a register before performing
10327      the branch-and-link.  */
10328   if (SYMBOL_REF_P (callee)
10329       ? (aarch64_is_long_call_p (callee)
10330 	 || aarch64_is_noplt_call_p (callee))
10331       : !REG_P (callee))
10332     XEXP (mem, 0) = force_reg (mode, callee);
10333 
10334   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
10335 
10336   if (result != NULL_RTX)
10337     call = gen_rtx_SET (result, call);
10338 
10339   if (sibcall)
10340     tmp = ret_rtx;
10341   else
10342     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
10343 
10344   gcc_assert (CONST_INT_P (callee_abi));
10345   callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
10346 			       UNSPEC_CALLEE_ABI);
10347 
10348   vec = gen_rtvec (3, call, callee_abi, tmp);
10349   call = gen_rtx_PARALLEL (VOIDmode, vec);
10350 
10351   aarch64_emit_call_insn (call);
10352 }
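
/* For reference, the pattern emitted above roughly has the shape below for
   a value-returning non-sibling call (a sibcall uses (return) in place of
   the LR clobber; register and symbol names are illustrative):

     (parallel
       [(set (reg x0) (call (mem:DI (symbol_ref "foo")) (const_int 0)))
	(unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI)
	(clobber (reg:DI LR_REGNUM))])  */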
10353 
10354 /* Emit call insn with PAT and do aarch64-specific handling.  */
10355 
10356 void
10357 aarch64_emit_call_insn (rtx pat)
10358 {
10359   rtx insn = emit_call_insn (pat);
10360 
10361   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
10362   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
10363   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
10364 }
10365 
10366 machine_mode
10367 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
10368 {
10369   machine_mode mode_x = GET_MODE (x);
10370   rtx_code code_x = GET_CODE (x);
10371 
10372   /* All floating point compares return CCFP if it is an equality
10373      comparison, and CCFPE otherwise.  */
10374   if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
10375     {
10376       switch (code)
10377 	{
10378 	case EQ:
10379 	case NE:
10380 	case UNORDERED:
10381 	case ORDERED:
10382 	case UNLT:
10383 	case UNLE:
10384 	case UNGT:
10385 	case UNGE:
10386 	case UNEQ:
10387 	  return CCFPmode;
10388 
10389 	case LT:
10390 	case LE:
10391 	case GT:
10392 	case GE:
10393 	case LTGT:
10394 	  return CCFPEmode;
10395 
10396 	default:
10397 	  gcc_unreachable ();
10398 	}
10399     }
10400 
10401   /* Equality comparisons of short modes against zero can be performed
10402      using the TST instruction with the appropriate bitmask.  */
10403   if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
10404       && (code == EQ || code == NE)
10405       && (mode_x == HImode || mode_x == QImode))
10406     return CC_NZmode;
10407 
10408   /* Similarly, comparisons of zero_extends from shorter modes can
10409      be performed using an ANDS with an immediate mask.  */
10410   if (y == const0_rtx && code_x == ZERO_EXTEND
10411       && (mode_x == SImode || mode_x == DImode)
10412       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
10413       && (code == EQ || code == NE))
10414     return CC_NZmode;
10415 
10416   if ((mode_x == SImode || mode_x == DImode)
10417       && y == const0_rtx
10418       && (code == EQ || code == NE || code == LT || code == GE)
10419       && (code_x == PLUS || code_x == MINUS || code_x == AND
10420 	  || code_x == NEG
10421 	  || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
10422 	      && CONST_INT_P (XEXP (x, 2)))))
10423     return CC_NZmode;
10424 
10425   /* A compare with a shifted operand.  Because of canonicalization,
10426      the comparison will have to be swapped when we emit the assembly
10427      code.  */
10428   if ((mode_x == SImode || mode_x == DImode)
10429       && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
10430       && (code_x == ASHIFT || code_x == ASHIFTRT
10431 	  || code_x == LSHIFTRT
10432 	  || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
10433     return CC_SWPmode;
10434 
10435   /* Similarly for a negated operand, but we can only do this for
10436      equalities.  */
10437   if ((mode_x == SImode || mode_x == DImode)
10438       && (REG_P (y) || SUBREG_P (y))
10439       && (code == EQ || code == NE)
10440       && code_x == NEG)
10441     return CC_Zmode;
10442 
10443   /* A test for unsigned overflow from an addition.  */
10444   if ((mode_x == DImode || mode_x == TImode)
10445       && (code == LTU || code == GEU)
10446       && code_x == PLUS
10447       && rtx_equal_p (XEXP (x, 0), y))
10448     return CC_Cmode;
10449 
10450   /* A test for unsigned overflow from an add with carry.  */
10451   if ((mode_x == DImode || mode_x == TImode)
10452       && (code == LTU || code == GEU)
10453       && code_x == PLUS
10454       && CONST_SCALAR_INT_P (y)
10455       && (rtx_mode_t (y, mode_x)
10456 	  == (wi::shwi (1, mode_x)
10457 	      << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
10458     return CC_ADCmode;
10459 
10460   /* A test for signed overflow.  */
10461   if ((mode_x == DImode || mode_x == TImode)
10462       && code == NE
10463       && code_x == PLUS
10464       && GET_CODE (y) == SIGN_EXTEND)
10465     return CC_Vmode;
10466 
10467   /* For everything else, return CCmode.  */
10468   return CCmode;
10469 }
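
/* Some illustrative examples of the selection above (register names are
   arbitrary):

     (GT, (reg:DI x0), (reg:DI x1))            -> CCmode
     (EQ, (plus:DI a b), (const_int 0))        -> CC_NZmode
     (EQ, (reg:DF d0), (reg:DF d1))            -> CCFPmode
     (LT, (reg:DF d0), (reg:DF d1))            -> CCFPEmode
     (GT, (ashift:DI a n), (reg:DI x1))        -> CC_SWPmode  */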
10470 
10471 static int
10472 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
10473 
10474 int
10475 aarch64_get_condition_code (rtx x)
10476 {
10477   machine_mode mode = GET_MODE (XEXP (x, 0));
10478   enum rtx_code comp_code = GET_CODE (x);
10479 
10480   if (GET_MODE_CLASS (mode) != MODE_CC)
10481     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
10482   return aarch64_get_condition_code_1 (mode, comp_code);
10483 }
10484 
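/* Worker for aarch64_get_condition_code.  Return the AARCH64_* condition
   code for comparison code COMP_CODE evaluated in CC mode MODE, or -1 if
   the combination is not supported.  */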
10485 static int
10486 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
10487 {
10488   switch (mode)
10489     {
10490     case E_CCFPmode:
10491     case E_CCFPEmode:
10492       switch (comp_code)
10493 	{
10494 	case GE: return AARCH64_GE;
10495 	case GT: return AARCH64_GT;
10496 	case LE: return AARCH64_LS;
10497 	case LT: return AARCH64_MI;
10498 	case NE: return AARCH64_NE;
10499 	case EQ: return AARCH64_EQ;
10500 	case ORDERED: return AARCH64_VC;
10501 	case UNORDERED: return AARCH64_VS;
10502 	case UNLT: return AARCH64_LT;
10503 	case UNLE: return AARCH64_LE;
10504 	case UNGT: return AARCH64_HI;
10505 	case UNGE: return AARCH64_PL;
10506 	default: return -1;
10507 	}
10508       break;
10509 
10510     case E_CCmode:
10511       switch (comp_code)
10512 	{
10513 	case NE: return AARCH64_NE;
10514 	case EQ: return AARCH64_EQ;
10515 	case GE: return AARCH64_GE;
10516 	case GT: return AARCH64_GT;
10517 	case LE: return AARCH64_LE;
10518 	case LT: return AARCH64_LT;
10519 	case GEU: return AARCH64_CS;
10520 	case GTU: return AARCH64_HI;
10521 	case LEU: return AARCH64_LS;
10522 	case LTU: return AARCH64_CC;
10523 	default: return -1;
10524 	}
10525       break;
10526 
10527     case E_CC_SWPmode:
10528       switch (comp_code)
10529 	{
10530 	case NE: return AARCH64_NE;
10531 	case EQ: return AARCH64_EQ;
10532 	case GE: return AARCH64_LE;
10533 	case GT: return AARCH64_LT;
10534 	case LE: return AARCH64_GE;
10535 	case LT: return AARCH64_GT;
10536 	case GEU: return AARCH64_LS;
10537 	case GTU: return AARCH64_CC;
10538 	case LEU: return AARCH64_CS;
10539 	case LTU: return AARCH64_HI;
10540 	default: return -1;
10541 	}
10542       break;
10543 
10544     case E_CC_NZCmode:
10545       switch (comp_code)
10546 	{
10547 	case NE: return AARCH64_NE; /* = any */
10548 	case EQ: return AARCH64_EQ; /* = none */
10549 	case GE: return AARCH64_PL; /* = nfrst */
10550 	case LT: return AARCH64_MI; /* = first */
10551 	case GEU: return AARCH64_CS; /* = nlast */
10552 	case GTU: return AARCH64_HI; /* = pmore */
10553 	case LEU: return AARCH64_LS; /* = plast */
10554 	case LTU: return AARCH64_CC; /* = last */
10555 	default: return -1;
10556 	}
10557       break;
10558 
10559     case E_CC_NZmode:
10560       switch (comp_code)
10561 	{
10562 	case NE: return AARCH64_NE;
10563 	case EQ: return AARCH64_EQ;
10564 	case GE: return AARCH64_PL;
10565 	case LT: return AARCH64_MI;
10566 	default: return -1;
10567 	}
10568       break;
10569 
10570     case E_CC_Zmode:
10571       switch (comp_code)
10572 	{
10573 	case NE: return AARCH64_NE;
10574 	case EQ: return AARCH64_EQ;
10575 	default: return -1;
10576 	}
10577       break;
10578 
10579     case E_CC_Cmode:
10580       switch (comp_code)
10581 	{
10582 	case LTU: return AARCH64_CS;
10583 	case GEU: return AARCH64_CC;
10584 	default: return -1;
10585 	}
10586       break;
10587 
10588     case E_CC_ADCmode:
10589       switch (comp_code)
10590 	{
10591 	case GEU: return AARCH64_CS;
10592 	case LTU: return AARCH64_CC;
10593 	default: return -1;
10594 	}
10595       break;
10596 
10597     case E_CC_Vmode:
10598       switch (comp_code)
10599 	{
10600 	case NE: return AARCH64_VS;
10601 	case EQ: return AARCH64_VC;
10602 	default: return -1;
10603 	}
10604       break;
10605 
10606     default:
10607       return -1;
10608     }
10609 
10610   return -1;
10611 }
10612 
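/* Return true if X is a constant vector whose elements are all the same
   CONST_INT and that value lies in the range [MINVAL, MAXVAL].  */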
10613 bool
10614 aarch64_const_vec_all_same_in_range_p (rtx x,
10615 				       HOST_WIDE_INT minval,
10616 				       HOST_WIDE_INT maxval)
10617 {
10618   rtx elt;
10619   return (const_vec_duplicate_p (x, &elt)
10620 	  && CONST_INT_P (elt)
10621 	  && IN_RANGE (INTVAL (elt), minval, maxval));
10622 }
10623 
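/* Return true if X is a constant vector whose elements are all equal to
   the CONST_INT value VAL.  */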
10624 bool
10625 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
10626 {
10627   return aarch64_const_vec_all_same_in_range_p (x, val, val);
10628 }
10629 
10630 /* Return true if VEC is a constant in which every element is in the range
10631    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
10632 
10633 static bool
10634 aarch64_const_vec_all_in_range_p (rtx vec,
10635 				  HOST_WIDE_INT minval,
10636 				  HOST_WIDE_INT maxval)
10637 {
10638   if (GET_CODE (vec) != CONST_VECTOR
10639       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
10640     return false;
10641 
10642   int nunits;
10643   if (!CONST_VECTOR_STEPPED_P (vec))
10644     nunits = const_vector_encoded_nelts (vec);
10645   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
10646     return false;
10647 
10648   for (int i = 0; i < nunits; i++)
10649     {
10650       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
10651       if (!CONST_INT_P (vec_elem)
10652 	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
10653 	return false;
10654     }
10655   return true;
10656 }
10657 
10658 /* N Z C V.  */
10659 #define AARCH64_CC_V 1
10660 #define AARCH64_CC_C (1 << 1)
10661 #define AARCH64_CC_Z (1 << 2)
10662 #define AARCH64_CC_N (1 << 3)
10663 
10664 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
10665 static const int aarch64_nzcv_codes[] =
10666 {
10667   0,		/* EQ, Z == 1.  */
10668   AARCH64_CC_Z,	/* NE, Z == 0.  */
10669   0,		/* CS, C == 1.  */
10670   AARCH64_CC_C,	/* CC, C == 0.  */
10671   0,		/* MI, N == 1.  */
10672   AARCH64_CC_N, /* PL, N == 0.  */
10673   0,		/* VS, V == 1.  */
10674   AARCH64_CC_V, /* VC, V == 0.  */
10675   0,		/* HI, C == 1 && Z == 0.  */
10676   AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
10677   AARCH64_CC_V,	/* GE, N == V.  */
10678   0,		/* LT, N != V.  */
10679   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
10680   0,		/* LE, !(Z == 0 && N == V).  */
10681   0,		/* AL, Any.  */
10682   0		/* NV, Any.  */
10683 };
10684 
10685 /* Print floating-point vector immediate operand X to F, negating it
10686    first if NEGATE is true.  Return true on success, false if it isn't
10687    a constant we can handle.  */
10688 
10689 static bool
10690 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
10691 {
10692   rtx elt;
10693 
10694   if (!const_vec_duplicate_p (x, &elt))
10695     return false;
10696 
10697   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
10698   if (negate)
10699     r = real_value_negate (&r);
10700 
10701   /* Handle the SVE single-bit immediates specially, since they have a
10702      fixed form in the assembly syntax.  */
10703   if (real_equal (&r, &dconst0))
10704     asm_fprintf (f, "0.0");
10705   else if (real_equal (&r, &dconst2))
10706     asm_fprintf (f, "2.0");
10707   else if (real_equal (&r, &dconst1))
10708     asm_fprintf (f, "1.0");
10709   else if (real_equal (&r, &dconsthalf))
10710     asm_fprintf (f, "0.5");
10711   else
10712     {
10713       const int buf_size = 20;
10714       char float_buf[buf_size] = {'\0'};
10715       real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
10716 				1, GET_MODE (elt));
10717       asm_fprintf (f, "%s", float_buf);
10718     }
10719 
10720   return true;
10721 }
10722 
10723 /* Return the equivalent letter for size.  */
10724 static char
10725 sizetochar (int size)
10726 {
10727   switch (size)
10728     {
10729     case 64: return 'd';
10730     case 32: return 's';
10731     case 16: return 'h';
10732     case 8 : return 'b';
10733     default: gcc_unreachable ();
10734     }
10735 }
10736 
10737 /* Print operand X to file F in a target specific manner according to CODE.
10738    The acceptable formatting commands given by CODE are:
10739      'c':		An integer or symbol address without a preceding #
10740 			sign.
10741      'C':		Take the duplicated element in a vector constant
10742 			and print it in hex.
10743      'D':		Take the duplicated element in a vector constant
10744 			and print it as an unsigned integer, in decimal.
10745      'e':		Print the sign/zero-extend size as a character 8->b,
10746 			16->h, 32->w.  Can also be used for masks:
10747 			0xff->b, 0xffff->h, 0xffffffff->w.
10748      'I':		If the operand is a duplicated vector constant,
10749 			replace it with the duplicated scalar.  If the
10750 			operand is then a floating-point constant, replace
10751 			it with the integer bit representation.  Print the
10752 			transformed constant as a signed decimal number.
10753      'p':		Prints N such that 2^N == X (X must be power of 2 and
10754 			const int).
10755      'P':		Print the number of non-zero bits in X (a const_int).
10756      'H':		Print the higher numbered register of a pair (TImode)
10757 			of regs.
10758      'm':		Print a condition (eq, ne, etc).
10759      'M':		Same as 'm', but invert condition.
10760      'N':		Take the duplicated element in a vector constant
10761 			and print the negative of it in decimal.
10762      'b/h/s/d/q':	Print a scalar FP/SIMD register name.
10763      'S/T/U/V':		Print a FP/SIMD register name for a register list.
10764 			The register printed is the FP/SIMD register name
10765 			of X + 0/1/2/3 for S/T/U/V.
10766      'R':		Print a scalar Integer/FP/SIMD register name + 1.
10767      'X':		Print bottom 16 bits of integer constant in hex.
10768      'w/x':		Print a general register name or the zero register
10769 			(32-bit or 64-bit).
10770      '0':		Print a normal operand, if it's a general register,
10771 			then we assume DImode.
10772      'k':		Print NZCV for conditional compare instructions.
10773      'A':		Output address constant representing the first
10774 			argument of X, specifying a relocation offset
10775 			if appropriate.
10776      'L':		Output constant address specified by X
10777 			with a relocation offset if appropriate.
10778      'G':		Prints address of X, specifying a PC relative
10779 			relocation mode if appropriate.
10780      'y':		Output address of LDP or STP - this is used for
10781 			some LDP/STPs which don't use a PARALLEL in their
10782 			pattern (so the mode needs to be adjusted).
10783      'z':		Output address of a typical LDP or STP.  */
10784 
10785 static void
10786 aarch64_print_operand (FILE *f, rtx x, int code)
10787 {
10788   rtx elt;
10789   switch (code)
10790     {
10791     case 'c':
10792       if (CONST_INT_P (x))
10793 	fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
10794       else
10795 	{
10796 	  poly_int64 offset;
10797 	  rtx base = strip_offset_and_salt (x, &offset);
10798 	  if (SYMBOL_REF_P (base))
10799 	    output_addr_const (f, x);
10800 	  else
10801 	    output_operand_lossage ("unsupported operand for code '%c'", code);
10802 	}
10803       break;
10804 
10805     case 'e':
10806       {
10807 	x = unwrap_const_vec_duplicate (x);
10808 	if (!CONST_INT_P (x))
10809 	  {
10810 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10811 	    return;
10812 	  }
10813 
10814 	HOST_WIDE_INT val = INTVAL (x);
10815 	if ((val & ~7) == 8 || val == 0xff)
10816 	  fputc ('b', f);
10817 	else if ((val & ~7) == 16 || val == 0xffff)
10818 	  fputc ('h', f);
10819 	else if ((val & ~7) == 32 || val == 0xffffffff)
10820 	  fputc ('w', f);
10821 	else
10822 	  {
10823 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10824 	    return;
10825 	  }
10826       }
10827       break;
10828 
10829     case 'p':
10830       {
10831 	int n;
10832 
10833 	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
10834 	  {
10835 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10836 	    return;
10837 	  }
10838 
10839 	asm_fprintf (f, "%d", n);
10840       }
10841       break;
10842 
10843     case 'P':
10844       if (!CONST_INT_P (x))
10845 	{
10846 	  output_operand_lossage ("invalid operand for '%%%c'", code);
10847 	  return;
10848 	}
10849 
10850       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
10851       break;
10852 
10853     case 'H':
10854       if (x == const0_rtx)
10855 	{
10856 	  asm_fprintf (f, "xzr");
10857 	  break;
10858 	}
10859 
10860       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
10861 	{
10862 	  output_operand_lossage ("invalid operand for '%%%c'", code);
10863 	  return;
10864 	}
10865 
10866       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
10867       break;
10868 
10869     case 'I':
10870       {
10871 	x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10872 	if (CONST_INT_P (x))
10873 	  asm_fprintf (f, "%wd", INTVAL (x));
10874 	else
10875 	  {
10876 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10877 	    return;
10878 	  }
10879 	break;
10880       }
10881 
10882     case 'M':
10883     case 'm':
10884       {
10885         int cond_code;
10886 	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
10887 	if (x == const_true_rtx)
10888 	  {
10889 	    if (code == 'M')
10890 	      fputs ("nv", f);
10891 	    return;
10892 	  }
10893 
10894         if (!COMPARISON_P (x))
10895 	  {
10896 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10897 	    return;
10898 	  }
10899 
10900         cond_code = aarch64_get_condition_code (x);
10901         gcc_assert (cond_code >= 0);
10902 	if (code == 'M')
10903 	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
10904 	if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10905 	  fputs (aarch64_sve_condition_codes[cond_code], f);
10906 	else
10907 	  fputs (aarch64_condition_codes[cond_code], f);
10908       }
10909       break;
10910 
10911     case 'N':
10912       if (!const_vec_duplicate_p (x, &elt))
10913 	{
10914 	  output_operand_lossage ("invalid vector constant");
10915 	  return;
10916 	}
10917 
10918       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10919 	asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
10920       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10921 	       && aarch64_print_vector_float_operand (f, x, true))
10922 	;
10923       else
10924 	{
10925 	  output_operand_lossage ("invalid vector constant");
10926 	  return;
10927 	}
10928       break;
10929 
10930     case 'b':
10931     case 'h':
10932     case 's':
10933     case 'd':
10934     case 'q':
10935       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10936 	{
10937 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10938 	  return;
10939 	}
10940       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10941       break;
10942 
10943     case 'S':
10944     case 'T':
10945     case 'U':
10946     case 'V':
10947       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10948 	{
10949 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10950 	  return;
10951 	}
10952       asm_fprintf (f, "%c%d",
10953 		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10954 		   REGNO (x) - V0_REGNUM + (code - 'S'));
10955       break;
10956 
10957     case 'R':
10958       if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10959 	asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10960       else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10961 	asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10962       else
10963 	output_operand_lossage ("incompatible register operand for '%%%c'",
10964 				code);
10965       break;
10966 
10967     case 'X':
10968       if (!CONST_INT_P (x))
10969 	{
10970 	  output_operand_lossage ("invalid operand for '%%%c'", code);
10971 	  return;
10972 	}
10973       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10974       break;
10975 
10976     case 'C':
10977       {
10978 	/* Print a replicated constant in hex.  */
10979 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10980 	  {
10981 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10982 	    return;
10983 	  }
10984 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10985 	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10986       }
10987       break;
10988 
10989     case 'D':
10990       {
10991 	/* Print a replicated constant in decimal, treating it as
10992 	   unsigned.  */
10993 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10994 	  {
10995 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10996 	    return;
10997 	  }
10998 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10999 	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11000       }
11001       break;
11002 
11003     case 'w':
11004     case 'x':
11005       if (x == const0_rtx
11006 	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
11007 	{
11008 	  asm_fprintf (f, "%czr", code);
11009 	  break;
11010 	}
11011 
11012       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11013 	{
11014 	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
11015 	  break;
11016 	}
11017 
11018       if (REG_P (x) && REGNO (x) == SP_REGNUM)
11019 	{
11020 	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
11021 	  break;
11022 	}
11023 
11024       /* Fall through */
11025 
11026     case 0:
11027       if (x == NULL)
11028 	{
11029 	  output_operand_lossage ("missing operand");
11030 	  return;
11031 	}
11032 
11033       switch (GET_CODE (x))
11034 	{
11035 	case REG:
11036 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
11037 	    {
11038 	      if (REG_NREGS (x) == 1)
11039 		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
11040 	      else
11041 		{
11042 		  char suffix
11043 		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
11044 		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
11045 			       REGNO (x) - V0_REGNUM, suffix,
11046 			       END_REGNO (x) - V0_REGNUM - 1, suffix);
11047 		}
11048 	    }
11049 	  else
11050 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
11051 	  break;
11052 
11053 	case MEM:
11054 	  output_address (GET_MODE (x), XEXP (x, 0));
11055 	  break;
11056 
11057 	case LABEL_REF:
11058 	case SYMBOL_REF:
11059 	  output_addr_const (asm_out_file, x);
11060 	  break;
11061 
11062 	case CONST_INT:
11063 	  asm_fprintf (f, "%wd", INTVAL (x));
11064 	  break;
11065 
11066 	case CONST:
11067 	  if (!VECTOR_MODE_P (GET_MODE (x)))
11068 	    {
11069 	      output_addr_const (asm_out_file, x);
11070 	      break;
11071 	    }
11072 	  /* fall through */
11073 
11074 	case CONST_VECTOR:
11075 	  if (!const_vec_duplicate_p (x, &elt))
11076 	    {
11077 	      output_operand_lossage ("invalid vector constant");
11078 	      return;
11079 	    }
11080 
11081 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11082 	    asm_fprintf (f, "%wd", INTVAL (elt));
11083 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11084 		   && aarch64_print_vector_float_operand (f, x, false))
11085 	    ;
11086 	  else
11087 	    {
11088 	      output_operand_lossage ("invalid vector constant");
11089 	      return;
11090 	    }
11091 	  break;
11092 
11093 	case CONST_DOUBLE:
11094 	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
11095 	     be getting CONST_DOUBLEs holding integers.  */
11096 	  gcc_assert (GET_MODE (x) != VOIDmode);
11097 	  if (aarch64_float_const_zero_rtx_p (x))
11098 	    {
11099 	      fputc ('0', f);
11100 	      break;
11101 	    }
11102 	  else if (aarch64_float_const_representable_p (x))
11103 	    {
11104 #define buf_size 20
11105 	      char float_buf[buf_size] = {'\0'};
11106 	      real_to_decimal_for_mode (float_buf,
11107 					CONST_DOUBLE_REAL_VALUE (x),
11108 					buf_size, buf_size,
11109 					1, GET_MODE (x));
11110 	      asm_fprintf (asm_out_file, "%s", float_buf);
11111 	      break;
11112 #undef buf_size
11113 	    }
11114 	  output_operand_lossage ("invalid constant");
11115 	  return;
11116 	default:
11117 	  output_operand_lossage ("invalid operand");
11118 	  return;
11119 	}
11120       break;
11121 
11122     case 'A':
11123       if (GET_CODE (x) == HIGH)
11124 	x = XEXP (x, 0);
11125 
11126       switch (aarch64_classify_symbolic_expression (x))
11127 	{
11128 	case SYMBOL_SMALL_GOT_4G:
11129 	  asm_fprintf (asm_out_file, ":got:");
11130 	  break;
11131 
11132 	case SYMBOL_SMALL_TLSGD:
11133 	  asm_fprintf (asm_out_file, ":tlsgd:");
11134 	  break;
11135 
11136 	case SYMBOL_SMALL_TLSDESC:
11137 	  asm_fprintf (asm_out_file, ":tlsdesc:");
11138 	  break;
11139 
11140 	case SYMBOL_SMALL_TLSIE:
11141 	  asm_fprintf (asm_out_file, ":gottprel:");
11142 	  break;
11143 
11144 	case SYMBOL_TLSLE24:
11145 	  asm_fprintf (asm_out_file, ":tprel:");
11146 	  break;
11147 
11148 	case SYMBOL_TINY_GOT:
11149 	  gcc_unreachable ();
11150 	  break;
11151 
11152 	default:
11153 	  break;
11154 	}
11155       output_addr_const (asm_out_file, x);
11156       break;
11157 
11158     case 'L':
11159       switch (aarch64_classify_symbolic_expression (x))
11160 	{
11161 	case SYMBOL_SMALL_GOT_4G:
11162 	  asm_fprintf (asm_out_file, ":lo12:");
11163 	  break;
11164 
11165 	case SYMBOL_SMALL_TLSGD:
11166 	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
11167 	  break;
11168 
11169 	case SYMBOL_SMALL_TLSDESC:
11170 	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
11171 	  break;
11172 
11173 	case SYMBOL_SMALL_TLSIE:
11174 	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
11175 	  break;
11176 
11177 	case SYMBOL_TLSLE12:
11178 	  asm_fprintf (asm_out_file, ":tprel_lo12:");
11179 	  break;
11180 
11181 	case SYMBOL_TLSLE24:
11182 	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
11183 	  break;
11184 
11185 	case SYMBOL_TINY_GOT:
11186 	  asm_fprintf (asm_out_file, ":got:");
11187 	  break;
11188 
11189 	case SYMBOL_TINY_TLSIE:
11190 	  asm_fprintf (asm_out_file, ":gottprel:");
11191 	  break;
11192 
11193 	default:
11194 	  break;
11195 	}
11196       output_addr_const (asm_out_file, x);
11197       break;
11198 
11199     case 'G':
11200       switch (aarch64_classify_symbolic_expression (x))
11201 	{
11202 	case SYMBOL_TLSLE24:
11203 	  asm_fprintf (asm_out_file, ":tprel_hi12:");
11204 	  break;
11205 	default:
11206 	  break;
11207 	}
11208       output_addr_const (asm_out_file, x);
11209       break;
11210 
11211     case 'k':
11212       {
11213 	HOST_WIDE_INT cond_code;
11214 
11215 	if (!CONST_INT_P (x))
11216 	  {
11217 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11218 	    return;
11219 	  }
11220 
11221 	cond_code = INTVAL (x);
11222 	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
11223 	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
11224       }
11225       break;
11226 
11227     case 'y':
11228     case 'z':
11229       {
11230 	machine_mode mode = GET_MODE (x);
11231 
11232 	if (!MEM_P (x)
11233 	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
11234 	  {
11235 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11236 	    return;
11237 	  }
11238 
11239 	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
11240 					    code == 'y'
11241 					    ? ADDR_QUERY_LDP_STP_N
11242 					    : ADDR_QUERY_LDP_STP))
11243 	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
11244       }
11245       break;
11246 
11247     default:
11248       output_operand_lossage ("invalid operand prefix '%%%c'", code);
11249       return;
11250     }
11251 }
11252 
11253 /* Print address 'x' of a memory access with mode 'mode'.  'type' is the
11254    aarch64_addr_query_type context required by aarch64_classify_address,
11255    e.g. ADDR_QUERY_M for a normal access or ADDR_QUERY_LDP_STP for LDP/STP.  */
11256 static bool
11257 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
11258 				aarch64_addr_query_type type)
11259 {
11260   struct aarch64_address_info addr;
11261   unsigned int size, vec_flags;
11262 
11263   /* Check all addresses are Pmode - including ILP32.  */
11264   if (GET_MODE (x) != Pmode
11265       && (!CONST_INT_P (x)
11266 	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
11267     {
11268       output_operand_lossage ("invalid address mode");
11269       return false;
11270     }
11271 
11272   if (aarch64_classify_address (&addr, x, mode, true, type))
11273     switch (addr.type)
11274       {
11275       case ADDRESS_REG_IMM:
11276 	if (known_eq (addr.const_offset, 0))
11277 	  {
11278 	    asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
11279 	    return true;
11280 	  }
11281 
11282 	vec_flags = aarch64_classify_vector_mode (mode);
11283 	if (vec_flags & VEC_ANY_SVE)
11284 	  {
11285 	    HOST_WIDE_INT vnum
11286 	      = exact_div (addr.const_offset,
11287 			   aarch64_vl_bytes (mode, vec_flags)).to_constant ();
11288 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
11289 			 reg_names[REGNO (addr.base)], vnum);
11290 	    return true;
11291 	  }
11292 
11293 	asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
11294 		     INTVAL (addr.offset));
11295 	return true;
11296 
11297       case ADDRESS_REG_REG:
11298 	if (addr.shift == 0)
11299 	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
11300 		       reg_names [REGNO (addr.offset)]);
11301 	else
11302 	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
11303 		       reg_names [REGNO (addr.offset)], addr.shift);
11304 	return true;
11305 
11306       case ADDRESS_REG_UXTW:
11307 	if (addr.shift == 0)
11308 	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
11309 		       REGNO (addr.offset) - R0_REGNUM);
11310 	else
11311 	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
11312 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
11313 	return true;
11314 
11315       case ADDRESS_REG_SXTW:
11316 	if (addr.shift == 0)
11317 	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
11318 		       REGNO (addr.offset) - R0_REGNUM);
11319 	else
11320 	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
11321 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
11322 	return true;
11323 
11324       case ADDRESS_REG_WB:
11325 	/* Writeback is only supported for fixed-width modes.  */
11326 	size = GET_MODE_SIZE (mode).to_constant ();
11327 	switch (GET_CODE (x))
11328 	  {
11329 	  case PRE_INC:
11330 	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
11331 	    return true;
11332 	  case POST_INC:
11333 	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
11334 	    return true;
11335 	  case PRE_DEC:
11336 	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
11337 	    return true;
11338 	  case POST_DEC:
11339 	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
11340 	    return true;
11341 	  case PRE_MODIFY:
11342 	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
11343 			 INTVAL (addr.offset));
11344 	    return true;
11345 	  case POST_MODIFY:
11346 	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
11347 			 INTVAL (addr.offset));
11348 	    return true;
11349 	  default:
11350 	    break;
11351 	  }
11352 	break;
11353 
11354       case ADDRESS_LO_SUM:
11355 	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
11356 	output_addr_const (f, addr.offset);
11357 	asm_fprintf (f, "]");
11358 	return true;
11359 
11360       case ADDRESS_SYMBOLIC:
11361 	output_addr_const (f, x);
11362 	return true;
11363       }
11364 
11365   return false;
11366 }
11367 
11368 /* Print address 'x' of a memory access with mode 'mode'.  */
11369 static void
11370 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
11371 {
11372   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
11373     output_addr_const (f, x);
11374 }
11375 
11376 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */
11377 
11378 static bool
11379 aarch64_output_addr_const_extra (FILE *file, rtx x)
11380 {
11381   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
11382     {
11383       output_addr_const (file, XVECEXP (x, 0, 0));
11384       return true;
11385     }
11386   return false;
11387 }
11388 
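/* Return true if X (or any sub-rtx of X) contains a LABEL_REF, ignoring
   the LABEL_REFs that appear inside UNSPEC_TLS operands.  */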
11389 bool
11390 aarch64_label_mentioned_p (rtx x)
11391 {
11392   const char *fmt;
11393   int i;
11394 
11395   if (LABEL_REF_P (x))
11396     return true;
11397 
11398   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
11399      referencing instruction, but they are constant offsets, not
11400      symbols.  */
11401   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
11402     return false;
11403 
11404   fmt = GET_RTX_FORMAT (GET_CODE (x));
11405   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
11406     {
11407       if (fmt[i] == 'E')
11408 	{
11409 	  int j;
11410 
11411 	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
11412 	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
11413 	      return 1;
11414 	}
11415       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
11416 	return 1;
11417     }
11418 
11419   return 0;
11420 }
11421 
11422 /* Implement REGNO_REG_CLASS.  */
11423 
11424 enum reg_class
11425 aarch64_regno_regclass (unsigned regno)
11426 {
11427   if (STUB_REGNUM_P (regno))
11428     return STUB_REGS;
11429 
11430   if (GP_REGNUM_P (regno))
11431     return GENERAL_REGS;
11432 
11433   if (regno == SP_REGNUM)
11434     return STACK_REG;
11435 
11436   if (regno == FRAME_POINTER_REGNUM
11437       || regno == ARG_POINTER_REGNUM)
11438     return POINTER_REGS;
11439 
11440   if (FP_REGNUM_P (regno))
11441     return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
11442 	    : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
11443 
11444   if (PR_REGNUM_P (regno))
11445     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
11446 
11447   if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
11448     return FFR_REGS;
11449 
11450   return NO_REGS;
11451 }
11452 
11453 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
11454    If OFFSET is out of range, return an offset of an anchor point
11455    that is in range.  Return 0 otherwise.  */
11456 
11457 static HOST_WIDE_INT
11458 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
11459 		       machine_mode mode)
11460 {
11461   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
11462   if (size > 16)
11463     return (offset + 0x400) & ~0x7f0;
11464 
11465   /* For offsets that aren't a multiple of the access size, the limit is
11466      -256...255.  */
11467   if (offset & (size - 1))
11468     {
11469       /* BLKmode typically uses LDP of X-registers.  */
11470       if (mode == BLKmode)
11471 	return (offset + 512) & ~0x3ff;
11472       return (offset + 0x100) & ~0x1ff;
11473     }
11474 
11475   /* Small negative offsets are supported.  */
11476   if (IN_RANGE (offset, -256, 0))
11477     return 0;
11478 
11479   if (mode == TImode || mode == TFmode)
11480     return (offset + 0x100) & ~0x1ff;
11481 
11482   /* Use a 12-bit offset, scaled by the access size.  */
11483   return offset & (~0xfff * size);
11484 }
11485 
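/* Implement TARGET_LEGITIMIZE_ADDRESS.  Try to make address X, used in
   mode MODE, legitimate by splitting an out-of-range constant offset
   around an anchor point chosen by aarch64_anchor_offset.  */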
11486 static rtx
11487 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
11488 {
11489   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
11490      where mask is selected by alignment and size of the offset.
11491      We try to pick as large a range for the offset as possible to
11492      maximize the chance of a CSE.  However, for aligned addresses
11493      we limit the range to 4k so that structures with different sized
11494      elements are likely to use the same base.  We need to be careful
11495      not to split a CONST for some forms of address expression, otherwise
11496      it will generate sub-optimal code.  */
11497 
11498   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
11499     {
11500       rtx base = XEXP (x, 0);
11501       rtx offset_rtx = XEXP (x, 1);
11502       HOST_WIDE_INT offset = INTVAL (offset_rtx);
11503 
11504       if (GET_CODE (base) == PLUS)
11505 	{
11506 	  rtx op0 = XEXP (base, 0);
11507 	  rtx op1 = XEXP (base, 1);
11508 
11509 	  /* Force any scaling into a temp for CSE.  */
11510 	  op0 = force_reg (Pmode, op0);
11511 	  op1 = force_reg (Pmode, op1);
11512 
11513 	  /* Let the pointer register be in op0.  */
11514 	  if (REG_POINTER (op1))
11515 	    std::swap (op0, op1);
11516 
11517 	  /* If the pointer is virtual or frame related, then we know that
11518 	     virtual register instantiation or register elimination is going
11519 	     to apply a second constant.  We want the two constants folded
11520 	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
11521 	  if (virt_or_elim_regno_p (REGNO (op0)))
11522 	    {
11523 	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
11524 				   NULL_RTX, true, OPTAB_DIRECT);
11525 	      return gen_rtx_PLUS (Pmode, base, op1);
11526 	    }
11527 
11528 	  /* Otherwise, in order to encourage CSE (and thence loop strength
11529 	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
11530 	  base = expand_binop (Pmode, add_optab, op0, op1,
11531 			       NULL_RTX, true, OPTAB_DIRECT);
11532 	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
11533 	}
11534 
11535       HOST_WIDE_INT size;
11536       if (GET_MODE_SIZE (mode).is_constant (&size))
11537 	{
11538 	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
11539 							     mode);
11540 	  if (base_offset != 0)
11541 	    {
11542 	      base = plus_constant (Pmode, base, base_offset);
11543 	      base = force_operand (base, NULL_RTX);
11544 	      return plus_constant (Pmode, base, offset - base_offset);
11545 	    }
11546 	}
11547     }
11548 
11549   return x;
11550 }
11551 
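/* Implement TARGET_SECONDARY_RELOAD.  */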
11552 static reg_class_t
11553 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
11554 			  reg_class_t rclass,
11555 			  machine_mode mode,
11556 			  secondary_reload_info *sri)
11557 {
11558   /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
11559      LDR and STR.  See the comment at the head of aarch64-sve.md for
11560      more details about the big-endian handling.  */
11561   if (reg_class_subset_p (rclass, FP_REGS)
11562       && !((REG_P (x) && HARD_REGISTER_P (x))
11563 	   || aarch64_simd_valid_immediate (x, NULL))
11564       && mode != VNx16QImode)
11565     {
11566       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11567       if ((vec_flags & VEC_SVE_DATA)
11568 	  && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
11569 	{
11570 	  sri->icode = CODE_FOR_aarch64_sve_reload_mem;
11571 	  return NO_REGS;
11572 	}
11573     }
11574 
11575   /* If we have to disable direct literal pool loads and stores because the
11576      function is too big, then we need a scratch register.  */
11577   if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
11578       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
11579 	  || targetm.vector_mode_supported_p (GET_MODE (x)))
11580       && !aarch64_pcrelative_literal_loads)
11581     {
11582       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
11583       return NO_REGS;
11584     }
11585 
11586   /* Without the TARGET_SIMD instructions we cannot move a Q register
11587      to a Q register directly.  We need a scratch.  */
11588   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
11589       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
11590       && reg_class_subset_p (rclass, FP_REGS))
11591     {
11592       sri->icode = code_for_aarch64_reload_mov (mode);
11593       return NO_REGS;
11594     }
11595 
11596   /* A TFmode or TImode memory access should be handled via FP_REGS
11597      because AArch64 has richer addressing modes for LDR/STR instructions
11598      than for LDP/STP instructions.  */
11599   if (TARGET_FLOAT && rclass == GENERAL_REGS
11600       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
11601     return FP_REGS;
11602 
11603   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
11604     return GENERAL_REGS;
11605 
11606   return NO_REGS;
11607 }
11608 
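/* Implement TARGET_CAN_ELIMINATE.  */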
11609 static bool
11610 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
11611 {
11612   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
11613 
11614   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
11615      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
11616   if (frame_pointer_needed)
11617     return to == HARD_FRAME_POINTER_REGNUM;
11618   return true;
11619 }
11620 
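/* Implement INITIAL_ELIMINATION_OFFSET.  Return the offset to apply when
   eliminating register FROM in favour of register TO; the result is a
   poly_int64 because SVE frames can have a variable size.  */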
11621 poly_int64
11622 aarch64_initial_elimination_offset (unsigned from, unsigned to)
11623 {
11624   if (to == HARD_FRAME_POINTER_REGNUM)
11625     {
11626       if (from == ARG_POINTER_REGNUM)
11627 	return cfun->machine->frame.hard_fp_offset;
11628 
11629       if (from == FRAME_POINTER_REGNUM)
11630 	return cfun->machine->frame.hard_fp_offset
11631 	       - cfun->machine->frame.locals_offset;
11632     }
11633 
11634   if (to == STACK_POINTER_REGNUM)
11635     {
11636       if (from == FRAME_POINTER_REGNUM)
11637 	  return cfun->machine->frame.frame_size
11638 		 - cfun->machine->frame.locals_offset;
11639     }
11640 
11641   return cfun->machine->frame.frame_size;
11642 }
11643 
11644 
11645 /* Get return address without mangling.  */
11646 
11647 rtx
11648 aarch64_return_addr_rtx (void)
11649 {
11650   rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
11651   /* Note: aarch64_return_address_signing_enabled only
11652      works after cfun->machine->frame.laid_out is set,
11653      so here we don't know if the return address will
11654      be signed or not.  */
11655   rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
11656   emit_move_insn (lr, val);
11657   emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
11658   return lr;
11659 }
11660 
11661 
11662 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
11663    previous frame.  */
11664 
11665 rtx
11666 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
11667 {
11668   if (count != 0)
11669     return const0_rtx;
11670   return aarch64_return_addr_rtx ();
11671 }
11672 
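/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  Output the fixed part of a
   trampoline: a BTI, loads of the jump target and static chain from the
   data words that follow, an indirect branch, and a speculation barrier.  */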
11673 static void
11674 aarch64_asm_trampoline_template (FILE *f)
11675 {
11676   /* Even if the current function doesn't have branch protection, some
11677      later function might, and since this template is only generated once
11678      we have to add a BTI just in case.  */
11679   asm_fprintf (f, "\thint\t34 // bti c\n");
11680 
11681   if (TARGET_ILP32)
11682     {
11683       asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
11684       asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
11685     }
11686   else
11687     {
11688       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
11689       asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
11690     }
11691   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
11692 
11693   /* We always emit a speculation barrier.
11694      This is because the same trampoline template is used for every nested
11695      function.  Since nested functions are not particularly common or
11696      performant we don't worry too much about the extra instructions to copy
11697      around.
11698      This is not yet a problem, since we have not yet implemented function
11699      specific attributes to choose between hardening against straight line
11700      speculation or not, but such function specific attributes are likely to
11701      happen in the future.  */
11702   asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
11703 
11704   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11705   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11706 }
11707 
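/* Implement TARGET_TRAMPOLINE_INIT.  Copy the trampoline template into
   M_TRAMP and store the address of FNDECL and the static chain value
   CHAIN_VALUE in the data words after the code.  */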
11708 static void
11709 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
11710 {
11711   rtx fnaddr, mem, a_tramp;
11712   const int tramp_code_sz = 24;
11713 
11714   /* We don't need to copy the trailing D-words; we fill those in below.  */
11715   /* We create our own memory address in Pmode so that `emit_block_move` can
11716      use parts of the backend which expect Pmode addresses.  */
11717   rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
11718   emit_block_move (gen_rtx_MEM (BLKmode, temp),
11719 		   assemble_trampoline_template (),
11720 		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
11721   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
11722   fnaddr = XEXP (DECL_RTL (fndecl), 0);
11723   if (GET_MODE (fnaddr) != ptr_mode)
11724     fnaddr = convert_memory_address (ptr_mode, fnaddr);
11725   emit_move_insn (mem, fnaddr);
11726 
11727   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
11728   emit_move_insn (mem, chain_value);
11729 
11730   /* XXX We should really define a "clear_cache" pattern and use
11731      gen_clear_cache().  */
11732   a_tramp = XEXP (m_tramp, 0);
11733   maybe_emit_call_builtin___clear_cache (a_tramp,
11734 					 plus_constant (ptr_mode,
11735 							a_tramp,
11736 							TRAMPOLINE_SIZE));
11737 }
11738 
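/* Implement TARGET_CLASS_MAX_NREGS.  */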
11739 static unsigned char
11740 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
11741 {
11742   /* ??? Logically we should only need to provide a value when
11743      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
11744      can hold MODE, but at the moment we need to handle all modes.
11745      Just ignore any runtime parts for registers that can't store them.  */
11746   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
11747   unsigned int nregs, vec_flags;
11748   switch (regclass)
11749     {
11750     case STUB_REGS:
11751     case TAILCALL_ADDR_REGS:
11752     case POINTER_REGS:
11753     case GENERAL_REGS:
11754     case ALL_REGS:
11755     case POINTER_AND_FP_REGS:
11756     case FP_REGS:
11757     case FP_LO_REGS:
11758     case FP_LO8_REGS:
11759       vec_flags = aarch64_classify_vector_mode (mode);
11760       if ((vec_flags & VEC_SVE_DATA)
11761 	  && constant_multiple_p (GET_MODE_SIZE (mode),
11762 				  aarch64_vl_bytes (mode, vec_flags), &nregs))
11763 	return nregs;
11764       return (vec_flags & VEC_ADVSIMD
11765 	      ? CEIL (lowest_size, UNITS_PER_VREG)
11766 	      : CEIL (lowest_size, UNITS_PER_WORD));
11767     case STACK_REG:
11768     case PR_REGS:
11769     case PR_LO_REGS:
11770     case PR_HI_REGS:
11771     case FFR_REGS:
11772     case PR_AND_FFR_REGS:
11773       return 1;
11774 
11775     case NO_REGS:
11776       return 0;
11777 
11778     default:
11779       break;
11780     }
11781   gcc_unreachable ();
11782 }
11783 
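/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */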
11784 static reg_class_t
11785 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
11786 {
11787   if (regclass == POINTER_REGS)
11788     return GENERAL_REGS;
11789 
11790   if (regclass == STACK_REG)
11791     {
11792       if (REG_P(x)
11793 	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
11794 	  return regclass;
11795 
11796       return NO_REGS;
11797     }
11798 
11799   /* Register elimination can result in a request for
11800      SP+constant->FP_REGS.  We cannot support such operations, which
11801      use SP as source and an FP_REG as destination, so reject them
11802      outright.  */
11803   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
11804     {
11805       rtx lhs = XEXP (x, 0);
11806 
11807       /* Look through a possible SUBREG introduced by ILP32.  */
11808       if (SUBREG_P (lhs))
11809 	lhs = SUBREG_REG (lhs);
11810 
11811       gcc_assert (REG_P (lhs));
11812       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11813 				      POINTER_REGS));
11814       return NO_REGS;
11815     }
11816 
11817   return regclass;
11818 }
11819 
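/* Implement ASM_OUTPUT_LABELREF.  Print NAME to F, preceded by the
   user label prefix.  */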
11820 void
11821 aarch64_asm_output_labelref (FILE* f, const char *name)
11822 {
11823   asm_fprintf (f, "%U%s", name);
11824 }
11825 
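/* Emit the assembly that registers SYMBOL as a constructor with priority
   PRIORITY.  Non-default priorities are placed in their own
   .init_array.NNNNN section.  */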
11826 static void
11827 aarch64_elf_asm_constructor (rtx symbol, int priority)
11828 {
11829   if (priority == DEFAULT_INIT_PRIORITY)
11830     default_ctor_section_asm_out_constructor (symbol, priority);
11831   else
11832     {
11833       section *s;
11834       /* Although priority is known to be in the range [0, 65535], so 18
11835          bytes would be enough, the compiler might not know that.  To avoid
11836          a -Wformat-truncation false positive, use a larger size.  */
11837       char buf[23];
11838       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
11839       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11840       switch_to_section (s);
11841       assemble_align (POINTER_SIZE);
11842       assemble_aligned_integer (POINTER_BYTES, symbol);
11843     }
11844 }
11845 
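/* As for aarch64_elf_asm_constructor, but register SYMBOL as a destructor
   using .fini_array sections.  */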
11846 static void
11847 aarch64_elf_asm_destructor (rtx symbol, int priority)
11848 {
11849   if (priority == DEFAULT_INIT_PRIORITY)
11850     default_dtor_section_asm_out_destructor (symbol, priority);
11851   else
11852     {
11853       section *s;
11854       /* Although priority is known to be in the range [0, 65535], so 18
11855          bytes would be enough, the compiler might not know that.  To avoid
11856          a -Wformat-truncation false positive, use a larger size.  */
11857       char buf[23];
11858       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
11859       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11860       switch_to_section (s);
11861       assemble_align (POINTER_SIZE);
11862       assemble_aligned_integer (POINTER_BYTES, symbol);
11863     }
11864 }
11865 
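/* Output the assembly for a switch-case dispatch sequence: load the
   jump-table entry selected by the index operand, add it (suitably
   extended and scaled) to the address of a local label emitted after the
   branch, and branch to the result.  The element size of the
   ADDR_DIFF_VEC selects which load/extend pattern is used.  */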
11866 const char*
11867 aarch64_output_casesi (rtx *operands)
11868 {
11869   char buf[100];
11870   char label[100];
11871   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
11872   int index;
11873   static const char *const patterns[4][2] =
11874   {
11875     {
11876       "ldrb\t%w3, [%0,%w1,uxtw]",
11877       "add\t%3, %4, %w3, sxtb #2"
11878     },
11879     {
11880       "ldrh\t%w3, [%0,%w1,uxtw #1]",
11881       "add\t%3, %4, %w3, sxth #2"
11882     },
11883     {
11884       "ldr\t%w3, [%0,%w1,uxtw #2]",
11885       "add\t%3, %4, %w3, sxtw #2"
11886     },
11887     /* We assume that DImode is only generated when not optimizing and
11888        that we don't really need 64-bit address offsets.  That would
11889        imply an object file with 8GB of code in a single function!  */
11890     {
11891       "ldr\t%w3, [%0,%w1,uxtw #2]",
11892       "add\t%3, %4, %w3, sxtw #2"
11893     }
11894   };
11895 
11896   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11897 
11898   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11899   index = exact_log2 (GET_MODE_SIZE (mode));
11900 
11901   gcc_assert (index >= 0 && index <= 3);
11902 
11903   /* Need to implement table size reduction, by changing the code below.  */
11904   output_asm_insn (patterns[index][0], operands);
11905   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11906   snprintf (buf, sizeof (buf),
11907 	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
11908   output_asm_insn (buf, operands);
11909   output_asm_insn (patterns[index][1], operands);
11910   output_asm_insn ("br\t%3", operands);
11911   output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11912 		   operands);
11913   assemble_label (asm_out_file, label);
11914   return "";
11915 }
11916 
11917 
11918 /* Return size in bits of an arithmetic operand which is shifted/scaled and
11919    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11920    operator.  */
11921 
11922 int
11923 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11924 {
11925   if (shift >= 0 && shift <= 3)
11926     {
11927       int size;
11928       for (size = 8; size <= 32; size *= 2)
11929 	{
11930 	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11931 	  if (mask == bits << shift)
11932 	    return size;
11933 	}
11934     }
11935   return 0;
11936 }
11937 
11938 /* Constant pools are per-function only when PC-relative literal
11939    loads are enabled or we are using the large code model.  */
11941 
11942 static inline bool
11943 aarch64_can_use_per_function_literal_pools_p (void)
11944 {
11945   return (aarch64_pcrelative_literal_loads
11946 	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11947 }
11948 
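/* Implement TARGET_USE_BLOCKS_FOR_CONSTANT_P.  */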
11949 static bool
11950 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11951 {
11952   /* We can't use blocks for constants when we're using a per-function
11953      constant pool.  */
11954   return !aarch64_can_use_per_function_literal_pools_p ();
11955 }
11956 
11957 /* Select appropriate section for constants depending
11958    on where we place literal pools.  */
11959 
11960 static section *
11961 aarch64_select_rtx_section (machine_mode mode,
11962 			    rtx x,
11963 			    unsigned HOST_WIDE_INT align)
11964 {
11965   if (aarch64_can_use_per_function_literal_pools_p ())
11966     return function_section (current_function_decl);
11967 
11968   return default_elf_select_rtx_section (mode, x, align);
11969 }
11970 
11971 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
11972 void
11973 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11974 				  HOST_WIDE_INT offset)
11975 {
11976   /* When using per-function literal pools, we must ensure that any code
11977      section is aligned to the minimal instruction length, lest we get
11978      "unaligned instruction" errors from the assembler.  */
11979   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11980     ASM_OUTPUT_ALIGN (f, 2);
11981 }
11982 
11983 /* Costs.  */
11984 
11985 /* Helper function for rtx cost calculation.  Strip a shift expression
11986    from X.  Returns the inner operand if successful, or the original
11987    expression on failure.  */
11988 static rtx
11989 aarch64_strip_shift (rtx x)
11990 {
11991   rtx op = x;
11992 
11993   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11994      we can convert both to ROR during final output.  */
11995   if ((GET_CODE (op) == ASHIFT
11996        || GET_CODE (op) == ASHIFTRT
11997        || GET_CODE (op) == LSHIFTRT
11998        || GET_CODE (op) == ROTATERT
11999        || GET_CODE (op) == ROTATE)
12000       && CONST_INT_P (XEXP (op, 1)))
12001     return XEXP (op, 0);
12002 
12003   if (GET_CODE (op) == MULT
12004       && CONST_INT_P (XEXP (op, 1))
12005       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
12006     return XEXP (op, 0);
12007 
12008   return x;
12009 }
12010 
12011 /* Helper function for rtx cost calculation.  Strip an extend
12012    expression from X.  Returns the inner operand if successful, or the
12013    original expression on failure.  We deal with a number of possible
12014    canonicalization variations here. If STRIP_SHIFT is true, then
12015    we can strip off a shift also.  */
12016 static rtx
12017 aarch64_strip_extend (rtx x, bool strip_shift)
12018 {
12019   scalar_int_mode mode;
12020   rtx op = x;
12021 
12022   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
12023     return op;
12024 
12025   if (GET_CODE (op) == AND
12026       && GET_CODE (XEXP (op, 0)) == MULT
12027       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
12028       && CONST_INT_P (XEXP (op, 1))
12029       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
12030 			   INTVAL (XEXP (op, 1))) != 0)
12031     return XEXP (XEXP (op, 0), 0);
12032 
12033   /* Now handle extended register, as this may also have an optional
12034      left shift by 1..4.  */
12035   if (strip_shift
12036       && GET_CODE (op) == ASHIFT
12037       && CONST_INT_P (XEXP (op, 1))
12038       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
12039     op = XEXP (op, 0);
12040 
12041   if (GET_CODE (op) == ZERO_EXTEND
12042       || GET_CODE (op) == SIGN_EXTEND)
12043     op = XEXP (op, 0);
12044 
12045   if (op != x)
12046     return op;
12047 
12048   return x;
12049 }
12050 
12051 /* Return true iff CODE is a shift supported in combination
12052    with arithmetic instructions.  */
12053 
12054 static bool
12055 aarch64_shift_p (enum rtx_code code)
12056 {
12057   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
12058 }
12059 
12060 
12061 /* Return true iff X is a cheap shift without a sign extend. */
12062 
12063 static bool
12064 aarch64_cheap_mult_shift_p (rtx x)
12065 {
12066   rtx op0, op1;
12067 
12068   op0 = XEXP (x, 0);
12069   op1 = XEXP (x, 1);
12070 
12071   if (!(aarch64_tune_params.extra_tuning_flags
12072                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
12073     return false;
12074 
12075   if (GET_CODE (op0) == SIGN_EXTEND)
12076     return false;
12077 
12078   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
12079       && UINTVAL (op1) <= 4)
12080     return true;
12081 
12082   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
12083     return false;
12084 
12085   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
12086 
12087   if (l2 > 0 && l2 <= 4)
12088     return true;
12089 
12090   return false;
12091 }
12092 
12093 /* Helper function for rtx cost calculation.  Calculate the cost of
12094    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
12095    Return the calculated cost of the expression, recursing manually in to
12096    operands where needed.  */
12097 
12098 static int
12099 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
12100 {
12101   rtx op0, op1;
12102   const struct cpu_cost_table *extra_cost
12103     = aarch64_tune_params.insn_extra_cost;
12104   int cost = 0;
12105   bool compound_p = (outer == PLUS || outer == MINUS);
12106   machine_mode mode = GET_MODE (x);
12107 
12108   gcc_checking_assert (code == MULT);
12109 
12110   op0 = XEXP (x, 0);
12111   op1 = XEXP (x, 1);
12112 
12113   if (VECTOR_MODE_P (mode))
12114     {
12115       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12116       if (vec_flags & VEC_ADVSIMD)
12117 	{
12118 	  /* The by-element versions of the instruction have the same costs as
12119 	     the normal 3-vector version.  So don't add the costs of the
12120 	     duplicate into the costs of the multiply.  We make an assumption
12121 	     that the input to the VEC_DUPLICATE is already on the FP & SIMD
12122 	     side.  This means costing of a MUL by element pre RA is a bit
12123 	     optimistic.  */
12124 	  if (GET_CODE (op0) == VEC_DUPLICATE)
12125 	    op0 = XEXP (op0, 0);
12126 	  else if (GET_CODE (op1) == VEC_DUPLICATE)
12127 	    op1 = XEXP (op1, 0);
12128 	}
12129       cost += rtx_cost (op0, mode, MULT, 0, speed);
12130       cost += rtx_cost (op1, mode, MULT, 1, speed);
12131       if (speed)
12132 	{
12133 	  if (GET_CODE (x) == MULT)
12134 	    cost += extra_cost->vect.mult;
12135 	  /* This is to catch the SSRA costing currently flowing here.  */
12136 	  else
12137 	    cost += extra_cost->vect.alu;
12138 	}
12139       return cost;
12140     }
12141 
12142   /* Integer multiply/fma.  */
12143   if (GET_MODE_CLASS (mode) == MODE_INT)
12144     {
12145       /* The multiply will be canonicalized as a shift, cost it as such.  */
12146       if (aarch64_shift_p (GET_CODE (x))
12147 	  || (CONST_INT_P (op1)
12148 	      && exact_log2 (INTVAL (op1)) > 0))
12149 	{
12150 	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
12151 	                   || GET_CODE (op0) == SIGN_EXTEND;
12152 	  if (speed)
12153 	    {
12154 	      if (compound_p)
12155 	        {
12156 		  /* If the shift is considered cheap,
12157 		     then don't add any cost. */
12158 		  if (aarch64_cheap_mult_shift_p (x))
12159 		    ;
12160 	          else if (REG_P (op1))
12161 		    /* ARITH + shift-by-register.  */
12162 		    cost += extra_cost->alu.arith_shift_reg;
12163 		  else if (is_extend)
12164 		    /* ARITH + extended register.  We don't have a cost field
12165 		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
12166 		    cost += extra_cost->alu.extend_arith;
12167 		  else
12168 		    /* ARITH + shift-by-immediate.  */
12169 		    cost += extra_cost->alu.arith_shift;
12170 		}
12171 	      else
12172 		/* LSL (immediate).  */
12173 	        cost += extra_cost->alu.shift;
12174 
12175 	    }
12176 	  /* Strip extends as we will have costed them in the case above.  */
12177 	  if (is_extend)
12178 	    op0 = aarch64_strip_extend (op0, true);
12179 
12180 	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
12181 
12182 	  return cost;
12183 	}
12184 
12185       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
12186 	 compound and let the below cases handle it.  After all, MNEG is a
12187 	 special-case alias of MSUB.  */
12188       if (GET_CODE (op0) == NEG)
12189 	{
12190 	  op0 = XEXP (op0, 0);
12191 	  compound_p = true;
12192 	}
12193 
12194       /* Integer multiplies or FMAs have zero/sign extending variants.  */
12195       if ((GET_CODE (op0) == ZERO_EXTEND
12196 	   && GET_CODE (op1) == ZERO_EXTEND)
12197 	  || (GET_CODE (op0) == SIGN_EXTEND
12198 	      && GET_CODE (op1) == SIGN_EXTEND))
12199 	{
12200 	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
12201 	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
12202 
12203 	  if (speed)
12204 	    {
12205 	      if (compound_p)
12206 		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
12207 		cost += extra_cost->mult[0].extend_add;
12208 	      else
12209 		/* MUL/SMULL/UMULL.  */
12210 		cost += extra_cost->mult[0].extend;
12211 	    }
12212 
12213 	  return cost;
12214 	}
12215 
12216       /* This is either an integer multiply or a MADD.  In both cases
12217 	 we want to recurse and cost the operands.  */
12218       cost += rtx_cost (op0, mode, MULT, 0, speed);
12219       cost += rtx_cost (op1, mode, MULT, 1, speed);
12220 
12221       if (speed)
12222 	{
12223 	  if (compound_p)
12224 	    /* MADD/MSUB.  */
12225 	    cost += extra_cost->mult[mode == DImode].add;
12226 	  else
12227 	    /* MUL.  */
12228 	    cost += extra_cost->mult[mode == DImode].simple;
12229 	}
12230 
12231       return cost;
12232     }
12233   else
12234     {
12235       if (speed)
12236 	{
12237 	  /* Floating-point FMA/FMUL can also support negations of the
12238 	     operands, unless the rounding mode is upward or downward in
12239 	     which case FNMUL is different from FMUL with operand negation.  */
12240 	  bool neg0 = GET_CODE (op0) == NEG;
12241 	  bool neg1 = GET_CODE (op1) == NEG;
12242 	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
12243 	    {
12244 	      if (neg0)
12245 		op0 = XEXP (op0, 0);
12246 	      if (neg1)
12247 		op1 = XEXP (op1, 0);
12248 	    }
12249 
12250 	  if (compound_p)
12251 	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
12252 	    cost += extra_cost->fp[mode == DFmode].fma;
12253 	  else
12254 	    /* FMUL/FNMUL.  */
12255 	    cost += extra_cost->fp[mode == DFmode].mult;
12256 	}
12257 
12258       cost += rtx_cost (op0, mode, MULT, 0, speed);
12259       cost += rtx_cost (op1, mode, MULT, 1, speed);
12260       return cost;
12261     }
12262 }
12263 
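/* Return the cost of using X as a memory address of mode MODE, in the units
   of the per-CPU address-cost tables.  The cost reflects the addressing form
   used (immediate offset, register offset, pre/post modification, ...) plus
   any scaling applied to an index register.  */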
12264 static int
12265 aarch64_address_cost (rtx x,
12266 		      machine_mode mode,
12267 		      addr_space_t as ATTRIBUTE_UNUSED,
12268 		      bool speed)
12269 {
12270   enum rtx_code c = GET_CODE (x);
12271   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
12272   struct aarch64_address_info info;
12273   int cost = 0;
12274   info.shift = 0;
12275 
12276   if (!aarch64_classify_address (&info, x, mode, false))
12277     {
12278       if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
12279 	{
12280 	  /* This is a CONST or SYMBOL ref which will be split
12281 	     in a different way depending on the code model in use.
12282 	     Cost it through the generic infrastructure.  */
12283 	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
12284 	  /* Divide through by the cost of one instruction to
12285 	     bring it to the same units as the address costs.  */
12286 	  cost_symbol_ref /= COSTS_N_INSNS (1);
12287 	  /* The cost is then the cost of preparing the address,
12288 	     followed by an immediate (possibly 0) offset.  */
12289 	  return cost_symbol_ref + addr_cost->imm_offset;
12290 	}
12291       else
12292 	{
12293 	  /* This is most likely a jump table from a case
12294 	     statement.  */
12295 	  return addr_cost->register_offset;
12296 	}
12297     }
12298 
12299   switch (info.type)
12300     {
12301       case ADDRESS_LO_SUM:
12302       case ADDRESS_SYMBOLIC:
12303       case ADDRESS_REG_IMM:
12304 	cost += addr_cost->imm_offset;
12305 	break;
12306 
12307       case ADDRESS_REG_WB:
12308 	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
12309 	  cost += addr_cost->pre_modify;
12310 	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
12311 	  {
12312 	    if (mode == CImode)
12313 	      cost += addr_cost->post_modify_ld3_st3;
12314 	    else if (mode == XImode)
12315 	      cost += addr_cost->post_modify_ld4_st4;
12316 	    else
12317 	      cost += addr_cost->post_modify;
12318 	  }
12319 	else
12320 	  gcc_unreachable ();
12321 
12322 	break;
12323 
12324       case ADDRESS_REG_REG:
12325 	cost += addr_cost->register_offset;
12326 	break;
12327 
12328       case ADDRESS_REG_SXTW:
12329 	cost += addr_cost->register_sextend;
12330 	break;
12331 
12332       case ADDRESS_REG_UXTW:
12333 	cost += addr_cost->register_zextend;
12334 	break;
12335 
12336       default:
12337 	gcc_unreachable ();
12338     }
12339 
12340 
12341   if (info.shift > 0)
12342     {
12343       /* For the sake of calculating the cost of the shifted register
12344 	 component, we can treat same sized modes in the same way.  */
12345       if (known_eq (GET_MODE_BITSIZE (mode), 16))
12346 	cost += addr_cost->addr_scale_costs.hi;
12347       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
12348 	cost += addr_cost->addr_scale_costs.si;
12349       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
12350 	cost += addr_cost->addr_scale_costs.di;
12351       else
12352 	/* We can't tell, or this is a 128-bit vector.  */
12353 	cost += addr_cost->addr_scale_costs.ti;
12354     }
12355 
12356   return cost;
12357 }
12358 
12359 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
12360    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
12361    to be taken.  */
12362 
12363 int
12364 aarch64_branch_cost (bool speed_p, bool predictable_p)
12365 {
12366   /* When optimizing for speed, use the cost of unpredictable branches.  */
12367   const struct cpu_branch_cost *branch_costs =
12368     aarch64_tune_params.branch_costs;
12369 
12370   if (!speed_p || predictable_p)
12371     return branch_costs->predictable;
12372   else
12373     return branch_costs->unpredictable;
12374 }
12375 
12376 /* Return true if X is a zero or sign extract
12377    usable in an ADD or SUB (extended register) instruction.  */
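/* For example, (zero_extend:DI (reg:SI)) used as the extended operand of a
   PLUS or MINUS can typically be implemented as an ADD/SUB with a UXTW
   register extension.  */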
12378 static bool
12379 aarch64_rtx_arith_op_extract_p (rtx x)
12380 {
12381   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
12382      No shift.  */
12383   if (GET_CODE (x) == SIGN_EXTEND
12384       || GET_CODE (x) == ZERO_EXTEND)
12385     return REG_P (XEXP (x, 0));
12386 
12387   return false;
12388 }
12389 
12390 static bool
12391 aarch64_frint_unspec_p (unsigned int u)
12392 {
12393   switch (u)
12394     {
12395       case UNSPEC_FRINTZ:
12396       case UNSPEC_FRINTP:
12397       case UNSPEC_FRINTM:
12398       case UNSPEC_FRINTA:
12399       case UNSPEC_FRINTN:
12400       case UNSPEC_FRINTX:
12401       case UNSPEC_FRINTI:
12402         return true;
12403 
12404       default:
12405         return false;
12406     }
12407 }
12408 
12409 /* Return true iff X is an rtx that will match an extr instruction
12410    i.e. as described in the *extr<mode>5_insn family of patterns.
12411    OP0 and OP1 will be set to the operands of the shifts involved
12412    on success and will be NULL_RTX otherwise.  */
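/* For example, (ior:DI (ashift:DI (reg:DI a) (const_int 16))
		 (lshiftrt:DI (reg:DI b) (const_int 48)))
   matches, with *RES_OP0 set to a and *RES_OP1 set to b; such a pattern is
   typically emitted as EXTR Xd, Xa, Xb, #48.  */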
12413 
12414 static bool
12415 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
12416 {
12417   rtx op0, op1;
12418   scalar_int_mode mode;
12419   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
12420     return false;
12421 
12422   *res_op0 = NULL_RTX;
12423   *res_op1 = NULL_RTX;
12424 
12425   if (GET_CODE (x) != IOR)
12426     return false;
12427 
12428   op0 = XEXP (x, 0);
12429   op1 = XEXP (x, 1);
12430 
12431   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
12432       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
12433     {
12434      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
12435       if (GET_CODE (op1) == ASHIFT)
12436         std::swap (op0, op1);
12437 
12438       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
12439         return false;
12440 
12441       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
12442       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
12443 
12444       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
12445           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
12446         {
12447           *res_op0 = XEXP (op0, 0);
12448           *res_op1 = XEXP (op1, 0);
12449           return true;
12450         }
12451     }
12452 
12453   return false;
12454 }
12455 
12456 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
12457    storing it in *COST.  Result is true if the total cost of the operation
12458    has now been calculated.  */
12459 static bool
12460 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
12461 {
12462   rtx inner;
12463   rtx comparator;
12464   enum rtx_code cmpcode;
12465   const struct cpu_cost_table *extra_cost
12466     = aarch64_tune_params.insn_extra_cost;
12467 
12468   if (COMPARISON_P (op0))
12469     {
12470       inner = XEXP (op0, 0);
12471       comparator = XEXP (op0, 1);
12472       cmpcode = GET_CODE (op0);
12473     }
12474   else
12475     {
12476       inner = op0;
12477       comparator = const0_rtx;
12478       cmpcode = NE;
12479     }
12480 
12481   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
12482     {
12483       /* Conditional branch.  */
12484       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
12485 	return true;
12486       else
12487 	{
12488 	  if (cmpcode == NE || cmpcode == EQ)
12489 	    {
12490 	      if (comparator == const0_rtx)
12491 		{
12492 		  /* TBZ/TBNZ/CBZ/CBNZ.  */
12493 		  if (GET_CODE (inner) == ZERO_EXTRACT)
12494 		    /* TBZ/TBNZ.  */
12495 		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
12496 				       ZERO_EXTRACT, 0, speed);
12497 		  else
12498 		    /* CBZ/CBNZ.  */
12499 		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
12500 
12501 		  return true;
12502 		}
12503 	      if (register_operand (inner, VOIDmode)
12504 		  && aarch64_imm24 (comparator, VOIDmode))
12505 		{
12506 		  /* SUB and SUBS.  */
12507 		  *cost += COSTS_N_INSNS (2);
12508 		  if (speed)
12509 		    *cost += extra_cost->alu.arith * 2;
12510 		  return true;
12511 		}
12512 	    }
12513 	  else if (cmpcode == LT || cmpcode == GE)
12514 	    {
12515 	      /* TBZ/TBNZ.  */
12516 	      if (comparator == const0_rtx)
12517 		return true;
12518 	    }
12519 	}
12520     }
12521   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
12522     {
12523       /* CCMP.  */
12524       if (GET_CODE (op1) == COMPARE)
12525 	{
12526 	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
12527 	  if (XEXP (op1, 1) == const0_rtx)
12528 	    *cost += 1;
12529 	  if (speed)
12530 	    {
12531 	      machine_mode mode = GET_MODE (XEXP (op1, 0));
12532 
12533 	      if (GET_MODE_CLASS (mode) == MODE_INT)
12534 		*cost += extra_cost->alu.arith;
12535 	      else
12536 		*cost += extra_cost->fp[mode == DFmode].compare;
12537 	    }
12538 	  return true;
12539 	}
12540 
12541       /* It's a conditional operation based on the status flags,
12542 	 so it must be some flavor of CSEL.  */
12543 
12544       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
12545       if (GET_CODE (op1) == NEG
12546           || GET_CODE (op1) == NOT
12547           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
12548 	op1 = XEXP (op1, 0);
12549       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
12550 	{
12551 	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
12552 	  op1 = XEXP (op1, 0);
12553 	  op2 = XEXP (op2, 0);
12554 	}
12555       else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
12556 	{
12557 	  inner = XEXP (op1, 0);
12558 	  if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
12559 	    /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3).  */
12560 	    op1 = XEXP (inner, 0);
12561 	}
12562 
12563       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
12564       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
12565       return true;
12566     }
12567 
12568   /* We don't know what this is, cost all operands.  */
12569   return false;
12570 }
12571 
12572 /* Check whether X is a bitfield operation of the form shift + extend that
12573    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
12574    operand to which the bitfield operation is applied.  Otherwise return
12575    NULL_RTX.  */
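/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3))) and
   (sign_extend:DI (ashift:QI (reg:QI x) (const_int 2))) both match,
   corresponding roughly to UBFX and SBFIZ respectively, and x is returned.  */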
12576 
12577 static rtx
12578 aarch64_extend_bitfield_pattern_p (rtx x)
12579 {
12580   rtx_code outer_code = GET_CODE (x);
12581   machine_mode outer_mode = GET_MODE (x);
12582 
12583   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
12584       && outer_mode != SImode && outer_mode != DImode)
12585     return NULL_RTX;
12586 
12587   rtx inner = XEXP (x, 0);
12588   rtx_code inner_code = GET_CODE (inner);
12589   machine_mode inner_mode = GET_MODE (inner);
12590   rtx op = NULL_RTX;
12591 
12592   switch (inner_code)
12593     {
12594       case ASHIFT:
12595 	if (CONST_INT_P (XEXP (inner, 1))
12596 	    && (inner_mode == QImode || inner_mode == HImode))
12597 	  op = XEXP (inner, 0);
12598 	break;
12599       case LSHIFTRT:
12600 	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
12601 	    && (inner_mode == QImode || inner_mode == HImode))
12602 	  op = XEXP (inner, 0);
12603 	break;
12604       case ASHIFTRT:
12605 	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
12606 	    && (inner_mode == QImode || inner_mode == HImode))
12607 	  op = XEXP (inner, 0);
12608 	break;
12609       default:
12610 	break;
12611     }
12612 
12613   return op;
12614 }
12615 
12616 /* Return true if the mask and a shift amount from an RTX of the form
12617    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
12618    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
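/* For example, in SImode MASK = 0x00ffff00 with SHFT_AMNT = 8 passes these
   checks ((MASK >> 8) + 1 is a power of two and the low 8 bits of MASK are
   clear), corresponding to UBFIZ Wd, Wn, #8, #16.  */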
12619 
12620 bool
12621 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
12622 				    rtx shft_amnt)
12623 {
12624   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
12625 	 && INTVAL (mask) > 0
12626 	 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
12627 	 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
12628 	 && (UINTVAL (mask)
12629 	     & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
12630 }
12631 
12632 /* Return true if the masks and a shift amount from an RTX of the form
12633    ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
12634    a BFI instruction of mode MODE.  See *aarch64_bfi patterns.  */
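/* For example, in DImode MASK2 = 0x0000ffff00000000, SHFT_AMNT = 32 and
   MASK1 = ~MASK2 satisfy these checks; the combination corresponds to
   BFI Xd, Xy, #32, #16 with x already in Xd.  */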
12635 
12636 bool
12637 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
12638 				   unsigned HOST_WIDE_INT mask1,
12639 				   unsigned HOST_WIDE_INT shft_amnt,
12640 				   unsigned HOST_WIDE_INT mask2)
12641 {
12642   unsigned HOST_WIDE_INT t;
12643 
12644   /* Verify that there is no overlap in what bits are set in the two masks.  */
12645   if (mask1 != ~mask2)
12646     return false;
12647 
12648   /* Verify that mask2 is not all zeros or ones.  */
12649   if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
12650     return false;
12651 
12652   /* The shift amount should always be less than the mode size.  */
12653   gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
12654 
12655   /* Verify that the mask being shifted is contiguous and would be in the
12656      least significant bits after shifting by shft_amnt.  */
12657   t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
12658   return (t == (t & -t));
12659 }
12660 
12661 /* Calculate the cost of calculating X, storing it in *COST.  Result
12662    is true if the total cost of the operation has now been calculated.  */
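/* For example, for (plus (reg) (const_int 16)) the PLUS case below keeps the
   single-instruction baseline and, when optimizing for speed, adds
   extra_cost->alu.arith for the ADD (immediate) form.  */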
12663 static bool
12664 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
12665 		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
12666 {
12667   rtx op0, op1, op2;
12668   const struct cpu_cost_table *extra_cost
12669     = aarch64_tune_params.insn_extra_cost;
12670   int code = GET_CODE (x);
12671   scalar_int_mode int_mode;
12672 
12673   /* By default, assume that everything has equivalent cost to the
12674      cheapest instruction.  Any additional costs are applied as a delta
12675      above this default.  */
12676   *cost = COSTS_N_INSNS (1);
12677 
12678   switch (code)
12679     {
12680     case SET:
12681       /* The cost depends entirely on the operands to SET.  */
12682       *cost = 0;
12683       op0 = SET_DEST (x);
12684       op1 = SET_SRC (x);
12685 
12686       switch (GET_CODE (op0))
12687 	{
12688 	case MEM:
12689 	  if (speed)
12690 	    {
12691 	      rtx address = XEXP (op0, 0);
12692 	      if (VECTOR_MODE_P (mode))
12693 		*cost += extra_cost->ldst.storev;
12694 	      else if (GET_MODE_CLASS (mode) == MODE_INT)
12695 		*cost += extra_cost->ldst.store;
12696 	      else if (mode == SFmode)
12697 		*cost += extra_cost->ldst.storef;
12698 	      else if (mode == DFmode)
12699 		*cost += extra_cost->ldst.stored;
12700 
12701 	      *cost +=
12702 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
12703 						     0, speed));
12704 	    }
12705 
12706 	  *cost += rtx_cost (op1, mode, SET, 1, speed);
12707 	  return true;
12708 
12709 	case SUBREG:
12710 	  if (! REG_P (SUBREG_REG (op0)))
12711 	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
12712 
12713 	  /* Fall through.  */
12714 	case REG:
12715 	  /* The cost is one per vector-register copied.  */
12716 	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
12717 	    {
12718 	      int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
12719 	      *cost = COSTS_N_INSNS (nregs);
12720 	    }
12721 	  /* const0_rtx is in general free, but we will use an
12722 	     instruction to set a register to 0.  */
12723 	  else if (REG_P (op1) || op1 == const0_rtx)
12724 	    {
12725 	      /* The cost is 1 per register copied.  */
12726 	      int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
12727 	      *cost = COSTS_N_INSNS (nregs);
12728 	    }
12729           else
12730 	    /* Cost is just the cost of the RHS of the set.  */
12731 	    *cost += rtx_cost (op1, mode, SET, 1, speed);
12732 	  return true;
12733 
12734 	case ZERO_EXTRACT:
12735 	case SIGN_EXTRACT:
12736 	  /* Bit-field insertion.  Strip any redundant widening of
12737 	     the RHS to meet the width of the target.  */
12738 	  if (GET_CODE (op1) == SUBREG)
12739 	    op1 = SUBREG_REG (op1);
12740 	  if ((GET_CODE (op1) == ZERO_EXTEND
12741 	       || GET_CODE (op1) == SIGN_EXTEND)
12742 	      && CONST_INT_P (XEXP (op0, 1))
12743 	      && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
12744 	      && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
12745 	    op1 = XEXP (op1, 0);
12746 
12747           if (CONST_INT_P (op1))
12748             {
12749               /* MOV immediate is assumed to always be cheap.  */
12750               *cost = COSTS_N_INSNS (1);
12751             }
12752           else
12753             {
12754               /* BFM.  */
12755 	      if (speed)
12756 		*cost += extra_cost->alu.bfi;
12757 	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
12758             }
12759 
12760 	  return true;
12761 
12762 	default:
12763 	  /* We can't make sense of this, assume default cost.  */
12764           *cost = COSTS_N_INSNS (1);
12765 	  return false;
12766 	}
12767       return false;
12768 
12769     case CONST_INT:
12770       /* If an instruction can incorporate a constant within the
12771 	 instruction, the instruction's expression avoids calling
12772 	 rtx_cost() on the constant.  If rtx_cost() is called on a
12773 	 constant, then it is usually because the constant must be
12774 	 moved into a register by one or more instructions.
12775 
12776 	 The exception is constant 0, which can be expressed
12777 	 as XZR/WZR and is therefore free.  The exception to this is
12778 	 if we have (set (reg) (const0_rtx)) in which case we must cost
12779 	 the move.  However, we can catch that when we cost the SET, so
12780 	 we don't need to consider that here.  */
12781       if (x == const0_rtx)
12782 	*cost = 0;
12783       else
12784 	{
12785 	  /* To an approximation, building any other constant is
12786 	     proportionally expensive to the number of instructions
12787 	     required to build that constant.  This is true whether we
12788 	     are compiling for SPEED or otherwise.  */
12789 	  if (!is_a <scalar_int_mode> (mode, &int_mode))
12790 	    int_mode = word_mode;
12791 	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
12792 				 (NULL_RTX, x, false, int_mode));
12793 	}
12794       return true;
12795 
12796     case CONST_DOUBLE:
12797 
12798       /* First determine number of instructions to do the move
12799 	  as an integer constant.  */
12800       if (!aarch64_float_const_representable_p (x)
12801 	   && !aarch64_can_const_movi_rtx_p (x, mode)
12802 	   && aarch64_float_const_rtx_p (x))
12803 	{
12804 	  unsigned HOST_WIDE_INT ival;
12805 	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12806 	  gcc_assert (succeed);
12807 
12808 	  scalar_int_mode imode = (mode == HFmode
12809 				   ? SImode
12810 				   : int_mode_for_mode (mode).require ());
12811 	  int ncost = aarch64_internal_mov_immediate
12812 		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
12813 	  *cost += COSTS_N_INSNS (ncost);
12814 	  return true;
12815 	}
12816 
12817       if (speed)
12818 	{
12819 	  /* mov[df,sf]_aarch64.  */
12820 	  if (aarch64_float_const_representable_p (x))
12821 	    /* FMOV (scalar immediate).  */
12822 	    *cost += extra_cost->fp[mode == DFmode].fpconst;
12823 	  else if (!aarch64_float_const_zero_rtx_p (x))
12824 	    {
12825 	      /* This will be a load from memory.  */
12826 	      if (mode == DFmode)
12827 		*cost += extra_cost->ldst.loadd;
12828 	      else
12829 		*cost += extra_cost->ldst.loadf;
12830 	    }
12831 	  else
12832 	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
12833 	       or MOV v0.s[0], wzr - neither of which is modeled by the
12834 	       cost tables.  Just use the default cost.  */
12835 	    {
12836 	    }
12837 	}
12838 
12839       return true;
12840 
12841     case MEM:
12842       if (speed)
12843 	{
12844 	  /* For loads we want the base cost of a load, plus an
12845 	     approximation for the additional cost of the addressing
12846 	     mode.  */
12847 	  rtx address = XEXP (x, 0);
12848 	  if (VECTOR_MODE_P (mode))
12849 	    *cost += extra_cost->ldst.loadv;
12850 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
12851 	    *cost += extra_cost->ldst.load;
12852 	  else if (mode == SFmode)
12853 	    *cost += extra_cost->ldst.loadf;
12854 	  else if (mode == DFmode)
12855 	    *cost += extra_cost->ldst.loadd;
12856 
12857 	  *cost +=
12858 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
12859 						     0, speed));
12860 	}
12861 
12862       return true;
12863 
12864     case NEG:
12865       op0 = XEXP (x, 0);
12866 
12867       if (VECTOR_MODE_P (mode))
12868 	{
12869 	  if (speed)
12870 	    {
12871 	      /* FNEG.  */
12872 	      *cost += extra_cost->vect.alu;
12873 	    }
12874 	  return false;
12875 	}
12876 
12877       if (GET_MODE_CLASS (mode) == MODE_INT)
12878 	{
12879           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12880               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12881             {
12882               /* CSETM.  */
12883 	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
12884               return true;
12885             }
12886 
12887 	  /* Cost this as SUB wzr, X.  */
12888           op0 = CONST0_RTX (mode);
12889           op1 = XEXP (x, 0);
12890           goto cost_minus;
12891         }
12892 
12893       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12894         {
12895           /* Support (neg(fma...)) as a single instruction only if
12896              sign of zeros is unimportant.  This matches the decision
12897              making in aarch64.md.  */
12898           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12899             {
12900 	      /* FNMADD.  */
12901 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
12902               return true;
12903             }
12904 	  if (GET_CODE (op0) == MULT)
12905 	    {
12906 	      /* FNMUL.  */
12907 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
12908 	      return true;
12909 	    }
12910 	  if (speed)
12911 	    /* FNEG.  */
12912 	    *cost += extra_cost->fp[mode == DFmode].neg;
12913           return false;
12914         }
12915 
12916       return false;
12917 
12918     case CLRSB:
12919     case CLZ:
12920       if (speed)
12921 	{
12922 	  if (VECTOR_MODE_P (mode))
12923 	    *cost += extra_cost->vect.alu;
12924 	  else
12925 	    *cost += extra_cost->alu.clz;
12926 	}
12927 
12928       return false;
12929 
12930     case CTZ:
12931       *cost = COSTS_N_INSNS (2);
12932 
12933       if (speed)
12934 	*cost += extra_cost->alu.clz + extra_cost->alu.rev;
12935       return false;
12936 
12937     case COMPARE:
12938       op0 = XEXP (x, 0);
12939       op1 = XEXP (x, 1);
12940 
12941       if (op1 == const0_rtx
12942 	  && GET_CODE (op0) == AND)
12943 	{
12944 	  x = op0;
12945 	  mode = GET_MODE (op0);
12946 	  goto cost_logic;
12947 	}
12948 
12949       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12950         {
12951           /* TODO: A write to the CC flags possibly costs extra, this
12952 	     needs encoding in the cost tables.  */
12953 
12954 	  mode = GET_MODE (op0);
12955           /* ANDS.  */
12956           if (GET_CODE (op0) == AND)
12957             {
12958               x = op0;
12959               goto cost_logic;
12960             }
12961 
12962           if (GET_CODE (op0) == PLUS)
12963             {
12964 	      /* ADDS (and CMN alias).  */
12965               x = op0;
12966               goto cost_plus;
12967             }
12968 
12969           if (GET_CODE (op0) == MINUS)
12970             {
12971 	      /* SUBS.  */
12972               x = op0;
12973               goto cost_minus;
12974             }
12975 
12976 	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12977 	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12978 	      && CONST_INT_P (XEXP (op0, 2)))
12979 	    {
12980 	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12981 		 Handle it here directly rather than going to cost_logic
12982 		 since we know the immediate generated for the TST is valid
12983 		 so we can avoid creating an intermediate rtx for it only
12984 		 for costing purposes.  */
12985 	      if (speed)
12986 		*cost += extra_cost->alu.logical;
12987 
12988 	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12989 				 ZERO_EXTRACT, 0, speed);
12990 	      return true;
12991 	    }
12992 
12993           if (GET_CODE (op1) == NEG)
12994             {
12995 	      /* CMN.  */
12996 	      if (speed)
12997 		*cost += extra_cost->alu.arith;
12998 
12999 	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
13000 	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
13001               return true;
13002             }
13003 
13004           /* CMP.
13005 
13006 	     Compare can freely swap the order of operands, and
13007              canonicalization puts the more complex operation first.
13008              But the integer MINUS logic expects the shift/extend
13009              operation in op1.  */
13010           if (! (REG_P (op0)
13011                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
13012           {
13013             op0 = XEXP (x, 1);
13014             op1 = XEXP (x, 0);
13015           }
13016           goto cost_minus;
13017         }
13018 
13019       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
13020         {
13021 	  /* FCMP.  */
13022 	  if (speed)
13023 	    *cost += extra_cost->fp[mode == DFmode].compare;
13024 
13025           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
13026             {
13027 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
13028               /* FCMP supports constant 0.0 for no extra cost. */
13029               return true;
13030             }
13031           return false;
13032         }
13033 
13034       if (VECTOR_MODE_P (mode))
13035 	{
13036 	  /* Vector compare.  */
13037 	  if (speed)
13038 	    *cost += extra_cost->vect.alu;
13039 
13040 	  if (aarch64_float_const_zero_rtx_p (op1))
13041 	    {
13042 	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
13043 		 cost.  */
13044 	      return true;
13045 	    }
13046 	  return false;
13047 	}
13048       return false;
13049 
13050     case MINUS:
13051       {
13052 	op0 = XEXP (x, 0);
13053 	op1 = XEXP (x, 1);
13054 
13055 cost_minus:
13056 	*cost += rtx_cost (op0, mode, MINUS, 0, speed);
13057 
13058 	/* Detect valid immediates.  */
13059 	if ((GET_MODE_CLASS (mode) == MODE_INT
13060 	     || (GET_MODE_CLASS (mode) == MODE_CC
13061 		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
13062 	    && CONST_INT_P (op1)
13063 	    && aarch64_uimm12_shift (INTVAL (op1)))
13064 	  {
13065 	    if (speed)
13066 	      /* SUB(S) (immediate).  */
13067 	      *cost += extra_cost->alu.arith;
13068 	    return true;
13069 	  }
13070 
13071 	/* Look for SUB (extended register).  */
13072 	if (is_a <scalar_int_mode> (mode)
13073 	    && aarch64_rtx_arith_op_extract_p (op1))
13074 	  {
13075 	    if (speed)
13076 	      *cost += extra_cost->alu.extend_arith;
13077 
13078 	    op1 = aarch64_strip_extend (op1, true);
13079 	    *cost += rtx_cost (op1, VOIDmode,
13080 			       (enum rtx_code) GET_CODE (op1), 0, speed);
13081 	    return true;
13082 	  }
13083 
13084 	rtx new_op1 = aarch64_strip_extend (op1, false);
13085 
13086 	/* Cost this as an FMA-alike operation.  */
13087 	if ((GET_CODE (new_op1) == MULT
13088 	     || aarch64_shift_p (GET_CODE (new_op1)))
13089 	    && code != COMPARE)
13090 	  {
13091 	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
13092 					    (enum rtx_code) code,
13093 					    speed);
13094 	    return true;
13095 	  }
13096 
13097 	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
13098 
13099 	if (speed)
13100 	  {
13101 	    if (VECTOR_MODE_P (mode))
13102 	      {
13103 		/* Vector SUB.  */
13104 		*cost += extra_cost->vect.alu;
13105 	      }
13106 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
13107 	      {
13108 		/* SUB(S).  */
13109 		*cost += extra_cost->alu.arith;
13110 	      }
13111 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13112 	      {
13113 		/* FSUB.  */
13114 		*cost += extra_cost->fp[mode == DFmode].addsub;
13115 	      }
13116 	  }
13117 	return true;
13118       }
13119 
13120     case PLUS:
13121       {
13122 	rtx new_op0;
13123 
13124 	op0 = XEXP (x, 0);
13125 	op1 = XEXP (x, 1);
13126 
13127 cost_plus:
13128 	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
13129 	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
13130 	  {
13131 	    /* CSINC.  */
13132 	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
13133 	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
13134 	    return true;
13135 	  }
13136 
13137 	if (GET_MODE_CLASS (mode) == MODE_INT
13138 	    && (aarch64_plus_immediate (op1, mode)
13139 		|| aarch64_sve_addvl_addpl_immediate (op1, mode)))
13140 	  {
13141 	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);
13142 
13143 	    if (speed)
13144 	      {
13145 		/* ADD (immediate).  */
13146 		*cost += extra_cost->alu.arith;
13147 
13148 		/* Some tunings prefer to not use the VL-based scalar ops.
13149 		   Increase the cost of the poly immediate to prevent their
13150 		   formation.  */
13151 		if (GET_CODE (op1) == CONST_POLY_INT
13152 		    && (aarch64_tune_params.extra_tuning_flags
13153 			& AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
13154 		  *cost += COSTS_N_INSNS (1);
13155 	      }
13156 	    return true;
13157 	  }
13158 
13159 	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
13160 
13161 	/* Look for ADD (extended register).  */
13162 	if (is_a <scalar_int_mode> (mode)
13163 	    && aarch64_rtx_arith_op_extract_p (op0))
13164 	  {
13165 	    if (speed)
13166 	      *cost += extra_cost->alu.extend_arith;
13167 
13168 	    op0 = aarch64_strip_extend (op0, true);
13169 	    *cost += rtx_cost (op0, VOIDmode,
13170 			       (enum rtx_code) GET_CODE (op0), 0, speed);
13171 	    return true;
13172 	  }
13173 
13174 	/* Strip any extend, leave shifts behind as we will
13175 	   cost them through mult_cost.  */
13176 	new_op0 = aarch64_strip_extend (op0, false);
13177 
13178 	if (GET_CODE (new_op0) == MULT
13179 	    || aarch64_shift_p (GET_CODE (new_op0)))
13180 	  {
13181 	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
13182 					    speed);
13183 	    return true;
13184 	  }
13185 
13186 	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
13187 
13188 	if (speed)
13189 	  {
13190 	    if (VECTOR_MODE_P (mode))
13191 	      {
13192 		/* Vector ADD.  */
13193 		*cost += extra_cost->vect.alu;
13194 	      }
13195 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
13196 	      {
13197 		/* ADD.  */
13198 		*cost += extra_cost->alu.arith;
13199 	      }
13200 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13201 	      {
13202 		/* FADD.  */
13203 		*cost += extra_cost->fp[mode == DFmode].addsub;
13204 	      }
13205 	  }
13206 	return true;
13207       }
13208 
13209     case BSWAP:
13210       *cost = COSTS_N_INSNS (1);
13211 
13212       if (speed)
13213 	{
13214 	  if (VECTOR_MODE_P (mode))
13215 	    *cost += extra_cost->vect.alu;
13216 	  else
13217 	    *cost += extra_cost->alu.rev;
13218 	}
13219       return false;
13220 
13221     case IOR:
13222       if (aarch_rev16_p (x))
13223         {
13224           *cost = COSTS_N_INSNS (1);
13225 
13226 	  if (speed)
13227 	    {
13228 	      if (VECTOR_MODE_P (mode))
13229 		*cost += extra_cost->vect.alu;
13230 	      else
13231 		*cost += extra_cost->alu.rev;
13232 	    }
13233 	  return true;
13234         }
13235 
13236       if (aarch64_extr_rtx_p (x, &op0, &op1))
13237         {
13238 	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
13239 	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
13240           if (speed)
13241             *cost += extra_cost->alu.shift;
13242 
13243           return true;
13244         }
13245     /* Fall through.  */
13246     case XOR:
13247     case AND:
13248     cost_logic:
13249       op0 = XEXP (x, 0);
13250       op1 = XEXP (x, 1);
13251 
13252       if (VECTOR_MODE_P (mode))
13253 	{
13254 	  if (speed)
13255 	    *cost += extra_cost->vect.alu;
13256 	  return true;
13257 	}
13258 
13259       if (code == AND
13260           && GET_CODE (op0) == MULT
13261           && CONST_INT_P (XEXP (op0, 1))
13262           && CONST_INT_P (op1)
13263           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
13264                                INTVAL (op1)) != 0)
13265         {
13266           /* This is a UBFM/SBFM.  */
13267 	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
13268 	  if (speed)
13269 	    *cost += extra_cost->alu.bfx;
13270           return true;
13271         }
13272 
13273       if (is_int_mode (mode, &int_mode))
13274 	{
13275 	  if (CONST_INT_P (op1))
13276 	    {
13277 	      /* We have a mask + shift version of a UBFIZ
13278 		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
13279 	      if (GET_CODE (op0) == ASHIFT
13280 		  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
13281 							 XEXP (op0, 1)))
13282 		{
13283 		  *cost += rtx_cost (XEXP (op0, 0), int_mode,
13284 				     (enum rtx_code) code, 0, speed);
13285 		  if (speed)
13286 		    *cost += extra_cost->alu.bfx;
13287 
13288 		  return true;
13289 		}
13290 	      else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
13291 		{
13292 		/* We possibly get the immediate for free, this is not
13293 		   modelled.  */
13294 		  *cost += rtx_cost (op0, int_mode,
13295 				     (enum rtx_code) code, 0, speed);
13296 		  if (speed)
13297 		    *cost += extra_cost->alu.logical;
13298 
13299 		  return true;
13300 		}
13301 	    }
13302 	  else
13303 	    {
13304 	      rtx new_op0 = op0;
13305 
13306 	      /* Handle ORN, EON, or BIC.  */
13307 	      if (GET_CODE (op0) == NOT)
13308 		op0 = XEXP (op0, 0);
13309 
13310 	      new_op0 = aarch64_strip_shift (op0);
13311 
13312 	      /* If we had a shift on op0 then this is a logical-shift-
13313 		 by-register/immediate operation.  Otherwise, this is just
13314 		 a logical operation.  */
13315 	      if (speed)
13316 		{
13317 		  if (new_op0 != op0)
13318 		    {
13319 		      /* Shift by immediate.  */
13320 		      if (CONST_INT_P (XEXP (op0, 1)))
13321 			*cost += extra_cost->alu.log_shift;
13322 		      else
13323 			*cost += extra_cost->alu.log_shift_reg;
13324 		    }
13325 		  else
13326 		    *cost += extra_cost->alu.logical;
13327 		}
13328 
13329 	      /* In both cases we want to cost both operands.  */
13330 	      *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
13331 				 0, speed);
13332 	      *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
13333 				 1, speed);
13334 
13335 	      return true;
13336 	    }
13337 	}
13338       return false;
13339 
13340     case NOT:
13341       x = XEXP (x, 0);
13342       op0 = aarch64_strip_shift (x);
13343 
13344       if (VECTOR_MODE_P (mode))
13345 	{
13346 	  /* Vector NOT.  */
13347 	  *cost += extra_cost->vect.alu;
13348 	  return false;
13349 	}
13350 
13351       /* MVN-shifted-reg.  */
13352       if (op0 != x)
13353         {
13354 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
13355 
13356           if (speed)
13357             *cost += extra_cost->alu.log_shift;
13358 
13359           return true;
13360         }
13361       /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
13362          Handle the second form here, taking care that 'a' in the above can
13363          be a shift.  */
13364       else if (GET_CODE (op0) == XOR)
13365         {
13366           rtx newop0 = XEXP (op0, 0);
13367           rtx newop1 = XEXP (op0, 1);
13368           rtx op0_stripped = aarch64_strip_shift (newop0);
13369 
13370 	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
13371 	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
13372 
13373           if (speed)
13374             {
13375               if (op0_stripped != newop0)
13376                 *cost += extra_cost->alu.log_shift;
13377               else
13378                 *cost += extra_cost->alu.logical;
13379             }
13380 
13381           return true;
13382         }
13383       /* MVN.  */
13384       if (speed)
13385 	*cost += extra_cost->alu.logical;
13386 
13387       return false;
13388 
13389     case ZERO_EXTEND:
13390 
13391       op0 = XEXP (x, 0);
13392       /* If a value is written in SI mode, then zero extended to DI
13393 	 mode, the operation will in general be free as a write to
13394 	 a 'w' register implicitly zeroes the upper bits of an 'x'
13395 	 register.  However, if this is
13396 
13397 	   (set (reg) (zero_extend (reg)))
13398 
13399 	 we must cost the explicit register move.  */
13400       if (mode == DImode
13401 	  && GET_MODE (op0) == SImode
13402 	  && outer == SET)
13403 	{
13404 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
13405 
13406 	/* If OP_COST is non-zero, then the cost of the zero extend
13407 	   is effectively the cost of the inner operation.  Otherwise
13408 	   we have a MOV instruction and we take the cost from the MOV
13409 	   itself.  This is true independently of whether we are
13410 	   optimizing for space or time.  */
13411 	  if (op_cost)
13412 	    *cost = op_cost;
13413 
13414 	  return true;
13415 	}
13416       else if (MEM_P (op0))
13417 	{
13418 	  /* All loads can zero extend to any size for free.  */
13419 	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
13420 	  return true;
13421 	}
13422 
13423       op0 = aarch64_extend_bitfield_pattern_p (x);
13424       if (op0)
13425 	{
13426 	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
13427 	  if (speed)
13428 	    *cost += extra_cost->alu.bfx;
13429 	  return true;
13430 	}
13431 
13432       if (speed)
13433 	{
13434 	  if (VECTOR_MODE_P (mode))
13435 	    {
13436 	      /* UMOV.  */
13437 	      *cost += extra_cost->vect.alu;
13438 	    }
13439 	  else
13440 	    {
13441 	      /* We generate an AND instead of UXTB/UXTH.  */
13442 	      *cost += extra_cost->alu.logical;
13443 	    }
13444 	}
13445       return false;
13446 
13447     case SIGN_EXTEND:
13448       if (MEM_P (XEXP (x, 0)))
13449 	{
13450 	  /* LDRSH.  */
13451 	  if (speed)
13452 	    {
13453 	      rtx address = XEXP (XEXP (x, 0), 0);
13454 	      *cost += extra_cost->ldst.load_sign_extend;
13455 
13456 	      *cost +=
13457 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
13458 						     0, speed));
13459 	    }
13460 	  return true;
13461 	}
13462 
13463       op0 = aarch64_extend_bitfield_pattern_p (x);
13464       if (op0)
13465 	{
13466 	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
13467 	  if (speed)
13468 	    *cost += extra_cost->alu.bfx;
13469 	  return true;
13470 	}
13471 
13472       if (speed)
13473 	{
13474 	  if (VECTOR_MODE_P (mode))
13475 	    *cost += extra_cost->vect.alu;
13476 	  else
13477 	    *cost += extra_cost->alu.extend;
13478 	}
13479       return false;
13480 
13481     case ASHIFT:
13482       op0 = XEXP (x, 0);
13483       op1 = XEXP (x, 1);
13484 
13485       if (CONST_INT_P (op1))
13486         {
13487 	  if (speed)
13488 	    {
13489 	      if (VECTOR_MODE_P (mode))
13490 		{
13491 		  /* Vector shift (immediate).  */
13492 		  *cost += extra_cost->vect.alu;
13493 		}
13494 	      else
13495 		{
13496 		  /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
13497 		     aliases.  */
13498 		  *cost += extra_cost->alu.shift;
13499 		}
13500 	    }
13501 
13502           /* We can incorporate zero/sign extend for free.  */
13503           if (GET_CODE (op0) == ZERO_EXTEND
13504               || GET_CODE (op0) == SIGN_EXTEND)
13505             op0 = XEXP (op0, 0);
13506 
13507 	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
13508           return true;
13509         }
13510       else
13511         {
13512 	  if (VECTOR_MODE_P (mode))
13513 	    {
13514 	      if (speed)
13515 		/* Vector shift (register).  */
13516 		*cost += extra_cost->vect.alu;
13517 	    }
13518 	  else
13519 	    {
13520 	      if (speed)
13521 		/* LSLV.  */
13522 		*cost += extra_cost->alu.shift_reg;
13523 
13524 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
13525 		  && CONST_INT_P (XEXP (op1, 1))
13526 		  && known_eq (INTVAL (XEXP (op1, 1)),
13527 			       GET_MODE_BITSIZE (mode) - 1))
13528 		{
13529 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
13530 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
13531 		     don't recurse into it.  */
13532 		  return true;
13533 		}
13534 	    }
13535 	  return false;  /* All arguments need to be in registers.  */
13536         }
13537 
13538     case ROTATE:
13539     case ROTATERT:
13540     case LSHIFTRT:
13541     case ASHIFTRT:
13542       op0 = XEXP (x, 0);
13543       op1 = XEXP (x, 1);
13544 
13545       if (CONST_INT_P (op1))
13546 	{
13547 	  /* ASR (immediate) and friends.  */
13548 	  if (speed)
13549 	    {
13550 	      if (VECTOR_MODE_P (mode))
13551 		*cost += extra_cost->vect.alu;
13552 	      else
13553 		*cost += extra_cost->alu.shift;
13554 	    }
13555 
13556 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
13557 	  return true;
13558 	}
13559       else
13560 	{
13561 	  if (VECTOR_MODE_P (mode))
13562 	    {
13563 	      if (speed)
13564 		/* Vector shift (register).  */
13565 		*cost += extra_cost->vect.alu;
13566 	    }
13567 	  else
13568 	    {
13569 	      if (speed)
13570 		/* ASR (register) and friends.  */
13571 		*cost += extra_cost->alu.shift_reg;
13572 
13573 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
13574 		  && CONST_INT_P (XEXP (op1, 1))
13575 		  && known_eq (INTVAL (XEXP (op1, 1)),
13576 			       GET_MODE_BITSIZE (mode) - 1))
13577 		{
13578 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
13579 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
13580 		     don't recurse into it.  */
13581 		  return true;
13582 		}
13583 	    }
13584 	  return false;  /* All arguments need to be in registers.  */
13585 	}
13586 
13587     case SYMBOL_REF:
13588 
13589       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
13590 	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
13591 	{
13592 	  /* LDR.  */
13593 	  if (speed)
13594 	    *cost += extra_cost->ldst.load;
13595 	}
13596       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
13597 	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
13598 	{
13599 	  /* ADRP, followed by ADD.  */
13600 	  *cost += COSTS_N_INSNS (1);
13601 	  if (speed)
13602 	    *cost += 2 * extra_cost->alu.arith;
13603 	}
13604       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
13605 	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13606 	{
13607 	  /* ADR.  */
13608 	  if (speed)
13609 	    *cost += extra_cost->alu.arith;
13610 	}
13611 
13612       if (flag_pic)
13613 	{
13614 	  /* One extra load instruction, after accessing the GOT.  */
13615 	  *cost += COSTS_N_INSNS (1);
13616 	  if (speed)
13617 	    *cost += extra_cost->ldst.load;
13618 	}
13619       return true;
13620 
13621     case HIGH:
13622     case LO_SUM:
13623       /* ADRP/ADD (immediate).  */
13624       if (speed)
13625 	*cost += extra_cost->alu.arith;
13626       return true;
13627 
13628     case ZERO_EXTRACT:
13629     case SIGN_EXTRACT:
13630       /* UBFX/SBFX.  */
13631       if (speed)
13632 	{
13633 	  if (VECTOR_MODE_P (mode))
13634 	    *cost += extra_cost->vect.alu;
13635 	  else
13636 	    *cost += extra_cost->alu.bfx;
13637 	}
13638 
13639       /* We can trust that the immediates used will be correct (there
13640 	 are no by-register forms), so we need only cost op0.  */
13641       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
13642       return true;
13643 
13644     case MULT:
13645       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
13646       /* aarch64_rtx_mult_cost always handles recursion to its
13647 	 operands.  */
13648       return true;
13649 
13650     case MOD:
13651     /* We can expand signed mod by power of 2 using a NEGS, two parallel
13652        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
13653        an unconditional negate.  This case should only ever be reached through
13654        the set_smod_pow2_cheap check in expmed.c.  */
13655       if (CONST_INT_P (XEXP (x, 1))
13656 	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
13657 	  && (mode == SImode || mode == DImode))
13658 	{
13659 	  /* We expand to 4 instructions.  Reset the baseline.  */
13660 	  *cost = COSTS_N_INSNS (4);
13661 
13662 	  if (speed)
13663 	    *cost += 2 * extra_cost->alu.logical
13664 		     + 2 * extra_cost->alu.arith;
13665 
13666 	  return true;
13667 	}
13668 
13669     /* Fall-through.  */
13670     case UMOD:
13671       if (speed)
13672 	{
13673 	  /* Slightly prefer UMOD over SMOD.  */
13674 	  if (VECTOR_MODE_P (mode))
13675 	    *cost += extra_cost->vect.alu;
13676 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
13677 	    *cost += (extra_cost->mult[mode == DImode].add
13678 		      + extra_cost->mult[mode == DImode].idiv
13679 		      + (code == MOD ? 1 : 0));
13680 	}
13681       return false;  /* All arguments need to be in registers.  */
13682 
13683     case DIV:
13684     case UDIV:
13685     case SQRT:
13686       if (speed)
13687 	{
13688 	  if (VECTOR_MODE_P (mode))
13689 	    *cost += extra_cost->vect.alu;
13690 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
13691 	    /* There is no integer SQRT, so only DIV and UDIV can get
13692 	       here.  */
13693 	    *cost += (extra_cost->mult[mode == DImode].idiv
13694 		     /* Slightly prefer UDIV over SDIV.  */
13695 		     + (code == DIV ? 1 : 0));
13696 	  else
13697 	    *cost += extra_cost->fp[mode == DFmode].div;
13698 	}
13699       return false;  /* All arguments need to be in registers.  */
13700 
13701     case IF_THEN_ELSE:
13702       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
13703 					 XEXP (x, 2), cost, speed);
13704 
13705     case EQ:
13706     case NE:
13707     case GT:
13708     case GTU:
13709     case LT:
13710     case LTU:
13711     case GE:
13712     case GEU:
13713     case LE:
13714     case LEU:
13715 
13716       return false; /* All arguments must be in registers.  */
13717 
13718     case FMA:
13719       op0 = XEXP (x, 0);
13720       op1 = XEXP (x, 1);
13721       op2 = XEXP (x, 2);
13722 
13723       if (speed)
13724 	{
13725 	  if (VECTOR_MODE_P (mode))
13726 	    *cost += extra_cost->vect.alu;
13727 	  else
13728 	    *cost += extra_cost->fp[mode == DFmode].fma;
13729 	}
13730 
13731       /* FMSUB, FNMADD, and FNMSUB are free.  */
13732       if (GET_CODE (op0) == NEG)
13733         op0 = XEXP (op0, 0);
13734 
13735       if (GET_CODE (op2) == NEG)
13736         op2 = XEXP (op2, 0);
13737 
13738       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
13739 	 and the by-element operand as operand 0.  */
13740       if (GET_CODE (op1) == NEG)
13741         op1 = XEXP (op1, 0);
13742 
13743       /* Catch vector-by-element operations.  The by-element operand can
13744 	 either be (vec_duplicate (vec_select (x))) or just
13745 	 (vec_select (x)), depending on whether we are multiplying by
13746 	 a vector or a scalar.
13747 
13748 	 Canonicalization is not very good in these cases, FMA4 will put the
13749 	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
13750       if (GET_CODE (op0) == VEC_DUPLICATE)
13751 	op0 = XEXP (op0, 0);
13752       else if (GET_CODE (op1) == VEC_DUPLICATE)
13753 	op1 = XEXP (op1, 0);
13754 
13755       if (GET_CODE (op0) == VEC_SELECT)
13756 	op0 = XEXP (op0, 0);
13757       else if (GET_CODE (op1) == VEC_SELECT)
13758 	op1 = XEXP (op1, 0);
13759 
13760       /* If the remaining parameters are not registers,
13761          get the cost to put them into registers.  */
13762       *cost += rtx_cost (op0, mode, FMA, 0, speed);
13763       *cost += rtx_cost (op1, mode, FMA, 1, speed);
13764       *cost += rtx_cost (op2, mode, FMA, 2, speed);
13765       return true;
13766 
13767     case FLOAT:
13768     case UNSIGNED_FLOAT:
13769       if (speed)
13770 	*cost += extra_cost->fp[mode == DFmode].fromint;
13771       return false;
13772 
13773     case FLOAT_EXTEND:
13774       if (speed)
13775 	{
13776 	  if (VECTOR_MODE_P (mode))
13777 	    {
13778 	      /* Vector widening conversion.  */
13779 	      *cost += extra_cost->vect.alu;
13780 	    }
13781 	  else
13782 	    *cost += extra_cost->fp[mode == DFmode].widen;
13783 	}
13784       return false;
13785 
13786     case FLOAT_TRUNCATE:
13787       if (speed)
13788 	{
13789 	  if (VECTOR_MODE_P (mode))
13790 	    {
13791 	      /* Vector conversion.  */
13792 	      *cost += extra_cost->vect.alu;
13793 	    }
13794 	  else
13795 	    *cost += extra_cost->fp[mode == DFmode].narrow;
13796 	}
13797       return false;
13798 
13799     case FIX:
13800     case UNSIGNED_FIX:
13801       x = XEXP (x, 0);
13802       /* Strip the rounding part.  They will all be implemented
13803          by the fcvt* family of instructions anyway.  */
13804       if (GET_CODE (x) == UNSPEC)
13805         {
13806           unsigned int uns_code = XINT (x, 1);
13807 
13808           if (uns_code == UNSPEC_FRINTA
13809               || uns_code == UNSPEC_FRINTM
13810               || uns_code == UNSPEC_FRINTN
13811               || uns_code == UNSPEC_FRINTP
13812               || uns_code == UNSPEC_FRINTZ)
13813             x = XVECEXP (x, 0, 0);
13814         }
13815 
13816       if (speed)
13817 	{
13818 	  if (VECTOR_MODE_P (mode))
13819 	    *cost += extra_cost->vect.alu;
13820 	  else
13821 	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13822 	}
13823 
13824       /* We can combine fmul by a power of 2 followed by a fcvt into a single
13825 	 fixed-point fcvt.  */
13826       if (GET_CODE (x) == MULT
13827 	  && ((VECTOR_MODE_P (mode)
13828 	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13829 	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13830 	{
13831 	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13832 			     0, speed);
13833 	  return true;
13834 	}
13835 
13836       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
13837       return true;
13838 
13839     case ABS:
13840       if (VECTOR_MODE_P (mode))
13841 	{
13842 	  /* ABS (vector).  */
13843 	  if (speed)
13844 	    *cost += extra_cost->vect.alu;
13845 	}
13846       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13847 	{
13848 	  op0 = XEXP (x, 0);
13849 
13850 	  /* FABD, which is analogous to FADD.  */
13851 	  if (GET_CODE (op0) == MINUS)
13852 	    {
13853 	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13854 	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
13855 	      if (speed)
13856 		*cost += extra_cost->fp[mode == DFmode].addsub;
13857 
13858 	      return true;
13859 	    }
13860 	  /* Simple FABS is analogous to FNEG.  */
13861 	  if (speed)
13862 	    *cost += extra_cost->fp[mode == DFmode].neg;
13863 	}
13864       else
13865 	{
13866 	  /* Integer ABS will either be split to
13867 	     two arithmetic instructions, or will be an ABS
13868 	     (scalar), which we don't model.  */
13869 	  *cost = COSTS_N_INSNS (2);
13870 	  if (speed)
13871 	    *cost += 2 * extra_cost->alu.arith;
13872 	}
13873       return false;
13874 
13875     case SMAX:
13876     case SMIN:
13877       if (speed)
13878 	{
13879 	  if (VECTOR_MODE_P (mode))
13880 	    *cost += extra_cost->vect.alu;
13881 	  else
13882 	    {
13883 	      /* FMAXNM/FMINNM/FMAX/FMIN.
13884 	         TODO: This may not be accurate for all implementations, but
13885 	         we do not model this in the cost tables.  */
13886 	      *cost += extra_cost->fp[mode == DFmode].addsub;
13887 	    }
13888 	}
13889       return false;
13890 
13891     case UNSPEC:
13892       /* The floating point round to integer frint* instructions.  */
13893       if (aarch64_frint_unspec_p (XINT (x, 1)))
13894         {
13895           if (speed)
13896             *cost += extra_cost->fp[mode == DFmode].roundint;
13897 
13898           return false;
13899         }
13900 
13901       if (XINT (x, 1) == UNSPEC_RBIT)
13902         {
13903           if (speed)
13904             *cost += extra_cost->alu.rev;
13905 
13906           return false;
13907         }
13908       break;
13909 
13910     case TRUNCATE:
13911 
13912       /* Decompose <su>muldi3_highpart.  */
13913       if (/* (truncate:DI  */
13914 	  mode == DImode
13915 	  /*   (lshiftrt:TI  */
13916           && GET_MODE (XEXP (x, 0)) == TImode
13917           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13918 	  /*      (mult:TI  */
13919           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13920 	  /*        (ANY_EXTEND:TI (reg:DI))
13921 	            (ANY_EXTEND:TI (reg:DI)))  */
13922           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13923                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13924               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13925                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13926           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13927           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13928 	  /*     (const_int 64)  */
13929           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13930           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13931         {
13932           /* UMULH/SMULH.  */
13933 	  if (speed)
13934 	    *cost += extra_cost->mult[mode == DImode].extend;
13935 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13936 			     mode, MULT, 0, speed);
13937 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13938 			     mode, MULT, 1, speed);
13939           return true;
13940         }
13941 
13942       /* Fall through.  */
13943     default:
13944       break;
13945     }
13946 
13947   if (dump_file
13948       && flag_aarch64_verbose_cost)
13949     fprintf (dump_file,
13950       "\nFailed to cost RTX.  Assuming default cost.\n");
13951 
13952   return true;
13953 }
13954 
13955 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
13956    calculated for X.  This cost is stored in *COST.  Returns true
13957    if the total cost of X was calculated.  */
13958 static bool
13959 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
13960 		   int param, int *cost, bool speed)
13961 {
13962   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
13963 
13964   if (dump_file
13965       && flag_aarch64_verbose_cost)
13966     {
13967       print_rtl_single (dump_file, x);
13968       fprintf (dump_file, "\n%s cost: %d (%s)\n",
13969 	       speed ? "Hot" : "Cold",
13970 	       *cost, result ? "final" : "partial");
13971     }
13972 
13973   return result;
13974 }
13975 
13976 static int
13977 aarch64_register_move_cost (machine_mode mode,
13978 			    reg_class_t from_i, reg_class_t to_i)
13979 {
13980   enum reg_class from = (enum reg_class) from_i;
13981   enum reg_class to = (enum reg_class) to_i;
13982   const struct cpu_regmove_cost *regmove_cost
13983     = aarch64_tune_params.regmove_cost;
13984 
13985   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
13986   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13987       || to == STUB_REGS)
13988     to = GENERAL_REGS;
13989 
13990   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13991       || from == STUB_REGS)
13992     from = GENERAL_REGS;
13993 
13994   /* Make RDFFR very expensive.  In particular, if we know that the FFR
13995      contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13996      as a way of obtaining a PTRUE.  */
13997   if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13998       && hard_reg_set_subset_p (reg_class_contents[from_i],
13999 				reg_class_contents[FFR_REGS]))
14000     return 80;
14001 
14002   /* Moving between a GPR and the stack costs the same as GP2GP.  */
14003   if ((from == GENERAL_REGS && to == STACK_REG)
14004       || (to == GENERAL_REGS && from == STACK_REG))
14005     return regmove_cost->GP2GP;
14006 
14007   /* To/From the stack register, we move via the gprs.  */
14008   if (to == STACK_REG || from == STACK_REG)
14009     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
14010             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
14011 
14012   if (known_eq (GET_MODE_SIZE (mode), 16))
14013     {
14014       /* 128-bit operations on general registers require 2 instructions.  */
14015       if (from == GENERAL_REGS && to == GENERAL_REGS)
14016 	return regmove_cost->GP2GP * 2;
14017       else if (from == GENERAL_REGS)
14018 	return regmove_cost->GP2FP * 2;
14019       else if (to == GENERAL_REGS)
14020 	return regmove_cost->FP2GP * 2;
14021 
14022       /* When AdvSIMD instructions are disabled, it is not possible to move
14023 	 a 128-bit value directly between Q registers.  This is handled in
14024 	 secondary reload.  A general register is used as a scratch to move
14025 	 the upper DI value and the lower DI value is moved directly,
14026 	 hence the cost is the sum of three moves.  */
14027       if (! TARGET_SIMD)
14028 	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
14029 
14030       return regmove_cost->FP2FP;
14031     }
14032 
14033   if (from == GENERAL_REGS && to == GENERAL_REGS)
14034     return regmove_cost->GP2GP;
14035   else if (from == GENERAL_REGS)
14036     return regmove_cost->GP2FP;
14037   else if (to == GENERAL_REGS)
14038     return regmove_cost->FP2GP;
14039 
14040   return regmove_cost->FP2FP;
14041 }
14042 
14043 static int
14044 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
14045 			  reg_class_t rclass ATTRIBUTE_UNUSED,
14046 			  bool in ATTRIBUTE_UNUSED)
14047 {
14048   return aarch64_tune_params.memmov_cost;
14049 }
14050 
14051 /* Implement TARGET_INIT_BUILTINS.  */
14052 static void
14053 aarch64_init_builtins ()
14054 {
14055   aarch64_general_init_builtins ();
14056   aarch64_sve::init_builtins ();
14057 #ifdef SUBTARGET_INIT_BUILTINS
14058   SUBTARGET_INIT_BUILTINS;
14059 #endif
14060 }
14061 
14062 /* Implement TARGET_FOLD_BUILTIN.  */
14063 static tree
14064 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
14065 {
14066   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14067   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14068   tree type = TREE_TYPE (TREE_TYPE (fndecl));
14069   switch (code & AARCH64_BUILTIN_CLASS)
14070     {
14071     case AARCH64_BUILTIN_GENERAL:
14072       return aarch64_general_fold_builtin (subcode, type, nargs, args);
14073 
14074     case AARCH64_BUILTIN_SVE:
14075       return NULL_TREE;
14076     }
14077   gcc_unreachable ();
14078 }
14079 
14080 /* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
14081 static bool
14082 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
14083 {
14084   gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
14085   tree fndecl = gimple_call_fndecl (stmt);
14086   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14087   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14088   gimple *new_stmt = NULL;
14089   switch (code & AARCH64_BUILTIN_CLASS)
14090     {
14091     case AARCH64_BUILTIN_GENERAL:
14092       new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
14093       break;
14094 
14095     case AARCH64_BUILTIN_SVE:
14096       new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
14097       break;
14098     }
14099 
14100   if (!new_stmt)
14101     return false;
14102 
14103   gsi_replace (gsi, new_stmt, true);
14104   return true;
14105 }
14106 
14107 /* Implement TARGET_EXPAND_BUILTIN.  */
14108 static rtx
14109 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
14110 {
14111   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
14112   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14113   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14114   switch (code & AARCH64_BUILTIN_CLASS)
14115     {
14116     case AARCH64_BUILTIN_GENERAL:
14117       return aarch64_general_expand_builtin (subcode, exp, target, ignore);
14118 
14119     case AARCH64_BUILTIN_SVE:
14120       return aarch64_sve::expand_builtin (subcode, exp, target);
14121     }
14122   gcc_unreachable ();
14123 }
14124 
14125 /* Implement TARGET_BUILTIN_DECL.  */
14126 static tree
14127 aarch64_builtin_decl (unsigned int code, bool initialize_p)
14128 {
14129   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14130   switch (code & AARCH64_BUILTIN_CLASS)
14131     {
14132     case AARCH64_BUILTIN_GENERAL:
14133       return aarch64_general_builtin_decl (subcode, initialize_p);
14134 
14135     case AARCH64_BUILTIN_SVE:
14136       return aarch64_sve::builtin_decl (subcode, initialize_p);
14137     }
14138   gcc_unreachable ();
14139 }
14140 
14141 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
14142    to optimize 1.0/sqrt.  */
14143 
14144 static bool
14145 use_rsqrt_p (machine_mode mode)
14146 {
14147   return (!flag_trapping_math
14148 	  && flag_unsafe_math_optimizations
14149 	  && ((aarch64_tune_params.approx_modes->recip_sqrt
14150 	       & AARCH64_APPROX_MODE (mode))
14151 	      || flag_mrecip_low_precision_sqrt));
14152 }
14153 
14154 /* Function to decide when to use the approximate reciprocal square root
14155    builtin.  */
14156 
14157 static tree
14158 aarch64_builtin_reciprocal (tree fndecl)
14159 {
14160   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
14161 
14162   if (!use_rsqrt_p (mode))
14163     return NULL_TREE;
14164   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14165   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14166   switch (code & AARCH64_BUILTIN_CLASS)
14167     {
14168     case AARCH64_BUILTIN_GENERAL:
14169       return aarch64_general_builtin_rsqrt (subcode);
14170 
14171     case AARCH64_BUILTIN_SVE:
14172       return NULL_TREE;
14173     }
14174   gcc_unreachable ();
14175 }
14176 
14177 /* Emit code to perform the floating-point operation:
14178 
14179      DST = SRC1 * SRC2
14180 
14181    where all three operands are already known to be registers.
14182    If the operation is an SVE one, PTRUE is a suitable all-true
14183    predicate.  */
14184 
14185 static void
14186 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
14187 {
14188   if (ptrue)
14189     emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
14190 				 dst, ptrue, src1, src2,
14191 				 gen_int_mode (SVE_RELAXED_GP, SImode)));
14192   else
14193     emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
14194 }
14195 
14196 /* Emit instruction sequence to compute either the approximate square root
14197    or its approximate reciprocal, depending on the flag RECP, and return
14198    whether the sequence was emitted or not.  */
14199 
14200 bool
14201 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
14202 {
14203   machine_mode mode = GET_MODE (dst);
14204 
14205   if (GET_MODE_INNER (mode) == HFmode)
14206     {
14207       gcc_assert (!recp);
14208       return false;
14209     }
14210 
14211   if (!recp)
14212     {
14213       if (!(flag_mlow_precision_sqrt
14214 	    || (aarch64_tune_params.approx_modes->sqrt
14215 		& AARCH64_APPROX_MODE (mode))))
14216 	return false;
14217 
14218       if (!flag_finite_math_only
14219 	  || flag_trapping_math
14220 	  || !flag_unsafe_math_optimizations
14221 	  || optimize_function_for_size_p (cfun))
14222 	return false;
14223     }
14224   else
14225     /* Caller assumes we cannot fail.  */
14226     gcc_assert (use_rsqrt_p (mode));
14227 
14228   rtx pg = NULL_RTX;
14229   if (aarch64_sve_mode_p (mode))
14230     pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
14231   machine_mode mmsk = (VECTOR_MODE_P (mode)
14232 		       ? related_int_vector_mode (mode).require ()
14233 		       : int_mode_for_mode (mode).require ());
14234   rtx xmsk = NULL_RTX;
14235   if (!recp)
14236     {
14237       /* When calculating the approximate square root, compare the
14238 	 argument with 0.0 and create a mask.  */
14239       rtx zero = CONST0_RTX (mode);
14240       if (pg)
14241 	{
14242 	  xmsk = gen_reg_rtx (GET_MODE (pg));
14243 	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
14244 	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
14245 					   xmsk, pg, hint, src, zero));
14246 	}
14247       else
14248 	{
14249 	  xmsk = gen_reg_rtx (mmsk);
14250 	  emit_insn (gen_rtx_SET (xmsk,
14251 				  gen_rtx_NEG (mmsk,
14252 					       gen_rtx_EQ (mmsk, src, zero))));
14253 	}
14254     }
14255 
14256   /* Estimate the approximate reciprocal square root.  */
14257   rtx xdst = gen_reg_rtx (mode);
14258   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
14259 
14260   /* Iterate over the series twice for SF and thrice for DF.  */
14261   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
14262 
14263   /* Optionally iterate over the series one time fewer for faster
14264      performance, at the expense of some accuracy.  */
14265   if ((recp && flag_mrecip_low_precision_sqrt)
14266       || (!recp && flag_mlow_precision_sqrt))
14267     iterations--;
14268 
14269   /* Iterate over the series to calculate the approximate reciprocal square
14270      root.  */
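  /* A sketch of the math behind the loop below: each step computes
     xdst = xdst * FRSQRTS (src, xdst * xdst), and since FRSQRTS (a, b)
     returns (3 - a * b) / 2 this is the Newton-Raphson refinement
     x' = x * (3 - src * x * x) / 2 for 1 / sqrt (src).  */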
14271   rtx x1 = gen_reg_rtx (mode);
14272   while (iterations--)
14273     {
14274       rtx x2 = gen_reg_rtx (mode);
14275       aarch64_emit_mult (x2, pg, xdst, xdst);
14276 
14277       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
14278 
14279       if (iterations > 0)
14280 	aarch64_emit_mult (xdst, pg, xdst, x1);
14281     }
14282 
14283   if (!recp)
14284     {
14285       if (pg)
14286 	/* Multiply nonzero source values by the corresponding intermediate
14287 	   result elements, so that the final calculation is the approximate
14288 	   square root rather than its reciprocal.  Select a zero result for
14289 	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
14290 	   otherwise.  */
14291 	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
14292 			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
14293       else
14294 	{
14295 	  /* Qualify the approximate reciprocal square root when the
14296 	     argument is 0.0 by squashing the intermediate result to 0.0.  */
14297 	  rtx xtmp = gen_reg_rtx (mmsk);
14298 	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
14299 					    gen_rtx_SUBREG (mmsk, xdst, 0)));
14300 	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
14301 
14302 	  /* Calculate the approximate square root.  */
14303 	  aarch64_emit_mult (xdst, pg, xdst, src);
14304 	}
14305     }
14306 
14307   /* Finalize the approximation.  */
14308   aarch64_emit_mult (dst, pg, xdst, x1);
14309 
14310   return true;
14311 }
14312 
14313 /* Emit the instruction sequence to compute the approximation for the division
14314    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
14315 
14316 bool
14317 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
14318 {
14319   machine_mode mode = GET_MODE (quo);
14320 
14321   if (GET_MODE_INNER (mode) == HFmode)
14322     return false;
14323 
14324   bool use_approx_division_p = (flag_mlow_precision_div
14325 			        || (aarch64_tune_params.approx_modes->division
14326 				    & AARCH64_APPROX_MODE (mode)));
14327 
14328   if (!flag_finite_math_only
14329       || flag_trapping_math
14330       || !flag_unsafe_math_optimizations
14331       || optimize_function_for_size_p (cfun)
14332       || !use_approx_division_p)
14333     return false;
14334 
14335   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
14336     return false;
14337 
14338   rtx pg = NULL_RTX;
14339   if (aarch64_sve_mode_p (mode))
14340     pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
14341 
14342   /* Estimate the approximate reciprocal.  */
14343   rtx xrcp = gen_reg_rtx (mode);
14344   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
14345 
14346   /* Iterate over the series twice for SF and thrice for DF.  */
14347   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
14348 
14349   /* Optionally iterate over the series fewer times for faster performance,
14350      at the expense of accuracy.  The default is 2 for DF and 1 for SF.  */
14351   if (flag_mlow_precision_div)
14352     iterations = (GET_MODE_INNER (mode) == DFmode
14353 		  ? aarch64_double_recp_precision
14354 		  : aarch64_float_recp_precision);
14355 
14356   /* Iterate over the series to calculate the approximate reciprocal.  */
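  /* A sketch of the math behind the loop below: each step computes
     xrcp = xrcp * FRECPS (xrcp, den), and since FRECPS (a, b) returns
     2 - a * b this is the Newton-Raphson refinement x' = x * (2 - x * den)
     for 1 / den.  */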
14357   rtx xtmp = gen_reg_rtx (mode);
14358   while (iterations--)
14359     {
14360       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
14361 
14362       if (iterations > 0)
14363 	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
14364     }
14365 
14366   if (num != CONST1_RTX (mode))
14367     {
14368       /* As the approximate reciprocal of DEN is already calculated, only
14369 	 calculate the approximate division when NUM is not 1.0.  */
14370       rtx xnum = force_reg (mode, num);
14371       aarch64_emit_mult (xrcp, pg, xrcp, xnum);
14372     }
14373 
14374   /* Finalize the approximation.  */
14375   aarch64_emit_mult (quo, pg, xrcp, xtmp);
14376   return true;
14377 }
14378 
14379 /* Return the number of instructions that can be issued per cycle.  */
14380 static int
14381 aarch64_sched_issue_rate (void)
14382 {
14383   return aarch64_tune_params.issue_rate;
14384 }
14385 
14386 /* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
14387 static int
14388 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
14389 {
14390   if (DEBUG_INSN_P (insn))
14391     return more;
14392 
14393   rtx_code code = GET_CODE (PATTERN (insn));
14394   if (code == USE || code == CLOBBER)
14395     return more;
14396 
14397   if (get_attr_type (insn) == TYPE_NO_INSN)
14398     return more;
14399 
14400   return more - 1;
14401 }
14402 
14403 static int
14404 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
14405 {
14406   int issue_rate = aarch64_sched_issue_rate ();
14407 
14408   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
14409 }
14410 
14411 
14412 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
14413    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
14414    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
14415 
14416 static int
14417 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
14418 						    int ready_index)
14419 {
14420   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
14421 }
14422 
14423 
14424 /* Vectorizer cost model target hooks.  */
14425 
14426 /* Information about how the CPU would issue the scalar, Advanced SIMD
14427    or SVE version of a vector loop, using the scheme defined by the
14428    aarch64_base_vec_issue_info hierarchy of structures.  */
14429 struct aarch64_vec_op_count
14430 {
14431   void dump () const;
14432 
14433   /* The number of individual "general" operations.  See the comments
14434      in aarch64_base_vec_issue_info for details.  */
14435   unsigned int general_ops = 0;
14436 
14437   /* The number of load and store operations, under the same scheme
14438      as above.  */
14439   unsigned int loads = 0;
14440   unsigned int stores = 0;
14441 
14442   /* The minimum number of cycles needed to execute all loop-carried
14443      operations, which in the vector code become associated with
14444      reductions.  */
14445   unsigned int reduction_latency = 0;
14446 };
14447 
14448 /* Extends aarch64_vec_op_count with SVE-specific information.  */
14449 struct aarch64_sve_op_count : aarch64_vec_op_count
14450 {
14451   void dump () const;
14452 
14453   /* The number of individual predicate operations.  See the comments
14454      in aarch64_sve_vec_issue_info for details.  */
14455   unsigned int pred_ops = 0;
14456 };
14457 
14458 /* Information about vector code that we're in the process of costing.  */
14459 struct aarch64_vector_costs
14460 {
14461   /* The normal latency-based costs for each region (prologue, body and
14462      epilogue), indexed by vect_cost_model_location.  */
14463   unsigned int region[3] = {};
14464 
14465   /* True if we have performed one-time initialization based on the vec_info.
14466 
14467      This variable exists because the vec_info is not passed to the
14468      init_cost hook.  We therefore have to defer initialization based on
14469      it till later.  */
14470   bool analyzed_vinfo = false;
14471 
14472   /* True if we're costing a vector loop, false if we're costing block-level
14473      vectorization.  */
14474   bool is_loop = false;
14475 
14476   /* True if we've seen an SVE operation that we cannot currently vectorize
14477      using Advanced SIMD.  */
14478   bool saw_sve_only_op = false;
14479 
14480   /* - If VEC_FLAGS is zero then we're costing the original scalar code.
14481      - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
14482        SIMD code.
14483      - If VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
14484   unsigned int vec_flags = 0;
14485 
14486   /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
14487      throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE.  In those
14488      situations, we try to predict whether an Advanced SIMD implementation
14489      of the loop could be completely unrolled and become straight-line code.
14490      If so, it is generally better to use the Advanced SIMD version rather
14491      than length-agnostic SVE, since the SVE loop would execute an unknown
14492      number of times and so could not be completely unrolled in the same way.
14493 
14494      If we're applying this heuristic, UNROLLED_ADVSIMD_NITERS is the
14495      number of Advanced SIMD loop iterations that would be unrolled and
14496      UNROLLED_ADVSIMD_STMTS estimates the total number of statements
14497      in the unrolled loop.  Both values are zero if we're not applying
14498      the heuristic.  */
14499   unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0;
14500   unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0;
14501 
14502   /* If we're vectorizing a loop that executes a constant number of times,
14503      this variable gives the number of times that the vector loop would
14504      iterate, otherwise it is zero.  */
14505   uint64_t num_vector_iterations = 0;
14506 
14507   /* Used only when vectorizing loops.  Estimates the number and kind of scalar
14508      operations that would be needed to perform the same work as one iteration
14509      of the vector loop.  */
14510   aarch64_vec_op_count scalar_ops;
14511 
14512   /* Used only when vectorizing loops.  If VEC_FLAGS & VEC_ADVSIMD,
14513      this structure estimates the number and kind of operations that the
14514      vector loop would contain.  If VEC_FLAGS & VEC_SVE, the structure
14515      estimates what the equivalent Advanced SIMD-only code would need in
14516      order to perform the same work as one iteration of the SVE loop.  */
14517   aarch64_vec_op_count advsimd_ops;
14518 
14519   /* Used only when vectorizing loops with SVE.  It estimates the number and
14520      kind of operations that the SVE loop would contain.  */
14521   aarch64_sve_op_count sve_ops;
14522 
14523   /* Used to detect cases in which we end up costing the same load twice,
14524      once to account for results that are actually used and once to account
14525      for unused results.  */
14526   hash_map<nofree_ptr_hash<_stmt_vec_info>, unsigned int> seen_loads;
14527 };
14528 
14529 /* Implement TARGET_VECTORIZE_INIT_COST.  */
14530 void *
14531 aarch64_init_cost (class loop *)
14532 {
14533   return new aarch64_vector_costs;
14534 }
14535 
14536 /* Return true if the current CPU should use the new costs defined
14537    in GCC 11.  This should be removed for GCC 12 and above, with the
14538    costs applying to all CPUs instead.  */
14539 static bool
14540 aarch64_use_new_vector_costs_p ()
14541 {
14542   return (aarch64_tune_params.extra_tuning_flags
14543 	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
14544 }
14545 
14546 /* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
14547 static const simd_vec_cost *
14548 aarch64_simd_vec_costs (tree vectype)
14549 {
14550   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
14551   if (vectype != NULL
14552       && aarch64_sve_mode_p (TYPE_MODE (vectype))
14553       && costs->sve != NULL)
14554     return costs->sve;
14555   return costs->advsimd;
14556 }
14557 
14558 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS.  */
14559 static const simd_vec_cost *
14560 aarch64_simd_vec_costs_for_flags (unsigned int flags)
14561 {
14562   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
14563   if ((flags & VEC_ANY_SVE) && costs->sve)
14564     return costs->sve;
14565   return costs->advsimd;
14566 }
14567 
14568 /* Decide whether to use the unrolling heuristic described above
14569    aarch64_vector_costs::unrolled_advsimd_niters, updating that
14570    field if so.  LOOP_VINFO describes the loop that we're vectorizing
14571    and COSTS are the costs that we're calculating for it.  */
14572 static void
14573 aarch64_record_potential_advsimd_unrolling (loop_vec_info loop_vinfo,
14574 					    aarch64_vector_costs *costs)
14575 {
14576   /* The heuristic only makes sense on targets that have the same
14577      vector throughput for SVE and Advanced SIMD.  */
14578   if (!(aarch64_tune_params.extra_tuning_flags
14579 	& AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
14580     return;
14581 
14582   /* We only want to apply the heuristic if LOOP_VINFO is being
14583      vectorized for SVE.  */
14584   if (!(costs->vec_flags & VEC_ANY_SVE))
14585     return;
14586 
14587   /* Check whether it is possible in principle to use Advanced SIMD
14588      instead.  */
14589   if (aarch64_autovec_preference == 2)
14590     return;
14591 
14592   /* We don't want to apply the heuristic to outer loops, since it's
14593      harder to track two levels of unrolling.  */
14594   if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
14595     return;
14596 
14597   /* Only handle cases in which the number of Advanced SIMD iterations
14598      would be known at compile time but the number of SVE iterations
14599      would not.  */
14600   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
14601       || aarch64_sve_vg.is_constant ())
14602     return;
14603 
14604   /* Guess how many times the Advanced SIMD loop would iterate and make
14605      sure that it is within the complete unrolling limit.  Even if the
14606      number of iterations is small enough, the number of statements might
14607      not be, which is why we need to estimate the number of statements too.  */
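  /* A worked example with purely illustrative numbers: if the estimated
     SVE VQ is 2 and the SVE loop is costed with a vectorization factor of 8,
     then ADVSIMD_VF below is 8 / 2 = 4, so a loop with 64 known scalar
     iterations would become 16 Advanced SIMD iterations, and it is those
     16 iterations that are tested against the unrolling limit.  */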
14608   unsigned int estimated_vq = aarch64_estimated_sve_vq ();
14609   unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
14610   unsigned HOST_WIDE_INT unrolled_advsimd_niters
14611     = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
14612   if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
14613     return;
14614 
14615   /* Record that we're applying the heuristic and should try to estimate
14616      the number of statements in the Advanced SIMD loop.  */
14617   costs->unrolled_advsimd_niters = unrolled_advsimd_niters;
14618 }
14619 
14620 /* Do one-time initialization of COSTS given that we're costing the loop
14621    vectorization described by LOOP_VINFO.  */
14622 static void
14623 aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo,
14624 			    aarch64_vector_costs *costs)
14625 {
14626   costs->is_loop = true;
14627 
14628   /* Record the number of times that the vector loop would execute,
14629      if known.  */
14630   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
14631   auto scalar_niters = max_stmt_executions_int (loop);
14632   if (scalar_niters >= 0)
14633     {
14634       unsigned int vf = vect_vf_for_cost (loop_vinfo);
14635       if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
14636 	costs->num_vector_iterations = scalar_niters / vf;
14637       else
14638 	costs->num_vector_iterations = CEIL (scalar_niters, vf);
14639     }
14640 
14641   /* Detect whether we're costing the scalar code or the vector code.
14642      This is a bit hacky: it would be better if the vectorizer told
14643      us directly.
14644 
14645      If we're costing the vector code, record whether we're vectorizing
14646      for Advanced SIMD or SVE.  */
14647   if (costs == LOOP_VINFO_TARGET_COST_DATA (loop_vinfo))
14648     costs->vec_flags = aarch64_classify_vector_mode (loop_vinfo->vector_mode);
14649   else
14650     costs->vec_flags = 0;
14651 
14652   /* Detect whether we're vectorizing for SVE and should
14653      apply the unrolling heuristic described above
14654      aarch64_vector_costs::unrolled_advsimd_niters.  */
14655   aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs);
14656 
14657   /* Record the issue information for any SVE WHILE instructions that the
14658      loop needs.  */
14659   auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
14660   if (issue_info
14661       && issue_info->sve
14662       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
14663     {
14664       unsigned int num_masks = 0;
14665       rgroup_controls *rgm;
14666       unsigned int num_vectors_m1;
14667       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
14668 	if (rgm->type)
14669 	  num_masks += num_vectors_m1 + 1;
14670       costs->sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
14671     }
14672 }
14673 
14674 /* Do one-time initialization of COSTS given that we're costing the block
14675    vectorization described by BB_VINFO.  */
14676 static void
14677 aarch64_analyze_bb_vinfo (bb_vec_info bb_vinfo, aarch64_vector_costs *costs)
14678 {
14679   /* Unfortunately, there's no easy way of telling whether we're costing
14680      the vector code or the scalar code, so just assume that we're costing
14681      the vector code.  */
14682   costs->vec_flags = aarch64_classify_vector_mode (bb_vinfo->vector_mode);
14683 }
14684 
14685 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
14686 static int
14687 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
14688 				    tree vectype,
14689 				    int misalign ATTRIBUTE_UNUSED)
14690 {
14691   unsigned elements;
14692   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
14693   bool fp = false;
14694 
14695   if (vectype != NULL)
14696     fp = FLOAT_TYPE_P (vectype);
14697 
14698   const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
14699 
14700   switch (type_of_cost)
14701     {
14702       case scalar_stmt:
14703 	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
14704 
14705       case scalar_load:
14706 	return costs->scalar_load_cost;
14707 
14708       case scalar_store:
14709 	return costs->scalar_store_cost;
14710 
14711       case vector_stmt:
14712 	return fp ? simd_costs->fp_stmt_cost
14713 		  : simd_costs->int_stmt_cost;
14714 
14715       case vector_load:
14716 	return simd_costs->align_load_cost;
14717 
14718       case vector_store:
14719 	return simd_costs->store_cost;
14720 
14721       case vec_to_scalar:
14722 	return simd_costs->vec_to_scalar_cost;
14723 
14724       case scalar_to_vec:
14725 	return simd_costs->scalar_to_vec_cost;
14726 
14727       case unaligned_load:
14728       case vector_gather_load:
14729 	return simd_costs->unalign_load_cost;
14730 
14731       case unaligned_store:
14732       case vector_scatter_store:
14733 	return simd_costs->unalign_store_cost;
14734 
14735       case cond_branch_taken:
14736 	return costs->cond_taken_branch_cost;
14737 
14738       case cond_branch_not_taken:
14739 	return costs->cond_not_taken_branch_cost;
14740 
14741       case vec_perm:
14742 	return simd_costs->permute_cost;
14743 
14744       case vec_promote_demote:
14745 	return fp ? simd_costs->fp_stmt_cost
14746 		  : simd_costs->int_stmt_cost;
14747 
14748       case vec_construct:
14749 	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
14750 	return elements / 2 + 1;
14751 
14752       default:
14753 	gcc_unreachable ();
14754     }
14755 }
14756 
14757 /* Return true if an operation of kind KIND for STMT_INFO represents
14758    the extraction of an element from a vector in preparation for
14759    storing the element to memory.  */
14760 static bool
14761 aarch64_is_store_elt_extraction (vect_cost_for_stmt kind,
14762 				 stmt_vec_info stmt_info)
14763 {
14764   return (kind == vec_to_scalar
14765 	  && STMT_VINFO_DATA_REF (stmt_info)
14766 	  && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)));
14767 }
14768 
14769 /* Return true if STMT_INFO represents part of a reduction.  */
14770 static bool
14771 aarch64_is_reduction (stmt_vec_info stmt_info)
14772 {
14773   return (STMT_VINFO_REDUC_DEF (stmt_info)
14774 	  || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
14775 }
14776 
14777 /* If STMT_INFO describes a reduction, return the type of reduction
14778    it describes, otherwise return -1.  */
14779 static int
14780 aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
14781 {
14782   if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
14783     if (STMT_VINFO_REDUC_DEF (stmt_info))
14784       {
14785 	stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
14786 	return int (STMT_VINFO_REDUC_TYPE (reduc_info));
14787       }
14788   return -1;
14789 }
14790 
14791 /* If an access of kind KIND for STMT_INFO represents one vector of an
14792    LD[234] or ST[234] operation, return the total number of vectors
14793    involved (2, 3 or 4); otherwise return a value outside that range.  */
14794 static int
14795 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
14796 {
14797   if ((kind == vector_load
14798        || kind == unaligned_load
14799        || kind == vector_store
14800        || kind == unaligned_store)
14801       && STMT_VINFO_DATA_REF (stmt_info))
14802     {
14803       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
14804       if (stmt_info
14805 	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
14806 	return DR_GROUP_SIZE (stmt_info);
14807     }
14808   return 0;
14809 }
14810 
14811 /* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the
14812    scalar type of the values being compared.  Return null otherwise.  */
14813 static tree
14814 aarch64_embedded_comparison_type (stmt_vec_info stmt_info)
14815 {
14816   if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
14817     if (gimple_assign_rhs_code (assign) == COND_EXPR)
14818       {
14819 	tree cond = gimple_assign_rhs1 (assign);
14820 	if (COMPARISON_CLASS_P (cond))
14821 	  return TREE_TYPE (TREE_OPERAND (cond, 0));
14822       }
14823   return NULL_TREE;
14824 }
14825 
14826 /* If STMT_INFO is a comparison or contains an embedded comparison, return the
14827    scalar type of the values being compared.  Return null otherwise.  */
14828 static tree
14829 aarch64_comparison_type (stmt_vec_info stmt_info)
14830 {
14831   if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
14832     if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison)
14833       return TREE_TYPE (gimple_assign_rhs1 (assign));
14834   return aarch64_embedded_comparison_type (stmt_info);
14835 }
14836 
14837 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
14838    vectors would produce a series of LDP or STP operations.  KIND is the
14839    kind of statement that STMT_INFO represents.  */
14840 static bool
14841 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
14842 			   stmt_vec_info stmt_info)
14843 {
14844   switch (kind)
14845     {
14846     case vector_load:
14847     case vector_store:
14848     case unaligned_load:
14849     case unaligned_store:
14850       break;
14851 
14852     default:
14853       return false;
14854     }
14855 
14856   if (aarch64_tune_params.extra_tuning_flags
14857       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
14858     return false;
14859 
14860   return is_gimple_assign (stmt_info->stmt);
14861 }
14862 
14863 /* Return true if STMT_INFO extends the result of a load.  */
14864 static bool
14865 aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
14866 {
14867   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
14868   if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
14869     return false;
14870 
14871   tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
14872   tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
14873   tree rhs_type = TREE_TYPE (rhs);
14874   if (!INTEGRAL_TYPE_P (lhs_type)
14875       || !INTEGRAL_TYPE_P (rhs_type)
14876       || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
14877     return false;
14878 
14879   stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
14880   return (def_stmt_info
14881 	  && STMT_VINFO_DATA_REF (def_stmt_info)
14882 	  && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
14883 }
14884 
14885 /* Return true if STMT_INFO is an integer truncation.  */
14886 static bool
14887 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
14888 {
14889   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
14890   if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
14891     return false;
14892 
14893   tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
14894   tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
14895   return (INTEGRAL_TYPE_P (lhs_type)
14896 	  && INTEGRAL_TYPE_P (rhs_type)
14897 	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
14898 }
14899 
14900 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
14901    or multiply-subtract sequence that might be suitable for fusing into a
14902    single instruction.  If VEC_FLAGS is zero, analyze the operation as
14903    a scalar one, otherwise analyze it as an operation on vectors with those
14904    VEC_* flags.  */
14905 static bool
14906 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
14907 			unsigned int vec_flags)
14908 {
14909   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
14910   if (!assign)
14911     return false;
14912   tree_code code = gimple_assign_rhs_code (assign);
14913   if (code != PLUS_EXPR && code != MINUS_EXPR)
14914     return false;
14915 
14916   if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
14917       || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
14918     return false;
14919 
14920   for (int i = 1; i < 3; ++i)
14921     {
14922       tree rhs = gimple_op (assign, i);
14923       /* ??? Should we try to check for a single use as well?  */
14924       if (TREE_CODE (rhs) != SSA_NAME)
14925 	continue;
14926 
14927       stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
14928       if (!def_stmt_info
14929 	  || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
14930 	continue;
14931       gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
14932       if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
14933 	continue;
14934 
14935       if (vec_flags & VEC_ADVSIMD)
14936 	{
14937 	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
14938 	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
14939 	     only supports MLA forms, so will require a move if the result
14940 	     cannot be tied to the accumulator.  The most important case in
14941 	     which this is true is when the accumulator input is invariant.  */
14942 	  rhs = gimple_op (assign, 3 - i);
14943 	  if (TREE_CODE (rhs) != SSA_NAME)
14944 	    return false;
14945 	  def_stmt_info = vinfo->lookup_def (rhs);
14946 	  if (!def_stmt_info
14947 	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
14948 	    return false;
14949 	}
14950 
14951       return true;
14952     }
14953   return false;
14954 }
14955 
14956 /* Return true if the vectorized form of STMT_INFO is something that is only
14957    possible when using SVE instead of Advanced SIMD.  VECTYPE is the type of
14958    the vector that STMT_INFO is operating on.  */
14959 static bool
14960 aarch64_sve_only_stmt_p (stmt_vec_info stmt_info, tree vectype)
14961 {
14962   if (!aarch64_sve_mode_p (TYPE_MODE (vectype)))
14963     return false;
14964 
14965   if (STMT_VINFO_DATA_REF (stmt_info))
14966     {
14967       /* Check for true gathers and scatters (rather than just strided accesses
14968 	 that we've chosen to implement using gathers and scatters).  Although
14969 	 in principle we could use elementwise accesses for Advanced SIMD,
14970 	 the vectorizer doesn't yet support that.  */
14971       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
14972 	return true;
14973 
14974       /* Check for masked loads and stores.  */
14975       if (auto *call = dyn_cast<gcall *> (stmt_info->stmt))
14976 	if (gimple_call_internal_p (call)
14977 	    && internal_fn_mask_index (gimple_call_internal_fn (call)) >= 0)
14978 	  return true;
14979     }
14980 
14981   /* Check for 64-bit integer multiplications.  */
14982   auto *assign = dyn_cast<gassign *> (stmt_info->stmt);
14983   if (assign
14984       && gimple_assign_rhs_code (assign) == MULT_EXPR
14985       && GET_MODE_INNER (TYPE_MODE (vectype)) == DImode
14986       && !integer_pow2p (gimple_assign_rhs2 (assign)))
14987     return true;
14988 
14989   return false;
14990 }
14991 
14992 /* We are considering implementing STMT_INFO using SVE vector type VECTYPE.
14993    If STMT_INFO is an in-loop reduction that SVE supports directly, return
14994    its latency in cycles, otherwise return zero.  SVE_COSTS specifies the
14995    latencies of the relevant instructions.  */
14996 static unsigned int
14997 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
14998 				       stmt_vec_info stmt_info,
14999 				       tree vectype,
15000 				       const sve_vec_cost *sve_costs)
15001 {
15002   switch (aarch64_reduc_type (vinfo, stmt_info))
15003     {
15004     case EXTRACT_LAST_REDUCTION:
15005       return sve_costs->clast_cost;
15006 
15007     case FOLD_LEFT_REDUCTION:
15008       switch (GET_MODE_INNER (TYPE_MODE (vectype)))
15009 	{
15010 	case E_HFmode:
15011 	case E_BFmode:
15012 	  return sve_costs->fadda_f16_cost;
15013 
15014 	case E_SFmode:
15015 	  return sve_costs->fadda_f32_cost;
15016 
15017 	case E_DFmode:
15018 	  return sve_costs->fadda_f64_cost;
15019 
15020 	default:
15021 	  break;
15022 	}
15023       break;
15024     }
15025 
15026   return 0;
15027 }
15028 
15029 /* STMT_INFO describes a loop-carried operation in the original scalar code
15030    that we are considering implementing as a reduction.  Return one of the
15031    following values, depending on VEC_FLAGS:
15032 
15033    - If VEC_FLAGS is zero, return the loop carry latency of the original
15034      scalar operation.
15035 
15036    - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
15037      Advanced SIMD implementation.
15038 
15039    - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
15040      SVE implementation.
15041 
15042    VECTYPE is the type of vector that the vectorizer is considering using
15043    for STMT_INFO, which might be different from the type of vector described
15044    by VEC_FLAGS.  */
15045 static unsigned int
15046 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
15047 				   tree vectype, unsigned int vec_flags)
15048 {
15049   const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
15050   const sve_vec_cost *sve_costs = nullptr;
15051   if (vec_flags & VEC_ANY_SVE)
15052     sve_costs = aarch64_tune_params.vec_costs->sve;
15053 
15054   /* If the caller is asking for the SVE latency, check for forms of reduction
15055      that only SVE can handle directly.  */
15056   if (sve_costs)
15057     {
15058       unsigned int latency
15059 	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
15060 						 sve_costs);
15061       if (latency)
15062 	return latency;
15063     }
15064 
15065   /* Handle scalar costs.  */
15066   if (vec_flags == 0)
15067     {
15068       if (FLOAT_TYPE_P (vectype))
15069 	return vec_costs->scalar_fp_stmt_cost;
15070       return vec_costs->scalar_int_stmt_cost;
15071     }
15072 
15073   /* Otherwise, the loop body just contains normal integer or FP operations,
15074      with a vector reduction outside the loop.  */
15075   const simd_vec_cost *simd_costs
15076     = aarch64_simd_vec_costs_for_flags (vec_flags);
15077   if (FLOAT_TYPE_P (vectype))
15078     return simd_costs->fp_stmt_cost;
15079   return simd_costs->int_stmt_cost;
15080 }
15081 
15082 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
15083    for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
15084    try to subdivide the target-independent categorization provided by KIND
15085    to get a more accurate cost.  */
15086 static fractional_cost
15087 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
15088 				    stmt_vec_info stmt_info,
15089 				    fractional_cost stmt_cost)
15090 {
15091   /* Detect an extension of a loaded value.  In general, we'll be able to fuse
15092      the extension with the load.  */
15093   if (kind == scalar_stmt && aarch64_extending_load_p (vinfo, stmt_info))
15094     return 0;
15095 
15096   return stmt_cost;
15097 }
15098 
15099 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
15100    for the vectorized form of STMT_INFO, which has cost kind KIND and which
15101    when vectorized would operate on vector type VECTYPE.  Try to subdivide
15102    the target-independent categorization provided by KIND to get a more
15103    accurate cost.  WHERE specifies where the cost associated with KIND
15104    occurs.  */
15105 static fractional_cost
15106 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
15107 				    stmt_vec_info stmt_info, tree vectype,
15108 				    enum vect_cost_model_location where,
15109 				    fractional_cost stmt_cost)
15110 {
15111   const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
15112   const sve_vec_cost *sve_costs = nullptr;
15113   if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
15114     sve_costs = aarch64_tune_params.vec_costs->sve;
15115 
15116   /* It's generally better to avoid costing inductions, since the induction
15117      will usually be hidden by other operations.  This is particularly true
15118      for things like COND_REDUCTIONS.  */
15119   if (is_a<gphi *> (stmt_info->stmt))
15120     return 0;
15121 
15122   /* Detect cases in which vec_to_scalar is describing the extraction of a
15123      vector element in preparation for a scalar store.  The store itself is
15124      costed separately.  */
15125   if (aarch64_is_store_elt_extraction (kind, stmt_info))
15126     return simd_costs->store_elt_extra_cost;
15127 
15128   /* Detect SVE gather loads, which are costed as a single scalar_load
15129      for each element.  We therefore need to divide the full-instruction
15130      cost by the number of elements in the vector.  */
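  /* An illustrative example only: a gather of four 32-bit elements with a
     gather_load_x32_cost of 12 makes each of the four scalar_load entries
     count as the fractional cost 12/4, i.e. 3, so that the gather as a
     whole still totals 12.  */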
15131   if (kind == scalar_load
15132       && sve_costs
15133       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
15134     {
15135       unsigned int nunits = vect_nunits_for_cost (vectype);
15136       if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
15137 	return { sve_costs->gather_load_x64_cost, nunits };
15138       return { sve_costs->gather_load_x32_cost, nunits };
15139     }
15140 
15141   /* Detect cases in which a scalar_store is really storing one element
15142      in a scatter operation.  */
15143   if (kind == scalar_store
15144       && sve_costs
15145       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
15146     return sve_costs->scatter_store_elt_cost;
15147 
15148   /* Detect cases in which vec_to_scalar represents an in-loop reduction.  */
15149   if (kind == vec_to_scalar
15150       && where == vect_body
15151       && sve_costs)
15152     {
15153       unsigned int latency
15154 	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
15155 						 sve_costs);
15156       if (latency)
15157 	return latency;
15158     }
15159 
15160   /* Detect cases in which vec_to_scalar represents a single reduction
15161      instruction like FADDP or MAXV.  */
15162   if (kind == vec_to_scalar
15163       && where == vect_epilogue
15164       && aarch64_is_reduction (stmt_info))
15165     switch (GET_MODE_INNER (TYPE_MODE (vectype)))
15166       {
15167       case E_QImode:
15168 	return simd_costs->reduc_i8_cost;
15169 
15170       case E_HImode:
15171 	return simd_costs->reduc_i16_cost;
15172 
15173       case E_SImode:
15174 	return simd_costs->reduc_i32_cost;
15175 
15176       case E_DImode:
15177 	return simd_costs->reduc_i64_cost;
15178 
15179       case E_HFmode:
15180       case E_BFmode:
15181 	return simd_costs->reduc_f16_cost;
15182 
15183       case E_SFmode:
15184 	return simd_costs->reduc_f32_cost;
15185 
15186       case E_DFmode:
15187 	return simd_costs->reduc_f64_cost;
15188 
15189       default:
15190 	break;
15191       }
15192 
15193   /* Otherwise stick with the original categorization.  */
15194   return stmt_cost;
15195 }
15196 
15197 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
15198    for STMT_INFO, which has cost kind KIND and which when vectorized would
15199    operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
15200    targets.  */
15201 static fractional_cost
15202 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
15203 			      stmt_vec_info stmt_info, tree vectype,
15204 			      fractional_cost stmt_cost)
15205 {
15206   /* Unlike vec_promote_demote, vector_stmt conversions do not change the
15207      vector register size or number of units.  Integer promotions of this
15208      type therefore map to SXT[BHW] or UXT[BHW].
15209 
15210      Most loads have extending forms that can do the sign or zero extension
15211      on the fly.  Optimistically assume that a load followed by an extension
15212      will fold to this form during combine, and that the extension therefore
15213      comes for free.  */
15214   if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
15215     stmt_cost = 0;
15216 
15217   /* For similar reasons, vector_stmt integer truncations are a no-op,
15218      because we can just ignore the unused upper bits of the source.  */
15219   if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
15220     stmt_cost = 0;
15221 
15222   /* Advanced SIMD can load and store pairs of registers using LDP and STP,
15223      but there are no equivalent instructions for SVE.  This means that
15224      (all other things being equal) 128-bit SVE needs twice as many load
15225      and store instructions as Advanced SIMD in order to process vector pairs.
15226 
15227      Also, scalar code can often use LDP and STP to access pairs of values,
15228      so it is too simplistic to say that one SVE load or store replaces
15229      VF scalar loads and stores.
15230 
15231      Ideally we would account for this in the scalar and Advanced SIMD
15232      costs by making suitable load/store pairs as cheap as a single
15233      load/store.  However, that would be a very invasive change and in
15234      practice it tends to stress other parts of the cost model too much.
15235      E.g. stores of scalar constants currently count just a store,
15236      whereas stores of vector constants count a store and a vec_init.
15237      This is an artificial distinction for AArch64, where stores of
15238      nonzero scalar constants need the same kind of register invariant
15239      as vector stores.
15240 
15241      An alternative would be to double the cost of any SVE loads and stores
15242      that could be paired in Advanced SIMD (and possibly also paired in
15243      scalar code).  But this tends to stress other parts of the cost model
15244      in the same way.  It also means that we can fall back to Advanced SIMD
15245      even if full-loop predication would have been useful.
15246 
15247      Here we go for a more conservative version: double the costs of SVE
15248      loads and stores if one iteration of the scalar loop processes enough
15249      elements for it to use a whole number of Advanced SIMD LDP or STP
15250      instructions.  This makes it very likely that the VF would be 1 for
15251      Advanced SIMD, and so no epilogue should be needed.  */
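  /* A sketch of the check below: for a group of four 64-bit elements,
     COUNT * ELT_BITS is 256, i.e. exactly one Advanced SIMD LDP/STP of
     Q registers per scalar iteration, so the corresponding SVE load or
     store cost is doubled.  */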
15252   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
15253     {
15254       stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
15255       unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
15256       unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
15257       if (multiple_p (count * elt_bits, 256)
15258 	  && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
15259 	stmt_cost *= 2;
15260     }
15261 
15262   return stmt_cost;
15263 }
15264 
15265 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
15266    and which when vectorized would operate on vector type VECTYPE.  Add the
15267    cost of any embedded operations.  */
15268 static fractional_cost
15269 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
15270 			  tree vectype, fractional_cost stmt_cost)
15271 {
15272   if (vectype)
15273     {
15274       const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
15275 
15276       /* Detect cases in which a vector load or store represents an
15277 	 LD[234] or ST[234] instruction.  */
15278       switch (aarch64_ld234_st234_vectors (kind, stmt_info))
15279 	{
15280 	case 2:
15281 	  stmt_cost += simd_costs->ld2_st2_permute_cost;
15282 	  break;
15283 
15284 	case 3:
15285 	  stmt_cost += simd_costs->ld3_st3_permute_cost;
15286 	  break;
15287 
15288 	case 4:
15289 	  stmt_cost += simd_costs->ld4_st4_permute_cost;
15290 	  break;
15291 	}
15292 
15293       if (kind == vector_stmt || kind == vec_to_scalar)
15294 	if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info))
15295 	  {
15296 	    if (FLOAT_TYPE_P (cmp_type))
15297 	      stmt_cost += simd_costs->fp_stmt_cost;
15298 	    else
15299 	      stmt_cost += simd_costs->int_stmt_cost;
15300 	  }
15301     }
15302 
15303   if (kind == scalar_stmt)
15304     if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info))
15305       {
15306 	if (FLOAT_TYPE_P (cmp_type))
15307 	  stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
15308 	else
15309 	  stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
15310       }
15311 
15312   return stmt_cost;
15313 }
15314 
15315 /* VINFO, COSTS, COUNT, KIND, STMT_INFO and VECTYPE are the same as for
15316    TARGET_VECTORIZE_ADD_STMT_COST and they describe an operation in the
15317    body of a vector loop.  Record issue information relating to the vector
15318    operation in OPS, where OPS is one of COSTS->scalar_ops, COSTS->advsimd_ops
15319    or COSTS->sve_ops; see the comments above those variables for details.
15320    In addition:
15321 
15322    - VEC_FLAGS is zero if OPS is COSTS->scalar_ops.
15323 
15324    - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is COSTS->advsimd_ops.
15325 
15326    - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is COSTS->sve_ops.
15327 
15328    ISSUE_INFO provides the scalar, Advanced SIMD or SVE issue information
15329    associated with OPS and VEC_FLAGS.  FACTOR says how many iterations of
15330    the loop described by VEC_FLAGS would be needed to match one iteration
15331    of the vector loop in VINFO.  */
15332 static void
15333 aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
15334 		   unsigned int count, enum vect_cost_for_stmt kind,
15335 		   _stmt_vec_info *stmt_info, tree vectype,
15336 		   unsigned int vec_flags, aarch64_vec_op_count *ops,
15337 		   const aarch64_base_vec_issue_info *issue_info,
15338 		   unsigned int factor)
15339 {
15340   if (!issue_info)
15341     return;
15342 
15343   const aarch64_simd_vec_issue_info *simd_issue = nullptr;
15344   if (vec_flags)
15345     simd_issue = static_cast<const aarch64_simd_vec_issue_info *> (issue_info);
15346 
15347   const aarch64_sve_vec_issue_info *sve_issue = nullptr;
15348   if (vec_flags & VEC_ANY_SVE)
15349     sve_issue = static_cast<const aarch64_sve_vec_issue_info *> (issue_info);
15350 
15351   /* Calculate the minimum cycles per iteration imposed by a reduction
15352      operation.  */
15353   if ((kind == vector_stmt || kind == vec_to_scalar)
15354       && aarch64_is_reduction (stmt_info))
15355     {
15356       unsigned int base
15357 	= aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype,
15358 					     vec_flags);
15359       if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
15360 	{
15361 	  if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
15362 	    {
15363 	      /* When costing an SVE FADDA, the vectorizer treats vec_to_scalar
15364 		 as a single operation, whereas for Advanced SIMD it is a
15365 		 per-element one.  Increase the factor accordingly, both for
15366 		 the reduction_latency calculation and for the op counting.  */
15367 	      if (vec_flags & VEC_ADVSIMD)
15368 		factor = vect_nunits_for_cost (vectype);
15369 	    }
15370 	  else
15371 	    /* An Advanced SIMD fold-left reduction is the same as a
15372 	       scalar one and the vectorizer therefore treats vec_to_scalar
15373 	       as a per-element cost.  There is no extra factor to apply for
15374 	       scalar code, either for reduction_latency or for the op
15375 	       counting below.  */
15376 	    factor = 1;
15377 	}
15378 
15379       /* ??? Ideally for vector code we'd do COUNT * FACTOR reductions in
15380 	 parallel, but unfortunately that's not yet the case.  */
15381       ops->reduction_latency = MAX (ops->reduction_latency,
15382 				    base * count * factor);
15383     }
15384 
15385   /* Assume that multiply-adds will become a single operation.  */
15386   if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
15387     return;
15388 
15389   /* When costing scalar statements in vector code, the count already
15390      includes the number of scalar elements in the vector, so we don't
15391      need to apply the factor as well.  */
15392   if (kind == scalar_load || kind == scalar_store || kind == scalar_stmt)
15393     factor = 1;
15394 
15395   /* This can go negative with the load handling below.  */
15396   int num_copies = count * factor;
15397 
15398   /* Count the basic operation cost associated with KIND.  */
15399   switch (kind)
15400     {
15401     case cond_branch_taken:
15402     case cond_branch_not_taken:
15403     case vector_gather_load:
15404     case vector_scatter_store:
15405       /* We currently don't expect these to be used in a loop body.  */
15406       break;
15407 
15408     case vec_perm:
15409     case vec_promote_demote:
15410     case vec_construct:
15411     case vec_to_scalar:
15412     case scalar_to_vec:
15413       /* Assume that these operations have no overhead in the original
15414 	 scalar code.  */
15415       if (!vec_flags)
15416 	break;
15417       /* Fallthrough.  */
15418     case vector_stmt:
15419     case scalar_stmt:
15420       ops->general_ops += num_copies;
15421       break;
15422 
15423     case scalar_load:
15424     case vector_load:
15425     case unaligned_load:
15426       /* When costing scalars, detect cases in which we are called twice for
15427 	 the same load.  This happens for LD[234] operations if only some of
15428 	 the results are used.  The first time represents the cost of loading
15429 	 the unused vectors, while the second time represents the cost of
15430 	 loading the useful parts.  Only the latter should count towards the
15431 	 scalar costs.  */
15432       if (stmt_info && !vec_flags)
15433 	{
15434 	  bool existed = false;
15435 	  unsigned int &prev_count
15436 	    = costs->seen_loads.get_or_insert (stmt_info, &existed);
15437 	  if (existed)
15438 	    num_copies -= prev_count;
15439 	  else
15440 	    prev_count = num_copies;
15441 	}
15442       ops->loads += num_copies;
15443       if (vec_flags || FLOAT_TYPE_P (vectype))
15444 	ops->general_ops += issue_info->fp_simd_load_general_ops * num_copies;
15445       break;
15446 
15447     case vector_store:
15448     case unaligned_store:
15449     case scalar_store:
15450       ops->stores += num_copies;
15451       if (vec_flags || FLOAT_TYPE_P (vectype))
15452 	ops->general_ops += issue_info->fp_simd_store_general_ops * num_copies;
15453       break;
15454     }
15455 
15456   /* Add any embedded comparison operations.  */
15457   if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
15458       && aarch64_embedded_comparison_type (stmt_info))
15459     ops->general_ops += num_copies;
15460 
15461   /* Detect COND_REDUCTIONs and things that would need to become
15462      COND_REDUCTIONs if they were implemented using Advanced SIMD.
15463      There are then two sets of VEC_COND_EXPRs, whereas so far we
15464      have only accounted for one.  */
15465   if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar))
15466     {
15467       int reduc_type = aarch64_reduc_type (vinfo, stmt_info);
15468       if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD))
15469 	  || reduc_type == COND_REDUCTION)
15470 	ops->general_ops += num_copies;
15471     }
15472 
15473   /* Count the predicate operations needed by an SVE comparison.  */
15474   if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
15475     if (tree type = aarch64_comparison_type (stmt_info))
15476       {
15477 	unsigned int base = (FLOAT_TYPE_P (type)
15478 			     ? sve_issue->fp_cmp_pred_ops
15479 			     : sve_issue->int_cmp_pred_ops);
15480 	costs->sve_ops.pred_ops += base * num_copies;
15481       }
15482 
15483   /* Add any extra overhead associated with LD[234] and ST[234] operations.  */
15484   if (simd_issue)
15485     switch (aarch64_ld234_st234_vectors (kind, stmt_info))
15486       {
15487       case 2:
15488 	ops->general_ops += simd_issue->ld2_st2_general_ops * num_copies;
15489 	break;
15490 
15491       case 3:
15492 	ops->general_ops += simd_issue->ld3_st3_general_ops * num_copies;
15493 	break;
15494 
15495       case 4:
15496 	ops->general_ops += simd_issue->ld4_st4_general_ops * num_copies;
15497 	break;
15498       }
15499 
15500   /* Add any overhead associated with gather loads and scatter stores.  */
15501   if (sve_issue
15502       && (kind == scalar_load || kind == scalar_store)
15503       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
15504     {
15505       unsigned int pairs = CEIL (count, 2);
15506       costs->sve_ops.pred_ops
15507 	+= sve_issue->gather_scatter_pair_pred_ops * pairs;
15508       ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
15509     }
15510 }
15511 
15512 /* Implement targetm.vectorize.add_stmt_cost.  */
15513 static unsigned
15514 aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
15515 		       enum vect_cost_for_stmt kind,
15516 		       struct _stmt_vec_info *stmt_info, tree vectype,
15517 		       int misalign, enum vect_cost_model_location where)
15518 {
15519   auto *costs = static_cast<aarch64_vector_costs *> (data);
15520   unsigned retval = 0;
15521 
15522   if (flag_vect_cost_model)
15523     {
15524       fractional_cost stmt_cost
15525 	= aarch64_builtin_vectorization_cost (kind, vectype, misalign);
15526 
15527       bool in_inner_loop_p = (where == vect_body
15528 			      && stmt_info
15529 			      && stmt_in_inner_loop_p (vinfo, stmt_info));
15530 
15531       /* Do one-time initialization based on the vinfo.  */
15532       loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
15533       bb_vec_info bb_vinfo = dyn_cast<bb_vec_info> (vinfo);
15534       if (!costs->analyzed_vinfo && aarch64_use_new_vector_costs_p ())
15535 	{
15536 	  if (loop_vinfo)
15537 	    aarch64_analyze_loop_vinfo (loop_vinfo, costs);
15538 	  else
15539 	    aarch64_analyze_bb_vinfo (bb_vinfo, costs);
15540 	  costs->analyzed_vinfo = true;
15541 	}
15542 
15543       /* Try to get a more accurate cost by looking at STMT_INFO instead
15544 	 of just looking at KIND.  */
15545       if (stmt_info && aarch64_use_new_vector_costs_p ())
15546 	{
15547 	  if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype))
15548 	    costs->saw_sve_only_op = true;
15549 
15550 	  /* If we scalarize a strided store, the vectorizer costs one
15551 	     vec_to_scalar for each element.  However, we can store the first
15552 	     element using an FP store without a separate extract step.  */
15553 	  if (aarch64_is_store_elt_extraction (kind, stmt_info))
15554 	    count -= 1;
15555 
15556 	  stmt_cost = aarch64_detect_scalar_stmt_subtype
15557 	    (vinfo, kind, stmt_info, stmt_cost);
15558 
15559 	  if (vectype && costs->vec_flags)
15560 	    stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind,
15561 							    stmt_info, vectype,
15562 							    where, stmt_cost);
15563 	}
15564 
15565       /* Do any SVE-specific adjustments to the cost.  */
15566       if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
15567 	stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
15568 						  vectype, stmt_cost);
15569 
15570       if (stmt_info && aarch64_use_new_vector_costs_p ())
15571 	{
15572 	  /* Account for any extra "embedded" costs that apply additively
15573 	     to the base cost calculated above.  */
15574 	  stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
15575 						stmt_cost);
15576 
15577 	  /* If we're recording a nonzero vector loop body cost for the
15578 	     innermost loop, also estimate the operations that would need
15579 	     to be issued by all relevant implementations of the loop.  */
15580 	  auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
15581 	  if (loop_vinfo
15582 	      && issue_info
15583 	      && costs->vec_flags
15584 	      && where == vect_body
15585 	      && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
15586 	      && vectype
15587 	      && stmt_cost != 0)
15588 	    {
15589 	      /* Record estimates for the scalar code.  */
15590 	      aarch64_count_ops (vinfo, costs, count, kind, stmt_info, vectype,
15591 				 0, &costs->scalar_ops, issue_info->scalar,
15592 				 vect_nunits_for_cost (vectype));
15593 
15594 	      if (aarch64_sve_mode_p (vinfo->vector_mode) && issue_info->sve)
15595 		{
15596 		  /* Record estimates for a possible Advanced SIMD version
15597 		     of the SVE code.  */
15598 		  aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
15599 				     vectype, VEC_ADVSIMD, &costs->advsimd_ops,
15600 				     issue_info->advsimd,
15601 				     aarch64_estimated_sve_vq ());
15602 
15603 		  /* Record estimates for the SVE code itself.  */
15604 		  aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
15605 				     vectype, VEC_ANY_SVE, &costs->sve_ops,
15606 				     issue_info->sve, 1);
15607 		}
15608 	      else
15609 		/* Record estimates for the Advanced SIMD code.  Treat SVE like
15610 		   Advanced SIMD if the CPU has no specific SVE costs.  */
15611 		aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
15612 				   vectype, VEC_ADVSIMD, &costs->advsimd_ops,
15613 				   issue_info->advsimd, 1);
15614 	    }
15615 
15616 	  /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
15617 	     estimate the number of statements in the unrolled Advanced SIMD
15618 	     loop.  For simplicity, we assume that one iteration of the
15619 	     Advanced SIMD loop would need the same number of statements
15620 	     as one iteration of the SVE loop.  */
15621 	  if (where == vect_body && costs->unrolled_advsimd_niters)
15622 	    costs->unrolled_advsimd_stmts
15623 	      += count * costs->unrolled_advsimd_niters;
15624 	}
15625 
15626       /* Statements in an inner loop relative to the loop being
15627 	 vectorized are weighted more heavily.  The value here is
15628 	 arbitrary and could potentially be improved with analysis.  */
15629       if (in_inner_loop_p)
15630 	count *= 50; /*  FIXME  */
15631 
15632       retval = (count * stmt_cost).ceil ();
15633       costs->region[where] += retval;
15634     }
15635 
15636   return retval;
15637 }
15638 
15639 /* Dump information about the structure.  */
15640 void
15641 aarch64_vec_op_count::dump () const
15642 {
15643   dump_printf_loc (MSG_NOTE, vect_location,
15644 		   "  load operations = %d\n", loads);
15645   dump_printf_loc (MSG_NOTE, vect_location,
15646 		   "  store operations = %d\n", stores);
15647   dump_printf_loc (MSG_NOTE, vect_location,
15648 		   "  general operations = %d\n", general_ops);
15649   dump_printf_loc (MSG_NOTE, vect_location,
15650 		   "  reduction latency = %d\n", reduction_latency);
15651 }
15652 
15653 /* Dump information about the structure.  */
15654 void
15655 aarch64_sve_op_count::dump () const
15656 {
15657   aarch64_vec_op_count::dump ();
15658   dump_printf_loc (MSG_NOTE, vect_location,
15659 		   "  predicate operations = %d\n", pred_ops);
15660 }
15661 
15662 /* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue
15663    the operations described by OPS.  This is a very simplistic model!  */
15664 static fractional_cost
15665 aarch64_estimate_min_cycles_per_iter
15666   (const aarch64_vec_op_count *ops,
15667    const aarch64_base_vec_issue_info *issue_info)
15668 {
15669   fractional_cost cycles = MAX (ops->reduction_latency, 1);
15670   cycles = std::max (cycles, { ops->stores, issue_info->stores_per_cycle });
15671   cycles = std::max (cycles, { ops->loads + ops->stores,
15672 			       issue_info->loads_stores_per_cycle });
15673   cycles = std::max (cycles, { ops->general_ops,
15674 			       issue_info->general_ops_per_cycle });
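  /* E.g. with purely illustrative numbers: 2 loads, 1 store, 6 general ops,
     a reduction latency of 0, and issue rates of 2 stores, 2 loads+stores
     and 4 general ops per cycle, this gives
     MAX (1, 1/2, 3/2, 6/4) = 3/2 cycles per iteration.  */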
15675   return cycles;
15676 }
15677 
15678 /* Subroutine of aarch64_adjust_body_cost for handling SVE.
15679    Use ISSUE_INFO to work out how fast the SVE code can be issued and compare
15680    it to the equivalent value for scalar code (SCALAR_CYCLES_PER_ITER).
15681    If COULD_USE_ADVSIMD is true, also compare it to the issue rate of
15682    Advanced SIMD code (ADVSIMD_CYCLES_PER_ITER).
15683 
15684    COSTS is as for aarch64_adjust_body_cost.  ORIG_BODY_COST is the cost
15685    originally passed to aarch64_adjust_body_cost and *BODY_COST is the current
15686    value of the adjusted cost.  *SHOULD_DISPARAGE is true if we think the loop
15687    body is too expensive.  */
15688 
15689 static fractional_cost
15690 aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
15691 			      const aarch64_vec_issue_info *issue_info,
15692 			      fractional_cost scalar_cycles_per_iter,
15693 			      fractional_cost advsimd_cycles_per_iter,
15694 			      bool could_use_advsimd,
15695 			      unsigned int orig_body_cost,
15696 			      unsigned int *body_cost,
15697 			      bool *should_disparage)
15698 {
15699   /* Estimate the minimum number of cycles per iteration needed to issue
15700      non-predicate operations.  */
15701   fractional_cost sve_nonpred_issue_cycles_per_iter
15702     = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
15703 					    issue_info->sve);
15704 
15705   /* Estimate the minimum number of cycles per iteration needed to rename
15706      SVE instructions.
15707 
15708      ??? For now this is done inline rather than via cost tables, since it
15709      isn't clear how it should be parameterized for the general case.  */
15710   fractional_cost sve_rename_cycles_per_iter = 0;
15711   if (issue_info == &neoverse512tvb_vec_issue_info)
15712     /* + 1 for an addition.  We've already counted a general op for each
15713        store, so we don't need to account for stores separately.  The branch
15714        reads no registers and so does not need to be counted either.
15715 
15716        ??? This value is very much on the pessimistic side, but seems to work
15717        pretty well in practice.  */
15718     sve_rename_cycles_per_iter
15719       = { costs->sve_ops.general_ops
15720 	  + costs->sve_ops.loads
15721 	  + costs->sve_ops.pred_ops + 1, 5 };
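  /* That is, a fractional cost of (general_ops + loads + pred_ops + 1) / 5,
     on the assumption that rename bandwidth is limited to roughly five
     operations per cycle.  */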
15722 
15723   /* Combine the rename and non-predicate issue limits into a single value.  */
15724   fractional_cost sve_nonpred_cycles_per_iter
15725     = std::max (sve_nonpred_issue_cycles_per_iter, sve_rename_cycles_per_iter);
15726 
15727   /* Separately estimate the minimum number of cycles per iteration needed
15728      to issue the predicate operations.  */
15729   fractional_cost sve_pred_issue_cycles_per_iter
15730     = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
15731 
15732   /* Calculate the overall limit on the number of cycles per iteration.  */
15733   fractional_cost sve_cycles_per_iter
15734     = std::max (sve_nonpred_cycles_per_iter, sve_pred_issue_cycles_per_iter);
15735 
15736   if (dump_enabled_p ())
15737     {
15738       costs->sve_ops.dump ();
15739       dump_printf_loc (MSG_NOTE, vect_location,
15740 		       "  estimated cycles per iteration = %f\n",
15741 		       sve_cycles_per_iter.as_double ());
15742       if (costs->sve_ops.pred_ops)
15743 	dump_printf_loc (MSG_NOTE, vect_location,
15744 			 "    predicate issue = %f\n",
15745 			 sve_pred_issue_cycles_per_iter.as_double ());
15746       if (costs->sve_ops.pred_ops || sve_rename_cycles_per_iter)
15747 	dump_printf_loc (MSG_NOTE, vect_location,
15748 			 "    non-predicate issue = %f\n",
15749 			 sve_nonpred_issue_cycles_per_iter.as_double ());
15750       if (sve_rename_cycles_per_iter)
15751 	dump_printf_loc (MSG_NOTE, vect_location, "    rename = %f\n",
15752 			 sve_rename_cycles_per_iter.as_double ());
15753     }
15754 
15755   /* If the scalar version of the loop could issue at least as
15756      quickly as the predicate parts of the SVE loop, make the SVE loop
15757      prohibitively expensive.  In this case vectorization is adding an
15758      overhead that the original scalar code didn't have.
15759 
15760      This is mostly intended to detect cases in which WHILELOs dominate
15761      for very tight loops, which is something that normal latency-based
15762      costs would not model.  Adding this kind of cliff edge would be
15763      too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
15764      code in the caller handles that case in a more conservative way.  */
15765   fractional_cost sve_estimate = sve_pred_issue_cycles_per_iter + 1;
15766   if (scalar_cycles_per_iter < sve_estimate)
15767     {
15768       unsigned int min_cost
15769 	= orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
15770       if (*body_cost < min_cost)
15771 	{
15772 	  if (dump_enabled_p ())
15773 	    dump_printf_loc (MSG_NOTE, vect_location,
15774 			     "Increasing body cost to %d because the"
15775 			     " scalar code could issue within the limit"
15776 			     " imposed by predicate operations\n",
15777 			     min_cost);
15778 	  *body_cost = min_cost;
15779 	  *should_disparage = true;
15780 	}
15781     }
15782 
15783   /* If it appears that the Advanced SIMD version of a loop could issue
15784      more quickly than the SVE one, increase the SVE cost in proportion
15785      to the difference.  The intention is to make Advanced SIMD preferable
15786      in cases where an Advanced SIMD version exists, without increasing
15787      the costs so much that SVE won't be used at all.
15788 
15789      The reasoning is similar to the scalar vs. predicate comparison above:
15790      if the issue rate of the SVE code is limited by predicate operations
15791      (i.e. if sve_pred_issue_cycles_per_iter > sve_nonpred_cycles_per_iter),
15792      and if the Advanced SIMD code could issue within the limit imposed
15793      by the predicate operations, the predicate operations are adding an
15794      overhead that the original code didn't have and so we should prefer
15795      the Advanced SIMD version.  However, if the predicate operations
15796      do not dominate in this way, we should only increase the cost of
15797      the SVE code if sve_cycles_per_iter is strictly greater than
15798      advsimd_cycles_per_iter.  Given rounding effects, this should mean
15799      that Advanced SIMD is either better or at least no worse.  */
15800   if (sve_nonpred_cycles_per_iter >= sve_pred_issue_cycles_per_iter)
15801     sve_estimate = sve_cycles_per_iter;
15802   if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
15803     {
15804       /* This ensures that min_cost > orig_body_cost * 2.  */
15805       unsigned int factor = fractional_cost::scale (1, sve_estimate,
15806 						    advsimd_cycles_per_iter);
15807       unsigned int min_cost = orig_body_cost * factor + 1;
15808       if (*body_cost < min_cost)
15809 	{
15810 	  if (dump_enabled_p ())
15811 	    dump_printf_loc (MSG_NOTE, vect_location,
15812 			     "Increasing body cost to %d because Advanced"
15813 			     " SIMD code could issue as quickly\n",
15814 			     min_cost);
15815 	  *body_cost = min_cost;
15816 	  *should_disparage = true;
15817 	}
15818     }
15819 
15820   return sve_cycles_per_iter;
15821 }
15822 
15823 /* BODY_COST is the cost of a vector loop body recorded in COSTS.
15824    Adjust the cost as necessary and return the new cost.  */
15825 static unsigned int
15826 aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
15827 {
15828   unsigned int orig_body_cost = body_cost;
15829   bool should_disparage = false;
15830 
15831   if (dump_enabled_p ())
15832     dump_printf_loc (MSG_NOTE, vect_location,
15833 		     "Original vector body cost = %d\n", body_cost);
15834 
15835   if (costs->unrolled_advsimd_stmts)
15836     {
15837       if (dump_enabled_p ())
15838 	dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
15839 			 " unrolled Advanced SIMD loop = %d\n",
15840 			 costs->unrolled_advsimd_stmts);
15841 
15842       /* Apply the Advanced SIMD vs. SVE unrolling heuristic described above
15843 	 aarch64_vector_costs::unrolled_advsimd_niters.
15844 
15845 	 The balance here is tricky.  On the one hand, we can't be sure whether
15846 	 the code is vectorizable with Advanced SIMD or not.  However, even if
15847 	 it isn't vectorizable with Advanced SIMD, there's a possibility that
15848 	 the scalar code could also be unrolled.  Some of the code might then
15849 	 benefit from SLP, or from using LDP and STP.  We therefore apply
15850 	 the heuristic regardless of can_use_advsimd_p.  */
15851       if (costs->unrolled_advsimd_stmts
15852 	  && (costs->unrolled_advsimd_stmts
15853 	      <= (unsigned int) param_max_completely_peeled_insns))
15854 	{
15855 	  unsigned int estimated_vq = aarch64_estimated_sve_vq ();
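	  /* E.g. with an estimated SVE vector length of 256 bits (VQ == 2),
	     the minimum cost below becomes 2 * orig_body_cost + 1.  */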
15856 	  unsigned int min_cost = (orig_body_cost * estimated_vq) + 1;
15857 	  if (body_cost < min_cost)
15858 	    {
15859 	      if (dump_enabled_p ())
15860 		dump_printf_loc (MSG_NOTE, vect_location,
15861 				 "Increasing body cost to %d to account for"
15862 				 " unrolling\n", min_cost);
15863 	      body_cost = min_cost;
15864 	      should_disparage = true;
15865 	    }
15866 	}
15867     }
15868 
15869   auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
15870   if (!issue_info)
15871     return body_cost;
15872 
15873   fractional_cost scalar_cycles_per_iter
15874     = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops,
15875 					    issue_info->scalar);
15876 
15877   fractional_cost advsimd_cycles_per_iter
15878     = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops,
15879 					    issue_info->advsimd);
15880 
15881   bool could_use_advsimd
15882     = ((costs->vec_flags & VEC_ADVSIMD)
15883        || (aarch64_autovec_preference != 2
15884 	   && (aarch64_tune_params.extra_tuning_flags
15885 	       & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
15886 	   && !costs->saw_sve_only_op));
15887 
15888   if (dump_enabled_p ())
15889     {
15890       if (IN_RANGE (costs->num_vector_iterations, 0, 65536))
15891 	dump_printf_loc (MSG_NOTE, vect_location,
15892 			 "Vector loop iterates at most %wd times\n",
15893 			 costs->num_vector_iterations);
15894       dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
15895       costs->scalar_ops.dump ();
15896       dump_printf_loc (MSG_NOTE, vect_location,
15897 		       "  estimated cycles per iteration = %f\n",
15898 		       scalar_cycles_per_iter.as_double ());
15899       if (could_use_advsimd)
15900 	{
15901 	  dump_printf_loc (MSG_NOTE, vect_location,
15902 			   "Advanced SIMD issue estimate:\n");
15903 	  costs->advsimd_ops.dump ();
15904 	  dump_printf_loc (MSG_NOTE, vect_location,
15905 			   "  estimated cycles per iteration = %f\n",
15906 			   advsimd_cycles_per_iter.as_double ());
15907 	}
15908       else
15909 	dump_printf_loc (MSG_NOTE, vect_location,
15910 			 "Loop could not use Advanced SIMD\n");
15911     }
15912 
15913   fractional_cost vector_cycles_per_iter = advsimd_cycles_per_iter;
15914   unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency;
15915 
15916   if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve)
15917     {
15918       if (dump_enabled_p ())
15919 	dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
15920       vector_reduction_latency = costs->sve_ops.reduction_latency;
15921       vector_cycles_per_iter
15922 	= aarch64_adjust_body_cost_sve (costs, issue_info,
15923 					scalar_cycles_per_iter,
15924 					advsimd_cycles_per_iter,
15925 					could_use_advsimd, orig_body_cost,
15926 					&body_cost, &should_disparage);
15927 
15928       if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15929 	{
15930 	  /* Also take Neoverse V1 tuning into account, doubling the
15931 	     scalar and Advanced SIMD estimates to account for the
15932 	     doubling in SVE vector length.  */
15933 	  if (dump_enabled_p ())
15934 	    dump_printf_loc (MSG_NOTE, vect_location,
15935 			     "Neoverse V1 estimate:\n");
15936 	  aarch64_adjust_body_cost_sve (costs, &neoversev1_vec_issue_info,
15937 					scalar_cycles_per_iter * 2,
15938 					advsimd_cycles_per_iter * 2,
15939 					could_use_advsimd, orig_body_cost,
15940 					&body_cost, &should_disparage);
15941 	}
15942     }
15943 
15944   /* Decide whether to stick to latency-based costs or whether to try to
15945      take issue rates into account.  */
15946   unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
15947   if (costs->vec_flags & VEC_ANY_SVE)
15948     threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
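  /* E.g. if the threshold parameter were 6 iterations and the estimated SVE
     vector length 256 bits (VQ == 2), pure latency costs would be used for
     loops known to run at least one but fewer than CEIL (6, 2) = 3 vector
     iterations.  */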
15949 
15950   if (costs->num_vector_iterations >= 1
15951       && costs->num_vector_iterations < threshold)
15952     {
15953       if (dump_enabled_p ())
15954 	dump_printf_loc (MSG_NOTE, vect_location,
15955 			 "Low iteration count, so using pure latency"
15956 			 " costs\n");
15957     }
15958   /* Increase the cost of the vector code if it looks like the scalar code
15959      could issue more quickly.  These values are only rough estimates,
15960      so minor differences should only result in minor changes.  */
15961   else if (scalar_cycles_per_iter < vector_cycles_per_iter)
15962     {
15963       body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
15964 					  scalar_cycles_per_iter);
15965       if (dump_enabled_p ())
15966 	dump_printf_loc (MSG_NOTE, vect_location,
15967 			 "Increasing body cost to %d because scalar code"
15968 			 " would issue more quickly\n", body_cost);
15969     }
15970   /* In general, it's expected that the proposed vector code would be able
15971      to issue more quickly than the original scalar code.  This should
15972      already be reflected to some extent in the latency-based costs.
15973 
15974      However, the latency-based costs effectively assume that the scalar
15975      code and the vector code execute serially, which tends to underplay
15976      one important case: if the real (non-serialized) execution time of
15977      a scalar iteration is dominated by loop-carried dependencies,
15978      and if the vector code is able to reduce both the length of
15979      the loop-carried dependencies *and* the number of cycles needed
15980      to issue the code in general, we can be more confident that the
15981      vector code is an improvement, even if adding the other (non-loop-carried)
15982      latencies tends to hide this saving.  We therefore reduce the cost of the
15983      vector loop body in proportion to the saving.  */
15984   else if (costs->scalar_ops.reduction_latency > vector_reduction_latency
15985 	   && costs->scalar_ops.reduction_latency == scalar_cycles_per_iter
15986 	   && scalar_cycles_per_iter > vector_cycles_per_iter
15987 	   && !should_disparage)
15988     {
15989       body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
15990 					  scalar_cycles_per_iter);
15991       if (dump_enabled_p ())
15992 	dump_printf_loc (MSG_NOTE, vect_location,
15993 			 "Decreasing body cost to %d to account for smaller"
15994 			 " reduction latency\n", body_cost);
15995     }
15996 
15997   return body_cost;
15998 }
15999 
16000 /* Implement TARGET_VECTORIZE_FINISH_COST.  */
16001 static void
16002 aarch64_finish_cost (void *data, unsigned *prologue_cost,
16003 		     unsigned *body_cost, unsigned *epilogue_cost)
16004 {
16005   auto *costs = static_cast<aarch64_vector_costs *> (data);
16006   *prologue_cost = costs->region[vect_prologue];
16007   *body_cost     = costs->region[vect_body];
16008   *epilogue_cost = costs->region[vect_epilogue];
16009 
16010   if (costs->is_loop
16011       && costs->vec_flags
16012       && aarch64_use_new_vector_costs_p ())
16013     *body_cost = aarch64_adjust_body_cost (costs, *body_cost);
16014 }
16015 
16016 /* Implement TARGET_VECTORIZE_DESTROY_COST_DATA.  */
16017 static void
16018 aarch64_destroy_cost_data (void *data)
16019 {
16020   delete static_cast<aarch64_vector_costs *> (data);
16021 }
16022 
16023 static void initialize_aarch64_code_model (struct gcc_options *);
16024 
16025 /* Parse the TO_PARSE string and put the architecture struct that it
16026    selects into RES and the architectural features into ISA_FLAGS.
16027    Return an aarch64_parse_opt_result describing the parse result.
16028    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
16029    When the TO_PARSE string contains an invalid extension,
16030    a copy of the string is created and stored to INVALID_EXTENSION.  */
16031 
16032 static enum aarch64_parse_opt_result
16033 aarch64_parse_arch (const char *to_parse, const struct processor **res,
16034 		    uint64_t *isa_flags, std::string *invalid_extension)
16035 {
16036   const char *ext;
16037   const struct processor *arch;
16038   size_t len;
16039 
16040   ext = strchr (to_parse, '+');
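  /* E.g. for "armv8.2-a+crypto", EXT points at "+crypto" and LEN below
     covers just "armv8.2-a".  */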
16041 
16042   if (ext != NULL)
16043     len = ext - to_parse;
16044   else
16045     len = strlen (to_parse);
16046 
16047   if (len == 0)
16048     return AARCH64_PARSE_MISSING_ARG;
16049 
16050 
16051   /* Loop through the list of supported ARCHes to find a match.  */
16052   for (arch = all_architectures; arch->name != NULL; arch++)
16053     {
16054       if (strlen (arch->name) == len
16055 	  && strncmp (arch->name, to_parse, len) == 0)
16056 	{
16057 	  uint64_t isa_temp = arch->flags;
16058 
16059 	  if (ext != NULL)
16060 	    {
16061 	      /* TO_PARSE string contains at least one extension.  */
16062 	      enum aarch64_parse_opt_result ext_res
16063 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
16064 
16065 	      if (ext_res != AARCH64_PARSE_OK)
16066 		return ext_res;
16067 	    }
16068 	  /* Extension parsing was successful.  Confirm the result
16069 	     arch and ISA flags.  */
16070 	  *res = arch;
16071 	  *isa_flags = isa_temp;
16072 	  return AARCH64_PARSE_OK;
16073 	}
16074     }
16075 
16076   /* ARCH name not found in list.  */
16077   return AARCH64_PARSE_INVALID_ARG;
16078 }
16079 
16080 /* Parse the TO_PARSE string and put the result tuning in RES and the
16081    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
16082    describing the parse result.  If there is an error parsing, RES and
16083    ISA_FLAGS are left unchanged.
16084    When the TO_PARSE string contains an invalid extension,
16085    a copy of the string is created and stored to INVALID_EXTENSION.  */
16086 
16087 static enum aarch64_parse_opt_result
16088 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
16089 		   uint64_t *isa_flags, std::string *invalid_extension)
16090 {
16091   const char *ext;
16092   const struct processor *cpu;
16093   size_t len;
16094 
16095   ext = strchr (to_parse, '+');
16096 
16097   if (ext != NULL)
16098     len = ext - to_parse;
16099   else
16100     len = strlen (to_parse);
16101 
16102   if (len == 0)
16103     return AARCH64_PARSE_MISSING_ARG;
16104 
16105 
16106   /* Loop through the list of supported CPUs to find a match.  */
16107   for (cpu = all_cores; cpu->name != NULL; cpu++)
16108     {
16109       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
16110 	{
16111 	  uint64_t isa_temp = cpu->flags;
16112 
16113 
16114 	  if (ext != NULL)
16115 	    {
16116 	      /* TO_PARSE string contains at least one extension.  */
16117 	      enum aarch64_parse_opt_result ext_res
16118 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
16119 
16120 	      if (ext_res != AARCH64_PARSE_OK)
16121 		return ext_res;
16122 	    }
16123 	  /* Extension parsing was successful.  Confirm the result
16124 	     cpu and ISA flags.  */
16125 	  *res = cpu;
16126 	  *isa_flags = isa_temp;
16127 	  return AARCH64_PARSE_OK;
16128 	}
16129     }
16130 
16131   /* CPU name not found in list.  */
16132   return AARCH64_PARSE_INVALID_ARG;
16133 }
16134 
16135 /* Parse the TO_PARSE string and put the cpu it selects into RES.
16136    Return an aarch64_parse_opt_result describing the parse result.
16137    If the parsing fails, RES does not change.  */
16138 
16139 static enum aarch64_parse_opt_result
16140 aarch64_parse_tune (const char *to_parse, const struct processor **res)
16141 {
16142   const struct processor *cpu;
16143 
16144   /* Loop through the list of supported CPUs to find a match.  */
16145   for (cpu = all_cores; cpu->name != NULL; cpu++)
16146     {
16147       if (strcmp (cpu->name, to_parse) == 0)
16148 	{
16149 	  *res = cpu;
16150 	  return AARCH64_PARSE_OK;
16151 	}
16152     }
16153 
16154   /* CPU name not found in list.  */
16155   return AARCH64_PARSE_INVALID_ARG;
16156 }
16157 
16158 /* Parse TOKEN, which has length LENGTH, to see if it is an option
16159    described in FLAG.  If it is, return the index bit for that fusion type.
16160    If not, error (printing OPTION_NAME) and return zero.  */
16161 
16162 static unsigned int
16163 aarch64_parse_one_option_token (const char *token,
16164 				size_t length,
16165 				const struct aarch64_flag_desc *flag,
16166 				const char *option_name)
16167 {
16168   for (; flag->name != NULL; flag++)
16169     {
16170       if (length == strlen (flag->name)
16171 	  && !strncmp (flag->name, token, length))
16172 	return flag->flag;
16173     }
16174 
16175   error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
16176   return 0;
16177 }
16178 
16179 /* Parse OPTION which is a comma-separated list of flags to enable.
16180    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
16181    default state we inherit from the CPU tuning structures.  OPTION_NAME
16182    gives the top-level option we are parsing in the -moverride string,
16183    for use in error messages.  */
16184 
16185 static unsigned int
16186 aarch64_parse_boolean_options (const char *option,
16187 			       const struct aarch64_flag_desc *flags,
16188 			       unsigned int initial_state,
16189 			       const char *option_name)
16190 {
16191   const char separator = '.';
16192   const char* specs = option;
16193   const char* ntoken = option;
16194   unsigned int found_flags = initial_state;
16195 
16196   while ((ntoken = strchr (specs, separator)))
16197     {
16198       size_t token_length = ntoken - specs;
16199       unsigned token_ops = aarch64_parse_one_option_token (specs,
16200 							   token_length,
16201 							   flags,
16202 							   option_name);
16203       /* If we find "none" (or, for simplicity's sake, an error) anywhere
16204 	 in the token stream, reset the supported operations.  So:
16205 
16206 	   adrp+add.cmp+branch.none.adrp+add
16207 
16208 	   would have the result of turning on only adrp+add fusion.  */
16209       if (!token_ops)
16210 	found_flags = 0;
16211 
16212       found_flags |= token_ops;
16213       specs = ++ntoken;
16214     }
16215 
16216   /* The string ended with a separator, so there is no final token.  */
16217   if (!(*specs))
16218     {
16219       error ("%s string ill-formed\n", option_name);
16220       return 0;
16221     }
16222 
16223   /* We still have one more token to parse.  */
16224   size_t token_length = strlen (specs);
16225   unsigned token_ops = aarch64_parse_one_option_token (specs,
16226 						       token_length,
16227 						       flags,
16228 						       option_name);
16229   if (!token_ops)
16230     found_flags = 0;
16231 
16232   found_flags |= token_ops;
16233   return found_flags;
16234 }
16235 
16236 /* Support for overriding instruction fusion.  */
16237 
16238 static void
16239 aarch64_parse_fuse_string (const char *fuse_string,
16240 			    struct tune_params *tune)
16241 {
16242   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
16243 						     aarch64_fusible_pairs,
16244 						     tune->fusible_ops,
16245 						     "fuse=");
16246 }
16247 
16248 /* Support for overriding other tuning flags.  */
16249 
16250 static void
16251 aarch64_parse_tune_string (const char *tune_string,
16252 			    struct tune_params *tune)
16253 {
16254   tune->extra_tuning_flags
16255     = aarch64_parse_boolean_options (tune_string,
16256 				     aarch64_tuning_flags,
16257 				     tune->extra_tuning_flags,
16258 				     "tune=");
16259 }
16260 
16261 /* Parse the sve_width tuning moverride string in TUNE_STRING.
16262    Accept the valid SVE vector widths allowed by
16263    aarch64_sve_vector_bits_enum and use it to override sve_width
16264    in TUNE.  */
16265 
16266 static void
16267 aarch64_parse_sve_width_string (const char *tune_string,
16268 				struct tune_params *tune)
16269 {
16270   int width = -1;
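  /* TUNE_STRING holds the text after "sve_width=" in the -moverride string,
     e.g. "256" for -moverride=sve_width=256.  */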
16271 
16272   int n = sscanf (tune_string, "%d", &width);
16273   if (n == EOF)
16274     {
16275       error ("invalid format for sve_width");
16276       return;
16277     }
16278   switch (width)
16279     {
16280     case SVE_128:
16281     case SVE_256:
16282     case SVE_512:
16283     case SVE_1024:
16284     case SVE_2048:
16285       break;
16286     default:
16287       error ("invalid sve_width value: %d", width);
16288     }
16289   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
16290 }
16291 
16292 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
16293    we understand.  If it is, extract the option string and hand it off to
16294    the appropriate function.  */
16295 
16296 void
16297 aarch64_parse_one_override_token (const char* token,
16298 				  size_t length,
16299 				  struct tune_params *tune)
16300 {
16301   const struct aarch64_tuning_override_function *fn
16302     = aarch64_tuning_override_functions;
16303 
16304   const char *option_part = strchr (token, '=');
16305   if (!option_part)
16306     {
16307       error ("tuning string missing in option (%s)", token);
16308       return;
16309     }
16310 
16311   /* Get the length of the option name.  */
16312   length = option_part - token;
16313   /* Skip the '=' to get to the option string.  */
16314   option_part++;
16315 
16316   for (; fn->name != NULL; fn++)
16317     {
16318       if (!strncmp (fn->name, token, length))
16319 	{
16320 	  fn->parse_override (option_part, tune);
16321 	  return;
16322 	}
16323     }
16324 
16325   error ("unknown tuning option (%s)", token);
16326   return;
16327 }
16328 
16329 /* Set the default TLS size and clamp it to what the code model allows.  */
16330 
16331 static void
16332 initialize_aarch64_tls_size (struct gcc_options *opts)
16333 {
16334   if (aarch64_tls_size == 0)
16335     aarch64_tls_size = 24;
16336 
16337   switch (opts->x_aarch64_cmodel_var)
16338     {
16339     case AARCH64_CMODEL_TINY:
16340       /* Both the default and maximum TLS size allowed under tiny is 1M which
16341 	 needs two instructions to address, so we clamp the size to 24.  */
16342       if (aarch64_tls_size > 24)
16343 	aarch64_tls_size = 24;
16344       break;
16345     case AARCH64_CMODEL_SMALL:
16346       /* The maximum TLS size allowed under small is 4G.  */
16347       if (aarch64_tls_size > 32)
16348 	aarch64_tls_size = 32;
16349       break;
16350     case AARCH64_CMODEL_LARGE:
16351       /* The maximum TLS size allowed under large is 16E.
16352 	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
16353       if (aarch64_tls_size > 48)
16354 	aarch64_tls_size = 48;
16355       break;
16356     default:
16357       gcc_unreachable ();
16358     }
16359 
16360   return;
16361 }
16362 
16363 /* Parse STRING looking for options in the format:
16364      string	:: option:string
16365      option	:: name=substring
16366      name	:: {a-z}
16367      substring	:: defined by option.  */
16368 
16369 static void
16370 aarch64_parse_override_string (const char* input_string,
16371 			       struct tune_params* tune)
16372 {
16373   const char separator = ':';
16374   size_t string_length = strlen (input_string) + 1;
16375   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
16376   char *string = string_root;
16377   strncpy (string, input_string, string_length);
16378   string[string_length - 1] = '\0';
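  /* E.g. "sve_width=256:fuse=adrp+add" is split at each ':' and each
     name=value token is handed to aarch64_parse_one_override_token.  */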
16379 
16380   char* ntoken = string;
16381 
16382   while ((ntoken = strchr (string, separator)))
16383     {
16384       size_t token_length = ntoken - string;
16385       /* Make this substring look like a string.  */
16386       *ntoken = '\0';
16387       aarch64_parse_one_override_token (string, token_length, tune);
16388       string = ++ntoken;
16389     }
16390 
16391   /* One last option to parse.  */
16392   aarch64_parse_one_override_token (string, strlen (string), tune);
16393   free (string_root);
16394 }
16395 
16396 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
16397    are best for a generic target with the currently-enabled architecture
16398    extensions.  */
16399 static void
16400 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
16401 {
16402   /* Neoverse V1 is the only core that is known to benefit from
16403      AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.  There is therefore no
16404      point enabling it for SVE2 and above.  */
16405   if (TARGET_SVE2)
16406     current_tune.extra_tuning_flags
16407       &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
16408 }
16409 
16410 static void
16411 aarch64_override_options_after_change_1 (struct gcc_options *opts)
16412 {
16413   if (accepted_branch_protection_string)
16414     {
16415       opts->x_aarch64_branch_protection_string
16416 	= xstrdup (accepted_branch_protection_string);
16417     }
16418 
16419   /* PR 70044: We have to be careful about being called multiple times for the
16420      same function.  This means all changes should be repeatable.  */
16421 
16422   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
16423      Disable the frame pointer flag so the mid-end will not use a frame
16424      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
16425      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
16426      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
16427   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
16428   if (opts->x_flag_omit_frame_pointer == 0)
16429     opts->x_flag_omit_frame_pointer = 2;
16430 
16431   /* If not optimizing for size, set the default
16432      alignment to what the target wants.  */
16433   if (!opts->x_optimize_size)
16434     {
16435       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
16436 	opts->x_str_align_loops = aarch64_tune_params.loop_align;
16437       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
16438 	opts->x_str_align_jumps = aarch64_tune_params.jump_align;
16439       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
16440 	opts->x_str_align_functions = aarch64_tune_params.function_align;
16441     }
16442 
16443   /* We default to no pc-relative literal loads.  */
16444 
16445   aarch64_pcrelative_literal_loads = false;
16446 
16447   /* If -mpc-relative-literal-loads is set on the command line, this
16448      implies that the user asked for PC relative literal loads.  */
16449   if (opts->x_pcrelative_literal_loads == 1)
16450     aarch64_pcrelative_literal_loads = true;
16451 
16452   /* In the tiny memory model it makes no sense to disallow PC relative
16453      literal pool loads.  */
16454   if (aarch64_cmodel == AARCH64_CMODEL_TINY
16455       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
16456     aarch64_pcrelative_literal_loads = true;
16457 
16458   /* When enabling the lower precision Newton series for the square root, also
16459      enable it for the reciprocal square root, since the latter is an
16460      intermediary step for the former.  */
16461   if (flag_mlow_precision_sqrt)
16462     flag_mrecip_low_precision_sqrt = true;
16463 }
16464 
16465 /* 'Unpack' the internal tuning structs and update the options
16466     in OPTS.  The caller must have set up selected_tune and selected_arch
16467     as all the other target-specific codegen decisions are
16468     derived from them.  */
16469 
16470 void
16471 aarch64_override_options_internal (struct gcc_options *opts)
16472 {
16473   aarch64_tune_flags = selected_tune->flags;
16474   aarch64_tune = selected_tune->sched_core;
16475   /* Make a copy of the tuning parameters attached to the core, which
16476      we may later overwrite.  */
16477   aarch64_tune_params = *(selected_tune->tune);
16478   aarch64_architecture_version = selected_arch->architecture_version;
16479   if (selected_tune->tune == &generic_tunings)
16480     aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
16481 
16482   if (opts->x_aarch64_override_tune_string)
16483     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
16484 				  &aarch64_tune_params);
16485 
16486   /* This target defaults to strict volatile bitfields.  */
16487   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
16488     opts->x_flag_strict_volatile_bitfields = 1;
16489 
16490   if (aarch64_stack_protector_guard == SSP_GLOBAL
16491       && opts->x_aarch64_stack_protector_guard_offset_str)
16492     {
16493       error ("incompatible options %<-mstack-protector-guard=global%> and "
16494 	     "%<-mstack-protector-guard-offset=%s%>",
16495 	     aarch64_stack_protector_guard_offset_str);
16496     }
16497 
16498   if (aarch64_stack_protector_guard == SSP_SYSREG
16499       && !(opts->x_aarch64_stack_protector_guard_offset_str
16500 	   && opts->x_aarch64_stack_protector_guard_reg_str))
16501     {
16502       error ("both %<-mstack-protector-guard-offset%> and "
16503 	     "%<-mstack-protector-guard-reg%> must be used "
16504 	     "with %<-mstack-protector-guard=sysreg%>");
16505     }
16506 
16507   if (opts->x_aarch64_stack_protector_guard_reg_str)
16508     {
16509       if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
16510 	  error ("specify a system register with a small string length.");
16511     }
16512 
16513   if (opts->x_aarch64_stack_protector_guard_offset_str)
16514     {
16515       char *end;
16516       const char *str = aarch64_stack_protector_guard_offset_str;
16517       errno = 0;
16518       long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
16519       if (!*str || *end || errno)
16520 	error ("%qs is not a valid offset in %qs", str,
16521 	       "-mstack-protector-guard-offset=");
16522       aarch64_stack_protector_guard_offset = offs;
16523     }
16524 
16525   initialize_aarch64_code_model (opts);
16526   initialize_aarch64_tls_size (opts);
16527 
16528   int queue_depth = 0;
16529   switch (aarch64_tune_params.autoprefetcher_model)
16530     {
16531       case tune_params::AUTOPREFETCHER_OFF:
16532 	queue_depth = -1;
16533 	break;
16534       case tune_params::AUTOPREFETCHER_WEAK:
16535 	queue_depth = 0;
16536 	break;
16537       case tune_params::AUTOPREFETCHER_STRONG:
16538 	queue_depth = max_insn_queue_index + 1;
16539 	break;
16540       default:
16541 	gcc_unreachable ();
16542     }
16543 
16544   /* We don't mind passing in global_options_set here as we don't use
16545      the *options_set structs anyway.  */
16546   SET_OPTION_IF_UNSET (opts, &global_options_set,
16547 		       param_sched_autopref_queue_depth, queue_depth);
16548 
16549   /* If Advanced SIMD is used exclusively for autovectorization, disable the
16550      SVE vector cost comparison.  */
16551   if (aarch64_autovec_preference == 1)
16552     SET_OPTION_IF_UNSET (opts, &global_options_set,
16553 			 aarch64_sve_compare_costs, 0);
16554 
16555   /* Set up parameters to be used in prefetching algorithm.  Do not
16556      override the defaults unless we are tuning for a core we have
16557      researched values for.  */
16558   if (aarch64_tune_params.prefetch->num_slots > 0)
16559     SET_OPTION_IF_UNSET (opts, &global_options_set,
16560 			 param_simultaneous_prefetches,
16561 			 aarch64_tune_params.prefetch->num_slots);
16562   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
16563     SET_OPTION_IF_UNSET (opts, &global_options_set,
16564 			 param_l1_cache_size,
16565 			 aarch64_tune_params.prefetch->l1_cache_size);
16566   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
16567     SET_OPTION_IF_UNSET (opts, &global_options_set,
16568 			 param_l1_cache_line_size,
16569 			 aarch64_tune_params.prefetch->l1_cache_line_size);
16570   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
16571     SET_OPTION_IF_UNSET (opts, &global_options_set,
16572 			 param_l2_cache_size,
16573 			 aarch64_tune_params.prefetch->l2_cache_size);
16574   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
16575     SET_OPTION_IF_UNSET (opts, &global_options_set,
16576 			 param_prefetch_dynamic_strides, 0);
16577   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
16578     SET_OPTION_IF_UNSET (opts, &global_options_set,
16579 			 param_prefetch_minimum_stride,
16580 			 aarch64_tune_params.prefetch->minimum_stride);
16581 
16582   /* Use the alternative scheduling-pressure algorithm by default.  */
16583   SET_OPTION_IF_UNSET (opts, &global_options_set,
16584 		       param_sched_pressure_algorithm,
16585 		       SCHED_PRESSURE_MODEL);
16586 
16587   /* Validate the guard size.  */
16588   int guard_size = param_stack_clash_protection_guard_size;
16589 
16590   if (guard_size != 12 && guard_size != 16)
16591     error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
16592 	   "size.  Given value %d (%llu KB) is out of range",
16593 	   guard_size, (1ULL << guard_size) / 1024ULL);
16594 
16595   /* Enforce that the probing interval is the same as the guard size so the
16596      mid-end does the right thing.  */
16597   SET_OPTION_IF_UNSET (opts, &global_options_set,
16598 		       param_stack_clash_protection_probe_interval,
16599 		       guard_size);
16600 
16601   /* The maybe_set calls won't update the value if the user has explicitly set
16602      one, which means we need to validate that the probing interval and guard
16603      size are equal.  */
16604   int probe_interval
16605     = param_stack_clash_protection_probe_interval;
16606   if (guard_size != probe_interval)
16607     error ("stack clash guard size %<%d%> must be equal to probing interval "
16608 	   "%<%d%>", guard_size, probe_interval);
16609 
16610   /* Enable software prefetching at the specified optimization level for
16611      CPUs that have prefetch.  Lower the optimization level threshold by 1
16612      when profiling is enabled.  */
16613   if (opts->x_flag_prefetch_loop_arrays < 0
16614       && !opts->x_optimize_size
16615       && aarch64_tune_params.prefetch->default_opt_level >= 0
16616       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
16617     opts->x_flag_prefetch_loop_arrays = 1;
16618 
16619   if (opts->x_aarch64_arch_string == NULL)
16620     opts->x_aarch64_arch_string = selected_arch->name;
16621   if (opts->x_aarch64_cpu_string == NULL)
16622     opts->x_aarch64_cpu_string = selected_cpu->name;
16623   if (opts->x_aarch64_tune_string == NULL)
16624     opts->x_aarch64_tune_string = selected_tune->name;
16625 
16626   aarch64_override_options_after_change_1 (opts);
16627 }
16628 
16629 /* Print a hint with a suggestion for a core or architecture name that
16630    most closely resembles what the user passed in STR.  ARCH is true if
16631    the user is asking for an architecture name.  ARCH is false if the user
16632    is asking for a core name.  */
16633 
16634 static void
16635 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
16636 {
16637   auto_vec<const char *> candidates;
16638   const struct processor *entry = arch ? all_architectures : all_cores;
16639   for (; entry->name != NULL; entry++)
16640     candidates.safe_push (entry->name);
16641 
16642 #ifdef HAVE_LOCAL_CPU_DETECT
16643   /* Also add "native" as a possible value.  */
16644   if (arch)
16645     candidates.safe_push ("native");
16646 #endif
16647 
16648   char *s;
16649   const char *hint = candidates_list_and_hint (str, s, candidates);
16650   if (hint)
16651     inform (input_location, "valid arguments are: %s;"
16652 			     " did you mean %qs?", s, hint);
16653   else
16654     inform (input_location, "valid arguments are: %s", s);
16655 
16656   XDELETEVEC (s);
16657 }
16658 
16659 /* Print a hint with a suggestion for a core name that most closely resembles
16660    what the user passed in STR.  */
16661 
16662 inline static void
16663 aarch64_print_hint_for_core (const char *str)
16664 {
16665   aarch64_print_hint_for_core_or_arch (str, false);
16666 }
16667 
16668 /* Print a hint with a suggestion for an architecture name that most closely
16669    resembles what the user passed in STR.  */
16670 
16671 inline static void
16672 aarch64_print_hint_for_arch (const char *str)
16673 {
16674   aarch64_print_hint_for_core_or_arch (str, true);
16675 }
16676 
16677 
16678 /* Print a hint with a suggestion for an extension name
16679    that most closely resembles what the user passed in STR.  */
16680 
16681 void
16682 aarch64_print_hint_for_extensions (const std::string &str)
16683 {
16684   auto_vec<const char *> candidates;
16685   aarch64_get_all_extension_candidates (&candidates);
16686   char *s;
16687   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
16688   if (hint)
16689     inform (input_location, "valid arguments are: %s;"
16690 			     " did you mean %qs?", s, hint);
16691   else
16692     inform (input_location, "valid arguments are: %s", s);
16693 
16694   XDELETEVEC (s);
16695 }
16696 
16697 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
16698    specified in STR and throw errors if appropriate.  Put the results, if
16699    they are valid, in RES and ISA_FLAGS.  Return whether the option is
16700    valid.  */
16701 
16702 static bool
16703 aarch64_validate_mcpu (const char *str, const struct processor **res,
16704 		       uint64_t *isa_flags)
16705 {
16706   std::string invalid_extension;
16707   enum aarch64_parse_opt_result parse_res
16708     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
16709 
16710   if (parse_res == AARCH64_PARSE_OK)
16711     return true;
16712 
16713   switch (parse_res)
16714     {
16715       case AARCH64_PARSE_MISSING_ARG:
16716 	error ("missing cpu name in %<-mcpu=%s%>", str);
16717 	break;
16718       case AARCH64_PARSE_INVALID_ARG:
16719 	error ("unknown value %qs for %<-mcpu%>", str);
16720 	aarch64_print_hint_for_core (str);
16721 	break;
16722       case AARCH64_PARSE_INVALID_FEATURE:
16723 	error ("invalid feature modifier %qs in %<-mcpu=%s%>",
16724 	       invalid_extension.c_str (), str);
16725 	aarch64_print_hint_for_extensions (invalid_extension);
16726 	break;
16727       default:
16728 	gcc_unreachable ();
16729     }
16730 
16731   return false;
16732 }
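
/* For example (editorial addition, not part of the original source):
   "-mcpu=cortex-a57+crypto" fills in *RES and *ISA_FLAGS from the core name
   and the "+crypto" modifier, whereas "-mcpu=cortex-a57+bogus" is diagnosed
   as an invalid feature modifier, so the error above fires and the hint
   machinery suggests the closest valid extension name.  */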
16733 
16734 /* Straight line speculation indicators.  */
16735 enum aarch64_sls_hardening_type
16736 {
16737   SLS_NONE = 0,
16738   SLS_RETBR = 1,
16739   SLS_BLR = 2,
16740   SLS_ALL = 3,
16741 };
16742 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
16743 
16744 /* Return whether we should mitigate Straight Line Speculation for the RET
16745    and BR instructions.  */
16746 bool
16747 aarch64_harden_sls_retbr_p (void)
16748 {
16749   return aarch64_sls_hardening & SLS_RETBR;
16750 }
16751 
16752 /* Return whether we should mitigate Straight Line Speculation for the BLR
16753    instruction.  */
16754 bool
16755 aarch64_harden_sls_blr_p (void)
16756 {
16757   return aarch64_sls_hardening & SLS_BLR;
16758 }
16759 
16760 /* As yet we only allow setting these options globally; in the future we may
16761    allow setting them per function.  */
16762 static void
16763 aarch64_validate_sls_mitigation (const char *const_str)
16764 {
16765   char *token_save = NULL;
16766   char *str = NULL;
16767 
16768   if (strcmp (const_str, "none") == 0)
16769     {
16770       aarch64_sls_hardening = SLS_NONE;
16771       return;
16772     }
16773   if (strcmp (const_str, "all") == 0)
16774     {
16775       aarch64_sls_hardening = SLS_ALL;
16776       return;
16777     }
16778 
16779   char *str_root = xstrdup (const_str);
16780   str = strtok_r (str_root, ",", &token_save);
16781   if (!str)
16782     error ("invalid argument given to %<-mharden-sls=%>");
16783 
16784   int temp = SLS_NONE;
16785   while (str)
16786     {
16787       if (strcmp (str, "blr") == 0)
16788 	temp |= SLS_BLR;
16789       else if (strcmp (str, "retbr") == 0)
16790 	temp |= SLS_RETBR;
16791       else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
16792 	{
16793 	  error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
16794 	  break;
16795 	}
16796       else
16797 	{
16798 	  error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
16799 	  break;
16800 	}
16801       str = strtok_r (NULL, ",", &token_save);
16802     }
16803   aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
16804   free (str_root);
16805 }
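
/* Editorial sketch (not part of the original source, guarded out): how the
   accepted -mharden-sls= strings map onto the bitmask tested by the
   predicates above.  */
#if 0
static void
aarch64_sls_mitigation_example (void)
{
  aarch64_validate_sls_mitigation ("retbr,blr");	/* SLS_ALL  */
  gcc_checking_assert (aarch64_harden_sls_retbr_p ()
		       && aarch64_harden_sls_blr_p ());

  aarch64_validate_sls_mitigation ("none");		/* SLS_NONE  */
  gcc_checking_assert (!aarch64_harden_sls_retbr_p ()
		       && !aarch64_harden_sls_blr_p ());
}
#endif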
16806 
16807 /* Parses CONST_STR for branch protection features specified in
16808    aarch64_branch_protect_types, and sets any global variables required.  Returns
16809    the parsing result and assigns LAST_STR to the last processed token from
16810    CONST_STR so that it can be used for error reporting.  */
16811 
16812 static enum
16813 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
16814 							  char** last_str)
16815 {
16816   char *str_root = xstrdup (const_str);
16817   char* token_save = NULL;
16818   char *str = strtok_r (str_root, "+", &token_save);
16819   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
16820   if (!str)
16821     res = AARCH64_PARSE_MISSING_ARG;
16822   else
16823     {
16824       char *next_str = strtok_r (NULL, "+", &token_save);
16825       /* Reset the branch protection features to their defaults.  */
16826       aarch64_handle_no_branch_protection (NULL, NULL);
16827 
16828       while (str && res == AARCH64_PARSE_OK)
16829 	{
16830 	  const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
16831 	  bool found = false;
16832 	  /* Search for this type.  */
16833 	  while (type && type->name && !found && res == AARCH64_PARSE_OK)
16834 	    {
16835 	      if (strcmp (str, type->name) == 0)
16836 		{
16837 		  found = true;
16838 		  res = type->handler (str, next_str);
16839 		  str = next_str;
16840 		  next_str = strtok_r (NULL, "+", &token_save);
16841 		}
16842 	      else
16843 		type++;
16844 	    }
16845 	  if (found && res == AARCH64_PARSE_OK)
16846 	    {
16847 	      bool found_subtype = true;
16848 	      /* Loop through each token until we find one that isn't a
16849 		 subtype.  */
16850 	      while (found_subtype)
16851 		{
16852 		  found_subtype = false;
16853 		  const aarch64_branch_protect_type *subtype = type->subtypes;
16854 		  /* Search for the subtype.  */
16855 		  while (str && subtype && subtype->name && !found_subtype
16856 			  && res == AARCH64_PARSE_OK)
16857 		    {
16858 		      if (strcmp (str, subtype->name) == 0)
16859 			{
16860 			  found_subtype = true;
16861 			  res = subtype->handler (str, next_str);
16862 			  str = next_str;
16863 			  next_str = strtok_r (NULL, "+", &token_save);
16864 			}
16865 		      else
16866 			subtype++;
16867 		    }
16868 		}
16869 	    }
16870 	  else if (!found)
16871 	    res = AARCH64_PARSE_INVALID_ARG;
16872 	}
16873     }
16874   /* Copy the last processed token into the argument to pass it back.
16875     Used by option and attribute validation to print the offending token.  */
16876   if (last_str)
16877     {
16878       if (str) strcpy (*last_str, str);
16879       else *last_str = NULL;
16880     }
16881   if (res == AARCH64_PARSE_OK)
16882     {
16883       /* If needed, allocate the accepted string, then copy in const_str.
16884 	Used by aarch64_override_options_after_change_1.  */
16885       if (!accepted_branch_protection_string)
16886 	accepted_branch_protection_string = (char *) xmalloc (
16887 						      BRANCH_PROTECT_STR_MAX
16888 							+ 1);
16889       strncpy (accepted_branch_protection_string, const_str,
16890 		BRANCH_PROTECT_STR_MAX + 1);
16891       /* Forcibly null-terminate.  */
16892       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
16893     }
16894   return res;
16895 }
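
/* For example (editorial addition), "pac-ret+leaf+bti" is consumed one "+"
   token at a time: "pac-ret" matches a type handler, "leaf" matches one of
   its subtypes, and "bti" matches another type.  An unknown token such as
   the "foo" in "pac-ret+foo" yields AARCH64_PARSE_INVALID_ARG with LAST_STR
   pointing at "foo" for use in the error message.  */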
16896 
16897 static bool
16898 aarch64_validate_mbranch_protection (const char *const_str)
16899 {
16900   char *str = (char *) xmalloc (strlen (const_str) + 1);
16901   enum aarch64_parse_opt_result res =
16902     aarch64_parse_branch_protection (const_str, &str);
16903   if (res == AARCH64_PARSE_INVALID_ARG)
16904     error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
16905   else if (res == AARCH64_PARSE_MISSING_ARG)
16906     error ("missing argument for %<-mbranch-protection=%>");
16907   free (str);
16908   return res == AARCH64_PARSE_OK;
16909 }
16910 
16911 /* Validate a command-line -march option.  Parse the arch and extensions
16912    (if any) specified in STR and throw errors if appropriate.  Put the
16913    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
16914    option is valid.  */
16915 
16916 static bool
16917 aarch64_validate_march (const char *str, const struct processor **res,
16918 			 uint64_t *isa_flags)
16919 {
16920   std::string invalid_extension;
16921   enum aarch64_parse_opt_result parse_res
16922     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
16923 
16924   if (parse_res == AARCH64_PARSE_OK)
16925     return true;
16926 
16927   switch (parse_res)
16928     {
16929       case AARCH64_PARSE_MISSING_ARG:
16930 	error ("missing arch name in %<-march=%s%>", str);
16931 	break;
16932       case AARCH64_PARSE_INVALID_ARG:
16933 	error ("unknown value %qs for %<-march%>", str);
16934 	aarch64_print_hint_for_arch (str);
16935 	break;
16936       case AARCH64_PARSE_INVALID_FEATURE:
16937 	error ("invalid feature modifier %qs in %<-march=%s%>",
16938 	       invalid_extension.c_str (), str);
16939 	aarch64_print_hint_for_extensions (invalid_extension);
16940 	break;
16941       default:
16942 	gcc_unreachable ();
16943     }
16944 
16945   return false;
16946 }
16947 
16948 /* Validate a command-line -mtune option.  Parse the cpu
16949    specified in STR and throw errors if appropriate.  Put the
16950    result, if it is valid, in RES.  Return whether the option is
16951    valid.  */
16952 
16953 static bool
16954 aarch64_validate_mtune (const char *str, const struct processor **res)
16955 {
16956   enum aarch64_parse_opt_result parse_res
16957     = aarch64_parse_tune (str, res);
16958 
16959   if (parse_res == AARCH64_PARSE_OK)
16960     return true;
16961 
16962   switch (parse_res)
16963     {
16964       case AARCH64_PARSE_MISSING_ARG:
16965 	error ("missing cpu name in %<-mtune=%s%>", str);
16966 	break;
16967       case AARCH64_PARSE_INVALID_ARG:
16968 	error ("unknown value %qs for %<-mtune%>", str);
16969 	aarch64_print_hint_for_core (str);
16970 	break;
16971       default:
16972 	gcc_unreachable ();
16973     }
16974   return false;
16975 }
16976 
16977 /* Return the CPU corresponding to the enum CPU.
16978    If it doesn't specify a cpu, return the default.  */
16979 
16980 static const struct processor *
16981 aarch64_get_tune_cpu (enum aarch64_processor cpu)
16982 {
16983   if (cpu != aarch64_none)
16984     return &all_cores[cpu];
16985 
16986   /* The & 0x3f is to extract the bottom 6 bits that encode the
16987      default cpu as selected by the --with-cpu GCC configure option
16988      in config.gcc.
16989      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
16990      flags mechanism should be reworked to make it more sane.  */
16991   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
16992 }
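
/* Editorial sketch (not part of the original source, guarded out) of the
   packed configure-time default described above: the low 6 bits select the
   core, and the remaining bits carry its default ISA flags, which
   aarch64_override_options later extracts with ">> 6".  */
#if 0
unsigned int default_core_index = TARGET_CPU_DEFAULT & 0x3f;
uint64_t default_isa_flags = (uint64_t) TARGET_CPU_DEFAULT >> 6;
#endif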
16993 
16994 /* Return the architecture corresponding to the enum ARCH.
16995    If it doesn't specify a valid architecture, return the default.  */
16996 
16997 static const struct processor *
16998 aarch64_get_arch (enum aarch64_arch arch)
16999 {
17000   if (arch != aarch64_no_arch)
17001     return &all_architectures[arch];
17002 
17003   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
17004 
17005   return &all_architectures[cpu->arch];
17006 }
17007 
17008 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
17009 
17010 static poly_uint16
17011 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
17012 {
17013   /* 128-bit SVE and Advanced SIMD modes use different register layouts
17014      on big-endian targets, so we would need to forbid subregs that convert
17015      from one to the other.  By default a reinterpret sequence would then
17016      involve a store to memory in one mode and a load back in the other.
17017      Even if we optimize that sequence using reverse instructions,
17018      it would still be a significant potential overhead.
17019 
17020      For now, it seems better to generate length-agnostic code for that
17021      case instead.  */
17022   if (value == SVE_SCALABLE
17023       || (value == SVE_128 && BYTES_BIG_ENDIAN))
17024     return poly_uint16 (2, 2);
17025   else
17026     return (int) value / 64;
17027 }
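
/* Worked examples (editorial addition) of the conversion above:
     -msve-vector-bits=128       ->  VG = 128 / 64  = 2  (little-endian)
     -msve-vector-bits=512       ->  VG = 512 / 64  = 8
     -msve-vector-bits=2048      ->  VG = 2048 / 64 = 32
     -msve-vector-bits=scalable  ->  VG = 2 + 2x, i.e. poly_uint16 (2, 2),
   and 128-bit SVE on big-endian also falls back to the scalable form.  */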
17028 
17029 /* Implement TARGET_OPTION_OVERRIDE.  This is called once at the beginning
17030    and is used to parse the -m{cpu,tune,arch} strings and set up the initial
17031    tuning structs.  In particular it must set selected_tune and
17032    aarch64_isa_flags that define the available ISA features and tuning
17033    decisions.  It must also set selected_arch as this will be used to
17034    output the .arch asm tags for each function.  */
17035 
17036 static void
17037 aarch64_override_options (void)
17038 {
17039   uint64_t cpu_isa = 0;
17040   uint64_t arch_isa = 0;
17041   aarch64_isa_flags = 0;
17042 
17043   bool valid_cpu = true;
17044   bool valid_tune = true;
17045   bool valid_arch = true;
17046 
17047   selected_cpu = NULL;
17048   selected_arch = NULL;
17049   selected_tune = NULL;
17050 
17051   if (aarch64_harden_sls_string)
17052     aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
17053 
17054   if (aarch64_branch_protection_string)
17055     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
17056 
17057   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
17058      If either of -march or -mtune is given, they override their
17059      respective component of -mcpu.  */
17060   if (aarch64_cpu_string)
17061     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
17062 					&cpu_isa);
17063 
17064   if (aarch64_arch_string)
17065     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
17066 					  &arch_isa);
17067 
17068   if (aarch64_tune_string)
17069     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
17070 
17071 #ifdef SUBTARGET_OVERRIDE_OPTIONS
17072   SUBTARGET_OVERRIDE_OPTIONS;
17073 #endif
17074 
17075   /* If the user did not specify a processor, choose the default
17076      one for them.  This will be the CPU set during configuration using
17077      --with-cpu, otherwise it is "generic".  */
17078   if (!selected_cpu)
17079     {
17080       if (selected_arch)
17081 	{
17082 	  selected_cpu = &all_cores[selected_arch->ident];
17083 	  aarch64_isa_flags = arch_isa;
17084 	  explicit_arch = selected_arch->arch;
17085 	}
17086       else
17087 	{
17088 	  /* Get default configure-time CPU.  */
17089 	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
17090 	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
17091 	}
17092 
17093       if (selected_tune)
17094 	explicit_tune_core = selected_tune->ident;
17095     }
17096   /* If both -mcpu and -march are specified, check that they are architecturally
17097      compatible, warn if they're not and prefer the -march ISA flags.  */
17098   else if (selected_arch)
17099     {
17100       if (selected_arch->arch != selected_cpu->arch)
17101 	{
17102 	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
17103 		       aarch64_cpu_string,
17104 		       aarch64_arch_string);
17105 	}
17106       aarch64_isa_flags = arch_isa;
17107       explicit_arch = selected_arch->arch;
17108       explicit_tune_core = selected_tune ? selected_tune->ident
17109 					  : selected_cpu->ident;
17110     }
17111   else
17112     {
17113       /* -mcpu but no -march.  */
17114       aarch64_isa_flags = cpu_isa;
17115       explicit_tune_core = selected_tune ? selected_tune->ident
17116 					  : selected_cpu->ident;
17117       gcc_assert (selected_cpu);
17118       selected_arch = &all_architectures[selected_cpu->arch];
17119       explicit_arch = selected_arch->arch;
17120     }
17121 
17122   /* Set the arch as well, as we will need it when outputting
17123      the .arch directive in assembly.  */
17124   if (!selected_arch)
17125     {
17126       gcc_assert (selected_cpu);
17127       selected_arch = &all_architectures[selected_cpu->arch];
17128     }
17129 
17130   if (!selected_tune)
17131     selected_tune = selected_cpu;
17132 
17133   if (aarch64_enable_bti == 2)
17134     {
17135 #ifdef TARGET_ENABLE_BTI
17136       aarch64_enable_bti = 1;
17137 #else
17138       aarch64_enable_bti = 0;
17139 #endif
17140     }
17141 
17142   /* Return address signing is currently not supported for ILP32 targets.  For
17143      LP64 targets use the configured option in the absence of a command-line
17144      option for -mbranch-protection.  */
17145   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
17146     {
17147 #ifdef TARGET_ENABLE_PAC_RET
17148       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
17149 #else
17150       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
17151 #endif
17152     }
17153 
17154 #ifndef HAVE_AS_MABI_OPTION
17155   /* The compiler may have been configured with 2.23.* binutils, which does
17156      not have support for ILP32.  */
17157   if (TARGET_ILP32)
17158     error ("assembler does not support %<-mabi=ilp32%>");
17159 #endif
17160 
17161   /* Convert -msve-vector-bits to a VG count.  */
17162   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
17163 
17164   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
17165     sorry ("return address signing is only supported for %<-mabi=lp64%>");
17166 
17167   /* Make sure we properly set up the explicit options.  */
17168   if ((aarch64_cpu_string && valid_cpu)
17169        || (aarch64_tune_string && valid_tune))
17170     gcc_assert (explicit_tune_core != aarch64_none);
17171 
17172   if ((aarch64_cpu_string && valid_cpu)
17173        || (aarch64_arch_string && valid_arch))
17174     gcc_assert (explicit_arch != aarch64_no_arch);
17175 
17176   /* The pass to insert speculation tracking runs before
17177      shrink-wrapping and the latter does not know how to update the
17178      tracking status.  So disable it in this case.  */
17179   if (aarch64_track_speculation)
17180     flag_shrink_wrap = 0;
17181 
17182   aarch64_override_options_internal (&global_options);
17183 
17184   /* Save these options as the default ones in case we push and pop them later
17185      while processing functions with potential target attributes.  */
17186   target_option_default_node = target_option_current_node
17187     = build_target_option_node (&global_options, &global_options_set);
17188 }
17189 
17190 /* Implement targetm.override_options_after_change.  */
17191 
17192 static void
17193 aarch64_override_options_after_change (void)
17194 {
17195   aarch64_override_options_after_change_1 (&global_options);
17196 }
17197 
17198 /* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
17199 static char *
17200 aarch64_offload_options (void)
17201 {
17202   if (TARGET_ILP32)
17203     return xstrdup ("-foffload-abi=ilp32");
17204   else
17205     return xstrdup ("-foffload-abi=lp64");
17206 }
17207 
17208 static struct machine_function *
17209 aarch64_init_machine_status (void)
17210 {
17211   struct machine_function *machine;
17212   machine = ggc_cleared_alloc<machine_function> ();
17213   return machine;
17214 }
17215 
17216 void
17217 aarch64_init_expanders (void)
17218 {
17219   init_machine_status = aarch64_init_machine_status;
17220 }
17221 
17222 /* Set up aarch64_cmodel from the requested code model, adjusting for PIC and diagnosing unsupported combinations.  */
17223 static void
17224 initialize_aarch64_code_model (struct gcc_options *opts)
17225 {
17226   aarch64_cmodel = opts->x_aarch64_cmodel_var;
17227   switch (opts->x_aarch64_cmodel_var)
17228     {
17229     case AARCH64_CMODEL_TINY:
17230       if (opts->x_flag_pic)
17231 	aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
17232       break;
17233     case AARCH64_CMODEL_SMALL:
17234       if (opts->x_flag_pic)
17235 	{
17236 #ifdef HAVE_AS_SMALL_PIC_RELOCS
17237 	  aarch64_cmodel = (flag_pic == 2
17238 			    ? AARCH64_CMODEL_SMALL_PIC
17239 			    : AARCH64_CMODEL_SMALL_SPIC);
17240 #else
17241 	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
17242 #endif
17243 	}
17244       break;
17245     case AARCH64_CMODEL_LARGE:
17246       if (opts->x_flag_pic)
17247 	sorry ("code model %qs with %<-f%s%>", "large",
17248 	       opts->x_flag_pic > 1 ? "PIC" : "pic");
17249       if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
17250 	sorry ("code model %qs not supported in ilp32 mode", "large");
17251       break;
17252     case AARCH64_CMODEL_TINY_PIC:
17253     case AARCH64_CMODEL_SMALL_PIC:
17254     case AARCH64_CMODEL_SMALL_SPIC:
17255       gcc_unreachable ();
17256     }
17257 }
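
/* For example (editorial addition), assuming HAVE_AS_SMALL_PIC_RELOCS:
     -mcmodel=tiny            ->  AARCH64_CMODEL_TINY
     -mcmodel=tiny  -fpic     ->  AARCH64_CMODEL_TINY_PIC
     -mcmodel=small -fpic     ->  AARCH64_CMODEL_SMALL_SPIC
     -mcmodel=small -fPIC     ->  AARCH64_CMODEL_SMALL_PIC
     -mcmodel=large -fpic     ->  rejected with sorry ().  */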
17258 
17259 /* Implement TARGET_OPTION_SAVE.  */
17260 
17261 static void
17262 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts,
17263 		     struct gcc_options */* opts_set */)
17264 {
17265   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
17266   ptr->x_aarch64_branch_protection_string
17267     = opts->x_aarch64_branch_protection_string;
17268 }
17269 
17270 /* Implement TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
17271    using the information saved in PTR.  */
17272 
17273 static void
17274 aarch64_option_restore (struct gcc_options *opts,
17275 			struct gcc_options */* opts_set */,
17276 			struct cl_target_option *ptr)
17277 {
17278   opts->x_explicit_arch = ptr->x_explicit_arch;
17279   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
17280   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
17281   if (opts->x_explicit_tune_core == aarch64_none
17282       && opts->x_explicit_arch != aarch64_no_arch)
17283     selected_tune = &all_cores[selected_arch->ident];
17284   else
17285     selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
17286   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
17287   opts->x_aarch64_branch_protection_string
17288     = ptr->x_aarch64_branch_protection_string;
17289   if (opts->x_aarch64_branch_protection_string)
17290     {
17291       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
17292 					NULL);
17293     }
17294 
17295   aarch64_override_options_internal (opts);
17296 }
17297 
17298 /* Implement TARGET_OPTION_PRINT.  */
17299 
17300 static void
17301 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
17302 {
17303   const struct processor *cpu
17304     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
17305   uint64_t isa_flags = ptr->x_aarch64_isa_flags;
17306   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
17307   std::string extension
17308     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
17309 
17310   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
17311   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
17312 	   arch->name, extension.c_str ());
17313 }
17314 
17315 static GTY(()) tree aarch64_previous_fndecl;
17316 
17317 void
17318 aarch64_reset_previous_fndecl (void)
17319 {
17320   aarch64_previous_fndecl = NULL;
17321 }
17322 
17323 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
17324    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
17325    make sure optab availability predicates are recomputed when necessary.  */
17326 
17327 void
17328 aarch64_save_restore_target_globals (tree new_tree)
17329 {
17330   if (TREE_TARGET_GLOBALS (new_tree))
17331     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
17332   else if (new_tree == target_option_default_node)
17333     restore_target_globals (&default_target_globals);
17334   else
17335     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
17336 }
17337 
17338 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
17339    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
17340    of the function, if such exists.  This function may be called multiple
17341    times on a single function so use aarch64_previous_fndecl to avoid
17342    setting up identical state.  */
17343 
17344 static void
17345 aarch64_set_current_function (tree fndecl)
17346 {
17347   if (!fndecl || fndecl == aarch64_previous_fndecl)
17348     return;
17349 
17350   tree old_tree = (aarch64_previous_fndecl
17351 		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
17352 		   : NULL_TREE);
17353 
17354   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
17355 
17356   /* If current function has no attributes but the previous one did,
17357      use the default node.  */
17358   if (!new_tree && old_tree)
17359     new_tree = target_option_default_node;
17360 
17361   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
17362      the default have been handled by aarch64_save_restore_target_globals from
17363      aarch64_pragma_target_parse.  */
17364   if (old_tree == new_tree)
17365     return;
17366 
17367   aarch64_previous_fndecl = fndecl;
17368 
17369   /* First set the target options.  */
17370   cl_target_option_restore (&global_options, &global_options_set,
17371 			    TREE_TARGET_OPTION (new_tree));
17372 
17373   aarch64_save_restore_target_globals (new_tree);
17374 }
17375 
17376 /* Enum describing the various ways we can handle attributes.
17377    In many cases we can reuse the generic option handling machinery.  */
17378 
17379 enum aarch64_attr_opt_type
17380 {
17381   aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
17382   aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
17383   aarch64_attr_enum,	/* Attribute sets an enum variable.  */
17384   aarch64_attr_custom	/* Attribute requires a custom handling function.  */
17385 };
17386 
17387 /* All the information needed to handle a target attribute.
17388    NAME is the name of the attribute.
17389    ATTR_TYPE specifies the type of behavior of the attribute as described
17390    in the definition of enum aarch64_attr_opt_type.
17391    ALLOW_NEG is true if the attribute supports a "no-" form.
17392    HANDLER is the function that takes the attribute string as an argument.
17393    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
17394    OPT_NUM is the enum specifying the option that the attribute modifies.
17395    This is needed for attributes that mirror the behavior of a command-line
17396    option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
17397    aarch64_attr_enum.  */
17398 
17399 struct aarch64_attribute_info
17400 {
17401   const char *name;
17402   enum aarch64_attr_opt_type attr_type;
17403   bool allow_neg;
17404   bool (*handler) (const char *);
17405   enum opt_code opt_num;
17406 };
17407 
17408 /* Handle the ARCH_STR argument to the arch= target attribute.  */
17409 
17410 static bool
17411 aarch64_handle_attr_arch (const char *str)
17412 {
17413   const struct processor *tmp_arch = NULL;
17414   std::string invalid_extension;
17415   enum aarch64_parse_opt_result parse_res
17416     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
17417 
17418   if (parse_res == AARCH64_PARSE_OK)
17419     {
17420       gcc_assert (tmp_arch);
17421       selected_arch = tmp_arch;
17422       explicit_arch = selected_arch->arch;
17423       return true;
17424     }
17425 
17426   switch (parse_res)
17427     {
17428       case AARCH64_PARSE_MISSING_ARG:
17429 	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
17430 	break;
17431       case AARCH64_PARSE_INVALID_ARG:
17432 	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
17433 	aarch64_print_hint_for_arch (str);
17434 	break;
17435       case AARCH64_PARSE_INVALID_FEATURE:
17436 	error ("invalid feature modifier %s of value (\"%s\") in "
17437 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
17438 	aarch64_print_hint_for_extensions (invalid_extension);
17439 	break;
17440       default:
17441 	gcc_unreachable ();
17442     }
17443 
17444   return false;
17445 }
17446 
17447 /* Handle the argument CPU_STR to the cpu= target attribute.  */
17448 
17449 static bool
17450 aarch64_handle_attr_cpu (const char *str)
17451 {
17452   const struct processor *tmp_cpu = NULL;
17453   std::string invalid_extension;
17454   enum aarch64_parse_opt_result parse_res
17455     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
17456 
17457   if (parse_res == AARCH64_PARSE_OK)
17458     {
17459       gcc_assert (tmp_cpu);
17460       selected_tune = tmp_cpu;
17461       explicit_tune_core = selected_tune->ident;
17462 
17463       selected_arch = &all_architectures[tmp_cpu->arch];
17464       explicit_arch = selected_arch->arch;
17465       return true;
17466     }
17467 
17468   switch (parse_res)
17469     {
17470       case AARCH64_PARSE_MISSING_ARG:
17471 	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
17472 	break;
17473       case AARCH64_PARSE_INVALID_ARG:
17474 	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
17475 	aarch64_print_hint_for_core (str);
17476 	break;
17477       case AARCH64_PARSE_INVALID_FEATURE:
17478 	error ("invalid feature modifier %s of value (\"%s\") in "
17479 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
17480 	aarch64_print_hint_for_extensions (invalid_extension);
17481 	break;
17482       default:
17483 	gcc_unreachable ();
17484     }
17485 
17486   return false;
17487 }
17488 
17489 /* Handle the argument STR to the branch-protection= attribute.  */
17490 
17491  static bool
17492  aarch64_handle_attr_branch_protection (const char* str)
17493  {
17494   char *err_str = (char *) xmalloc (strlen (str) + 1);
17495   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
17496 								      &err_str);
17497   bool success = false;
17498   switch (res)
17499     {
17500      case AARCH64_PARSE_MISSING_ARG:
17501        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
17502 	      " attribute");
17503        break;
17504      case AARCH64_PARSE_INVALID_ARG:
17505        error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
17506 	      "=\")%> pragma or attribute", err_str);
17507        break;
17508      case AARCH64_PARSE_OK:
17509        success = true;
17510       /* Fall through.  */
17511      case AARCH64_PARSE_INVALID_FEATURE:
17512        break;
17513      default:
17514        gcc_unreachable ();
17515     }
17516   free (err_str);
17517   return success;
17518  }
17519 
17520 /* Handle the argument STR to the tune= target attribute.  */
17521 
17522 static bool
17523 aarch64_handle_attr_tune (const char *str)
17524 {
17525   const struct processor *tmp_tune = NULL;
17526   enum aarch64_parse_opt_result parse_res
17527     = aarch64_parse_tune (str, &tmp_tune);
17528 
17529   if (parse_res == AARCH64_PARSE_OK)
17530     {
17531       gcc_assert (tmp_tune);
17532       selected_tune = tmp_tune;
17533       explicit_tune_core = selected_tune->ident;
17534       return true;
17535     }
17536 
17537   switch (parse_res)
17538     {
17539       case AARCH64_PARSE_INVALID_ARG:
17540 	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
17541 	aarch64_print_hint_for_core (str);
17542 	break;
17543       default:
17544 	gcc_unreachable ();
17545     }
17546 
17547   return false;
17548 }
17549 
17550 /* Parse an architecture extensions target attribute string specified in STR.
17551    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
17552    if successful.  Update aarch64_isa_flags to reflect the ISA features
17553    modified.  */
17554 
17555 static bool
17556 aarch64_handle_attr_isa_flags (char *str)
17557 {
17558   enum aarch64_parse_opt_result parse_res;
17559   uint64_t isa_flags = aarch64_isa_flags;
17560 
17561   /* We allow "+nothing" in the beginning to clear out all architectural
17562      features if the user wants to handpick specific features.  */
17563   if (strncmp ("+nothing", str, 8) == 0)
17564     {
17565       isa_flags = 0;
17566       str += 8;
17567     }
17568 
17569   std::string invalid_extension;
17570   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
17571 
17572   if (parse_res == AARCH64_PARSE_OK)
17573     {
17574       aarch64_isa_flags = isa_flags;
17575       return true;
17576     }
17577 
17578   switch (parse_res)
17579     {
17580       case AARCH64_PARSE_MISSING_ARG:
17581 	error ("missing value in %<target()%> pragma or attribute");
17582 	break;
17583 
17584       case AARCH64_PARSE_INVALID_FEATURE:
17585 	error ("invalid feature modifier %s of value (\"%s\") in "
17586 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
17587 	break;
17588 
17589       default:
17590 	gcc_unreachable ();
17591     }
17592 
17593  return false;
17594 }
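
/* Editorial sketch (not part of the original source, guarded out): the
   "+nothing" prefix first clears the accumulated flags, so the string below
   ends up enabling only FP and whatever FP itself implies, regardless of
   the flags that were in effect beforehand.  */
#if 0
static void
aarch64_isa_flags_example (void)
{
  char str[] = "+nothing+fp";
  bool ok = aarch64_handle_attr_isa_flags (str);
  gcc_checking_assert (ok);
}
#endif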
17595 
17596 /* The target attributes that we support.  On top of these we also support just
17597    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
17598    handled explicitly in aarch64_process_one_target_attr.  */
17599 
17600 static const struct aarch64_attribute_info aarch64_attributes[] =
17601 {
17602   { "general-regs-only", aarch64_attr_mask, false, NULL,
17603      OPT_mgeneral_regs_only },
17604   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
17605      OPT_mfix_cortex_a53_835769 },
17606   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
17607      OPT_mfix_cortex_a53_843419 },
17608   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
17609   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
17610   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
17611      OPT_momit_leaf_frame_pointer },
17612   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
17613   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
17614      OPT_march_ },
17615   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
17616   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
17617      OPT_mtune_ },
17618   { "branch-protection", aarch64_attr_custom, false,
17619      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
17620   { "sign-return-address", aarch64_attr_enum, false, NULL,
17621      OPT_msign_return_address_ },
17622   { "outline-atomics", aarch64_attr_bool, true, NULL,
17623      OPT_moutline_atomics},
17624   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
17625 };
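
/* Editorial illustration (not part of the original source, guarded out):
   each category in the table above corresponds to a usage such as the
   following; the specific function names are hypothetical.  */
#if 0
void f_custom (void) __attribute__ ((target ("arch=armv8.2-a+crypto")));
void f_mask (void) __attribute__ ((target ("no-strict-align")));
void f_enum (void) __attribute__ ((target ("cmodel=small")));
void f_isa (void) __attribute__ ((target ("+crc")));	/* Bare ISA extension.  */
#endif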
17626 
17627 /* Parse ARG_STR which contains the definition of one target attribute.
17628    Show appropriate errors if any or return true if the attribute is valid.  */
17629 
17630 static bool
17631 aarch64_process_one_target_attr (char *arg_str)
17632 {
17633   bool invert = false;
17634 
17635   size_t len = strlen (arg_str);
17636 
17637   if (len == 0)
17638     {
17639       error ("malformed %<target()%> pragma or attribute");
17640       return false;
17641     }
17642 
17643   char *str_to_check = (char *) alloca (len + 1);
17644   strcpy (str_to_check, arg_str);
17645 
17646   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
17647      It is easier to detect and handle it explicitly here rather than going
17648      through the machinery for the rest of the target attributes in this
17649      function.  */
17650   if (*str_to_check == '+')
17651     return aarch64_handle_attr_isa_flags (str_to_check);
17652 
17653   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
17654     {
17655       invert = true;
17656       str_to_check += 3;
17657     }
17658   char *arg = strchr (str_to_check, '=');
17659 
17660   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
17661      and point ARG to "foo".  */
17662   if (arg)
17663     {
17664       *arg = '\0';
17665       arg++;
17666     }
17667   const struct aarch64_attribute_info *p_attr;
17668   bool found = false;
17669   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
17670     {
17671       /* If the names don't match up, or the user has given an argument
17672 	 to an attribute that doesn't accept one, or didn't give an argument
17673 	 to an attribute that expects one, fail to match.  */
17674       if (strcmp (str_to_check, p_attr->name) != 0)
17675 	continue;
17676 
17677       found = true;
17678       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
17679 			      || p_attr->attr_type == aarch64_attr_enum;
17680 
17681       if (attr_need_arg_p ^ (arg != NULL))
17682 	{
17683 	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
17684 	  return false;
17685 	}
17686 
17687       /* If the name matches but the attribute does not allow "no-" versions
17688 	 then we can't match.  */
17689       if (invert && !p_attr->allow_neg)
17690 	{
17691 	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
17692 	  return false;
17693 	}
17694 
17695       switch (p_attr->attr_type)
17696 	{
17697 	/* Has a custom handler registered.
17698 	   For example, cpu=, arch=, tune=.  */
17699 	  case aarch64_attr_custom:
17700 	    gcc_assert (p_attr->handler);
17701 	    if (!p_attr->handler (arg))
17702 	      return false;
17703 	    break;
17704 
17705 	  /* Either set or unset a boolean option.  */
17706 	  case aarch64_attr_bool:
17707 	    {
17708 	      struct cl_decoded_option decoded;
17709 
17710 	      generate_option (p_attr->opt_num, NULL, !invert,
17711 			       CL_TARGET, &decoded);
17712 	      aarch64_handle_option (&global_options, &global_options_set,
17713 				      &decoded, input_location);
17714 	      break;
17715 	    }
17716 	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
17717 	     should know what mask to apply given the option number.  */
17718 	  case aarch64_attr_mask:
17719 	    {
17720 	      struct cl_decoded_option decoded;
17721 	      /* We only need to specify the option number.
17722 		 aarch64_handle_option will know which mask to apply.  */
17723 	      decoded.opt_index = p_attr->opt_num;
17724 	      decoded.value = !invert;
17725 	      aarch64_handle_option (&global_options, &global_options_set,
17726 				      &decoded, input_location);
17727 	      break;
17728 	    }
17729 	  /* Use the option setting machinery to set an option to an enum.  */
17730 	  case aarch64_attr_enum:
17731 	    {
17732 	      gcc_assert (arg);
17733 	      bool valid;
17734 	      int value;
17735 	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
17736 					      &value, CL_TARGET);
17737 	      if (valid)
17738 		{
17739 		  set_option (&global_options, NULL, p_attr->opt_num, value,
17740 			      NULL, DK_UNSPECIFIED, input_location,
17741 			      global_dc);
17742 		}
17743 	      else
17744 		{
17745 		  error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
17746 		}
17747 	      break;
17748 	    }
17749 	  default:
17750 	    gcc_unreachable ();
17751 	}
17752     }
17753 
17754   /* If we reached here we either have found an attribute and validated
17755      it or didn't match any.  If we matched an attribute but its arguments
17756      were malformed we will have returned false already.  */
17757   return found;
17758 }
17759 
17760 /* Count how many times the character C appears in
17761    NULL-terminated string STR.  */
17762 
17763 static unsigned int
17764 num_occurences_in_str (char c, char *str)
17765 {
17766   unsigned int res = 0;
17767   while (*str != '\0')
17768     {
17769       if (*str == c)
17770 	res++;
17771 
17772       str++;
17773     }
17774 
17775   return res;
17776 }
17777 
17778 /* Parse the tree in ARGS that contains the target attribute information
17779    and update the global target options space.  */
17780 
17781 bool
17782 aarch64_process_target_attr (tree args)
17783 {
17784   if (TREE_CODE (args) == TREE_LIST)
17785     {
17786       do
17787 	{
17788 	  tree head = TREE_VALUE (args);
17789 	  if (head)
17790 	    {
17791 	      if (!aarch64_process_target_attr (head))
17792 		return false;
17793 	    }
17794 	  args = TREE_CHAIN (args);
17795 	} while (args);
17796 
17797       return true;
17798     }
17799 
17800   if (TREE_CODE (args) != STRING_CST)
17801     {
17802       error ("attribute %<target%> argument not a string");
17803       return false;
17804     }
17805 
17806   size_t len = strlen (TREE_STRING_POINTER (args));
17807   char *str_to_check = (char *) alloca (len + 1);
17808   strcpy (str_to_check, TREE_STRING_POINTER (args));
17809 
17810   if (len == 0)
17811     {
17812       error ("malformed %<target()%> pragma or attribute");
17813       return false;
17814     }
17815 
17816   /* Used to catch empty strings between commas, i.e.
17817      attribute ((target ("attr1,,attr2"))).  */
17818   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
17819 
17820   /* Handle multiple target attributes separated by ','.  */
17821   char *token = strtok_r (str_to_check, ",", &str_to_check);
17822 
17823   unsigned int num_attrs = 0;
17824   while (token)
17825     {
17826       num_attrs++;
17827       if (!aarch64_process_one_target_attr (token))
17828 	{
17829 	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
17830 	  return false;
17831 	}
17832 
17833       token = strtok_r (NULL, ",", &str_to_check);
17834     }
17835 
17836   if (num_attrs != num_commas + 1)
17837     {
17838       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
17839       return false;
17840     }
17841 
17842   return true;
17843 }
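
/* Editorial illustration (not part of the original source, guarded out) of
   the splitting above: the first attribute is divided on ',' into two
   tokens, while the second is rejected because the empty token makes the
   token count disagree with the comma count.  */
#if 0
void ok_fn (void) __attribute__ ((target ("cpu=cortex-a57,no-strict-align")));
void bad_fn (void) __attribute__ ((target ("cpu=cortex-a57,,arch=armv8.2-a")));
#endif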
17844 
17845 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
17846    process attribute ((target ("..."))).  */
17847 
17848 static bool
17849 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
17850 {
17851   struct cl_target_option cur_target;
17852   bool ret;
17853   tree old_optimize;
17854   tree new_target, new_optimize;
17855   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
17856 
17857   /* If what we're processing is the current pragma string then the
17858      target option node is already stored in target_option_current_node
17859      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
17860      having to re-parse the string.  This is especially useful to keep
17861      arm_neon.h compile times down since that header contains a lot
17862      of intrinsics enclosed in pragmas.  */
17863   if (!existing_target && args == current_target_pragma)
17864     {
17865       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
17866       return true;
17867     }
17868   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
17869 
17870   old_optimize
17871     = build_optimization_node (&global_options, &global_options_set);
17872   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
17873 
17874   /* If the function changed the optimization levels as well as setting
17875      target options, start with the optimizations specified.  */
17876   if (func_optimize && func_optimize != old_optimize)
17877     cl_optimization_restore (&global_options, &global_options_set,
17878 			     TREE_OPTIMIZATION (func_optimize));
17879 
17880   /* Save the current target options to restore at the end.  */
17881   cl_target_option_save (&cur_target, &global_options, &global_options_set);
17882 
17883   /* If fndecl already has some target attributes applied to it, unpack
17884      them so that we add this attribute on top of them, rather than
17885      overwriting them.  */
17886   if (existing_target)
17887     {
17888       struct cl_target_option *existing_options
17889 	= TREE_TARGET_OPTION (existing_target);
17890 
17891       if (existing_options)
17892 	cl_target_option_restore (&global_options, &global_options_set,
17893 				  existing_options);
17894     }
17895   else
17896     cl_target_option_restore (&global_options, &global_options_set,
17897 			      TREE_TARGET_OPTION (target_option_current_node));
17898 
17899   ret = aarch64_process_target_attr (args);
17900 
17901   /* Set up any additional state.  */
17902   if (ret)
17903     {
17904       aarch64_override_options_internal (&global_options);
17905       /* Initialize SIMD builtins if we haven't already.
17906 	 Set current_target_pragma to NULL for the duration so that
17907 	 the builtin initialization code doesn't try to tag the functions
17908 	 being built with the attributes specified by any current pragma, thus
17909 	 going into an infinite recursion.  */
17910       if (TARGET_SIMD)
17911 	{
17912 	  tree saved_current_target_pragma = current_target_pragma;
17913 	  current_target_pragma = NULL;
17914 	  aarch64_init_simd_builtins ();
17915 	  current_target_pragma = saved_current_target_pragma;
17916 	}
17917       new_target = build_target_option_node (&global_options,
17918 					     &global_options_set);
17919     }
17920   else
17921     new_target = NULL;
17922 
17923   new_optimize = build_optimization_node (&global_options,
17924 					  &global_options_set);
17925 
17926   if (fndecl && ret)
17927     {
17928       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
17929 
17930       if (old_optimize != new_optimize)
17931 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
17932     }
17933 
17934   cl_target_option_restore (&global_options, &global_options_set, &cur_target);
17935 
17936   if (old_optimize != new_optimize)
17937     cl_optimization_restore (&global_options, &global_options_set,
17938 			     TREE_OPTIMIZATION (old_optimize));
17939   return ret;
17940 }
17941 
17942 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
17943    tri-bool options (yes, no, don't care) and the default value is
17944    DEF, determine whether to reject inlining.  */
17945 
17946 static bool
17947 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
17948 				     int dont_care, int def)
17949 {
17950   /* If the callee doesn't care, always allow inlining.  */
17951   if (callee == dont_care)
17952     return true;
17953 
17954   /* If the caller doesn't care, always allow inlining.  */
17955   if (caller == dont_care)
17956     return true;
17957 
17958   /* Otherwise, allow inlining if either the callee and caller values
17959      agree, or if the callee is using the default value.  */
17960   return (callee == caller || callee == def);
17961 }
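
/* Worked example (editorial addition) with DONT_CARE == 2 and DEF == 1, as
   used for -momit-leaf-frame-pointer below:
     caller 2, callee 0  ->  inline  (caller doesn't care)
     caller 1, callee 1  ->  inline  (values agree)
     caller 0, callee 1  ->  inline  (callee uses the default)
     caller 1, callee 0  ->  reject  (explicit mismatch away from the default).  */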
17962 
17963 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
17964    to inline CALLEE into CALLER based on target-specific info.
17965    Make sure that the caller and callee have compatible architectural
17966    features.  Then go through the other possible target attributes
17967    and see if they can block inlining.  Try not to reject always_inline
17968    callees unless they are incompatible architecturally.  */
17969 
17970 static bool
17971 aarch64_can_inline_p (tree caller, tree callee)
17972 {
17973   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
17974   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
17975 
17976   struct cl_target_option *caller_opts
17977 	= TREE_TARGET_OPTION (caller_tree ? caller_tree
17978 					   : target_option_default_node);
17979 
17980   struct cl_target_option *callee_opts
17981 	= TREE_TARGET_OPTION (callee_tree ? callee_tree
17982 					   : target_option_default_node);
17983 
17984   /* Callee's ISA flags should be a subset of the caller's.  */
17985   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
17986        != callee_opts->x_aarch64_isa_flags)
17987     return false;
17988 
17989   /* Allow non-strict aligned functions inlining into strict
17990      aligned ones.  */
17991   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
17992        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
17993       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
17994 	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
17995     return false;
17996 
17997   bool always_inline = lookup_attribute ("always_inline",
17998 					  DECL_ATTRIBUTES (callee));
17999 
18000   /* If the architectural features match up and the callee is always_inline
18001      then the other attributes don't matter.  */
18002   if (always_inline)
18003     return true;
18004 
18005   if (caller_opts->x_aarch64_cmodel_var
18006       != callee_opts->x_aarch64_cmodel_var)
18007     return false;
18008 
18009   if (caller_opts->x_aarch64_tls_dialect
18010       != callee_opts->x_aarch64_tls_dialect)
18011     return false;
18012 
18013   /* Honour explicit requests to workaround errata.  */
18014   if (!aarch64_tribools_ok_for_inlining_p (
18015 	  caller_opts->x_aarch64_fix_a53_err835769,
18016 	  callee_opts->x_aarch64_fix_a53_err835769,
18017 	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
18018     return false;
18019 
18020   if (!aarch64_tribools_ok_for_inlining_p (
18021 	  caller_opts->x_aarch64_fix_a53_err843419,
18022 	  callee_opts->x_aarch64_fix_a53_err843419,
18023 	  2, TARGET_FIX_ERR_A53_843419))
18024     return false;
18025 
18026   /* If the user explicitly specified -momit-leaf-frame-pointer for the
18027      caller and callee and they don't match up, reject inlining.  */
18028   if (!aarch64_tribools_ok_for_inlining_p (
18029 	  caller_opts->x_flag_omit_leaf_frame_pointer,
18030 	  callee_opts->x_flag_omit_leaf_frame_pointer,
18031 	  2, 1))
18032     return false;
18033 
18034   /* If the callee has specific tuning overrides, respect them.  */
18035   if (callee_opts->x_aarch64_override_tune_string != NULL
18036       && caller_opts->x_aarch64_override_tune_string == NULL)
18037     return false;
18038 
18039   /* If the user specified tuning override strings for the
18040      caller and callee and they don't match up, reject inlining.
18041      We just do a string compare here, we don't analyze the meaning
18042      of the string, as it would be too costly for little gain.  */
18043   if (callee_opts->x_aarch64_override_tune_string
18044       && caller_opts->x_aarch64_override_tune_string
18045       && (strcmp (callee_opts->x_aarch64_override_tune_string,
18046 		  caller_opts->x_aarch64_override_tune_string) != 0))
18047     return false;
18048 
18049   return true;
18050 }
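
/* Editorial illustration (not part of the original source, guarded out) of
   the ISA-subset rule above: callee_base can be inlined into caller_sve
   because its ISA flags are a subset of the caller's, but not the other way
   round unless the whole translation unit already enables SVE.  */
#if 0
static inline int callee_base (int x) { return x + 1; }
static int __attribute__ ((target ("+sve")))
caller_sve (int x) { return callee_base (x); }
#endif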
18051 
18052 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
18053    been already.  */
18054 
18055 unsigned int
18056 aarch64_tlsdesc_abi_id ()
18057 {
18058   predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
18059   if (!tlsdesc_abi.initialized_p ())
18060     {
18061       HARD_REG_SET full_reg_clobbers;
18062       CLEAR_HARD_REG_SET (full_reg_clobbers);
18063       SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
18064       SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
18065       for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
18066 	SET_HARD_REG_BIT (full_reg_clobbers, regno);
18067       tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
18068     }
18069   return tlsdesc_abi.id ();
18070 }
18071 
18072 /* Return true if SYMBOL_REF X binds locally.  */
18073 
18074 static bool
18075 aarch64_symbol_binds_local_p (const_rtx x)
18076 {
18077   return (SYMBOL_REF_DECL (x)
18078 	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
18079 	  : SYMBOL_REF_LOCAL_P (x));
18080 }
18081 
18082 /* Return true if SYMBOL_REF X is thread-local.  */
18083 static bool
18084 aarch64_tls_symbol_p (rtx x)
18085 {
18086   if (! TARGET_HAVE_TLS)
18087     return false;
18088 
18089   x = strip_salt (x);
18090   if (!SYMBOL_REF_P (x))
18091     return false;
18092 
18093   return SYMBOL_REF_TLS_MODEL (x) != 0;
18094 }
18095 
18096 /* Classify a TLS symbol into one of the TLS kinds.  */
18097 enum aarch64_symbol_type
18098 aarch64_classify_tls_symbol (rtx x)
18099 {
18100   enum tls_model tls_kind = tls_symbolic_operand_type (x);
18101 
18102   switch (tls_kind)
18103     {
18104     case TLS_MODEL_GLOBAL_DYNAMIC:
18105     case TLS_MODEL_LOCAL_DYNAMIC:
18106       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
18107 
18108     case TLS_MODEL_INITIAL_EXEC:
18109       switch (aarch64_cmodel)
18110 	{
18111 	case AARCH64_CMODEL_TINY:
18112 	case AARCH64_CMODEL_TINY_PIC:
18113 	  return SYMBOL_TINY_TLSIE;
18114 	default:
18115 	  return SYMBOL_SMALL_TLSIE;
18116 	}
18117 
18118     case TLS_MODEL_LOCAL_EXEC:
18119       if (aarch64_tls_size == 12)
18120 	return SYMBOL_TLSLE12;
18121       else if (aarch64_tls_size == 24)
18122 	return SYMBOL_TLSLE24;
18123       else if (aarch64_tls_size == 32)
18124 	return SYMBOL_TLSLE32;
18125       else if (aarch64_tls_size == 48)
18126 	return SYMBOL_TLSLE48;
18127       else
18128 	gcc_unreachable ();
18129 
18130     case TLS_MODEL_EMULATED:
18131     case TLS_MODEL_NONE:
18132       return SYMBOL_FORCE_TO_MEM;
18133 
18134     default:
18135       gcc_unreachable ();
18136     }
18137 }
18138 
18139 /* Return the correct method for accessing X + OFFSET, where X is either
18140    a SYMBOL_REF or LABEL_REF.  */
18141 
18142 enum aarch64_symbol_type
18143 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
18144 {
18145   x = strip_salt (x);
18146 
18147   if (LABEL_REF_P (x))
18148     {
18149       switch (aarch64_cmodel)
18150 	{
18151 	case AARCH64_CMODEL_LARGE:
18152 	  return SYMBOL_FORCE_TO_MEM;
18153 
18154 	case AARCH64_CMODEL_TINY_PIC:
18155 	case AARCH64_CMODEL_TINY:
18156 	  return SYMBOL_TINY_ABSOLUTE;
18157 
18158 	case AARCH64_CMODEL_SMALL_SPIC:
18159 	case AARCH64_CMODEL_SMALL_PIC:
18160 	case AARCH64_CMODEL_SMALL:
18161 	  return SYMBOL_SMALL_ABSOLUTE;
18162 
18163 	default:
18164 	  gcc_unreachable ();
18165 	}
18166     }
18167 
18168   if (SYMBOL_REF_P (x))
18169     {
18170       if (aarch64_tls_symbol_p (x))
18171 	return aarch64_classify_tls_symbol (x);
18172 
18173       switch (aarch64_cmodel)
18174 	{
18175 	case AARCH64_CMODEL_TINY:
18176 	  /* When we retrieve symbol + offset address, we have to make sure
18177 	     the offset does not cause overflow of the final address.  But
18178 	     we have no way of knowing the address of symbol at compile time
18179 	     so we can't accurately say if the distance between the PC and
18180 	     symbol + offset is outside the addressable range of +/-1MB in the
18181 	     TINY code model.  So we limit the maximum offset to +/-64KB and
18182 	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
18183 	     If offset_within_block_p is true we allow larger offsets.
18184 	     Furthermore force to memory if the symbol is a weak reference to
18185 	     something that doesn't resolve to a symbol in this module.  */
18186 
18187 	  if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
18188 	    return SYMBOL_FORCE_TO_MEM;
18189 	  if (!(IN_RANGE (offset, -0x10000, 0x10000)
18190 		|| offset_within_block_p (x, offset)))
18191 	    return SYMBOL_FORCE_TO_MEM;
18192 
18193 	  return SYMBOL_TINY_ABSOLUTE;
18194 
18195 	case AARCH64_CMODEL_SMALL:
18196 	  /* Same reasoning as the tiny code model, but the offset cap here is
18197 	     1MB, allowing +/-3.9GB for the offset to the symbol.  */
18198 
18199 	  if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
18200 	    return SYMBOL_FORCE_TO_MEM;
18201 	  if (!(IN_RANGE (offset, -0x100000, 0x100000)
18202 		|| offset_within_block_p (x, offset)))
18203 	    return SYMBOL_FORCE_TO_MEM;
18204 
18205 	  return SYMBOL_SMALL_ABSOLUTE;
18206 
18207 	case AARCH64_CMODEL_TINY_PIC:
18208 	  if (!aarch64_symbol_binds_local_p (x))
18209 	    return SYMBOL_TINY_GOT;
18210 	  return SYMBOL_TINY_ABSOLUTE;
18211 
18212 	case AARCH64_CMODEL_SMALL_SPIC:
18213 	case AARCH64_CMODEL_SMALL_PIC:
18214 	  if (!aarch64_symbol_binds_local_p (x))
18215 	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
18216 		    ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
18217 	  return SYMBOL_SMALL_ABSOLUTE;
18218 
18219 	case AARCH64_CMODEL_LARGE:
18220 	  /* This is alright even in PIC code as the constant
18221 	     pool reference is always PC relative and within
18222 	     the same translation unit.  */
18223 	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
18224 	    return SYMBOL_SMALL_ABSOLUTE;
18225 	  else
18226 	    return SYMBOL_FORCE_TO_MEM;
18227 
18228 	default:
18229 	  gcc_unreachable ();
18230 	}
18231     }
18232 
18233   /* By default push everything into the constant pool.  */
18234   return SYMBOL_FORCE_TO_MEM;
18235 }
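
/* Worked example for the ranges above (illustrative only): under
   -mcmodel=tiny, a reference to a non-weak, locally-binding symbol SYM
   classifies as

     SYM + 0x8000   -> SYMBOL_TINY_ABSOLUTE  (offset within +/-64KB)
     SYM + 0x20000  -> SYMBOL_FORCE_TO_MEM   (unless the offset stays
                                              within SYM's own block)

   and the small code model behaves the same way with a +/-1MB cap.  */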
18236 
18237 bool
18238 aarch64_constant_address_p (rtx x)
18239 {
18240   return (CONSTANT_P (x) && memory_address_p (DImode, x));
18241 }
18242 
18243 bool
18244 aarch64_legitimate_pic_operand_p (rtx x)
18245 {
18246   poly_int64 offset;
18247   x = strip_offset_and_salt (x, &offset);
18248   if (SYMBOL_REF_P (x))
18249     return false;
18250 
18251   return true;
18252 }
18253 
18254 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
18255    that should be rematerialized rather than spilled.  */
18256 
18257 static bool
18258 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
18259 {
18260   /* Support CSE and rematerialization of common constants.  */
18261   if (CONST_INT_P (x)
18262       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT))
18263     return true;
18264 
18265   /* Only accept variable-length vector constants if they can be
18266      handled directly.
18267 
18268      ??? It would be possible (but complex) to handle rematerialization
18269      of other constants via secondary reloads.  */
18270   if (!GET_MODE_SIZE (mode).is_constant ())
18271     return aarch64_simd_valid_immediate (x, NULL);
18272 
18273   /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
18274      least be forced to memory and loaded from there.  */
18275   if (GET_CODE (x) == CONST_VECTOR)
18276     return !targetm.cannot_force_const_mem (mode, x);
18277 
18278   /* Do not allow vector struct mode constants for Advanced SIMD.
18279      We could support 0 and -1 easily, but they need support in
18280      aarch64-simd.md.  */
18281   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18282   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
18283     return false;
18284 
18285   if (GET_CODE (x) == HIGH)
18286     x = XEXP (x, 0);
18287 
18288   /* Accept polynomial constants that can be calculated by using the
18289      destination of a move as the sole temporary.  Constants that
18290      require a second temporary cannot be rematerialized (they can't be
18291      forced to memory and also aren't legitimate constants).  */
18292   poly_int64 offset;
18293   if (poly_int_rtx_p (x, &offset))
18294     return aarch64_offset_temporaries (false, offset) <= 1;
18295 
18296   /* If an offset is being added to something else, we need to allow the
18297      base to be moved into the destination register, meaning that there
18298      are no free temporaries for the offset.  */
18299   x = strip_offset_and_salt (x, &offset);
18300   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
18301     return false;
18302 
18303   /* Do not allow const (plus (anchor_symbol, const_int)).  */
18304   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
18305     return false;
18306 
18307   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
18308      so spilling them is better than rematerialization.  */
18309   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
18310     return true;
18311 
18312   /* Label references are always constant.  */
18313   if (LABEL_REF_P (x))
18314     return true;
18315 
18316   return false;
18317 }
18318 
18319 rtx
18320 aarch64_load_tp (rtx target)
18321 {
18322   if (!target
18323       || GET_MODE (target) != Pmode
18324       || !register_operand (target, Pmode))
18325     target = gen_reg_rtx (Pmode);
18326 
18327   /* Can return in any reg.  */
18328   emit_insn (gen_aarch64_load_tp_hard (target));
18329   return target;
18330 }
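
/* For reference (illustrative; the exact register depends on the
   load_tp pattern): gen_aarch64_load_tp_hard typically expands to a
   single system-register read of the thread pointer, e.g.

     mrs  x0, tpidr_el0

   which is why any general register is acceptable as the destination.  */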
18331 
18332 /* On AAPCS systems, this is the "struct __va_list".  */
18333 static GTY(()) tree va_list_type;
18334 
18335 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
18336    Return the type to use as __builtin_va_list.
18337 
18338    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
18339 
18340    struct __va_list
18341    {
18342      void *__stack;
18343      void *__gr_top;
18344      void *__vr_top;
18345      int   __gr_offs;
18346      int   __vr_offs;
18347    };  */
18348 
18349 static tree
18350 aarch64_build_builtin_va_list (void)
18351 {
18352   tree va_list_name;
18353   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18354 
18355   /* Create the type.  */
18356   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
18357   /* Give it the required name.  */
18358   va_list_name = build_decl (BUILTINS_LOCATION,
18359 			     TYPE_DECL,
18360 			     get_identifier ("__va_list"),
18361 			     va_list_type);
18362   DECL_ARTIFICIAL (va_list_name) = 1;
18363   TYPE_NAME (va_list_type) = va_list_name;
18364   TYPE_STUB_DECL (va_list_type) = va_list_name;
18365 
18366   /* Create the fields.  */
18367   f_stack = build_decl (BUILTINS_LOCATION,
18368 			FIELD_DECL, get_identifier ("__stack"),
18369 			ptr_type_node);
18370   f_grtop = build_decl (BUILTINS_LOCATION,
18371 			FIELD_DECL, get_identifier ("__gr_top"),
18372 			ptr_type_node);
18373   f_vrtop = build_decl (BUILTINS_LOCATION,
18374 			FIELD_DECL, get_identifier ("__vr_top"),
18375 			ptr_type_node);
18376   f_groff = build_decl (BUILTINS_LOCATION,
18377 			FIELD_DECL, get_identifier ("__gr_offs"),
18378 			integer_type_node);
18379   f_vroff = build_decl (BUILTINS_LOCATION,
18380 			FIELD_DECL, get_identifier ("__vr_offs"),
18381 			integer_type_node);
18382 
18383   /* Tell the tree-stdarg pass about our internal offset fields.
18384      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
18385      purposes, to identify whether the code is updating the va_list internal
18386      offset fields in an irregular way.  */
18387   va_list_gpr_counter_field = f_groff;
18388   va_list_fpr_counter_field = f_vroff;
18389 
18390   DECL_ARTIFICIAL (f_stack) = 1;
18391   DECL_ARTIFICIAL (f_grtop) = 1;
18392   DECL_ARTIFICIAL (f_vrtop) = 1;
18393   DECL_ARTIFICIAL (f_groff) = 1;
18394   DECL_ARTIFICIAL (f_vroff) = 1;
18395 
18396   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
18397   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
18398   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
18399   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
18400   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
18401 
18402   TYPE_FIELDS (va_list_type) = f_stack;
18403   DECL_CHAIN (f_stack) = f_grtop;
18404   DECL_CHAIN (f_grtop) = f_vrtop;
18405   DECL_CHAIN (f_vrtop) = f_groff;
18406   DECL_CHAIN (f_groff) = f_vroff;
18407 
18408   /* Compute its layout.  */
18409   layout_type (va_list_type);
18410 
18411   return va_list_type;
18412 }
18413 
18414 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
18415 static void
18416 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
18417 {
18418   const CUMULATIVE_ARGS *cum;
18419   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18420   tree stack, grtop, vrtop, groff, vroff;
18421   tree t;
18422   int gr_save_area_size = cfun->va_list_gpr_size;
18423   int vr_save_area_size = cfun->va_list_fpr_size;
18424   int vr_offset;
18425 
18426   cum = &crtl->args.info;
18427   if (cfun->va_list_gpr_size)
18428     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
18429 			     cfun->va_list_gpr_size);
18430   if (cfun->va_list_fpr_size)
18431     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
18432 			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
18433 
18434   if (!TARGET_FLOAT)
18435     {
18436       gcc_assert (cum->aapcs_nvrn == 0);
18437       vr_save_area_size = 0;
18438     }
18439 
18440   f_stack = TYPE_FIELDS (va_list_type_node);
18441   f_grtop = DECL_CHAIN (f_stack);
18442   f_vrtop = DECL_CHAIN (f_grtop);
18443   f_groff = DECL_CHAIN (f_vrtop);
18444   f_vroff = DECL_CHAIN (f_groff);
18445 
18446   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
18447 		  NULL_TREE);
18448   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
18449 		  NULL_TREE);
18450   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
18451 		  NULL_TREE);
18452   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
18453 		  NULL_TREE);
18454   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
18455 		  NULL_TREE);
18456 
18457   /* Emit code to initialize STACK, which points to the next varargs stack
18458      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
18459      by named arguments.  STACK is 8-byte aligned.  */
18460   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
18461   if (cum->aapcs_stack_size > 0)
18462     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
18463   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
18464   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18465 
18466   /* Emit code to initialize GRTOP, the top of the GR save area.
18467      virtual_incoming_args_rtx should have been 16 byte aligned.  */
18468   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
18469   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
18470   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18471 
18472   /* Emit code to initialize VRTOP, the top of the VR save area.
18473      This address is gr_save_area_bytes below GRTOP, rounded
18474      down to the next 16-byte boundary.  */
18475   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
18476   vr_offset = ROUND_UP (gr_save_area_size,
18477 			STACK_BOUNDARY / BITS_PER_UNIT);
18478 
18479   if (vr_offset)
18480     t = fold_build_pointer_plus_hwi (t, -vr_offset);
18481   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
18482   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18483 
18484   /* Emit code to initialize GROFF, the offset from GRTOP of the
18485      next GPR argument.  */
18486   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
18487 	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
18488   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18489 
18490   /* Likewise emit code to initialize VROFF, the offset from VRTOP
18491      of the next VR argument.  */
18492   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
18493 	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
18494   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18495 }
18496 
18497 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
18498 
18499 static tree
18500 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
18501 			      gimple_seq *post_p ATTRIBUTE_UNUSED)
18502 {
18503   tree addr;
18504   bool indirect_p;
18505   bool is_ha;		/* is HFA or HVA.  */
18506   bool dw_align;	/* double-word align.  */
18507   machine_mode ag_mode = VOIDmode;
18508   int nregs;
18509   machine_mode mode;
18510 
18511   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18512   tree stack, f_top, f_off, off, arg, roundup, on_stack;
18513   HOST_WIDE_INT size, rsize, adjust, align;
18514   tree t, u, cond1, cond2;
18515 
18516   indirect_p = pass_va_arg_by_reference (type);
18517   if (indirect_p)
18518     type = build_pointer_type (type);
18519 
18520   mode = TYPE_MODE (type);
18521 
18522   f_stack = TYPE_FIELDS (va_list_type_node);
18523   f_grtop = DECL_CHAIN (f_stack);
18524   f_vrtop = DECL_CHAIN (f_grtop);
18525   f_groff = DECL_CHAIN (f_vrtop);
18526   f_vroff = DECL_CHAIN (f_groff);
18527 
18528   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
18529 		  f_stack, NULL_TREE);
18530   size = int_size_in_bytes (type);
18531 
18532   unsigned int abi_break;
18533   align
18534     = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
18535 
18536   dw_align = false;
18537   adjust = 0;
18538   if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
18539 					       &is_ha, false))
18540     {
18541       /* No frontends can create types with variable-sized modes, so we
18542 	 shouldn't be asked to pass or return them.  */
18543       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
18544 
18545       /* TYPE passed in fp/simd registers.  */
18546       if (!TARGET_FLOAT)
18547 	aarch64_err_no_fpadvsimd (mode);
18548 
18549       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
18550 		      unshare_expr (valist), f_vrtop, NULL_TREE);
18551       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
18552 		      unshare_expr (valist), f_vroff, NULL_TREE);
18553 
18554       rsize = nregs * UNITS_PER_VREG;
18555 
18556       if (is_ha)
18557 	{
18558 	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
18559 	    adjust = UNITS_PER_VREG - ag_size;
18560 	}
18561       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
18562 	       && size < UNITS_PER_VREG)
18563 	{
18564 	  adjust = UNITS_PER_VREG - size;
18565 	}
18566     }
18567   else
18568     {
18569       /* TYPE passed in general registers.  */
18570       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
18571 		      unshare_expr (valist), f_grtop, NULL_TREE);
18572       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
18573 		      unshare_expr (valist), f_groff, NULL_TREE);
18574       rsize = ROUND_UP (size, UNITS_PER_WORD);
18575       nregs = rsize / UNITS_PER_WORD;
18576 
18577       if (align > 8)
18578 	{
18579 	  if (abi_break && warn_psabi)
18580 	    inform (input_location, "parameter passing for argument of type "
18581 		    "%qT changed in GCC 9.1", type);
18582 	  dw_align = true;
18583 	}
18584 
18585       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
18586 	  && size < UNITS_PER_WORD)
18587 	{
18588 	  adjust = UNITS_PER_WORD - size;
18589 	}
18590     }
18591 
18592   /* Get a local temporary for the field value.  */
18593   off = get_initialized_tmp_var (f_off, pre_p, NULL);
18594 
18595   /* Emit code to branch if off >= 0.  */
18596   t = build2 (GE_EXPR, boolean_type_node, off,
18597 	      build_int_cst (TREE_TYPE (off), 0));
18598   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
18599 
18600   if (dw_align)
18601     {
18602       /* Emit: offs = (offs + 15) & -16.  */
18603       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
18604 		  build_int_cst (TREE_TYPE (off), 15));
18605       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
18606 		  build_int_cst (TREE_TYPE (off), -16));
18607       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
18608     }
18609   else
18610     roundup = NULL;
18611 
18612   /* Update ap.__[g|v]r_offs  */
18613   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
18614 	      build_int_cst (TREE_TYPE (off), rsize));
18615   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
18616 
18617   /* String up.  */
18618   if (roundup)
18619     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
18620 
18621   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
18622   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
18623 	      build_int_cst (TREE_TYPE (f_off), 0));
18624   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
18625 
18626   /* String up: make sure the assignment happens before the use.  */
18627   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
18628   COND_EXPR_ELSE (cond1) = t;
18629 
18630   /* Prepare the trees handling the argument that is passed on the stack;
18631      the top-level node will be stored in ON_STACK.  */
18632   arg = get_initialized_tmp_var (stack, pre_p, NULL);
18633   if (align > 8)
18634     {
18635       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
18636       t = fold_build_pointer_plus_hwi (arg, 15);
18637       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
18638 		  build_int_cst (TREE_TYPE (t), -16));
18639       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
18640     }
18641   else
18642     roundup = NULL;
18643   /* Advance ap.__stack  */
18644   t = fold_build_pointer_plus_hwi (arg, size + 7);
18645   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
18646 	      build_int_cst (TREE_TYPE (t), -8));
18647   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
18648   /* String up roundup and advance.  */
18649   if (roundup)
18650     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
18651   /* String up with arg */
18652   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
18653   /* Big-endianness related address adjustment.  */
18654   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
18655       && size < UNITS_PER_WORD)
18656   {
18657     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
18658 		size_int (UNITS_PER_WORD - size));
18659     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
18660   }
18661 
18662   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
18663   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
18664 
18665   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
18666   t = off;
18667   if (adjust)
18668     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
18669 		build_int_cst (TREE_TYPE (off), adjust));
18670 
18671   t = fold_convert (sizetype, t);
18672   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
18673 
18674   if (is_ha)
18675     {
18676       /* type ha; // treat as "struct {ftype field[n];}"
18677          ... [computing offs]
18678          for (i = 0; i < nregs; ++i, offs += 16)
18679 	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
18680 	 return ha;  */
18681       int i;
18682       tree tmp_ha, field_t, field_ptr_t;
18683 
18684       /* Declare a local variable.  */
18685       tmp_ha = create_tmp_var_raw (type, "ha");
18686       gimple_add_tmp_var (tmp_ha);
18687 
18688       /* Establish the base type.  */
18689       switch (ag_mode)
18690 	{
18691 	case E_SFmode:
18692 	  field_t = float_type_node;
18693 	  field_ptr_t = float_ptr_type_node;
18694 	  break;
18695 	case E_DFmode:
18696 	  field_t = double_type_node;
18697 	  field_ptr_t = double_ptr_type_node;
18698 	  break;
18699 	case E_TFmode:
18700 	  field_t = long_double_type_node;
18701 	  field_ptr_t = long_double_ptr_type_node;
18702 	  break;
18703 	case E_HFmode:
18704 	  field_t = aarch64_fp16_type_node;
18705 	  field_ptr_t = aarch64_fp16_ptr_type_node;
18706 	  break;
18707 	case E_BFmode:
18708 	  field_t = aarch64_bf16_type_node;
18709 	  field_ptr_t = aarch64_bf16_ptr_type_node;
18710 	  break;
18711 	case E_V2SImode:
18712 	case E_V4SImode:
18713 	    {
18714 	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
18715 	      field_t = build_vector_type_for_mode (innertype, ag_mode);
18716 	      field_ptr_t = build_pointer_type (field_t);
18717 	    }
18718 	  break;
18719 	default:
18720 	  gcc_assert (0);
18721 	}
18722 
18723       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
18724       TREE_ADDRESSABLE (tmp_ha) = 1;
18725       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
18726       addr = t;
18727       t = fold_convert (field_ptr_t, addr);
18728       t = build2 (MODIFY_EXPR, field_t,
18729 		  build1 (INDIRECT_REF, field_t, tmp_ha),
18730 		  build1 (INDIRECT_REF, field_t, t));
18731 
18732       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
18733       for (i = 1; i < nregs; ++i)
18734 	{
18735 	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
18736 	  u = fold_convert (field_ptr_t, addr);
18737 	  u = build2 (MODIFY_EXPR, field_t,
18738 		      build2 (MEM_REF, field_t, tmp_ha,
18739 			      build_int_cst (field_ptr_t,
18740 					     (i *
18741 					      int_size_in_bytes (field_t)))),
18742 		      build1 (INDIRECT_REF, field_t, u));
18743 	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
18744 	}
18745 
18746       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
18747       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
18748     }
18749 
18750   COND_EXPR_ELSE (cond2) = t;
18751   addr = fold_convert (build_pointer_type (type), cond1);
18752   addr = build_va_arg_indirect_ref (addr);
18753 
18754   if (indirect_p)
18755     addr = build_va_arg_indirect_ref (addr);
18756 
18757   return addr;
18758 }
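
/* Rough pseudo-C for the tree built above (illustrative only; shown for
   the general-register case, with the alignment round-up and big-endian
   adjustments omitted):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (addr + size + 7) & -8;
   done:
     result = *(TYPE *) addr;  */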
18759 
18760 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
18761 
18762 static void
18763 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
18764 				const function_arg_info &arg,
18765 				int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
18766 {
18767   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
18768   CUMULATIVE_ARGS local_cum;
18769   int gr_saved = cfun->va_list_gpr_size;
18770   int vr_saved = cfun->va_list_fpr_size;
18771 
18772   /* The caller has advanced CUM up to, but not beyond, the last named
18773      argument.  Advance a local copy of CUM past the last "real" named
18774      argument, to find out how many registers are left over.  */
18775   local_cum = *cum;
18776   aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
18777 
18778   /* Find out how many registers we need to save.
18779      Honor the tree-stdarg analysis results.  */
18780   if (cfun->va_list_gpr_size)
18781     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
18782 		    cfun->va_list_gpr_size / UNITS_PER_WORD);
18783   if (cfun->va_list_fpr_size)
18784     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
18785 		    cfun->va_list_fpr_size / UNITS_PER_VREG);
18786 
18787   if (!TARGET_FLOAT)
18788     {
18789       gcc_assert (local_cum.aapcs_nvrn == 0);
18790       vr_saved = 0;
18791     }
18792 
18793   if (!no_rtl)
18794     {
18795       if (gr_saved > 0)
18796 	{
18797 	  rtx ptr, mem;
18798 
18799 	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
18800 	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
18801 			       - gr_saved * UNITS_PER_WORD);
18802 	  mem = gen_frame_mem (BLKmode, ptr);
18803 	  set_mem_alias_set (mem, get_varargs_alias_set ());
18804 
18805 	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
18806 			       mem, gr_saved);
18807 	}
18808       if (vr_saved > 0)
18809 	{
18810 	  /* We can't use move_block_from_reg, because it will use
18811 	     the wrong mode, storing D regs only.  */
18812 	  machine_mode mode = TImode;
18813 	  int off, i, vr_start;
18814 
18815 	  /* Set OFF to the offset from virtual_incoming_args_rtx of
18816 	     the first vector register.  The VR save area lies below
18817 	     the GR one, and is aligned to 16 bytes.  */
18818 	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
18819 			   STACK_BOUNDARY / BITS_PER_UNIT);
18820 	  off -= vr_saved * UNITS_PER_VREG;
18821 
18822 	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
18823 	  for (i = 0; i < vr_saved; ++i)
18824 	    {
18825 	      rtx ptr, mem;
18826 
18827 	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
18828 	      mem = gen_frame_mem (mode, ptr);
18829 	      set_mem_alias_set (mem, get_varargs_alias_set ());
18830 	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
18831 	      off += UNITS_PER_VREG;
18832 	    }
18833 	}
18834     }
18835 
18836   /* We don't save the size into *PRETEND_SIZE because we want to avoid
18837      any complication of having crtl->args.pretend_args_size changed.  */
18838   cfun->machine->frame.saved_varargs_size
18839     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
18840 		 STACK_BOUNDARY / BITS_PER_UNIT)
18841        + vr_saved * UNITS_PER_VREG);
18842 }
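
/* Illustrative layout of the save areas set up above (higher addresses
   at the top; gr_saved and vr_saved are the counts computed above):

     +---------------------------+  <- virtual_incoming_args_rtx
     |  GR save area             |     (== __gr_top in va_start)
     |  gr_saved * 8 bytes       |
     +---------------------------+  <- rounded down to 16 bytes
     |  VR save area             |     (== __vr_top in va_start)
     |  vr_saved * 16 bytes      |
     +---------------------------+

   Incoming stack arguments live at and above virtual_incoming_args_rtx,
   with __stack pointing just past the named portion.  */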
18843 
18844 static void
18845 aarch64_conditional_register_usage (void)
18846 {
18847   int i;
18848   if (!TARGET_FLOAT)
18849     {
18850       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
18851 	{
18852 	  fixed_regs[i] = 1;
18853 	  call_used_regs[i] = 1;
18854 	}
18855     }
18856   if (!TARGET_SVE)
18857     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
18858       {
18859 	fixed_regs[i] = 1;
18860 	call_used_regs[i] = 1;
18861       }
18862 
18863   /* Only allow the FFR and FFRT to be accessed via special patterns.  */
18864   CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
18865   CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
18866 
18867   /* When tracking speculation, we need a couple of call-clobbered registers
18868      to track the speculation state.  It would be nice to just use
18869      IP0 and IP1, but currently there are numerous places that just
18870      assume these registers are free for other uses (e.g. pointer
18871      authentication).  */
18872   if (aarch64_track_speculation)
18873     {
18874       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
18875       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
18876       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
18877       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
18878     }
18879 }
18880 
18881 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK.  */
18882 
18883 bool
18884 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
18885 {
18886   /* For records we're passed a FIELD_DECL, for arrays we're passed
18887      an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
18888   const_tree type = TREE_TYPE (field_or_array);
18889 
18890   /* Assign BLKmode to anything that contains multiple SVE predicates.
18891      For structures, the "multiple" case is indicated by MODE being
18892      VOIDmode.  */
18893   unsigned int num_zr, num_pr;
18894   if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
18895     {
18896       if (TREE_CODE (field_or_array) == ARRAY_TYPE)
18897 	return !simple_cst_equal (TYPE_SIZE (field_or_array),
18898 				  TYPE_SIZE (type));
18899       return mode == VOIDmode;
18900     }
18901 
18902   return default_member_type_forces_blk (field_or_array, mode);
18903 }
18904 
18905 /* Bitmasks that indicate whether earlier versions of GCC would have
18906    taken a different path through the ABI logic.  This should result in
18907    a -Wpsabi warning if the earlier path led to a different ABI decision.
18908 
18909    WARN_PSABI_EMPTY_CXX17_BASE
18910       Indicates that the type includes an artificial empty C++17 base field
18911       that, prior to GCC 10.1, would prevent the type from being treated as
18912       a HFA or HVA.  See PR94383 for details.
18913 
18914    WARN_PSABI_NO_UNIQUE_ADDRESS
18915       Indicates that the type includes an empty [[no_unique_address]] field
18916       that, prior to GCC 10.1, would prevent the type from being treated as
18917       a HFA or HVA.  */
18918 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
18919 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
18920 
18921 /* Walk down the type tree of TYPE counting consecutive base elements.
18922    If *MODEP is VOIDmode, then set it to the first valid floating point
18923    type.  If a non-floating point type is found, or if a floating point
18924    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
18925    otherwise return the count in the sub-tree.
18926 
18927    The WARN_PSABI_FLAGS argument allows the caller to check whether this
18928    function has changed its behavior relative to earlier versions of GCC.
18929    Normally the argument should be nonnull and point to a zero-initialized
18930    variable.  The function then records whether the ABI decision might
18931    be affected by a known fix to the ABI logic, setting the associated
18932    WARN_PSABI_* bits if so.
18933 
18934    When the argument is instead a null pointer, the function tries to
18935    simulate the behavior of GCC before all such ABI fixes were made.
18936    This is useful to check whether the function returns something
18937    different after the ABI fixes.  */
18938 static int
18939 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
18940 			 unsigned int *warn_psabi_flags)
18941 {
18942   machine_mode mode;
18943   HOST_WIDE_INT size;
18944 
18945   if (aarch64_sve::builtin_type_p (type))
18946     return -1;
18947 
18948   switch (TREE_CODE (type))
18949     {
18950     case REAL_TYPE:
18951       mode = TYPE_MODE (type);
18952       if (mode != DFmode && mode != SFmode
18953 	  && mode != TFmode && mode != HFmode)
18954 	return -1;
18955 
18956       if (*modep == VOIDmode)
18957 	*modep = mode;
18958 
18959       if (*modep == mode)
18960 	return 1;
18961 
18962       break;
18963 
18964     case COMPLEX_TYPE:
18965       mode = TYPE_MODE (TREE_TYPE (type));
18966       if (mode != DFmode && mode != SFmode
18967 	  && mode != TFmode && mode != HFmode)
18968 	return -1;
18969 
18970       if (*modep == VOIDmode)
18971 	*modep = mode;
18972 
18973       if (*modep == mode)
18974 	return 2;
18975 
18976       break;
18977 
18978     case VECTOR_TYPE:
18979       /* Use V2SImode and V4SImode as representatives of all 64-bit
18980 	 and 128-bit vector types.  */
18981       size = int_size_in_bytes (type);
18982       switch (size)
18983 	{
18984 	case 8:
18985 	  mode = V2SImode;
18986 	  break;
18987 	case 16:
18988 	  mode = V4SImode;
18989 	  break;
18990 	default:
18991 	  return -1;
18992 	}
18993 
18994       if (*modep == VOIDmode)
18995 	*modep = mode;
18996 
18997       /* Vector modes are considered to be opaque: two vectors are
18998 	 equivalent for the purposes of being homogeneous aggregates
18999 	 if they are the same size.  */
19000       if (*modep == mode)
19001 	return 1;
19002 
19003       break;
19004 
19005     case ARRAY_TYPE:
19006       {
19007 	int count;
19008 	tree index = TYPE_DOMAIN (type);
19009 
19010 	/* Can't handle incomplete types or sizes that are not
19011 	   fixed.  */
19012 	if (!COMPLETE_TYPE_P (type)
19013 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
19014 	  return -1;
19015 
19016 	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
19017 					 warn_psabi_flags);
19018 	if (count == -1
19019 	    || !index
19020 	    || !TYPE_MAX_VALUE (index)
19021 	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
19022 	    || !TYPE_MIN_VALUE (index)
19023 	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
19024 	    || count < 0)
19025 	  return -1;
19026 
19027 	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
19028 		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
19029 
19030 	/* There must be no padding.  */
19031 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
19032 		      count * GET_MODE_BITSIZE (*modep)))
19033 	  return -1;
19034 
19035 	return count;
19036       }
19037 
19038     case RECORD_TYPE:
19039       {
19040 	int count = 0;
19041 	int sub_count;
19042 	tree field;
19043 
19044 	/* Can't handle incomplete types or sizes that are not
19045 	   fixed.  */
19046 	if (!COMPLETE_TYPE_P (type)
19047 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
19048 	  return -1;
19049 
19050 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
19051 	  {
19052 	    if (TREE_CODE (field) != FIELD_DECL)
19053 	      continue;
19054 
19055 	    if (DECL_FIELD_ABI_IGNORED (field))
19056 	      {
19057 		/* See whether this is something that earlier versions of
19058 		   GCC failed to ignore.  */
19059 		unsigned int flag;
19060 		if (lookup_attribute ("no_unique_address",
19061 				      DECL_ATTRIBUTES (field)))
19062 		  flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
19063 		else if (cxx17_empty_base_field_p (field))
19064 		  flag = WARN_PSABI_EMPTY_CXX17_BASE;
19065 		else
19066 		  /* No compatibility problem.  */
19067 		  continue;
19068 
19069 		/* Simulate the old behavior when WARN_PSABI_FLAGS is null.  */
19070 		if (warn_psabi_flags)
19071 		  {
19072 		    *warn_psabi_flags |= flag;
19073 		    continue;
19074 		  }
19075 	      }
19076 
19077 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
19078 						 warn_psabi_flags);
19079 	    if (sub_count < 0)
19080 	      return -1;
19081 	    count += sub_count;
19082 	  }
19083 
19084 	/* There must be no padding.  */
19085 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
19086 		      count * GET_MODE_BITSIZE (*modep)))
19087 	  return -1;
19088 
19089 	return count;
19090       }
19091 
19092     case UNION_TYPE:
19093     case QUAL_UNION_TYPE:
19094       {
19095 	/* These aren't very interesting except in a degenerate case.  */
19096 	int count = 0;
19097 	int sub_count;
19098 	tree field;
19099 
19100 	/* Can't handle incomplete types or sizes that are not
19101 	   fixed.  */
19102 	if (!COMPLETE_TYPE_P (type)
19103 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
19104 	  return -1;
19105 
19106 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
19107 	  {
19108 	    if (TREE_CODE (field) != FIELD_DECL)
19109 	      continue;
19110 
19111 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
19112 						 warn_psabi_flags);
19113 	    if (sub_count < 0)
19114 	      return -1;
19115 	    count = count > sub_count ? count : sub_count;
19116 	  }
19117 
19118 	/* There must be no padding.  */
19119 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
19120 		      count * GET_MODE_BITSIZE (*modep)))
19121 	  return -1;
19122 
19123 	return count;
19124       }
19125 
19126     default:
19127       break;
19128     }
19129 
19130   return -1;
19131 }
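
/* Examples of the counting above (illustrative only):

     struct { float x, y, z; }      -> 3, *MODEP == SFmode  (HFA)
     struct { double d[4]; }        -> 4, *MODEP == DFmode  (HFA)
     struct { float f; double d; }  -> -1  (mixed base types)
     struct { float f; int i; }     -> -1  (non-FP member)

   A result in the range [1, HA_MAX_NUM_FLDS] makes the type a candidate
   homogeneous aggregate for the caller below.  */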
19132 
19133 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
19134    type as described in AAPCS64 \S 4.1.2.
19135 
19136    See the comment above aarch64_composite_type_p for the notes on MODE.  */
19137 
19138 static bool
19139 aarch64_short_vector_p (const_tree type,
19140 			machine_mode mode)
19141 {
19142   poly_int64 size = -1;
19143 
19144   if (type && TREE_CODE (type) == VECTOR_TYPE)
19145     {
19146       if (aarch64_sve::builtin_type_p (type))
19147 	return false;
19148       size = int_size_in_bytes (type);
19149     }
19150   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
19151 	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
19152     {
19153       /* Rely only on the type, not the mode, when processing SVE types.  */
19154       if (type && aarch64_some_values_include_pst_objects_p (type))
19155 	/* Leave later code to report an error if SVE is disabled.  */
19156 	gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
19157       else
19158 	size = GET_MODE_SIZE (mode);
19159     }
19160   if (known_eq (size, 8) || known_eq (size, 16))
19161     {
19162       /* 64-bit and 128-bit vectors should only acquire an SVE mode if
19163 	 they are being treated as scalable AAPCS64 types.  */
19164       gcc_assert (!aarch64_sve_mode_p (mode));
19165       return true;
19166     }
19167   return false;
19168 }
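
/* For example (illustrative only): the Advanced SIMD types int32x2_t
   (8 bytes) and float32x4_t (16 bytes) are short vectors in the AAPCS64
   sense, whereas SVE types such as svfloat32_t and GNU vector types of
   any other size (say, a 32-byte vector_size type) are not.  */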
19169 
19170 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
19171    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
19172    array types.  The C99 floating-point complex types are also considered
19173    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
19174    types, which are GCC extensions and out of the scope of AAPCS64, are
19175    treated as composite types here as well.
19176 
19177    Note that MODE itself is not sufficient in determining whether a type
19178    is such a composite type or not.  This is because
19179    stor-layout.c:compute_record_mode may have already changed the MODE
19180    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
19181    structure with only one field may have its MODE set to the mode of the
19182    field.  Also an integer mode whose size matches the size of the
19183    RECORD_TYPE type may be used to substitute the original mode
19184    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
19185    solely relied on.  */
19186 
19187 static bool
19188 aarch64_composite_type_p (const_tree type,
19189 			  machine_mode mode)
19190 {
19191   if (aarch64_short_vector_p (type, mode))
19192     return false;
19193 
19194   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
19195     return true;
19196 
19197   if (mode == BLKmode
19198       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
19199       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
19200     return true;
19201 
19202   return false;
19203 }
19204 
19205 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
19206    shall be passed or returned in simd/fp register(s) (providing these
19207    parameter passing registers are available).
19208 
19209    Upon successful return, *COUNT returns the number of needed registers,
19210    *BASE_MODE returns the mode of the individual register and when IS_HA
19211    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
19212    floating-point aggregate or a homogeneous short-vector aggregate.
19213 
19214    SILENT_P is true if the function should refrain from reporting any
19215    diagnostics.  This should only be used if the caller is certain that
19216    any ABI decisions would eventually come through this function with
19217    SILENT_P set to false.  */
19218 
19219 static bool
19220 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
19221 					 const_tree type,
19222 					 machine_mode *base_mode,
19223 					 int *count,
19224 					 bool *is_ha,
19225 					 bool silent_p)
19226 {
19227   if (is_ha != NULL) *is_ha = false;
19228 
19229   machine_mode new_mode = VOIDmode;
19230   bool composite_p = aarch64_composite_type_p (type, mode);
19231 
19232   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
19233       || aarch64_short_vector_p (type, mode))
19234     {
19235       *count = 1;
19236       new_mode = mode;
19237     }
19238   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
19239     {
19240       if (is_ha != NULL) *is_ha = true;
19241       *count = 2;
19242       new_mode = GET_MODE_INNER (mode);
19243     }
19244   else if (type && composite_p)
19245     {
19246       unsigned int warn_psabi_flags = 0;
19247       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
19248 					      &warn_psabi_flags);
19249       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
19250 	{
19251 	  static unsigned last_reported_type_uid;
19252 	  unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
19253 	  int alt;
19254 	  if (!silent_p
19255 	      && warn_psabi
19256 	      && warn_psabi_flags
19257 	      && uid != last_reported_type_uid
19258 	      && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
19259 		  != ag_count))
19260 	    {
19261 	      const char *url
19262 		= CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
19263 	      gcc_assert (alt == -1);
19264 	      last_reported_type_uid = uid;
19265 	      /* Use TYPE_MAIN_VARIANT to strip any redundant const
19266 		 qualification.  */
19267 	      if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
19268 		inform (input_location, "parameter passing for argument of "
19269 			"type %qT with %<[[no_unique_address]]%> members "
19270 			"changed %{in GCC 10.1%}",
19271 			TYPE_MAIN_VARIANT (type), url);
19272 	      else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
19273 		inform (input_location, "parameter passing for argument of "
19274 			"type %qT when C++17 is enabled changed to match "
19275 			"C++14 %{in GCC 10.1%}",
19276 			TYPE_MAIN_VARIANT (type), url);
19277 	    }
19278 
19279 	  if (is_ha != NULL) *is_ha = true;
19280 	  *count = ag_count;
19281 	}
19282       else
19283 	return false;
19284     }
19285   else
19286     return false;
19287 
19288   gcc_assert (!aarch64_sve_mode_p (new_mode));
19289   *base_mode = new_mode;
19290   return true;
19291 }
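
/* Illustrative -Wpsabi case for the diagnostics above (assumed example,
   not from the original sources):

     struct empty {};
     struct S { float f; [[no_unique_address]] empty e; };

   GCC 10.1 and later ignore E, so S is treated as an HFA and passed in
   a floating-point register; earlier releases did not, which is the
   situation the inform () calls report.  */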
19292 
19293 /* Implement TARGET_STRUCT_VALUE_RTX.  */
19294 
19295 static rtx
19296 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
19297 			  int incoming ATTRIBUTE_UNUSED)
19298 {
19299   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
19300 }
19301 
19302 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  */
19303 static bool
19304 aarch64_vector_mode_supported_p (machine_mode mode)
19305 {
19306   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19307   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
19308 }
19309 
19310 /* Return the full-width SVE vector mode for element mode MODE, if one
19311    exists.  */
19312 opt_machine_mode
19313 aarch64_full_sve_mode (scalar_mode mode)
19314 {
19315   switch (mode)
19316     {
19317     case E_DFmode:
19318       return VNx2DFmode;
19319     case E_SFmode:
19320       return VNx4SFmode;
19321     case E_HFmode:
19322       return VNx8HFmode;
19323     case E_BFmode:
19324       return VNx8BFmode;
19325     case E_DImode:
19326       return VNx2DImode;
19327     case E_SImode:
19328       return VNx4SImode;
19329     case E_HImode:
19330       return VNx8HImode;
19331     case E_QImode:
19332       return VNx16QImode;
19333     default:
19334       return opt_machine_mode ();
19335     }
19336 }
19337 
19338 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
19339    if it exists.  */
19340 opt_machine_mode
19341 aarch64_vq_mode (scalar_mode mode)
19342 {
19343   switch (mode)
19344     {
19345     case E_DFmode:
19346       return V2DFmode;
19347     case E_SFmode:
19348       return V4SFmode;
19349     case E_HFmode:
19350       return V8HFmode;
19351     case E_BFmode:
19352       return V8BFmode;
19353     case E_SImode:
19354       return V4SImode;
19355     case E_HImode:
19356       return V8HImode;
19357     case E_QImode:
19358       return V16QImode;
19359     case E_DImode:
19360       return V2DImode;
19361     default:
19362       return opt_machine_mode ();
19363     }
19364 }
19365 
19366 /* Return appropriate SIMD container
19367    for MODE within a vector of WIDTH bits.  */
19368 static machine_mode
19369 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
19370 {
19371   if (TARGET_SVE
19372       && maybe_ne (width, 128)
19373       && known_eq (width, BITS_PER_SVE_VECTOR))
19374     return aarch64_full_sve_mode (mode).else_mode (word_mode);
19375 
19376   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
19377   if (TARGET_SIMD)
19378     {
19379       if (known_eq (width, 128))
19380 	return aarch64_vq_mode (mode).else_mode (word_mode);
19381       else
19382 	switch (mode)
19383 	  {
19384 	  case E_SFmode:
19385 	    return V2SFmode;
19386 	  case E_HFmode:
19387 	    return V4HFmode;
19388 	  case E_BFmode:
19389 	    return V4BFmode;
19390 	  case E_SImode:
19391 	    return V2SImode;
19392 	  case E_HImode:
19393 	    return V4HImode;
19394 	  case E_QImode:
19395 	    return V8QImode;
19396 	  default:
19397 	    break;
19398 	  }
19399     }
19400   return word_mode;
19401 }
19402 
19403 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
19404    and return whether the SVE mode should be preferred over the
19405    Advanced SIMD one in aarch64_autovectorize_vector_modes.  */
19406 static bool
19407 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
19408 {
19409   /* Take into account the aarch64-autovec-preference param if non-zero.  */
19410   bool only_asimd_p = aarch64_autovec_preference == 1;
19411   bool only_sve_p = aarch64_autovec_preference == 2;
19412 
19413   if (only_asimd_p)
19414     return false;
19415   if (only_sve_p)
19416     return true;
19417 
19418   /* The preference in case of a tie in costs.  */
19419   bool prefer_asimd = aarch64_autovec_preference == 3;
19420   bool prefer_sve = aarch64_autovec_preference == 4;
19421 
19422   poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
19423   poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
19424   /* If the CPU information does not have an SVE width registered, use the
19425      generic poly_int comparison that prefers SVE.  If a preference is
19426      explicitly requested, avoid this path.  */
19427   if (aarch64_tune_params.sve_width == SVE_SCALABLE
19428       && !prefer_asimd
19429       && !prefer_sve)
19430     return maybe_gt (nunits_sve, nunits_asimd);
19431 
19432   /* Otherwise estimate the runtime width of the modes involved.  */
19433   HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
19434   HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
19435 
19436   /* Preferring SVE means picking it first unless the Advanced SIMD mode
19437      is clearly wider.  */
19438   if (prefer_sve)
19439     return est_sve >= est_asimd;
19440   /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
19441      is clearly wider.  */
19442   if (prefer_asimd)
19443     return est_sve > est_asimd;
19444 
19445   /* In the default case prefer Advanced SIMD over SVE in case of a tie.  */
19446   return est_sve > est_asimd;
19447 }
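
/* Worked example for the comparison above (illustrative only): if the
   tuning target registers a 256-bit SVE width, then for VNx4SImode
   versus V4SImode we get est_sve == 8 and est_asimd == 4, so SVE is
   chosen under the default policy (est_sve > est_asimd).  With a
   128-bit SVE width the estimates tie at 4 and Advanced SIMD wins,
   unless --param=aarch64-autovec-preference=4 asks for SVE on ties.  */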
19448 
19449 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
19450 static machine_mode
19451 aarch64_preferred_simd_mode (scalar_mode mode)
19452 {
19453   /* Take into account explicit auto-vectorization ISA preferences through
19454      aarch64_cmp_autovec_modes.  */
19455   if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
19456     return aarch64_full_sve_mode (mode).else_mode (word_mode);
19457   if (TARGET_SIMD)
19458     return aarch64_vq_mode (mode).else_mode (word_mode);
19459   return word_mode;
19460 }
19461 
19462 /* Return a list of possible vector sizes for the vectorizer
19463    to iterate over.  */
19464 static unsigned int
19465 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
19466 {
19467   static const machine_mode sve_modes[] = {
19468     /* Try using full vectors for all element types.  */
19469     VNx16QImode,
19470 
19471     /* Try using 16-bit containers for 8-bit elements and full vectors
19472        for wider elements.  */
19473     VNx8QImode,
19474 
19475     /* Try using 32-bit containers for 8-bit and 16-bit elements and
19476        full vectors for wider elements.  */
19477     VNx4QImode,
19478 
19479     /* Try using 64-bit containers for all element types.  */
19480     VNx2QImode
19481   };
19482 
19483   static const machine_mode advsimd_modes[] = {
19484     /* Try using 128-bit vectors for all element types.  */
19485     V16QImode,
19486 
19487     /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
19488        for wider elements.  */
19489     V8QImode,
19490 
19491     /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
19492        for wider elements.
19493 
19494        TODO: We could support a limited form of V4QImode too, so that
19495        we use 32-bit vectors for 8-bit elements.  */
19496     V4HImode,
19497 
19498     /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
19499        for 64-bit elements.
19500 
19501        TODO: We could similarly support limited forms of V2QImode and V2HImode
19502        for this case.  */
19503     V2SImode
19504   };
19505 
19506   /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
19507      This is because:
19508 
19509      - If we can't use N-byte Advanced SIMD vectors then the placement
19510        doesn't matter; we'll just continue as though the Advanced SIMD
19511        entry didn't exist.
19512 
19513      - If an SVE main loop with N bytes ends up being cheaper than an
19514        Advanced SIMD main loop with N bytes then by default we'll replace
19515        the Advanced SIMD version with the SVE one.
19516 
19517      - If an Advanced SIMD main loop with N bytes ends up being cheaper
19518        than an SVE main loop with N bytes then by default we'll try to
19519        use the SVE loop to vectorize the epilogue instead.  */
19520 
19521   bool only_asimd_p = aarch64_autovec_preference == 1;
19522   bool only_sve_p = aarch64_autovec_preference == 2;
19523 
19524   unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
19525   unsigned int advsimd_i = 0;
19526 
19527   while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
19528     {
19529       if (sve_i < ARRAY_SIZE (sve_modes)
19530 	  && aarch64_cmp_autovec_modes (sve_modes[sve_i],
19531 					advsimd_modes[advsimd_i]))
19532 	modes->safe_push (sve_modes[sve_i++]);
19533       else
19534 	modes->safe_push (advsimd_modes[advsimd_i++]);
19535     }
19536   while (sve_i < ARRAY_SIZE (sve_modes))
19537     modes->safe_push (sve_modes[sve_i++]);
19538 
19539   unsigned int flags = 0;
19540   /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
19541      can compare SVE against Advanced SIMD and so that we can compare
19542      multiple SVE vectorization approaches against each other.  There's
19543      not really any point doing this for Advanced SIMD only, since the
19544      first mode that works should always be the best.  */
19545   if (TARGET_SVE && aarch64_sve_compare_costs)
19546     flags |= VECT_COMPARE_COSTS;
19547   return flags;
19548 }
19549 
19550 /* Implement TARGET_MANGLE_TYPE.  */
19551 
19552 static const char *
19553 aarch64_mangle_type (const_tree type)
19554 {
19555   /* The AArch64 ABI documents say that "__va_list" has to be
19556      mangled as if it is in the "std" namespace.  */
19557   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
19558     return "St9__va_list";
19559 
19560   /* Half-precision floating point types.  */
19561   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
19562     {
19563       if (TYPE_MODE (type) == BFmode)
19564 	return "u6__bf16";
19565       else
19566 	return "Dh";
19567     }
19568 
19569   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
19570      builtin types.  */
19571   if (TYPE_NAME (type) != NULL)
19572     {
19573       const char *res;
19574       if ((res = aarch64_general_mangle_builtin_type (type))
19575 	  || (res = aarch64_sve::mangle_builtin_type (type)))
19576 	return res;
19577     }
19578 
19579   /* Use the default mangling.  */
19580   return NULL;
19581 }
19582 
19583 /* Implement TARGET_VERIFY_TYPE_CONTEXT.  */
19584 
19585 static bool
19586 aarch64_verify_type_context (location_t loc, type_context_kind context,
19587 			     const_tree type, bool silent_p)
19588 {
19589   return aarch64_sve::verify_type_context (loc, context, type, silent_p);
19590 }
19591 
19592 /* Find the first rtx_insn before insn that will generate an assembly
19593    instruction.  */
19594 
19595 static rtx_insn *
19596 aarch64_prev_real_insn (rtx_insn *insn)
19597 {
19598   if (!insn)
19599     return NULL;
19600 
19601   do
19602     {
19603       insn = prev_real_insn (insn);
19604     }
19605   while (insn && recog_memoized (insn) < 0);
19606 
19607   return insn;
19608 }
19609 
19610 static bool
19611 is_madd_op (enum attr_type t1)
19612 {
19613   unsigned int i;
19614   /* A number of these may be AArch32 only.  */
19615   enum attr_type mlatypes[] = {
19616     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
19617     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
19618     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
19619   };
19620 
19621   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
19622     {
19623       if (t1 == mlatypes[i])
19624 	return true;
19625     }
19626 
19627   return false;
19628 }
19629 
19630 /* Check if there is a register dependency between a load and the insn
19631    for which we hold recog_data.  */
19632 
19633 static bool
19634 dep_between_memop_and_curr (rtx memop)
19635 {
19636   rtx load_reg;
19637   int opno;
19638 
19639   gcc_assert (GET_CODE (memop) == SET);
19640 
19641   if (!REG_P (SET_DEST (memop)))
19642     return false;
19643 
19644   load_reg = SET_DEST (memop);
19645   for (opno = 1; opno < recog_data.n_operands; opno++)
19646     {
19647       rtx operand = recog_data.operand[opno];
19648       if (REG_P (operand)
19649           && reg_overlap_mentioned_p (load_reg, operand))
19650         return true;
19651 
19652     }
19653   return false;
19654 }
19655 
19656 
19657 /* When working around the Cortex-A53 erratum 835769,
19658    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
19659    instruction and has a preceding memory instruction such that a NOP
19660    should be inserted between them.  */
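/* For example, "ldr x1, [x2]" followed by "madd x3, x4, x5, x6" needs a
   NOP between the two, whereas "ldr x4, [x2]" followed by the same madd
   does not, since dep_between_memop_and_curr sees that the madd reads the
   loaded register.  */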
19661 
19662 bool
19663 aarch64_madd_needs_nop (rtx_insn* insn)
19664 {
19665   enum attr_type attr_type;
19666   rtx_insn *prev;
19667   rtx body;
19668 
19669   if (!TARGET_FIX_ERR_A53_835769)
19670     return false;
19671 
19672   if (!INSN_P (insn) || recog_memoized (insn) < 0)
19673     return false;
19674 
19675   attr_type = get_attr_type (insn);
19676   if (!is_madd_op (attr_type))
19677     return false;
19678 
19679   prev = aarch64_prev_real_insn (insn);
19680   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
19681      Restore recog state to INSN to avoid state corruption.  */
19682   extract_constrain_insn_cached (insn);
19683 
19684   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
19685     return false;
19686 
19687   body = single_set (prev);
19688 
19689   /* If the previous insn is a memory op and there is no dependency between
19690      it and the DImode madd, emit a NOP between them.  If body is NULL then we
19691      have a complex memory operation, probably a load/store pair.
19692      Be conservative for now and emit a NOP.  */
19693   if (GET_MODE (recog_data.operand[0]) == DImode
19694       && (!body || !dep_between_memop_and_curr (body)))
19695     return true;
19696 
19697   return false;
19698 
19699 }
19700 
19701 
19702 /* Implement FINAL_PRESCAN_INSN.  */
19703 
19704 void
19705 aarch64_final_prescan_insn (rtx_insn *insn)
19706 {
19707   if (aarch64_madd_needs_nop (insn))
19708     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
19709 }
19710 
19711 
19712 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
19713    instruction.  */
19714 
19715 bool
19716 aarch64_sve_index_immediate_p (rtx base_or_step)
19717 {
19718   return (CONST_INT_P (base_or_step)
19719 	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
19720 }
19721 
19722 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
19723    when applied to mode MODE.  Negate X first if NEGATE_P is true.  */
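/* For example, for .H elements #0x23 and #0x2300 (i.e. #0x23, LSL #8) are
   valid, but #0x123 is not: after masking, the value must be an 8-bit
   quantity optionally shifted left by eight bits.  */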
19724 
19725 bool
19726 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
19727 {
19728   rtx elt = unwrap_const_vec_duplicate (x);
19729   if (!CONST_INT_P (elt))
19730     return false;
19731 
19732   HOST_WIDE_INT val = INTVAL (elt);
19733   if (negate_p)
19734     val = -val;
19735   val &= GET_MODE_MASK (GET_MODE_INNER (mode));
19736 
19737   if (val & 0xff)
19738     return IN_RANGE (val, 0, 0xff);
19739   return IN_RANGE (val, 0, 0xff00);
19740 }
19741 
19742 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
19743    instructions when applied to mode MODE.  Negate X first if NEGATE_P
19744    is true.  */
19745 
19746 bool
19747 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
19748 {
19749   if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
19750     return false;
19751 
19752   /* After the optional negation, the immediate must be nonnegative.
19753      E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
19754      instead of SQADD Zn.B, Zn.B, #129.  */
19755   rtx elt = unwrap_const_vec_duplicate (x);
19756   return negate_p == (INTVAL (elt) < 0);
19757 }
19758 
19759 /* Return true if X is a valid immediate operand for an SVE logical
19760    instruction such as AND.  */
19761 
19762 bool
19763 aarch64_sve_bitmask_immediate_p (rtx x)
19764 {
19765   rtx elt;
19766 
19767   return (const_vec_duplicate_p (x, &elt)
19768 	  && CONST_INT_P (elt)
19769 	  && aarch64_bitmask_imm (INTVAL (elt),
19770 				  GET_MODE_INNER (GET_MODE (x))));
19771 }
19772 
19773 /* Return true if X is a valid immediate for the SVE DUP and CPY
19774    instructions.  */
19775 
19776 bool
19777 aarch64_sve_dup_immediate_p (rtx x)
19778 {
19779   x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
19780   if (!CONST_INT_P (x))
19781     return false;
19782 
19783   HOST_WIDE_INT val = INTVAL (x);
19784   if (val & 0xff)
19785     return IN_RANGE (val, -0x80, 0x7f);
19786   return IN_RANGE (val, -0x8000, 0x7f00);
19787 }
19788 
19789 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
19790    SIGNED_P says whether the operand is signed rather than unsigned.  */
19791 
19792 bool
19793 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
19794 {
19795   x = unwrap_const_vec_duplicate (x);
19796   return (CONST_INT_P (x)
19797 	  && (signed_p
19798 	      ? IN_RANGE (INTVAL (x), -16, 15)
19799 	      : IN_RANGE (INTVAL (x), 0, 127)));
19800 }
19801 
19802 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
19803    instruction.  Negate X first if NEGATE_P is true.  */
19804 
19805 bool
19806 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
19807 {
19808   rtx elt;
19809   REAL_VALUE_TYPE r;
19810 
19811   if (!const_vec_duplicate_p (x, &elt)
19812       || !CONST_DOUBLE_P (elt))
19813     return false;
19814 
19815   r = *CONST_DOUBLE_REAL_VALUE (elt);
19816 
19817   if (negate_p)
19818     r = real_value_negate (&r);
19819 
19820   if (real_equal (&r, &dconst1))
19821     return true;
19822   if (real_equal (&r, &dconsthalf))
19823     return true;
19824   return false;
19825 }
19826 
19827 /* Return true if X is a valid immediate operand for an SVE FMUL
19828    instruction.  */
19829 
19830 bool
19831 aarch64_sve_float_mul_immediate_p (rtx x)
19832 {
19833   rtx elt;
19834 
19835   return (const_vec_duplicate_p (x, &elt)
19836 	  && CONST_DOUBLE_P (elt)
19837 	  && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
19838 	      || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
19839 }
19840 
19841 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
19842    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
19843    is nonnull, use it to describe valid immediates.  */
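/* For example, 0x00ab0000 matches the 4-byte LSL case (0xab, LSL #16),
   0xab00ab00 matches the 2-byte LSL case (0xab, LSL #8 in each halfword)
   and 0x0000abff matches the MSL case (0xab, MSL #8).  */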
19844 static bool
19845 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
19846 				    simd_immediate_info *info,
19847 				    enum simd_immediate_check which,
19848 				    simd_immediate_info::insn_type insn)
19849 {
19850   /* Try a 4-byte immediate with LSL.  */
19851   for (unsigned int shift = 0; shift < 32; shift += 8)
19852     if ((val32 & (0xff << shift)) == val32)
19853       {
19854 	if (info)
19855 	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
19856 				       simd_immediate_info::LSL, shift);
19857 	return true;
19858       }
19859 
19860   /* Try a 2-byte immediate with LSL.  */
19861   unsigned int imm16 = val32 & 0xffff;
19862   if (imm16 == (val32 >> 16))
19863     for (unsigned int shift = 0; shift < 16; shift += 8)
19864       if ((imm16 & (0xff << shift)) == imm16)
19865 	{
19866 	  if (info)
19867 	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
19868 					 simd_immediate_info::LSL, shift);
19869 	  return true;
19870 	}
19871 
19872   /* Try a 4-byte immediate with MSL, except for cases that MVN
19873      can handle.  */
19874   if (which == AARCH64_CHECK_MOV)
19875     for (unsigned int shift = 8; shift < 24; shift += 8)
19876       {
19877 	unsigned int low = (1 << shift) - 1;
19878 	if (((val32 & (0xff << shift)) | low) == val32)
19879 	  {
19880 	    if (info)
19881 	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
19882 					   simd_immediate_info::MSL, shift);
19883 	    return true;
19884 	  }
19885       }
19886 
19887   return false;
19888 }
19889 
19890 /* Return true if replicating VAL64 is a valid immediate for the
19891    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
19892    use it to describe valid immediates.  */
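/* For example, VAL64 == 0xffff00000000ff00 is accepted for AARCH64_CHECK_MOV
   by the bit-to-bytemask case below, since every byte is 0x00 or 0xff.  */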
19893 static bool
19894 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
19895 				 simd_immediate_info *info,
19896 				 enum simd_immediate_check which)
19897 {
19898   unsigned int val32 = val64 & 0xffffffff;
19899   unsigned int val16 = val64 & 0xffff;
19900   unsigned int val8 = val64 & 0xff;
19901 
19902   if (val32 == (val64 >> 32))
19903     {
19904       if ((which & AARCH64_CHECK_ORR) != 0
19905 	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
19906 						 simd_immediate_info::MOV))
19907 	return true;
19908 
19909       if ((which & AARCH64_CHECK_BIC) != 0
19910 	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
19911 						 simd_immediate_info::MVN))
19912 	return true;
19913 
19914       /* Try using a replicated byte.  */
19915       if (which == AARCH64_CHECK_MOV
19916 	  && val16 == (val32 >> 16)
19917 	  && val8 == (val16 >> 8))
19918 	{
19919 	  if (info)
19920 	    *info = simd_immediate_info (QImode, val8);
19921 	  return true;
19922 	}
19923     }
19924 
19925   /* Try using a bit-to-bytemask.  */
19926   if (which == AARCH64_CHECK_MOV)
19927     {
19928       unsigned int i;
19929       for (i = 0; i < 64; i += 8)
19930 	{
19931 	  unsigned char byte = (val64 >> i) & 0xff;
19932 	  if (byte != 0 && byte != 0xff)
19933 	    break;
19934 	}
19935       if (i == 64)
19936 	{
19937 	  if (info)
19938 	    *info = simd_immediate_info (DImode, val64);
19939 	  return true;
19940 	}
19941     }
19942   return false;
19943 }
19944 
19945 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
19946    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
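/* For example, a replicated .H value of 0x0003 can use DUP #3, 0x1200 can
   use DUP #0x12, LSL #8, and 0x00ff can use DUPM, since the latter is a
   valid bitmask immediate.  */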
19947 
19948 static bool
19949 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
19950 			     simd_immediate_info *info)
19951 {
19952   scalar_int_mode mode = DImode;
19953   unsigned int val32 = val64 & 0xffffffff;
19954   if (val32 == (val64 >> 32))
19955     {
19956       mode = SImode;
19957       unsigned int val16 = val32 & 0xffff;
19958       if (val16 == (val32 >> 16))
19959 	{
19960 	  mode = HImode;
19961 	  unsigned int val8 = val16 & 0xff;
19962 	  if (val8 == (val16 >> 8))
19963 	    mode = QImode;
19964 	}
19965     }
19966   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
19967   if (IN_RANGE (val, -0x80, 0x7f))
19968     {
19969       /* DUP with no shift.  */
19970       if (info)
19971 	*info = simd_immediate_info (mode, val);
19972       return true;
19973     }
19974   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
19975     {
19976       /* DUP with LSL #8.  */
19977       if (info)
19978 	*info = simd_immediate_info (mode, val);
19979       return true;
19980     }
19981   if (aarch64_bitmask_imm (val64, mode))
19982     {
19983       /* DUPM.  */
19984       if (info)
19985 	*info = simd_immediate_info (mode, val);
19986       return true;
19987     }
19988   return false;
19989 }
19990 
19991 /* Return true if X is an UNSPEC_PTRUE constant of the form:
19992 
19993        (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
19994 
19995    where PATTERN is the svpattern as a CONST_INT and where ZERO
19996    is a zero constant of the required PTRUE mode (which can have
19997    fewer elements than X's mode, if zero bits are significant).
19998 
19999    If so, and if INFO is nonnull, describe the immediate in INFO.  */
20000 bool
20001 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
20002 {
20003   if (GET_CODE (x) != CONST)
20004     return false;
20005 
20006   x = XEXP (x, 0);
20007   if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
20008     return false;
20009 
20010   if (info)
20011     {
20012       aarch64_svpattern pattern
20013 	= (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
20014       machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
20015       scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
20016       *info = simd_immediate_info (int_mode, pattern);
20017     }
20018   return true;
20019 }
20020 
20021 /* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
20022    it to describe valid immediates.  */
20023 
20024 static bool
20025 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
20026 {
20027   if (aarch64_sve_ptrue_svpattern_p (x, info))
20028     return true;
20029 
20030   if (x == CONST0_RTX (GET_MODE (x)))
20031     {
20032       if (info)
20033 	*info = simd_immediate_info (DImode, 0);
20034       return true;
20035     }
20036 
20037   /* Analyze the value as a VNx16BImode.  This should be relatively
20038      efficient, since rtx_vector_builder has enough built-in capacity
20039      to store all VLA predicate constants without needing the heap.  */
20040   rtx_vector_builder builder;
20041   if (!aarch64_get_sve_pred_bits (builder, x))
20042     return false;
20043 
20044   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
20045   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
20046     {
20047       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
20048       aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
20049       if (pattern != AARCH64_NUM_SVPATTERNS)
20050 	{
20051 	  if (info)
20052 	    {
20053 	      scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
20054 	      *info = simd_immediate_info (int_mode, pattern);
20055 	    }
20056 	  return true;
20057 	}
20058     }
20059   return false;
20060 }
20061 
20062 /* Return true if OP is a valid SIMD immediate for the operation
20063    described by WHICH.  If INFO is nonnull, use it to describe valid
20064    immediates.  */
20065 bool
20066 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
20067 			      enum simd_immediate_check which)
20068 {
20069   machine_mode mode = GET_MODE (op);
20070   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20071   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
20072     return false;
20073 
20074   if (vec_flags & VEC_SVE_PRED)
20075     return aarch64_sve_pred_valid_immediate (op, info);
20076 
20077   scalar_mode elt_mode = GET_MODE_INNER (mode);
20078   rtx base, step;
20079   unsigned int n_elts;
20080   if (GET_CODE (op) == CONST_VECTOR
20081       && CONST_VECTOR_DUPLICATE_P (op))
20082     n_elts = CONST_VECTOR_NPATTERNS (op);
20083   else if ((vec_flags & VEC_SVE_DATA)
20084 	   && const_vec_series_p (op, &base, &step))
20085     {
20086       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
20087       if (!aarch64_sve_index_immediate_p (base)
20088 	  || !aarch64_sve_index_immediate_p (step))
20089 	return false;
20090 
20091       if (info)
20092 	{
20093 	  /* Get the corresponding container mode.  E.g. an INDEX on V2SI
20094 	     should yield two integer values per 128-bit block, meaning
20095 	     that we need to treat it in the same way as V2DI and then
20096 	     ignore the upper 32 bits of each element.  */
20097 	  elt_mode = aarch64_sve_container_int_mode (mode);
20098 	  *info = simd_immediate_info (elt_mode, base, step);
20099 	}
20100       return true;
20101     }
20102   else if (GET_CODE (op) == CONST_VECTOR
20103 	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
20104     /* N_ELTS set above.  */;
20105   else
20106     return false;
20107 
20108   scalar_float_mode elt_float_mode;
20109   if (n_elts == 1
20110       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
20111     {
20112       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
20113       if (aarch64_float_const_zero_rtx_p (elt)
20114 	  || aarch64_float_const_representable_p (elt))
20115 	{
20116 	  if (info)
20117 	    *info = simd_immediate_info (elt_float_mode, elt);
20118 	  return true;
20119 	}
20120     }
20121 
20122   /* If all elements in an SVE vector have the same value, we have a free
20123      choice between using the element mode and using the container mode.
20124      Using the element mode means that unused parts of the vector are
20125      duplicates of the used elements, while using the container mode means
20126      that the unused parts are an extension of the used elements.  Using the
20127      element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
20128      for its container mode VNx4SI while 0x00000101 isn't.
20129 
20130      If not all elements in an SVE vector have the same value, we need the
20131      transition from one element to the next to occur at container boundaries.
20132      E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
20133      in the same way as a VNx4SI containing { 1, 2, 3, 4 }.  */
20134   scalar_int_mode elt_int_mode;
20135   if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
20136     elt_int_mode = aarch64_sve_container_int_mode (mode);
20137   else
20138     elt_int_mode = int_mode_for_mode (elt_mode).require ();
20139 
20140   unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
20141   if (elt_size > 8)
20142     return false;
20143 
20144   /* Expand the vector constant out into a byte vector, with the least
20145      significant byte of the register first.  */
20146   auto_vec<unsigned char, 16> bytes;
20147   bytes.reserve (n_elts * elt_size);
20148   for (unsigned int i = 0; i < n_elts; i++)
20149     {
20150       /* The vector is provided in gcc endian-neutral fashion.
20151 	 For aarch64_be Advanced SIMD, it must be laid out in the vector
20152 	 register in reverse order.  */
20153       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
20154       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
20155 
20156       if (elt_mode != elt_int_mode)
20157 	elt = gen_lowpart (elt_int_mode, elt);
20158 
20159       if (!CONST_INT_P (elt))
20160 	return false;
20161 
20162       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
20163       for (unsigned int byte = 0; byte < elt_size; byte++)
20164 	{
20165 	  bytes.quick_push (elt_val & 0xff);
20166 	  elt_val >>= BITS_PER_UNIT;
20167 	}
20168     }
20169 
20170   /* The immediate must repeat every eight bytes.  */
20171   unsigned int nbytes = bytes.length ();
20172   for (unsigned i = 8; i < nbytes; ++i)
20173     if (bytes[i] != bytes[i - 8])
20174       return false;
20175 
20176   /* Get the repeating 8-byte value as an integer.  No endian correction
20177      is needed here because bytes is already in lsb-first order.  */
20178   unsigned HOST_WIDE_INT val64 = 0;
20179   for (unsigned int i = 0; i < 8; i++)
20180     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
20181 	      << (i * BITS_PER_UNIT));
20182 
20183   if (vec_flags & VEC_SVE_DATA)
20184     return aarch64_sve_valid_immediate (val64, info);
20185   else
20186     return aarch64_advsimd_valid_immediate (val64, info, which);
20187 }
20188 
20189 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
20190    has a step in the range of INDEX.  Return the index expression if so,
20191    otherwise return null.  */
20192 rtx
20193 aarch64_check_zero_based_sve_index_immediate (rtx x)
20194 {
20195   rtx base, step;
20196   if (const_vec_series_p (x, &base, &step)
20197       && base == const0_rtx
20198       && aarch64_sve_index_immediate_p (step))
20199     return step;
20200   return NULL_RTX;
20201 }
20202 
20203 /* Check whether immediate shift constants are within range.  */
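/* For example, for V4SI a left shift count must be in [0, 31] while a
   right shift count must be in [1, 32].  */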
20204 bool
20205 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
20206 {
20207   x = unwrap_const_vec_duplicate (x);
20208   if (!CONST_INT_P (x))
20209     return false;
20210   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
20211   if (left)
20212     return IN_RANGE (INTVAL (x), 0, bit_width - 1);
20213   else
20214     return IN_RANGE (INTVAL (x), 1, bit_width);
20215 }
20216 
20217 /* Return the bitmask CONST_INT to select the bits required by a zero extract
20218    operation of width WIDTH at bit position POS.  */
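/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */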
20219 
20220 rtx
20221 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
20222 {
20223   gcc_assert (CONST_INT_P (width));
20224   gcc_assert (CONST_INT_P (pos));
20225 
20226   unsigned HOST_WIDE_INT mask
20227     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
20228   return GEN_INT (mask << UINTVAL (pos));
20229 }
20230 
20231 bool
20232 aarch64_mov_operand_p (rtx x, machine_mode mode)
20233 {
20234   if (GET_CODE (x) == HIGH
20235       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
20236     return true;
20237 
20238   if (CONST_INT_P (x))
20239     return true;
20240 
20241   if (VECTOR_MODE_P (GET_MODE (x)))
20242     {
20243       /* Require predicate constants to be VNx16BI before RA, so that we
20244 	 force everything to have a canonical form.  */
20245       if (!lra_in_progress
20246 	  && !reload_completed
20247 	  && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
20248 	  && GET_MODE (x) != VNx16BImode)
20249 	return false;
20250 
20251       return aarch64_simd_valid_immediate (x, NULL);
20252     }
20253 
20254   x = strip_salt (x);
20255   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
20256     return true;
20257 
20258   if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
20259     return true;
20260 
20261   return aarch64_classify_symbolic_expression (x)
20262     == SYMBOL_TINY_ABSOLUTE;
20263 }
20264 
20265 /* Return a const_int vector of VAL.  */
20266 rtx
20267 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
20268 {
20269   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
20270   return gen_const_vec_duplicate (mode, c);
20271 }
20272 
20273 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
20274 
20275 bool
20276 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
20277 {
20278   machine_mode vmode;
20279 
20280   vmode = aarch64_simd_container_mode (mode, 64);
20281   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
20282   return aarch64_simd_valid_immediate (op_v, NULL);
20283 }
20284 
20285 /* Construct and return a PARALLEL RTX vector with elements numbering the
20286    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
20287    the vector - from the perspective of the architecture.  This does not
20288    line up with GCC's perspective on lane numbers, so we end up with
20289    different masks depending on our target endian-ness.  The diagram
20290    below may help.  We must draw the distinction when building masks
20291    which select one half of the vector.  An instruction selecting
20292    architectural low-lanes for a big-endian target, must be described using
20293    a mask selecting GCC high-lanes.
20294 
20295                  Big-Endian             Little-Endian
20296 
20297 GCC             0   1   2   3           3   2   1   0
20298               | x | x | x | x |       | x | x | x | x |
20299 Architecture    3   2   1   0           3   2   1   0
20300 
20301 Low Mask:         { 2, 3 }                { 0, 1 }
20302 High Mask:        { 0, 1 }                { 2, 3 }
20303 
20304    MODE is the mode of the vector and NUNITS is the number of units in it.  */
20305 
20306 rtx
20307 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
20308 {
20309   rtvec v = rtvec_alloc (nunits / 2);
20310   int high_base = nunits / 2;
20311   int low_base = 0;
20312   int base;
20313   rtx t1;
20314   int i;
20315 
20316   if (BYTES_BIG_ENDIAN)
20317     base = high ? low_base : high_base;
20318   else
20319     base = high ? high_base : low_base;
20320 
20321   for (i = 0; i < nunits / 2; i++)
20322     RTVEC_ELT (v, i) = GEN_INT (base + i);
20323 
20324   t1 = gen_rtx_PARALLEL (mode, v);
20325   return t1;
20326 }
20327 
20328 /* Check OP for validity as a PARALLEL RTX vector with elements
20329    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
20330    from the perspective of the architecture.  See the diagram above
20331    aarch64_simd_vect_par_cnst_half for more details.  */
20332 
20333 bool
20334 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
20335 				       bool high)
20336 {
20337   int nelts;
20338   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
20339     return false;
20340 
20341   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
20342   HOST_WIDE_INT count_op = XVECLEN (op, 0);
20343   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
20344   int i = 0;
20345 
20346   if (count_op != count_ideal)
20347     return false;
20348 
20349   for (i = 0; i < count_ideal; i++)
20350     {
20351       rtx elt_op = XVECEXP (op, 0, i);
20352       rtx elt_ideal = XVECEXP (ideal, 0, i);
20353 
20354       if (!CONST_INT_P (elt_op)
20355 	  || INTVAL (elt_ideal) != INTVAL (elt_op))
20356 	return false;
20357     }
20358   return true;
20359 }
20360 
20361 /* Return a PARALLEL containing NELTS elements, with element I equal
20362    to BASE + I * STEP.  */
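/* For example, NELTS == 4, BASE == 2 and STEP == 3 give
   (parallel [2 5 8 11]).  */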
20363 
20364 rtx
20365 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
20366 {
20367   rtvec vec = rtvec_alloc (nelts);
20368   for (unsigned int i = 0; i < nelts; ++i)
20369     RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
20370   return gen_rtx_PARALLEL (VOIDmode, vec);
20371 }
20372 
20373 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
20374    series with step STEP.  */
20375 
20376 bool
20377 aarch64_stepped_int_parallel_p (rtx op, int step)
20378 {
20379   if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
20380     return false;
20381 
20382   unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
20383   for (int i = 1; i < XVECLEN (op, 0); ++i)
20384     if (!CONST_INT_P (XVECEXP (op, 0, i))
20385 	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
20386       return false;
20387 
20388   return true;
20389 }
20390 
20391 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
20392    HIGH (exclusive).  */
20393 void
20394 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
20395 			  const_tree exp)
20396 {
20397   HOST_WIDE_INT lane;
20398   gcc_assert (CONST_INT_P (operand));
20399   lane = INTVAL (operand);
20400 
20401   if (lane < low || lane >= high)
20402   {
20403     if (exp)
20404       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
20405     else
20406       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
20407   }
20408 }
20409 
20410 /* Perform endian correction on lane number N, which indexes a vector
20411    of mode MODE, and return the result as an SImode rtx.  */
20412 
20413 rtx
20414 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
20415 {
20416   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
20417 }
20418 
20419 /* Return TRUE if OP is a valid vector addressing mode.  */
20420 
20421 bool
20422 aarch64_simd_mem_operand_p (rtx op)
20423 {
20424   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
20425 			|| REG_P (XEXP (op, 0)));
20426 }
20427 
20428 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
20429 
20430 bool
20431 aarch64_sve_ld1r_operand_p (rtx op)
20432 {
20433   struct aarch64_address_info addr;
20434   scalar_mode mode;
20435 
20436   return (MEM_P (op)
20437 	  && is_a <scalar_mode> (GET_MODE (op), &mode)
20438 	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
20439 	  && addr.type == ADDRESS_REG_IMM
20440 	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
20441 }
20442 
20443 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
20444    where the size of the read data is specified by `mode` and the size of the
20445    vector elements is specified by `elem_mode`.  */
20446 bool
20447 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
20448 				   scalar_mode elem_mode)
20449 {
20450   struct aarch64_address_info addr;
20451   if (!MEM_P (op)
20452       || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
20453     return false;
20454 
20455   if (addr.type == ADDRESS_REG_IMM)
20456     return offset_4bit_signed_scaled_p (mode, addr.const_offset);
20457 
20458   if (addr.type == ADDRESS_REG_REG)
20459     return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
20460 
20461   return false;
20462 }
20463 
20464 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
20465 bool
20466 aarch64_sve_ld1rq_operand_p (rtx op)
20467 {
20468   return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
20469 					    GET_MODE_INNER (GET_MODE (op)));
20470 }
20471 
20472 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
20473    accessing a vector where the element size is specified by `elem_mode`.  */
20474 bool
20475 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
20476 {
20477   return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
20478 }
20479 
20480 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction.  */
20481 bool
20482 aarch64_sve_ldff1_operand_p (rtx op)
20483 {
20484   if (!MEM_P (op))
20485     return false;
20486 
20487   struct aarch64_address_info addr;
20488   if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
20489     return false;
20490 
20491   if (addr.type == ADDRESS_REG_IMM)
20492     return known_eq (addr.const_offset, 0);
20493 
20494   return addr.type == ADDRESS_REG_REG;
20495 }
20496 
20497 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction.  */
20498 bool
20499 aarch64_sve_ldnf1_operand_p (rtx op)
20500 {
20501   struct aarch64_address_info addr;
20502 
20503   return (MEM_P (op)
20504 	  && aarch64_classify_address (&addr, XEXP (op, 0),
20505 				       GET_MODE (op), false)
20506 	  && addr.type == ADDRESS_REG_IMM);
20507 }
20508 
20509 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
20510    The conditions for STR are the same.  */
20511 bool
20512 aarch64_sve_ldr_operand_p (rtx op)
20513 {
20514   struct aarch64_address_info addr;
20515 
20516   return (MEM_P (op)
20517 	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
20518 				       false, ADDR_QUERY_ANY)
20519 	  && addr.type == ADDRESS_REG_IMM);
20520 }
20521 
20522 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
20523    addressing memory of mode MODE.  */
20524 bool
20525 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
20526 {
20527   struct aarch64_address_info addr;
20528   if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
20529     return false;
20530 
20531   if (addr.type == ADDRESS_REG_IMM)
20532     return offset_6bit_signed_scaled_p (mode, addr.const_offset);
20533 
20534   return addr.type == ADDRESS_REG_REG;
20535 }
20536 
20537 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
20538    We need to be able to access the individual pieces, so the range
20539    is different from LD[234] and ST[234].  */
20540 bool
20541 aarch64_sve_struct_memory_operand_p (rtx op)
20542 {
20543   if (!MEM_P (op))
20544     return false;
20545 
20546   machine_mode mode = GET_MODE (op);
20547   struct aarch64_address_info addr;
20548   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
20549 				 ADDR_QUERY_ANY)
20550       || addr.type != ADDRESS_REG_IMM)
20551     return false;
20552 
20553   poly_int64 first = addr.const_offset;
20554   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
20555   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
20556 	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
20557 }
20558 
20559 /* Emit a register copy from operand to operand, taking care not to
20560    early-clobber source registers in the process.
20561 
20562    COUNT is the number of components into which the copy needs to be
20563    decomposed.  */
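/* For example, an OImode copy from V1-V2 into V2-V3 must move V2 into V3
   before moving V1 into V2, which is what the second loop below does.  */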
20564 void
20565 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
20566 				unsigned int count)
20567 {
20568   unsigned int i;
20569   int rdest = REGNO (operands[0]);
20570   int rsrc = REGNO (operands[1]);
20571 
20572   if (!reg_overlap_mentioned_p (operands[0], operands[1])
20573       || rdest < rsrc)
20574     for (i = 0; i < count; i++)
20575       emit_move_insn (gen_rtx_REG (mode, rdest + i),
20576 		      gen_rtx_REG (mode, rsrc + i));
20577   else
20578     for (i = 0; i < count; i++)
20579       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
20580 		      gen_rtx_REG (mode, rsrc + count - i - 1));
20581 }
20582 
20583 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
20584    one of VSTRUCT modes: OI, CI, or XI.  */
20585 int
20586 aarch64_simd_attr_length_rglist (machine_mode mode)
20587 {
20588   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
20589   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
20590 }
20591 
20592 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
20593    alignment of a vector to 128 bits.  SVE predicates have an alignment of
20594    16 bits.  */
20595 static HOST_WIDE_INT
20596 aarch64_simd_vector_alignment (const_tree type)
20597 {
20598   /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
20599      be set for non-predicate vectors of booleans.  Modes are the most
20600      direct way we have of identifying real SVE predicate types.  */
20601   if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
20602     return 16;
20603   widest_int min_size
20604     = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
20605   return wi::umin (min_size, 128).to_uhwi ();
20606 }
20607 
20608 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
20609 static poly_uint64
20610 aarch64_vectorize_preferred_vector_alignment (const_tree type)
20611 {
20612   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
20613     {
20614       /* If the length of the vector is a fixed power of 2, try to align
20615 	 to that length, otherwise don't try to align at all.  */
20616       HOST_WIDE_INT result;
20617       if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
20618 	  || !pow2p_hwi (result))
20619 	result = TYPE_ALIGN (TREE_TYPE (type));
20620       return result;
20621     }
20622   return TYPE_ALIGN (type);
20623 }
20624 
20625 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
20626 static bool
20627 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
20628 {
20629   if (is_packed)
20630     return false;
20631 
20632   /* For fixed-length vectors, check that the vectorizer will aim for
20633      full-vector alignment.  This isn't true for generic GCC vectors
20634      that are wider than the ABI maximum of 128 bits.  */
20635   poly_uint64 preferred_alignment =
20636     aarch64_vectorize_preferred_vector_alignment (type);
20637   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
20638       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
20639 		   preferred_alignment))
20640     return false;
20641 
20642   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
20643   return true;
20644 }
20645 
20646 /* Return true if the vector misalignment factor is supported by the
20647    target.  */
20648 static bool
20649 aarch64_builtin_support_vector_misalignment (machine_mode mode,
20650 					     const_tree type, int misalignment,
20651 					     bool is_packed)
20652 {
20653   if (TARGET_SIMD && STRICT_ALIGNMENT)
20654     {
20655       /* Return if movmisalign pattern is not supported for this mode.  */
20656       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
20657         return false;
20658 
20659       /* Misalignment factor is unknown at compile time.  */
20660       if (misalignment == -1)
20661 	return false;
20662     }
20663   return default_builtin_support_vector_misalignment (mode, type, misalignment,
20664 						      is_packed);
20665 }
20666 
20667 /* If VALS is a vector constant that can be loaded into a register
20668    using DUP, generate instructions to do so and return an RTX to
20669    assign to the register.  Otherwise return NULL_RTX.  */
20670 static rtx
20671 aarch64_simd_dup_constant (rtx vals)
20672 {
20673   machine_mode mode = GET_MODE (vals);
20674   machine_mode inner_mode = GET_MODE_INNER (mode);
20675   rtx x;
20676 
20677   if (!const_vec_duplicate_p (vals, &x))
20678     return NULL_RTX;
20679 
20680   /* We can load this constant by using DUP and a constant in a
20681      single ARM register.  This will be cheaper than a vector
20682      load.  */
20683   x = copy_to_mode_reg (inner_mode, x);
20684   return gen_vec_duplicate (mode, x);
20685 }
20686 
20687 
20688 /* Generate code to load VALS, which is a PARALLEL containing only
20689    constants (for vec_init) or CONST_VECTOR, efficiently into a
20690    register.  Returns an RTX to copy into the register, or NULL_RTX
20691    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
20692 static rtx
20693 aarch64_simd_make_constant (rtx vals)
20694 {
20695   machine_mode mode = GET_MODE (vals);
20696   rtx const_dup;
20697   rtx const_vec = NULL_RTX;
20698   int n_const = 0;
20699   int i;
20700 
20701   if (GET_CODE (vals) == CONST_VECTOR)
20702     const_vec = vals;
20703   else if (GET_CODE (vals) == PARALLEL)
20704     {
20705       /* A CONST_VECTOR must contain only CONST_INTs and
20706 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
20707 	 Only store valid constants in a CONST_VECTOR.  */
20708       int n_elts = XVECLEN (vals, 0);
20709       for (i = 0; i < n_elts; ++i)
20710 	{
20711 	  rtx x = XVECEXP (vals, 0, i);
20712 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
20713 	    n_const++;
20714 	}
20715       if (n_const == n_elts)
20716 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
20717     }
20718   else
20719     gcc_unreachable ();
20720 
20721   if (const_vec != NULL_RTX
20722       && aarch64_simd_valid_immediate (const_vec, NULL))
20723     /* Load using MOVI/MVNI.  */
20724     return const_vec;
20725   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
20726     /* Loaded using DUP.  */
20727     return const_dup;
20728   else if (const_vec != NULL_RTX)
20729     /* Load from constant pool. We cannot take advantage of single-cycle
20730        LD1 because we need a PC-relative addressing mode.  */
20731     return const_vec;
20732   else
20733     /* A PARALLEL containing something not valid inside CONST_VECTOR.
20734        We cannot construct an initializer.  */
20735     return NULL_RTX;
20736 }
20737 
20738 /* Expand a vector initialisation sequence, such that TARGET is
20739    initialised to contain VALS.  */
20740 
20741 void
20742 aarch64_expand_vector_init (rtx target, rtx vals)
20743 {
20744   machine_mode mode = GET_MODE (target);
20745   scalar_mode inner_mode = GET_MODE_INNER (mode);
20746   /* The number of vector elements.  */
20747   int n_elts = XVECLEN (vals, 0);
20748   /* The number of vector elements which are not constant.  */
20749   int n_var = 0;
20750   rtx any_const = NULL_RTX;
20751   /* The first element of vals.  */
20752   rtx v0 = XVECEXP (vals, 0, 0);
20753   bool all_same = true;
20754 
20755   /* This is a special vec_init<M><N> where N is not an element mode but a
20756      vector mode with half the elements of M.  We expect to find two entries
20757      of mode N in VALS and we must put their concatenation into TARGET.  */
20758   if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
20759     {
20760       gcc_assert (known_eq (GET_MODE_SIZE (mode),
20761 		  2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
20762       rtx lo = XVECEXP (vals, 0, 0);
20763       rtx hi = XVECEXP (vals, 0, 1);
20764       machine_mode narrow_mode = GET_MODE (lo);
20765       gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
20766       gcc_assert (narrow_mode == GET_MODE (hi));
20767 
20768       /* When we want to concatenate a half-width vector with zeroes we can
20769 	 use the aarch64_combinez[_be] patterns.  Just make sure that the
20770 	 zeroes are in the right half.  */
20771       if (BYTES_BIG_ENDIAN
20772 	  && aarch64_simd_imm_zero (lo, narrow_mode)
20773 	  && general_operand (hi, narrow_mode))
20774 	emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
20775       else if (!BYTES_BIG_ENDIAN
20776 	       && aarch64_simd_imm_zero (hi, narrow_mode)
20777 	       && general_operand (lo, narrow_mode))
20778 	emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
20779       else
20780 	{
20781 	  /* Else create the two half-width registers and combine them.  */
20782 	  if (!REG_P (lo))
20783 	    lo = force_reg (GET_MODE (lo), lo);
20784 	  if (!REG_P (hi))
20785 	    hi = force_reg (GET_MODE (hi), hi);
20786 
20787 	  if (BYTES_BIG_ENDIAN)
20788 	    std::swap (lo, hi);
20789 	  emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
20790 	}
20791      return;
20792    }
20793 
20794   /* Count the number of variable elements to initialise.  */
20795   for (int i = 0; i < n_elts; ++i)
20796     {
20797       rtx x = XVECEXP (vals, 0, i);
20798       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
20799 	++n_var;
20800       else
20801 	any_const = x;
20802 
20803       all_same &= rtx_equal_p (x, v0);
20804     }
20805 
20806   /* No variable elements, hand off to aarch64_simd_make_constant which knows
20807      how best to handle this.  */
20808   if (n_var == 0)
20809     {
20810       rtx constant = aarch64_simd_make_constant (vals);
20811       if (constant != NULL_RTX)
20812 	{
20813 	  emit_move_insn (target, constant);
20814 	  return;
20815 	}
20816     }
20817 
20818   /* Splat a single non-constant element if we can.  */
20819   if (all_same)
20820     {
20821       rtx x = copy_to_mode_reg (inner_mode, v0);
20822       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
20823       return;
20824     }
20825 
20826   enum insn_code icode = optab_handler (vec_set_optab, mode);
20827   gcc_assert (icode != CODE_FOR_nothing);
20828 
20829   /* If there are only variable elements, try to optimize
20830      the insertion using dup for the most common element
20831      followed by insertions.  */
20832 
20833   /* The algorithm will fill matches[*][0] with the earliest matching element,
20834      and matches[X][1] with the count of duplicate elements (if X is the
20835      earliest element which has duplicates).  */
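  /* For example, for { x, y, x, x } the loop below sets matches[0][1] to 3
     and matches[2][0] == matches[3][0] == 0, so x is duplicated first and
     y is then inserted into lane 1.  */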
20836 
20837   if (n_var == n_elts && n_elts <= 16)
20838     {
20839       int matches[16][2] = {0};
20840       for (int i = 0; i < n_elts; i++)
20841 	{
20842 	  for (int j = 0; j <= i; j++)
20843 	    {
20844 	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
20845 		{
20846 		  matches[i][0] = j;
20847 		  matches[j][1]++;
20848 		  break;
20849 		}
20850 	    }
20851 	}
20852       int maxelement = 0;
20853       int maxv = 0;
20854       for (int i = 0; i < n_elts; i++)
20855 	if (matches[i][1] > maxv)
20856 	  {
20857 	    maxelement = i;
20858 	    maxv = matches[i][1];
20859 	  }
20860 
20861       /* Create a duplicate of the most common element, unless all elements
20862 	 are equally useless to us, in which case just immediately set the
20863 	 vector register using the first element.  */
20864 
20865       if (maxv == 1)
20866 	{
20867 	  /* For vectors of two 64-bit elements, we can do even better.  */
20868 	  if (n_elts == 2
20869 	      && (inner_mode == E_DImode
20870 		  || inner_mode == E_DFmode))
20871 
20872 	    {
20873 	      rtx x0 = XVECEXP (vals, 0, 0);
20874 	      rtx x1 = XVECEXP (vals, 0, 1);
20875 	      /* Combine can pick up this case, but handling it directly
20876 		 here leaves clearer RTL.
20877 
20878 		 This is load_pair_lanes<mode>, and also gives us a clean-up
20879 		 for store_pair_lanes<mode>.  */
20880 	      if (memory_operand (x0, inner_mode)
20881 		  && memory_operand (x1, inner_mode)
20882 		  && !STRICT_ALIGNMENT
20883 		  && rtx_equal_p (XEXP (x1, 0),
20884 				  plus_constant (Pmode,
20885 						 XEXP (x0, 0),
20886 						 GET_MODE_SIZE (inner_mode))))
20887 		{
20888 		  rtx t;
20889 		  if (inner_mode == DFmode)
20890 		    t = gen_load_pair_lanesdf (target, x0, x1);
20891 		  else
20892 		    t = gen_load_pair_lanesdi (target, x0, x1);
20893 		  emit_insn (t);
20894 		  return;
20895 		}
20896 	    }
20897 	  /* The subreg-move sequence below will move into lane zero of the
20898 	     vector register.  For big-endian we want that position to hold
20899 	     the last element of VALS.  */
20900 	  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
20901 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
20902 	  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
20903 	}
20904       else
20905 	{
20906 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
20907 	  aarch64_emit_move (target, gen_vec_duplicate (mode, x));
20908 	}
20909 
20910       /* Insert the rest.  */
20911       for (int i = 0; i < n_elts; i++)
20912 	{
20913 	  rtx x = XVECEXP (vals, 0, i);
20914 	  if (matches[i][0] == maxelement)
20915 	    continue;
20916 	  x = copy_to_mode_reg (inner_mode, x);
20917 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
20918 	}
20919       return;
20920     }
20921 
20922   /* Initialise a vector which is part-variable.  We want to first try
20923      to build those lanes which are constant in the most efficient way we
20924      can.  */
20925   if (n_var != n_elts)
20926     {
20927       rtx copy = copy_rtx (vals);
20928 
20929       /* Load constant part of vector.  We really don't care what goes into the
20930 	 parts we will overwrite, but we're more likely to be able to load the
20931 	 constant efficiently if it has fewer, larger, repeating parts
20932 	 (see aarch64_simd_valid_immediate).  */
20933       for (int i = 0; i < n_elts; i++)
20934 	{
20935 	  rtx x = XVECEXP (vals, 0, i);
20936 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
20937 	    continue;
20938 	  rtx subst = any_const;
20939 	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
20940 	    {
20941 	      /* Look in the copied vector, as more elements are const.  */
20942 	      rtx test = XVECEXP (copy, 0, i ^ bit);
20943 	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
20944 		{
20945 		  subst = test;
20946 		  break;
20947 		}
20948 	    }
20949 	  XVECEXP (copy, 0, i) = subst;
20950 	}
20951       aarch64_expand_vector_init (target, copy);
20952     }
20953 
20954   /* Insert the variable lanes directly.  */
20955   for (int i = 0; i < n_elts; i++)
20956     {
20957       rtx x = XVECEXP (vals, 0, i);
20958       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
20959 	continue;
20960       x = copy_to_mode_reg (inner_mode, x);
20961       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
20962     }
20963 }
20964 
20965 /* Emit RTL corresponding to:
20966    insr TARGET, ELEM.  */
20967 
20968 static void
20969 emit_insr (rtx target, rtx elem)
20970 {
20971   machine_mode mode = GET_MODE (target);
20972   scalar_mode elem_mode = GET_MODE_INNER (mode);
20973   elem = force_reg (elem_mode, elem);
20974 
20975   insn_code icode = optab_handler (vec_shl_insert_optab, mode);
20976   gcc_assert (icode != CODE_FOR_nothing);
20977   emit_insn (GEN_FCN (icode) (target, target, elem));
20978 }
20979 
20980 /* Subroutine of aarch64_sve_expand_vector_init for handling
20981    trailing constants.
20982    This function works as follows:
20983    (a) Create a new vector consisting of trailing constants.
20984    (b) Initialize TARGET with the constant vector using emit_move_insn.
20985    (c) Insert remaining elements in TARGET using insr.
20986    NELTS is the total number of elements in the original vector, while
20987    NELTS_REQD is the number of elements that are actually
20988    significant.
20989 
20990    ??? The heuristic used is to do the above only if the number of constants
20991    is at least half the total number of elements.  May need fine tuning.  */
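/* For example, for { a, b, 1, 2 } with NELTS_REQD == 4, the two trailing
   constants qualify: a constant vector starting with { 1, 2, ... } is moved
   into TARGET and b and a are then inserted with INSR.  */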
20992 
20993 static bool
20994 aarch64_sve_expand_vector_init_handle_trailing_constants
20995  (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
20996 {
20997   machine_mode mode = GET_MODE (target);
20998   scalar_mode elem_mode = GET_MODE_INNER (mode);
20999   int n_trailing_constants = 0;
21000 
21001   for (int i = nelts_reqd - 1;
21002        i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
21003        i--)
21004     n_trailing_constants++;
21005 
21006   if (n_trailing_constants >= nelts_reqd / 2)
21007     {
21008       /* Try to use the natural pattern of BUILDER to extend the trailing
21009 	 constant elements to a full vector.  Replace any variables in the
21010 	 extra elements with zeros.
21011 
21012 	 ??? It would be better if the builders supported "don't care"
21013 	     elements, with the builder filling in whichever elements
21014 	     give the most compact encoding.  */
21015       rtx_vector_builder v (mode, nelts, 1);
21016       for (int i = 0; i < nelts; i++)
21017 	{
21018 	  rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
21019 	  if (!valid_for_const_vector_p (elem_mode, x))
21020 	    x = const0_rtx;
21021 	  v.quick_push (x);
21022 	}
21023       rtx const_vec = v.build ();
21024       emit_move_insn (target, const_vec);
21025 
21026       for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
21027 	emit_insr (target, builder.elt (i));
21028 
21029       return true;
21030     }
21031 
21032   return false;
21033 }
21034 
21035 /* Subroutine of aarch64_sve_expand_vector_init.
21036    Works as follows:
21037    (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
21038    (b) Skip trailing elements from BUILDER, which are the same as
21039        element NELTS_REQD - 1.
21040    (c) Insert earlier elements in reverse order in TARGET using insr.  */
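/* For example, for { a, b, x, x } with NELTS_REQD == 4 this emits a DUP
   of x followed by an INSR of b and an INSR of a.  */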
21041 
21042 static void
21043 aarch64_sve_expand_vector_init_insert_elems (rtx target,
21044 					     const rtx_vector_builder &builder,
21045 					     int nelts_reqd)
21046 {
21047   machine_mode mode = GET_MODE (target);
21048   scalar_mode elem_mode = GET_MODE_INNER (mode);
21049 
21050   struct expand_operand ops[2];
21051   enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
21052   gcc_assert (icode != CODE_FOR_nothing);
21053 
21054   create_output_operand (&ops[0], target, mode);
21055   create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
21056   expand_insn (icode, 2, ops);
21057 
21058   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
21059   for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
21060     emit_insr (target, builder.elt (i));
21061 }
21062 
21063 /* Subroutine of aarch64_sve_expand_vector_init to handle case
21064    when all trailing elements of builder are same.
21065    This works as follows:
21066    (a) Use expand_insn interface to broadcast last vector element in TARGET.
21067    (b) Insert remaining elements in TARGET using insr.
21068 
21069    ??? The heuristic used is to do the above if the number of identical
21070    trailing elements is at least 3/4 of the total number of elements, loosely
21071    based on the heuristic from mostly_zeros_p.  May need fine-tuning.  */
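/* For example, { a, x, x, x } qualifies, since three of its four elements
   duplicate the last one; it is expanded as a DUP of x followed by an INSR
   of a.  */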
21072 
21073 static bool
21074 aarch64_sve_expand_vector_init_handle_trailing_same_elem
21075  (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
21076 {
21077   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
21078   if (ndups >= (3 * nelts_reqd) / 4)
21079     {
21080       aarch64_sve_expand_vector_init_insert_elems (target, builder,
21081 						   nelts_reqd - ndups + 1);
21082       return true;
21083     }
21084 
21085   return false;
21086 }
21087 
21088 /* Initialize register TARGET from BUILDER. NELTS is the constant number
21089    of elements in BUILDER.
21090 
21091    The function tries to initialize TARGET from BUILDER if it fits one
21092    of the special cases outlined below.
21093 
21094    Failing that, the function divides BUILDER into two sub-vectors:
21095    v_even = even elements of BUILDER;
21096    v_odd = odd elements of BUILDER;
21097 
21098    and recursively calls itself with v_even and v_odd.
21099 
21100    if (recursive call succeeded for v_even or v_odd)
21101      TARGET = zip (v_even, v_odd)
21102 
21103    The function returns true if it managed to build TARGET from BUILDER
21104    with one of the special cases, false otherwise.
21105 
21106    Example: {a, 1, b, 2, c, 3, d, 4}
21107 
21108    The vector gets divided into:
21109    v_even = {a, b, c, d}
21110    v_odd = {1, 2, 3, 4}
21111 
21112    aarch64_sve_expand_vector_init(v_odd) hits case 1 and
21113    initializes tmp2 from the constant vector v_odd using emit_move_insn.
21114 
21115    aarch64_sve_expand_vector_init(v_even) fails since v_even contains
21116    only 4 variable elements, so we construct tmp1 from v_even using insr:
21117    tmp1 = dup(d)
21118    insr tmp1, c
21119    insr tmp1, b
21120    insr tmp1, a
21121 
21122    And finally:
21123    TARGET = zip (tmp1, tmp2)
21124    which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */
21125 
21126 static bool
21127 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
21128 				int nelts, int nelts_reqd)
21129 {
21130   machine_mode mode = GET_MODE (target);
21131 
21132   /* Case 1: Vector contains trailing constants.  */
21133 
21134   if (aarch64_sve_expand_vector_init_handle_trailing_constants
21135        (target, builder, nelts, nelts_reqd))
21136     return true;
21137 
21138   /* Case 2: Vector contains leading constants.  */
21139 
21140   rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
21141   for (int i = 0; i < nelts_reqd; i++)
21142     rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
21143   rev_builder.finalize ();
21144 
21145   if (aarch64_sve_expand_vector_init_handle_trailing_constants
21146        (target, rev_builder, nelts, nelts_reqd))
21147     {
21148       emit_insn (gen_aarch64_sve_rev (mode, target, target));
21149       return true;
21150     }
21151 
21152   /* Case 3: Vector contains trailing same element.  */
21153 
21154   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
21155        (target, builder, nelts_reqd))
21156     return true;
21157 
21158   /* Case 4: Vector contains leading same element.  */
21159 
21160   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
21161        (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
21162     {
21163       emit_insn (gen_aarch64_sve_rev (mode, target, target));
21164       return true;
21165     }
21166 
21167   /* Avoid recursing below 4 elements.
21168      ??? The threshold 4 may need fine-tuning.  */
21169 
21170   if (nelts_reqd <= 4)
21171     return false;
21172 
21173   rtx_vector_builder v_even (mode, nelts, 1);
21174   rtx_vector_builder v_odd (mode, nelts, 1);
21175 
21176   for (int i = 0; i < nelts * 2; i += 2)
21177     {
21178       v_even.quick_push (builder.elt (i));
21179       v_odd.quick_push (builder.elt (i + 1));
21180     }
21181 
21182   v_even.finalize ();
21183   v_odd.finalize ();
21184 
21185   rtx tmp1 = gen_reg_rtx (mode);
21186   bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
21187 						    nelts, nelts_reqd / 2);
21188 
21189   rtx tmp2 = gen_reg_rtx (mode);
21190   bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
21191 						   nelts, nelts_reqd / 2);
21192 
21193   if (!did_even_p && !did_odd_p)
21194     return false;
21195 
21196   /* Use INSR to initialize whichever of tmp1 and tmp2 did not match any
21197      of the special cases, then zip the two halves together.  */
21198 
21199   if (!did_even_p)
21200     aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
21201 
21202   if (!did_odd_p)
21203     aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
21204 
21205   rtvec v = gen_rtvec (2, tmp1, tmp2);
21206   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
21207   return true;
21208 }
21209 
21210 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
21211 
21212 void
21213 aarch64_sve_expand_vector_init (rtx target, rtx vals)
21214 {
21215   machine_mode mode = GET_MODE (target);
21216   int nelts = XVECLEN (vals, 0);
21217 
21218   rtx_vector_builder v (mode, nelts, 1);
21219   for (int i = 0; i < nelts; i++)
21220     v.quick_push (XVECEXP (vals, 0, i));
21221   v.finalize ();
21222 
21223   /* If neither sub-vector of v could be initialized specially,
21224      then use INSR to insert all elements from v into TARGET.
21225      ??? This might not be optimal for vectors with large
21226      initializers (16 elements or more).
21227      For nelts < 4, it probably isn't useful to handle specially.  */
21228 
21229   if (nelts < 4
21230       || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
21231     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
21232 }
21233 
21234 /* Check whether VALUE is a vector constant in which every element
21235    is either a power of 2 or a negated power of 2.  If so, return
21236    a constant vector of log2s, and flip CODE between PLUS and MINUS
21237    if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
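/* For example, { 4, 4, 4, 4 } is converted to the shift vector { 2, 2, 2, 2 }
   with CODE left unchanged, while { -8, -8, -8, -8 } gives { 3, 3, 3, 3 }
   and flips CODE between PLUS and MINUS.  A vector mixing negated and
   non-negated powers of 2, such as { 4, -4 }, is rejected.  */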
21238 
21239 static rtx
21240 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
21241 {
21242   if (GET_CODE (value) != CONST_VECTOR)
21243     return NULL_RTX;
21244 
21245   rtx_vector_builder builder;
21246   if (!builder.new_unary_operation (GET_MODE (value), value, false))
21247     return NULL_RTX;
21248 
21249   scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
21250   /* 1 if the result of the multiplication must be negated,
21251      0 if it mustn't, or -1 if we don't yet care.  */
21252   int negate = -1;
21253   unsigned int encoded_nelts = const_vector_encoded_nelts (value);
21254   for (unsigned int i = 0; i < encoded_nelts; ++i)
21255     {
21256       rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
21257       if (!CONST_SCALAR_INT_P (elt))
21258 	return NULL_RTX;
21259       rtx_mode_t val (elt, int_mode);
21260       wide_int pow2 = wi::neg (val);
21261       if (val != pow2)
21262 	{
21263 	  /* It matters whether we negate or not.  Make that choice,
21264 	     and make sure that it's consistent with previous elements.  */
21265 	  if (negate == !wi::neg_p (val))
21266 	    return NULL_RTX;
21267 	  negate = wi::neg_p (val);
21268 	  if (!negate)
21269 	    pow2 = val;
21270 	}
21271       /* POW2 is now the value that we want to be a power of 2.  */
21272       int shift = wi::exact_log2 (pow2);
21273       if (shift < 0)
21274 	return NULL_RTX;
21275       builder.quick_push (gen_int_mode (shift, int_mode));
21276     }
21277   if (negate == -1)
21278     /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
21279     code = PLUS;
21280   else if (negate == 1)
21281     code = code == PLUS ? MINUS : PLUS;
21282   return builder.build ();
21283 }
21284 
21285 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
21286    CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
21287    operands array, in the same order as for fma_optab.  Return true if
21288    the function emitted all the necessary instructions, false if the caller
21289    should generate the pattern normally with the new OPERANDS array.  */
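/* For example, an FMA whose multiplier operand is the constant vector
   { 4, 4, ... } is expanded here as a vector shift followed by an
   addition, roughly dest = addend + (src << 2), instead of forcing the
   multiplier into a register and using a multiply-add instruction.  */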
21290 
21291 bool
21292 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
21293 {
21294   machine_mode mode = GET_MODE (operands[0]);
21295   if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
21296     {
21297       rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
21298 				  NULL_RTX, true, OPTAB_DIRECT);
21299       force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
21300 			  operands[3], product, operands[0], true,
21301 			  OPTAB_DIRECT);
21302       return true;
21303     }
21304   operands[2] = force_reg (mode, operands[2]);
21305   return false;
21306 }
21307 
21308 /* Likewise, but for a conditional pattern.  */
21309 
21310 bool
21311 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
21312 {
21313   machine_mode mode = GET_MODE (operands[0]);
21314   if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
21315     {
21316       rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
21317 				  NULL_RTX, true, OPTAB_DIRECT);
21318       emit_insn (gen_cond (code, mode, operands[0], operands[1],
21319 			   operands[4], product, operands[5]));
21320       return true;
21321     }
21322   operands[3] = force_reg (mode, operands[3]);
21323   return false;
21324 }
21325 
21326 static unsigned HOST_WIDE_INT
21327 aarch64_shift_truncation_mask (machine_mode mode)
21328 {
21329   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
21330     return 0;
21331   return GET_MODE_UNIT_BITSIZE (mode) - 1;
21332 }
21333 
21334 /* Select a format to encode pointers in exception handling data.  */
21335 int
21336 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
21337 {
21338    int type;
21339    switch (aarch64_cmodel)
21340      {
21341      case AARCH64_CMODEL_TINY:
21342      case AARCH64_CMODEL_TINY_PIC:
21343      case AARCH64_CMODEL_SMALL:
21344      case AARCH64_CMODEL_SMALL_PIC:
21345      case AARCH64_CMODEL_SMALL_SPIC:
21346        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
21347 	  for everything.  */
21348        type = DW_EH_PE_sdata4;
21349        break;
21350      default:
21351        /* No assumptions here.  8-byte relocs required.  */
21352        type = DW_EH_PE_sdata8;
21353        break;
21354      }
21355    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
21356 }
21357 
21358 /* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
21359 
21360 static void
21361 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
21362 {
21363   if (TREE_CODE (decl) == FUNCTION_DECL)
21364     {
21365       arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
21366       if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
21367 	{
21368 	  fprintf (stream, "\t.variant_pcs\t");
21369 	  assemble_name (stream, name);
21370 	  fprintf (stream, "\n");
21371 	}
21372     }
21373 }
21374 
21375 /* The last .arch and .tune assembly strings that we printed.  */
21376 static std::string aarch64_last_printed_arch_string;
21377 static std::string aarch64_last_printed_tune_string;
21378 
21379 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
21380    by the function fndecl.  */
21381 
21382 void
21383 aarch64_declare_function_name (FILE *stream, const char* name,
21384 				tree fndecl)
21385 {
21386   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
21387 
21388   struct cl_target_option *targ_options;
21389   if (target_parts)
21390     targ_options = TREE_TARGET_OPTION (target_parts);
21391   else
21392     targ_options = TREE_TARGET_OPTION (target_option_current_node);
21393   gcc_assert (targ_options);
21394 
21395   const struct processor *this_arch
21396     = aarch64_get_arch (targ_options->x_explicit_arch);
21397 
21398   uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
21399   std::string extension
21400     = aarch64_get_extension_string_for_isa_flags (isa_flags,
21401 						  this_arch->flags);
21402   /* Only update the assembler .arch string if it is distinct from the last
21403      such string we printed.  */
21404   std::string to_print = this_arch->name + extension;
21405   if (to_print != aarch64_last_printed_arch_string)
21406     {
21407       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
21408       aarch64_last_printed_arch_string = to_print;
21409     }
21410 
21411   /* Print the cpu name we're tuning for in the comments, as it might be
21412      useful to readers of the generated asm.  Do it only when it changes
21413      from function to function and verbose assembly is requested.  */
21414   const struct processor *this_tune
21415     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
21416 
21417   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
21418     {
21419       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
21420 		   this_tune->name);
21421       aarch64_last_printed_tune_string = this_tune->name;
21422     }
21423 
21424   aarch64_asm_output_variant_pcs (stream, fndecl, name);
21425 
21426   /* Don't forget the type directive for ELF.  */
21427   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
21428   ASM_OUTPUT_LABEL (stream, name);
21429 
21430   cfun->machine->label_is_assembled = true;
21431 }
21432 
21433 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY.  Check if the patch area is after
21434    the function label and emit a BTI if necessary.  */
21435 
21436 void
21437 aarch64_print_patchable_function_entry (FILE *file,
21438 					unsigned HOST_WIDE_INT patch_area_size,
21439 					bool record_p)
21440 {
21441   if (cfun->machine->label_is_assembled
21442       && aarch64_bti_enabled ()
21443       && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
21444     {
21445       /* Remove the BTI that follows the patch area and insert a new BTI
21446 	 before the patch area right after the function label.  */
21447       rtx_insn *insn = next_real_nondebug_insn (get_insns ());
21448       if (insn
21449 	  && INSN_P (insn)
21450 	  && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
21451 	  && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
21452 	delete_insn (insn);
21453       asm_fprintf (file, "\thint\t34 // bti c\n");
21454     }
21455 
21456   default_print_patchable_function_entry (file, patch_area_size, record_p);
21457 }
21458 
21459 /* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */
21460 
21461 void
21462 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
21463 {
21464   const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
21465   const char *value = IDENTIFIER_POINTER (target);
21466   aarch64_asm_output_variant_pcs (stream, decl, name);
21467   ASM_OUTPUT_DEF (stream, name, value);
21468 }
21469 
21470 /* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
21471    function symbol references.  */
21472 
21473 void
21474 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
21475 {
21476   default_elf_asm_output_external (stream, decl, name);
21477   aarch64_asm_output_variant_pcs (stream, decl, name);
21478 }
21479 
21480 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
21481    Used to output the .cfi_b_key_frame directive when signing the current
21482    function with the B key.  */
21483 
21484 void
21485 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
21486 {
21487   if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
21488       && aarch64_ra_sign_key == AARCH64_KEY_B)
21489 	asm_fprintf (f, "\t.cfi_b_key_frame\n");
21490 }
21491 
21492 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
21493 
21494 static void
21495 aarch64_start_file (void)
21496 {
21497   struct cl_target_option *default_options
21498     = TREE_TARGET_OPTION (target_option_default_node);
21499 
21500   const struct processor *default_arch
21501     = aarch64_get_arch (default_options->x_explicit_arch);
21502   uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
21503   std::string extension
21504     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
21505 						  default_arch->flags);
21506 
21507    aarch64_last_printed_arch_string = default_arch->name + extension;
21508    aarch64_last_printed_tune_string = "";
21509    asm_fprintf (asm_out_file, "\t.arch %s\n",
21510 		aarch64_last_printed_arch_string.c_str ());
21511 
21512    default_file_start ();
21513 }
21514 
21515 /* Emit load exclusive.  */
21516 
21517 static void
21518 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
21519 			     rtx mem, rtx model_rtx)
21520 {
21521   if (mode == TImode)
21522     emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
21523 						gen_highpart (DImode, rval),
21524 						mem, model_rtx));
21525   else
21526     emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
21527 }
21528 
21529 /* Emit store exclusive.  */
21530 
21531 static void
21532 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
21533 			      rtx mem, rtx rval, rtx model_rtx)
21534 {
21535   if (mode == TImode)
21536     emit_insn (gen_aarch64_store_exclusive_pair
21537 	       (bval, mem, operand_subword (rval, 0, 0, TImode),
21538 		operand_subword (rval, 1, 0, TImode), model_rtx));
21539   else
21540     emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
21541 }
21542 
21543 /* Mark the previous jump instruction as unlikely.  */
21544 
21545 static void
21546 aarch64_emit_unlikely_jump (rtx insn)
21547 {
21548   rtx_insn *jump = emit_jump_insn (insn);
21549   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
21550 }
21551 
21552 /* We store the names of the various atomic helpers in a 5x4 array.
21553    Return the libcall function given MODE, MODEL and NAMES.  */
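/* For example, a 4-byte CAS with MEMMODEL_ACQUIRE maps to mode_idx 2 and
   model_idx 1, selecting the out-of-line libgcc helper
   "__aarch64_cas4_acq" from the tables defined below.  */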
21554 
21555 rtx
21556 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
21557 			const atomic_ool_names *names)
21558 {
21559   memmodel model = memmodel_base (INTVAL (model_rtx));
21560   int mode_idx, model_idx;
21561 
21562   switch (mode)
21563     {
21564     case E_QImode:
21565       mode_idx = 0;
21566       break;
21567     case E_HImode:
21568       mode_idx = 1;
21569       break;
21570     case E_SImode:
21571       mode_idx = 2;
21572       break;
21573     case E_DImode:
21574       mode_idx = 3;
21575       break;
21576     case E_TImode:
21577       mode_idx = 4;
21578       break;
21579     default:
21580       gcc_unreachable ();
21581     }
21582 
21583   switch (model)
21584     {
21585     case MEMMODEL_RELAXED:
21586       model_idx = 0;
21587       break;
21588     case MEMMODEL_CONSUME:
21589     case MEMMODEL_ACQUIRE:
21590       model_idx = 1;
21591       break;
21592     case MEMMODEL_RELEASE:
21593       model_idx = 2;
21594       break;
21595     case MEMMODEL_ACQ_REL:
21596     case MEMMODEL_SEQ_CST:
21597       model_idx = 3;
21598       break;
21599     default:
21600       gcc_unreachable ();
21601     }
21602 
21603   return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
21604 				      VISIBILITY_HIDDEN);
21605 }
21606 
21607 #define DEF0(B, N) \
21608   { "__aarch64_" #B #N "_relax", \
21609     "__aarch64_" #B #N "_acq", \
21610     "__aarch64_" #B #N "_rel", \
21611     "__aarch64_" #B #N "_acq_rel" }
21612 
21613 #define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
21614 		 { NULL, NULL, NULL, NULL }
21615 #define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
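/* Only CAS has 16-byte out-of-line helpers, so its table uses DEF5 while
   the remaining operations use DEF4, whose fifth (TImode) row is all
   NULLs.  */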
21616 
21617 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
21618 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
21619 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
21620 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
21621 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
21622 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
21623 
21624 #undef DEF0
21625 #undef DEF4
21626 #undef DEF5
21627 
21628 /* Expand a compare and swap pattern.  */
21629 
21630 void
21631 aarch64_expand_compare_and_swap (rtx operands[])
21632 {
21633   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
21634   machine_mode mode, r_mode;
21635 
21636   bval = operands[0];
21637   rval = operands[1];
21638   mem = operands[2];
21639   oldval = operands[3];
21640   newval = operands[4];
21641   is_weak = operands[5];
21642   mod_s = operands[6];
21643   mod_f = operands[7];
21644   mode = GET_MODE (mem);
21645 
21646   /* Normally the succ memory model must be stronger than fail, but in the
21647      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
21648      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
21649   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
21650       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
21651     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
21652 
21653   r_mode = mode;
21654   if (mode == QImode || mode == HImode)
21655     {
21656       r_mode = SImode;
21657       rval = gen_reg_rtx (r_mode);
21658     }
21659 
21660   if (TARGET_LSE)
21661     {
21662       /* The CAS insn requires oldval and rval to overlap, but we need to
21663 	 have a copy of oldval saved across the operation to tell if
21664 	 the operation is successful.  */
21665       if (reg_overlap_mentioned_p (rval, oldval))
21666         rval = copy_to_mode_reg (r_mode, oldval);
21667       else
21668 	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
21669 
21670       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
21671 						   newval, mod_s));
21672       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
21673     }
21674   else if (TARGET_OUTLINE_ATOMICS)
21675     {
21676       /* Oldval must satisfy compare afterward.  */
21677       if (!aarch64_plus_operand (oldval, mode))
21678 	oldval = force_reg (mode, oldval);
21679       rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
21680       rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
21681 				      oldval, mode, newval, mode,
21682 				      XEXP (mem, 0), Pmode);
21683       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
21684     }
21685   else
21686     {
21687       /* The oldval predicate varies by mode.  Test it and force to reg.  */
21688       insn_code code = code_for_aarch64_compare_and_swap (mode);
21689       if (!insn_data[code].operand[2].predicate (oldval, mode))
21690 	oldval = force_reg (mode, oldval);
21691 
21692       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
21693 				 is_weak, mod_s, mod_f));
21694       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
21695     }
21696 
21697   if (r_mode != mode)
21698     rval = gen_lowpart (mode, rval);
21699   emit_move_insn (operands[1], rval);
21700 
21701   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
21702   emit_insn (gen_rtx_SET (bval, x));
21703 }
21704 
21705 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
21706    sequence implementing an atomic operation.  */
21707 
21708 static void
21709 aarch64_emit_post_barrier (enum memmodel model)
21710 {
21711   const enum memmodel base_model = memmodel_base (model);
21712 
21713   if (is_mm_sync (model)
21714       && (base_model == MEMMODEL_ACQUIRE
21715 	  || base_model == MEMMODEL_ACQ_REL
21716 	  || base_model == MEMMODEL_SEQ_CST))
21717     {
21718       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
21719     }
21720 }
21721 
21722 /* Split a compare and swap pattern.  */
21723 
21724 void
21725 aarch64_split_compare_and_swap (rtx operands[])
21726 {
21727   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
21728   gcc_assert (epilogue_completed);
21729 
21730   rtx rval, mem, oldval, newval, scratch, x, model_rtx;
21731   machine_mode mode;
21732   bool is_weak;
21733   rtx_code_label *label1, *label2;
21734   enum memmodel model;
21735 
21736   rval = operands[0];
21737   mem = operands[1];
21738   oldval = operands[2];
21739   newval = operands[3];
21740   is_weak = (operands[4] != const0_rtx);
21741   model_rtx = operands[5];
21742   scratch = operands[7];
21743   mode = GET_MODE (mem);
21744   model = memmodel_from_int (INTVAL (model_rtx));
21745 
21746   /* When OLDVAL is zero and we want the strong version we can emit a tighter
21747     loop:
21748     .label1:
21749 	LD[A]XR	rval, [mem]
21750 	CBNZ	rval, .label2
21751 	ST[L]XR	scratch, newval, [mem]
21752 	CBNZ	scratch, .label1
21753     .label2:
21754 	CMP	rval, 0.  */
21755   bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
21756 			oldval == const0_rtx && mode != TImode);
21757 
21758   label1 = NULL;
21759   if (!is_weak)
21760     {
21761       label1 = gen_label_rtx ();
21762       emit_label (label1);
21763     }
21764   label2 = gen_label_rtx ();
21765 
21766   /* The initial load can be relaxed for a __sync operation since a final
21767      barrier will be emitted to stop code hoisting.  */
21768   if (is_mm_sync (model))
21769     aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
21770   else
21771     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
21772 
21773   if (strong_zero_p)
21774     x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
21775   else
21776     {
21777       rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
21778       x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
21779     }
21780   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21781 			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
21782   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21783 
21784   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
21785 
21786   if (!is_weak)
21787     {
21788       if (aarch64_track_speculation)
21789 	{
21790 	  /* Emit an explicit compare instruction, so that we can correctly
21791 	     track the condition codes.  */
21792 	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
21793 	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
21794 	}
21795       else
21796 	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
21797 
21798       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21799 				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
21800       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21801     }
21802   else
21803     aarch64_gen_compare_reg (NE, scratch, const0_rtx);
21804 
21805   emit_label (label2);
21806 
21807   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
21808      to set the condition flags.  If this is not used it will be removed by
21809      later passes.  */
21810   if (strong_zero_p)
21811     aarch64_gen_compare_reg (NE, rval, const0_rtx);
21812 
21813   /* Emit any final barrier needed for a __sync operation.  */
21814   if (is_mm_sync (model))
21815     aarch64_emit_post_barrier (model);
21816 }
21817 
21818 /* Split an atomic operation.  */
21819 
21820 void
21821 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
21822 			 rtx value, rtx model_rtx, rtx cond)
21823 {
21824   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
21825   gcc_assert (epilogue_completed);
21826 
21827   machine_mode mode = GET_MODE (mem);
21828   machine_mode wmode = (mode == DImode ? DImode : SImode);
21829   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
21830   const bool is_sync = is_mm_sync (model);
21831   rtx_code_label *label;
21832   rtx x;
21833 
21834   /* Split the atomic operation into a sequence.  */
21835   label = gen_label_rtx ();
21836   emit_label (label);
21837 
21838   if (new_out)
21839     new_out = gen_lowpart (wmode, new_out);
21840   if (old_out)
21841     old_out = gen_lowpart (wmode, old_out);
21842   else
21843     old_out = new_out;
21844   value = simplify_gen_subreg (wmode, value, mode, 0);
21845 
21846   /* The initial load can be relaxed for a __sync operation since a final
21847      barrier will be emitted to stop code hoisting.  */
21848   if (is_sync)
21849     aarch64_emit_load_exclusive (mode, old_out, mem,
21850 				 GEN_INT (MEMMODEL_RELAXED));
21851   else
21852     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
21853 
21854   switch (code)
21855     {
21856     case SET:
21857       new_out = value;
21858       break;
21859 
21860     case NOT:
21861       x = gen_rtx_AND (wmode, old_out, value);
21862       emit_insn (gen_rtx_SET (new_out, x));
21863       x = gen_rtx_NOT (wmode, new_out);
21864       emit_insn (gen_rtx_SET (new_out, x));
21865       break;
21866 
21867     case MINUS:
21868       if (CONST_INT_P (value))
21869 	{
21870 	  value = GEN_INT (-UINTVAL (value));
21871 	  code = PLUS;
21872 	}
21873       /* Fall through.  */
21874 
21875     default:
21876       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
21877       emit_insn (gen_rtx_SET (new_out, x));
21878       break;
21879     }
21880 
21881   aarch64_emit_store_exclusive (mode, cond, mem,
21882 				gen_lowpart (mode, new_out), model_rtx);
21883 
21884   if (aarch64_track_speculation)
21885     {
21886       /* Emit an explicit compare instruction, so that we can correctly
21887 	 track the condition codes.  */
21888       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
21889       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
21890     }
21891   else
21892     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
21893 
21894   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21895 			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
21896   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21897 
21898   /* Emit any final barrier needed for a __sync operation.  */
21899   if (is_sync)
21900     aarch64_emit_post_barrier (model);
21901 }
21902 
21903 static void
21904 aarch64_init_libfuncs (void)
21905 {
21906   /* Half-precision float operations.  The compiler handles all operations
21907      with NULL libfuncs by converting to SFmode.  */
21908 
21909   /* Conversions.  */
21910   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
21911   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
21912 
21913   /* Arithmetic.  */
21914   set_optab_libfunc (add_optab, HFmode, NULL);
21915   set_optab_libfunc (sdiv_optab, HFmode, NULL);
21916   set_optab_libfunc (smul_optab, HFmode, NULL);
21917   set_optab_libfunc (neg_optab, HFmode, NULL);
21918   set_optab_libfunc (sub_optab, HFmode, NULL);
21919 
21920   /* Comparisons.  */
21921   set_optab_libfunc (eq_optab, HFmode, NULL);
21922   set_optab_libfunc (ne_optab, HFmode, NULL);
21923   set_optab_libfunc (lt_optab, HFmode, NULL);
21924   set_optab_libfunc (le_optab, HFmode, NULL);
21925   set_optab_libfunc (ge_optab, HFmode, NULL);
21926   set_optab_libfunc (gt_optab, HFmode, NULL);
21927   set_optab_libfunc (unord_optab, HFmode, NULL);
21928 }
21929 
21930 /* Target hook for c_mode_for_suffix.  */
21931 static machine_mode
21932 aarch64_c_mode_for_suffix (char suffix)
21933 {
21934   if (suffix == 'q')
21935     return TFmode;
21936 
21937   return VOIDmode;
21938 }
21939 
21940 /* We can only represent floating point constants which will fit in
21941    "quarter-precision" values.  These values are characterised by
21942    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
21943    by:
21944 
21945    (-1)^s * (n/16) * 2^r
21946 
21947    Where:
21948      's' is the sign bit.
21949      'n' is an integer in the range 16 <= n <= 31.
21950      'r' is an integer in the range -3 <= r <= 4.  */
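/* For example, 1.0 = (16/16) * 2^0, 1.5 = (24/16) * 2^0 and
   31.0 = (31/16) * 2^4 are representable, whereas 0.1 (not exact in
   binary) and 64.0 (exponent out of range) are not.  */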
21951 
21952 /* Return true iff X can be represented as a quarter-precision
21953    floating point immediate operand.  Note, we cannot represent 0.0.  */
21954 bool
21955 aarch64_float_const_representable_p (rtx x)
21956 {
21957   /* This represents our current view of how many bits
21958      make up the mantissa.  */
21959   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
21960   int exponent;
21961   unsigned HOST_WIDE_INT mantissa, mask;
21962   REAL_VALUE_TYPE r, m;
21963   bool fail;
21964 
21965   x = unwrap_const_vec_duplicate (x);
21966   if (!CONST_DOUBLE_P (x))
21967     return false;
21968 
21969   if (GET_MODE (x) == VOIDmode
21970       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
21971     return false;
21972 
21973   r = *CONST_DOUBLE_REAL_VALUE (x);
21974 
21975   /* We cannot represent infinities, NaNs or +/-zero.  We won't
21976      know if we have +zero until we analyse the mantissa, but we
21977      can reject the other invalid values.  */
21978   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
21979       || REAL_VALUE_MINUS_ZERO (r))
21980     return false;
21981 
21982   /* Extract exponent.  */
21983   r = real_value_abs (&r);
21984   exponent = REAL_EXP (&r);
21985 
21986   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
21987      highest (sign) bit, with a fixed binary point at bit point_pos.
21988      m1 holds the low part of the mantissa, m2 the high part.
21989      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
21990      bits for the mantissa, this can fail (low bits will be lost).  */
21991   real_ldexp (&m, &r, point_pos - exponent);
21992   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
21993 
21994   /* If the low part of the mantissa has bits set we cannot represent
21995      the value.  */
21996   if (w.ulow () != 0)
21997     return false;
21998   /* We have rejected the lower HOST_WIDE_INT, so update our
21999      understanding of how many bits lie in the mantissa and
22000      look only at the high HOST_WIDE_INT.  */
22001   mantissa = w.elt (1);
22002   point_pos -= HOST_BITS_PER_WIDE_INT;
22003 
22004   /* We can only represent values with a mantissa of the form 1.xxxx.  */
22005   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
22006   if ((mantissa & mask) != 0)
22007     return false;
22008 
22009   /* Having filtered unrepresentable values, we may now remove all
22010      but the highest 5 bits.  */
22011   mantissa >>= point_pos - 5;
22012 
22013   /* We cannot represent the value 0.0, so reject it.  This is handled
22014      elsewhere.  */
22015   if (mantissa == 0)
22016     return false;
22017 
22018   /* Then, as bit 4 is always set, we can mask it off, leaving
22019      the mantissa in the range [0, 15].  */
22020   mantissa &= ~(1 << 4);
22021   gcc_assert (mantissa <= 15);
22022 
22023   /* GCC internally does not use IEEE754-like encoding (where normalized
22024      significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
22025      Our mantissa values are shifted 4 places to the left relative to
22026      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
22027      by 5 places to correct for GCC's representation.  */
22028   exponent = 5 - exponent;
22029 
22030   return (exponent >= 0 && exponent <= 7);
22031 }
22032 
22033 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
22034    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
22035    output MOVI/MVNI, ORR or BIC immediate.  */
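/* For example, a V4SI constant with each element equal to 1 produces a
   template along the lines of "movi\t%0.4s, 0x1", and one with each
   element equal to 0xab00 something like "movi\t%0.4s, 0xab, lsl 8"
   (illustrative; the exact text comes from the snprintf calls below).  */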
22036 char*
22037 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
22038 				   enum simd_immediate_check which)
22039 {
22040   bool is_valid;
22041   static char templ[40];
22042   const char *mnemonic;
22043   const char *shift_op;
22044   unsigned int lane_count = 0;
22045   char element_char;
22046 
22047   struct simd_immediate_info info;
22048 
22049   /* This will return true to show const_vector is legal for use as either
22050      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
22051      It will also update INFO to show how the immediate should be generated.
22052      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
22053   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
22054   gcc_assert (is_valid);
22055 
22056   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
22057   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
22058 
22059   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
22060     {
22061       gcc_assert (info.insn == simd_immediate_info::MOV
22062 		  && info.u.mov.shift == 0);
22063       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
22064 	 move immediate path.  */
22065       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
22066         info.u.mov.value = GEN_INT (0);
22067       else
22068 	{
22069 	  const unsigned int buf_size = 20;
22070 	  char float_buf[buf_size] = {'\0'};
22071 	  real_to_decimal_for_mode (float_buf,
22072 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
22073 				    buf_size, buf_size, 1, info.elt_mode);
22074 
22075 	  if (lane_count == 1)
22076 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
22077 	  else
22078 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
22079 		      lane_count, element_char, float_buf);
22080 	  return templ;
22081 	}
22082     }
22083 
22084   gcc_assert (CONST_INT_P (info.u.mov.value));
22085 
22086   if (which == AARCH64_CHECK_MOV)
22087     {
22088       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
22089       shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
22090 		  ? "msl" : "lsl");
22091       if (lane_count == 1)
22092 	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
22093 		  mnemonic, UINTVAL (info.u.mov.value));
22094       else if (info.u.mov.shift)
22095 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
22096 		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
22097 		  element_char, UINTVAL (info.u.mov.value), shift_op,
22098 		  info.u.mov.shift);
22099       else
22100 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
22101 		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
22102 		  element_char, UINTVAL (info.u.mov.value));
22103     }
22104   else
22105     {
22106       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
22107       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
22108       if (info.u.mov.shift)
22109 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
22110 		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
22111 		  element_char, UINTVAL (info.u.mov.value), "lsl",
22112 		  info.u.mov.shift);
22113       else
22114 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
22115 		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
22116 		  element_char, UINTVAL (info.u.mov.value));
22117     }
22118   return templ;
22119 }
22120 
22121 char*
22122 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
22123 {
22124 
22125   /* If a floating point number was passed and we desire to use it in an
22126      integer mode, do the conversion to integer.  */
22127   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
22128     {
22129       unsigned HOST_WIDE_INT ival;
22130       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
22131 	  gcc_unreachable ();
22132       immediate = gen_int_mode (ival, mode);
22133     }
22134 
22135   machine_mode vmode;
22136   /* Use a 64-bit container mode for everything except DI/DF mode, where we
22137      use a 128-bit vector mode.  */
22138   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
22139 
22140   vmode = aarch64_simd_container_mode (mode, width);
22141   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
22142   return aarch64_output_simd_mov_immediate (v_op, width);
22143 }
22144 
22145 /* Return the output string to use for moving immediate CONST_VECTOR
22146    into an SVE register.  */
22147 
22148 char *
22149 aarch64_output_sve_mov_immediate (rtx const_vector)
22150 {
22151   static char templ[40];
22152   struct simd_immediate_info info;
22153   char element_char;
22154 
22155   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
22156   gcc_assert (is_valid);
22157 
22158   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
22159 
22160   machine_mode vec_mode = GET_MODE (const_vector);
22161   if (aarch64_sve_pred_mode_p (vec_mode))
22162     {
22163       static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
22164       if (info.insn == simd_immediate_info::MOV)
22165 	{
22166 	  gcc_assert (info.u.mov.value == const0_rtx);
22167 	  snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
22168 	}
22169       else
22170 	{
22171 	  gcc_assert (info.insn == simd_immediate_info::PTRUE);
22172 	  unsigned int total_bytes;
22173 	  if (info.u.pattern == AARCH64_SV_ALL
22174 	      && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
22175 	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
22176 		      total_bytes / GET_MODE_SIZE (info.elt_mode));
22177 	  else
22178 	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
22179 		      svpattern_token (info.u.pattern));
22180 	}
22181       return buf;
22182     }
22183 
22184   if (info.insn == simd_immediate_info::INDEX)
22185     {
22186       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
22187 		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
22188 		element_char, INTVAL (info.u.index.base),
22189 		INTVAL (info.u.index.step));
22190       return templ;
22191     }
22192 
22193   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
22194     {
22195       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
22196 	info.u.mov.value = GEN_INT (0);
22197       else
22198 	{
22199 	  const int buf_size = 20;
22200 	  char float_buf[buf_size] = {};
22201 	  real_to_decimal_for_mode (float_buf,
22202 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
22203 				    buf_size, buf_size, 1, info.elt_mode);
22204 
22205 	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
22206 		    element_char, float_buf);
22207 	  return templ;
22208 	}
22209     }
22210 
22211   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
22212 	    element_char, INTVAL (info.u.mov.value));
22213   return templ;
22214 }
22215 
22216 /* Return the asm template for a PTRUES.  CONST_UNSPEC is the
22217    aarch64_sve_ptrue_svpattern_immediate that describes the predicate
22218    pattern.  */
22219 
22220 char *
22221 aarch64_output_sve_ptrues (rtx const_unspec)
22222 {
22223   static char templ[40];
22224 
22225   struct simd_immediate_info info;
22226   bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
22227   gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
22228 
22229   char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
22230   snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
22231 	    svpattern_token (info.u.pattern));
22232   return templ;
22233 }
22234 
22235 /* Split operands into moves from op[1] + op[2] into op[0].  */
22236 
22237 void
22238 aarch64_split_combinev16qi (rtx operands[3])
22239 {
22240   unsigned int dest = REGNO (operands[0]);
22241   unsigned int src1 = REGNO (operands[1]);
22242   unsigned int src2 = REGNO (operands[2]);
22243   machine_mode halfmode = GET_MODE (operands[1]);
22244   unsigned int halfregs = REG_NREGS (operands[1]);
22245   rtx destlo, desthi;
22246 
22247   gcc_assert (halfmode == V16QImode);
22248 
22249   if (src1 == dest && src2 == dest + halfregs)
22250     {
22251       /* No-op move.  Can't split to nothing; emit something.  */
22252       emit_note (NOTE_INSN_DELETED);
22253       return;
22254     }
22255 
22256   /* Preserve register attributes for variable tracking.  */
22257   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
22258   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
22259 			       GET_MODE_SIZE (halfmode));
22260 
22261   /* Special case of reversed high/low parts.  */
22262   if (reg_overlap_mentioned_p (operands[2], destlo)
22263       && reg_overlap_mentioned_p (operands[1], desthi))
22264     {
22265       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
22266       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
22267       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
22268     }
22269   else if (!reg_overlap_mentioned_p (operands[2], destlo))
22270     {
22271       /* Try to avoid unnecessary moves if part of the result
22272 	 is in the right place already.  */
22273       if (src1 != dest)
22274 	emit_move_insn (destlo, operands[1]);
22275       if (src2 != dest + halfregs)
22276 	emit_move_insn (desthi, operands[2]);
22277     }
22278   else
22279     {
22280       if (src2 != dest + halfregs)
22281 	emit_move_insn (desthi, operands[2]);
22282       if (src1 != dest)
22283 	emit_move_insn (destlo, operands[1]);
22284     }
22285 }
22286 
22287 /* vec_perm support.  */
22288 
22289 struct expand_vec_perm_d
22290 {
22291   rtx target, op0, op1;
22292   vec_perm_indices perm;
22293   machine_mode vmode;
22294   unsigned int vec_flags;
22295   bool one_vector_p;
22296   bool testing_p;
22297 };
22298 
22299 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
22300 
22301 /* Generate a variable permutation.  */
22302 
22303 static void
22304 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
22305 {
22306   machine_mode vmode = GET_MODE (target);
22307   bool one_vector_p = rtx_equal_p (op0, op1);
22308 
22309   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
22310   gcc_checking_assert (GET_MODE (op0) == vmode);
22311   gcc_checking_assert (GET_MODE (op1) == vmode);
22312   gcc_checking_assert (GET_MODE (sel) == vmode);
22313   gcc_checking_assert (TARGET_SIMD);
22314 
22315   if (one_vector_p)
22316     {
22317       if (vmode == V8QImode)
22318 	{
22319 	  /* Expand the argument to a V16QI mode by duplicating it.  */
22320 	  rtx pair = gen_reg_rtx (V16QImode);
22321 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
22322 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
22323 	}
22324       else
22325 	{
22326 	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
22327 	}
22328     }
22329   else
22330     {
22331       rtx pair;
22332 
22333       if (vmode == V8QImode)
22334 	{
22335 	  pair = gen_reg_rtx (V16QImode);
22336 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
22337 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
22338 	}
22339       else
22340 	{
22341 	  pair = gen_reg_rtx (OImode);
22342 	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
22343 	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
22344 	}
22345     }
22346 }
22347 
22348 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
22349    NELT is the number of elements in the vector.  */
22350 
22351 void
22352 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
22353 			 unsigned int nelt)
22354 {
22355   machine_mode vmode = GET_MODE (target);
22356   bool one_vector_p = rtx_equal_p (op0, op1);
22357   rtx mask;
22358 
22359   /* The TBL instruction does not use a modulo index, so we must take care
22360      of that ourselves.  */
22361   mask = aarch64_simd_gen_const_vector_dup (vmode,
22362       one_vector_p ? nelt - 1 : 2 * nelt - 1);
22363   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
22364 
22365   /* For big-endian, we also need to reverse the index within the vector
22366      (but not which vector).  */
22367   if (BYTES_BIG_ENDIAN)
22368     {
22369       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
22370       if (!one_vector_p)
22371         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
22372       sel = expand_simple_binop (vmode, XOR, sel, mask,
22373 				 NULL, 0, OPTAB_LIB_WIDEN);
22374     }
22375   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
22376 }
22377 
22378 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
22379 
22380 static void
22381 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
22382 {
22383   emit_insn (gen_rtx_SET (target,
22384 			  gen_rtx_UNSPEC (GET_MODE (target),
22385 					  gen_rtvec (2, op0, op1), code)));
22386 }
22387 
22388 /* Expand an SVE vec_perm with the given operands.  */
22389 
22390 void
22391 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
22392 {
22393   machine_mode data_mode = GET_MODE (target);
22394   machine_mode sel_mode = GET_MODE (sel);
22395   /* Enforced by the pattern condition.  */
22396   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
22397 
22398   /* Note: vec_perm indices are supposed to wrap when they go beyond the
22399      size of the two value vectors, i.e. the upper bits of the indices
22400      are effectively ignored.  SVE TBL instead produces 0 for any
22401      out-of-range indices, so we need to modulo all the vec_perm indices
22402      to ensure they are all in range.  */
22403   rtx sel_reg = force_reg (sel_mode, sel);
22404 
22405   /* Check if the sel only references the first values vector.  */
22406   if (GET_CODE (sel) == CONST_VECTOR
22407       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
22408     {
22409       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
22410       return;
22411     }
22412 
22413   /* Check if the two values vectors are the same.  */
22414   if (rtx_equal_p (op0, op1))
22415     {
22416       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
22417       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
22418 					 NULL, 0, OPTAB_DIRECT);
22419       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
22420       return;
22421     }
22422 
22423   /* Run TBL on each value vector and combine the results.  */
22424 
22425   rtx res0 = gen_reg_rtx (data_mode);
22426   rtx res1 = gen_reg_rtx (data_mode);
22427   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
22428   if (GET_CODE (sel) != CONST_VECTOR
22429       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
22430     {
22431       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
22432 						       2 * nunits - 1);
22433       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
22434 				     NULL, 0, OPTAB_DIRECT);
22435     }
22436   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
22437   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
22438 				     NULL, 0, OPTAB_DIRECT);
22439   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
22440   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
22441     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
22442   else
22443     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
22444 }
22445 
22446 /* Recognize patterns suitable for the TRN instructions.  */
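/* TRN1/TRN2 interleave the even/odd-numbered elements of the two inputs,
   so for V4SI the permute indices matched here are { 0, 4, 2, 6 } for
   TRN1 and { 1, 5, 3, 7 } for TRN2 (little-endian numbering).  */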
22447 static bool
22448 aarch64_evpc_trn (struct expand_vec_perm_d *d)
22449 {
22450   HOST_WIDE_INT odd;
22451   poly_uint64 nelt = d->perm.length ();
22452   rtx out, in0, in1, x;
22453   machine_mode vmode = d->vmode;
22454 
22455   if (GET_MODE_UNIT_SIZE (vmode) > 8)
22456     return false;
22457 
22458   /* Note that these are little-endian tests.
22459      We correct for big-endian later.  */
22460   if (!d->perm[0].is_constant (&odd)
22461       || (odd != 0 && odd != 1)
22462       || !d->perm.series_p (0, 2, odd, 2)
22463       || !d->perm.series_p (1, 2, nelt + odd, 2))
22464     return false;
22465 
22466   /* Success!  */
22467   if (d->testing_p)
22468     return true;
22469 
22470   in0 = d->op0;
22471   in1 = d->op1;
22472   /* We don't need a big-endian lane correction for SVE; see the comment
22473      at the head of aarch64-sve.md for details.  */
22474   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
22475     {
22476       x = in0, in0 = in1, in1 = x;
22477       odd = !odd;
22478     }
22479   out = d->target;
22480 
22481   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22482 				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
22483   return true;
22484 }
22485 
22486 /* Try to re-encode the PERM constant so it combines odd and even elements.
22487    This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
22488    We retry with this new constant with the full suite of patterns.  */
22489 static bool
22490 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
22491 {
22492   expand_vec_perm_d newd;
22493   unsigned HOST_WIDE_INT nelt;
22494 
22495   if (d->vec_flags != VEC_ADVSIMD)
22496     return false;
22497 
22498   /* Get the new mode.  Always twice the size of the inner
22499      and half the elements.  */
22500   poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
22501   unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
22502   auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
22503   machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
22504 
22505   if (new_mode == word_mode)
22506     return false;
22507 
22508   /* to_constant is safe since this routine is specific to Advanced SIMD
22509      vectors.  */
22510   nelt = d->perm.length ().to_constant ();
22511 
22512   vec_perm_builder newpermconst;
22513   newpermconst.new_vector (nelt / 2, nelt / 2, 1);
22514 
22515   /* Convert the perm constant if we can.  Require even, odd as the pairs.  */
22516   for (unsigned int i = 0; i < nelt; i += 2)
22517     {
22518       poly_int64 elt0 = d->perm[i];
22519       poly_int64 elt1 = d->perm[i + 1];
22520       poly_int64 newelt;
22521       if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
22522 	return false;
22523       newpermconst.quick_push (newelt.to_constant ());
22524     }
22525   newpermconst.finalize ();
22526 
22527   newd.vmode = new_mode;
22528   newd.vec_flags = VEC_ADVSIMD;
22529   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
22530   newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
22531   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
22532   newd.testing_p = d->testing_p;
22533   newd.one_vector_p = d->one_vector_p;
22534 
22535   newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
22536   return aarch64_expand_vec_perm_const_1 (&newd);
22537 }
22538 
22539 /* Recognize patterns suitable for the UZP instructions.  */
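/* UZP1/UZP2 concatenate the even/odd-numbered elements of the two inputs,
   so for V4SI the permute indices matched here are { 0, 2, 4, 6 } for
   UZP1 and { 1, 3, 5, 7 } for UZP2 (little-endian numbering).  */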
22540 static bool
22541 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
22542 {
22543   HOST_WIDE_INT odd;
22544   rtx out, in0, in1, x;
22545   machine_mode vmode = d->vmode;
22546 
22547   if (GET_MODE_UNIT_SIZE (vmode) > 8)
22548     return false;
22549 
22550   /* Note that these are little-endian tests.
22551      We correct for big-endian later.  */
22552   if (!d->perm[0].is_constant (&odd)
22553       || (odd != 0 && odd != 1)
22554       || !d->perm.series_p (0, 1, odd, 2))
22555     return false;
22556 
22557   /* Success!  */
22558   if (d->testing_p)
22559     return true;
22560 
22561   in0 = d->op0;
22562   in1 = d->op1;
22563   /* We don't need a big-endian lane correction for SVE; see the comment
22564      at the head of aarch64-sve.md for details.  */
22565   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
22566     {
22567       x = in0, in0 = in1, in1 = x;
22568       odd = !odd;
22569     }
22570   out = d->target;
22571 
22572   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22573 				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
22574   return true;
22575 }
22576 
22577 /* Recognize patterns suitable for the ZIP instructions.  */
22578 static bool
22579 aarch64_evpc_zip (struct expand_vec_perm_d *d)
22580 {
22581   unsigned int high;
22582   poly_uint64 nelt = d->perm.length ();
22583   rtx out, in0, in1, x;
22584   machine_mode vmode = d->vmode;
22585 
22586   if (GET_MODE_UNIT_SIZE (vmode) > 8)
22587     return false;
22588 
22589   /* Note that these are little-endian tests.
22590      We correct for big-endian later.  */
22591   poly_uint64 first = d->perm[0];
22592   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
22593       || !d->perm.series_p (0, 2, first, 1)
22594       || !d->perm.series_p (1, 2, first + nelt, 1))
22595     return false;
22596   high = maybe_ne (first, 0U);
22597 
22598   /* Success!  */
22599   if (d->testing_p)
22600     return true;
22601 
22602   in0 = d->op0;
22603   in1 = d->op1;
22604   /* We don't need a big-endian lane correction for SVE; see the comment
22605      at the head of aarch64-sve.md for details.  */
22606   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
22607     {
22608       x = in0, in0 = in1, in1 = x;
22609       high = !high;
22610     }
22611   out = d->target;
22612 
22613   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22614 				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
22615   return true;
22616 }
22617 
22618 /* Recognize patterns for the EXT insn.  */
22619 
22620 static bool
22621 aarch64_evpc_ext (struct expand_vec_perm_d *d)
22622 {
22623   HOST_WIDE_INT location;
22624   rtx offset;
22625 
22626   /* The first element always refers to the first vector.
22627      Check if the extracted indices are increasing by one.  */
22628   if (d->vec_flags == VEC_SVE_PRED
22629       || !d->perm[0].is_constant (&location)
22630       || !d->perm.series_p (0, 1, location, 1))
22631     return false;
22632 
22633   /* Success! */
22634   if (d->testing_p)
22635     return true;
22636 
22637   /* The case where (location == 0) is a no-op for both big- and little-endian,
22638      and is removed by the mid-end at optimization levels -O1 and higher.
22639 
22640      We don't need a big-endian lane correction for SVE; see the comment
22641      at the head of aarch64-sve.md for details.  */
22642   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
22643     {
22644       /* After setup, we want the high elements of the first vector (stored
22645          at the LSB end of the register), and the low elements of the second
22646          vector (stored at the MSB end of the register). So swap.  */
22647       std::swap (d->op0, d->op1);
22648       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
22649 	 to_constant () is safe since this is restricted to Advanced SIMD
22650 	 vectors.  */
22651       location = d->perm.length ().to_constant () - location;
22652     }
22653 
22654   offset = GEN_INT (location);
22655   emit_set_insn (d->target,
22656 		 gen_rtx_UNSPEC (d->vmode,
22657 				 gen_rtvec (3, d->op0, d->op1, offset),
22658 				 UNSPEC_EXT));
22659   return true;
22660 }
22661 
22662 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
22663    within each 64-bit, 32-bit or 16-bit granule.  */
22664 
22665 static bool
22666 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
22667 {
22668   HOST_WIDE_INT diff;
22669   unsigned int i, size, unspec;
22670   machine_mode pred_mode;
22671 
22672   if (d->vec_flags == VEC_SVE_PRED
22673       || !d->one_vector_p
22674       || !d->perm[0].is_constant (&diff)
22675       || !diff)
22676     return false;
22677 
22678   if (d->vec_flags & VEC_SVE_DATA)
22679     size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
22680   else
22681     size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
22682   if (size == 64)
22683     {
22684       unspec = UNSPEC_REV64;
22685       pred_mode = VNx2BImode;
22686     }
22687   else if (size == 32)
22688     {
22689       unspec = UNSPEC_REV32;
22690       pred_mode = VNx4BImode;
22691     }
22692   else if (size == 16)
22693     {
22694       unspec = UNSPEC_REV16;
22695       pred_mode = VNx8BImode;
22696     }
22697   else
22698     return false;
22699 
22700   unsigned int step = diff + 1;
22701   for (i = 0; i < step; ++i)
22702     if (!d->perm.series_p (i, step, diff - i, step))
22703       return false;
22704 
22705   /* Success! */
22706   if (d->testing_p)
22707     return true;
22708 
22709   if (d->vec_flags & VEC_SVE_DATA)
22710     {
22711       rtx pred = aarch64_ptrue_reg (pred_mode);
22712       emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
22713 					 d->target, pred, d->op0));
22714       return true;
22715     }
22716   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
22717   emit_set_insn (d->target, src);
22718   return true;
22719 }
22720 
22721 /* Recognize patterns for the REV insn, which reverses elements within
22722    a full vector.  */
22723 
22724 static bool
22725 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
22726 {
22727   poly_uint64 nelt = d->perm.length ();
22728 
22729   if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
22730     return false;
22731 
22732   if (!d->perm.series_p (0, 1, nelt - 1, -1))
22733     return false;
22734 
22735   /* Success! */
22736   if (d->testing_p)
22737     return true;
22738 
22739   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
22740   emit_set_insn (d->target, src);
22741   return true;
22742 }
22743 
22744 static bool
22745 aarch64_evpc_dup (struct expand_vec_perm_d *d)
22746 {
22747   rtx out = d->target;
22748   rtx in0;
22749   HOST_WIDE_INT elt;
22750   machine_mode vmode = d->vmode;
22751   rtx lane;
22752 
22753   if (d->vec_flags == VEC_SVE_PRED
22754       || d->perm.encoding ().encoded_nelts () != 1
22755       || !d->perm[0].is_constant (&elt))
22756     return false;
22757 
22758   if ((d->vec_flags & VEC_SVE_DATA)
22759       && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
22760     return false;
22761 
22762   /* Success! */
22763   if (d->testing_p)
22764     return true;
22765 
22766   /* The generic preparation in aarch64_expand_vec_perm_const_1
22767      swaps the operand order and the permute indices if it finds
22768      d->perm[0] to be in the second operand.  Thus, we can always
22769      use d->op0 and need not do any extra arithmetic to get the
22770      correct lane number.  */
22771   in0 = d->op0;
22772   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
22773 
22774   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
22775   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
22776   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
22777   return true;
22778 }
22779 
22780 static bool
22781 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
22782 {
22783   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
22784   machine_mode vmode = d->vmode;
22785 
22786   /* Make sure that the indices are constant.  */
22787   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
22788   for (unsigned int i = 0; i < encoded_nelts; ++i)
22789     if (!d->perm[i].is_constant ())
22790       return false;
22791 
22792   if (d->testing_p)
22793     return true;
22794 
22795   /* Generic code will try constant permutation twice.  Once with the
22796      original mode and again with the elements lowered to QImode.
22797      So wait and don't do the selector expansion ourselves.  */
22798   if (vmode != V8QImode && vmode != V16QImode)
22799     return false;
22800 
22801   /* to_constant is safe since this routine is specific to Advanced SIMD
22802      vectors.  */
22803   unsigned int nelt = d->perm.length ().to_constant ();
22804   for (unsigned int i = 0; i < nelt; ++i)
22805     /* If big-endian and permuting two vectors, we end up with a weird
22806        mixed-endian mode on NEON.  Reverse the index within each word but
22807        not the word itself.  to_constant is safe because we checked is_constant above.  */
22808     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
22809 			? d->perm[i].to_constant () ^ (nelt - 1)
22810 			: d->perm[i].to_constant ());
22811 
22812   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
22813   sel = force_reg (vmode, sel);
22814 
22815   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
22816   return true;
22817 }
22818 
22819 /* Try to implement D using an SVE TBL instruction.  */
22820 
22821 static bool
22822 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
22823 {
22824   unsigned HOST_WIDE_INT nelt;
22825 
22826   /* Permuting two variable-length vectors could overflow the
22827      index range.  */
22828   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
22829     return false;
22830 
22831   if (d->testing_p)
22832     return true;
22833 
22834   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
22835   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
22836   if (d->one_vector_p)
22837     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
22838   else
22839     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
22840   return true;
22841 }
22842 
22843 /* Try to implement D using SVE SEL instruction.  */
22844 
22845 static bool
22846 aarch64_evpc_sel (struct expand_vec_perm_d *d)
22847 {
22848   machine_mode vmode = d->vmode;
22849   int unit_size = GET_MODE_UNIT_SIZE (vmode);
22850 
22851   if (d->vec_flags != VEC_SVE_DATA
22852       || unit_size > 8)
22853     return false;
22854 
22855   int n_patterns = d->perm.encoding ().npatterns ();
22856   poly_int64 vec_len = d->perm.length ();
22857 
22858   for (int i = 0; i < n_patterns; ++i)
22859     if (!known_eq (d->perm[i], i)
22860 	&& !known_eq (d->perm[i], vec_len + i))
22861       return false;
22862 
22863   for (int i = n_patterns; i < n_patterns * 2; i++)
22864     if (!d->perm.series_p (i, n_patterns, i, n_patterns)
22865 	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
22866       return false;
22867 
22868   if (d->testing_p)
22869     return true;
22870 
22871   machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
22872 
22873   /* Build a predicate that is true when op0 elements should be used.  */
22874   rtx_vector_builder builder (pred_mode, n_patterns, 2);
22875   for (int i = 0; i < n_patterns * 2; i++)
22876     {
22877       rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
22878 					  : CONST0_RTX (BImode);
22879       builder.quick_push (elem);
22880     }
22881 
22882   rtx const_vec = builder.build ();
22883   rtx pred = force_reg (pred_mode, const_vec);
22884   /* TARGET = PRED ? OP0 : OP1.  */
22885   emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
22886   return true;
22887 }
22888 
22889 /* Recognize patterns suitable for the INS instructions.  */
22890 static bool
22891 aarch64_evpc_ins (struct expand_vec_perm_d *d)
22892 {
22893   machine_mode mode = d->vmode;
22894   unsigned HOST_WIDE_INT nelt;
22895 
22896   if (d->vec_flags != VEC_ADVSIMD)
22897     return false;
22898 
22899   /* to_constant is safe since this routine is specific to Advanced SIMD
22900      vectors.  */
22901   nelt = d->perm.length ().to_constant ();
22902   rtx insv = d->op0;
22903 
22904   HOST_WIDE_INT idx = -1;
22905 
22906   for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
22907     {
22908       HOST_WIDE_INT elt;
22909       if (!d->perm[i].is_constant (&elt))
22910 	return false;
22911       if (elt == (HOST_WIDE_INT) i)
22912 	continue;
22913       if (idx != -1)
22914 	{
22915 	  idx = -1;
22916 	  break;
22917 	}
22918       idx = i;
22919     }
22920 
22921   if (idx == -1)
22922     {
22923       insv = d->op1;
22924       for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
22925 	{
22926 	  if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
22927 	    continue;
22928 	  if (idx != -1)
22929 	    return false;
22930 	  idx = i;
22931 	}
22932 
22933       if (idx == -1)
22934 	return false;
22935     }
22936 
22937   if (d->testing_p)
22938     return true;
22939 
22940   gcc_assert (idx != -1);
22941 
22942   unsigned extractindex = d->perm[idx].to_constant ();
22943   rtx extractv = d->op0;
22944   if (extractindex >= nelt)
22945     {
22946       extractv = d->op1;
22947       extractindex -= nelt;
22948     }
22949   gcc_assert (extractindex < nelt);
22950 
22951   emit_move_insn (d->target, insv);
22952   insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
22953   expand_operand ops[5];
22954   create_output_operand (&ops[0], d->target, mode);
22955   create_input_operand (&ops[1], d->target, mode);
22956   create_integer_operand (&ops[2], 1 << idx);
22957   create_input_operand (&ops[3], extractv, mode);
22958   create_integer_operand (&ops[4], extractindex);
22959   expand_insn (icode, 5, ops);
22960 
22961   return true;
22962 }
22963 
22964 static bool
22965 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22966 {
22967   /* The pattern matching functions above are written to look for a small
22968      number to begin the sequence (0, 1, N/2).  If we begin with an index
22969      from the second operand, we can swap the operands.  */
22970   poly_int64 nelt = d->perm.length ();
22971   if (known_ge (d->perm[0], nelt))
22972     {
22973       d->perm.rotate_inputs (1);
22974       std::swap (d->op0, d->op1);
22975     }
22976 
22977   if ((d->vec_flags == VEC_ADVSIMD
22978        || d->vec_flags == VEC_SVE_DATA
22979        || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
22980        || d->vec_flags == VEC_SVE_PRED)
22981       && known_gt (nelt, 1))
22982     {
22983       if (aarch64_evpc_rev_local (d))
22984 	return true;
22985       else if (aarch64_evpc_rev_global (d))
22986 	return true;
22987       else if (aarch64_evpc_ext (d))
22988 	return true;
22989       else if (aarch64_evpc_dup (d))
22990 	return true;
22991       else if (aarch64_evpc_zip (d))
22992 	return true;
22993       else if (aarch64_evpc_uzp (d))
22994 	return true;
22995       else if (aarch64_evpc_trn (d))
22996 	return true;
22997       else if (aarch64_evpc_sel (d))
22998 	return true;
22999       else if (aarch64_evpc_ins (d))
23000 	return true;
23001       else if (aarch64_evpc_reencode (d))
23002 	return true;
23003       if (d->vec_flags == VEC_SVE_DATA)
23004 	return aarch64_evpc_sve_tbl (d);
23005       else if (d->vec_flags == VEC_ADVSIMD)
23006 	return aarch64_evpc_tbl (d);
23007     }
23008   return false;
23009 }
23010 
23011 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
23012 
23013 static bool
23014 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
23015 				  rtx op1, const vec_perm_indices &sel)
23016 {
23017   struct expand_vec_perm_d d;
23018 
23019   /* Check whether the mask can be applied to a single vector.  */
23020   if (sel.ninputs () == 1
23021       || (op0 && rtx_equal_p (op0, op1)))
23022     d.one_vector_p = true;
23023   else if (sel.all_from_input_p (0))
23024     {
23025       d.one_vector_p = true;
23026       op1 = op0;
23027     }
23028   else if (sel.all_from_input_p (1))
23029     {
23030       d.one_vector_p = true;
23031       op0 = op1;
23032     }
23033   else
23034     d.one_vector_p = false;
23035 
23036   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
23037 		     sel.nelts_per_input ());
23038   d.vmode = vmode;
23039   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
23040   d.target = target;
23041   d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
23042   if (op0 == op1)
23043     d.op1 = d.op0;
23044   else
23045     d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
23046   d.testing_p = !target;
23047 
23048   if (!d.testing_p)
23049     return aarch64_expand_vec_perm_const_1 (&d);
23050 
23051   rtx_insn *last = get_last_insn ();
23052   bool ret = aarch64_expand_vec_perm_const_1 (&d);
23053   gcc_assert (last == get_last_insn ());
23054 
23055   return ret;
23056 }
23057 
23058 /* Generate a byte permute mask for a register of mode MODE,
23059    which has NUNITS units.  */
23060 
23061 rtx
23062 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
23063 {
23064   /* We have to reverse each vector because we don't have
23065      a permuted load that can reverse-load according to ABI rules.  */
23066   rtx mask;
23067   rtvec v = rtvec_alloc (16);
23068   unsigned int i, j;
23069   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
23070 
23071   gcc_assert (BYTES_BIG_ENDIAN);
23072   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
23073 
23074   for (i = 0; i < nunits; i++)
23075     for (j = 0; j < usize; j++)
23076       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
23077   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
23078   return force_reg (V16QImode, mask);
23079 }
23080 
23081 /* Expand an SVE integer comparison using the SVE equivalent of:
23082 
23083      (set TARGET (CODE OP0 OP1)).  */
23084 
23085 void
23086 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
23087 {
23088   machine_mode pred_mode = GET_MODE (target);
23089   machine_mode data_mode = GET_MODE (op0);
23090   rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
23091 				      op0, op1);
23092   if (!rtx_equal_p (target, res))
23093     emit_move_insn (target, res);
23094 }
23095 
23096 /* Return the UNSPEC_COND_* code for comparison CODE.  */
23097 
23098 static unsigned int
23099 aarch64_unspec_cond_code (rtx_code code)
23100 {
23101   switch (code)
23102     {
23103     case NE:
23104       return UNSPEC_COND_FCMNE;
23105     case EQ:
23106       return UNSPEC_COND_FCMEQ;
23107     case LT:
23108       return UNSPEC_COND_FCMLT;
23109     case GT:
23110       return UNSPEC_COND_FCMGT;
23111     case LE:
23112       return UNSPEC_COND_FCMLE;
23113     case GE:
23114       return UNSPEC_COND_FCMGE;
23115     case UNORDERED:
23116       return UNSPEC_COND_FCMUO;
23117     default:
23118       gcc_unreachable ();
23119     }
23120 }
23121 
23122 /* Emit:
23123 
23124       (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
23125 
23126    where <X> is the operation associated with comparison CODE.
23127    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
23128 
23129 static void
23130 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
23131 			  bool known_ptrue_p, rtx op0, rtx op1)
23132 {
23133   rtx flag = gen_int_mode (known_ptrue_p, SImode);
23134   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
23135 			       gen_rtvec (4, pred, flag, op0, op1),
23136 			       aarch64_unspec_cond_code (code));
23137   emit_set_insn (target, unspec);
23138 }
23139 
23140 /* Emit the SVE equivalent of:
23141 
23142       (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
23143       (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
23144       (set TARGET (ior:PRED_MODE TMP1 TMP2))
23145 
23146    where <Xi> is the operation associated with comparison CODEi.
23147    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
23148 
23149 static void
23150 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
23151 			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
23152 {
23153   machine_mode pred_mode = GET_MODE (pred);
23154   rtx tmp1 = gen_reg_rtx (pred_mode);
23155   aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
23156   rtx tmp2 = gen_reg_rtx (pred_mode);
23157   aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
23158   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
23159 }
23160 
23161 /* Emit the SVE equivalent of:
23162 
23163       (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
23164       (set TARGET (not TMP))
23165 
23166    where <X> is the operation associated with comparison CODE.
23167    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
23168 
23169 static void
23170 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
23171 				 bool known_ptrue_p, rtx op0, rtx op1)
23172 {
23173   machine_mode pred_mode = GET_MODE (pred);
23174   rtx tmp = gen_reg_rtx (pred_mode);
23175   aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
23176   aarch64_emit_unop (target, one_cmpl_optab, tmp);
23177 }
23178 
23179 /* Expand an SVE floating-point comparison using the SVE equivalent of:
23180 
23181      (set TARGET (CODE OP0 OP1))
23182 
23183    If CAN_INVERT_P is true, the caller can also handle inverted results;
23184    return true if the result is in fact inverted.  */
23185 
23186 bool
23187 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
23188 				  rtx op0, rtx op1, bool can_invert_p)
23189 {
23190   machine_mode pred_mode = GET_MODE (target);
23191   machine_mode data_mode = GET_MODE (op0);
23192 
23193   rtx ptrue = aarch64_ptrue_reg (pred_mode);
23194   switch (code)
23195     {
23196     case UNORDERED:
23197       /* UNORDERED has no immediate form.  */
23198       op1 = force_reg (data_mode, op1);
23199       /* fall through */
23200     case LT:
23201     case LE:
23202     case GT:
23203     case GE:
23204     case EQ:
23205     case NE:
23206       {
23207 	/* There is native support for the comparison.  */
23208 	aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
23209 	return false;
23210       }
23211 
23212     case LTGT:
23213       /* This is a trapping operation (LT or GT).  */
23214       aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
23215       return false;
23216 
23217     case UNEQ:
23218       if (!flag_trapping_math)
23219 	{
23220 	  /* This would trap for signaling NaNs.  */
23221 	  op1 = force_reg (data_mode, op1);
23222 	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
23223 					ptrue, true, op0, op1);
23224 	  return false;
23225 	}
23226       /* fall through */
23227     case UNLT:
23228     case UNLE:
23229     case UNGT:
23230     case UNGE:
23231       if (flag_trapping_math)
23232 	{
23233 	  /* Work out which elements are ordered.  */
23234 	  rtx ordered = gen_reg_rtx (pred_mode);
23235 	  op1 = force_reg (data_mode, op1);
23236 	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
23237 					   ptrue, true, op0, op1);
23238 
23239 	  /* Test the opposite condition for the ordered elements,
23240 	     then invert the result.  */
23241 	  if (code == UNEQ)
23242 	    code = NE;
23243 	  else
23244 	    code = reverse_condition_maybe_unordered (code);
23245 	  if (can_invert_p)
23246 	    {
23247 	      aarch64_emit_sve_fp_cond (target, code,
23248 					ordered, false, op0, op1);
23249 	      return true;
23250 	    }
23251 	  aarch64_emit_sve_invert_fp_cond (target, code,
23252 					   ordered, false, op0, op1);
23253 	  return false;
23254 	}
23255       break;
23256 
23257     case ORDERED:
23258       /* ORDERED has no immediate form.  */
23259       op1 = force_reg (data_mode, op1);
23260       break;
23261 
23262     default:
23263       gcc_unreachable ();
23264     }
23265 
23266   /* There is native support for the inverse comparison.  */
23267   code = reverse_condition_maybe_unordered (code);
23268   if (can_invert_p)
23269     {
23270       aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
23271       return true;
23272     }
23273   aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
23274   return false;
23275 }
23276 
23277 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
23278    of the data being selected and CMP_MODE is the mode of the values being
23279    compared.  */
23280 
23281 void
23282 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
23283 			  rtx *ops)
23284 {
23285   machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
23286   rtx pred = gen_reg_rtx (pred_mode);
23287   if (FLOAT_MODE_P (cmp_mode))
23288     {
23289       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
23290 					    ops[4], ops[5], true))
23291 	std::swap (ops[1], ops[2]);
23292     }
23293   else
23294     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
23295 
23296   if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
23297     ops[1] = force_reg (data_mode, ops[1]);
23298   /* The "false" value can only be zero if the "true" value is a constant.  */
23299   if (register_operand (ops[1], data_mode)
23300       || !aarch64_simd_reg_or_zero (ops[2], data_mode))
23301     ops[2] = force_reg (data_mode, ops[2]);
23302 
23303   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
23304   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
23305 }
23306 
23307 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
23308    true.  However due to issues with register allocation it is preferable
23309    to avoid tying integer scalar and FP scalar modes.  Executing integer
23310    operations in general registers is better than treating them as scalar
23311    vector operations.  This reduces latency and avoids redundant int<->FP
23312    moves.  So tie modes if they are either the same class, or vector modes
23313    with other vector modes, vector structs or any scalar mode.  */
23314 
23315 static bool
23316 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
23317 {
23318   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
23319     return true;
23320 
23321   /* We specifically want to allow elements of "structure" modes to
23322      be tieable to the structure.  This more general condition allows
23323      other rarer situations too.  The reason we don't extend this to
23324      predicate modes is that there are no predicate structure modes
23325      nor any specific instructions for extracting part of a predicate
23326      register.  */
23327   if (aarch64_vector_data_mode_p (mode1)
23328       && aarch64_vector_data_mode_p (mode2))
23329     return true;
23330 
23331   /* Also allow any scalar modes with vectors.  */
23332   if (aarch64_vector_mode_supported_p (mode1)
23333       || aarch64_vector_mode_supported_p (mode2))
23334     return true;
23335 
23336   return false;
23337 }
23338 
23339 /* Return a new RTX holding the result of moving POINTER forward by
23340    AMOUNT bytes.  */
23341 
23342 static rtx
23343 aarch64_move_pointer (rtx pointer, poly_int64 amount)
23344 {
23345   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
23346 
23347   return adjust_automodify_address (pointer, GET_MODE (pointer),
23348 				    next, amount);
23349 }
23350 
23351 /* Return a new RTX holding the result of moving POINTER forward by the
23352    size of the mode it points to.  */
23353 
23354 static rtx
23355 aarch64_progress_pointer (rtx pointer)
23356 {
23357   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
23358 }
23359 
23360 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
23361    MODE bytes.  */
23362 
23363 static void
23364 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
23365 					      machine_mode mode)
23366 {
23367   /* Handle 256-bit memcpy separately.  We do this by emitting 2 adjacent
23368      V4SImode copies so that we can use Q registers.  */
23369   if (known_eq (GET_MODE_BITSIZE (mode), 256))
23370     {
23371       mode = V4SImode;
23372       rtx reg1 = gen_reg_rtx (mode);
23373       rtx reg2 = gen_reg_rtx (mode);
23374       /* "Cast" the pointers to the correct mode.  */
23375       *src = adjust_address (*src, mode, 0);
23376       *dst = adjust_address (*dst, mode, 0);
23377       /* Emit the memcpy.  */
23378       emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
23379 					aarch64_progress_pointer (*src)));
23380       emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
23381 					 aarch64_progress_pointer (*dst), reg2));
23382       /* Move the pointers forward.  */
23383       *src = aarch64_move_pointer (*src, 32);
23384       *dst = aarch64_move_pointer (*dst, 32);
23385       return;
23386     }
23387 
23388   rtx reg = gen_reg_rtx (mode);
23389 
23390   /* "Cast" the pointers to the correct mode.  */
23391   *src = adjust_address (*src, mode, 0);
23392   *dst = adjust_address (*dst, mode, 0);
23393   /* Emit the memcpy.  */
23394   emit_move_insn (reg, *src);
23395   emit_move_insn (*dst, reg);
23396   /* Move the pointers forward.  */
23397   *src = aarch64_progress_pointer (*src);
23398   *dst = aarch64_progress_pointer (*dst);
23399 }
23400 
23401 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
23402    we succeed, otherwise return false.  */
23403 
23404 bool
23405 aarch64_expand_cpymem (rtx *operands)
23406 {
23407   int mode_bits;
23408   rtx dst = operands[0];
23409   rtx src = operands[1];
23410   rtx base;
23411   machine_mode cur_mode = BLKmode;
23412 
23413   /* Only expand fixed-size copies.  */
23414   if (!CONST_INT_P (operands[2]))
23415     return false;
23416 
23417   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
23418 
23419   /* Inline up to 256 bytes when optimizing for speed.  */
23420   unsigned HOST_WIDE_INT max_copy_size = 256;
23421 
23422   if (optimize_function_for_size_p (cfun))
23423     max_copy_size = 128;
23424 
23425   int copy_bits = 256;
23426 
23427   /* Default to 256-bit LDP/STP on large copies; however, small copies, lack
23428      of SIMD support, or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
23429   if (size <= 24
23430       || !TARGET_SIMD
23431       || (aarch64_tune_params.extra_tuning_flags
23432 	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
23433     {
23434       copy_bits = 128;
23435       max_copy_size = max_copy_size / 2;
23436     }
23437 
23438   if (size > max_copy_size)
23439     return false;
23440 
23441   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
23442   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
23443 
23444   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
23445   src = adjust_automodify_address (src, VOIDmode, base, 0);
23446 
23447   /* Convert size to bits to make the rest of the code simpler.  */
23448   int n = size * BITS_PER_UNIT;
23449 
23450   while (n > 0)
23451     {
23452       /* Find the largest mode in which to do the copy without over-reading
23453 	 or over-writing.  */
23454       opt_scalar_int_mode mode_iter;
23455       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
23456 	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
23457 	  cur_mode = mode_iter.require ();
23458 
23459       gcc_assert (cur_mode != BLKmode);
23460 
23461       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
23462 
23463       /* Prefer Q-register accesses for the last bytes.  */
23464       if (mode_bits == 128 && copy_bits == 256)
23465 	cur_mode = V4SImode;
23466 
23467       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
23468 
23469       n -= mode_bits;
23470 
23471       /* Emit trailing copies using overlapping unaligned accesses - this is
23472 	 smaller and faster.  */
23473       if (n > 0 && n < copy_bits / 2)
23474 	{
23475 	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
23476 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
23477 	  gcc_assert (n_bits <= mode_bits);
23478 	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
23479 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
23480 	  n = n_bits;
23481 	}
23482     }
23483 
23484   return true;
23485 }
23486 
23487 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
23488    SRC is a register we have created with the duplicated value to be set.  */
23489 static void
23490 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
23491 					    machine_mode mode)
23492 {
23493   /* If we are copying 128 bits or 256 bits, we can do that straight from
23494      the SIMD register we prepared.  */
23495   if (known_eq (GET_MODE_BITSIZE (mode), 256))
23496     {
23497       mode = GET_MODE (src);
23498       /* "Cast" the *dst to the correct mode.  */
23499       *dst = adjust_address (*dst, mode, 0);
23500       /* Emit the memset.  */
23501       emit_insn (aarch64_gen_store_pair (mode, *dst, src,
23502 					 aarch64_progress_pointer (*dst), src));
23503 
23504       /* Move the pointers forward.  */
23505       *dst = aarch64_move_pointer (*dst, 32);
23506       return;
23507     }
23508   if (known_eq (GET_MODE_BITSIZE (mode), 128))
23509     {
23510       /* "Cast" the *dst to the correct mode.  */
23511       *dst = adjust_address (*dst, GET_MODE (src), 0);
23512       /* Emit the memset.  */
23513       emit_move_insn (*dst, src);
23514       /* Move the pointers forward.  */
23515       *dst = aarch64_move_pointer (*dst, 16);
23516       return;
23517     }
23518   /* For smaller copies, we have to extract the right part of SRC.  */
23519   rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
23520 
23521   /* "Cast" the *dst to the correct mode.  */
23522   *dst = adjust_address (*dst, mode, 0);
23523   /* Emit the memset.  */
23524   emit_move_insn (*dst, reg);
23525   /* Move the pointer forward.  */
23526   *dst = aarch64_progress_pointer (*dst);
23527 }
23528 
23529 /* Expand setmem, as if from a __builtin_memset.  Return true if
23530    we succeed, otherwise return false.  */
23531 
23532 bool
23533 aarch64_expand_setmem (rtx *operands)
23534 {
23535   int n, mode_bits;
23536   unsigned HOST_WIDE_INT len;
23537   rtx dst = operands[0];
23538   rtx val = operands[2], src;
23539   rtx base;
23540   machine_mode cur_mode = BLKmode, next_mode;
23541 
23542   /* We can't do anything smart if the amount to copy is not constant.  */
23543   if (!CONST_INT_P (operands[1]))
23544     return false;
23545 
23546   bool speed_p = !optimize_function_for_size_p (cfun);
23547 
23548   /* Default the maximum to 256 bytes.  */
23549   unsigned max_set_size = 256;
23550 
23551   /* If we are optimizing for size, or if the core does not want to
23552      use STP of Q regs, lower max_set_size.  */
23553   max_set_size = (!speed_p
23554 		  || (aarch64_tune_params.extra_tuning_flags
23555 		      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
23556 		  ? max_set_size / 2 : max_set_size;
23557 
23558   len = INTVAL (operands[1]);
23559 
23560   /* Upper bound check.  */
23561   if (len > max_set_size)
23562     return false;
23563 
23564   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
23565   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
23566 
23567   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
23568   src = expand_vector_broadcast (V16QImode, val);
23569   src = force_reg (V16QImode, src);
23570 
23571   /* Convert len to bits to make the rest of the code simpler.  */
23572   n = len * BITS_PER_UNIT;
23573 
23574   /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
23575      AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  The setmem expand
23576      pattern is only turned on for TARGET_SIMD.  */
23577   const int copy_limit = (speed_p
23578 			  && (aarch64_tune_params.extra_tuning_flags
23579 			      & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
23580 			  ? GET_MODE_BITSIZE (TImode) : 256;
23581 
23582   while (n > 0)
23583     {
23584       /* Find the largest mode in which to do the copy without
23585 	 over-writing.  */
23586       opt_scalar_int_mode mode_iter;
23587       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
23588 	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
23589 	  cur_mode = mode_iter.require ();
23590 
23591       gcc_assert (cur_mode != BLKmode);
23592 
23593       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
23594       aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
23595 
23596       n -= mode_bits;
23597 
23598       /* Do certain trailing copies as overlapping stores if that is going to
23599 	 be cheaper, i.e. needs fewer instructions.  For instance, for a 15
23600 	 byte copy it is more efficient to do two overlapping 8 byte copies than
23601 	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
23602       if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
23603 	{
23604 	  next_mode = smallest_mode_for_size (n, MODE_INT);
23605 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
23606 	  gcc_assert (n_bits <= mode_bits);
23607 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
23608 	  n = n_bits;
23609 	}
23610     }
23611 
23612   return true;
23613 }
23614 
23615 
23616 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
23617    SImode stores.  Handle the case when the constant has identical
23618    bottom and top halves.  This is beneficial when the two stores can be
23619    merged into an STP and we avoid synthesising potentially expensive
23620    immediates twice.  Return true if such a split is possible.  */
23621 
23622 bool
23623 aarch64_split_dimode_const_store (rtx dst, rtx src)
23624 {
23625   rtx lo = gen_lowpart (SImode, src);
23626   rtx hi = gen_highpart_mode (SImode, DImode, src);
23627 
23628   bool size_p = optimize_function_for_size_p (cfun);
23629 
23630   if (!rtx_equal_p (lo, hi))
23631     return false;
23632 
23633   unsigned int orig_cost
23634     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
23635   unsigned int lo_cost
23636     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
23637 
23638   /* We want to transform:
23639      MOV	x1, 49370
23640      MOVK	x1, 0x140, lsl 16
23641      MOVK	x1, 0xc0da, lsl 32
23642      MOVK	x1, 0x140, lsl 48
23643      STR	x1, [x0]
23644    into:
23645      MOV	w1, 49370
23646      MOVK	w1, 0x140, lsl 16
23647      STP	w1, w1, [x0]
23648    So we want to perform this only when we save two instructions
23649    or more.  When optimizing for size, however, accept any code size
23650    savings we can.  */
23651   if (size_p && orig_cost <= lo_cost)
23652     return false;
23653 
23654   if (!size_p
23655       && (orig_cost <= lo_cost + 1))
23656     return false;
23657 
23658   rtx mem_lo = adjust_address (dst, SImode, 0);
23659   if (!aarch64_mem_pair_operand (mem_lo, SImode))
23660     return false;
23661 
23662   rtx tmp_reg = gen_reg_rtx (SImode);
23663   aarch64_expand_mov_immediate (tmp_reg, lo);
23664   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
23665   /* Don't emit an explicit store pair as this may not always be profitable.
23666      Let the sched-fusion logic decide whether to merge them.  */
23667   emit_move_insn (mem_lo, tmp_reg);
23668   emit_move_insn (mem_hi, tmp_reg);
23669 
23670   return true;
23671 }
23672 
23673 /* Generate RTL for a conditional branch with rtx comparison CODE in
23674    mode CC_MODE.  The destination of the unlikely conditional branch
23675    is LABEL_REF.  */
23676 
23677 void
23678 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
23679 			      rtx label_ref)
23680 {
23681   rtx x;
23682   x = gen_rtx_fmt_ee (code, VOIDmode,
23683 		      gen_rtx_REG (cc_mode, CC_REGNUM),
23684 		      const0_rtx);
23685 
23686   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23687 			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
23688 			    pc_rtx);
23689   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23690 }
23691 
23692 /* Generate DImode scratch registers for 128-bit (TImode) addition.
23693 
23694    OP1 represents the TImode destination operand 1
23695    OP2 represents the TImode destination operand 2
23696    LOW_DEST represents the low half (DImode) of TImode operand 0
23697    LOW_IN1 represents the low half (DImode) of TImode operand 1
23698    LOW_IN2 represents the low half (DImode) of TImode operand 2
23699    HIGH_DEST represents the high half (DImode) of TImode operand 0
23700    HIGH_IN1 represents the high half (DImode) of TImode operand 1
23701    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
23702 
23703 void
23704 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
23705 			    rtx *low_in1, rtx *low_in2,
23706 			    rtx *high_dest, rtx *high_in1,
23707 			    rtx *high_in2)
23708 {
23709   *low_dest = gen_reg_rtx (DImode);
23710   *low_in1 = gen_lowpart (DImode, op1);
23711   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
23712 				  subreg_lowpart_offset (DImode, TImode));
23713   *high_dest = gen_reg_rtx (DImode);
23714   *high_in1 = gen_highpart (DImode, op1);
23715   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
23716 				   subreg_highpart_offset (DImode, TImode));
23717 }
23718 
23719 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
23720 
23721    This function differs from 'aarch64_addti_scratch_regs' in that
23722    OP1 can be an immediate constant (zero). We must call
23723    subreg_highpart_offset with DImode and TImode arguments, otherwise
23724    VOIDmode will be used for the const_int which generates an internal
23725    error from subreg_size_highpart_offset which does not expect a size of zero.
23726 
23727    OP1 represents the TImode destination operand 1
23728    OP2 represents the TImode destination operand 2
23729    LOW_DEST represents the low half (DImode) of TImode operand 0
23730    LOW_IN1 represents the low half (DImode) of TImode operand 1
23731    LOW_IN2 represents the low half (DImode) of TImode operand 2
23732    HIGH_DEST represents the high half (DImode) of TImode operand 0
23733    HIGH_IN1 represents the high half (DImode) of TImode operand 1
23734    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
23735 
23736 
23737 void
23738 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
23739 			     rtx *low_in1, rtx *low_in2,
23740 			     rtx *high_dest, rtx *high_in1,
23741 			     rtx *high_in2)
23742 {
23743   *low_dest = gen_reg_rtx (DImode);
23744   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
23745 				  subreg_lowpart_offset (DImode, TImode));
23746 
23747   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
23748 				  subreg_lowpart_offset (DImode, TImode));
23749   *high_dest = gen_reg_rtx (DImode);
23750 
23751   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
23752 				   subreg_highpart_offset (DImode, TImode));
23753   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
23754 				   subreg_highpart_offset (DImode, TImode));
23755 }
23756 
23757 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
23758 
23759    OP0 represents the TImode destination operand 0
23760    LOW_DEST represents the low half (DImode) of TImode operand 0
23761    LOW_IN1 represents the low half (DImode) of TImode operand 1
23762    LOW_IN2 represents the low half (DImode) of TImode operand 2
23763    HIGH_DEST represents the high half (DImode) of TImode operand 0
23764    HIGH_IN1 represents the high half (DImode) of TImode operand 1
23765    HIGH_IN2 represents the high half (DImode) of TImode operand 2
23766    UNSIGNED_P is true if the operation is being performed on unsigned
23767    values.  */
23768 void
23769 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
23770 		       rtx low_in2, rtx high_dest, rtx high_in1,
23771 		       rtx high_in2, bool unsigned_p)
23772 {
23773   if (low_in2 == const0_rtx)
23774     {
23775       low_dest = low_in1;
23776       high_in2 = force_reg (DImode, high_in2);
23777       if (unsigned_p)
23778 	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
23779       else
23780 	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
23781     }
23782   else
23783     {
23784       if (aarch64_plus_immediate (low_in2, DImode))
23785 	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
23786 					    GEN_INT (-UINTVAL (low_in2))));
23787       else
23788 	{
23789 	  low_in2 = force_reg (DImode, low_in2);
23790 	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
23791 	}
23792       high_in2 = force_reg (DImode, high_in2);
23793 
23794       if (unsigned_p)
23795 	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
23796       else
23797 	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
23798     }
23799 
23800   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
23801   emit_move_insn (gen_highpart (DImode, op0), high_dest);
23802 
23803 }
23804 
23805 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
23806 
23807 static unsigned HOST_WIDE_INT
23808 aarch64_asan_shadow_offset (void)
23809 {
23810   if (TARGET_ILP32)
23811     return (HOST_WIDE_INT_1 << 29);
23812   else
23813     return (HOST_WIDE_INT_1 << 36);
23814 }
23815 
23816 static rtx
23817 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
23818 			int code, tree treeop0, tree treeop1)
23819 {
23820   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
23821   rtx op0, op1;
23822   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
23823   insn_code icode;
23824   struct expand_operand ops[4];
23825 
23826   start_sequence ();
23827   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
23828 
23829   op_mode = GET_MODE (op0);
23830   if (op_mode == VOIDmode)
23831     op_mode = GET_MODE (op1);
23832 
23833   switch (op_mode)
23834     {
23835     case E_QImode:
23836     case E_HImode:
23837     case E_SImode:
23838       cmp_mode = SImode;
23839       icode = CODE_FOR_cmpsi;
23840       break;
23841 
23842     case E_DImode:
23843       cmp_mode = DImode;
23844       icode = CODE_FOR_cmpdi;
23845       break;
23846 
23847     case E_SFmode:
23848       cmp_mode = SFmode;
23849       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
23850       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
23851       break;
23852 
23853     case E_DFmode:
23854       cmp_mode = DFmode;
23855       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
23856       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
23857       break;
23858 
23859     default:
23860       end_sequence ();
23861       return NULL_RTX;
23862     }
23863 
23864   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
23865   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
23866   if (!op0 || !op1)
23867     {
23868       end_sequence ();
23869       return NULL_RTX;
23870     }
23871   *prep_seq = get_insns ();
23872   end_sequence ();
23873 
23874   create_fixed_operand (&ops[0], op0);
23875   create_fixed_operand (&ops[1], op1);
23876 
23877   start_sequence ();
23878   if (!maybe_expand_insn (icode, 2, ops))
23879     {
23880       end_sequence ();
23881       return NULL_RTX;
23882     }
23883   *gen_seq = get_insns ();
23884   end_sequence ();
23885 
23886   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
23887 			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
23888 }
23889 
23890 static rtx
23891 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
23892 		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
23893 {
23894   rtx op0, op1, target;
23895   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
23896   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
23897   insn_code icode;
23898   struct expand_operand ops[6];
23899   int aarch64_cond;
23900 
23901   push_to_sequence (*prep_seq);
23902   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
23903 
23904   op_mode = GET_MODE (op0);
23905   if (op_mode == VOIDmode)
23906     op_mode = GET_MODE (op1);
23907 
23908   switch (op_mode)
23909     {
23910     case E_QImode:
23911     case E_HImode:
23912     case E_SImode:
23913       cmp_mode = SImode;
23914       break;
23915 
23916     case E_DImode:
23917       cmp_mode = DImode;
23918       break;
23919 
23920     case E_SFmode:
23921       cmp_mode = SFmode;
23922       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
23923       break;
23924 
23925     case E_DFmode:
23926       cmp_mode = DFmode;
23927       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
23928       break;
23929 
23930     default:
23931       end_sequence ();
23932       return NULL_RTX;
23933     }
23934 
23935   icode = code_for_ccmp (cc_mode, cmp_mode);
23936 
23937   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
23938   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
23939   if (!op0 || !op1)
23940     {
23941       end_sequence ();
23942       return NULL_RTX;
23943     }
23944   *prep_seq = get_insns ();
23945   end_sequence ();
23946 
23947   target = gen_rtx_REG (cc_mode, CC_REGNUM);
23948   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
23949 
23950   if (bit_code != AND)
23951     {
23952       /* Treat the ccmp patterns as canonical and use them where possible,
23953 	 but fall back to ccmp_rev patterns if there's no other option.  */
23954       rtx_code prev_code = GET_CODE (prev);
23955       machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
23956       if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
23957 	  && !(prev_code == EQ
23958 	       || prev_code == NE
23959 	       || prev_code == ORDERED
23960 	       || prev_code == UNORDERED))
23961 	icode = code_for_ccmp_rev (cc_mode, cmp_mode);
23962       else
23963 	{
23964 	  rtx_code code = reverse_condition (prev_code);
23965 	  prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
23966 	}
23967       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
23968     }
23969 
23970   create_fixed_operand (&ops[0], XEXP (prev, 0));
23971   create_fixed_operand (&ops[1], target);
23972   create_fixed_operand (&ops[2], op0);
23973   create_fixed_operand (&ops[3], op1);
23974   create_fixed_operand (&ops[4], prev);
23975   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
23976 
23977   push_to_sequence (*gen_seq);
23978   if (!maybe_expand_insn (icode, 6, ops))
23979     {
23980       end_sequence ();
23981       return NULL_RTX;
23982     }
23983 
23984   *gen_seq = get_insns ();
23985   end_sequence ();
23986 
23987   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
23988 }
23989 
23990 #undef TARGET_GEN_CCMP_FIRST
23991 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
23992 
23993 #undef TARGET_GEN_CCMP_NEXT
23994 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
23995 
23996 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
23997    instruction fusion of some sort.  */
23998 
23999 static bool
24000 aarch64_macro_fusion_p (void)
24001 {
24002   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
24003 }
24004 
24005 
24006 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
24007    should be kept together during scheduling.  */
24008 
24009 static bool
24010 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
24011 {
24012   rtx set_dest;
24013   rtx prev_set = single_set (prev);
24014   rtx curr_set = single_set (curr);
24015   /* prev and curr are simple SET insns, i.e. no flag setting or branching.  */
24016   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
24017 
24018   if (!aarch64_macro_fusion_p ())
24019     return false;
24020 
24021   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
24022     {
24023       /* We are trying to match:
24024          prev (mov)  == (set (reg r0) (const_int imm16))
24025          curr (movk) == (set (zero_extract (reg r0)
24026                                            (const_int 16)
24027                                            (const_int 16))
24028                              (const_int imm16_1))  */
24029 
24030       set_dest = SET_DEST (curr_set);
24031 
24032       if (GET_CODE (set_dest) == ZERO_EXTRACT
24033           && CONST_INT_P (SET_SRC (curr_set))
24034           && CONST_INT_P (SET_SRC (prev_set))
24035           && CONST_INT_P (XEXP (set_dest, 2))
24036           && INTVAL (XEXP (set_dest, 2)) == 16
24037           && REG_P (XEXP (set_dest, 0))
24038           && REG_P (SET_DEST (prev_set))
24039           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
24040         {
24041           return true;
24042         }
24043     }
24044 
24045   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
24046     {
24047 
24048       /*  We're trying to match:
24049           prev (adrp) == (set (reg r1)
24050                               (high (symbol_ref ("SYM"))))
24051           curr (add) == (set (reg r0)
24052                              (lo_sum (reg r1)
24053                                      (symbol_ref ("SYM"))))
24054           Note that r0 need not necessarily be the same as r1, especially
24055           during pre-regalloc scheduling.  */
24056 
24057       if (satisfies_constraint_Ush (SET_SRC (prev_set))
24058           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
24059         {
24060           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
24061               && REG_P (XEXP (SET_SRC (curr_set), 0))
24062               && REGNO (XEXP (SET_SRC (curr_set), 0))
24063                  == REGNO (SET_DEST (prev_set))
24064               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
24065                               XEXP (SET_SRC (curr_set), 1)))
24066             return true;
24067         }
24068     }
24069 
24070   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
24071     {
24072 
24073       /* We're trying to match:
24074          prev (movk) == (set (zero_extract (reg r0)
24075                                            (const_int 16)
24076                                            (const_int 32))
24077                              (const_int imm16_1))
24078          curr (movk) == (set (zero_extract (reg r0)
24079                                            (const_int 16)
24080                                            (const_int 48))
24081                              (const_int imm16_2))  */
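      /* Illustrative only: these are e.g. the third and fourth instructions
	 of a four-instruction 64-bit immediate move, such as
	     movk x0, #0x1234, lsl #32
	     movk x0, #0x5678, lsl #48  */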
24082 
24083       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
24084           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
24085           && REG_P (XEXP (SET_DEST (prev_set), 0))
24086           && REG_P (XEXP (SET_DEST (curr_set), 0))
24087           && REGNO (XEXP (SET_DEST (prev_set), 0))
24088              == REGNO (XEXP (SET_DEST (curr_set), 0))
24089           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
24090           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
24091           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
24092           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
24093           && CONST_INT_P (SET_SRC (prev_set))
24094           && CONST_INT_P (SET_SRC (curr_set)))
24095         return true;
24096 
24097     }
24098   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
24099     {
24100       /* We're trying to match:
24101           prev (adrp) == (set (reg r0)
24102                               (high (symbol_ref ("SYM"))))
24103           curr (ldr) == (set (reg r1)
24104                              (mem (lo_sum (reg r0)
24105                                              (symbol_ref ("SYM")))))
24106                  or
24107           curr (ldr) == (set (reg r1)
24108                              (zero_extend (mem
24109                                            (lo_sum (reg r0)
24110                                                    (symbol_ref ("SYM"))))))  */
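      /* Illustrative only: in assembly, e.g.
	     adrp x0, SYM
	     ldr  w1, [x0, :lo12:SYM]  */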
24111       if (satisfies_constraint_Ush (SET_SRC (prev_set))
24112           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
24113         {
24114           rtx curr_src = SET_SRC (curr_set);
24115 
24116           if (GET_CODE (curr_src) == ZERO_EXTEND)
24117             curr_src = XEXP (curr_src, 0);
24118 
24119           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
24120               && REG_P (XEXP (XEXP (curr_src, 0), 0))
24121               && REGNO (XEXP (XEXP (curr_src, 0), 0))
24122                  == REGNO (SET_DEST (prev_set))
24123               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
24124                               XEXP (SET_SRC (prev_set), 0)))
24125               return true;
24126         }
24127     }
24128 
24129   /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch.  */
24130   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
24131       && prev_set && curr_set && any_condjump_p (curr)
24132       && GET_CODE (SET_SRC (prev_set)) == COMPARE
24133       && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
24134       && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
24135     return true;
24136 
24137   /* Fuse flag-setting ALU instructions and conditional branch.  */
24138   if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
24139       && any_condjump_p (curr))
24140     {
24141       unsigned int condreg1, condreg2;
24142       rtx cc_reg_1;
24143       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
24144       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
24145 
24146       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
24147 	  && prev
24148 	  && modified_in_p (cc_reg_1, prev))
24149 	{
24150 	  enum attr_type prev_type = get_attr_type (prev);
24151 
24152 	  /* FIXME: this misses some instructions that are considered simple
24153 	     arithmetic instructions for ThunderX.  Simple shifts are missed here.  */
24154 	  if (prev_type == TYPE_ALUS_SREG
24155 	      || prev_type == TYPE_ALUS_IMM
24156 	      || prev_type == TYPE_LOGICS_REG
24157 	      || prev_type == TYPE_LOGICS_IMM)
24158 	    return true;
24159 	}
24160     }
24161 
24162   /* Fuse ALU instructions and CBZ/CBNZ.  */
24163   if (prev_set
24164       && curr_set
24165       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
24166       && any_condjump_p (curr))
24167     {
24168       /* We're trying to match:
24169 	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
24170 	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
24171 							 (const_int 0))
24172 						 (label_ref ("SYM"))
24173 						 (pc))  */
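      /* Illustrative only: in assembly, e.g.
	     add  x0, x0, #1
	     cbnz x0, .Ltarget  */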
24174       if (SET_DEST (curr_set) == (pc_rtx)
24175 	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
24176 	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
24177 	  && REG_P (SET_DEST (prev_set))
24178 	  && REGNO (SET_DEST (prev_set))
24179 	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
24180 	{
24181 	  /* Fuse ALU operations followed by conditional branch instruction.  */
24182 	  switch (get_attr_type (prev))
24183 	    {
24184 	    case TYPE_ALU_IMM:
24185 	    case TYPE_ALU_SREG:
24186 	    case TYPE_ADC_REG:
24187 	    case TYPE_ADC_IMM:
24188 	    case TYPE_ADCS_REG:
24189 	    case TYPE_ADCS_IMM:
24190 	    case TYPE_LOGIC_REG:
24191 	    case TYPE_LOGIC_IMM:
24192 	    case TYPE_CSEL:
24193 	    case TYPE_ADR:
24194 	    case TYPE_MOV_IMM:
24195 	    case TYPE_SHIFT_REG:
24196 	    case TYPE_SHIFT_IMM:
24197 	    case TYPE_BFM:
24198 	    case TYPE_RBIT:
24199 	    case TYPE_REV:
24200 	    case TYPE_EXTEND:
24201 	      return true;
24202 
24203 	    default:;
24204 	    }
24205 	}
24206     }
24207 
24208   return false;
24209 }
24210 
24211 /* Return true iff the instruction fusion described by OP is enabled.  */
24212 
24213 bool
24214 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
24215 {
24216   return (aarch64_tune_params.fusible_ops & op) != 0;
24217 }
24218 
24219 /* If MEM is in the form of [base+offset], extract the two parts of the
24220    address and store them in BASE and OFFSET; otherwise return false
24221    after clearing BASE and OFFSET.  */
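/* For example (illustrative), a MEM whose address is
   (plus (reg x1) (const_int 16)) gives BASE == x1 and OFFSET == 16, while a
   bare register address gives that register and a zero offset.  */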
24222 
24223 bool
24224 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
24225 {
24226   rtx addr;
24227 
24228   gcc_assert (MEM_P (mem));
24229 
24230   addr = XEXP (mem, 0);
24231 
24232   if (REG_P (addr))
24233     {
24234       *base = addr;
24235       *offset = const0_rtx;
24236       return true;
24237     }
24238 
24239   if (GET_CODE (addr) == PLUS
24240       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
24241     {
24242       *base = XEXP (addr, 0);
24243       *offset = XEXP (addr, 1);
24244       return true;
24245     }
24246 
24247   *base = NULL_RTX;
24248   *offset = NULL_RTX;
24249 
24250   return false;
24251 }
24252 
24253 /* Types for scheduling fusion.  */
24254 enum sched_fusion_type
24255 {
24256   SCHED_FUSION_NONE = 0,
24257   SCHED_FUSION_LD_SIGN_EXTEND,
24258   SCHED_FUSION_LD_ZERO_EXTEND,
24259   SCHED_FUSION_LD,
24260   SCHED_FUSION_ST,
24261   SCHED_FUSION_NUM
24262 };
24263 
24264 /* If INSN is a load or store whose address is in the form [base+offset],
24265    extract the two parts into BASE and OFFSET.  Return the scheduling
24266    fusion type of INSN.  */
24267 
24268 static enum sched_fusion_type
24269 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
24270 {
24271   rtx x, dest, src;
24272   enum sched_fusion_type fusion = SCHED_FUSION_LD;
24273 
24274   gcc_assert (INSN_P (insn));
24275   x = PATTERN (insn);
24276   if (GET_CODE (x) != SET)
24277     return SCHED_FUSION_NONE;
24278 
24279   src = SET_SRC (x);
24280   dest = SET_DEST (x);
24281 
24282   machine_mode dest_mode = GET_MODE (dest);
24283 
24284   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
24285     return SCHED_FUSION_NONE;
24286 
24287   if (GET_CODE (src) == SIGN_EXTEND)
24288     {
24289       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
24290       src = XEXP (src, 0);
24291       if (!MEM_P (src) || GET_MODE (src) != SImode)
24292 	return SCHED_FUSION_NONE;
24293     }
24294   else if (GET_CODE (src) == ZERO_EXTEND)
24295     {
24296       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
24297       src = XEXP (src, 0);
24298       if (!MEM_P (src) || GET_MODE (src) != SImode)
24299 	return SCHED_FUSION_NONE;
24300     }
24301 
24302   if (MEM_P (src) && REG_P (dest))
24303     extract_base_offset_in_addr (src, base, offset);
24304   else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
24305     {
24306       fusion = SCHED_FUSION_ST;
24307       extract_base_offset_in_addr (dest, base, offset);
24308     }
24309   else
24310     return SCHED_FUSION_NONE;
24311 
24312   if (*base == NULL_RTX || *offset == NULL_RTX)
24313     fusion = SCHED_FUSION_NONE;
24314 
24315   return fusion;
24316 }
24317 
24318 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
24319 
24320    Currently we only support fusing ldr and str instructions, so FUSION_PRI
24321    and PRI are only calculated for these instructions.  For other instructions,
24322    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
24323    types of instruction fusion can be added by returning different priorities.
24324 
24325    It's important that irrelevant instructions get the largest FUSION_PRI.  */
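/* Illustrative example, not tied to any particular testcase: two loads of
   the same fusion type from the same base register get equal FUSION_PRI,
   since it depends only on the fusion type and the base REGNO, while their
   PRI values differ by their offsets, so the access with the smaller offset
   is scheduled first.  */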
24326 
24327 static void
24328 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
24329 			       int *fusion_pri, int *pri)
24330 {
24331   int tmp, off_val;
24332   rtx base, offset;
24333   enum sched_fusion_type fusion;
24334 
24335   gcc_assert (INSN_P (insn));
24336 
24337   tmp = max_pri - 1;
24338   fusion = fusion_load_store (insn, &base, &offset);
24339   if (fusion == SCHED_FUSION_NONE)
24340     {
24341       *pri = tmp;
24342       *fusion_pri = tmp;
24343       return;
24344     }
24345 
24346   /* Set FUSION_PRI according to fusion type and base register.  */
24347   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
24348 
24349   /* Calculate PRI.  */
24350   tmp /= 2;
24351 
24352   /* INSN with smaller offset goes first.  */
24353   off_val = (int)(INTVAL (offset));
24354   if (off_val >= 0)
24355     tmp -= (off_val & 0xfffff);
24356   else
24357     tmp += ((- off_val) & 0xfffff);
24358 
24359   *pri = tmp;
24360   return;
24361 }
24362 
24363 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
24364    Adjust priority of sha1h instructions so they are scheduled before
24365    other SHA1 instructions.  */
24366 
24367 static int
24368 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
24369 {
24370   rtx x = PATTERN (insn);
24371 
24372   if (GET_CODE (x) == SET)
24373     {
24374       x = SET_SRC (x);
24375 
24376       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
24377 	return priority + 10;
24378     }
24379 
24380   return priority;
24381 }
24382 
24383 /* Given OPERANDS of consecutive load/store, check if we can merge
24384    them into ldp/stp.  LOAD is true if they are load instructions.
24385    MODE is the mode of memory operands.  */
24386 
24387 bool
24388 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
24389 				machine_mode mode)
24390 {
24391   HOST_WIDE_INT offval_1, offval_2, msize;
24392   enum reg_class rclass_1, rclass_2;
24393   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
24394 
24395   if (load)
24396     {
24397       mem_1 = operands[1];
24398       mem_2 = operands[3];
24399       reg_1 = operands[0];
24400       reg_2 = operands[2];
24401       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
24402       if (REGNO (reg_1) == REGNO (reg_2))
24403 	return false;
24404     }
24405   else
24406     {
24407       mem_1 = operands[0];
24408       mem_2 = operands[2];
24409       reg_1 = operands[1];
24410       reg_2 = operands[3];
24411     }
24412 
24413   /* The mems cannot be volatile.  */
24414   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
24415     return false;
24416 
24417   /* If we have SImode and a slow unaligned ldp, check that the
24418      alignment is at least 8 bytes.  */
24419   if (mode == SImode
24420       && (aarch64_tune_params.extra_tuning_flags
24421           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
24422       && !optimize_size
24423       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
24424     return false;
24425 
24426   /* Check if the addresses are in the form of [base+offset].  */
24427   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
24428   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
24429     return false;
24430   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
24431   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
24432     return false;
24433 
24434   /* Check if the bases are same.  */
24435   if (!rtx_equal_p (base_1, base_2))
24436     return false;
24437 
24438   /* The operands must be of the same size.  */
24439   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
24440 			 GET_MODE_SIZE (GET_MODE (mem_2))));
24441 
24442   offval_1 = INTVAL (offset_1);
24443   offval_2 = INTVAL (offset_2);
24444   /* We should only be trying this for fixed-sized modes.  There is no
24445      SVE LDP/STP instruction.  */
24446   msize = GET_MODE_SIZE (mode).to_constant ();
24447   /* Check if the offsets are consecutive.  */
24448   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
24449     return false;
24450 
24451   /* Check if the addresses are clobbered by load.  */
24452   if (load)
24453     {
24454       if (reg_mentioned_p (reg_1, mem_1))
24455 	return false;
24456 
24457       /* In increasing order, the last load can clobber the address.  */
24458       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
24459 	return false;
24460     }
24461 
24462   /* One of the memory accesses must be a mempair operand.
24463      If it is not the first one, they need to be swapped by the
24464      peephole.  */
24465   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
24466        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
24467     return false;
24468 
24469   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
24470     rclass_1 = FP_REGS;
24471   else
24472     rclass_1 = GENERAL_REGS;
24473 
24474   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
24475     rclass_2 = FP_REGS;
24476   else
24477     rclass_2 = GENERAL_REGS;
24478 
24479   /* Check if the registers are of same class.  */
24480   if (rclass_1 != rclass_2)
24481     return false;
24482 
24483   return true;
24484 }
24485 
24486 /* Given OPERANDS of consecutive load/store that can be merged,
24487    swap them if they are not in ascending order.  */
24488 void
24489 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
24490 {
24491   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
24492   HOST_WIDE_INT offval_1, offval_2;
24493 
24494   if (load)
24495     {
24496       mem_1 = operands[1];
24497       mem_2 = operands[3];
24498     }
24499   else
24500     {
24501       mem_1 = operands[0];
24502       mem_2 = operands[2];
24503     }
24504 
24505   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
24506   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
24507 
24508   offval_1 = INTVAL (offset_1);
24509   offval_2 = INTVAL (offset_2);
24510 
24511   if (offval_1 > offval_2)
24512     {
24513       /* Irrespective of whether this is a load or a store,
24514 	 we do the same swap.  */
24515       std::swap (operands[0], operands[2]);
24516       std::swap (operands[1], operands[3]);
24517     }
24518 }
24519 
24520 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
24521    comparison between the two.  */
24522 int
24523 aarch64_host_wide_int_compare (const void *x, const void *y)
24524 {
24525   return wi::cmps (* ((const HOST_WIDE_INT *) x),
24526 		   * ((const HOST_WIDE_INT *) y));
24527 }
24528 
24529 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
24530    other pointing to a REG rtx containing an offset, compare the offsets
24531    of the two pairs.
24532 
24533    Return:
24534 
24535 	1 iff offset (X) > offset (Y)
24536 	0 iff offset (X) == offset (Y)
24537 	-1 iff offset (X) < offset (Y)  */
24538 int
24539 aarch64_ldrstr_offset_compare (const void *x, const void *y)
24540 {
24541   const rtx * operands_1 = (const rtx *) x;
24542   const rtx * operands_2 = (const rtx *) y;
24543   rtx mem_1, mem_2, base, offset_1, offset_2;
24544 
24545   if (MEM_P (operands_1[0]))
24546     mem_1 = operands_1[0];
24547   else
24548     mem_1 = operands_1[1];
24549 
24550   if (MEM_P (operands_2[0]))
24551     mem_2 = operands_2[0];
24552   else
24553     mem_2 = operands_2[1];
24554 
24555   /* Extract the offsets.  */
24556   extract_base_offset_in_addr (mem_1, &base, &offset_1);
24557   extract_base_offset_in_addr (mem_2, &base, &offset_2);
24558 
24559   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
24560 
24561   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
24562 }
24563 
24564 /* Given OPERANDS of consecutive load/store, check if we can merge
24565    them into ldp/stp by adjusting the offset.  LOAD is true if they
24566    are load instructions.  MODE is the mode of memory operands.
24567 
24568    Given below consecutive stores:
24569 
24570      str  w1, [xb, 0x100]
24571      str  w1, [xb, 0x104]
24572      str  w1, [xb, 0x108]
24573      str  w1, [xb, 0x10c]
24574 
24575    Though the offsets are out of the range supported by stp, we can
24576    still pair them after adjusting the offset, like:
24577 
24578      add  scratch, xb, 0x100
24579      stp  w1, w1, [scratch]
24580      stp  w1, w1, [scratch, 0x8]
24581 
24582    The peephole patterns detecting this opportunity should guarantee
24583    the scratch register is available.  */
24584 
24585 bool
24586 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
24587 				       machine_mode mode)
24588 {
24589   const int num_insns = 4;
24590   enum reg_class rclass;
24591   HOST_WIDE_INT offvals[num_insns], msize;
24592   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
24593 
24594   if (load)
24595     {
24596       for (int i = 0; i < num_insns; i++)
24597 	{
24598 	  reg[i] = operands[2 * i];
24599 	  mem[i] = operands[2 * i + 1];
24600 
24601 	  gcc_assert (REG_P (reg[i]));
24602 	}
24603 
24604       /* Do not attempt to merge the loads if the loads clobber each other.  */
24605       for (int i = 0; i < 8; i += 2)
24606 	for (int j = i + 2; j < 8; j += 2)
24607 	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
24608 	    return false;
24609     }
24610   else
24611     for (int i = 0; i < num_insns; i++)
24612       {
24613 	mem[i] = operands[2 * i];
24614 	reg[i] = operands[2 * i + 1];
24615       }
24616 
24617   /* Skip if memory operand is by itself valid for ldp/stp.  */
24618   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
24619     return false;
24620 
24621   for (int i = 0; i < num_insns; i++)
24622     {
24623       /* The mems cannot be volatile.  */
24624       if (MEM_VOLATILE_P (mem[i]))
24625 	return false;
24626 
24627       /* Check if the addresses are in the form of [base+offset].  */
24628       extract_base_offset_in_addr (mem[i], base + i, offset + i);
24629       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
24630 	return false;
24631     }
24632 
24633   /* Check if the registers are of same class.  */
24634   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
24635     ? FP_REGS : GENERAL_REGS;
24636 
24637   for (int i = 1; i < num_insns; i++)
24638     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
24639       {
24640 	if (rclass != FP_REGS)
24641 	  return false;
24642       }
24643     else
24644       {
24645 	if (rclass != GENERAL_REGS)
24646 	  return false;
24647       }
24648 
24649   /* Only the last register in the order in which they occur
24650      may be clobbered by the load.  */
24651   if (rclass == GENERAL_REGS && load)
24652     for (int i = 0; i < num_insns - 1; i++)
24653       if (reg_mentioned_p (reg[i], mem[i]))
24654 	return false;
24655 
24656   /* Check if the bases are same.  */
24657   for (int i = 0; i < num_insns - 1; i++)
24658     if (!rtx_equal_p (base[i], base[i + 1]))
24659       return false;
24660 
24661   for (int i = 0; i < num_insns; i++)
24662     offvals[i] = INTVAL (offset[i]);
24663 
24664   msize = GET_MODE_SIZE (mode).to_constant ();
24665 
24666   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
24667   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
24668 	 aarch64_host_wide_int_compare);
24669 
24670   if (!(offvals[1] == offvals[0] + msize
24671 	&& offvals[3] == offvals[2] + msize))
24672     return false;
24673 
24674   /* Check that offsets are within range of each other.  The ldp/stp
24675      instructions have 7 bit immediate offsets, so use 0x80.  */
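  /* For instance (illustrative), with msize == 8 for DImode the two pairs
     must start less than 8 * 0x80 == 1024 bytes apart.  */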
24676   if (offvals[2] - offvals[0] >= msize * 0x80)
24677     return false;
24678 
24679   /* The offsets must be aligned with respect to each other.  */
24680   if (offvals[0] % msize != offvals[2] % msize)
24681     return false;
24682 
24683   /* If we have SImode and a slow unaligned ldp, check that the
24684      alignment is at least 8 bytes.  */
24685   if (mode == SImode
24686       && (aarch64_tune_params.extra_tuning_flags
24687 	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
24688       && !optimize_size
24689       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
24690     return false;
24691 
24692   return true;
24693 }
24694 
24695 /* Given OPERANDS of consecutive load/store, this function pairs them
24696    into LDP/STP after adjusting the offset.  It depends on the fact
24697    that the operands can be sorted so the offsets are correct for STP.
24698    MODE is the mode of memory operands.  CODE is the rtl operator
24699    which should be applied to all memory operands, it's SIGN_EXTEND,
24700    ZERO_EXTEND or UNKNOWN.  */
24701 
24702 bool
24703 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
24704 			     machine_mode mode, RTX_CODE code)
24705 {
24706   rtx base, offset_1, offset_3, t1, t2;
24707   rtx mem_1, mem_2, mem_3, mem_4;
24708   rtx temp_operands[8];
24709   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
24710 		stp_off_upper_limit, stp_off_lower_limit, msize;
24711 
24712   /* We make changes on a copy as we may still bail out.  */
24713   for (int i = 0; i < 8; i ++)
24714     temp_operands[i] = operands[i];
24715 
24716   /* Sort the operands.  */
24717   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
24718 
24719   /* Copy the memory operands so that if we have to bail for some
24720      reason the original addresses are unchanged.  */
24721   if (load)
24722     {
24723       mem_1 = copy_rtx (temp_operands[1]);
24724       mem_2 = copy_rtx (temp_operands[3]);
24725       mem_3 = copy_rtx (temp_operands[5]);
24726       mem_4 = copy_rtx (temp_operands[7]);
24727     }
24728   else
24729     {
24730       mem_1 = copy_rtx (temp_operands[0]);
24731       mem_2 = copy_rtx (temp_operands[2]);
24732       mem_3 = copy_rtx (temp_operands[4]);
24733       mem_4 = copy_rtx (temp_operands[6]);
24734       gcc_assert (code == UNKNOWN);
24735     }
24736 
24737   extract_base_offset_in_addr (mem_1, &base, &offset_1);
24738   extract_base_offset_in_addr (mem_3, &base, &offset_3);
24739   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
24740 	      && offset_3 != NULL_RTX);
24741 
24742   /* Adjust offset so it can fit in LDP/STP instruction.  */
24743   msize = GET_MODE_SIZE (mode).to_constant();
24744   stp_off_upper_limit = msize * (0x40 - 1);
24745   stp_off_lower_limit = - msize * 0x40;
24746 
24747   off_val_1 = INTVAL (offset_1);
24748   off_val_3 = INTVAL (offset_3);
24749 
24750   /* The base offset is optimally half way between the two STP/LDP offsets.  */
24751   if (msize <= 4)
24752     base_off = (off_val_1 + off_val_3) / 2;
24753   else
24754     /* However, due to issues with negative LDP/STP offset generation for
24755        larger modes (DF, DI and vector modes), we must not use negative
24756        addresses smaller than what 9 signed unadjusted bits can store.  This
24757        provides the most range in this case.  */
24758     base_off = off_val_1;
24759 
24760   /* Adjust the base so that it is aligned with the addresses but still
24761      optimal.  */
24762   if (base_off % msize != off_val_1 % msize)
24763     /* Fix the offset, bearing in mind we want to make it bigger not
24764        smaller.  */
24765     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
24766   else if (msize <= 4)
24767     /* The negative range of LDP/STP is one larger than the positive range.  */
24768     base_off += msize;
24769 
24770   /* Check if base offset is too big or too small.  We can attempt to resolve
24771      this issue by setting it to the maximum value and seeing if the offsets
24772      still fit.  */
24773   if (base_off >= 0x1000)
24774     {
24775       base_off = 0x1000 - 1;
24776       /* We must still make sure that the base offset is aligned with respect
24777 	 to the address.  But it may not be made any bigger.  */
24778       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
24779     }
24780 
24781   /* Likewise for the case where the base is too small.  */
24782   if (base_off <= -0x1000)
24783     {
24784       base_off = -0x1000 + 1;
24785       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
24786     }
24787 
24788   /* Offset of the first STP/LDP.  */
24789   new_off_1 = off_val_1 - base_off;
24790 
24791   /* Offset of the second STP/LDP.  */
24792   new_off_3 = off_val_3 - base_off;
24793 
24794   /* The offsets must be within the range of the LDP/STP instructions.  */
24795   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
24796       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
24797     return false;
24798 
24799   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
24800 						  new_off_1), true);
24801   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
24802 						  new_off_1 + msize), true);
24803   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
24804 						  new_off_3), true);
24805   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
24806 						  new_off_3 + msize), true);
24807 
24808   if (!aarch64_mem_pair_operand (mem_1, mode)
24809       || !aarch64_mem_pair_operand (mem_3, mode))
24810     return false;
24811 
24812   if (code == ZERO_EXTEND)
24813     {
24814       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
24815       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
24816       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
24817       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
24818     }
24819   else if (code == SIGN_EXTEND)
24820     {
24821       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
24822       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
24823       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
24824       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
24825     }
24826 
24827   if (load)
24828     {
24829       operands[0] = temp_operands[0];
24830       operands[1] = mem_1;
24831       operands[2] = temp_operands[2];
24832       operands[3] = mem_2;
24833       operands[4] = temp_operands[4];
24834       operands[5] = mem_3;
24835       operands[6] = temp_operands[6];
24836       operands[7] = mem_4;
24837     }
24838   else
24839     {
24840       operands[0] = mem_1;
24841       operands[1] = temp_operands[1];
24842       operands[2] = mem_2;
24843       operands[3] = temp_operands[3];
24844       operands[4] = mem_3;
24845       operands[5] = temp_operands[5];
24846       operands[6] = mem_4;
24847       operands[7] = temp_operands[7];
24848     }
24849 
24850   /* Emit adjusting instruction.  */
24851   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
24852   /* Emit ldp/stp instructions.  */
24853   t1 = gen_rtx_SET (operands[0], operands[1]);
24854   t2 = gen_rtx_SET (operands[2], operands[3]);
24855   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
24856   t1 = gen_rtx_SET (operands[4], operands[5]);
24857   t2 = gen_rtx_SET (operands[6], operands[7]);
24858   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
24859   return true;
24860 }
24861 
24862 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
24863    it isn't worth branching around empty masked ops (including masked
24864    stores).  */
24865 
24866 static bool
24867 aarch64_empty_mask_is_expensive (unsigned)
24868 {
24869   return false;
24870 }
24871 
24872 /* Return true if a pseudo register should be created and used to hold
24873    the GOT address for PIC code.  */
24874 
24875 bool
24876 aarch64_use_pseudo_pic_reg (void)
24877 {
24878   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
24879 }
24880 
24881 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
24882 
24883 static int
24884 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
24885 {
24886   switch (XINT (x, 1))
24887     {
24888     case UNSPEC_GOTSMALLPIC:
24889     case UNSPEC_GOTSMALLPIC28K:
24890     case UNSPEC_GOTTINYPIC:
24891       return 0;
24892     default:
24893       break;
24894     }
24895 
24896   return default_unspec_may_trap_p (x, flags);
24897 }
24898 
24899 
24900 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
24901    return the log2 of that value.  Otherwise return -1.  */
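/* For example (illustrative), a CONST_DOUBLE of 8.0 yields 3, whereas 3.0,
   -4.0 and 0.5 all yield -1.  */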
24902 
24903 int
24904 aarch64_fpconst_pow_of_2 (rtx x)
24905 {
24906   const REAL_VALUE_TYPE *r;
24907 
24908   if (!CONST_DOUBLE_P (x))
24909     return -1;
24910 
24911   r = CONST_DOUBLE_REAL_VALUE (x);
24912 
24913   if (REAL_VALUE_NEGATIVE (*r)
24914       || REAL_VALUE_ISNAN (*r)
24915       || REAL_VALUE_ISINF (*r)
24916       || !real_isinteger (r, DFmode))
24917     return -1;
24918 
24919   return exact_log2 (real_to_integer (r));
24920 }
24921 
24922 /* If X is a positive CONST_DOUBLE whose value is the reciprocal of a
24923    power of 2 (i.e. 1/2^n), return the exponent n; e.g. for X == 1/2^n
24924    return n.  Otherwise return -1.  */
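/* For example (illustrative), 0.25 == 1/2^2 yields 2 and 0.125 yields 3,
   whereas 8.0 and any negative value yield -1.  */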
24925 
24926 int
24927 aarch64_fpconst_pow2_recip (rtx x)
24928 {
24929   REAL_VALUE_TYPE r0;
24930 
24931   if (!CONST_DOUBLE_P (x))
24932     return -1;
24933 
24934   r0 = *CONST_DOUBLE_REAL_VALUE (x);
24935   if (exact_real_inverse (DFmode, &r0)
24936       && !REAL_VALUE_NEGATIVE (r0))
24937     {
24938 	int ret = exact_log2 (real_to_integer (&r0));
24939 	if (ret >= 1 && ret <= 32)
24940 	    return ret;
24941     }
24942   return -1;
24943 }
24944 
24945 /* If X is a vector of equal CONST_DOUBLE values and that value is
24946    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
24947 
24948 int
24949 aarch64_vec_fpconst_pow_of_2 (rtx x)
24950 {
24951   int nelts;
24952   if (GET_CODE (x) != CONST_VECTOR
24953       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
24954     return -1;
24955 
24956   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
24957     return -1;
24958 
24959   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
24960   if (firstval <= 0)
24961     return -1;
24962 
24963   for (int i = 1; i < nelts; i++)
24964     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
24965       return -1;
24966 
24967   return firstval;
24968 }
24969 
24970 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
24971    to float.
24972 
24973    __fp16 always promotes through this hook.
24974    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
24975    through the generic excess precision logic rather than here.  */
24976 
24977 static tree
24978 aarch64_promoted_type (const_tree t)
24979 {
24980   if (SCALAR_FLOAT_TYPE_P (t)
24981       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
24982     return float_type_node;
24983 
24984   return NULL_TREE;
24985 }
24986 
24987 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
24988 
24989 static bool
24990 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
24991 			   optimization_type opt_type)
24992 {
24993   switch (op)
24994     {
24995     case rsqrt_optab:
24996       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
24997 
24998     default:
24999       return true;
25000     }
25001 }
25002 
25003 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
25004 
25005 static unsigned int
25006 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
25007 					int *offset)
25008 {
25009   /* Polynomial invariant 1 == (VG / 2) - 1.  */
25010   gcc_assert (i == 1);
25011   *factor = 2;
25012   *offset = 1;
25013   return AARCH64_DWARF_VG;
25014 }
25015 
25016 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
25017    if MODE is HFmode, and punt to the generic implementation otherwise.  */
25018 
25019 static bool
25020 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
25021 {
25022   return (mode == HFmode
25023 	  ? true
25024 	  : default_libgcc_floating_mode_supported_p (mode));
25025 }
25026 
25027 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
25028    if MODE is HFmode, and punt to the generic implementation otherwise.  */
25029 
25030 static bool
25031 aarch64_scalar_mode_supported_p (scalar_mode mode)
25032 {
25033   return (mode == HFmode
25034 	  ? true
25035 	  : default_scalar_mode_supported_p (mode));
25036 }
25037 
25038 /* Set the value of FLT_EVAL_METHOD.
25039    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
25040 
25041     0: evaluate all operations and constants, whose semantic type has at
25042        most the range and precision of type float, to the range and
25043        precision of float; evaluate all other operations and constants to
25044        the range and precision of the semantic type;
25045 
25046     N, where _FloatN is a supported interchange floating type
25047        evaluate all operations and constants, whose semantic type has at
25048        most the range and precision of _FloatN type, to the range and
25049        precision of the _FloatN type; evaluate all other operations and
25050        constants to the range and precision of the semantic type;
25051 
25052    If we have the ARMv8.2-A extensions then we support _Float16 in native
25053    precision, so we should set this to 16.  Otherwise, we support the type,
25054    but want to evaluate expressions in float precision, so set this to
25055    0.  */
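/* A minimal illustration, assuming a hypothetical expression A + B with both
   operands of type _Float16: with TARGET_FP_F16INST the addition is done
   directly in 16-bit precision; otherwise both operands are converted to
   float, added, and the result converted back to _Float16.  */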
25056 
25057 static enum flt_eval_method
25058 aarch64_excess_precision (enum excess_precision_type type)
25059 {
25060   switch (type)
25061     {
25062       case EXCESS_PRECISION_TYPE_FAST:
25063       case EXCESS_PRECISION_TYPE_STANDARD:
25064 	/* We can calculate either in 16-bit range and precision or
25065 	   32-bit range and precision.  Make that decision based on whether
25066 	   we have native support for the ARMv8.2-A 16-bit floating-point
25067 	   instructions or not.  */
25068 	return (TARGET_FP_F16INST
25069 		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
25070 		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
25071       case EXCESS_PRECISION_TYPE_IMPLICIT:
25072 	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
25073       default:
25074 	gcc_unreachable ();
25075     }
25076   return FLT_EVAL_METHOD_UNPREDICTABLE;
25077 }
25078 
25079 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
25080    scheduled for speculative execution.  Reject the long-running division
25081    and square-root instructions.  */
25082 
25083 static bool
25084 aarch64_sched_can_speculate_insn (rtx_insn *insn)
25085 {
25086   switch (get_attr_type (insn))
25087     {
25088       case TYPE_SDIV:
25089       case TYPE_UDIV:
25090       case TYPE_FDIVS:
25091       case TYPE_FDIVD:
25092       case TYPE_FSQRTS:
25093       case TYPE_FSQRTD:
25094       case TYPE_NEON_FP_SQRT_S:
25095       case TYPE_NEON_FP_SQRT_D:
25096       case TYPE_NEON_FP_SQRT_S_Q:
25097       case TYPE_NEON_FP_SQRT_D_Q:
25098       case TYPE_NEON_FP_DIV_S:
25099       case TYPE_NEON_FP_DIV_D:
25100       case TYPE_NEON_FP_DIV_S_Q:
25101       case TYPE_NEON_FP_DIV_D_Q:
25102 	return false;
25103       default:
25104 	return true;
25105     }
25106 }
25107 
25108 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
25109 
25110 static int
25111 aarch64_compute_pressure_classes (reg_class *classes)
25112 {
25113   int i = 0;
25114   classes[i++] = GENERAL_REGS;
25115   classes[i++] = FP_REGS;
25116   /* PR_REGS isn't a useful pressure class because many predicate pseudo
25117      registers need to go in PR_LO_REGS at some point during their
25118      lifetime.  Splitting it into two halves has the effect of making
25119      all predicates count against PR_LO_REGS, so that we try whenever
25120      possible to restrict the number of live predicates to 8.  This
25121      greatly reduces the amount of spilling in certain loops.  */
25122   classes[i++] = PR_LO_REGS;
25123   classes[i++] = PR_HI_REGS;
25124   return i;
25125 }
25126 
25127 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
25128 
25129 static bool
25130 aarch64_can_change_mode_class (machine_mode from,
25131 			       machine_mode to, reg_class_t)
25132 {
25133   unsigned int from_flags = aarch64_classify_vector_mode (from);
25134   unsigned int to_flags = aarch64_classify_vector_mode (to);
25135 
25136   bool from_sve_p = (from_flags & VEC_ANY_SVE);
25137   bool to_sve_p = (to_flags & VEC_ANY_SVE);
25138 
25139   bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
25140   bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
25141 
25142   bool from_pred_p = (from_flags & VEC_SVE_PRED);
25143   bool to_pred_p = (to_flags & VEC_SVE_PRED);
25144 
25145   /* Don't allow changes between predicate modes and other modes.
25146      Only predicate registers can hold predicate modes and only
25147      non-predicate registers can hold non-predicate modes, so any
25148      attempt to mix them would require a round trip through memory.  */
25149   if (from_pred_p != to_pred_p)
25150     return false;
25151 
25152   /* Don't allow changes between partial SVE modes and other modes.
25153      The contents of partial SVE modes are distributed evenly across
25154      the register, whereas GCC expects them to be clustered together.  */
25155   if (from_partial_sve_p != to_partial_sve_p)
25156     return false;
25157 
25158   /* Similarly reject changes between partial SVE modes that have
25159      different patterns of significant and insignificant bits.  */
25160   if (from_partial_sve_p
25161       && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
25162 	  || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
25163     return false;
25164 
25165   if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
25166     {
25167       /* Don't allow changes between SVE modes and other modes that might
25168 	 be bigger than 128 bits.  In particular, OImode, CImode and XImode
25169 	 divide into 128-bit quantities while SVE modes divide into
25170 	 BITS_PER_SVE_VECTOR quantities.  */
25171       if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
25172 	return false;
25173       if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
25174 	return false;
25175     }
25176 
25177   if (BYTES_BIG_ENDIAN)
25178     {
25179       /* Don't allow changes between SVE data modes and non-SVE modes.
25180 	 See the comment at the head of aarch64-sve.md for details.  */
25181       if (from_sve_p != to_sve_p)
25182 	return false;
25183 
25184       /* Don't allow changes in element size: lane 0 of the new vector
25185 	 would not then be lane 0 of the old vector.  See the comment
25186 	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
25187 	 description.
25188 
25189 	 In the worst case, this forces a register to be spilled in
25190 	 one mode and reloaded in the other, which handles the
25191 	 endianness correctly.  */
25192       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
25193 	return false;
25194     }
25195   return true;
25196 }
25197 
25198 /* Implement TARGET_EARLY_REMAT_MODES.  */
25199 
25200 static void
25201 aarch64_select_early_remat_modes (sbitmap modes)
25202 {
25203   /* SVE values are not normally live across a call, so it should be
25204      worth doing early rematerialization even in VL-specific mode.  */
25205   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
25206     if (aarch64_sve_mode_p ((machine_mode) i))
25207       bitmap_set_bit (modes, i);
25208 }
25209 
25210 /* Override the default target speculation_safe_value.  */
25211 static rtx
25212 aarch64_speculation_safe_value (machine_mode mode,
25213 				rtx result, rtx val, rtx failval)
25214 {
25215   /* Maybe we should warn if falling back to hard barriers.  They are
25216      likely to be noticeably more expensive than the alternative below.  */
25217   if (!aarch64_track_speculation)
25218     return default_speculation_safe_value (mode, result, val, failval);
25219 
25220   if (!REG_P (val))
25221     val = copy_to_mode_reg (mode, val);
25222 
25223   if (!aarch64_reg_or_zero (failval, mode))
25224     failval = copy_to_mode_reg (mode, failval);
25225 
25226   emit_insn (gen_despeculate_copy (mode, result, val, failval));
25227   return result;
25228 }
25229 
25230 /* Implement TARGET_ESTIMATED_POLY_VALUE.
25231    Look into the tuning structure for an estimate.
25232    KIND specifies the type of requested estimate: min, max or likely.
25233    For cores with a known SVE width all three estimates are the same.
25234    For generic SVE tuning we want to distinguish the maximum estimate from
25235    the minimum and likely ones.
25236    The likely estimate is the same as the minimum in that case to give a
25237    conservative behavior of auto-vectorizing with SVE when it is a win
25238    even for 128-bit SVE.
25239    When SVE width information is available VAL.coeffs[1] is multiplied by
25240    the number of VQ chunks over the initial Advanced SIMD 128 bits.  */
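/* Illustrative example with hypothetical values: for VAL == 2 + 2x and a
   core tuned with sve_width == 256, over_128 is 128 and the estimate is
   2 + 2 * 128 / 128 == 4, i.e. the element count of a 256-bit vector of
   64-bit elements.  */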
25241 
25242 static HOST_WIDE_INT
25243 aarch64_estimated_poly_value (poly_int64 val,
25244 			      poly_value_estimate_kind kind
25245 				= POLY_VALUE_LIKELY)
25246 {
25247   unsigned int width_source = aarch64_tune_params.sve_width;
25248 
25249   /* If there is no core-specific information then the minimum and likely
25250      values are based on 128-bit vectors and the maximum is based on
25251      the architectural maximum of 2048 bits.  */
25252   if (width_source == SVE_SCALABLE)
25253     switch (kind)
25254       {
25255       case POLY_VALUE_MIN:
25256       case POLY_VALUE_LIKELY:
25257 	return val.coeffs[0];
25258       case POLY_VALUE_MAX:
25259 	  return val.coeffs[0] + val.coeffs[1] * 15;
25260       }
25261 
25262   /* Allow sve_width to be a bitmask of different VL, treating the lowest
25263      as likely.  This could be made more general if future -mtune options
25264      need it to be.  */
25265   if (kind == POLY_VALUE_MAX)
25266     width_source = 1 << floor_log2 (width_source);
25267   else
25268     width_source = least_bit_hwi (width_source);
25269 
25270   /* If the core provides width information, use that.  */
25271   HOST_WIDE_INT over_128 = width_source - 128;
25272   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
25273 }
25274 
25275 
25276 /* Return true for types that could be supported as SIMD return or
25277    argument types.  */
25278 
25279 static bool
25280 supported_simd_type (tree t)
25281 {
25282   if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
25283     {
25284       HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
25285       return s == 1 || s == 2 || s == 4 || s == 8;
25286     }
25287   return false;
25288 }
25289 
25290 /* Return true for types that currently are supported as SIMD return
25291    or argument types.  */
25292 
25293 static bool
25294 currently_supported_simd_type (tree t, tree b)
25295 {
25296   if (COMPLEX_FLOAT_TYPE_P (t))
25297     return false;
25298 
25299   if (TYPE_SIZE (t) != TYPE_SIZE (b))
25300     return false;
25301 
25302   return supported_simd_type (t);
25303 }
25304 
25305 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
25306 
25307 static int
25308 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
25309 					struct cgraph_simd_clone *clonei,
25310 					tree base_type, int num)
25311 {
25312   tree t, ret_type;
25313   unsigned int elt_bits, count;
25314   unsigned HOST_WIDE_INT const_simdlen;
25315   poly_uint64 vec_bits;
25316 
25317   if (!TARGET_SIMD)
25318     return 0;
25319 
25320   /* For now, SVE simdclones won't produce an illegal simdlen, so only check
25321      constant simdlens here.  */
25322   if (maybe_ne (clonei->simdlen, 0U)
25323       && clonei->simdlen.is_constant (&const_simdlen)
25324       && (const_simdlen < 2
25325 	  || const_simdlen > 1024
25326 	  || (const_simdlen & (const_simdlen - 1)) != 0))
25327     {
25328       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25329 		  "unsupported simdlen %wd", const_simdlen);
25330       return 0;
25331     }
25332 
25333   ret_type = TREE_TYPE (TREE_TYPE (node->decl));
25334   if (TREE_CODE (ret_type) != VOID_TYPE
25335       && !currently_supported_simd_type (ret_type, base_type))
25336     {
25337       if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
25338 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25339 		    "GCC does not currently support mixed size types "
25340 		    "for %<simd%> functions");
25341       else if (supported_simd_type (ret_type))
25342 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25343 		    "GCC does not currently support return type %qT "
25344 		    "for %<simd%> functions", ret_type);
25345       else
25346 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25347 		    "unsupported return type %qT for %<simd%> functions",
25348 		    ret_type);
25349       return 0;
25350     }
25351 
25352   int i;
25353   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
25354   bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
25355 
25356   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
25357        t && t != void_list_node; t = TREE_CHAIN (t), i++)
25358     {
25359       tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
25360 
25361       if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
25362 	  && !currently_supported_simd_type (arg_type, base_type))
25363 	{
25364 	  if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
25365 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25366 			"GCC does not currently support mixed size types "
25367 			"for %<simd%> functions");
25368 	  else
25369 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25370 			"GCC does not currently support argument type %qT "
25371 			"for %<simd%> functions", arg_type);
25372 	  return 0;
25373 	}
25374     }
25375 
25376   clonei->vecsize_mangle = 'n';
25377   clonei->mask_mode = VOIDmode;
25378   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
25379   if (known_eq (clonei->simdlen, 0U))
25380     {
25381       count = 2;
25382       vec_bits = (num == 0 ? 64 : 128);
25383       clonei->simdlen = exact_div (vec_bits, elt_bits);
25384     }
25385   else
25386     {
25387       count = 1;
25388       vec_bits = clonei->simdlen * elt_bits;
25389       /* For now, SVE simdclones won't produce an illegal simdlen, so only
25390 	 check constant simdlens here.  */
25391       if (clonei->simdlen.is_constant (&const_simdlen)
25392 	  && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
25393 	{
25394 	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25395 		      "GCC does not currently support simdlen %wd for type %qT",
25396 		      const_simdlen, base_type);
25397 	  return 0;
25398 	}
25399     }
25400   clonei->vecsize_int = vec_bits;
25401   clonei->vecsize_float = vec_bits;
25402   return count;
25403 }
25404 
25405 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
25406 
25407 static void
25408 aarch64_simd_clone_adjust (struct cgraph_node *node)
25409 {
25410   /* Add aarch64_vector_pcs target attribute to SIMD clones so they
25411      use the correct ABI.  */
25412 
25413   tree t = TREE_TYPE (node->decl);
25414   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
25415 					TYPE_ATTRIBUTES (t));
25416 }
25417 
25418 /* Implement TARGET_SIMD_CLONE_USABLE.  */
25419 
25420 static int
25421 aarch64_simd_clone_usable (struct cgraph_node *node)
25422 {
25423   switch (node->simdclone->vecsize_mangle)
25424     {
25425     case 'n':
25426       if (!TARGET_SIMD)
25427 	return -1;
25428       return 0;
25429     default:
25430       gcc_unreachable ();
25431     }
25432 }
25433 
25434 /* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */
25435 
25436 static int
25437 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
25438 {
25439   auto check_attr = [&](const char *name) {
25440     tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
25441     tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
25442     if (!attr1 && !attr2)
25443       return true;
25444 
25445     return attr1 && attr2 && attribute_value_equal (attr1, attr2);
25446   };
25447 
25448   if (!check_attr ("aarch64_vector_pcs"))
25449     return 0;
25450   if (!check_attr ("Advanced SIMD type"))
25451     return 0;
25452   if (!check_attr ("SVE type"))
25453     return 0;
25454   if (!check_attr ("SVE sizeless type"))
25455     return 0;
25456   return 1;
25457 }
25458 
25459 /* Implement TARGET_GET_MULTILIB_ABI_NAME.  */
25460 
25461 static const char *
25462 aarch64_get_multilib_abi_name (void)
25463 {
25464   if (TARGET_BIG_END)
25465     return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
25466   return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
25467 }
25468 
25469 /* Implement TARGET_STACK_PROTECT_GUARD.  For a global-variable-based
25470    guard, use the default implementation; otherwise return a null
25471    tree.  */
25472 static tree
25473 aarch64_stack_protect_guard (void)
25474 {
25475   if (aarch64_stack_protector_guard == SSP_GLOBAL)
25476     return default_stack_protect_guard ();
25477 
25478   return NULL_TREE;
25479 }
25480 
25481 /* Return the diagnostic message string if conversion from FROMTYPE to
25482    TOTYPE is not allowed, NULL otherwise.  */
25483 
25484 static const char *
25485 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
25486 {
25487   if (element_mode (fromtype) != element_mode (totype))
25488     {
25489       /* Do not allow conversions to/from BFmode scalar types.  */
25490       if (TYPE_MODE (fromtype) == BFmode)
25491 	return N_("invalid conversion from type %<bfloat16_t%>");
25492       if (TYPE_MODE (totype) == BFmode)
25493 	return N_("invalid conversion to type %<bfloat16_t%>");
25494     }
25495 
25496   /* Conversion allowed.  */
25497   return NULL;
25498 }
25499 
25500 /* Return the diagnostic message string if the unary operation OP is
25501    not permitted on TYPE, NULL otherwise.  */
25502 
25503 static const char *
25504 aarch64_invalid_unary_op (int op, const_tree type)
25505 {
25506   /* Reject all single-operand operations on BFmode except for &.  */
25507   if (element_mode (type) == BFmode && op != ADDR_EXPR)
25508     return N_("operation not permitted on type %<bfloat16_t%>");
25509 
25510   /* Operation allowed.  */
25511   return NULL;
25512 }
25513 
25514 /* Return the diagnostic message string if the binary operation OP is
25515    not permitted on TYPE1 and TYPE2, NULL otherwise.  */
25516 
25517 static const char *
25518 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
25519 			   const_tree type2)
25520 {
25521   /* Reject all 2-operand operations on BFmode.  */
25522   if (element_mode (type1) == BFmode
25523       || element_mode (type2) == BFmode)
25524     return N_("operation not permitted on type %<bfloat16_t%>");
25525 
25526   if (VECTOR_TYPE_P (type1)
25527       && VECTOR_TYPE_P (type2)
25528       && !TYPE_INDIVISIBLE_P (type1)
25529       && !TYPE_INDIVISIBLE_P (type2)
25530       && (aarch64_sve::builtin_type_p (type1)
25531 	  != aarch64_sve::builtin_type_p (type2)))
25532     return N_("cannot combine GNU and SVE vectors in a binary operation");
25533 
25534   /* Operation allowed.  */
25535   return NULL;
25536 }
25537 
25538 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES.  Here we tell the rest of the
25539    compiler that we automatically ignore the top byte of our pointers, which
25540    allows using -fsanitize=hwaddress.  */
25541 bool
25542 aarch64_can_tag_addresses ()
25543 {
25544   return !TARGET_ILP32;
25545 }
25546 
25547 /* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
25548    section at the end if needed.  */
25549 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
25550 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
25551 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
25552 void
25553 aarch64_file_end_indicate_exec_stack ()
25554 {
25555   file_end_indicate_exec_stack ();
25556 
25557   unsigned feature_1_and = 0;
25558   if (aarch64_bti_enabled ())
25559     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
25560 
25561   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
25562     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
25563 
25564   if (feature_1_and)
25565     {
25566       /* Generate .note.gnu.property section.  */
25567       switch_to_section (get_section (".note.gnu.property",
25568 				      SECTION_NOTYPE, NULL));
25569 
25570       /* PT_NOTE header: namesz, descsz, type.
25571 	 namesz = 4 ("GNU\0")
25572 	 descsz = 16 (Size of the program property array)
25573 		  [(12 + padding) * Number of array elements]
25574 	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
25575       assemble_align (POINTER_SIZE);
25576       assemble_integer (GEN_INT (4), 4, 32, 1);
25577       assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
25578       assemble_integer (GEN_INT (5), 4, 32, 1);
25579 
25580       /* PT_NOTE name.  */
25581       assemble_string ("GNU", 4);
25582 
25583       /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
25584 	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
25585 	 datasz = 4
25586 	 data   = feature_1_and.  */
25587       assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
25588       assemble_integer (GEN_INT (4), 4, 32, 1);
25589       assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
25590 
25591       /* Pad the size of the note to the required alignment.  */
25592       assemble_align (POINTER_SIZE);
25593     }
25594 }
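/* For LP64 with both BTI and return-address signing enabled the code above
   would emit a note along these lines (an illustrative sketch, not verbatim
   compiler output):

	.section	.note.gnu.property
	.p2align	3
	.word	4		// namesz ("GNU\0")
	.word	16		// descsz = ROUND_UP (12, 8)
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.p2align	3  */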
25595 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
25596 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
25597 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
25598 
25599 /* Helper function for straight line speculation.
25600    Return what barrier should be emitted for straight line speculation
25601    mitigation.
25602    When not mitigating against straight line speculation this function returns
25603    an empty string.
25604    When mitigating against straight line speculation, use:
25605    * SB when the v8.5-A SB extension is enabled.
25606    * DSB+ISB otherwise.  */
25607 const char *
25608 aarch64_sls_barrier (int mitigation_required)
25609 {
25610   return mitigation_required
25611     ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
25612     : "";
25613 }
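/* For example, aarch64_sls_barrier (true) returns "sb" when the SB
   extension is available and "dsb\tsy\n\tisb" otherwise, so a mitigated
   return might be printed as (illustrative):

	ret
	dsb	sy
	isb  */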
25614 
25615 static GTY (()) tree aarch64_sls_shared_thunks[30];
25616 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
25617 const char *indirect_symbol_names[30] = {
25618     "__call_indirect_x0",
25619     "__call_indirect_x1",
25620     "__call_indirect_x2",
25621     "__call_indirect_x3",
25622     "__call_indirect_x4",
25623     "__call_indirect_x5",
25624     "__call_indirect_x6",
25625     "__call_indirect_x7",
25626     "__call_indirect_x8",
25627     "__call_indirect_x9",
25628     "__call_indirect_x10",
25629     "__call_indirect_x11",
25630     "__call_indirect_x12",
25631     "__call_indirect_x13",
25632     "__call_indirect_x14",
25633     "__call_indirect_x15",
25634     "", /* "__call_indirect_x16",  */
25635     "", /* "__call_indirect_x17",  */
25636     "__call_indirect_x18",
25637     "__call_indirect_x19",
25638     "__call_indirect_x20",
25639     "__call_indirect_x21",
25640     "__call_indirect_x22",
25641     "__call_indirect_x23",
25642     "__call_indirect_x24",
25643     "__call_indirect_x25",
25644     "__call_indirect_x26",
25645     "__call_indirect_x27",
25646     "__call_indirect_x28",
25647     "__call_indirect_x29",
25648 };
25649 
25650 /* Function to create a BLR thunk.  This thunk is used to mitigate straight
25651    line speculation.  Instead of a simple BLR that can be speculated past,
25652    we emit a BL to this thunk, and this thunk contains a BR to the relevant
25653    register.  These thunks have the relevant speculation barriers put after
25654    their indirect branch so that speculation is blocked.
25655 
25656    We use such a thunk so the speculation barriers are kept off the
25657    architecturally executed path in order to reduce the performance overhead.
25658 
25659    When optimizing for size we use stubs shared by the linked object.
25660    When optimizing for performance we emit stubs for each function in the hope
25661    that the branch predictor can better train on jumps specific for a given
25662    function.  */
25663 rtx
25664 aarch64_sls_create_blr_label (int regnum)
25665 {
25666   gcc_assert (STUB_REGNUM_P (regnum));
25667   if (optimize_function_for_size_p (cfun))
25668     {
25669       /* For the thunks shared between different functions in this compilation
25670 	 unit we use a named symbol -- this is just for users to more easily
25671 	 understand the generated assembly.  */
25672       aarch64_sls_shared_thunks_needed = true;
25673       const char *thunk_name = indirect_symbol_names[regnum];
25674       if (aarch64_sls_shared_thunks[regnum] == NULL)
25675 	{
25676 	  /* Build a decl representing this function stub and record it for
25677 	     later.  We build a decl here so we can use the GCC machinery for
25678 	     handling sections automatically (through `get_named_section` and
25679 	     `make_decl_one_only`).  That saves us a lot of trouble handling
25680 	     the specifics of different output file formats.  */
25681 	  tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
25682 				  get_identifier (thunk_name),
25683 				  build_function_type_list (void_type_node,
25684 							    NULL_TREE));
25685 	  DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
25686 					   NULL_TREE, void_type_node);
25687 	  TREE_PUBLIC (decl) = 1;
25688 	  TREE_STATIC (decl) = 1;
25689 	  DECL_IGNORED_P (decl) = 1;
25690 	  DECL_ARTIFICIAL (decl) = 1;
25691 	  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
25692 	  resolve_unique_section (decl, 0, false);
25693 	  aarch64_sls_shared_thunks[regnum] = decl;
25694 	}
25695 
25696       return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
25697     }
25698 
25699   if (cfun->machine->call_via[regnum] == NULL)
25700     cfun->machine->call_via[regnum]
25701       = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
25702   return cfun->machine->call_via[regnum];
25703 }
25704 
25705 /* Helper function for aarch64_sls_emit_blr_function_thunks and
25706    aarch64_sls_emit_shared_blr_thunks below.  */
25707 static void
25708 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
25709 {
25710   /* Save in x16 and branch to that function so this transformation does
25711      not prevent jumping to `BTI c` instructions.  */
25712   asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
25713   asm_fprintf (out_file, "\tbr\tx16\n");
25714 }
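/* E.g. aarch64_sls_emit_function_stub (out_file, 1) prints:

	mov	x16, x1
	br	x16  */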
25715 
25716 /* Emit all BLR stubs for this particular function.
25717    Here we emit all the BLR stubs needed for the current function.  Since we
25718    emit these stubs in a consecutive block we know there will be no speculation
25719    gadgets between each stub, and hence we only emit a speculation barrier at
25720    the end of the stub sequences.
25721 
25722    This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook.  */
25723 void
25724 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
25725 {
25726   if (! aarch64_harden_sls_blr_p ())
25727     return;
25728 
25729   bool any_functions_emitted = false;
25730   /* We must save and restore the current function section since this assembly
25731      is emitted at the end of the function.  This means it can be emitted *just
25732      after* the cold section of a function.  That cold part would be emitted in
25733      a different section.  That switch would trigger a `.cfi_endproc` directive
25734      to be emitted in the original section and a `.cfi_startproc` directive to
25735      be emitted in the new section.  Switching to the original section without
25736      restoring would mean that the `.cfi_endproc` emitted as a function ends
25737      would happen in a different section -- leaving an unmatched
25738      `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
25739      in the standard text section.  */
25740   section *save_text_section = in_section;
25741   switch_to_section (function_section (current_function_decl));
25742   for (int regnum = 0; regnum < 30; ++regnum)
25743     {
25744       rtx specu_label = cfun->machine->call_via[regnum];
25745       if (specu_label == NULL)
25746 	continue;
25747 
25748       targetm.asm_out.print_operand (out_file, specu_label, 0);
25749       asm_fprintf (out_file, ":\n");
25750       aarch64_sls_emit_function_stub (out_file, regnum);
25751       any_functions_emitted = true;
25752     }
25753   if (any_functions_emitted)
25754     /* Can use the SB if need be here, since this stub will only be used
25755       by the current function, and hence for the current target.  */
25756     asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
25757   switch_to_section (save_text_section);
25758 }
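/* For a function whose indirect calls went through x1 and x2 this would
   append something like the following after the function body (the label
   names below are illustrative local labels):

   .L_sls_x1:
	mov	x16, x1
	br	x16
   .L_sls_x2:
	mov	x16, x2
	br	x16
	sb		// or dsb sy; isb without the SB extension  */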
25759 
25760 /* Emit shared BLR stubs for the current compilation unit.
25761    Over the course of compiling this unit we may have converted some BLR
25762    instructions to a BL to a shared stub function.  This is where we emit those
25763    stub functions.
25764    This function is for the stubs shared between different functions in this
25765    compilation unit.  We share when optimizing for size instead of speed.
25766 
25767    This function is called through the TARGET_ASM_FILE_END hook.  */
25768 void
25769 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
25770 {
25771   if (! aarch64_sls_shared_thunks_needed)
25772     return;
25773 
25774   for (int regnum = 0; regnum < 30; ++regnum)
25775     {
25776       tree decl = aarch64_sls_shared_thunks[regnum];
25777       if (!decl)
25778 	continue;
25779 
25780       const char *name = indirect_symbol_names[regnum];
25781       switch_to_section (get_named_section (decl, NULL, 0));
25782       ASM_OUTPUT_ALIGN (out_file, 2);
25783       targetm.asm_out.globalize_label (out_file, name);
25784       /* Only emits if the compiler is configured for an assembler that can
25785 	 handle visibility directives.  */
25786       targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
25787       ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
25788       ASM_OUTPUT_LABEL (out_file, name);
25789       aarch64_sls_emit_function_stub (out_file, regnum);
25790       /* Use the most conservative target to ensure it can always be used by any
25791 	 function in the translation unit.  */
25792       asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
25793       ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
25794     }
25795 }
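/* Each shared stub lands in its own one-only section, roughly as follows
   (a sketch; the exact directives depend on the assembler configuration):

	.global	__call_indirect_x1
	.hidden	__call_indirect_x1
	.type	__call_indirect_x1, %function
   __call_indirect_x1:
	mov	x16, x1
	br	x16
	dsb	sy
	isb
	.size	__call_indirect_x1, .-__call_indirect_x1  */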
25796 
25797 /* Implement TARGET_ASM_FILE_END.  */
25798 void
25799 aarch64_asm_file_end ()
25800 {
25801   aarch64_sls_emit_shared_blr_thunks (asm_out_file);
25802   /* Since this function will be called for the ASM_FILE_END hook, we ensure
25803      that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
25804      for FreeBSD) still gets called.  */
25805 #ifdef TARGET_ASM_FILE_END
25806   TARGET_ASM_FILE_END ();
25807 #endif
25808 }
25809 
25810 const char *
25811 aarch64_indirect_call_asm (rtx addr)
25812 {
25813   gcc_assert (REG_P (addr));
25814   if (aarch64_harden_sls_blr_p ())
25815     {
25816       rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
25817       output_asm_insn ("bl\t%0", &stub_label);
25818     }
25819   else
25820    output_asm_insn ("blr\t%0", &addr);
25821   return "";
25822 }
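/* Illustrative effect: with -mharden-sls=blr an indirect call that would
   normally be printed as
	blr	x1
   is instead printed as
	bl	__call_indirect_x1
   (or a BL to a per-function local label when optimizing for speed), with
   the stubs above providing the MOV/BR sequence and speculation barrier.  */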
25823 
25824 /* Target-specific selftests.  */
25825 
25826 #if CHECKING_P
25827 
25828 namespace selftest {
25829 
25830 /* Selftest for the RTL loader.
25831    Verify that the RTL loader copes with a dump from
25832    print_rtx_function.  This is essentially just a test that class
25833    function_reader can handle a real dump, but it also verifies
25834    that lookup_reg_by_dump_name correctly handles hard regs.
25835    The presence of hard reg names in the dump means that the test is
25836    target-specific, hence it is in this file.  */
25837 
25838 static void
25839 aarch64_test_loading_full_dump ()
25840 {
25841   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
25842 
25843   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
25844 
25845   rtx_insn *insn_1 = get_insn_by_uid (1);
25846   ASSERT_EQ (NOTE, GET_CODE (insn_1));
25847 
25848   rtx_insn *insn_15 = get_insn_by_uid (15);
25849   ASSERT_EQ (INSN, GET_CODE (insn_15));
25850   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
25851 
25852   /* Verify crtl->return_rtx.  */
25853   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
25854   ASSERT_EQ (0, REGNO (crtl->return_rtx));
25855   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
25856 }
25857 
25858 /* Test the fractional_cost class.  */
25859 
25860 static void
25861 aarch64_test_fractional_cost ()
25862 {
25863   using cf = fractional_cost;
25864 
25865   ASSERT_EQ (cf (0, 20), 0);
25866 
25867   ASSERT_EQ (cf (4, 2), 2);
25868   ASSERT_EQ (3, cf (9, 3));
25869 
25870   ASSERT_NE (cf (5, 2), 2);
25871   ASSERT_NE (3, cf (8, 3));
25872 
25873   ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
25874   ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
25875   ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
25876 
25877   ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
25878   ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
25879   ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
25880   ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
25881   ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
25882   ASSERT_EQ (3 - cf (10, 3), 0);
25883 
25884   ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
25885   ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
25886 
25887   ASSERT_TRUE (cf (4, 15) < cf (5, 15));
25888   ASSERT_FALSE (cf (5, 15) < cf (5, 15));
25889   ASSERT_FALSE (cf (6, 15) < cf (5, 15));
25890   ASSERT_TRUE (cf (1, 3) < cf (2, 5));
25891   ASSERT_TRUE (cf (1, 12) < cf (1, 6));
25892   ASSERT_FALSE (cf (5, 3) < cf (5, 3));
25893   ASSERT_TRUE (cf (239, 240) < 1);
25894   ASSERT_FALSE (cf (240, 240) < 1);
25895   ASSERT_FALSE (cf (241, 240) < 1);
25896   ASSERT_FALSE (2 < cf (207, 104));
25897   ASSERT_FALSE (2 < cf (208, 104));
25898   ASSERT_TRUE (2 < cf (209, 104));
25899 
25900   ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
25901   ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
25902   ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
25903   ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
25904   ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
25905   ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
25906   ASSERT_TRUE (cf (239, 240) <= 1);
25907   ASSERT_TRUE (cf (240, 240) <= 1);
25908   ASSERT_FALSE (cf (241, 240) <= 1);
25909   ASSERT_FALSE (2 <= cf (207, 104));
25910   ASSERT_TRUE (2 <= cf (208, 104));
25911   ASSERT_TRUE (2 <= cf (209, 104));
25912 
25913   ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
25914   ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
25915   ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
25916   ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
25917   ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
25918   ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
25919   ASSERT_FALSE (cf (239, 240) >= 1);
25920   ASSERT_TRUE (cf (240, 240) >= 1);
25921   ASSERT_TRUE (cf (241, 240) >= 1);
25922   ASSERT_TRUE (2 >= cf (207, 104));
25923   ASSERT_TRUE (2 >= cf (208, 104));
25924   ASSERT_FALSE (2 >= cf (209, 104));
25925 
25926   ASSERT_FALSE (cf (4, 15) > cf (5, 15));
25927   ASSERT_FALSE (cf (5, 15) > cf (5, 15));
25928   ASSERT_TRUE (cf (6, 15) > cf (5, 15));
25929   ASSERT_FALSE (cf (1, 3) > cf (2, 5));
25930   ASSERT_FALSE (cf (1, 12) > cf (1, 6));
25931   ASSERT_FALSE (cf (5, 3) > cf (5, 3));
25932   ASSERT_FALSE (cf (239, 240) > 1);
25933   ASSERT_FALSE (cf (240, 240) > 1);
25934   ASSERT_TRUE (cf (241, 240) > 1);
25935   ASSERT_TRUE (2 > cf (207, 104));
25936   ASSERT_FALSE (2 > cf (208, 104));
25937   ASSERT_FALSE (2 > cf (209, 104));
25938 
25939   ASSERT_EQ (cf (1, 2).ceil (), 1);
25940   ASSERT_EQ (cf (11, 7).ceil (), 2);
25941   ASSERT_EQ (cf (20, 1).ceil (), 20);
25942   ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
25943   ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
25944   ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
25945   ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
25946   ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
25947 
25948   ASSERT_EQ (cf (1, 2).as_double (), 0.5);
25949 }
25950 
25951 /* Run all target-specific selftests.  */
25952 
25953 static void
25954 aarch64_run_selftests (void)
25955 {
25956   aarch64_test_loading_full_dump ();
25957   aarch64_test_fractional_cost ();
25958 }
25959 
25960 } // namespace selftest
25961 
25962 #endif /* #if CHECKING_P */
25963 
25964 #undef TARGET_STACK_PROTECT_GUARD
25965 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
25966 
25967 #undef TARGET_ADDRESS_COST
25968 #define TARGET_ADDRESS_COST aarch64_address_cost
25969 
25970 /* This hook determines whether unnamed bitfields affect the alignment
25971    of the containing structure.  The hook returns true if the structure
25972    should inherit the alignment requirements of an unnamed bitfield's
25973    type.  */
25974 #undef TARGET_ALIGN_ANON_BITFIELD
25975 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
25976 
25977 #undef TARGET_ASM_ALIGNED_DI_OP
25978 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
25979 
25980 #undef TARGET_ASM_ALIGNED_HI_OP
25981 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
25982 
25983 #undef TARGET_ASM_ALIGNED_SI_OP
25984 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
25985 
25986 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
25987 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
25988   hook_bool_const_tree_hwi_hwi_const_tree_true
25989 
25990 #undef TARGET_ASM_FILE_START
25991 #define TARGET_ASM_FILE_START aarch64_start_file
25992 
25993 #undef TARGET_ASM_OUTPUT_MI_THUNK
25994 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
25995 
25996 #undef TARGET_ASM_SELECT_RTX_SECTION
25997 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
25998 
25999 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
26000 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
26001 
26002 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
26003 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
26004 
26005 #undef TARGET_BUILD_BUILTIN_VA_LIST
26006 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
26007 
26008 #undef TARGET_CALLEE_COPIES
26009 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
26010 
26011 #undef TARGET_CAN_ELIMINATE
26012 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
26013 
26014 #undef TARGET_CAN_INLINE_P
26015 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
26016 
26017 #undef TARGET_CANNOT_FORCE_CONST_MEM
26018 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
26019 
26020 #undef TARGET_CASE_VALUES_THRESHOLD
26021 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
26022 
26023 #undef TARGET_CONDITIONAL_REGISTER_USAGE
26024 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
26025 
26026 #undef TARGET_MEMBER_TYPE_FORCES_BLK
26027 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
26028 
26029 /* Only the least significant bit is used for initialization guard
26030    variables.  */
26031 #undef TARGET_CXX_GUARD_MASK_BIT
26032 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
26033 
26034 #undef TARGET_C_MODE_FOR_SUFFIX
26035 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
26036 
26037 #ifdef TARGET_BIG_ENDIAN_DEFAULT
26038 #undef  TARGET_DEFAULT_TARGET_FLAGS
26039 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
26040 #endif
26041 
26042 #undef TARGET_CLASS_MAX_NREGS
26043 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
26044 
26045 #undef TARGET_BUILTIN_DECL
26046 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
26047 
26048 #undef TARGET_BUILTIN_RECIPROCAL
26049 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
26050 
26051 #undef TARGET_C_EXCESS_PRECISION
26052 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
26053 
26054 #undef  TARGET_EXPAND_BUILTIN
26055 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
26056 
26057 #undef TARGET_EXPAND_BUILTIN_VA_START
26058 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
26059 
26060 #undef TARGET_FOLD_BUILTIN
26061 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
26062 
26063 #undef TARGET_FUNCTION_ARG
26064 #define TARGET_FUNCTION_ARG aarch64_function_arg
26065 
26066 #undef TARGET_FUNCTION_ARG_ADVANCE
26067 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
26068 
26069 #undef TARGET_FUNCTION_ARG_BOUNDARY
26070 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
26071 
26072 #undef TARGET_FUNCTION_ARG_PADDING
26073 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
26074 
26075 #undef TARGET_GET_RAW_RESULT_MODE
26076 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
26077 #undef TARGET_GET_RAW_ARG_MODE
26078 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
26079 
26080 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
26081 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
26082 
26083 #undef TARGET_FUNCTION_VALUE
26084 #define TARGET_FUNCTION_VALUE aarch64_function_value
26085 
26086 #undef TARGET_FUNCTION_VALUE_REGNO_P
26087 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
26088 
26089 #undef TARGET_GIMPLE_FOLD_BUILTIN
26090 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
26091 
26092 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
26093 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
26094 
26095 #undef  TARGET_INIT_BUILTINS
26096 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
26097 
26098 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
26099 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
26100   aarch64_ira_change_pseudo_allocno_class
26101 
26102 #undef TARGET_LEGITIMATE_ADDRESS_P
26103 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
26104 
26105 #undef TARGET_LEGITIMATE_CONSTANT_P
26106 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
26107 
26108 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
26109 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
26110   aarch64_legitimize_address_displacement
26111 
26112 #undef TARGET_LIBGCC_CMP_RETURN_MODE
26113 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
26114 
26115 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
26116 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
26117 aarch64_libgcc_floating_mode_supported_p
26118 
26119 #undef TARGET_MANGLE_TYPE
26120 #define TARGET_MANGLE_TYPE aarch64_mangle_type
26121 
26122 #undef TARGET_INVALID_CONVERSION
26123 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
26124 
26125 #undef TARGET_INVALID_UNARY_OP
26126 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
26127 
26128 #undef TARGET_INVALID_BINARY_OP
26129 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
26130 
26131 #undef TARGET_VERIFY_TYPE_CONTEXT
26132 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
26133 
26134 #undef TARGET_MEMORY_MOVE_COST
26135 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
26136 
26137 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
26138 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
26139 
26140 #undef TARGET_MUST_PASS_IN_STACK
26141 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
26142 
26143 /* This target hook should return true if accesses to volatile bitfields
26144    should use the narrowest mode possible.  It should return false if these
26145    accesses should use the bitfield container type.  */
26146 #undef TARGET_NARROW_VOLATILE_BITFIELD
26147 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
26148 
26149 #undef  TARGET_OPTION_OVERRIDE
26150 #define TARGET_OPTION_OVERRIDE aarch64_override_options
26151 
26152 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
26153 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
26154   aarch64_override_options_after_change
26155 
26156 #undef TARGET_OFFLOAD_OPTIONS
26157 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
26158 
26159 #undef TARGET_OPTION_SAVE
26160 #define TARGET_OPTION_SAVE aarch64_option_save
26161 
26162 #undef TARGET_OPTION_RESTORE
26163 #define TARGET_OPTION_RESTORE aarch64_option_restore
26164 
26165 #undef TARGET_OPTION_PRINT
26166 #define TARGET_OPTION_PRINT aarch64_option_print
26167 
26168 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
26169 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
26170 
26171 #undef TARGET_SET_CURRENT_FUNCTION
26172 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
26173 
26174 #undef TARGET_PASS_BY_REFERENCE
26175 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
26176 
26177 #undef TARGET_PREFERRED_RELOAD_CLASS
26178 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
26179 
26180 #undef TARGET_SCHED_REASSOCIATION_WIDTH
26181 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
26182 
26183 #undef TARGET_PROMOTED_TYPE
26184 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
26185 
26186 #undef TARGET_SECONDARY_RELOAD
26187 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
26188 
26189 #undef TARGET_SHIFT_TRUNCATION_MASK
26190 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
26191 
26192 #undef TARGET_SETUP_INCOMING_VARARGS
26193 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
26194 
26195 #undef TARGET_STRUCT_VALUE_RTX
26196 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
26197 
26198 #undef TARGET_REGISTER_MOVE_COST
26199 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
26200 
26201 #undef TARGET_RETURN_IN_MEMORY
26202 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
26203 
26204 #undef TARGET_RETURN_IN_MSB
26205 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
26206 
26207 #undef TARGET_RTX_COSTS
26208 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
26209 
26210 #undef TARGET_SCALAR_MODE_SUPPORTED_P
26211 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
26212 
26213 #undef TARGET_SCHED_ISSUE_RATE
26214 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
26215 
26216 #undef TARGET_SCHED_VARIABLE_ISSUE
26217 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
26218 
26219 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
26220 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
26221   aarch64_sched_first_cycle_multipass_dfa_lookahead
26222 
26223 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
26224 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
26225   aarch64_first_cycle_multipass_dfa_lookahead_guard
26226 
26227 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
26228 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
26229   aarch64_get_separate_components
26230 
26231 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
26232 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
26233   aarch64_components_for_bb
26234 
26235 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
26236 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
26237   aarch64_disqualify_components
26238 
26239 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
26240 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
26241   aarch64_emit_prologue_components
26242 
26243 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
26244 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
26245   aarch64_emit_epilogue_components
26246 
26247 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
26248 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
26249   aarch64_set_handled_components
26250 
26251 #undef TARGET_TRAMPOLINE_INIT
26252 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
26253 
26254 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
26255 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
26256 
26257 #undef TARGET_VECTOR_MODE_SUPPORTED_P
26258 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
26259 
26260 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
26261 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
26262 
26263 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
26264 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
26265   aarch64_builtin_support_vector_misalignment
26266 
26267 #undef TARGET_ARRAY_MODE
26268 #define TARGET_ARRAY_MODE aarch64_array_mode
26269 
26270 #undef TARGET_ARRAY_MODE_SUPPORTED_P
26271 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
26272 
26273 #undef TARGET_VECTORIZE_INIT_COST
26274 #define TARGET_VECTORIZE_INIT_COST aarch64_init_cost
26275 
26276 #undef TARGET_VECTORIZE_ADD_STMT_COST
26277 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
26278 
26279 #undef TARGET_VECTORIZE_FINISH_COST
26280 #define TARGET_VECTORIZE_FINISH_COST aarch64_finish_cost
26281 
26282 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
26283 #define TARGET_VECTORIZE_DESTROY_COST_DATA aarch64_destroy_cost_data
26284 
26285 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
26286 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
26287   aarch64_builtin_vectorization_cost
26288 
26289 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
26290 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
26291 
26292 #undef TARGET_VECTORIZE_BUILTINS
26293 #define TARGET_VECTORIZE_BUILTINS
26294 
26295 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
26296 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
26297   aarch64_builtin_vectorized_function
26298 
26299 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
26300 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
26301   aarch64_autovectorize_vector_modes
26302 
26303 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
26304 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
26305   aarch64_atomic_assign_expand_fenv
26306 
26307 /* Section anchor support.  */
26308 
26309 #undef TARGET_MIN_ANCHOR_OFFSET
26310 #define TARGET_MIN_ANCHOR_OFFSET -256
26311 
26312 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
26313    byte offset; we can do much more for larger data types, but have no way
26314    to determine the size of the access.  We assume accesses are aligned.  */
26315 #undef TARGET_MAX_ANCHOR_OFFSET
26316 #define TARGET_MAX_ANCHOR_OFFSET 4095
26317 
26318 #undef TARGET_VECTOR_ALIGNMENT
26319 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
26320 
26321 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
26322 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
26323   aarch64_vectorize_preferred_vector_alignment
26324 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
26325 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
26326   aarch64_simd_vector_alignment_reachable
26327 
26328 /* vec_perm support.  */
26329 
26330 #undef TARGET_VECTORIZE_VEC_PERM_CONST
26331 #define TARGET_VECTORIZE_VEC_PERM_CONST \
26332   aarch64_vectorize_vec_perm_const
26333 
26334 #undef TARGET_VECTORIZE_RELATED_MODE
26335 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
26336 #undef TARGET_VECTORIZE_GET_MASK_MODE
26337 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
26338 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
26339 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
26340   aarch64_empty_mask_is_expensive
26341 #undef TARGET_PREFERRED_ELSE_VALUE
26342 #define TARGET_PREFERRED_ELSE_VALUE \
26343   aarch64_preferred_else_value
26344 
26345 #undef TARGET_INIT_LIBFUNCS
26346 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
26347 
26348 #undef TARGET_FIXED_CONDITION_CODE_REGS
26349 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
26350 
26351 #undef TARGET_FLAGS_REGNUM
26352 #define TARGET_FLAGS_REGNUM CC_REGNUM
26353 
26354 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
26355 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
26356 
26357 #undef TARGET_ASAN_SHADOW_OFFSET
26358 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
26359 
26360 #undef TARGET_LEGITIMIZE_ADDRESS
26361 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
26362 
26363 #undef TARGET_SCHED_CAN_SPECULATE_INSN
26364 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
26365 
26366 #undef TARGET_CAN_USE_DOLOOP_P
26367 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
26368 
26369 #undef TARGET_SCHED_ADJUST_PRIORITY
26370 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
26371 
26372 #undef TARGET_SCHED_MACRO_FUSION_P
26373 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
26374 
26375 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
26376 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
26377 
26378 #undef TARGET_SCHED_FUSION_PRIORITY
26379 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
26380 
26381 #undef TARGET_UNSPEC_MAY_TRAP_P
26382 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
26383 
26384 #undef TARGET_USE_PSEUDO_PIC_REG
26385 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
26386 
26387 #undef TARGET_PRINT_OPERAND
26388 #define TARGET_PRINT_OPERAND aarch64_print_operand
26389 
26390 #undef TARGET_PRINT_OPERAND_ADDRESS
26391 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
26392 
26393 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
26394 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
26395 
26396 #undef TARGET_OPTAB_SUPPORTED_P
26397 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
26398 
26399 #undef TARGET_OMIT_STRUCT_RETURN_REG
26400 #define TARGET_OMIT_STRUCT_RETURN_REG true
26401 
26402 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
26403 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
26404   aarch64_dwarf_poly_indeterminate_value
26405 
26406 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
26407 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
26408 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
26409 
26410 #undef TARGET_HARD_REGNO_NREGS
26411 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
26412 #undef TARGET_HARD_REGNO_MODE_OK
26413 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
26414 
26415 #undef TARGET_MODES_TIEABLE_P
26416 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
26417 
26418 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
26419 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
26420   aarch64_hard_regno_call_part_clobbered
26421 
26422 #undef TARGET_INSN_CALLEE_ABI
26423 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
26424 
26425 #undef TARGET_CONSTANT_ALIGNMENT
26426 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
26427 
26428 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
26429 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
26430   aarch64_stack_clash_protection_alloca_probe_range
26431 
26432 #undef TARGET_COMPUTE_PRESSURE_CLASSES
26433 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
26434 
26435 #undef TARGET_CAN_CHANGE_MODE_CLASS
26436 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
26437 
26438 #undef TARGET_SELECT_EARLY_REMAT_MODES
26439 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
26440 
26441 #undef TARGET_SPECULATION_SAFE_VALUE
26442 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
26443 
26444 #undef TARGET_ESTIMATED_POLY_VALUE
26445 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
26446 
26447 #undef TARGET_ATTRIBUTE_TABLE
26448 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
26449 
26450 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
26451 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
26452   aarch64_simd_clone_compute_vecsize_and_simdlen
26453 
26454 #undef TARGET_SIMD_CLONE_ADJUST
26455 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
26456 
26457 #undef TARGET_SIMD_CLONE_USABLE
26458 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
26459 
26460 #undef TARGET_COMP_TYPE_ATTRIBUTES
26461 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
26462 
26463 #undef TARGET_GET_MULTILIB_ABI_NAME
26464 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
26465 
26466 #undef TARGET_FNTYPE_ABI
26467 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
26468 
26469 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
26470 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
26471 
26472 #if CHECKING_P
26473 #undef TARGET_RUN_TARGET_SELFTESTS
26474 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
26475 #endif /* #if CHECKING_P */
26476 
26477 #undef TARGET_ASM_POST_CFI_STARTPROC
26478 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
26479 
26480 #undef TARGET_STRICT_ARGUMENT_NAMING
26481 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
26482 
26483 #undef TARGET_MD_ASM_ADJUST
26484 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
26485 
26486 #undef TARGET_ASM_FILE_END
26487 #define TARGET_ASM_FILE_END aarch64_asm_file_end
26488 
26489 #undef TARGET_ASM_FUNCTION_EPILOGUE
26490 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
26491 
26492 struct gcc_target targetm = TARGET_INITIALIZER;
26493 
26494 #include "gt-aarch64.h"
26495