1 /* Machine description for AArch64 architecture.
2    Copyright (C) 2009-2020 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77 
78 /* This file should be included last.  */
79 #include "target-def.h"
80 
81 /* Defined for convenience.  */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83 
84 /* Information about a legitimate vector immediate operand.  */
85 struct simd_immediate_info
86 {
87   enum insn_type { MOV, MVN, INDEX, PTRUE };
88   enum modifier_type { LSL, MSL };
89 
90   simd_immediate_info () {}
91   simd_immediate_info (scalar_float_mode, rtx);
92   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 		       insn_type = MOV, modifier_type = LSL,
94 		       unsigned int = 0);
95   simd_immediate_info (scalar_mode, rtx, rtx);
96   simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97 
98   /* The mode of the elements.  */
99   scalar_mode elt_mode;
100 
101   /* The instruction to use to move the immediate into a vector.  */
102   insn_type insn;
103 
104   union
105   {
106     /* For MOV and MVN.  */
107     struct
108     {
109       /* The value of each element.  */
110       rtx value;
111 
112       /* The kind of shift modifier to use, and the number of bits to shift.
113 	 This is (LSL, 0) if no shift is needed.  */
114       modifier_type modifier;
115       unsigned int shift;
116     } mov;
117 
118     /* For INDEX.  */
119     struct
120     {
121       /* The value of the first element and the step to be added for each
122 	 subsequent element.  */
123       rtx base, step;
124     } index;
125 
126     /* For PTRUE.  */
127     aarch64_svpattern pattern;
128   } u;
129 };
130 
131 /* Construct a floating-point immediate in which each element has mode
132    ELT_MODE_IN and value VALUE_IN.  */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135   : elt_mode (elt_mode_in), insn (MOV)
136 {
137   u.mov.value = value_in;
138   u.mov.modifier = LSL;
139   u.mov.shift = 0;
140 }
141 
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143    and value VALUE_IN.  The other parameters are as for the structure
144    fields.  */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 		       unsigned HOST_WIDE_INT value_in,
148 		       insn_type insn_in, modifier_type modifier_in,
149 		       unsigned int shift_in)
150   : elt_mode (elt_mode_in), insn (insn_in)
151 {
152   u.mov.value = gen_int_mode (value_in, elt_mode_in);
153   u.mov.modifier = modifier_in;
154   u.mov.shift = shift_in;
155 }
156 
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158    and where element I is equal to BASE_IN + I * STEP_IN.  */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161   : elt_mode (elt_mode_in), insn (INDEX)
162 {
163   u.index.base = base_in;
164   u.index.step = step_in;
165 }
166 
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168    and has PTRUE pattern PATTERN_IN.  */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 		       aarch64_svpattern pattern_in)
172   : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174   u.pattern = pattern_in;
175 }
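
/* Illustrative example (not from the original sources): an Advanced SIMD
   constant whose 32-bit elements are all 0x00ff0000 could be described by
   something along the lines of

     simd_immediate_info (SImode, 0xff, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 16);

   i.e. "move 0xff, shifted left by 16, into each SImode element".  Which
   encoding is actually chosen is decided by callers elsewhere in this file;
   the above only sketches how the fields relate to one another.  */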
176 
177 namespace {
178 
179 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
180 class pure_scalable_type_info
181 {
182 public:
183   /* Represents the result of analyzing a type.  All values are nonzero,
184      in the possibly forlorn hope that accidental conversions to bool
185      trigger a warning.  */
186   enum analysis_result
187   {
188     /* The type does not have an ABI identity; i.e. it doesn't contain
189        at least one object whose type is a Fundamental Data Type.  */
190     NO_ABI_IDENTITY = 1,
191 
192     /* The type is definitely a Pure Scalable Type.  */
193     IS_PST,
194 
195     /* The type is definitely not a Pure Scalable Type.  */
196     ISNT_PST,
197 
198     /* It doesn't matter for PCS purposes whether the type is a Pure
199        Scalable Type or not, since the type will be handled the same
200        way regardless.
201 
202        Specifically, this means that if the type is a Pure Scalable Type,
203        there aren't enough argument registers to hold it, and so it will
204        need to be passed or returned in memory.  If the type isn't a
205        Pure Scalable Type, it's too big to be passed or returned in core
206        or SIMD&FP registers, and so again will need to go in memory.  */
207     DOESNT_MATTER
208   };
209 
210   /* Aggregates of 17 bytes or more are normally passed and returned
211      in memory, so aggregates of that size can safely be analyzed as
212      DOESNT_MATTER.  We need to be able to collect enough pieces to
213      represent a PST that is smaller than that.  Since predicates are
214      2 bytes in size for -msve-vector-bits=128, that means we need to be
215      able to store at least 8 pieces.
216 
217      We also need to be able to store enough pieces to represent
218      a single vector in each vector argument register and a single
219      predicate in each predicate argument register.  This means that
220      we need at least 12 pieces.  */
221   static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
222 #if __cplusplus >= 201103L
223   static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
224 #endif
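
  /* Worked example (illustrative): with -msve-vector-bits=128 a predicate
     is 2 bytes, so a 16-byte aggregate can contain up to 8 predicates,
     giving the lower bound of 8 pieces mentioned above.  The register-side
     bound is one piece per vector argument register plus one per predicate
     argument register, i.e. 8 + 4 = 12, which is what MAX_PIECES evaluates
     to.  A hypothetical PST such as struct { svint32_t v; svbool_t p; }
     needs only two pieces: one vector piece and one predicate piece.  */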
225 
226   /* Describes one piece of a PST.  Each piece is one of:
227 
228      - a single Scalable Vector Type (SVT)
229      - a single Scalable Predicate Type (SPT)
230      - a PST containing 2, 3 or 4 SVTs, with no padding
231 
232      It either represents a single built-in type or a PST formed from
233      multiple homogeneous built-in types.  */
234   struct piece
235   {
236     rtx get_rtx (unsigned int, unsigned int) const;
237 
238     /* The number of vector and predicate registers that the piece
239        occupies.  One of the two is always zero.  */
240     unsigned int num_zr;
241     unsigned int num_pr;
242 
243     /* The mode of the registers described above.  */
244     machine_mode mode;
245 
246     /* If this piece is formed from multiple homogeneous built-in types,
247        this is the mode of the built-in types, otherwise it is MODE.  */
248     machine_mode orig_mode;
249 
250     /* The offset in bytes of the piece from the start of the type.  */
251     poly_uint64_pod offset;
252   };
253 
254   /* Divides types analyzed as IS_PST into individual pieces.  The pieces
255      are in memory order.  */
256   auto_vec<piece, MAX_PIECES> pieces;
257 
258   unsigned int num_zr () const;
259   unsigned int num_pr () const;
260 
261   rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
262 
263   analysis_result analyze (const_tree);
264   bool analyze_registers (const_tree);
265 
266 private:
267   analysis_result analyze_array (const_tree);
268   analysis_result analyze_record (const_tree);
269   void add_piece (const piece &);
270 };
271 }
272 
273 /* The current code model.  */
274 enum aarch64_code_model aarch64_cmodel;
275 
276 /* The number of 64-bit elements in an SVE vector.  */
277 poly_uint16 aarch64_sve_vg;
278 
279 #ifdef HAVE_AS_TLS
280 #undef TARGET_HAVE_TLS
281 #define TARGET_HAVE_TLS 1
282 #endif
283 
284 static bool aarch64_composite_type_p (const_tree, machine_mode);
285 static bool aarch64_return_in_memory_1 (const_tree);
286 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
287 						     const_tree,
288 						     machine_mode *, int *,
289 						     bool *, bool);
290 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
291 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
292 static void aarch64_override_options_after_change (void);
293 static bool aarch64_vector_mode_supported_p (machine_mode);
294 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
295 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
296 							 const_tree type,
297 							 int misalignment,
298 							 bool is_packed);
299 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
300 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
301 					    aarch64_addr_query_type);
302 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
303 
304 /* Major revision number of the ARM Architecture implemented by the target.  */
305 unsigned aarch64_architecture_version;
306 
307 /* The processor for which instructions should be scheduled.  */
308 enum aarch64_processor aarch64_tune = cortexa53;
309 
310 /* Mask to specify which instruction scheduling options should be used.  */
311 uint64_t aarch64_tune_flags = 0;
312 
313 /* Global flag for PC relative loads.  */
314 bool aarch64_pcrelative_literal_loads;
315 
316 /* Global flag for whether frame pointer is enabled.  */
317 bool aarch64_use_frame_pointer;
318 
319 #define BRANCH_PROTECT_STR_MAX 255
320 char *accepted_branch_protection_string = NULL;
321 
322 static enum aarch64_parse_opt_result
323 aarch64_parse_branch_protection (const char*, char**);
324 
325 /* Support for command line parsing of boolean flags in the tuning
326    structures.  */
327 struct aarch64_flag_desc
328 {
329   const char* name;
330   unsigned int flag;
331 };
332 
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334   { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
336 {
337   { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339   { "all", AARCH64_FUSE_ALL },
340   { NULL, AARCH64_FUSE_NOTHING }
341 };
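
/* Illustrative note: the table above is filled in by an X-macro.  An entry
   in aarch64-fusion-pairs.def along the lines of

     AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK)

   expands via the #define above to { "mov+movk", AARCH64_FUSE_MOV_MOVK },
   so each user-visible name maps to its internal AARCH64_FUSE_* flag.  See
   the .def file for the authoritative list of entries.  */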
342 
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344   { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
346 {
347   { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349   { "all", AARCH64_EXTRA_TUNE_ALL },
350   { NULL, AARCH64_EXTRA_TUNE_NONE }
351 };
352 
353 /* Tuning parameters.  */
354 
355 static const struct cpu_addrcost_table generic_addrcost_table =
356 {
357     {
358       1, /* hi  */
359       0, /* si  */
360       0, /* di  */
361       1, /* ti  */
362     },
363   0, /* pre_modify  */
364   0, /* post_modify  */
365   0, /* register_offset  */
366   0, /* register_sextend  */
367   0, /* register_zextend  */
368   0 /* imm_offset  */
369 };
370 
371 static const struct cpu_addrcost_table exynosm1_addrcost_table =
372 {
373     {
374       0, /* hi  */
375       0, /* si  */
376       0, /* di  */
377       2, /* ti  */
378     },
379   0, /* pre_modify  */
380   0, /* post_modify  */
381   1, /* register_offset  */
382   1, /* register_sextend  */
383   2, /* register_zextend  */
384   0, /* imm_offset  */
385 };
386 
387 static const struct cpu_addrcost_table xgene1_addrcost_table =
388 {
389     {
390       1, /* hi  */
391       0, /* si  */
392       0, /* di  */
393       1, /* ti  */
394     },
395   1, /* pre_modify  */
396   1, /* post_modify  */
397   0, /* register_offset  */
398   1, /* register_sextend  */
399   1, /* register_zextend  */
400   0, /* imm_offset  */
401 };
402 
403 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
404 {
405     {
406       1, /* hi  */
407       1, /* si  */
408       1, /* di  */
409       2, /* ti  */
410     },
411   0, /* pre_modify  */
412   0, /* post_modify  */
413   2, /* register_offset  */
414   3, /* register_sextend  */
415   3, /* register_zextend  */
416   0, /* imm_offset  */
417 };
418 
419 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
420 {
421     {
422       1, /* hi  */
423       1, /* si  */
424       1, /* di  */
425       2, /* ti  */
426     },
427   0, /* pre_modify  */
428   0, /* post_modify  */
429   2, /* register_offset  */
430   3, /* register_sextend  */
431   3, /* register_zextend  */
432   0, /* imm_offset  */
433 };
434 
435 static const struct cpu_addrcost_table tsv110_addrcost_table =
436 {
437     {
438       1, /* hi  */
439       0, /* si  */
440       0, /* di  */
441       1, /* ti  */
442     },
443   0, /* pre_modify  */
444   0, /* post_modify  */
445   0, /* register_offset  */
446   1, /* register_sextend  */
447   1, /* register_zextend  */
448   0, /* imm_offset  */
449 };
450 
451 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
452 {
453     {
454       1, /* hi  */
455       1, /* si  */
456       1, /* di  */
457       2, /* ti  */
458     },
459   1, /* pre_modify  */
460   1, /* post_modify  */
461   3, /* register_offset  */
462   3, /* register_sextend  */
463   3, /* register_zextend  */
464   2, /* imm_offset  */
465 };
466 
467 static const struct cpu_regmove_cost generic_regmove_cost =
468 {
469   1, /* GP2GP  */
470   /* Avoid the use of slow int<->fp moves for spilling by setting
471      their cost higher than memmov_cost.  */
472   5, /* GP2FP  */
473   5, /* FP2GP  */
474   2 /* FP2FP  */
475 };
476 
477 static const struct cpu_regmove_cost cortexa57_regmove_cost =
478 {
479   1, /* GP2GP  */
480   /* Avoid the use of slow int<->fp moves for spilling by setting
481      their cost higher than memmov_cost.  */
482   5, /* GP2FP  */
483   5, /* FP2GP  */
484   2 /* FP2FP  */
485 };
486 
487 static const struct cpu_regmove_cost cortexa53_regmove_cost =
488 {
489   1, /* GP2GP  */
490   /* Avoid the use of slow int<->fp moves for spilling by setting
491      their cost higher than memmov_cost.  */
492   5, /* GP2FP  */
493   5, /* FP2GP  */
494   2 /* FP2FP  */
495 };
496 
497 static const struct cpu_regmove_cost exynosm1_regmove_cost =
498 {
499   1, /* GP2GP  */
500   /* Avoid the use of slow int<->fp moves for spilling by setting
501      their cost higher than memmov_cost (actual, 4 and 9).  */
502   9, /* GP2FP  */
503   9, /* FP2GP  */
504   1 /* FP2FP  */
505 };
506 
507 static const struct cpu_regmove_cost thunderx_regmove_cost =
508 {
509   2, /* GP2GP  */
510   2, /* GP2FP  */
511   6, /* FP2GP  */
512   4 /* FP2FP  */
513 };
514 
515 static const struct cpu_regmove_cost xgene1_regmove_cost =
516 {
517   1, /* GP2GP  */
518   /* Avoid the use of slow int<->fp moves for spilling by setting
519      their cost higher than memmov_cost.  */
520   8, /* GP2FP  */
521   8, /* FP2GP  */
522   2 /* FP2FP  */
523 };
524 
525 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
526 {
527   2, /* GP2GP  */
528   /* Avoid the use of int<->fp moves for spilling.  */
529   6, /* GP2FP  */
530   6, /* FP2GP  */
531   4 /* FP2FP  */
532 };
533 
534 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
535 {
536   1, /* GP2GP  */
537   /* Avoid the use of int<->fp moves for spilling.  */
538   5, /* GP2FP  */
539   6, /* FP2GP  */
540   3, /* FP2FP  */
541 };
542 
543 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
544 {
545   1, /* GP2GP  */
546   /* Avoid the use of int<->fp moves for spilling.  */
547   4, /* GP2FP  */
548   5, /* FP2GP  */
549   4  /* FP2FP  */
550 };
551 
552 static const struct cpu_regmove_cost tsv110_regmove_cost =
553 {
554   1, /* GP2GP  */
555   /* Avoid the use of slow int<->fp moves for spilling by setting
556      their cost higher than memmov_cost.  */
557   2, /* GP2FP  */
558   3, /* FP2GP  */
559   2  /* FP2FP  */
560 };
561 
562 /* Generic costs for vector insn classes.  */
563 static const struct cpu_vector_cost generic_vector_cost =
564 {
565   1, /* scalar_int_stmt_cost  */
566   1, /* scalar_fp_stmt_cost  */
567   1, /* scalar_load_cost  */
568   1, /* scalar_store_cost  */
569   1, /* vec_int_stmt_cost  */
570   1, /* vec_fp_stmt_cost  */
571   2, /* vec_permute_cost  */
572   2, /* vec_to_scalar_cost  */
573   1, /* scalar_to_vec_cost  */
574   1, /* vec_align_load_cost  */
575   1, /* vec_unalign_load_cost  */
576   1, /* vec_unalign_store_cost  */
577   1, /* vec_store_cost  */
578   3, /* cond_taken_branch_cost  */
579   1 /* cond_not_taken_branch_cost  */
580 };
581 
582 /* QDF24XX costs for vector insn classes.  */
583 static const struct cpu_vector_cost qdf24xx_vector_cost =
584 {
585   1, /* scalar_int_stmt_cost  */
586   1, /* scalar_fp_stmt_cost  */
587   1, /* scalar_load_cost  */
588   1, /* scalar_store_cost  */
589   1, /* vec_int_stmt_cost  */
590   3, /* vec_fp_stmt_cost  */
591   2, /* vec_permute_cost  */
592   1, /* vec_to_scalar_cost  */
593   1, /* scalar_to_vec_cost  */
594   1, /* vec_align_load_cost  */
595   1, /* vec_unalign_load_cost  */
596   1, /* vec_unalign_store_cost  */
597   1, /* vec_store_cost  */
598   3, /* cond_taken_branch_cost  */
599   1 /* cond_not_taken_branch_cost  */
600 };
601 
602 /* ThunderX costs for vector insn classes.  */
603 static const struct cpu_vector_cost thunderx_vector_cost =
604 {
605   1, /* scalar_int_stmt_cost  */
606   1, /* scalar_fp_stmt_cost  */
607   3, /* scalar_load_cost  */
608   1, /* scalar_store_cost  */
609   4, /* vec_int_stmt_cost  */
610   1, /* vec_fp_stmt_cost  */
611   4, /* vec_permute_cost  */
612   2, /* vec_to_scalar_cost  */
613   2, /* scalar_to_vec_cost  */
614   3, /* vec_align_load_cost  */
615   5, /* vec_unalign_load_cost  */
616   5, /* vec_unalign_store_cost  */
617   1, /* vec_store_cost  */
618   3, /* cond_taken_branch_cost  */
619   3 /* cond_not_taken_branch_cost  */
620 };
621 
622 static const struct cpu_vector_cost tsv110_vector_cost =
623 {
624   1, /* scalar_int_stmt_cost  */
625   1, /* scalar_fp_stmt_cost  */
626   5, /* scalar_load_cost  */
627   1, /* scalar_store_cost  */
628   2, /* vec_int_stmt_cost  */
629   2, /* vec_fp_stmt_cost  */
630   2, /* vec_permute_cost  */
631   3, /* vec_to_scalar_cost  */
632   2, /* scalar_to_vec_cost  */
633   5, /* vec_align_load_cost  */
634   5, /* vec_unalign_load_cost  */
635   1, /* vec_unalign_store_cost  */
636   1, /* vec_store_cost  */
637   1, /* cond_taken_branch_cost  */
638   1 /* cond_not_taken_branch_cost  */
639 };
640 
641 /* Costs for vector insn classes for Cortex-A57.  */
642 static const struct cpu_vector_cost cortexa57_vector_cost =
643 {
644   1, /* scalar_int_stmt_cost  */
645   1, /* scalar_fp_stmt_cost  */
646   4, /* scalar_load_cost  */
647   1, /* scalar_store_cost  */
648   2, /* vec_int_stmt_cost  */
649   2, /* vec_fp_stmt_cost  */
650   3, /* vec_permute_cost  */
651   8, /* vec_to_scalar_cost  */
652   8, /* scalar_to_vec_cost  */
653   4, /* vec_align_load_cost  */
654   4, /* vec_unalign_load_cost  */
655   1, /* vec_unalign_store_cost  */
656   1, /* vec_store_cost  */
657   1, /* cond_taken_branch_cost  */
658   1 /* cond_not_taken_branch_cost  */
659 };
660 
661 static const struct cpu_vector_cost exynosm1_vector_cost =
662 {
663   1, /* scalar_int_stmt_cost  */
664   1, /* scalar_fp_stmt_cost  */
665   5, /* scalar_load_cost  */
666   1, /* scalar_store_cost  */
667   3, /* vec_int_stmt_cost  */
668   3, /* vec_fp_stmt_cost  */
669   3, /* vec_permute_cost  */
670   3, /* vec_to_scalar_cost  */
671   3, /* scalar_to_vec_cost  */
672   5, /* vec_align_load_cost  */
673   5, /* vec_unalign_load_cost  */
674   1, /* vec_unalign_store_cost  */
675   1, /* vec_store_cost  */
676   1, /* cond_taken_branch_cost  */
677   1 /* cond_not_taken_branch_cost  */
678 };
679 
680 /* Costs for vector insn classes for X-Gene 1.  */
681 static const struct cpu_vector_cost xgene1_vector_cost =
682 {
683   1, /* scalar_int_stmt_cost  */
684   1, /* scalar_fp_stmt_cost  */
685   5, /* scalar_load_cost  */
686   1, /* scalar_store_cost  */
687   2, /* vec_int_stmt_cost  */
688   2, /* vec_fp_stmt_cost  */
689   2, /* vec_permute_cost  */
690   4, /* vec_to_scalar_cost  */
691   4, /* scalar_to_vec_cost  */
692   10, /* vec_align_load_cost  */
693   10, /* vec_unalign_load_cost  */
694   2, /* vec_unalign_store_cost  */
695   2, /* vec_store_cost  */
696   2, /* cond_taken_branch_cost  */
697   1 /* cond_not_taken_branch_cost  */
698 };
699 
700 /* Costs for vector insn classes for Vulcan (ThunderX2 T99).  */
701 static const struct cpu_vector_cost thunderx2t99_vector_cost =
702 {
703   1, /* scalar_int_stmt_cost  */
704   6, /* scalar_fp_stmt_cost  */
705   4, /* scalar_load_cost  */
706   1, /* scalar_store_cost  */
707   4, /* vec_int_stmt_cost  */
708   5, /* vec_fp_stmt_cost  */
709   10, /* vec_permute_cost  */
710   6, /* vec_to_scalar_cost  */
711   5, /* scalar_to_vec_cost  */
712   4, /* vec_align_load_cost  */
713   4, /* vec_unalign_load_cost  */
714   1, /* vec_unalign_store_cost  */
715   1, /* vec_store_cost  */
716   2, /* cond_taken_branch_cost  */
717   1  /* cond_not_taken_branch_cost  */
718 };
719 
720 static const struct cpu_vector_cost thunderx3t110_vector_cost =
721 {
722   1, /* scalar_int_stmt_cost  */
723   5, /* scalar_fp_stmt_cost  */
724   4, /* scalar_load_cost  */
725   1, /* scalar_store_cost  */
726   5, /* vec_int_stmt_cost  */
727   5, /* vec_fp_stmt_cost  */
728   10, /* vec_permute_cost  */
729   5, /* vec_to_scalar_cost  */
730   5, /* scalar_to_vec_cost  */
731   4, /* vec_align_load_cost  */
732   4, /* vec_unalign_load_cost  */
733   4, /* vec_unalign_store_cost  */
734   4, /* vec_store_cost  */
735   2, /* cond_taken_branch_cost  */
736   1  /* cond_not_taken_branch_cost  */
737 };
738 
739 
740 /* Generic costs for branch instructions.  */
741 static const struct cpu_branch_cost generic_branch_cost =
742 {
743   1,  /* Predictable.  */
744   3   /* Unpredictable.  */
745 };
746 
747 /* Generic approximation modes.  */
748 static const cpu_approx_modes generic_approx_modes =
749 {
750   AARCH64_APPROX_NONE,	/* division  */
751   AARCH64_APPROX_NONE,	/* sqrt  */
752   AARCH64_APPROX_NONE	/* recip_sqrt  */
753 };
754 
755 /* Approximation modes for Exynos M1.  */
756 static const cpu_approx_modes exynosm1_approx_modes =
757 {
758   AARCH64_APPROX_NONE,	/* division  */
759   AARCH64_APPROX_ALL,	/* sqrt  */
760   AARCH64_APPROX_ALL	/* recip_sqrt  */
761 };
762 
763 /* Approximation modes for X-Gene 1.  */
764 static const cpu_approx_modes xgene1_approx_modes =
765 {
766   AARCH64_APPROX_NONE,	/* division  */
767   AARCH64_APPROX_NONE,	/* sqrt  */
768   AARCH64_APPROX_ALL	/* recip_sqrt  */
769 };
770 
771 /* Generic prefetch settings (which disable prefetch).  */
772 static const cpu_prefetch_tune generic_prefetch_tune =
773 {
774   0,			/* num_slots  */
775   -1,			/* l1_cache_size  */
776   -1,			/* l1_cache_line_size  */
777   -1,			/* l2_cache_size  */
778   true,			/* prefetch_dynamic_strides */
779   -1,			/* minimum_stride */
780   -1			/* default_opt_level  */
781 };
782 
783 static const cpu_prefetch_tune exynosm1_prefetch_tune =
784 {
785   0,			/* num_slots  */
786   -1,			/* l1_cache_size  */
787   64,			/* l1_cache_line_size  */
788   -1,			/* l2_cache_size  */
789   true,			/* prefetch_dynamic_strides */
790   -1,			/* minimum_stride */
791   -1			/* default_opt_level  */
792 };
793 
794 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
795 {
796   4,			/* num_slots  */
797   32,			/* l1_cache_size  */
798   64,			/* l1_cache_line_size  */
799   512,			/* l2_cache_size  */
800   false,		/* prefetch_dynamic_strides */
801   2048,			/* minimum_stride */
802   3			/* default_opt_level  */
803 };
804 
805 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
806 {
807   8,			/* num_slots  */
808   32,			/* l1_cache_size  */
809   128,			/* l1_cache_line_size  */
810   16*1024,		/* l2_cache_size  */
811   true,			/* prefetch_dynamic_strides */
812   -1,			/* minimum_stride */
813   3			/* default_opt_level  */
814 };
815 
816 static const cpu_prefetch_tune thunderx_prefetch_tune =
817 {
818   8,			/* num_slots  */
819   32,			/* l1_cache_size  */
820   128,			/* l1_cache_line_size  */
821   -1,			/* l2_cache_size  */
822   true,			/* prefetch_dynamic_strides */
823   -1,			/* minimum_stride */
824   -1			/* default_opt_level  */
825 };
826 
827 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
828 {
829   8,			/* num_slots  */
830   32,			/* l1_cache_size  */
831   64,			/* l1_cache_line_size  */
832   256,			/* l2_cache_size  */
833   true,			/* prefetch_dynamic_strides */
834   -1,			/* minimum_stride */
835   -1			/* default_opt_level  */
836 };
837 
838 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
839 {
840   8,			/* num_slots  */
841   32,			/* l1_cache_size  */
842   64,			/* l1_cache_line_size  */
843   256,			/* l2_cache_size  */
844   true,			/* prefetch_dynamic_strides */
845   -1,			/* minimum_stride */
846   -1			/* default_opt_level  */
847 };
848 
849 static const cpu_prefetch_tune tsv110_prefetch_tune =
850 {
851   0,                    /* num_slots  */
852   64,                   /* l1_cache_size  */
853   64,                   /* l1_cache_line_size  */
854   512,                  /* l2_cache_size  */
855   true,                 /* prefetch_dynamic_strides */
856   -1,                   /* minimum_stride */
857   -1                    /* default_opt_level  */
858 };
859 
860 static const cpu_prefetch_tune xgene1_prefetch_tune =
861 {
862   8,			/* num_slots  */
863   32,			/* l1_cache_size  */
864   64,			/* l1_cache_line_size  */
865   256,			/* l2_cache_size  */
866   true,                 /* prefetch_dynamic_strides */
867   -1,                   /* minimum_stride */
868   -1			/* default_opt_level  */
869 };
870 
871 static const struct tune_params generic_tunings =
872 {
873   &cortexa57_extra_costs,
874   &generic_addrcost_table,
875   &generic_regmove_cost,
876   &generic_vector_cost,
877   &generic_branch_cost,
878   &generic_approx_modes,
879   SVE_NOT_IMPLEMENTED, /* sve_width  */
880   4, /* memmov_cost  */
881   2, /* issue_rate  */
882   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
883   "16:12",	/* function_align.  */
884   "4",	/* jump_align.  */
885   "8",	/* loop_align.  */
886   2,	/* int_reassoc_width.  */
887   4,	/* fp_reassoc_width.  */
888   1,	/* vec_reassoc_width.  */
889   2,	/* min_div_recip_mul_sf.  */
890   2,	/* min_div_recip_mul_df.  */
891   0,	/* max_case_values.  */
892   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
893   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
894   &generic_prefetch_tune
895 };
896 
897 static const struct tune_params cortexa35_tunings =
898 {
899   &cortexa53_extra_costs,
900   &generic_addrcost_table,
901   &cortexa53_regmove_cost,
902   &generic_vector_cost,
903   &generic_branch_cost,
904   &generic_approx_modes,
905   SVE_NOT_IMPLEMENTED, /* sve_width  */
906   4, /* memmov_cost  */
907   1, /* issue_rate  */
908   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
909    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
910   "16",	/* function_align.  */
911   "4",	/* jump_align.  */
912   "8",	/* loop_align.  */
913   2,	/* int_reassoc_width.  */
914   4,	/* fp_reassoc_width.  */
915   1,	/* vec_reassoc_width.  */
916   2,	/* min_div_recip_mul_sf.  */
917   2,	/* min_div_recip_mul_df.  */
918   0,	/* max_case_values.  */
919   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
920   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
921   &generic_prefetch_tune
922 };
923 
924 static const struct tune_params cortexa53_tunings =
925 {
926   &cortexa53_extra_costs,
927   &generic_addrcost_table,
928   &cortexa53_regmove_cost,
929   &generic_vector_cost,
930   &generic_branch_cost,
931   &generic_approx_modes,
932   SVE_NOT_IMPLEMENTED, /* sve_width  */
933   4, /* memmov_cost  */
934   2, /* issue_rate  */
935   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
936    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
937   "16",	/* function_align.  */
938   "4",	/* jump_align.  */
939   "8",	/* loop_align.  */
940   2,	/* int_reassoc_width.  */
941   4,	/* fp_reassoc_width.  */
942   1,	/* vec_reassoc_width.  */
943   2,	/* min_div_recip_mul_sf.  */
944   2,	/* min_div_recip_mul_df.  */
945   0,	/* max_case_values.  */
946   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
947   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
948   &generic_prefetch_tune
949 };
950 
951 static const struct tune_params cortexa57_tunings =
952 {
953   &cortexa57_extra_costs,
954   &generic_addrcost_table,
955   &cortexa57_regmove_cost,
956   &cortexa57_vector_cost,
957   &generic_branch_cost,
958   &generic_approx_modes,
959   SVE_NOT_IMPLEMENTED, /* sve_width  */
960   4, /* memmov_cost  */
961   3, /* issue_rate  */
962   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
963    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
964   "16",	/* function_align.  */
965   "4",	/* jump_align.  */
966   "8",	/* loop_align.  */
967   2,	/* int_reassoc_width.  */
968   4,	/* fp_reassoc_width.  */
969   1,	/* vec_reassoc_width.  */
970   2,	/* min_div_recip_mul_sf.  */
971   2,	/* min_div_recip_mul_df.  */
972   0,	/* max_case_values.  */
973   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
974   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
975   &generic_prefetch_tune
976 };
977 
978 static const struct tune_params cortexa72_tunings =
979 {
980   &cortexa57_extra_costs,
981   &generic_addrcost_table,
982   &cortexa57_regmove_cost,
983   &cortexa57_vector_cost,
984   &generic_branch_cost,
985   &generic_approx_modes,
986   SVE_NOT_IMPLEMENTED, /* sve_width  */
987   4, /* memmov_cost  */
988   3, /* issue_rate  */
989   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
990    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
991   "16",	/* function_align.  */
992   "4",	/* jump_align.  */
993   "8",	/* loop_align.  */
994   2,	/* int_reassoc_width.  */
995   4,	/* fp_reassoc_width.  */
996   1,	/* vec_reassoc_width.  */
997   2,	/* min_div_recip_mul_sf.  */
998   2,	/* min_div_recip_mul_df.  */
999   0,	/* max_case_values.  */
1000   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1001   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1002   &generic_prefetch_tune
1003 };
1004 
1005 static const struct tune_params cortexa73_tunings =
1006 {
1007   &cortexa57_extra_costs,
1008   &generic_addrcost_table,
1009   &cortexa57_regmove_cost,
1010   &cortexa57_vector_cost,
1011   &generic_branch_cost,
1012   &generic_approx_modes,
1013   SVE_NOT_IMPLEMENTED, /* sve_width  */
1014   4, /* memmov_cost.  */
1015   2, /* issue_rate.  */
1016   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1018   "16",	/* function_align.  */
1019   "4",	/* jump_align.  */
1020   "8",	/* loop_align.  */
1021   2,	/* int_reassoc_width.  */
1022   4,	/* fp_reassoc_width.  */
1023   1,	/* vec_reassoc_width.  */
1024   2,	/* min_div_recip_mul_sf.  */
1025   2,	/* min_div_recip_mul_df.  */
1026   0,	/* max_case_values.  */
1027   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1028   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1029   &generic_prefetch_tune
1030 };
1031 
1032 
1033 
1034 static const struct tune_params exynosm1_tunings =
1035 {
1036   &exynosm1_extra_costs,
1037   &exynosm1_addrcost_table,
1038   &exynosm1_regmove_cost,
1039   &exynosm1_vector_cost,
1040   &generic_branch_cost,
1041   &exynosm1_approx_modes,
1042   SVE_NOT_IMPLEMENTED, /* sve_width  */
1043   4,	/* memmov_cost  */
1044   3,	/* issue_rate  */
1045   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
1046   "4",	/* function_align.  */
1047   "4",	/* jump_align.  */
1048   "4",	/* loop_align.  */
1049   2,	/* int_reassoc_width.  */
1050   4,	/* fp_reassoc_width.  */
1051   1,	/* vec_reassoc_width.  */
1052   2,	/* min_div_recip_mul_sf.  */
1053   2,	/* min_div_recip_mul_df.  */
1054   48,	/* max_case_values.  */
1055   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
1056   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
1057   &exynosm1_prefetch_tune
1058 };
1059 
1060 static const struct tune_params thunderxt88_tunings =
1061 {
1062   &thunderx_extra_costs,
1063   &generic_addrcost_table,
1064   &thunderx_regmove_cost,
1065   &thunderx_vector_cost,
1066   &generic_branch_cost,
1067   &generic_approx_modes,
1068   SVE_NOT_IMPLEMENTED, /* sve_width  */
1069   6, /* memmov_cost  */
1070   2, /* issue_rate  */
1071   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1072   "8",	/* function_align.  */
1073   "8",	/* jump_align.  */
1074   "8",	/* loop_align.  */
1075   2,	/* int_reassoc_width.  */
1076   4,	/* fp_reassoc_width.  */
1077   1,	/* vec_reassoc_width.  */
1078   2,	/* min_div_recip_mul_sf.  */
1079   2,	/* min_div_recip_mul_df.  */
1080   0,	/* max_case_values.  */
1081   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1082   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
1083   &thunderxt88_prefetch_tune
1084 };
1085 
1086 static const struct tune_params thunderx_tunings =
1087 {
1088   &thunderx_extra_costs,
1089   &generic_addrcost_table,
1090   &thunderx_regmove_cost,
1091   &thunderx_vector_cost,
1092   &generic_branch_cost,
1093   &generic_approx_modes,
1094   SVE_NOT_IMPLEMENTED, /* sve_width  */
1095   6, /* memmov_cost  */
1096   2, /* issue_rate  */
1097   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1098   "8",	/* function_align.  */
1099   "8",	/* jump_align.  */
1100   "8",	/* loop_align.  */
1101   2,	/* int_reassoc_width.  */
1102   4,	/* fp_reassoc_width.  */
1103   1,	/* vec_reassoc_width.  */
1104   2,	/* min_div_recip_mul_sf.  */
1105   2,	/* min_div_recip_mul_df.  */
1106   0,	/* max_case_values.  */
1107   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1108   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1109    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
1110   &thunderx_prefetch_tune
1111 };
1112 
1113 static const struct tune_params tsv110_tunings =
1114 {
1115   &tsv110_extra_costs,
1116   &tsv110_addrcost_table,
1117   &tsv110_regmove_cost,
1118   &tsv110_vector_cost,
1119   &generic_branch_cost,
1120   &generic_approx_modes,
1121   SVE_NOT_IMPLEMENTED, /* sve_width  */
1122   4,    /* memmov_cost  */
1123   4,    /* issue_rate  */
1124   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1125    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1126   "16", /* function_align.  */
1127   "4",  /* jump_align.  */
1128   "8",  /* loop_align.  */
1129   2,    /* int_reassoc_width.  */
1130   4,    /* fp_reassoc_width.  */
1131   1,    /* vec_reassoc_width.  */
1132   2,    /* min_div_recip_mul_sf.  */
1133   2,    /* min_div_recip_mul_df.  */
1134   0,    /* max_case_values.  */
1135   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1136   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
1137   &tsv110_prefetch_tune
1138 };
1139 
1140 static const struct tune_params xgene1_tunings =
1141 {
1142   &xgene1_extra_costs,
1143   &xgene1_addrcost_table,
1144   &xgene1_regmove_cost,
1145   &xgene1_vector_cost,
1146   &generic_branch_cost,
1147   &xgene1_approx_modes,
1148   SVE_NOT_IMPLEMENTED, /* sve_width  */
1149   6, /* memmov_cost  */
1150   4, /* issue_rate  */
1151   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1152   "16",	/* function_align.  */
1153   "16",	/* jump_align.  */
1154   "16",	/* loop_align.  */
1155   2,	/* int_reassoc_width.  */
1156   4,	/* fp_reassoc_width.  */
1157   1,	/* vec_reassoc_width.  */
1158   2,	/* min_div_recip_mul_sf.  */
1159   2,	/* min_div_recip_mul_df.  */
1160   17,	/* max_case_values.  */
1161   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1162   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1163   &xgene1_prefetch_tune
1164 };
1165 
1166 static const struct tune_params emag_tunings =
1167 {
1168   &xgene1_extra_costs,
1169   &xgene1_addrcost_table,
1170   &xgene1_regmove_cost,
1171   &xgene1_vector_cost,
1172   &generic_branch_cost,
1173   &xgene1_approx_modes,
1174   SVE_NOT_IMPLEMENTED, /* sve_width  */
1175   6, /* memmov_cost  */
1176   4, /* issue_rate  */
1177   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1178   "16",	/* function_align.  */
1179   "16",	/* jump_align.  */
1180   "16",	/* loop_align.  */
1181   2,	/* int_reassoc_width.  */
1182   4,	/* fp_reassoc_width.  */
1183   1,	/* vec_reassoc_width.  */
1184   2,	/* min_div_recip_mul_sf.  */
1185   2,	/* min_div_recip_mul_df.  */
1186   17,	/* max_case_values.  */
1187   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1188   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1189   &xgene1_prefetch_tune
1190 };
1191 
1192 static const struct tune_params qdf24xx_tunings =
1193 {
1194   &qdf24xx_extra_costs,
1195   &qdf24xx_addrcost_table,
1196   &qdf24xx_regmove_cost,
1197   &qdf24xx_vector_cost,
1198   &generic_branch_cost,
1199   &generic_approx_modes,
1200   SVE_NOT_IMPLEMENTED, /* sve_width  */
1201   4, /* memmov_cost  */
1202   4, /* issue_rate  */
1203   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1204    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1205   "16",	/* function_align.  */
1206   "8",	/* jump_align.  */
1207   "16",	/* loop_align.  */
1208   2,	/* int_reassoc_width.  */
1209   4,	/* fp_reassoc_width.  */
1210   1,	/* vec_reassoc_width.  */
1211   2,	/* min_div_recip_mul_sf.  */
1212   2,	/* min_div_recip_mul_df.  */
1213   0,	/* max_case_values.  */
1214   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1215   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1216   &qdf24xx_prefetch_tune
1217 };
1218 
1219 /* Tuning structure for the Qualcomm Saphira core.  Default to generic values
1220    for now.  */
1221 static const struct tune_params saphira_tunings =
1222 {
1223   &generic_extra_costs,
1224   &generic_addrcost_table,
1225   &generic_regmove_cost,
1226   &generic_vector_cost,
1227   &generic_branch_cost,
1228   &generic_approx_modes,
1229   SVE_NOT_IMPLEMENTED, /* sve_width  */
1230   4, /* memmov_cost  */
1231   4, /* issue_rate  */
1232   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1233    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1234   "16",	/* function_align.  */
1235   "8",	/* jump_align.  */
1236   "16",	/* loop_align.  */
1237   2,	/* int_reassoc_width.  */
1238   4,	/* fp_reassoc_width.  */
1239   1,	/* vec_reassoc_width.  */
1240   2,	/* min_div_recip_mul_sf.  */
1241   2,	/* min_div_recip_mul_df.  */
1242   0,	/* max_case_values.  */
1243   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1244   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
1245   &generic_prefetch_tune
1246 };
1247 
1248 static const struct tune_params thunderx2t99_tunings =
1249 {
1250   &thunderx2t99_extra_costs,
1251   &thunderx2t99_addrcost_table,
1252   &thunderx2t99_regmove_cost,
1253   &thunderx2t99_vector_cost,
1254   &generic_branch_cost,
1255   &generic_approx_modes,
1256   SVE_NOT_IMPLEMENTED, /* sve_width  */
1257   4, /* memmov_cost.  */
1258   4, /* issue_rate.  */
1259   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1260    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1261   "16",	/* function_align.  */
1262   "8",	/* jump_align.  */
1263   "16",	/* loop_align.  */
1264   3,	/* int_reassoc_width.  */
1265   2,	/* fp_reassoc_width.  */
1266   2,	/* vec_reassoc_width.  */
1267   2,	/* min_div_recip_mul_sf.  */
1268   2,	/* min_div_recip_mul_df.  */
1269   0,	/* max_case_values.  */
1270   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1271   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1272   &thunderx2t99_prefetch_tune
1273 };
1274 
1275 static const struct tune_params thunderx3t110_tunings =
1276 {
1277   &thunderx3t110_extra_costs,
1278   &thunderx3t110_addrcost_table,
1279   &thunderx3t110_regmove_cost,
1280   &thunderx3t110_vector_cost,
1281   &generic_branch_cost,
1282   &generic_approx_modes,
1283   SVE_NOT_IMPLEMENTED, /* sve_width  */
1284   4, /* memmov_cost.  */
1285   6, /* issue_rate.  */
1286   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1287    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1288   "16",	/* function_align.  */
1289   "8",	/* jump_align.  */
1290   "16",	/* loop_align.  */
1291   3,	/* int_reassoc_width.  */
1292   2,	/* fp_reassoc_width.  */
1293   2,	/* vec_reassoc_width.  */
1294   2,	/* min_div_recip_mul_sf.  */
1295   2,	/* min_div_recip_mul_df.  */
1296   0,	/* max_case_values.  */
1297   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1298   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1299   &thunderx3t110_prefetch_tune
1300 };
1301 
1302 static const struct tune_params neoversen1_tunings =
1303 {
1304   &cortexa57_extra_costs,
1305   &generic_addrcost_table,
1306   &generic_regmove_cost,
1307   &cortexa57_vector_cost,
1308   &generic_branch_cost,
1309   &generic_approx_modes,
1310   SVE_NOT_IMPLEMENTED, /* sve_width  */
1311   4, /* memmov_cost  */
1312   3, /* issue_rate  */
1313   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1314   "32:16",	/* function_align.  */
1315   "4",		/* jump_align.  */
1316   "32:16",	/* loop_align.  */
1317   2,	/* int_reassoc_width.  */
1318   4,	/* fp_reassoc_width.  */
1319   2,	/* vec_reassoc_width.  */
1320   2,	/* min_div_recip_mul_sf.  */
1321   2,	/* min_div_recip_mul_df.  */
1322   0,	/* max_case_values.  */
1323   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1324   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1325   &generic_prefetch_tune
1326 };
1327 
1328 /* Support for fine-grained override of the tuning structures.  */
1329 struct aarch64_tuning_override_function
1330 {
1331   const char* name;
1332   void (*parse_override)(const char*, struct tune_params*);
1333 };
1334 
1335 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1336 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1337 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1338 
1339 static const struct aarch64_tuning_override_function
1340 aarch64_tuning_override_functions[] =
1341 {
1342   { "fuse", aarch64_parse_fuse_string },
1343   { "tune", aarch64_parse_tune_string },
1344   { "sve_width", aarch64_parse_sve_width_string },
1345   { NULL, NULL }
1346 };
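
/* Illustrative note: this table backs the -moverride option.  Each
   NAME=VALUE pair in the option string is dispatched through the table, so
   an override such as (for example) -moverride=sve_width=256 would hand
   "256" to aarch64_parse_sve_width_string.  The exact values each parser
   accepts are defined by the parsers themselves; this is only a sketch of
   the dispatch mechanism.  */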
1347 
1348 /* A processor implementing AArch64.  */
1349 struct processor
1350 {
1351   const char *const name;
1352   enum aarch64_processor ident;
1353   enum aarch64_processor sched_core;
1354   enum aarch64_arch arch;
1355   unsigned architecture_version;
1356   const uint64_t flags;
1357   const struct tune_params *const tune;
1358 };
1359 
1360 /* Architectures implementing AArch64.  */
1361 static const struct processor all_architectures[] =
1362 {
1363 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1364   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1365 #include "aarch64-arches.def"
1366   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1367 };
1368 
1369 /* Processor cores implementing AArch64.  */
1370 static const struct processor all_cores[] =
1371 {
1372 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1373   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
1374   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
1375   FLAGS, &COSTS##_tunings},
1376 #include "aarch64-cores.def"
1377   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1378     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1379   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1380 };
1381 
1382 
1383 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1384    handling code or by target attributes.  */
1385 static const struct processor *selected_arch;
1386 static const struct processor *selected_cpu;
1387 static const struct processor *selected_tune;
1388 
1389 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1390 
1391 /* The current tuning set.  */
1392 struct tune_params aarch64_tune_params = generic_tunings;
1393 
1394 /* Check whether an 'aarch64_vector_pcs' attribute is valid.  */
1395 
1396 static tree
1397 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1398 				     int, bool *no_add_attrs)
1399 {
1400   /* Since we set fn_type_req to true, the caller should have checked
1401      this for us.  */
1402   gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1403   switch ((arm_pcs) fntype_abi (*node).id ())
1404     {
1405     case ARM_PCS_AAPCS64:
1406     case ARM_PCS_SIMD:
1407       return NULL_TREE;
1408 
1409     case ARM_PCS_SVE:
1410       error ("the %qE attribute cannot be applied to an SVE function type",
1411 	     name);
1412       *no_add_attrs = true;
1413       return NULL_TREE;
1414 
1415     case ARM_PCS_TLSDESC:
1416     case ARM_PCS_UNKNOWN:
1417       break;
1418     }
1419   gcc_unreachable ();
1420 }
1421 
1422 /* Table of machine attributes.  */
1423 static const struct attribute_spec aarch64_attribute_table[] =
1424 {
1425   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1426        affects_type_identity, handler, exclude } */
1427   { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,
1428 			  handle_aarch64_vector_pcs_attribute, NULL },
1429   { "arm_sve_vector_bits", 1, 1, false, true,  false, true,
1430 			  aarch64_sve::handle_arm_sve_vector_bits_attribute,
1431 			  NULL },
1432   { "Advanced SIMD type", 0, 0, false, true,  false, true,  NULL, NULL },
1433   { "SVE type",		  3, 3, false, true,  false, true,  NULL, NULL },
1434   { "SVE sizeless type",  0, 0, false, true,  false, true,  NULL, NULL },
1435   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
1436 };
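
/* Illustrative examples of how the user-facing attributes above are spelled
   in source code (assuming the usual ACLE forms):

     void f (int *) __attribute__ ((aarch64_vector_pcs));
     typedef svint32_t fixed_int32_t
       __attribute__ ((arm_sve_vector_bits (256)));

   The "Advanced SIMD type", "SVE type" and "SVE sizeless type" entries are
   internal attributes that the compiler attaches to types itself rather
   than attributes users are expected to write.  */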
1437 
1438 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1439 
1440 /* An ISA extension in the co-processor and main instruction set space.  */
1441 struct aarch64_option_extension
1442 {
1443   const char *const name;
1444   const unsigned long flags_on;
1445   const unsigned long flags_off;
1446 };
1447 
1448 typedef enum aarch64_cond_code
1449 {
1450   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1451   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1452   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1453 }
1454 aarch64_cc;
1455 
1456 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
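
/* Illustrative note: the XOR with 1 above relies on the encoding in
   aarch64_cond_code, where complementary conditions differ only in the low
   bit, e.g. AARCH64_EQ (0) <-> AARCH64_NE (1) and AARCH64_GE (10) <->
   AARCH64_LT (11).  The result is not a logical inverse for the always
   conditions AARCH64_AL and AARCH64_NV.  */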
1457 
1458 struct aarch64_branch_protect_type
1459 {
1460   /* The type's name that the user passes to the branch-protection option
1461     string.  */
1462   const char* name;
1463   /* Function to handle the protection type and set global variables.
1464     First argument is the string token corresponding with this type and the
1465     second argument is the next token in the option string.
1466     Return values:
1467     * AARCH64_PARSE_OK: Handling was successful.
1468     * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1469       caller should print an error.
1470     * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1471       prints its own error.  */
1472   enum aarch64_parse_opt_result (*handler)(char*, char*);
1473   /* A list of types that can follow this type in the option string.  */
1474   const aarch64_branch_protect_type* subtypes;
1475   unsigned int num_subtypes;
1476 };
1477 
1478 static enum aarch64_parse_opt_result
1479 aarch64_handle_no_branch_protection (char* str, char* rest)
1480 {
1481   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1482   aarch64_enable_bti = 0;
1483   if (rest)
1484     {
1485       error ("unexpected %<%s%> after %<%s%>", rest, str);
1486       return AARCH64_PARSE_INVALID_FEATURE;
1487     }
1488   return AARCH64_PARSE_OK;
1489 }
1490 
1491 static enum aarch64_parse_opt_result
1492 aarch64_handle_standard_branch_protection (char* str, char* rest)
1493 {
1494   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1495   aarch64_ra_sign_key = AARCH64_KEY_A;
1496   aarch64_enable_bti = 1;
1497   if (rest)
1498     {
1499       error ("unexpected %<%s%> after %<%s%>", rest, str);
1500       return AARCH64_PARSE_INVALID_FEATURE;
1501     }
1502   return AARCH64_PARSE_OK;
1503 }
1504 
1505 static enum aarch64_parse_opt_result
1506 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1507 				    char* rest ATTRIBUTE_UNUSED)
1508 {
1509   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1510   aarch64_ra_sign_key = AARCH64_KEY_A;
1511   return AARCH64_PARSE_OK;
1512 }
1513 
1514 static enum aarch64_parse_opt_result
1515 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1516 			      char* rest ATTRIBUTE_UNUSED)
1517 {
1518   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1519   return AARCH64_PARSE_OK;
1520 }
1521 
1522 static enum aarch64_parse_opt_result
1523 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1524 			      char* rest ATTRIBUTE_UNUSED)
1525 {
1526   aarch64_ra_sign_key = AARCH64_KEY_B;
1527   return AARCH64_PARSE_OK;
1528 }
1529 
1530 static enum aarch64_parse_opt_result
1531 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1532 				    char* rest ATTRIBUTE_UNUSED)
1533 {
1534   aarch64_enable_bti = 1;
1535   return AARCH64_PARSE_OK;
1536 }
1537 
1538 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1539   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1540   { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1541   { NULL, NULL, NULL, 0 }
1542 };
1543 
1544 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1545   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1546   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1547   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1548     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1549   { "bti", aarch64_handle_bti_protection, NULL, 0 },
1550   { NULL, NULL, NULL, 0 }
1551 };
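
/* Illustrative walk-through: parsing -mbranch-protection=pac-ret+leaf first
   calls aarch64_handle_pac_ret_protection (return-address signing for
   non-leaf functions with the A key) and then, because "leaf" is listed as
   a pac-ret subtype above, aarch64_handle_pac_ret_leaf, which widens the
   scope to all functions.  "standard" is the shorthand that enables both
   pac-ret and bti.  */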
1552 
1553 /* The condition codes of the processor, and the inverse function.  */
1554 static const char * const aarch64_condition_codes[] =
1555 {
1556   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1557   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1558 };
1559 
1560 /* The preferred condition codes for SVE conditions.  */
1561 static const char *const aarch64_sve_condition_codes[] =
1562 {
1563   "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1564   "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1565 };
1566 
1567 /* Return the assembly token for svpattern value PATTERN.  */
1568 
1569 static const char *
1570 svpattern_token (enum aarch64_svpattern pattern)
1571 {
1572   switch (pattern)
1573     {
1574 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1575     AARCH64_FOR_SVPATTERN (CASE)
1576 #undef CASE
1577     case AARCH64_NUM_SVPATTERNS:
1578       break;
1579     }
1580   gcc_unreachable ();
1581 }
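
/* Illustrative note: the X-macro above maps each enumerator to its
   lower-case assembler spelling, so for example AARCH64_SV_ALL yields "all"
   and AARCH64_SV_VL8 yields "vl8".  */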
1582 
1583 /* Return the location of a piece that is known to be passed or returned
1584    in registers.  FIRST_ZR is the first unused vector argument register
1585    and FIRST_PR is the first unused predicate argument register.  */
1586 
1587 rtx
1588 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1589 					 unsigned int first_pr) const
1590 {
1591   gcc_assert (VECTOR_MODE_P (mode)
1592 	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1593 	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1594 
1595   if (num_zr > 0 && num_pr == 0)
1596     return gen_rtx_REG (mode, first_zr);
1597 
1598   if (num_zr == 0 && num_pr == 1)
1599     return gen_rtx_REG (mode, first_pr);
1600 
1601   gcc_unreachable ();
1602 }
1603 
1604 /* Return the total number of vector registers required by the PST.  */
1605 
1606 unsigned int
1607 pure_scalable_type_info::num_zr () const
1608 {
1609   unsigned int res = 0;
1610   for (unsigned int i = 0; i < pieces.length (); ++i)
1611     res += pieces[i].num_zr;
1612   return res;
1613 }
1614 
1615 /* Return the total number of predicate registers required by the PST.  */
1616 
1617 unsigned int
1618 pure_scalable_type_info::num_pr () const
1619 {
1620   unsigned int res = 0;
1621   for (unsigned int i = 0; i < pieces.length (); ++i)
1622     res += pieces[i].num_pr;
1623   return res;
1624 }
1625 
1626 /* Return the location of a PST that is known to be passed or returned
1627    in registers.  FIRST_ZR is the first unused vector argument register
1628    and FIRST_PR is the first unused predicate argument register.  */
1629 
1630 rtx
1631 pure_scalable_type_info::get_rtx (machine_mode mode,
1632 				  unsigned int first_zr,
1633 				  unsigned int first_pr) const
1634 {
1635   /* Try to return a single REG if possible.  This leads to better
1636      code generation; it isn't required for correctness.  */
1637   if (mode == pieces[0].mode)
1638     {
1639       gcc_assert (pieces.length () == 1);
1640       return pieces[0].get_rtx (first_zr, first_pr);
1641     }
1642 
1643   /* Build up a PARALLEL that contains the individual pieces.  */
1644   rtvec rtxes = rtvec_alloc (pieces.length ());
1645   for (unsigned int i = 0; i < pieces.length (); ++i)
1646     {
1647       rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1648       rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1649       RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1650       first_zr += pieces[i].num_zr;
1651       first_pr += pieces[i].num_pr;
1652     }
1653   return gen_rtx_PARALLEL (mode, rtxes);
1654 }
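
/* Illustrative note: for a hypothetical PST consisting of two SVE vectors
   passed starting in z0, the code above builds a PARALLEL of two EXPR_LISTs
   that pair (reg z0) with byte offset 0 and (reg z1) with the offset of the
   second vector within the type.  A PST that is a single vector or
   predicate is returned as a lone REG instead.  */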
1655 
1656 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1657    in the AAPCS64.  */
1658 
1659 pure_scalable_type_info::analysis_result
1660 pure_scalable_type_info::analyze (const_tree type)
1661 {
1662   /* Prevent accidental reuse.  */
1663   gcc_assert (pieces.is_empty ());
1664 
1665   /* No code will be generated for erroneous types, so we won't establish
1666      an ABI mapping.  */
1667   if (type == error_mark_node)
1668     return NO_ABI_IDENTITY;
1669 
1670   /* Zero-sized types disappear in the language->ABI mapping.  */
1671   if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1672     return NO_ABI_IDENTITY;
1673 
1674   /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
1675   piece p = {};
1676   if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1677     {
1678       machine_mode mode = TYPE_MODE_RAW (type);
1679       gcc_assert (VECTOR_MODE_P (mode)
1680 		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1681 
1682       p.mode = p.orig_mode = mode;
1683       add_piece (p);
1684       return IS_PST;
1685     }
1686 
1687   /* Check for user-defined PSTs.  */
1688   if (TREE_CODE (type) == ARRAY_TYPE)
1689     return analyze_array (type);
1690   if (TREE_CODE (type) == RECORD_TYPE)
1691     return analyze_record (type);
1692 
1693   return ISNT_PST;
1694 }
1695 
1696 /* Analyze a type that is known not to be passed or returned in memory.
1697    Return true if it has an ABI identity and is a Pure Scalable Type.  */
1698 
1699 bool
1700 pure_scalable_type_info::analyze_registers (const_tree type)
1701 {
1702   analysis_result result = analyze (type);
1703   gcc_assert (result != DOESNT_MATTER);
1704   return result == IS_PST;
1705 }
1706 
1707 /* Subroutine of analyze for handling ARRAY_TYPEs.  */
1708 
1709 pure_scalable_type_info::analysis_result
1710 pure_scalable_type_info::analyze_array (const_tree type)
1711 {
1712   /* Analyze the element type.  */
1713   pure_scalable_type_info element_info;
1714   analysis_result result = element_info.analyze (TREE_TYPE (type));
1715   if (result != IS_PST)
1716     return result;
1717 
1718   /* An array of unknown, flexible or variable length will be passed and
1719      returned by reference whatever we do.  */
1720   tree nelts_minus_one = array_type_nelts (type);
1721   if (!tree_fits_uhwi_p (nelts_minus_one))
1722     return DOESNT_MATTER;
1723 
1724   /* Likewise if the array is constant-sized but too big to be interesting.
1725      The double checks against MAX_PIECES are to protect against overflow.  */
1726   unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1727   if (count > MAX_PIECES)
1728     return DOESNT_MATTER;
1729   count += 1;
1730   if (count * element_info.pieces.length () > MAX_PIECES)
1731     return DOESNT_MATTER;
1732 
1733   /* The above checks should have weeded out elements of unknown size.  */
1734   poly_uint64 element_bytes;
1735   if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1736     gcc_unreachable ();
1737 
1738   /* Build up the list of individual vectors and predicates.  */
1739   gcc_assert (!element_info.pieces.is_empty ());
1740   for (unsigned int i = 0; i < count; ++i)
1741     for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1742       {
1743 	piece p = element_info.pieces[j];
1744 	p.offset += i * element_bytes;
1745 	add_piece (p);
1746       }
1747   return IS_PST;
1748 }
1749 
1750 /* Subroutine of analyze for handling RECORD_TYPEs.  */
1751 
1752 pure_scalable_type_info::analysis_result
1753 pure_scalable_type_info::analyze_record (const_tree type)
1754 {
1755   for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1756     {
1757       if (TREE_CODE (field) != FIELD_DECL)
1758 	continue;
1759 
1760       /* Zero-sized fields disappear in the language->ABI mapping.  */
1761       if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1762 	continue;
1763 
1764       /* All fields with an ABI identity must be PSTs for the record as
1765 	 a whole to be a PST.  If any individual field is too big to be
1766 	 interesting then the record is too.  */
1767       pure_scalable_type_info field_info;
1768       analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1769       if (subresult == NO_ABI_IDENTITY)
1770 	continue;
1771       if (subresult != IS_PST)
1772 	return subresult;
1773 
1774       /* Since all previous fields are PSTs, we ought to be able to track
1775 	 the field offset using poly_ints.  */
1776       tree bitpos = bit_position (field);
1777       gcc_assert (poly_int_tree_p (bitpos));
1778 
1779       /* For the same reason, it shouldn't be possible to create a PST field
1780 	 whose offset isn't byte-aligned.  */
1781       poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1782 						BITS_PER_UNIT);
1783 
1784       /* Punt if the record is too big to be interesting.  */
1785       poly_uint64 bytepos;
1786       if (!wide_bytepos.to_uhwi (&bytepos)
1787 	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1788 	return DOESNT_MATTER;
1789 
1790       /* Add the individual vectors and predicates in the field to the
1791 	 record's list.  */
1792       gcc_assert (!field_info.pieces.is_empty ());
1793       for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1794 	{
1795 	  piece p = field_info.pieces[i];
1796 	  p.offset += bytepos;
1797 	  add_piece (p);
1798 	}
1799     }
1800   /* Empty structures disappear in the language->ABI mapping.  */
1801   return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1802 }
1803 
1804 /* Add P to the list of pieces in the type.  */
1805 
1806 void
1807 pure_scalable_type_info::add_piece (const piece &p)
1808 {
1809   /* Try to fold the new piece into the previous one to form a
1810      single-mode PST.  For example, if we see three consecutive vectors
1811      of the same mode, we can represent them using the corresponding
1812      3-tuple mode.
1813 
1814      This is purely an optimization.  */
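  /* As a sketch of the effect: three consecutive pieces of mode VNx4SFmode
     (e.g. from three adjacent svfloat32_t fields) can be folded, one at a
     time, into a single piece whose mode is the 3-vector tuple VNx12SFmode.  */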
1815   if (!pieces.is_empty ())
1816     {
1817       piece &prev = pieces.last ();
1818       gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1819       unsigned int nelems1, nelems2;
1820       if (prev.orig_mode == p.orig_mode
1821 	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1822 	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1823 				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
1824 	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1825 				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
1826 	  && targetm.array_mode (p.orig_mode,
1827 				 nelems1 + nelems2).exists (&prev.mode))
1828 	{
1829 	  prev.num_zr += p.num_zr;
1830 	  prev.num_pr += p.num_pr;
1831 	  return;
1832 	}
1833     }
1834   pieces.quick_push (p);
1835 }
1836 
1837 /* Return true if at least one possible value of type TYPE includes at
1838    least one object of Pure Scalable Type, in the sense of the AAPCS64.
1839 
1840    This is a relatively expensive test for some types, so it should
1841    generally be made as late as possible.  */
1842 
1843 static bool
1844 aarch64_some_values_include_pst_objects_p (const_tree type)
1845 {
1846   if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1847     return false;
1848 
1849   if (aarch64_sve::builtin_type_p (type))
1850     return true;
1851 
1852   if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1853     return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1854 
1855   if (RECORD_OR_UNION_TYPE_P (type))
1856     for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1857       if (TREE_CODE (field) == FIELD_DECL
1858 	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1859 	return true;
1860 
1861   return false;
1862 }
1863 
1864 /* Return the descriptor of the SIMD ABI.  */
1865 
1866 static const predefined_function_abi &
1867 aarch64_simd_abi (void)
1868 {
1869   predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1870   if (!simd_abi.initialized_p ())
1871     {
1872       HARD_REG_SET full_reg_clobbers
1873 	= default_function_abi.full_reg_clobbers ();
1874       for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1875 	if (FP_SIMD_SAVED_REGNUM_P (regno))
1876 	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1877       simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1878     }
1879   return simd_abi;
1880 }
1881 
1882 /* Return the descriptor of the SVE PCS.  */
1883 
1884 static const predefined_function_abi &
1885 aarch64_sve_abi (void)
1886 {
1887   predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1888   if (!sve_abi.initialized_p ())
1889     {
1890       HARD_REG_SET full_reg_clobbers
1891 	= default_function_abi.full_reg_clobbers ();
1892       for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1893 	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1894       for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1895 	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1896       sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1897     }
1898   return sve_abi;
1899 }
1900 
1901 /* Generate code to enable conditional branches in functions over 1 MiB.  */
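/* As an illustration, the caller is expected to pass a branch with the
   inverted condition in BRANCH_FORMAT, so the output has roughly the shape
   (label names here are placeholders only):

	<inverted conditional branch>	.Lnear
	b	<original destination>
   .Lnear:

   i.e. a short conditional branch skips over an unconditional branch that
   can reach the distant target.  */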
1902 const char *
1903 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1904 			const char * branch_format)
1905 {
1906     rtx_code_label * tmp_label = gen_label_rtx ();
1907     char label_buf[256];
1908     char buffer[128];
1909     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1910 				 CODE_LABEL_NUMBER (tmp_label));
1911     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1912     rtx dest_label = operands[pos_label];
1913     operands[pos_label] = tmp_label;
1914 
1915     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1916     output_asm_insn (buffer, operands);
1917 
1918     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1919     operands[pos_label] = dest_label;
1920     output_asm_insn (buffer, operands);
1921     return "";
1922 }
1923 
1924 void
1925 aarch64_err_no_fpadvsimd (machine_mode mode)
1926 {
1927   if (TARGET_GENERAL_REGS_ONLY)
1928     if (FLOAT_MODE_P (mode))
1929       error ("%qs is incompatible with the use of floating-point types",
1930 	     "-mgeneral-regs-only");
1931     else
1932       error ("%qs is incompatible with the use of vector types",
1933 	     "-mgeneral-regs-only");
1934   else
1935     if (FLOAT_MODE_P (mode))
1936       error ("%qs feature modifier is incompatible with the use of"
1937 	     " floating-point types", "+nofp");
1938     else
1939       error ("%qs feature modifier is incompatible with the use of"
1940 	     " vector types", "+nofp");
1941 }
1942 
1943 /* Report when we try to do something that requires SVE when SVE is disabled.
1944    This is an error of last resort and isn't very high-quality.  It usually
1945    involves attempts to measure the vector length in some way.  */
1946 static void
1947 aarch64_report_sve_required (void)
1948 {
1949   static bool reported_p = false;
1950 
1951   /* Avoid reporting a slew of messages for a single oversight.  */
1952   if (reported_p)
1953     return;
1954 
1955   error ("this operation requires the SVE ISA extension");
1956   inform (input_location, "you can enable SVE using the command-line"
1957 	  " option %<-march%>, or by using the %<target%>"
1958 	  " attribute or pragma");
1959   reported_p = true;
1960 }
1961 
1962 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1963    registers.  */
1964 inline bool
1965 pr_or_ffr_regnum_p (unsigned int regno)
1966 {
1967   return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1968 }
1969 
1970 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1971    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1972    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1973    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1974    and GENERAL_REGS is lower than the memory cost (in this case the best class
1975    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1976    cost results in bad allocations with many redundant int<->FP moves which
1977    are expensive on various cores.
1978    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1979    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
1980    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
1981    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
1982    The result of this is that it is no longer inefficient to have a higher
1983    memory move cost than the register move cost.
1984 */
1985 
1986 static reg_class_t
1987 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1988 					 reg_class_t best_class)
1989 {
1990   machine_mode mode;
1991 
1992   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1993       || !reg_class_subset_p (FP_REGS, allocno_class))
1994     return allocno_class;
1995 
1996   if (!reg_class_subset_p (GENERAL_REGS, best_class)
1997       || !reg_class_subset_p (FP_REGS, best_class))
1998     return best_class;
1999 
2000   mode = PSEUDO_REGNO_MODE (regno);
2001   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2002 }
2003 
2004 static unsigned int
2005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
2006 {
2007   if (GET_MODE_UNIT_SIZE (mode) == 4)
2008     return aarch64_tune_params.min_div_recip_mul_sf;
2009   return aarch64_tune_params.min_div_recip_mul_df;
2010 }
2011 
2012 /* Return the reassociation width of treeop OPC with mode MODE.  */
2013 static int
2014 aarch64_reassociation_width (unsigned opc, machine_mode mode)
2015 {
2016   if (VECTOR_MODE_P (mode))
2017     return aarch64_tune_params.vec_reassoc_width;
2018   if (INTEGRAL_MODE_P (mode))
2019     return aarch64_tune_params.int_reassoc_width;
2020   /* Avoid reassociating floating point addition so we emit more FMAs.  */
2021   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
2022     return aarch64_tune_params.fp_reassoc_width;
2023   return 1;
2024 }
2025 
2026 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
2027 unsigned
2028 aarch64_dbx_register_number (unsigned regno)
2029 {
2030    if (GP_REGNUM_P (regno))
2031      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2032    else if (regno == SP_REGNUM)
2033      return AARCH64_DWARF_SP;
2034    else if (FP_REGNUM_P (regno))
2035      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
2036    else if (PR_REGNUM_P (regno))
2037      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2038    else if (regno == VG_REGNUM)
2039      return AARCH64_DWARF_VG;
2040 
2041    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2042       equivalent DWARF register.  */
2043    return DWARF_FRAME_REGISTERS;
2044 }
2045 
2046 /* If X is a CONST_DOUBLE, return its bit representation as a constant
2047    integer, otherwise return X unmodified.  */
2048 static rtx
2049 aarch64_bit_representation (rtx x)
2050 {
2051   if (CONST_DOUBLE_P (x))
2052     x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2053   return x;
2054 }
2055 
2056 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
2057 static bool
2058 aarch64_advsimd_struct_mode_p (machine_mode mode)
2059 {
2060   return (TARGET_SIMD
2061 	  && (mode == OImode || mode == CImode || mode == XImode));
2062 }
2063 
2064 /* Return true if MODE is an SVE predicate mode.  */
2065 static bool
2066 aarch64_sve_pred_mode_p (machine_mode mode)
2067 {
2068   return (TARGET_SVE
2069 	  && (mode == VNx16BImode
2070 	      || mode == VNx8BImode
2071 	      || mode == VNx4BImode
2072 	      || mode == VNx2BImode));
2073 }
2074 
2075 /* Three mutually-exclusive flags describing a vector or predicate type.  */
2076 const unsigned int VEC_ADVSIMD  = 1;
2077 const unsigned int VEC_SVE_DATA = 2;
2078 const unsigned int VEC_SVE_PRED = 4;
2079 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2080    a structure of 2, 3 or 4 vectors.  */
2081 const unsigned int VEC_STRUCT   = 8;
2082 /* Can be used in combination with VEC_SVE_DATA to indicate that the
2083    vector has fewer significant bytes than a full SVE vector.  */
2084 const unsigned int VEC_PARTIAL  = 16;
2085 /* Useful combinations of the above.  */
2086 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
2087 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2088 
2089 /* Return a set of flags describing the vector properties of mode MODE.
2090    Ignore modes that are not supported by the current target.  */
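/* For example (assuming the relevant target features are enabled):
   V4SImode is classified as VEC_ADVSIMD, OImode as VEC_ADVSIMD | VEC_STRUCT,
   VNx4SImode as VEC_SVE_DATA, VNx2SImode as VEC_SVE_DATA | VEC_PARTIAL and
   VNx8SImode as VEC_SVE_DATA | VEC_STRUCT.  */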
2091 static unsigned int
2092 aarch64_classify_vector_mode (machine_mode mode)
2093 {
2094   if (aarch64_advsimd_struct_mode_p (mode))
2095     return VEC_ADVSIMD | VEC_STRUCT;
2096 
2097   if (aarch64_sve_pred_mode_p (mode))
2098     return VEC_SVE_PRED;
2099 
2100   /* Make the decision based on the mode's enum value rather than its
2101      properties, so that we keep the correct classification regardless
2102      of -msve-vector-bits.  */
2103   switch (mode)
2104     {
2105     /* Partial SVE QI vectors.  */
2106     case E_VNx2QImode:
2107     case E_VNx4QImode:
2108     case E_VNx8QImode:
2109     /* Partial SVE HI vectors.  */
2110     case E_VNx2HImode:
2111     case E_VNx4HImode:
2112     /* Partial SVE SI vector.  */
2113     case E_VNx2SImode:
2114     /* Partial SVE HF vectors.  */
2115     case E_VNx2HFmode:
2116     case E_VNx4HFmode:
2117     /* Partial SVE SF vector.  */
2118     case E_VNx2SFmode:
2119       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2120 
2121     case E_VNx16QImode:
2122     case E_VNx8HImode:
2123     case E_VNx4SImode:
2124     case E_VNx2DImode:
2125     case E_VNx8BFmode:
2126     case E_VNx8HFmode:
2127     case E_VNx4SFmode:
2128     case E_VNx2DFmode:
2129       return TARGET_SVE ? VEC_SVE_DATA : 0;
2130 
2131     /* x2 SVE vectors.  */
2132     case E_VNx32QImode:
2133     case E_VNx16HImode:
2134     case E_VNx8SImode:
2135     case E_VNx4DImode:
2136     case E_VNx16BFmode:
2137     case E_VNx16HFmode:
2138     case E_VNx8SFmode:
2139     case E_VNx4DFmode:
2140     /* x3 SVE vectors.  */
2141     case E_VNx48QImode:
2142     case E_VNx24HImode:
2143     case E_VNx12SImode:
2144     case E_VNx6DImode:
2145     case E_VNx24BFmode:
2146     case E_VNx24HFmode:
2147     case E_VNx12SFmode:
2148     case E_VNx6DFmode:
2149     /* x4 SVE vectors.  */
2150     case E_VNx64QImode:
2151     case E_VNx32HImode:
2152     case E_VNx16SImode:
2153     case E_VNx8DImode:
2154     case E_VNx32BFmode:
2155     case E_VNx32HFmode:
2156     case E_VNx16SFmode:
2157     case E_VNx8DFmode:
2158       return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2159 
2160     /* 64-bit Advanced SIMD vectors.  */
2161     case E_V8QImode:
2162     case E_V4HImode:
2163     case E_V2SImode:
2164     /* ...E_V1DImode doesn't exist.  */
2165     case E_V4HFmode:
2166     case E_V4BFmode:
2167     case E_V2SFmode:
2168     case E_V1DFmode:
2169     /* 128-bit Advanced SIMD vectors.  */
2170     case E_V16QImode:
2171     case E_V8HImode:
2172     case E_V4SImode:
2173     case E_V2DImode:
2174     case E_V8HFmode:
2175     case E_V8BFmode:
2176     case E_V4SFmode:
2177     case E_V2DFmode:
2178       return TARGET_SIMD ? VEC_ADVSIMD : 0;
2179 
2180     default:
2181       return 0;
2182     }
2183 }
2184 
2185 /* Return true if MODE is any of the data vector modes, including
2186    structure modes.  */
2187 static bool
2188 aarch64_vector_data_mode_p (machine_mode mode)
2189 {
2190   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2191 }
2192 
2193 /* Return true if MODE is any form of SVE mode, including predicates,
2194    vectors and structures.  */
2195 bool
2196 aarch64_sve_mode_p (machine_mode mode)
2197 {
2198   return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2199 }
2200 
2201 /* Return true if MODE is an SVE data vector mode; either a single vector
2202    or a structure of vectors.  */
2203 static bool
2204 aarch64_sve_data_mode_p (machine_mode mode)
2205 {
2206   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2207 }
2208 
2209 /* Return the number of defined bytes in one constituent vector of
2210    SVE mode MODE, which has vector flags VEC_FLAGS.  */
2211 static poly_int64
2212 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2213 {
2214   if (vec_flags & VEC_PARTIAL)
2215     /* A single partial vector.  */
2216     return GET_MODE_SIZE (mode);
2217 
2218   if (vec_flags & VEC_SVE_DATA)
2219     /* A single vector or a tuple.  */
2220     return BYTES_PER_SVE_VECTOR;
2221 
2222   /* A single predicate.  */
2223   gcc_assert (vec_flags & VEC_SVE_PRED);
2224   return BYTES_PER_SVE_PRED;
2225 }
2226 
2227 /* Implement target hook TARGET_ARRAY_MODE.  */
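/* For example, asking for an array of three VNx4SImode vectors would
   typically yield VNx12SImode.  */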
2228 static opt_machine_mode
2229 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2230 {
2231   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2232       && IN_RANGE (nelems, 2, 4))
2233     return mode_for_vector (GET_MODE_INNER (mode),
2234 			    GET_MODE_NUNITS (mode) * nelems);
2235 
2236   return opt_machine_mode ();
2237 }
2238 
2239 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
2240 static bool
2241 aarch64_array_mode_supported_p (machine_mode mode,
2242 				unsigned HOST_WIDE_INT nelems)
2243 {
2244   if (TARGET_SIMD
2245       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2246 	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
2247       && (nelems >= 2 && nelems <= 4))
2248     return true;
2249 
2250   return false;
2251 }
2252 
2253 /* MODE is some form of SVE vector mode.  For data modes, return the number
2254    of vector register bits that each element of MODE occupies, such as 64
2255    for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2256    in a 64-bit container).  For predicate modes, return the number of
2257    data bits controlled by each significant predicate bit.  */
2258 
2259 static unsigned int
2260 aarch64_sve_container_bits (machine_mode mode)
2261 {
2262   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2263   poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2264 			     ? BITS_PER_SVE_VECTOR
2265 			     : GET_MODE_BITSIZE (mode));
2266   return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2267 }
2268 
2269 /* Return the SVE predicate mode to use for elements that have
2270    ELEM_NBYTES bytes, if such a mode exists.  */
2271 
2272 opt_machine_mode
2273 aarch64_sve_pred_mode (unsigned int elem_nbytes)
2274 {
2275   if (TARGET_SVE)
2276     {
2277       if (elem_nbytes == 1)
2278 	return VNx16BImode;
2279       if (elem_nbytes == 2)
2280 	return VNx8BImode;
2281       if (elem_nbytes == 4)
2282 	return VNx4BImode;
2283       if (elem_nbytes == 8)
2284 	return VNx2BImode;
2285     }
2286   return opt_machine_mode ();
2287 }
2288 
2289 /* Return the SVE predicate mode that should be used to control
2290    SVE mode MODE.  */
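/* For example, a full VNx4SImode vector is controlled by a VNx4BImode
   predicate, while a partial VNx2SImode vector (32-bit elements stored
   in 64-bit containers) is controlled by VNx2BImode.  */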
2291 
2292 machine_mode
2293 aarch64_sve_pred_mode (machine_mode mode)
2294 {
2295   unsigned int bits = aarch64_sve_container_bits (mode);
2296   return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2297 }
2298 
2299 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
2300 
2301 static opt_machine_mode
2302 aarch64_get_mask_mode (machine_mode mode)
2303 {
2304   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2305   if (vec_flags & VEC_SVE_DATA)
2306     return aarch64_sve_pred_mode (mode);
2307 
2308   return default_get_mask_mode (mode);
2309 }
2310 
2311 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
2312 
2313 opt_machine_mode
2314 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2315 {
2316   enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2317 			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2318   machine_mode mode;
2319   FOR_EACH_MODE_IN_CLASS (mode, mclass)
2320     if (inner_mode == GET_MODE_INNER (mode)
2321 	&& known_eq (nunits, GET_MODE_NUNITS (mode))
2322 	&& aarch64_sve_data_mode_p (mode))
2323       return mode;
2324   return opt_machine_mode ();
2325 }
2326 
2327 /* Return the integer element mode associated with SVE mode MODE.  */
2328 
2329 static scalar_int_mode
2330 aarch64_sve_element_int_mode (machine_mode mode)
2331 {
2332   poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2333 			     ? BITS_PER_SVE_VECTOR
2334 			     : GET_MODE_BITSIZE (mode));
2335   unsigned int elt_bits = vector_element_size (vector_bits,
2336 					       GET_MODE_NUNITS (mode));
2337   return int_mode_for_size (elt_bits, 0).require ();
2338 }
2339 
2340 /* Return an integer element mode that contains exactly
2341    aarch64_sve_container_bits (MODE) bits.  This is wider than
2342    aarch64_sve_element_int_mode if MODE is a partial vector,
2343    otherwise it's the same.  */
2344 
2345 static scalar_int_mode
2346 aarch64_sve_container_int_mode (machine_mode mode)
2347 {
2348   return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2349 }
2350 
2351 /* Return the integer vector mode associated with SVE mode MODE.
2352    Unlike related_int_vector_mode, this can handle the case in which
2353    MODE is a predicate (and thus has a different total size).  */
2354 
2355 machine_mode
2356 aarch64_sve_int_mode (machine_mode mode)
2357 {
2358   scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2359   return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2360 }
2361 
2362 /* Implement TARGET_VECTORIZE_RELATED_MODE.  */
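/* As a rough example: when vectorizing with VNx4SImode and asked for a
   related full-width vector with HImode elements (NUNITS of zero), the
   code below would return VNx8HImode.  */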
2363 
2364 static opt_machine_mode
2365 aarch64_vectorize_related_mode (machine_mode vector_mode,
2366 				scalar_mode element_mode,
2367 				poly_uint64 nunits)
2368 {
2369   unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2370 
2371   /* If we're operating on SVE vectors, try to return an SVE mode.  */
2372   poly_uint64 sve_nunits;
2373   if ((vec_flags & VEC_SVE_DATA)
2374       && multiple_p (BYTES_PER_SVE_VECTOR,
2375 		     GET_MODE_SIZE (element_mode), &sve_nunits))
2376     {
2377       machine_mode sve_mode;
2378       if (maybe_ne (nunits, 0U))
2379 	{
2380 	  /* Try to find a full or partial SVE mode with exactly
2381 	     NUNITS units.  */
2382 	  if (multiple_p (sve_nunits, nunits)
2383 	      && aarch64_sve_data_mode (element_mode,
2384 					nunits).exists (&sve_mode))
2385 	    return sve_mode;
2386 	}
2387       else
2388 	{
2389 	  /* Take the preferred number of units from the number of bytes
2390 	     that fit in VECTOR_MODE.  We always start by "autodetecting"
2391 	     a full vector mode with preferred_simd_mode, so vectors
2392 	     chosen here will also be full vector modes.  Then
2393 	     autovectorize_vector_modes tries smaller starting modes
2394 	     and thus smaller preferred numbers of units.  */
2395 	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2396 	  if (aarch64_sve_data_mode (element_mode,
2397 				     sve_nunits).exists (&sve_mode))
2398 	    return sve_mode;
2399 	}
2400     }
2401 
2402   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
2403   if ((vec_flags & VEC_ADVSIMD)
2404       && known_eq (nunits, 0U)
2405       && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2406       && maybe_ge (GET_MODE_BITSIZE (element_mode)
2407 		   * GET_MODE_NUNITS (vector_mode), 128U))
2408     {
2409       machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2410       if (VECTOR_MODE_P (res))
2411 	return res;
2412     }
2413 
2414   return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2415 }
2416 
2417 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
2418    prefer to use the first arithmetic operand as the else value if
2419    the else value doesn't matter, since that exactly matches the SVE
2420    destructive merging form.  For ternary operations we could either
2421    pick the first operand and use FMAD-like instructions or the last
2422    operand and use FMLA-like instructions; the latter seems more
2423    natural.  */
2424 
2425 static tree
2426 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2427 {
2428   return nops == 3 ? ops[2] : ops[0];
2429 }
2430 
2431 /* Implement TARGET_HARD_REGNO_NREGS.  */
2432 
2433 static unsigned int
2434 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2435 {
2436   /* ??? Logically we should only need to provide a value when
2437      HARD_REGNO_MODE_OK says that the combination is valid,
2438      but at the moment we need to handle all modes.  Just ignore
2439      any runtime parts for registers that can't store them.  */
2440   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2441   switch (aarch64_regno_regclass (regno))
2442     {
2443     case FP_REGS:
2444     case FP_LO_REGS:
2445     case FP_LO8_REGS:
2446       {
2447 	unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2448 	if (vec_flags & VEC_SVE_DATA)
2449 	  return exact_div (GET_MODE_SIZE (mode),
2450 			    aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2451 	return CEIL (lowest_size, UNITS_PER_VREG);
2452       }
2453     case PR_REGS:
2454     case PR_LO_REGS:
2455     case PR_HI_REGS:
2456     case FFR_REGS:
2457     case PR_AND_FFR_REGS:
2458       return 1;
2459     default:
2460       return CEIL (lowest_size, UNITS_PER_WORD);
2461     }
2462   gcc_unreachable ();
2463 }
2464 
2465 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
2466 
2467 static bool
2468 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2469 {
2470   if (GET_MODE_CLASS (mode) == MODE_CC)
2471     return regno == CC_REGNUM;
2472 
2473   if (regno == VG_REGNUM)
2474     /* This must have the same size as _Unwind_Word.  */
2475     return mode == DImode;
2476 
2477   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2478   if (vec_flags & VEC_SVE_PRED)
2479     return pr_or_ffr_regnum_p (regno);
2480 
2481   if (pr_or_ffr_regnum_p (regno))
2482     return false;
2483 
2484   if (regno == SP_REGNUM)
2485     /* The purpose of comparing with ptr_mode is to support the
2486        global register variable associated with the stack pointer
2487        register via the syntax of asm ("wsp") in ILP32.  */
2488     return mode == Pmode || mode == ptr_mode;
2489 
2490   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2491     return mode == Pmode;
2492 
2493   if (GP_REGNUM_P (regno))
2494     {
2495       if (vec_flags & VEC_ANY_SVE)
2496 	return false;
2497       if (known_le (GET_MODE_SIZE (mode), 8))
2498 	return true;
2499       if (known_le (GET_MODE_SIZE (mode), 16))
2500 	return (regno & 1) == 0;
2501     }
2502   else if (FP_REGNUM_P (regno))
2503     {
2504       if (vec_flags & VEC_STRUCT)
2505 	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2506       else
2507 	return !VECTOR_MODE_P (mode) || vec_flags != 0;
2508     }
2509 
2510   return false;
2511 }
2512 
2513 /* Return true if a function with type FNTYPE returns its value in
2514    SVE vector or predicate registers.  */
2515 
2516 static bool
2517 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2518 {
2519   tree return_type = TREE_TYPE (fntype);
2520 
2521   pure_scalable_type_info pst_info;
2522   switch (pst_info.analyze (return_type))
2523     {
2524     case pure_scalable_type_info::IS_PST:
2525       return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2526 	      && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2527 
2528     case pure_scalable_type_info::DOESNT_MATTER:
2529       gcc_assert (aarch64_return_in_memory_1 (return_type));
2530       return false;
2531 
2532     case pure_scalable_type_info::NO_ABI_IDENTITY:
2533     case pure_scalable_type_info::ISNT_PST:
2534       return false;
2535     }
2536   gcc_unreachable ();
2537 }
2538 
2539 /* Return true if a function with type FNTYPE takes arguments in
2540    SVE vector or predicate registers.  */
2541 
2542 static bool
2543 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2544 {
2545   CUMULATIVE_ARGS args_so_far_v;
2546   aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2547 				NULL_TREE, 0, true);
2548   cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2549 
2550   for (tree chain = TYPE_ARG_TYPES (fntype);
2551        chain && chain != void_list_node;
2552        chain = TREE_CHAIN (chain))
2553     {
2554       tree arg_type = TREE_VALUE (chain);
2555       if (arg_type == error_mark_node)
2556 	return false;
2557 
2558       function_arg_info arg (arg_type, /*named=*/true);
2559       apply_pass_by_reference_rules (&args_so_far_v, arg);
2560       pure_scalable_type_info pst_info;
2561       if (pst_info.analyze_registers (arg.type))
2562 	{
2563 	  unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2564 	  unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2565 	  gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2566 	  return true;
2567 	}
2568 
2569       targetm.calls.function_arg_advance (args_so_far, arg);
2570     }
2571   return false;
2572 }
2573 
2574 /* Implement TARGET_FNTYPE_ABI.  */
2575 
2576 static const predefined_function_abi &
2577 aarch64_fntype_abi (const_tree fntype)
2578 {
2579   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2580     return aarch64_simd_abi ();
2581 
2582   if (aarch64_returns_value_in_sve_regs_p (fntype)
2583       || aarch64_takes_arguments_in_sve_regs_p (fntype))
2584     return aarch64_sve_abi ();
2585 
2586   return default_function_abi;
2587 }
2588 
2589 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */
2590 
2591 static bool
2592 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2593 {
2594   return (aarch64_sve::builtin_type_p (type1)
2595 	  == aarch64_sve::builtin_type_p (type2));
2596 }
2597 
2598 /* Return true if we should emit CFI for register REGNO.  */
2599 
2600 static bool
2601 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2602 {
2603   return (GP_REGNUM_P (regno)
2604 	  || !default_function_abi.clobbers_full_reg_p (regno));
2605 }
2606 
2607 /* Return the mode we should use to save and restore register REGNO.  */
2608 
2609 static machine_mode
2610 aarch64_reg_save_mode (unsigned int regno)
2611 {
2612   if (GP_REGNUM_P (regno))
2613     return DImode;
2614 
2615   if (FP_REGNUM_P (regno))
2616     switch (crtl->abi->id ())
2617       {
2618       case ARM_PCS_AAPCS64:
2619 	/* Only the low 64 bits are saved by the base PCS.  */
2620 	return DFmode;
2621 
2622       case ARM_PCS_SIMD:
2623 	/* The vector PCS saves the low 128 bits (which is the full
2624 	   register on non-SVE targets).  */
2625 	return TFmode;
2626 
2627       case ARM_PCS_SVE:
2628 	/* Use vectors of DImode for registers that need frame
2629 	   information, so that the first 64 bits of the save slot
2630 	   are always the equivalent of what storing D<n> would give.  */
2631 	if (aarch64_emit_cfi_for_reg_p (regno))
2632 	  return VNx2DImode;
2633 
2634 	/* Use vectors of bytes otherwise, so that the layout is
2635 	   endian-agnostic, and so that we can use LDR and STR for
2636 	   big-endian targets.  */
2637 	return VNx16QImode;
2638 
2639       case ARM_PCS_TLSDESC:
2640       case ARM_PCS_UNKNOWN:
2641 	break;
2642       }
2643 
2644   if (PR_REGNUM_P (regno))
2645     /* Save the full predicate register.  */
2646     return VNx16BImode;
2647 
2648   gcc_unreachable ();
2649 }
2650 
2651 /* Implement TARGET_INSN_CALLEE_ABI.  */
2652 
2653 const predefined_function_abi &
2654 aarch64_insn_callee_abi (const rtx_insn *insn)
2655 {
2656   rtx pat = PATTERN (insn);
2657   gcc_assert (GET_CODE (pat) == PARALLEL);
2658   rtx unspec = XVECEXP (pat, 0, 1);
2659   gcc_assert (GET_CODE (unspec) == UNSPEC
2660 	      && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2661   return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2662 }
2663 
2664 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
2665    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
2666    clobbers the top 64 bits when restoring the bottom 64 bits.  */
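/* For instance, with the base PCS a register holding a TFmode value has a
   per-register size of 16 bytes, so its top half is treated as clobbered,
   whereas a DFmode value (8 bytes) is not; for ARM_PCS_SIMD the threshold
   is 16 bytes rather than 8.  */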
2667 
2668 static bool
2669 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2670 					unsigned int regno,
2671 					machine_mode mode)
2672 {
2673   if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2674     {
2675       poly_int64 per_register_size = GET_MODE_SIZE (mode);
2676       unsigned int nregs = hard_regno_nregs (regno, mode);
2677       if (nregs > 1)
2678 	per_register_size = exact_div (per_register_size, nregs);
2679       if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2680 	return maybe_gt (per_register_size, 16);
2681       return maybe_gt (per_register_size, 8);
2682     }
2683   return false;
2684 }
2685 
2686 /* Implement REGMODE_NATURAL_SIZE.  */
2687 poly_uint64
2688 aarch64_regmode_natural_size (machine_mode mode)
2689 {
2690   /* The natural size for SVE data modes is one SVE data vector,
2691      and similarly for predicates.  We can't independently modify
2692      anything smaller than that.  */
2693   /* ??? For now, only do this for variable-width SVE registers.
2694      Doing it for constant-sized registers breaks lower-subreg.c.  */
2695   /* ??? And once that's fixed, we should probably have similar
2696      code for Advanced SIMD.  */
2697   if (!aarch64_sve_vg.is_constant ())
2698     {
2699       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2700       if (vec_flags & VEC_SVE_PRED)
2701 	return BYTES_PER_SVE_PRED;
2702       if (vec_flags & VEC_SVE_DATA)
2703 	return BYTES_PER_SVE_VECTOR;
2704     }
2705   return UNITS_PER_WORD;
2706 }
2707 
2708 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
2709 machine_mode
2710 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2711 				     machine_mode mode)
2712 {
2713   /* The predicate mode determines which bits are significant and
2714      which are "don't care".  Decreasing the number of lanes would
2715      lose data while increasing the number of lanes would make bits
2716      unnecessarily significant.  */
2717   if (PR_REGNUM_P (regno))
2718     return mode;
2719   if (known_ge (GET_MODE_SIZE (mode), 4))
2720     return mode;
2721   else
2722     return SImode;
2723 }
2724 
2725 /* Return true if I's bits are consecutive ones from the MSB.  */
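/* For example, 0xffffffffffff0000 satisfies this test (its negation is a
   power of two), whereas 0x00ff000000000000 and zero do not.  */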
2726 bool
2727 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2728 {
2729   return exact_log2 (-i) != HOST_WIDE_INT_M1;
2730 }
2731 
2732 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
2733    that strcpy from constants will be faster.  */
2734 
2735 static HOST_WIDE_INT
2736 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2737 {
2738   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2739     return MAX (align, BITS_PER_WORD);
2740   return align;
2741 }
2742 
2743 /* Return true if calls to DECL should be treated as
2744    long-calls (i.e. called via a register).  */
2745 static bool
2746 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2747 {
2748   return false;
2749 }
2750 
2751 /* Return true if calls to symbol-ref SYM should be treated as
2752    long-calls (i.e. called via a register).  */
2753 bool
2754 aarch64_is_long_call_p (rtx sym)
2755 {
2756   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2757 }
2758 
2759 /* Return true if calls to symbol-ref SYM should not go through
2760    plt stubs.  */
2761 
2762 bool
2763 aarch64_is_noplt_call_p (rtx sym)
2764 {
2765   const_tree decl = SYMBOL_REF_DECL (sym);
2766 
2767   if (flag_pic
2768       && decl
2769       && (!flag_plt
2770 	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2771       && !targetm.binds_local_p (decl))
2772     return true;
2773 
2774   return false;
2775 }
2776 
2777 /* Return true if the offsets to a zero/sign-extract operation
2778    represent an expression that matches an extend operation.  The
2779    operands represent the parameters from
2780 
2781    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
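/* One illustrative combination that satisfies the checks below is
   MULT_IMM == 8 and EXTRACT_IMM == 35: extracting the low 35 bits of
   (reg * 8) is equivalent to a 32-bit extend of REG followed by a left
   shift of 3.  */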
2782 bool
2783 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2784 				rtx extract_imm)
2785 {
2786   HOST_WIDE_INT mult_val, extract_val;
2787 
2788   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2789     return false;
2790 
2791   mult_val = INTVAL (mult_imm);
2792   extract_val = INTVAL (extract_imm);
2793 
2794   if (extract_val > 8
2795       && extract_val < GET_MODE_BITSIZE (mode)
2796       && exact_log2 (extract_val & ~7) > 0
2797       && (extract_val & 7) <= 4
2798       && mult_val == (1 << (extract_val & 7)))
2799     return true;
2800 
2801   return false;
2802 }
2803 
2804 /* Emit an insn that's a simple single-set.  Both the operands must be
2805    known to be valid.  */
2806 inline static rtx_insn *
2807 emit_set_insn (rtx x, rtx y)
2808 {
2809   return emit_insn (gen_rtx_SET (x, y));
2810 }
2811 
2812 /* X and Y are two things to compare using CODE.  Emit the compare insn and
2813    return the rtx for the CC register in the appropriate mode.  */
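/* For the TImode case below this emits a CMP of the low halves followed by
   a CCMP of the high halves predicated on the low halves being equal; the
   caller then tests the resulting condition flags.  */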
2814 rtx
2815 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2816 {
2817   machine_mode cmp_mode = GET_MODE (x);
2818   machine_mode cc_mode;
2819   rtx cc_reg;
2820 
2821   if (cmp_mode == TImode)
2822     {
2823       gcc_assert (code == NE);
2824 
2825       cc_mode = CCmode;
2826       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2827 
2828       rtx x_lo = operand_subword (x, 0, 0, TImode);
2829       rtx y_lo = operand_subword (y, 0, 0, TImode);
2830       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2831 
2832       rtx x_hi = operand_subword (x, 1, 0, TImode);
2833       rtx y_hi = operand_subword (y, 1, 0, TImode);
2834       emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2835 			       gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2836 			       GEN_INT (AARCH64_EQ)));
2837     }
2838   else
2839     {
2840       cc_mode = SELECT_CC_MODE (code, x, y);
2841       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2842       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2843     }
2844   return cc_reg;
2845 }
2846 
2847 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
2848 
2849 static rtx
2850 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2851                                   machine_mode y_mode)
2852 {
2853   if (y_mode == E_QImode || y_mode == E_HImode)
2854     {
2855       if (CONST_INT_P (y))
2856 	{
2857 	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2858 	  y_mode = SImode;
2859 	}
2860       else
2861 	{
2862 	  rtx t, cc_reg;
2863 	  machine_mode cc_mode;
2864 
2865 	  t = gen_rtx_ZERO_EXTEND (SImode, y);
2866 	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2867 	  cc_mode = CC_SWPmode;
2868 	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2869 	  emit_set_insn (cc_reg, t);
2870 	  return cc_reg;
2871 	}
2872     }
2873 
2874   if (!aarch64_plus_operand (y, y_mode))
2875     y = force_reg (y_mode, y);
2876 
2877   return aarch64_gen_compare_reg (code, x, y);
2878 }
2879 
2880 /* Build the SYMBOL_REF for __tls_get_addr.  */
2881 
2882 static GTY(()) rtx tls_get_addr_libfunc;
2883 
2884 rtx
2885 aarch64_tls_get_addr (void)
2886 {
2887   if (!tls_get_addr_libfunc)
2888     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2889   return tls_get_addr_libfunc;
2890 }
2891 
2892 /* Return the TLS model to use for ADDR.  */
2893 
2894 static enum tls_model
2895 tls_symbolic_operand_type (rtx addr)
2896 {
2897   enum tls_model tls_kind = TLS_MODEL_NONE;
2898   if (GET_CODE (addr) == CONST)
2899     {
2900       poly_int64 addend;
2901       rtx sym = strip_offset (addr, &addend);
2902       if (GET_CODE (sym) == SYMBOL_REF)
2903 	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2904     }
2905   else if (GET_CODE (addr) == SYMBOL_REF)
2906     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2907 
2908   return tls_kind;
2909 }
2910 
2911 /* We accept LO_SUMs as legitimate addresses, so that combine can
2912    take care of combining addresses where necessary, but for
2913    generation purposes we generate the address
2914    as follows:
2915    RTL                               Absolute
2916    tmp = hi (symbol_ref);            adrp  x1, foo
2917    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
2918                                      nop
2919 
2920    PIC                               TLS
2921    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
2922    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
2923                                      bl   __tls_get_addr
2924                                      nop
2925 
2926    Load TLS symbol, depending on TLS mechanism and TLS access model.
2927 
2928    Global Dynamic - Traditional TLS:
2929    adrp tmp, :tlsgd:imm
2930    add  dest, tmp, #:tlsgd_lo12:imm
2931    bl   __tls_get_addr
2932 
2933    Global Dynamic - TLS Descriptors:
2934    adrp dest, :tlsdesc:imm
2935    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
2936    add  dest, dest, #:tlsdesc_lo12:imm
2937    blr  tmp
2938    mrs  tp, tpidr_el0
2939    add  dest, dest, tp
2940 
2941    Initial Exec:
2942    mrs  tp, tpidr_el0
2943    adrp tmp, :gottprel:imm
2944    ldr  dest, [tmp, #:gottprel_lo12:imm]
2945    add  dest, dest, tp
2946 
2947    Local Exec:
2948    mrs  tp, tpidr_el0
2949    add  t0, tp, #:tprel_hi12:imm, lsl #12
2950    add  t0, t0, #:tprel_lo12_nc:imm
2951 */
2952 
2953 static void
2954 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2955 				   enum aarch64_symbol_type type)
2956 {
2957   switch (type)
2958     {
2959     case SYMBOL_SMALL_ABSOLUTE:
2960       {
2961 	/* In ILP32, the mode of dest can be either SImode or DImode.  */
2962 	rtx tmp_reg = dest;
2963 	machine_mode mode = GET_MODE (dest);
2964 
2965 	gcc_assert (mode == Pmode || mode == ptr_mode);
2966 
2967 	if (can_create_pseudo_p ())
2968 	  tmp_reg = gen_reg_rtx (mode);
2969 
2970 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2971 	emit_insn (gen_add_losym (dest, tmp_reg, imm));
2972 	return;
2973       }
2974 
2975     case SYMBOL_TINY_ABSOLUTE:
2976       emit_insn (gen_rtx_SET (dest, imm));
2977       return;
2978 
2979     case SYMBOL_SMALL_GOT_28K:
2980       {
2981 	machine_mode mode = GET_MODE (dest);
2982 	rtx gp_rtx = pic_offset_table_rtx;
2983 	rtx insn;
2984 	rtx mem;
2985 
2986 	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2987 	   here before RTL expansion.  Tree IVOPTs will generate RTL patterns
2988 	   to decide rtx costs, in which case pic_offset_table_rtx is not
2989 	   initialized.  In that case there is no need to generate the first
2990 	   adrp instruction, as the final cost for global variable access is
2991 	   one instruction.  */
2992 	if (gp_rtx != NULL)
2993 	  {
2994 	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
2995 	       we use the page base as the GOT base, the first page may be
2996 	       wasted; in the worst case only 28K of space is left for the GOT).
2997 
2998 	       The generated instruction sequence for accessing a global
2999 	       variable is:
3000 
3001 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3002 
3003 	       Only one instruction is needed.  But we must initialize
3004 	       pic_offset_table_rtx properly.  We generate an initialization
3005 	       insn for every global access and let CSE remove the redundant ones.
3006 
3007 	       The final instruction sequence for multiple global variable
3008 	       accesses will look like the following.
3009 
3010 		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3011 
3012 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3013 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3014 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3015 		 ...  */
3016 
3017 	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3018 	    crtl->uses_pic_offset_table = 1;
3019 	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3020 
3021 	    if (mode != GET_MODE (gp_rtx))
3022              gp_rtx = gen_lowpart (mode, gp_rtx);
3023 
3024 	  }
3025 
3026 	if (mode == ptr_mode)
3027 	  {
3028 	    if (mode == DImode)
3029 	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3030 	    else
3031 	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3032 
3033 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
3034 	  }
3035 	else
3036 	  {
3037 	    gcc_assert (mode == Pmode);
3038 
3039 	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3040 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3041 	  }
3042 
3043 	/* The operand is expected to be a MEM.  Whenever the related insn
3044 	   pattern changes, the code above that calculates MEM should be
3045 	   updated.  */
3046 	gcc_assert (GET_CODE (mem) == MEM);
3047 	MEM_READONLY_P (mem) = 1;
3048 	MEM_NOTRAP_P (mem) = 1;
3049 	emit_insn (insn);
3050 	return;
3051       }
3052 
3053     case SYMBOL_SMALL_GOT_4G:
3054       {
3055 	/* In ILP32, the mode of dest can be either SImode or DImode,
3056 	   while the got entry is always of SImode size.  The mode of
3057 	   dest depends on how dest is used: if dest is assigned to a
3058 	   pointer (e.g. in the memory), it has SImode; it may have
3059 	   DImode if dest is dereferenced to access the memory.
3060 	   This is why we have to handle three different ldr_got_small
3061 	   patterns here (two patterns for ILP32).  */
3062 
3063 	rtx insn;
3064 	rtx mem;
3065 	rtx tmp_reg = dest;
3066 	machine_mode mode = GET_MODE (dest);
3067 
3068 	if (can_create_pseudo_p ())
3069 	  tmp_reg = gen_reg_rtx (mode);
3070 
3071 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3072 	if (mode == ptr_mode)
3073 	  {
3074 	    if (mode == DImode)
3075 	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3076 	    else
3077 	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3078 
3079 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
3080 	  }
3081 	else
3082 	  {
3083 	    gcc_assert (mode == Pmode);
3084 
3085 	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3086 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3087 	  }
3088 
3089 	gcc_assert (GET_CODE (mem) == MEM);
3090 	MEM_READONLY_P (mem) = 1;
3091 	MEM_NOTRAP_P (mem) = 1;
3092 	emit_insn (insn);
3093 	return;
3094       }
3095 
3096     case SYMBOL_SMALL_TLSGD:
3097       {
3098 	rtx_insn *insns;
3099 	/* The return type of __tls_get_addr is the C pointer type
3100 	   so use ptr_mode.  */
3101 	rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3102 	rtx tmp_reg = dest;
3103 
3104 	if (GET_MODE (dest) != ptr_mode)
3105 	  tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3106 
3107 	start_sequence ();
3108 	if (ptr_mode == SImode)
3109 	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3110 	else
3111 	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3112 	insns = get_insns ();
3113 	end_sequence ();
3114 
3115 	RTL_CONST_CALL_P (insns) = 1;
3116 	emit_libcall_block (insns, tmp_reg, result, imm);
3117 	/* Convert back to the mode of DEST, adding a zero_extend
3118 	   from SImode (ptr_mode) to DImode (Pmode).  */
3119 	if (dest != tmp_reg)
3120 	  convert_move (dest, tmp_reg, true);
3121 	return;
3122       }
3123 
3124     case SYMBOL_SMALL_TLSDESC:
3125       {
3126 	machine_mode mode = GET_MODE (dest);
3127 	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3128 	rtx tp;
3129 
3130 	gcc_assert (mode == Pmode || mode == ptr_mode);
3131 
3132 	/* In ILP32, the got entry is always of SImode size.  Unlike
3133 	   small GOT, the dest is fixed at reg 0.  */
3134 	if (TARGET_ILP32)
3135 	  emit_insn (gen_tlsdesc_small_si (imm));
3136 	else
3137 	  emit_insn (gen_tlsdesc_small_di (imm));
3138 	tp = aarch64_load_tp (NULL);
3139 
3140 	if (mode != Pmode)
3141 	  tp = gen_lowpart (mode, tp);
3142 
3143 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3144 	if (REG_P (dest))
3145 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3146 	return;
3147       }
3148 
3149     case SYMBOL_SMALL_TLSIE:
3150       {
3151 	/* In ILP32, the mode of dest can be either SImode or DImode,
3152 	   while the got entry is always of SImode size.  The mode of
3153 	   dest depends on how dest is used: if dest is assigned to a
3154 	   pointer (e.g. in the memory), it has SImode; it may have
3155 	   DImode if dest is dereferenced to access the memory.
3156 	   This is why we have to handle three different tlsie_small
3157 	   patterns here (two patterns for ILP32).  */
3158 	machine_mode mode = GET_MODE (dest);
3159 	rtx tmp_reg = gen_reg_rtx (mode);
3160 	rtx tp = aarch64_load_tp (NULL);
3161 
3162 	if (mode == ptr_mode)
3163 	  {
3164 	    if (mode == DImode)
3165 	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3166 	    else
3167 	      {
3168 		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3169 		tp = gen_lowpart (mode, tp);
3170 	      }
3171 	  }
3172 	else
3173 	  {
3174 	    gcc_assert (mode == Pmode);
3175 	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3176 	  }
3177 
3178 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3179 	if (REG_P (dest))
3180 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3181 	return;
3182       }
3183 
3184     case SYMBOL_TLSLE12:
3185     case SYMBOL_TLSLE24:
3186     case SYMBOL_TLSLE32:
3187     case SYMBOL_TLSLE48:
3188       {
3189 	machine_mode mode = GET_MODE (dest);
3190 	rtx tp = aarch64_load_tp (NULL);
3191 
3192 	if (mode != Pmode)
3193 	  tp = gen_lowpart (mode, tp);
3194 
3195 	switch (type)
3196 	  {
3197 	  case SYMBOL_TLSLE12:
3198 	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3199 			(dest, tp, imm));
3200 	    break;
3201 	  case SYMBOL_TLSLE24:
3202 	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3203 			(dest, tp, imm));
3204 	    break;
3205 	  case SYMBOL_TLSLE32:
3206 	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3207 			(dest, imm));
3208 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3209 			(dest, dest, tp));
3210 	    break;
3211 	  case SYMBOL_TLSLE48:
3212 	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3213 			(dest, imm));
3214 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3215 			(dest, dest, tp));
3216 	    break;
3217 	  default:
3218 	    gcc_unreachable ();
3219 	  }
3220 
3221 	if (REG_P (dest))
3222 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3223 	return;
3224       }
3225 
3226     case SYMBOL_TINY_GOT:
3227       {
3228 	rtx insn;
3229 	machine_mode mode = GET_MODE (dest);
3230 
3231 	if (mode == ptr_mode)
3232 	  insn = gen_ldr_got_tiny (mode, dest, imm);
3233 	else
3234 	  {
3235 	    gcc_assert (mode == Pmode);
3236 	    insn = gen_ldr_got_tiny_sidi (dest, imm);
3237 	  }
3238 
3239 	emit_insn (insn);
3240 	return;
3241       }
3242 
3243     case SYMBOL_TINY_TLSIE:
3244       {
3245 	machine_mode mode = GET_MODE (dest);
3246 	rtx tp = aarch64_load_tp (NULL);
3247 
3248 	if (mode == ptr_mode)
3249 	  {
3250 	    if (mode == DImode)
3251 	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3252 	    else
3253 	      {
3254 		tp = gen_lowpart (mode, tp);
3255 		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3256 	      }
3257 	  }
3258 	else
3259 	  {
3260 	    gcc_assert (mode == Pmode);
3261 	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3262 	  }
3263 
3264 	if (REG_P (dest))
3265 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3266 	return;
3267       }
3268 
3269     default:
3270       gcc_unreachable ();
3271     }
3272 }
3273 
3274 /* Emit a move from SRC to DEST.  Assume that the move expanders can
3275    handle all moves if !can_create_pseudo_p ().  The distinction is
3276    important because, unlike emit_move_insn, the move expanders know
3277    how to force Pmode objects into the constant pool even when the
3278    constant pool address is not itself legitimate.  */
3279 static rtx
3280 aarch64_emit_move (rtx dest, rtx src)
3281 {
3282   return (can_create_pseudo_p ()
3283 	  ? emit_move_insn (dest, src)
3284 	  : emit_move_insn_1 (dest, src));
3285 }
3286 
3287 /* Apply UNOPTAB to OP and store the result in DEST.  */
3288 
3289 static void
3290 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3291 {
3292   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3293   if (dest != tmp)
3294     emit_move_insn (dest, tmp);
3295 }
3296 
3297 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
3298 
3299 static void
3300 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3301 {
3302   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3303 			  OPTAB_DIRECT);
3304   if (dest != tmp)
3305     emit_move_insn (dest, tmp);
3306 }
3307 
3308 /* Split a 128-bit move operation into two 64-bit move operations,
3309    taking care to handle partial overlap of register to register
3310    copies.  Special cases are needed when moving between GP regs and
3311    FP regs.  SRC can be a register, constant or memory; DST a register
3312    or memory.  If either operand is memory it must not have any side
3313    effects.  */
3314 void
3315 aarch64_split_128bit_move (rtx dst, rtx src)
3316 {
3317   rtx dst_lo, dst_hi;
3318   rtx src_lo, src_hi;
3319 
3320   machine_mode mode = GET_MODE (dst);
3321 
3322   gcc_assert (mode == TImode || mode == TFmode);
3323   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3324   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3325 
3326   if (REG_P (dst) && REG_P (src))
3327     {
3328       int src_regno = REGNO (src);
3329       int dst_regno = REGNO (dst);
3330 
3331       /* Handle FP <-> GP regs.  */
3332       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3333 	{
3334 	  src_lo = gen_lowpart (word_mode, src);
3335 	  src_hi = gen_highpart (word_mode, src);
3336 
3337 	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3338 	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3339 	  return;
3340 	}
3341       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3342 	{
3343 	  dst_lo = gen_lowpart (word_mode, dst);
3344 	  dst_hi = gen_highpart (word_mode, dst);
3345 
3346 	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3347 	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3348 	  return;
3349 	}
3350     }
3351 
3352   dst_lo = gen_lowpart (word_mode, dst);
3353   dst_hi = gen_highpart (word_mode, dst);
3354   src_lo = gen_lowpart (word_mode, src);
3355   src_hi = gen_highpart_mode (word_mode, mode, src);
3356 
3357   /* At most one pairing may overlap.  */
3358   if (reg_overlap_mentioned_p (dst_lo, src_hi))
3359     {
3360       aarch64_emit_move (dst_hi, src_hi);
3361       aarch64_emit_move (dst_lo, src_lo);
3362     }
3363   else
3364     {
3365       aarch64_emit_move (dst_lo, src_lo);
3366       aarch64_emit_move (dst_hi, src_hi);
3367     }
3368 }
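
/* Illustrative example: splitting a TImode copy from the GP register pair
   {x0, x1} into {x1, x2}.  Here dst_lo is x1 and src_hi is also x1, so the
   overlap check above fires and the high half is copied first (x2 = x1)
   followed by the low half (x1 = x0), preserving both values.  */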
3369 
/* Return true if a 128-bit move from SRC to DST should be split into two
   64-bit halves; a move in which both DST and SRC are FP/SIMD registers
   can instead stay as a single full-register move.  */

3370 bool
3371 aarch64_split_128bit_move_p (rtx dst, rtx src)
3372 {
3373   return (! REG_P (src)
3374 	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
3375 }
3376 
3377 /* Split a complex SIMD combine.  */
3378 
3379 void
3380 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3381 {
3382   machine_mode src_mode = GET_MODE (src1);
3383   machine_mode dst_mode = GET_MODE (dst);
3384 
3385   gcc_assert (VECTOR_MODE_P (dst_mode));
3386   gcc_assert (register_operand (dst, dst_mode)
3387 	      && register_operand (src1, src_mode)
3388 	      && register_operand (src2, src_mode));
3389 
3390   emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3391   return;
3392 }
3393 
3394 /* Split a complex SIMD move.  */
3395 
3396 void
3397 aarch64_split_simd_move (rtx dst, rtx src)
3398 {
3399   machine_mode src_mode = GET_MODE (src);
3400   machine_mode dst_mode = GET_MODE (dst);
3401 
3402   gcc_assert (VECTOR_MODE_P (dst_mode));
3403 
3404   if (REG_P (dst) && REG_P (src))
3405     {
3406       gcc_assert (VECTOR_MODE_P (src_mode));
3407       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3408     }
3409 }
3410 
/* Return true if zero-extending constant Y from YMODE to XMODE yields
   exactly the constant X.  */

3411 bool
3412 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3413 			      machine_mode ymode, rtx y)
3414 {
3415   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3416   gcc_assert (r != NULL);
3417   return rtx_equal_p (x, r);
3418 }
3419 
3420 /* Return TARGET if it is nonnull and a register of mode MODE.
3421    Otherwise, return a fresh register of mode MODE if we can,
3422    or TARGET reinterpreted as MODE if we can't.  */
3423 
3424 static rtx
3425 aarch64_target_reg (rtx target, machine_mode mode)
3426 {
3427   if (target && REG_P (target) && GET_MODE (target) == mode)
3428     return target;
3429   if (!can_create_pseudo_p ())
3430     {
3431       gcc_assert (target);
3432       return gen_lowpart (mode, target);
3433     }
3434   return gen_reg_rtx (mode);
3435 }
3436 
3437 /* Return a register that contains the constant in BUILDER, given that
3438    the constant is a legitimate move operand.  Use TARGET as the register
3439    if it is nonnull and convenient.  */
3440 
3441 static rtx
3442 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3443 {
3444   rtx src = builder.build ();
3445   target = aarch64_target_reg (target, GET_MODE (src));
3446   emit_insn (gen_rtx_SET (target, src));
3447   return target;
3448 }
3449 
3450 static rtx
3451 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3452 {
3453   if (can_create_pseudo_p ())
3454     return force_reg (mode, value);
3455   else
3456     {
3457       gcc_assert (x);
3458       aarch64_emit_move (x, value);
3459       return x;
3460     }
3461 }
3462 
3463 /* Return true if predicate value X is a constant in which every element
3464    is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
3465    value, i.e. as a predicate in which all bits are significant.  */
3466 
3467 static bool
3468 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3469 {
3470   if (GET_CODE (x) != CONST_VECTOR)
3471     return false;
3472 
3473   unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3474 					     GET_MODE_NUNITS (GET_MODE (x)));
3475   unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3476   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3477   builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3478 
3479   unsigned int nelts = const_vector_encoded_nelts (x);
3480   for (unsigned int i = 0; i < nelts; ++i)
3481     {
3482       rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3483       if (!CONST_INT_P (elt))
3484 	return false;
3485 
3486       builder.quick_push (elt);
3487       for (unsigned int j = 1; j < factor; ++j)
3488 	builder.quick_push (const0_rtx);
3489     }
3490   builder.finalize ();
3491   return true;
3492 }
3493 
3494 /* BUILDER contains a predicate constant of mode VNx16BI.  Return the
3495    widest predicate element size it can have (that is, the largest size
3496    for which each element would still be 0 or 1).  */
3497 
3498 unsigned int
3499 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3500 {
3501   /* Start with the most optimistic assumption: that we only need
3502      one bit per pattern.  This is what we will use if only the first
3503      bit in each pattern is ever set.  */
3504   unsigned int mask = GET_MODE_SIZE (DImode);
3505   mask |= builder.npatterns ();
3506 
3507   /* Look for set bits.  */
3508   unsigned int nelts = builder.encoded_nelts ();
3509   for (unsigned int i = 1; i < nelts; ++i)
3510     if (INTVAL (builder.elt (i)) != 0)
3511       {
3512 	if (i & 1)
3513 	  return 1;
3514 	mask |= i;
3515       }
3516   return mask & -mask;
3517 }
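
/* Worked example (for illustration): a VNx16BI constant whose bits
   alternate 1, 0, 1, 0, ... is encoded with npatterns () == 2 and one
   element per pattern.  The function starts with mask = 8 | 2 = 0b1010;
   the only encoded set bit is at index 0, so the loop changes nothing and
   the result is mask & -mask = 2: the constant is a valid predicate for
   element sizes of 1 or 2 bytes, but not for 4 or 8.  */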
3518 
3519 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3520    return that predicate mode, otherwise return opt_machine_mode ().  */
3521 
3522 opt_machine_mode
3523 aarch64_ptrue_all_mode (rtx x)
3524 {
3525   gcc_assert (GET_MODE (x) == VNx16BImode);
3526   if (GET_CODE (x) != CONST_VECTOR
3527       || !CONST_VECTOR_DUPLICATE_P (x)
3528       || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3529       || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3530     return opt_machine_mode ();
3531 
3532   unsigned int nelts = const_vector_encoded_nelts (x);
3533   for (unsigned int i = 1; i < nelts; ++i)
3534     if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3535       return opt_machine_mode ();
3536 
3537   return aarch64_sve_pred_mode (nelts);
3538 }
3539 
3540 /* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
3541    that the constant would have with predicate element size ELT_SIZE
3542    (ignoring the upper bits in each element) and return:
3543 
3544    * -1 if all bits are set
3545    * N if the predicate has N leading set bits followed by all clear bits
3546    * 0 if the predicate does not have any of these forms.  */
3547 
3548 int
3549 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3550 			      unsigned int elt_size)
3551 {
3552   /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3553      followed by set bits.  */
3554   if (builder.nelts_per_pattern () == 3)
3555     return 0;
3556 
3557   /* Skip over leading set bits.  */
3558   unsigned int nelts = builder.encoded_nelts ();
3559   unsigned int i = 0;
3560   for (; i < nelts; i += elt_size)
3561     if (INTVAL (builder.elt (i)) == 0)
3562       break;
3563   unsigned int vl = i / elt_size;
3564 
3565   /* Check for the all-true case.  */
3566   if (i == nelts)
3567     return -1;
3568 
3569   /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3570      repeating pattern of set bits followed by clear bits.  */
3571   if (builder.nelts_per_pattern () != 2)
3572     return 0;
3573 
3574   /* We have a "foreground" value and a duplicated "background" value.
3575      If the background might repeat and the last set bit belongs to it,
3576      we might have set bits followed by clear bits followed by set bits.  */
3577   if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3578     return 0;
3579 
3580   /* Make sure that the rest are all clear.  */
3581   for (; i < nelts; i += elt_size)
3582     if (INTVAL (builder.elt (i)) != 0)
3583       return 0;
3584 
3585   return vl;
3586 }
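
/* Worked example (for illustration): take ELT_SIZE == 2 and a predicate
   constant whose encoded bits start 1,0, 1,0, 1,0, 0,0, ... with a
   repeating all-zero background (nelts_per_pattern == 2).  The first loop
   steps I by 2, skips the set bits at indices 0, 2 and 4 and stops at
   I == 6, so VL == 3; the remaining encoded bits are all clear, so the
   function returns 3, i.e. a "first three .H elements" predicate.  */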
3587 
3588 /* See if there is an svpattern that encodes an SVE predicate of mode
3589    PRED_MODE in which the first VL bits are set and the rest are clear.
3590    Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3591    A VL of -1 indicates an all-true vector.  */
3592 
3593 aarch64_svpattern
3594 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3595 {
3596   if (vl < 0)
3597     return AARCH64_SV_ALL;
3598 
3599   if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3600     return AARCH64_NUM_SVPATTERNS;
3601 
3602   if (vl >= 1 && vl <= 8)
3603     return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3604 
3605   if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3606     return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3607 
3608   int max_vl;
3609   if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3610     {
3611       if (vl == (max_vl / 3) * 3)
3612 	return AARCH64_SV_MUL3;
3613       /* These would only trigger for non-power-of-2 lengths.  */
3614       if (vl == (max_vl & -4))
3615 	return AARCH64_SV_MUL4;
3616       if (vl == (1 << floor_log2 (max_vl)))
3617 	return AARCH64_SV_POW2;
3618       if (vl == max_vl)
3619 	return AARCH64_SV_ALL;
3620     }
3621   return AARCH64_NUM_SVPATTERNS;
3622 }
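
/* Examples (illustrative): vl == -1 maps to AARCH64_SV_ALL and vl == 5 to
   AARCH64_SV_VL5.  Powers of two from 16 to 256 map to AARCH64_SV_VL16
   ... AARCH64_SV_VL256, provided the requested length cannot exceed the
   number of elements in PRED_MODE; anything else either matches one of
   the POW2/MUL3/MUL4/ALL cases for a known constant vector length or
   yields AARCH64_NUM_SVPATTERNS.  */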
3623 
3624 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3625    bits has the lowest bit set and the upper bits clear.  This is the
3626    VNx16BImode equivalent of a PTRUE for controlling elements of
3627    ELT_SIZE bytes.  However, because the constant is VNx16BImode,
3628    all bits are significant, even the upper zeros.  */
3629 
3630 rtx
3631 aarch64_ptrue_all (unsigned int elt_size)
3632 {
3633   rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3634   builder.quick_push (const1_rtx);
3635   for (unsigned int i = 1; i < elt_size; ++i)
3636     builder.quick_push (const0_rtx);
3637   return builder.build ();
3638 }
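
/* For example (illustrative), aarch64_ptrue_all (2) builds the VNx16BI
   constant 1, 0, 1, 0, ..., which as a governing predicate enables every
   .H element while keeping the odd "padding" bits explicitly zero.  */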
3639 
3640 /* Return an all-true predicate register of mode MODE.  */
3641 
3642 rtx
3643 aarch64_ptrue_reg (machine_mode mode)
3644 {
3645   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3646   rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3647   return gen_lowpart (mode, reg);
3648 }
3649 
3650 /* Return an all-false predicate register of mode MODE.  */
3651 
3652 rtx
3653 aarch64_pfalse_reg (machine_mode mode)
3654 {
3655   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3656   rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3657   return gen_lowpart (mode, reg);
3658 }
3659 
3660 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3661    true, or alternatively if we know that the operation predicated by
3662 	   PRED1[0] is safe to perform whenever PRED2 is true.  PRED1[1] is an
3663    aarch64_sve_gp_strictness operand that describes the operation
3664    predicated by PRED1[0].  */
3665 
3666 bool
3667 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3668 {
3669   machine_mode mode = GET_MODE (pred2);
3670   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3671 	      && mode == GET_MODE (pred1[0])
3672 	      && aarch64_sve_gp_strictness (pred1[1], SImode));
3673   return (pred1[0] == CONSTM1_RTX (mode)
3674 	  || INTVAL (pred1[1]) == SVE_RELAXED_GP
3675 	  || rtx_equal_p (pred1[0], pred2));
3676 }
3677 
3678 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3679    for it.  PRED2[0] is the predicate for the instruction whose result
3680    is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3681    for it.  Return true if we can prove that the two predicates are
3682    equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3683    with PRED1[0] without changing behavior.  */
3684 
3685 bool
3686 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3687 {
3688   machine_mode mode = GET_MODE (pred1[0]);
3689   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3690 	      && mode == GET_MODE (pred2[0])
3691 	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
3692 	      && aarch64_sve_ptrue_flag (pred2[1], SImode));
3693 
3694   bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3695 		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3696   bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3697 		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3698   return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3699 }
3700 
3701 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3702    DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3703    Use TARGET as the target register if nonnull and convenient.  */
3704 
3705 static rtx
3706 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3707 			  machine_mode data_mode, rtx op1, rtx op2)
3708 {
3709   insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3710   expand_operand ops[5];
3711   create_output_operand (&ops[0], target, pred_mode);
3712   create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3713   create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3714   create_input_operand (&ops[3], op1, data_mode);
3715   create_input_operand (&ops[4], op2, data_mode);
3716   expand_insn (icode, 5, ops);
3717   return ops[0].value;
3718 }
3719 
3720 /* Use a comparison to convert integer vector SRC into MODE, which is
3721    the corresponding SVE predicate mode.  Use TARGET for the result
3722    if it's nonnull and convenient.  */
3723 
3724 rtx
3725 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3726 {
3727   machine_mode src_mode = GET_MODE (src);
3728   return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3729 				   src, CONST0_RTX (src_mode));
3730 }
3731 
3732 /* Return the assembly token for svprfop value PRFOP.  */
3733 
3734 static const char *
3735 svprfop_token (enum aarch64_svprfop prfop)
3736 {
3737   switch (prfop)
3738     {
3739 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3740     AARCH64_FOR_SVPRFOP (CASE)
3741 #undef CASE
3742     case AARCH64_NUM_SVPRFOPS:
3743       break;
3744     }
3745   gcc_unreachable ();
3746 }
3747 
3748 /* Return the assembly string for an SVE prefetch operation with
3749    mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3750    and that SUFFIX is the format for the remaining operands.  */
3751 
3752 char *
3753 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3754 			     const char *suffix)
3755 {
3756   static char buffer[128];
3757   aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3758   unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3759 				   mnemonic, svprfop_token (prfop), suffix);
3760   gcc_assert (written < sizeof (buffer));
3761   return buffer;
3762 }
3763 
3764 /* Check whether we can calculate the number of elements in PATTERN
3765    at compile time, given that there are NELTS_PER_VQ elements per
3766    128-bit block.  Return the value if so, otherwise return -1.  */
3767 
3768 HOST_WIDE_INT
3769 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3770 {
3771   unsigned int vl, const_vg;
3772   if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3773     vl = 1 + (pattern - AARCH64_SV_VL1);
3774   else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3775     vl = 16 << (pattern - AARCH64_SV_VL16);
3776   else if (aarch64_sve_vg.is_constant (&const_vg))
3777     {
3778       /* There are two vector granules per quadword.  */
3779       unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3780       switch (pattern)
3781 	{
3782 	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3783 	case AARCH64_SV_MUL4: return nelts & -4;
3784 	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3785 	case AARCH64_SV_ALL: return nelts;
3786 	default: gcc_unreachable ();
3787 	}
3788     }
3789   else
3790     return -1;
3791 
3792   /* There are two vector granules per quadword.  */
3793   poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3794   if (known_le (vl, nelts_all))
3795     return vl;
3796 
3797   /* Requesting more elements than are available results in a PFALSE.  */
3798   if (known_gt (vl, nelts_all))
3799     return 0;
3800 
3801   return -1;
3802 }
3803 
3804 /* Return true if we can move VALUE into a register using a single
3805    CNT[BHWD] instruction.  */
3806 
3807 static bool
3808 aarch64_sve_cnt_immediate_p (poly_int64 value)
3809 {
3810   HOST_WIDE_INT factor = value.coeffs[0];
3811   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
3812   return (value.coeffs[1] == factor
3813 	  && IN_RANGE (factor, 2, 16 * 16)
3814 	  && (factor & 1) == 0
3815 	  && factor <= 16 * (factor & -factor));
3816 }
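
/* Worked example (for illustration): the poly_int64 value (48, 48) has
   FACTOR == 48, which is even, within [2, 256] and satisfies
   48 <= 16 * (48 & -48) == 256, so it can be loaded with a single
   CNTB ..., ALL, MUL #3 (16 bytes per 128-bit quadword times 3).  A value
   such as (34, 34) fails the last test (16 * 2 == 32 < 34) because it
   would need a multiplier greater than 16.  */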
3817 
3818 /* Likewise for rtx X.  */
3819 
3820 bool
3821 aarch64_sve_cnt_immediate_p (rtx x)
3822 {
3823   poly_int64 value;
3824   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3825 }
3826 
3827 /* Return the asm string for an instruction with a CNT-like vector size
3828    operand (a vector pattern followed by a multiplier in the range [1, 16]).
3829    PREFIX is the mnemonic without the size suffix and OPERANDS is the
3830    first part of the operands template (the part that comes before the
3831    vector size itself).  PATTERN is the pattern to use.  FACTOR is the
3832    count to apply per 128-bit quadword, i.e. the number of elements in
3833    each quadword multiplied by the CNT multiplier.  NELTS_PER_VQ, if
   nonzero, is the number of elements in each quadword.  If it is zero,
   we can use any element size.  */
3834 
3835 static char *
3836 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3837 				  aarch64_svpattern pattern,
3838 				  unsigned int factor,
3839 				  unsigned int nelts_per_vq)
3840 {
3841   static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3842 
3843   if (nelts_per_vq == 0)
3844     /* There is some overlap in the ranges of the four CNT instructions.
3845        Here we always use the smallest possible element size, so that the
3846        multiplier is 1 wherever possible.  */
3847     nelts_per_vq = factor & -factor;
3848   int shift = std::min (exact_log2 (nelts_per_vq), 4);
3849   gcc_assert (IN_RANGE (shift, 1, 4));
3850   char suffix = "dwhb"[shift - 1];
3851 
3852   factor >>= shift;
3853   unsigned int written;
3854   if (pattern == AARCH64_SV_ALL && factor == 1)
3855     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3856 			prefix, suffix, operands);
3857   else if (factor == 1)
3858     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3859 			prefix, suffix, operands, svpattern_token (pattern));
3860   else
3861     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3862 			prefix, suffix, operands, svpattern_token (pattern),
3863 			factor);
3864   gcc_assert (written < sizeof (buffer));
3865   return buffer;
3866 }
3867 
3868 /* Return the asm string for an instruction with a CNT-like vector size
3869    operand (a vector pattern followed by a multiplier in the range [1, 16]).
3870    PREFIX is the mnemonic without the size suffix and OPERANDS is the
3871    first part of the operands template (the part that comes before the
3872    vector size itself).  X is the value of the vector size operand,
3873    as a polynomial integer rtx; we need to convert this into an "all"
3874    pattern with a multiplier.  */
3875 
3876 char *
3877 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3878 				  rtx x)
3879 {
3880   poly_int64 value = rtx_to_poly_int64 (x);
3881   gcc_assert (aarch64_sve_cnt_immediate_p (value));
3882   return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3883 					   value.coeffs[1], 0);
3884 }
3885 
3886 /* Return the asm string for an instruction with a CNT-like vector size
3887    operand (a vector pattern followed by a multiplier in the range [1, 16]).
3888    PREFIX is the mnemonic without the size suffix and OPERANDS is the
3889    first part of the operands template (the part that comes before the
3890    vector size itself).  CNT_PAT[0..2] are the operands of the
3891    UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */
3892 
3893 char *
3894 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3895 				      const char *operands, rtx *cnt_pat)
3896 {
3897   aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3898   unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3899   unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3900   return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3901 					   factor, nelts_per_vq);
3902 }
3903 
3904 /* Return true if we can add X using a single SVE INC or DEC instruction.  */
3905 
3906 bool
3907 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3908 {
3909   poly_int64 value;
3910   return (poly_int_rtx_p (x, &value)
3911 	  && (aarch64_sve_cnt_immediate_p (value)
3912 	      || aarch64_sve_cnt_immediate_p (-value)));
3913 }
3914 
3915 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3916    operand 0.  */
3917 
3918 char *
3919 aarch64_output_sve_scalar_inc_dec (rtx offset)
3920 {
3921   poly_int64 offset_value = rtx_to_poly_int64 (offset);
3922   gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3923   if (offset_value.coeffs[1] > 0)
3924     return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3925 					     offset_value.coeffs[1], 0);
3926   else
3927     return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3928 					     -offset_value.coeffs[1], 0);
3929 }
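
/* Example outputs (illustrative): an offset of (16, 16) prints as
   "incb\t%x0" (factor 16, byte elements, multiplier 1), while (-32, -32)
   prints as "decb\t%x0, all, mul #2".  */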
3930 
3931 /* Return true if we can add VALUE to a register using a single ADDVL
3932    or ADDPL instruction.  */
3933 
3934 static bool
3935 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3936 {
3937   HOST_WIDE_INT factor = value.coeffs[0];
3938   if (factor == 0 || value.coeffs[1] != factor)
3939     return false;
3940   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3941      and a value of 16 is one vector width.  */
3942   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3943 	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3944 }
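
/* Worked example (for illustration): since FACTOR counts multiples of
   VG / 2, a value of (16, 16) is exactly one vector and is accepted as
   ADDVL #1, (2, 2) is one predicate width and is accepted as ADDPL #1,
   and (8, 8) is four predicate widths and becomes ADDPL #4.  The ranges
   allow ADDVL #-32..#31 and ADDPL #-32..#31, so e.g. (512, 512), which
   would need ADDVL #32, is rejected.  */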
3945 
3946 /* Likewise for rtx X.  */
3947 
3948 bool
3949 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3950 {
3951   poly_int64 value;
3952   return (poly_int_rtx_p (x, &value)
3953 	  && aarch64_sve_addvl_addpl_immediate_p (value));
3954 }
3955 
3956 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3957    to operand 1 and storing the result in operand 0.  */
3958 
3959 char *
3960 aarch64_output_sve_addvl_addpl (rtx offset)
3961 {
3962   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3963   poly_int64 offset_value = rtx_to_poly_int64 (offset);
3964   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3965 
3966   int factor = offset_value.coeffs[1];
3967   if ((factor & 15) == 0)
3968     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3969   else
3970     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3971   return buffer;
3972 }
3973 
3974 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3975    instruction.  If it is, store the number of elements in each vector
3976    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3977    factor in *FACTOR_OUT (if nonnull).  */
3978 
3979 bool
3980 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3981 					unsigned int *nelts_per_vq_out)
3982 {
3983   rtx elt;
3984   poly_int64 value;
3985 
3986   if (!const_vec_duplicate_p (x, &elt)
3987       || !poly_int_rtx_p (elt, &value))
3988     return false;
3989 
3990   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3991   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3992     /* There's no vector INCB.  */
3993     return false;
3994 
3995   HOST_WIDE_INT factor = value.coeffs[0];
3996   if (value.coeffs[1] != factor)
3997     return false;
3998 
3999   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
4000   if ((factor % nelts_per_vq) != 0
4001       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4002     return false;
4003 
4004   if (factor_out)
4005     *factor_out = factor;
4006   if (nelts_per_vq_out)
4007     *nelts_per_vq_out = nelts_per_vq;
4008   return true;
4009 }
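
/* Worked example (illustrative): for a VNx8HI vector (8 halfword elements
   per 128-bit quadword), a CONST_VECTOR duplicating the poly_int64
   (16, 16) has FACTOR == 16 == 2 * NELTS_PER_VQ, so it is accepted and
   aarch64_output_sve_vector_inc_dec below would print it as
   "inch\t..., all, mul #2".  A duplicated (4, 4) in the same mode is
   rejected because 4 is not a multiple of 8.  */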
4010 
4011 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4012    instruction.  */
4013 
4014 bool
4015 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4016 {
4017   return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4018 }
4019 
4020 /* Return the asm template for an SVE vector INC or DEC instruction.
4021    OPERANDS gives the operands before the vector count and X is the
4022    value of the vector count operand itself.  */
4023 
4024 char *
4025 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4026 {
4027   int factor;
4028   unsigned int nelts_per_vq;
4029   if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4030     gcc_unreachable ();
4031   if (factor < 0)
4032     return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4033 					     -factor, nelts_per_vq);
4034   else
4035     return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4036 					     factor, nelts_per_vq);
4037 }
4038 
/* Set DEST to the integer constant IMM of mode MODE, but only emit
   instructions if GENERATE is true.  Return the number of instructions
   required, whether or not they are emitted.  */

4039 static int
4040 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4041 				scalar_int_mode mode)
4042 {
4043   int i;
4044   unsigned HOST_WIDE_INT val, val2, mask;
4045   int one_match, zero_match;
4046   int num_insns;
4047 
4048   val = INTVAL (imm);
4049 
4050   if (aarch64_move_imm (val, mode))
4051     {
4052       if (generate)
4053 	emit_insn (gen_rtx_SET (dest, imm));
4054       return 1;
4055     }
4056 
4057   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4058      (with XXXX non-zero). In that case check to see if the move can be done in
4059      a smaller mode.  */
4060   val2 = val & 0xffffffff;
4061   if (mode == DImode
4062       && aarch64_move_imm (val2, SImode)
4063       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4064     {
4065       if (generate)
4066 	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4067 
4068       /* Check if we have to emit a second instruction by checking to see
4069          if any of the upper 32 bits of the original DI mode value is set.  */
4070       if (val == val2)
4071 	return 1;
4072 
4073       i = (val >> 48) ? 48 : 32;
4074 
4075       if (generate)
4076 	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4077 				    GEN_INT ((val >> i) & 0xffff)));
4078 
4079       return 2;
4080     }
4081 
4082   if ((val >> 32) == 0 || mode == SImode)
4083     {
4084       if (generate)
4085 	{
4086 	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4087 	  if (mode == SImode)
4088 	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4089 				       GEN_INT ((val >> 16) & 0xffff)));
4090 	  else
4091 	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4092 				       GEN_INT ((val >> 16) & 0xffff)));
4093 	}
4094       return 2;
4095     }
4096 
4097   /* Remaining cases are all for DImode.  */
4098 
4099   mask = 0xffff;
4100   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4101     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4102   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4103     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4104 
4105   if (zero_match != 2 && one_match != 2)
4106     {
4107       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4108 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
4109 	 zeroes creates a valid bitmask.  To check any repeated bitmask,
4110 	 try using 16 bits from the other 32-bit half of val.  */
4111 
4112       for (i = 0; i < 64; i += 16, mask <<= 16)
4113 	{
4114 	  val2 = val & ~mask;
4115 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4116 	    break;
4117 	  val2 = val | mask;
4118 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4119 	    break;
4120 	  val2 = val2 & ~mask;
4121 	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4122 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
4123 	    break;
4124 	}
4125       if (i != 64)
4126 	{
4127 	  if (generate)
4128 	    {
4129 	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4130 	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4131 					 GEN_INT ((val >> i) & 0xffff)));
4132 	    }
4133 	  return 2;
4134 	}
4135     }
4136 
4137   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4138      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
4139      otherwise skip zero bits.  */
4140 
4141   num_insns = 1;
4142   mask = 0xffff;
4143   val2 = one_match > zero_match ? ~val : val;
4144   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4145 
4146   if (generate)
4147     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4148 					   ? (val | ~(mask << i))
4149 					   : (val & (mask << i)))));
4150   for (i += 16; i < 64; i += 16)
4151     {
4152       if ((val2 & (mask << i)) == 0)
4153 	continue;
4154       if (generate)
4155 	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4156 				   GEN_INT ((val >> i) & 0xffff)));
4157       num_insns ++;
4158     }
4159 
4160   return num_insns;
4161 }
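
/* Worked example (for illustration): moving 0x1234567800000000 into a
   DImode register.  It is not a single MOV/MOVN/bitmask immediate and the
   SImode shortcut does not apply (both upper 16-bit chunks are nonzero),
   but two of the four 16-bit chunks are zero (zero_match == 2), so the
   final loop emits MOVZ #0x5678, LSL #32 followed by MOVK #0x1234,
   LSL #48 and the function returns 2.  */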
4162 
4163 /* Return whether imm is a 128-bit immediate which is simple enough to
4164    expand inline.  */
4165 bool
4166 aarch64_mov128_immediate (rtx imm)
4167 {
4168   if (GET_CODE (imm) == CONST_INT)
4169     return true;
4170 
4171   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4172 
4173   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4174   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4175 
4176   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4177 	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4178 }
4179 
4180 
4181 /* Return the number of temporary registers that aarch64_add_offset_1
4182    would need to add OFFSET to a register.  */
4183 
4184 static unsigned int
4185 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4186 {
4187   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4188 }
4189 
4190 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
4191    a non-polynomial OFFSET.  MODE is the mode of the addition.
4192    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4193    be set and CFA adjustments added to the generated instructions.
4194 
4195    TEMP1, if nonnull, is a register of mode MODE that can be used as a
4196    temporary if register allocation is already complete.  This temporary
4197    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
4198    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4199    the immediate again.
4200 
4201    Since this function may be used to adjust the stack pointer, we must
4202    ensure that it cannot cause transient stack deallocation (for example
4203    by first incrementing SP and then decrementing when adjusting by a
4204    large immediate).  */
4205 
4206 static void
4207 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4208 		      rtx src, HOST_WIDE_INT offset, rtx temp1,
4209 		      bool frame_related_p, bool emit_move_imm)
4210 {
4211   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4212   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4213 
4214   unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4215   rtx_insn *insn;
4216 
4217   if (!moffset)
4218     {
4219       if (!rtx_equal_p (dest, src))
4220 	{
4221 	  insn = emit_insn (gen_rtx_SET (dest, src));
4222 	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
4223 	}
4224       return;
4225     }
4226 
4227   /* Single instruction adjustment.  */
4228   if (aarch64_uimm12_shift (moffset))
4229     {
4230       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4231       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4232       return;
4233     }
4234 
4235   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4236      and either:
4237 
4238      a) the offset cannot be loaded by a 16-bit move or
4239      b) there is no spare register into which we can move it.  */
4240   if (moffset < 0x1000000
4241       && ((!temp1 && !can_create_pseudo_p ())
4242 	  || !aarch64_move_imm (moffset, mode)))
4243     {
4244       HOST_WIDE_INT low_off = moffset & 0xfff;
4245 
4246       low_off = offset < 0 ? -low_off : low_off;
4247       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4248       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4249       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4250       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4251       return;
4252     }
4253 
4254   /* Emit a move immediate if required and an addition/subtraction.  */
4255   if (emit_move_imm)
4256     {
4257       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4258       temp1 = aarch64_force_temporary (mode, temp1,
4259 				       gen_int_mode (moffset, mode));
4260     }
4261   insn = emit_insn (offset < 0
4262 		    ? gen_sub3_insn (dest, src, temp1)
4263 		    : gen_add3_insn (dest, src, temp1));
4264   if (frame_related_p)
4265     {
4266       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4267       rtx adj = plus_constant (mode, src, offset);
4268       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4269     }
4270 }
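
/* Worked example (illustrative): OFFSET == 0x123456.  It is not a 12-bit
   (optionally shifted) immediate and cannot be built by a single move,
   but it is below 0x1000000, so the function emits two additions:

     add dest, src, #0x456
     add dest, dest, #0x123000

   Splitting at the low 12 bits keeps both constants valid ADD immediates
   and, when DEST is the stack pointer, never temporarily decreases it.  */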
4271 
4272 /* Return the number of temporary registers that aarch64_add_offset
4273    would need to move OFFSET into a register or add OFFSET to a register;
4274    ADD_P is true if we want the latter rather than the former.  */
4275 
4276 static unsigned int
4277 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4278 {
4279   /* This follows the same structure as aarch64_add_offset.  */
4280   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4281     return 0;
4282 
4283   unsigned int count = 0;
4284   HOST_WIDE_INT factor = offset.coeffs[1];
4285   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4286   poly_int64 poly_offset (factor, factor);
4287   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4288     /* Need one register for the ADDVL/ADDPL result.  */
4289     count += 1;
4290   else if (factor != 0)
4291     {
4292       factor = abs (factor);
4293       if (factor > 16 * (factor & -factor))
4294 	/* Need one register for the CNT result and one for the multiplication
4295 	   factor.  If necessary, the second temporary can be reused for the
4296 	   constant part of the offset.  */
4297 	return 2;
4298       /* Need one register for the CNT result (which might then
4299 	 be shifted).  */
4300       count += 1;
4301     }
4302   return count + aarch64_add_offset_1_temporaries (constant);
4303 }
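
/* Examples (illustrative): an offset of (16, 16) needs no temporaries
   when adding, since ADDVL #1 does the whole job.  An offset of (100, 96)
   splits into a VG-based part (96, 96) that needs one temporary for the
   ADDVL/ADDPL result plus a constant part of 4 that needs none, giving a
   total of 1.  */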
4304 
4305 /* If X can be represented as a poly_int64, return the number
4306    of temporaries that are required to add it to a register.
4307    Return -1 otherwise.  */
4308 
4309 int
4310 aarch64_add_offset_temporaries (rtx x)
4311 {
4312   poly_int64 offset;
4313   if (!poly_int_rtx_p (x, &offset))
4314     return -1;
4315   return aarch64_offset_temporaries (true, offset);
4316 }
4317 
4318 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
4319    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4320    be set and CFA adjustments added to the generated instructions.
4321 
4322    TEMP1, if nonnull, is a register of mode MODE that can be used as a
4323    temporary if register allocation is already complete.  This temporary
4324    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4325    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4326    false to avoid emitting the immediate again.
4327 
4328    TEMP2, if nonnull, is a second temporary register that doesn't
4329    overlap either DEST or REG.
4330 
4331    Since this function may be used to adjust the stack pointer, we must
4332    ensure that it cannot cause transient stack deallocation (for example
4333    by first incrementing SP and then decrementing when adjusting by a
4334    large immediate).  */
4335 
4336 static void
4337 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4338 		    poly_int64 offset, rtx temp1, rtx temp2,
4339 		    bool frame_related_p, bool emit_move_imm = true)
4340 {
4341   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4342   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4343   gcc_assert (temp1 == NULL_RTX
4344 	      || !frame_related_p
4345 	      || !reg_overlap_mentioned_p (temp1, dest));
4346   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4347 
4348   /* Try using ADDVL or ADDPL to add the whole value.  */
4349   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4350     {
4351       rtx offset_rtx = gen_int_mode (offset, mode);
4352       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4353       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4354       return;
4355     }
4356 
4357   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4358      SVE vector register, over and above the minimum size of 128 bits.
4359      This is equivalent to half the value returned by CNTD with a
4360      vector shape of ALL.  */
4361   HOST_WIDE_INT factor = offset.coeffs[1];
4362   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4363 
4364   /* Try using ADDVL or ADDPL to add the VG-based part.  */
4365   poly_int64 poly_offset (factor, factor);
4366   if (src != const0_rtx
4367       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4368     {
4369       rtx offset_rtx = gen_int_mode (poly_offset, mode);
4370       if (frame_related_p)
4371 	{
4372 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4373 	  RTX_FRAME_RELATED_P (insn) = true;
4374 	  src = dest;
4375 	}
4376       else
4377 	{
4378 	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4379 	  src = aarch64_force_temporary (mode, temp1, addr);
4380 	  temp1 = temp2;
4381 	  temp2 = NULL_RTX;
4382 	}
4383     }
4384   /* Otherwise use a CNT-based sequence.  */
4385   else if (factor != 0)
4386     {
4387       /* Use a subtraction if we have a negative factor.  */
4388       rtx_code code = PLUS;
4389       if (factor < 0)
4390 	{
4391 	  factor = -factor;
4392 	  code = MINUS;
4393 	}
4394 
4395       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
4396 	 into the multiplication.  */
4397       rtx val;
4398       int shift = 0;
4399       if (factor & 1)
4400 	/* Use a right shift by 1.  */
4401 	shift = -1;
4402       else
4403 	factor /= 2;
4404       HOST_WIDE_INT low_bit = factor & -factor;
4405       if (factor <= 16 * low_bit)
4406 	{
4407 	  if (factor > 16 * 8)
4408 	    {
4409 	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4410 		 the value with the minimum multiplier and shift it into
4411 		 position.  */
4412 	      int extra_shift = exact_log2 (low_bit);
4413 	      shift += extra_shift;
4414 	      factor >>= extra_shift;
4415 	    }
4416 	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4417 	}
4418       else
4419 	{
4420 	  /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4421 	     directly, since that should increase the chances of being
4422 	     able to use a shift and add sequence.  If LOW_BIT itself
4423 	     is out of range, just use CNTD.  */
4424 	  if (low_bit <= 16 * 8)
4425 	    factor /= low_bit;
4426 	  else
4427 	    low_bit = 1;
4428 
4429 	  val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4430 	  val = aarch64_force_temporary (mode, temp1, val);
4431 
4432 	  if (can_create_pseudo_p ())
4433 	    {
4434 	      rtx coeff1 = gen_int_mode (factor, mode);
4435 	      val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4436 	    }
4437 	  else
4438 	    {
4439 	      /* Go back to using a negative multiplication factor if we have
4440 		 no register from which to subtract.  */
4441 	      if (code == MINUS && src == const0_rtx)
4442 		{
4443 		  factor = -factor;
4444 		  code = PLUS;
4445 		}
4446 	      rtx coeff1 = gen_int_mode (factor, mode);
4447 	      coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4448 	      val = gen_rtx_MULT (mode, val, coeff1);
4449 	    }
4450 	}
4451 
4452       if (shift > 0)
4453 	{
4454 	  /* Multiply by 1 << SHIFT.  */
4455 	  val = aarch64_force_temporary (mode, temp1, val);
4456 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4457 	}
4458       else if (shift == -1)
4459 	{
4460 	  /* Divide by 2.  */
4461 	  val = aarch64_force_temporary (mode, temp1, val);
4462 	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4463 	}
4464 
4465       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
4466       if (src != const0_rtx)
4467 	{
4468 	  val = aarch64_force_temporary (mode, temp1, val);
4469 	  val = gen_rtx_fmt_ee (code, mode, src, val);
4470 	}
4471       else if (code == MINUS)
4472 	{
4473 	  val = aarch64_force_temporary (mode, temp1, val);
4474 	  val = gen_rtx_NEG (mode, val);
4475 	}
4476 
4477       if (constant == 0 || frame_related_p)
4478 	{
4479 	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4480 	  if (frame_related_p)
4481 	    {
4482 	      RTX_FRAME_RELATED_P (insn) = true;
4483 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
4484 			    gen_rtx_SET (dest, plus_constant (Pmode, src,
4485 							      poly_offset)));
4486 	    }
4487 	  src = dest;
4488 	  if (constant == 0)
4489 	    return;
4490 	}
4491       else
4492 	{
4493 	  src = aarch64_force_temporary (mode, temp1, val);
4494 	  temp1 = temp2;
4495 	  temp2 = NULL_RTX;
4496 	}
4497 
4498       emit_move_imm = true;
4499     }
4500 
4501   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4502 			frame_related_p, emit_move_imm);
4503 }
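
/* Example (illustrative): DEST = SRC + (4, 16), i.e. 16 bytes per extra
   128-bit block with a leftover constant of 4 - 16 = -12.  Assuming SRC
   is a real register rather than const0_rtx, the VG-based part (16, 16)
   is added first with a single ADDVL #1, and the remaining -12 is then
   handled by aarch64_add_offset_1 as a single immediate adjustment.  */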
4504 
4505 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4506    than a poly_int64.  */
4507 
4508 void
4509 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4510 			  rtx offset_rtx, rtx temp1, rtx temp2)
4511 {
4512   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4513 		      temp1, temp2, false);
4514 }
4515 
4516 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4517    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
4518    if TEMP1 already contains abs (DELTA).  */
4519 
4520 static inline void
4521 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4522 {
4523   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4524 		      temp1, temp2, true, emit_move_imm);
4525 }
4526 
4527 /* Subtract DELTA from the stack pointer, marking the instructions
4528    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
4529    if nonnull.  */
4530 
4531 static inline void
4532 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4533 		bool emit_move_imm = true)
4534 {
4535   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4536 		      temp1, temp2, frame_related_p, emit_move_imm);
4537 }
4538 
4539 /* Set DEST to (vec_series BASE STEP).  */
4540 
4541 static void
4542 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4543 {
4544   machine_mode mode = GET_MODE (dest);
4545   scalar_mode inner = GET_MODE_INNER (mode);
4546 
4547   /* Each operand can be a register or an immediate in the range [-16, 15].  */
4548   if (!aarch64_sve_index_immediate_p (base))
4549     base = force_reg (inner, base);
4550   if (!aarch64_sve_index_immediate_p (step))
4551     step = force_reg (inner, step);
4552 
4553   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4554 }
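
/* For example (illustrative only), a caller could build the series
   0, 1, 2, ... in a VNx4SI register with

     rtx dest = gen_reg_rtx (VNx4SImode);
     aarch64_expand_vec_series (dest, const0_rtx, const1_rtx);

   Both operands are in the [-16, 15] immediate range, so this becomes a
   single INDEX instruction with immediate operands.  */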
4555 
4556 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4557    register of mode MODE.  Use TARGET for the result if it's nonnull
4558    and convenient.
4559 
4560    The two vector modes must have the same element mode.  The behavior
4561    is to duplicate architectural lane N of SRC into architectural lanes
4562    N + I * STEP of the result.  On big-endian targets, architectural
4563    lane 0 of an Advanced SIMD vector is the last element of the vector
4564    in memory layout, so for big-endian targets this operation has the
4565    effect of reversing SRC before duplicating it.  Callers need to
4566    account for this.  */
4567 
4568 rtx
4569 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4570 {
4571   machine_mode src_mode = GET_MODE (src);
4572   gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4573   insn_code icode = (BYTES_BIG_ENDIAN
4574 		     ? code_for_aarch64_vec_duplicate_vq_be (mode)
4575 		     : code_for_aarch64_vec_duplicate_vq_le (mode));
4576 
4577   unsigned int i = 0;
4578   expand_operand ops[3];
4579   create_output_operand (&ops[i++], target, mode);
4580   create_output_operand (&ops[i++], src, src_mode);
4581   if (BYTES_BIG_ENDIAN)
4582     {
4583       /* Create a PARALLEL describing the reversal of SRC.  */
4584       unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4585       rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4586 						  nelts_per_vq - 1, -1);
4587       create_fixed_operand (&ops[i++], sel);
4588     }
4589   expand_insn (icode, i, ops);
4590   return ops[0].value;
4591 }
4592 
4593 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4594    the memory image into DEST.  Return true on success.  */
4595 
4596 static bool
4597 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4598 {
4599   src = force_const_mem (GET_MODE (src), src);
4600   if (!src)
4601     return false;
4602 
4603   /* Make sure that the address is legitimate.  */
4604   if (!aarch64_sve_ld1rq_operand_p (src))
4605     {
4606       rtx addr = force_reg (Pmode, XEXP (src, 0));
4607       src = replace_equiv_address (src, addr);
4608     }
4609 
4610   machine_mode mode = GET_MODE (dest);
4611   machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4612   rtx ptrue = aarch64_ptrue_reg (pred_mode);
4613   emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4614   return true;
4615 }
4616 
4617 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4618    SVE data mode and isn't a legitimate constant.  Use TARGET for the
4619    result if convenient.
4620 
4621    The returned register can have whatever mode seems most natural
4622    given the contents of SRC.  */
4623 
4624 static rtx
4625 aarch64_expand_sve_const_vector (rtx target, rtx src)
4626 {
4627   machine_mode mode = GET_MODE (src);
4628   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4629   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4630   scalar_mode elt_mode = GET_MODE_INNER (mode);
4631   unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4632   unsigned int container_bits = aarch64_sve_container_bits (mode);
4633   unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4634 
4635   if (nelts_per_pattern == 1
4636       && encoded_bits <= 128
4637       && container_bits != elt_bits)
4638     {
4639       /* We have a partial vector mode and a constant whose full-vector
4640 	 equivalent would occupy a repeating 128-bit sequence.  Build that
4641 	 full-vector equivalent instead, so that we have the option of
4642 	 using LD1RQ and Advanced SIMD operations.  */
4643       unsigned int repeat = container_bits / elt_bits;
4644       machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4645       rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4646       for (unsigned int i = 0; i < npatterns; ++i)
4647 	for (unsigned int j = 0; j < repeat; ++j)
4648 	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4649       target = aarch64_target_reg (target, full_mode);
4650       return aarch64_expand_sve_const_vector (target, builder.build ());
4651     }
4652 
4653   if (nelts_per_pattern == 1 && encoded_bits == 128)
4654     {
4655       /* The constant is a duplicated quadword but can't be narrowed
4656 	 beyond a quadword.  Get the memory image of the first quadword
4657 	 as a 128-bit vector and try using LD1RQ to load it from memory.
4658 
4659 	 The effect for both endiannesses is to load memory lane N into
4660 	 architectural lanes N + I * STEP of the result.  On big-endian
4661 	 targets, the layout of the 128-bit vector in an Advanced SIMD
4662 	 register would be different from its layout in an SVE register,
4663 	 but this 128-bit vector is a memory value only.  */
4664       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4665       rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4666       if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4667 	return target;
4668     }
4669 
4670   if (nelts_per_pattern == 1 && encoded_bits < 128)
4671     {
4672       /* The vector is a repeating sequence of 64 bits or fewer.
4673 	 See if we can load them using an Advanced SIMD move and then
4674 	 duplicate it to fill a vector.  This is better than using a GPR
4675 	 move because it keeps everything in the same register file.  */
4676       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4677       rtx_vector_builder builder (vq_mode, npatterns, 1);
4678       for (unsigned int i = 0; i < npatterns; ++i)
4679 	{
4680 	  /* We want memory lane N to go into architectural lane N,
4681 	     so reverse for big-endian targets.  The DUP .Q pattern
4682 	     has a compensating reverse built-in.  */
4683 	  unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4684 	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4685 	}
4686       rtx vq_src = builder.build ();
4687       if (aarch64_simd_valid_immediate (vq_src, NULL))
4688 	{
4689 	  vq_src = force_reg (vq_mode, vq_src);
4690 	  return aarch64_expand_sve_dupq (target, mode, vq_src);
4691 	}
4692 
4693       /* Get an integer representation of the repeating part of Advanced
4694 	 SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
4695 	 which for big-endian targets is lane-swapped wrt a normal
4696 	 Advanced SIMD vector.  This means that for both endiannesses,
4697 	 memory lane N of SVE vector SRC corresponds to architectural
4698 	 lane N of a register holding VQ_SRC.  This in turn means that
4699 	 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4700 	 as a single 128-bit value) and thus that memory lane 0 of SRC is
4701 	 in the lsb of the integer.  Duplicating the integer therefore
4702 	 ensures that memory lane N of SRC goes into architectural lane
4703 	 N + I * INDEX of the SVE register.  */
4704       scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4705       rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4706       if (elt_value)
4707 	{
4708 	  /* Pretend that we had a vector of INT_MODE to start with.  */
4709 	  elt_mode = int_mode;
4710 	  mode = aarch64_full_sve_mode (int_mode).require ();
4711 
4712 	  /* If the integer can be moved into a general register by a
4713 	     single instruction, do that and duplicate the result.  */
4714 	  if (CONST_INT_P (elt_value)
4715 	      && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4716 	    {
4717 	      elt_value = force_reg (elt_mode, elt_value);
4718 	      return expand_vector_broadcast (mode, elt_value);
4719 	    }
4720 	}
4721       else if (npatterns == 1)
4722 	/* We're duplicating a single value, but can't do better than
4723 	   force it to memory and load from there.  This handles things
4724 	   like symbolic constants.  */
4725 	elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4726 
4727       if (elt_value)
4728 	{
4729 	  /* Load the element from memory if we can, otherwise move it into
4730 	     a register and use a DUP.  */
4731 	  rtx op = force_const_mem (elt_mode, elt_value);
4732 	  if (!op)
4733 	    op = force_reg (elt_mode, elt_value);
4734 	  return expand_vector_broadcast (mode, op);
4735 	}
4736     }
4737 
4738   /* Try using INDEX.  */
4739   rtx base, step;
4740   if (const_vec_series_p (src, &base, &step))
4741     {
4742       aarch64_expand_vec_series (target, base, step);
4743       return target;
4744     }
4745 
4746   /* From here on, it's better to force the whole constant to memory
4747      if we can.  */
4748   if (GET_MODE_NUNITS (mode).is_constant ())
4749     return NULL_RTX;
4750 
4751   /* Expand each pattern individually.  */
4752   gcc_assert (npatterns > 1);
4753   rtx_vector_builder builder;
4754   auto_vec<rtx, 16> vectors (npatterns);
4755   for (unsigned int i = 0; i < npatterns; ++i)
4756     {
4757       builder.new_vector (mode, 1, nelts_per_pattern);
4758       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4759 	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4760       vectors.quick_push (force_reg (mode, builder.build ()));
4761     }
4762 
4763   /* Use permutes to interleave the separate vectors.  */
4764   while (npatterns > 1)
4765     {
4766       npatterns /= 2;
4767       for (unsigned int i = 0; i < npatterns; ++i)
4768 	{
4769 	  rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4770 	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4771 	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4772 	  vectors[i] = tmp;
4773 	}
4774     }
4775   gcc_assert (vectors[0] == target);
4776   return target;
4777 }
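
/* Rough examples of the strategies above for 32-bit elements (an
   illustrative summary rather than an exhaustive list):

     { 1, 2, 3, 4, 1, 2, 3, 4, ... }  repeats a full quadword: force the
				      quadword to memory and LD1RQ it.
     { 7, 8, 7, 8, ... }	      repeats 64 bits or fewer: DUP .Q of an
				      Advanced SIMD constant, or broadcast
				      a scalar or memory element.
     { 0, 1, 2, 3, 4, 5, ... }	      a linear series: a single INDEX.
     anything else		      force the whole constant to memory if
				      the length is fixed, otherwise expand
				      each pattern and interleave with ZIP1.  */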
4778 
4779 /* Use WHILE to set a predicate register of mode MODE in which the first
4780    VL bits are set and the rest are clear.  Use TARGET for the register
4781    if it's nonnull and convenient.  */
4782 
4783 static rtx
4784 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4785 				 unsigned int vl)
4786 {
4787   rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4788   target = aarch64_target_reg (target, mode);
4789   emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4790 			target, const0_rtx, limit));
4791   return target;
4792 }
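
/* For example (a sketch of the expected output rather than a guaranteed
   sequence): with MODE == VNx4BI and VL == 3 this expands to roughly

	mov	x0, 3
	whilelo	p0.s, xzr, x0

   which sets the first three .S lanes of the predicate and clears the
   rest.  */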
4793 
4794 static rtx
4795 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4796 
4797 /* BUILDER is a constant predicate in which the index of every set bit
4798    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
4799    by inverting every element at a multiple of ELT_SIZE and EORing the
4800    result with an ELT_SIZE PTRUE.
4801 
4802    Return a register that contains the constant on success, otherwise
4803    return null.  Use TARGET as the register if it is nonnull and
4804    convenient.  */
4805 
4806 static rtx
4807 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4808 				   unsigned int elt_size)
4809 {
4810   /* Invert every element at a multiple of ELT_SIZE, keeping the
4811      other bits zero.  */
4812   rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4813 				  builder.nelts_per_pattern ());
4814   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4815     if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4816       inv_builder.quick_push (const1_rtx);
4817     else
4818       inv_builder.quick_push (const0_rtx);
4819   inv_builder.finalize ();
4820 
4821   /* See if we can load the constant cheaply.  */
4822   rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4823   if (!inv)
4824     return NULL_RTX;
4825 
4826   /* EOR the result with an ELT_SIZE PTRUE.  */
4827   rtx mask = aarch64_ptrue_all (elt_size);
4828   mask = force_reg (VNx16BImode, mask);
4829   inv = gen_lowpart (VNx16BImode, inv);
4830   target = aarch64_target_reg (target, VNx16BImode);
4831   emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4832   return target;
4833 }
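
/* A small worked example (illustrative only): for the .B predicate
   { 0, 1, 1, 1, ... } the inverted constant is { 1, 0, 0, 0, ... }, which
   is simply a VL1 PTRUE and therefore cheap to load.  EORing it with
   PTRUE .B ALL then reconstructs the original "everything except the
   first lane" predicate.  */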
4834 
4835 /* BUILDER is a constant predicate in which the index of every set bit
4836    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
4837    using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
4838    register on success, otherwise return null.  Use TARGET as the register
4839    if nonnull and convenient.  */
4840 
4841 static rtx
4842 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4843 				   unsigned int elt_size,
4844 				   unsigned int permute_size)
4845 {
4846   /* We're going to split the constant into two new constants A and B,
4847      with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4848      and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4849 
4850      A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4851      B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4852 
4853      where _ indicates elements that will be discarded by the permute.
4854 
4855      First calculate the ELT_SIZEs for A and B.  */
4856   unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4857   unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4858   for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4859     if (INTVAL (builder.elt (i)) != 0)
4860       {
4861 	if (i & permute_size)
4862 	  b_elt_size |= i - permute_size;
4863 	else
4864 	  a_elt_size |= i;
4865       }
4866   a_elt_size &= -a_elt_size;
4867   b_elt_size &= -b_elt_size;
4868 
4869   /* Now construct the vectors themselves.  */
4870   rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4871 				builder.nelts_per_pattern ());
4872   rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4873 				builder.nelts_per_pattern ());
4874   unsigned int nelts = builder.encoded_nelts ();
4875   for (unsigned int i = 0; i < nelts; ++i)
4876     if (i & (elt_size - 1))
4877       {
4878 	a_builder.quick_push (const0_rtx);
4879 	b_builder.quick_push (const0_rtx);
4880       }
4881     else if ((i & permute_size) == 0)
4882       {
4883 	/* The A and B elements are significant.  */
4884 	a_builder.quick_push (builder.elt (i));
4885 	b_builder.quick_push (builder.elt (i + permute_size));
4886       }
4887     else
4888       {
4889 	/* The A and B elements are going to be discarded, so pick whatever
4890 	   is likely to give a nice constant.  We are targeting element
4891 	   sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4892 	   with the aim of each being a sequence of ones followed by
4893 	   a sequence of zeros.  So:
4894 
4895 	   * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4896 	     duplicate the last X_ELT_SIZE element, to extend the
4897 	     current sequence of ones or zeros.
4898 
4899 	   * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4900 	     zero, so that the constant really does have X_ELT_SIZE and
4901 	     not a smaller size.  */
4902 	if (a_elt_size > permute_size)
4903 	  a_builder.quick_push (const0_rtx);
4904 	else
4905 	  a_builder.quick_push (a_builder.elt (i - a_elt_size));
4906 	if (b_elt_size > permute_size)
4907 	  b_builder.quick_push (const0_rtx);
4908 	else
4909 	  b_builder.quick_push (b_builder.elt (i - b_elt_size));
4910       }
4911   a_builder.finalize ();
4912   b_builder.finalize ();
4913 
4914   /* Try loading A into a register.  */
4915   rtx_insn *last = get_last_insn ();
4916   rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4917   if (!a)
4918     return NULL_RTX;
4919 
4920   /* Try loading B into a register.  */
4921   rtx b = a;
4922   if (a_builder != b_builder)
4923     {
4924       b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4925       if (!b)
4926 	{
4927 	  delete_insns_since (last);
4928 	  return NULL_RTX;
4929 	}
4930     }
4931 
4932   /* Emit the TRN1 itself.  */
4933   machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4934   target = aarch64_target_reg (target, mode);
4935   emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4936 			      gen_lowpart (mode, a),
4937 			      gen_lowpart (mode, b)));
4938   return target;
4939 }
4940 
4941 /* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
4942    constant in BUILDER into an SVE predicate register.  Return the register
4943    on success, otherwise return null.  Use TARGET for the register if
4944    nonnull and convenient.
4945 
4946    ALLOW_RECURSE_P is true if we can use methods that would call this
4947    function recursively.  */
4948 
4949 static rtx
4950 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4951 				 bool allow_recurse_p)
4952 {
4953   if (builder.encoded_nelts () == 1)
4954     /* A PFALSE or a PTRUE .B ALL.  */
4955     return aarch64_emit_set_immediate (target, builder);
4956 
4957   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4958   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4959     {
4960       /* If we can load the constant using PTRUE, use it as-is.  */
4961       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4962       if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4963 	return aarch64_emit_set_immediate (target, builder);
4964 
4965       /* Otherwise use WHILE to set the first VL bits.  */
4966       return aarch64_sve_move_pred_via_while (target, mode, vl);
4967     }
4968 
4969   if (!allow_recurse_p)
4970     return NULL_RTX;
4971 
4972   /* Try inverting the vector in element size ELT_SIZE and then EORing
4973      the result with an ELT_SIZE PTRUE.  */
4974   if (INTVAL (builder.elt (0)) == 0)
4975     if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4976 						     elt_size))
4977       return res;
4978 
4979   /* Try using TRN1 to permute two simpler constants.  */
4980   for (unsigned int i = elt_size; i <= 8; i *= 2)
4981     if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4982 						     elt_size, i))
4983       return res;
4984 
4985   return NULL_RTX;
4986 }
4987 
4988 /* Return an SVE predicate register that contains the VNx16BImode
4989    constant in BUILDER, without going through the move expanders.
4990 
4991    The returned register can have whatever mode seems most natural
4992    given the contents of BUILDER.  Use TARGET for the result if
4993    convenient.  */
4994 
4995 static rtx
4996 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4997 {
4998   /* Try loading the constant using pure predicate operations.  */
4999   if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5000     return res;
5001 
5002   /* Try forcing the constant to memory.  */
5003   if (builder.full_nelts ().is_constant ())
5004     if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5005       {
5006 	target = aarch64_target_reg (target, VNx16BImode);
5007 	emit_move_insn (target, mem);
5008 	return target;
5009       }
5010 
5011   /* The last resort is to load the constant as an integer and then
5012      compare it against zero.  Use -1 for set bits in order to increase
5013      the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
5014   rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5015 				  builder.nelts_per_pattern ());
5016   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5017     int_builder.quick_push (INTVAL (builder.elt (i))
5018 			    ? constm1_rtx : const0_rtx);
5019   return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5020 					   int_builder.build ());
5021 }
5022 
5023 /* Set DEST to immediate IMM.  */
5024 
5025 void
5026 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5027 {
5028   machine_mode mode = GET_MODE (dest);
5029 
5030   /* Check on what type of symbol it is.  */
5031   scalar_int_mode int_mode;
5032   if ((GET_CODE (imm) == SYMBOL_REF
5033        || GET_CODE (imm) == LABEL_REF
5034        || GET_CODE (imm) == CONST
5035        || GET_CODE (imm) == CONST_POLY_INT)
5036       && is_a <scalar_int_mode> (mode, &int_mode))
5037     {
5038       rtx mem;
5039       poly_int64 offset;
5040       HOST_WIDE_INT const_offset;
5041       enum aarch64_symbol_type sty;
5042 
5043       /* If we have (const (plus symbol offset)), separate out the offset
5044 	 before we start classifying the symbol.  */
5045       rtx base = strip_offset (imm, &offset);
5046 
5047       /* We must always add an offset involving VL separately, rather than
5048 	 folding it into the relocation.  */
5049       if (!offset.is_constant (&const_offset))
5050 	{
5051 	  if (!TARGET_SVE)
5052 	    {
5053 	      aarch64_report_sve_required ();
5054 	      return;
5055 	    }
5056 	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5057 	    emit_insn (gen_rtx_SET (dest, imm));
5058 	  else
5059 	    {
5060 	      /* Do arithmetic on 32-bit values if the result is smaller
5061 		 than that.  */
5062 	      if (partial_subreg_p (int_mode, SImode))
5063 		{
5064 		  /* It is invalid to do symbol calculations in modes
5065 		     narrower than SImode.  */
5066 		  gcc_assert (base == const0_rtx);
5067 		  dest = gen_lowpart (SImode, dest);
5068 		  int_mode = SImode;
5069 		}
5070 	      if (base != const0_rtx)
5071 		{
5072 		  base = aarch64_force_temporary (int_mode, dest, base);
5073 		  aarch64_add_offset (int_mode, dest, base, offset,
5074 				      NULL_RTX, NULL_RTX, false);
5075 		}
5076 	      else
5077 		aarch64_add_offset (int_mode, dest, base, offset,
5078 				    dest, NULL_RTX, false);
5079 	    }
5080 	  return;
5081 	}
5082 
5083       sty = aarch64_classify_symbol (base, const_offset);
5084       switch (sty)
5085 	{
5086 	case SYMBOL_FORCE_TO_MEM:
5087 	  if (const_offset != 0
5088 	      && targetm.cannot_force_const_mem (int_mode, imm))
5089 	    {
5090 	      gcc_assert (can_create_pseudo_p ());
5091 	      base = aarch64_force_temporary (int_mode, dest, base);
5092 	      aarch64_add_offset (int_mode, dest, base, const_offset,
5093 				  NULL_RTX, NULL_RTX, false);
5094 	      return;
5095 	    }
5096 
5097 	  mem = force_const_mem (ptr_mode, imm);
5098 	  gcc_assert (mem);
5099 
5100 	  /* If we aren't generating PC relative literals, then
5101 	     we need to expand the literal pool access carefully.
5102 	     This is something that needs to be done in a number
5103 	     of places, so could well live as a separate function.  */
5104 	  if (!aarch64_pcrelative_literal_loads)
5105 	    {
5106 	      gcc_assert (can_create_pseudo_p ());
5107 	      base = gen_reg_rtx (ptr_mode);
5108 	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5109 	      if (ptr_mode != Pmode)
5110 		base = convert_memory_address (Pmode, base);
5111 	      mem = gen_rtx_MEM (ptr_mode, base);
5112 	    }
5113 
5114 	  if (int_mode != ptr_mode)
5115 	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5116 
5117 	  emit_insn (gen_rtx_SET (dest, mem));
5118 
5119 	  return;
5120 
5121         case SYMBOL_SMALL_TLSGD:
5122         case SYMBOL_SMALL_TLSDESC:
5123 	case SYMBOL_SMALL_TLSIE:
5124 	case SYMBOL_SMALL_GOT_28K:
5125 	case SYMBOL_SMALL_GOT_4G:
5126 	case SYMBOL_TINY_GOT:
5127 	case SYMBOL_TINY_TLSIE:
5128 	  if (const_offset != 0)
5129 	    {
5130 	      gcc_assert (can_create_pseudo_p ());
5131 	      base = aarch64_force_temporary (int_mode, dest, base);
5132 	      aarch64_add_offset (int_mode, dest, base, const_offset,
5133 				  NULL_RTX, NULL_RTX, false);
5134 	      return;
5135 	    }
5136 	  /* FALLTHRU */
5137 
5138 	case SYMBOL_SMALL_ABSOLUTE:
5139 	case SYMBOL_TINY_ABSOLUTE:
5140 	case SYMBOL_TLSLE12:
5141 	case SYMBOL_TLSLE24:
5142 	case SYMBOL_TLSLE32:
5143 	case SYMBOL_TLSLE48:
5144 	  aarch64_load_symref_appropriately (dest, imm, sty);
5145 	  return;
5146 
5147 	default:
5148 	  gcc_unreachable ();
5149 	}
5150     }
5151 
5152   if (!CONST_INT_P (imm))
5153     {
5154       if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5155 	{
5156 	  /* Only the low bit of each .H, .S and .D element is defined,
5157 	     so we can set the upper bits to whatever we like.  If the
5158 	     predicate is all-true in MODE, prefer to set all the undefined
5159 	     bits as well, so that we can share a single .B predicate for
5160 	     all modes.  */
5161 	  if (imm == CONSTM1_RTX (mode))
5162 	    imm = CONSTM1_RTX (VNx16BImode);
5163 
5164 	  /* All methods for constructing predicate modes wider than VNx16BI
5165 	     will set the upper bits of each element to zero.  Expose this
5166 	     by moving such constants as a VNx16BI, so that all bits are
5167 	     significant and so that constants for different modes can be
5168 	     shared.  The wider constant will still be available as a
5169 	     REG_EQUAL note.  */
5170 	  rtx_vector_builder builder;
5171 	  if (aarch64_get_sve_pred_bits (builder, imm))
5172 	    {
5173 	      rtx res = aarch64_expand_sve_const_pred (dest, builder);
5174 	      if (dest != res)
5175 		emit_move_insn (dest, gen_lowpart (mode, res));
5176 	      return;
5177 	    }
5178 	}
5179 
5180       if (GET_CODE (imm) == HIGH
5181 	  || aarch64_simd_valid_immediate (imm, NULL))
5182 	{
5183 	  emit_insn (gen_rtx_SET (dest, imm));
5184 	  return;
5185 	}
5186 
5187       if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5188 	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5189 	  {
5190 	    if (dest != res)
5191 	      emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5192 	    return;
5193 	  }
5194 
5195       rtx mem = force_const_mem (mode, imm);
5196       gcc_assert (mem);
5197       emit_move_insn (dest, mem);
5198       return;
5199     }
5200 
5201   aarch64_internal_mov_immediate (dest, imm, true,
5202 				  as_a <scalar_int_mode> (mode));
5203 }
5204 
5205 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
5206    that is known to contain PTRUE.  */
5207 
5208 void
5209 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5210 {
5211   expand_operand ops[3];
5212   machine_mode mode = GET_MODE (dest);
5213   create_output_operand (&ops[0], dest, mode);
5214   create_input_operand (&ops[1], pred, GET_MODE (pred));
5215   create_input_operand (&ops[2], src, mode);
5216   temporary_volatile_ok v (true);
5217   expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5218 }
5219 
5220 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5221    operand is in memory.  In this case we need to use the predicated LD1
5222    and ST1 instead of LDR and STR, both for correctness on big-endian
5223    targets and because LD1 and ST1 support a wider range of addressing modes.
5224    PRED_MODE is the mode of the predicate.
5225 
5226    See the comment at the head of aarch64-sve.md for details about the
5227    big-endian handling.  */
5228 
5229 void
5230 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5231 {
5232   machine_mode mode = GET_MODE (dest);
5233   rtx ptrue = aarch64_ptrue_reg (pred_mode);
5234   if (!register_operand (src, mode)
5235       && !register_operand (dest, mode))
5236     {
5237       rtx tmp = gen_reg_rtx (mode);
5238       if (MEM_P (src))
5239 	aarch64_emit_sve_pred_move (tmp, ptrue, src);
5240       else
5241 	emit_move_insn (tmp, src);
5242       src = tmp;
5243     }
5244   aarch64_emit_sve_pred_move (dest, ptrue, src);
5245 }
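
/* As a rough illustration (assuming VNx4SI data and a VNx4BI predicate),
   a load expanded this way becomes something like

	ptrue	p0.s, all
	ld1w	{ z0.s }, p0/z, [x0]

   rather than an LDR, which keeps the lane numbering consistent on
   big-endian targets and gives access to the full set of SVE addressing
   modes.  */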
5246 
5247 /* Called only on big-endian targets.  See whether an SVE vector move
5248    from SRC to DEST is effectively a REV[BHW] instruction, because at
5249    least one operand is a subreg of an SVE vector that has wider or
5250    narrower elements.  Return true and emit the instruction if so.
5251 
5252    For example:
5253 
5254      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5255 
5256    represents a VIEW_CONVERT between the following vectors, viewed
5257    in memory order:
5258 
5259      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
5260      R1: { [0],      [1],      [2],      [3],     ... }
5261 
5262    The high part of lane X in R2 should therefore correspond to lane X*2
5263    of R1, but the register representations are:
5264 
5265          msb                                      lsb
5266      R2: ...... [1].high  [1].low   [0].high  [0].low
5267      R1: ...... [3]       [2]       [1]       [0]
5268 
5269    where the low part of lane X in R2 corresponds to lane X*2 in R1.
5270    We therefore need a reverse operation to swap the high and low values
5271    around.
5272 
5273    This is purely an optimization.  Without it we would spill the
5274    subreg operand to the stack in one mode and reload it in the
5275    other mode, which has the same effect as the REV.  */
5276 
5277 bool
5278 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5279 {
5280   gcc_assert (BYTES_BIG_ENDIAN);
5281   if (GET_CODE (dest) == SUBREG)
5282     dest = SUBREG_REG (dest);
5283   if (GET_CODE (src) == SUBREG)
5284     src = SUBREG_REG (src);
5285 
5286   /* The optimization handles two single SVE REGs with different element
5287      sizes.  */
5288   if (!REG_P (dest)
5289       || !REG_P (src)
5290       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5291       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5292       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5293 	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5294     return false;
5295 
5296   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
5297   rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5298   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5299 			       UNSPEC_REV_SUBREG);
5300   emit_insn (gen_rtx_SET (dest, unspec));
5301   return true;
5302 }
5303 
5304 /* Return a copy of X with mode MODE, without changing its other
5305    attributes.  Unlike gen_lowpart, this doesn't care whether the
5306    mode change is valid.  */
5307 
5308 rtx
5309 aarch64_replace_reg_mode (rtx x, machine_mode mode)
5310 {
5311   if (GET_MODE (x) == mode)
5312     return x;
5313 
5314   x = shallow_copy_rtx (x);
5315   set_mode_and_regno (x, mode, REGNO (x));
5316   return x;
5317 }
5318 
5319 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5320    stored in wider integer containers.  */
5321 
5322 static unsigned int
5323 aarch64_sve_rev_unspec (machine_mode mode)
5324 {
5325   switch (GET_MODE_UNIT_SIZE (mode))
5326     {
5327     case 1: return UNSPEC_REVB;
5328     case 2: return UNSPEC_REVH;
5329     case 4: return UNSPEC_REVW;
5330     }
5331   gcc_unreachable ();
5332 }
5333 
5334 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5335    operands.  */
5336 
5337 void
5338 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5339 {
5340   /* Decide which REV operation we need.  The mode with wider elements
5341      determines the mode of the operands and the mode with the narrower
5342      elements determines the reverse width.  */
5343   machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5344   machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
5345   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5346       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5347     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5348 
5349   unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
5350   machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
5351 
5352   /* Get the operands in the appropriate modes and emit the instruction.  */
5353   ptrue = gen_lowpart (pred_mode, ptrue);
5354   dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5355   src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5356   emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5357 			       dest, ptrue, src));
5358 }
5359 
5360 static bool
5361 aarch64_function_ok_for_sibcall (tree, tree exp)
5362 {
5363   if (crtl->abi->id () != expr_callee_abi (exp).id ())
5364     return false;
5365 
5366   return true;
5367 }
5368 
5369 /* Subroutine of aarch64_pass_by_reference for arguments that are not
5370    passed in SVE registers.  */
5371 
5372 static bool
5373 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
5374 			     const function_arg_info &arg)
5375 {
5376   HOST_WIDE_INT size;
5377   machine_mode dummymode;
5378   int nregs;
5379 
5380   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
5381   if (arg.mode == BLKmode && arg.type)
5382     size = int_size_in_bytes (arg.type);
5383   else
5384     /* No frontends can create types with variable-sized modes, so we
5385        shouldn't be asked to pass or return them.  */
5386     size = GET_MODE_SIZE (arg.mode).to_constant ();
5387 
5388   /* Aggregates are passed by reference based on their size.  */
5389   if (arg.aggregate_type_p ())
5390     size = int_size_in_bytes (arg.type);
5391 
5392   /* Variable sized arguments are always returned by reference.  */
5393   if (size < 0)
5394     return true;
5395 
5396   /* Can this be a candidate to be passed in fp/simd register(s)?  */
5397   if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
5398 					       &dummymode, &nregs, NULL,
5399 					       !pcum || pcum->silent_p))
5400     return false;
5401 
5402   /* Arguments which are variable sized or larger than 2 registers are
5403      passed by reference unless they are a homogeneous floating point
5404      aggregate.  */
5405   return size > 2 * UNITS_PER_WORD;
5406 }
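
/* Illustrative examples of the size rule above (not an exhaustive list):

     struct two_regs { long a, b; };	  16 bytes: passed in registers.
     struct three_regs { long a, b, c; }; 24 bytes: passed by reference.
     struct hfa { double a, b, c, d; };	  32 bytes, but an HFA: passed in
					  fp/simd registers rather than by
					  reference.  */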
5407 
5408 /* Implement TARGET_PASS_BY_REFERENCE.  */
5409 
5410 static bool
5411 aarch64_pass_by_reference (cumulative_args_t pcum_v,
5412 			   const function_arg_info &arg)
5413 {
5414   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5415 
5416   if (!arg.type)
5417     return aarch64_pass_by_reference_1 (pcum, arg);
5418 
5419   pure_scalable_type_info pst_info;
5420   switch (pst_info.analyze (arg.type))
5421     {
5422     case pure_scalable_type_info::IS_PST:
5423       if (pcum && !pcum->silent_p && !TARGET_SVE)
5424 	/* We can't gracefully recover at this point, so make this a
5425 	   fatal error.  */
5426 	fatal_error (input_location, "arguments of type %qT require"
5427 		     " the SVE ISA extension", arg.type);
5428 
5429       /* Variadic SVE types are passed by reference.  Normal non-variadic
5430 	 arguments are too if we've run out of registers.  */
5431       return (!arg.named
5432 	      || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5433 	      || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5434 
5435     case pure_scalable_type_info::DOESNT_MATTER:
5436       gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
5437       return true;
5438 
5439     case pure_scalable_type_info::NO_ABI_IDENTITY:
5440     case pure_scalable_type_info::ISNT_PST:
5441       return aarch64_pass_by_reference_1 (pcum, arg);
5442     }
5443   gcc_unreachable ();
5444 }
5445 
5446 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
5447 static bool
5448 aarch64_return_in_msb (const_tree valtype)
5449 {
5450   machine_mode dummy_mode;
5451   int dummy_int;
5452 
5453   /* Never happens in little-endian mode.  */
5454   if (!BYTES_BIG_ENDIAN)
5455     return false;
5456 
5457   /* Only composite types smaller than or equal to 16 bytes can
5458      be potentially returned in registers.  */
5459   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5460       || int_size_in_bytes (valtype) <= 0
5461       || int_size_in_bytes (valtype) > 16)
5462     return false;
5463 
5464   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5465      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5466      is always passed/returned in the least significant bits of fp/simd
5467      register(s).  */
5468   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
5469 					       &dummy_mode, &dummy_int, NULL,
5470 					       false))
5471     return false;
5472 
5473   /* Likewise pure scalable types for SVE vector and predicate registers.  */
5474   pure_scalable_type_info pst_info;
5475   if (pst_info.analyze_registers (valtype))
5476     return false;
5477 
5478   return true;
5479 }
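
/* For instance (big-endian only, and purely illustrative): a 3-byte

     struct s { char a, b, c; };

   passes all of the checks above, so its value is returned in the most
   significant bytes of X0 rather than the least significant ones.  */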
5480 
5481 /* Implement TARGET_FUNCTION_VALUE.
5482    Define how to find the value returned by a function.  */
5483 
5484 static rtx
5485 aarch64_function_value (const_tree type, const_tree func,
5486 			bool outgoing ATTRIBUTE_UNUSED)
5487 {
5488   machine_mode mode;
5489   int unsignedp;
5490 
5491   mode = TYPE_MODE (type);
5492   if (INTEGRAL_TYPE_P (type))
5493     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5494 
5495   pure_scalable_type_info pst_info;
5496   if (type && pst_info.analyze_registers (type))
5497     return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
5498 
5499   /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5500      are returned in memory, not by value.  */
5501   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5502   bool sve_p = (vec_flags & VEC_ANY_SVE);
5503 
5504   if (aarch64_return_in_msb (type))
5505     {
5506       HOST_WIDE_INT size = int_size_in_bytes (type);
5507 
5508       if (size % UNITS_PER_WORD != 0)
5509 	{
5510 	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
5511 	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
5512 	}
5513     }
5514 
5515   int count;
5516   machine_mode ag_mode;
5517   if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
5518 					       NULL, false))
5519     {
5520       gcc_assert (!sve_p);
5521       if (!aarch64_composite_type_p (type, mode))
5522 	{
5523 	  gcc_assert (count == 1 && mode == ag_mode);
5524 	  return gen_rtx_REG (mode, V0_REGNUM);
5525 	}
5526       else
5527 	{
5528 	  int i;
5529 	  rtx par;
5530 
5531 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5532 	  for (i = 0; i < count; i++)
5533 	    {
5534 	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5535 	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5536 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5537 	      XVECEXP (par, 0, i) = tmp;
5538 	    }
5539 	  return par;
5540 	}
5541     }
5542   else
5543     {
5544       if (sve_p)
5545 	{
5546 	  /* Vector types can acquire a partial SVE mode using things like
5547 	     __attribute__((vector_size(N))), and this is potentially useful.
5548 	     However, the choice of mode doesn't affect the type's ABI
5549 	     identity, so we should treat the types as though they had
5550 	     the associated integer mode, just like they did before SVE
5551 	     was introduced.
5552 
5553 	     We know that the vector must be 128 bits or smaller,
5554 	     otherwise we'd have returned it in memory instead.  */
5555 	  gcc_assert (type
5556 		      && (aarch64_some_values_include_pst_objects_p (type)
5557 			  || (vec_flags & VEC_PARTIAL)));
5558 
5559 	  scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5560 	  rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5561 	  rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5562 	  return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5563 	}
5564       return gen_rtx_REG (mode, R0_REGNUM);
5565     }
5566 }
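
/* An illustrative example of the composite case above: for

     struct hfa { float a, b, c, d; };

   aarch64_vfp_is_call_or_return_candidate reports four SFmode registers,
   so the code builds a (parallel ...) that places A..D in S0..S3 (the low
   parts of V0..V3) at byte offsets 0, 4, 8 and 12.  */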
5567 
5568 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5569    Return true if REGNO is the number of a hard register in which the values
5570    of called function may come back.  */
5571 
5572 static bool
5573 aarch64_function_value_regno_p (const unsigned int regno)
5574 {
5575   /* A maximum of 16 bytes can be returned in the general registers.  Examples
5576      of 16-byte return values are: 128-bit integers and 16-byte small
5577      structures (excluding homogeneous floating-point aggregates).  */
5578   if (regno == R0_REGNUM || regno == R1_REGNUM)
5579     return true;
5580 
5581   /* Up to four fp/simd registers can return a function value, e.g. a
5582      homogeneous floating-point aggregate having four members.  */
5583   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5584     return TARGET_FLOAT;
5585 
5586   return false;
5587 }
5588 
5589 /* Subroutine for aarch64_return_in_memory for types that are not returned
5590    in SVE registers.  */
5591 
5592 static bool
5593 aarch64_return_in_memory_1 (const_tree type)
5594 {
5595   HOST_WIDE_INT size;
5596   machine_mode ag_mode;
5597   int count;
5598 
5599   if (!AGGREGATE_TYPE_P (type)
5600       && TREE_CODE (type) != COMPLEX_TYPE
5601       && TREE_CODE (type) != VECTOR_TYPE)
5602     /* Simple scalar types are always returned in registers.  */
5603     return false;
5604 
5605   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5606 					       &ag_mode, &count, NULL, false))
5607     return false;
5608 
5609   /* Types larger than 2 registers are returned in memory.  */
5610   size = int_size_in_bytes (type);
5611   return (size < 0 || size > 2 * UNITS_PER_WORD);
5612 }
5613 
5614 /* Implement TARGET_RETURN_IN_MEMORY.
5615 
5616    If the type T of the result of a function is such that
5617      void func (T arg)
5618    would require that arg be passed as a value in a register (or set of
5619    registers) according to the parameter passing rules, then the result
5620    is returned in the same registers as would be used for such an
5621    argument.  */
5622 
5623 static bool
5624 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5625 {
5626   pure_scalable_type_info pst_info;
5627   switch (pst_info.analyze (type))
5628     {
5629     case pure_scalable_type_info::IS_PST:
5630       return (pst_info.num_zr () > NUM_FP_ARG_REGS
5631 	      || pst_info.num_pr () > NUM_PR_ARG_REGS);
5632 
5633     case pure_scalable_type_info::DOESNT_MATTER:
5634       gcc_assert (aarch64_return_in_memory_1 (type));
5635       return true;
5636 
5637     case pure_scalable_type_info::NO_ABI_IDENTITY:
5638     case pure_scalable_type_info::ISNT_PST:
5639       return aarch64_return_in_memory_1 (type);
5640     }
5641   gcc_unreachable ();
5642 }
5643 
5644 static bool
5645 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5646 			       const_tree type, int *nregs)
5647 {
5648   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5649   return aarch64_vfp_is_call_or_return_candidate (mode, type,
5650 						  &pcum->aapcs_vfp_rmode,
5651 						  nregs, NULL, pcum->silent_p);
5652 }
5653 
5654 /* Given MODE and TYPE of a function argument, return the alignment in
5655    bits.  The idea is to suppress any stronger alignment requested by
5656    the user and opt for the natural alignment (specified in AAPCS64 \S
5657    4.1).  ABI_BREAK is set to true if the alignment was incorrectly
5658    calculated in versions of GCC prior to GCC-9.  This is a helper
5659    function for local use only.  */
5660 
5661 static unsigned int
5662 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5663 				bool *abi_break)
5664 {
5665   *abi_break = false;
5666   if (!type)
5667     return GET_MODE_ALIGNMENT (mode);
5668 
5669   if (integer_zerop (TYPE_SIZE (type)))
5670     return 0;
5671 
5672   gcc_assert (TYPE_MODE (type) == mode);
5673 
5674   if (!AGGREGATE_TYPE_P (type))
5675     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5676 
5677   if (TREE_CODE (type) == ARRAY_TYPE)
5678     return TYPE_ALIGN (TREE_TYPE (type));
5679 
5680   unsigned int alignment = 0;
5681   unsigned int bitfield_alignment = 0;
5682   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5683     if (TREE_CODE (field) == FIELD_DECL)
5684       {
5685 	/* Note that we explicitly consider zero-sized fields here,
5686 	   even though they don't map to AAPCS64 machine types.
5687 	   For example, in:
5688 
5689 	       struct __attribute__((aligned(8))) empty {};
5690 
5691 	       struct s {
5692 		 [[no_unique_address]] empty e;
5693 		 int x;
5694 	       };
5695 
5696 	   "s" contains only one Fundamental Data Type (the int field)
5697 	   but gains 8-byte alignment and size thanks to "e".  */
5698 	alignment = std::max (alignment, DECL_ALIGN (field));
5699 	if (DECL_BIT_FIELD_TYPE (field))
5700 	  bitfield_alignment
5701 	    = std::max (bitfield_alignment,
5702 			TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5703       }
5704 
5705   if (bitfield_alignment > alignment)
5706     {
5707       *abi_break = true;
5708       return bitfield_alignment;
5709     }
5710 
5711   return alignment;
5712 }
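
/* One case that exercises the bit-field handling above (an illustrative
   assumption rather than a complete characterisation of the GCC 9.1
   change):

     struct __attribute__ ((packed)) s { long long x : 8; char y; };

   Every field has byte alignment, but the bit-field was declared with an
   8-byte type, so BITFIELD_ALIGNMENT (64 bits) exceeds ALIGNMENT (8 bits)
   and the function returns 64 with *ABI_BREAK set.  */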
5713 
5714 /* Layout a function argument according to the AAPCS64 rules.  The rule
5715    numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
5716    mode that was originally given to us by the target hook, whereas the
5717    mode in ARG might be the result of replacing partial SVE modes with
5718    the equivalent integer mode.  */
5719 
5720 static void
5721 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5722 {
5723   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5724   tree type = arg.type;
5725   machine_mode mode = arg.mode;
5726   int ncrn, nvrn, nregs;
5727   bool allocate_ncrn, allocate_nvrn;
5728   HOST_WIDE_INT size;
5729   bool abi_break;
5730 
5731   /* We need to do this once per argument.  */
5732   if (pcum->aapcs_arg_processed)
5733     return;
5734 
5735   pcum->aapcs_arg_processed = true;
5736 
5737   pure_scalable_type_info pst_info;
5738   if (type && pst_info.analyze_registers (type))
5739     {
5740       /* The PCS says that it is invalid to pass an SVE value to an
5741 	 unprototyped function.  There is no ABI-defined location we
5742 	 can return in this case, so we have no real choice but to raise
5743 	 an error immediately, even though this is only a query function.  */
5744       if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5745 	{
5746 	  gcc_assert (!pcum->silent_p);
5747 	  error ("SVE type %qT cannot be passed to an unprototyped function",
5748 		 arg.type);
5749 	  /* Avoid repeating the message, and avoid tripping the assert
5750 	     below.  */
5751 	  pcum->pcs_variant = ARM_PCS_SVE;
5752 	}
5753 
5754       /* We would have converted the argument into pass-by-reference
5755 	 form if it didn't fit in registers.  */
5756       pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5757       pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
5758       gcc_assert (arg.named
5759 		  && pcum->pcs_variant == ARM_PCS_SVE
5760 		  && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5761 		  && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5762       pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5763 					  P0_REGNUM + pcum->aapcs_nprn);
5764       return;
5765     }
5766 
5767   /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5768      are passed by reference, not by value.  */
5769   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5770   bool sve_p = (vec_flags & VEC_ANY_SVE);
5771   if (sve_p)
5772     /* Vector types can acquire a partial SVE mode using things like
5773        __attribute__((vector_size(N))), and this is potentially useful.
5774        However, the choice of mode doesn't affect the type's ABI
5775        identity, so we should treat the types as though they had
5776        the associated integer mode, just like they did before SVE
5777        was introduced.
5778 
5779        We know that the vector must be 128 bits or smaller,
5780        otherwise we'd have passed it in memory instead.  */
5781     gcc_assert (type
5782 		&& (aarch64_some_values_include_pst_objects_p (type)
5783 		    || (vec_flags & VEC_PARTIAL)));
5784 
5785   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
5786   if (type)
5787     size = int_size_in_bytes (type);
5788   else
5789     /* No frontends can create types with variable-sized modes, so we
5790        shouldn't be asked to pass or return them.  */
5791     size = GET_MODE_SIZE (mode).to_constant ();
5792   size = ROUND_UP (size, UNITS_PER_WORD);
5793 
5794   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5795   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5796 						 mode,
5797 						 type,
5798 						 &nregs);
5799   gcc_assert (!sve_p || !allocate_nvrn);
5800 
5801   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
5802      The following code thus handles passing by SIMD/FP registers first.  */
5803 
5804   nvrn = pcum->aapcs_nvrn;
5805 
5806   /* C1 - C5 for floating point, homogeneous floating point aggregates (HFA)
5807      and homogeneous short-vector aggregates (HVA).  */
5808   if (allocate_nvrn)
5809     {
5810       if (!pcum->silent_p && !TARGET_FLOAT)
5811 	aarch64_err_no_fpadvsimd (mode);
5812 
5813       if (nvrn + nregs <= NUM_FP_ARG_REGS)
5814 	{
5815 	  pcum->aapcs_nextnvrn = nvrn + nregs;
5816 	  if (!aarch64_composite_type_p (type, mode))
5817 	    {
5818 	      gcc_assert (nregs == 1);
5819 	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5820 	    }
5821 	  else
5822 	    {
5823 	      rtx par;
5824 	      int i;
5825 	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5826 	      for (i = 0; i < nregs; i++)
5827 		{
5828 		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5829 					 V0_REGNUM + nvrn + i);
5830 		  rtx offset = gen_int_mode
5831 		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5832 		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5833 		  XVECEXP (par, 0, i) = tmp;
5834 		}
5835 	      pcum->aapcs_reg = par;
5836 	    }
5837 	  return;
5838 	}
5839       else
5840 	{
5841 	  /* C.3 NSRN is set to 8.  */
5842 	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5843 	  goto on_stack;
5844 	}
5845     }
5846 
5847   ncrn = pcum->aapcs_ncrn;
5848   nregs = size / UNITS_PER_WORD;
5849 
5850   /* C6 - C9, though the sign and zero extension semantics are
5851      handled elsewhere.  This is the case where the argument fits
5852      entirely in general registers.  */
5853   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5854     {
5855       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5856 
5857       /* C.8 if the argument has an alignment of 16 then the NGRN is
5858 	 rounded up to the next even number.  */
5859       if (nregs == 2
5860 	  && ncrn % 2
5861 	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5862 	     comparison is there because for > 16 * BITS_PER_UNIT
5863 	     alignment nregs should be > 2 and therefore it should be
5864 	     passed by reference rather than value.  */
5865 	  && (aarch64_function_arg_alignment (mode, type, &abi_break)
5866 	      == 16 * BITS_PER_UNIT))
5867 	{
5868 	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5869 	    inform (input_location, "parameter passing for argument of type "
5870 		    "%qT changed in GCC 9.1", type);
5871 	  ++ncrn;
5872 	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5873 	}
5874 
5875       /* If an argument with an SVE mode needs to be shifted up to the
5876 	 high part of the register, treat it as though it had an integer mode.
5877 	 Using the normal (parallel [...]) would suppress the shifting.  */
5878       if (sve_p
5879 	  && BYTES_BIG_ENDIAN
5880 	  && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
5881 	  && aarch64_pad_reg_upward (mode, type, false))
5882 	{
5883 	  mode = int_mode_for_mode (mode).require ();
5884 	  sve_p = false;
5885 	}
5886 
5887       /* NREGS can be 0 when e.g. an empty structure is to be passed.
5888 	 A reg is still generated for it, but the caller should be smart
5889 	 enough not to use it.  */
5890       if (nregs == 0
5891 	  || (nregs == 1 && !sve_p)
5892 	  || GET_MODE_CLASS (mode) == MODE_INT)
5893 	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5894       else
5895 	{
5896 	  rtx par;
5897 	  int i;
5898 
5899 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5900 	  for (i = 0; i < nregs; i++)
5901 	    {
5902 	      scalar_int_mode reg_mode = word_mode;
5903 	      if (nregs == 1)
5904 		reg_mode = int_mode_for_mode (mode).require ();
5905 	      rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
5906 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5907 				       GEN_INT (i * UNITS_PER_WORD));
5908 	      XVECEXP (par, 0, i) = tmp;
5909 	    }
5910 	  pcum->aapcs_reg = par;
5911 	}
5912 
5913       pcum->aapcs_nextncrn = ncrn + nregs;
5914       return;
5915     }
5916 
5917   /* C.11  */
5918   pcum->aapcs_nextncrn = NUM_ARG_REGS;
5919 
5920   /* The argument is passed on stack; record the needed number of words for
5921      this argument and align the total size if necessary.  */
5922 on_stack:
5923   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5924 
5925   if (aarch64_function_arg_alignment (mode, type, &abi_break)
5926       == 16 * BITS_PER_UNIT)
5927     {
5928       int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5929       if (pcum->aapcs_stack_size != new_size)
5930 	{
5931 	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5932 	    inform (input_location, "parameter passing for argument of type "
5933 		    "%qT changed in GCC 9.1", type);
5934 	  pcum->aapcs_stack_size = new_size;
5935 	}
5936     }
5937   return;
5938 }
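
/* A short example of rule C.8 above (illustrative, assuming the base
   AAPCS64 variant):

     void f (int a, __int128 b);

   A is passed in W0.  B needs two registers and has 16-byte alignment,
   so the NGRN is rounded up from 1 to 2 and B is passed in the pair
   X2:X3, leaving X1 unused.  */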
5939 
5940 /* Implement TARGET_FUNCTION_ARG.  */
5941 
5942 static rtx
5943 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5944 {
5945   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5946   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5947 	      || pcum->pcs_variant == ARM_PCS_SIMD
5948 	      || pcum->pcs_variant == ARM_PCS_SVE);
5949 
5950   if (arg.end_marker_p ())
5951     return gen_int_mode (pcum->pcs_variant, DImode);
5952 
5953   aarch64_layout_arg (pcum_v, arg);
5954   return pcum->aapcs_reg;
5955 }
5956 
5957 void
5958 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5959 			      const_tree fntype,
5960 			      rtx libname ATTRIBUTE_UNUSED,
5961 			      const_tree fndecl ATTRIBUTE_UNUSED,
5962 			      unsigned n_named ATTRIBUTE_UNUSED,
5963 			      bool silent_p)
5964 {
5965   pcum->aapcs_ncrn = 0;
5966   pcum->aapcs_nvrn = 0;
5967   pcum->aapcs_nprn = 0;
5968   pcum->aapcs_nextncrn = 0;
5969   pcum->aapcs_nextnvrn = 0;
5970   pcum->aapcs_nextnprn = 0;
5971   if (fntype)
5972     pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5973   else
5974     pcum->pcs_variant = ARM_PCS_AAPCS64;
5975   pcum->aapcs_reg = NULL_RTX;
5976   pcum->aapcs_arg_processed = false;
5977   pcum->aapcs_stack_words = 0;
5978   pcum->aapcs_stack_size = 0;
5979   pcum->silent_p = silent_p;
5980 
5981   if (!silent_p
5982       && !TARGET_FLOAT
5983       && fndecl && TREE_PUBLIC (fndecl)
5984       && fntype && fntype != error_mark_node)
5985     {
5986       const_tree type = TREE_TYPE (fntype);
5987       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
5988       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
5989       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5990 						   &mode, &nregs, NULL, false))
5991 	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5992     }
5993 
5994   if (!silent_p
5995       && !TARGET_SVE
5996       && pcum->pcs_variant == ARM_PCS_SVE)
5997     {
5998       /* We can't gracefully recover at this point, so make this a
5999 	 fatal error.  */
6000       if (fndecl)
6001 	fatal_error (input_location, "%qE requires the SVE ISA extension",
6002 		     fndecl);
6003       else
6004 	fatal_error (input_location, "calls to functions of type %qT require"
6005 		     " the SVE ISA extension", fntype);
6006     }
6007 }
6008 
6009 static void
6010 aarch64_function_arg_advance (cumulative_args_t pcum_v,
6011 			      const function_arg_info &arg)
6012 {
6013   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6014   if (pcum->pcs_variant == ARM_PCS_AAPCS64
6015       || pcum->pcs_variant == ARM_PCS_SIMD
6016       || pcum->pcs_variant == ARM_PCS_SVE)
6017     {
6018       aarch64_layout_arg (pcum_v, arg);
6019       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6020 		  != (pcum->aapcs_stack_words != 0));
6021       pcum->aapcs_arg_processed = false;
6022       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6023       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
6024       pcum->aapcs_nprn = pcum->aapcs_nextnprn;
6025       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6026       pcum->aapcs_stack_words = 0;
6027       pcum->aapcs_reg = NULL_RTX;
6028     }
6029 }
6030 
6031 bool
6032 aarch64_function_arg_regno_p (unsigned regno)
6033 {
6034   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6035 	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6036 }
6037 
6038 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
6039    PARM_BOUNDARY bits of alignment, but will be given anything up
6040    to STACK_BOUNDARY bits if the type requires it.  This makes sure
6041    that both before and after the layout of each argument, the Next
6042    Stacked Argument Address (NSAA) will have a minimum alignment of
6043    8 bytes.  */
6044 
6045 static unsigned int
6046 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
6047 {
6048   bool abi_break;
6049   unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6050 							   &abi_break);
6051   if (abi_break && warn_psabi)
6052     inform (input_location, "parameter passing for argument of type "
6053 	    "%qT changed in GCC 9.1", type);
6054 
6055   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
6056 }
6057 
6058 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
6059 
6060 static fixed_size_mode
6061 aarch64_get_reg_raw_mode (int regno)
6062 {
6063   if (TARGET_SVE && FP_REGNUM_P (regno))
6064     /* Don't use the SVE part of the register for __builtin_apply and
6065        __builtin_return.  The SVE registers aren't used by the normal PCS,
6066        so using them there would be a waste of time.  The PCS extensions
6067        for SVE types are fundamentally incompatible with the
6068        __builtin_return/__builtin_apply interface.  */
6069     return as_a <fixed_size_mode> (V16QImode);
6070   return default_get_reg_raw_mode (regno);
6071 }
6072 
6073 /* Implement TARGET_FUNCTION_ARG_PADDING.
6074 
6075    Small aggregate types are placed in the lowest memory address.
6076 
6077    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
6078 
6079 static pad_direction
6080 aarch64_function_arg_padding (machine_mode mode, const_tree type)
6081 {
6082   /* On little-endian targets, the least significant byte of every stack
6083      argument is passed at the lowest byte address of the stack slot.  */
6084   if (!BYTES_BIG_ENDIAN)
6085     return PAD_UPWARD;
6086 
6087   /* Otherwise, integral, floating-point and pointer types are padded downward:
6088      the least significant byte of a stack argument is passed at the highest
6089      byte address of the stack slot.  */
6090   if (type
6091       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6092 	 || POINTER_TYPE_P (type))
6093       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
6094     return PAD_DOWNWARD;
6095 
6096   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
6097   return PAD_UPWARD;
6098 }
6099 
6100 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6101 
6102    It specifies padding for the last (possibly the only) element
6103    of a block move between registers and memory.  Viewing the block
6104    as it sits in memory, padding upward means that the last element
6105    is padded after its most significant byte, while padding downward
6106    means that the last element is padded on its least significant
6107    byte side.
6108 
6109    Small aggregates and small complex types are always padded
6110    upwards.
6111 
6112    We don't need to worry about homogeneous floating-point or
6113    short-vector aggregates; their move is not affected by the
6114    padding direction determined here.  Regardless of endianness,
6115    each element of such an aggregate is put in the least
6116    significant bits of a fp/simd register.
6117 
6118    Return !BYTES_BIG_ENDIAN if the least significant byte of the
6119    register has useful data, and return the opposite if the most
6120    significant byte does.  */
6121 
6122 bool
6123 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6124 		     bool first ATTRIBUTE_UNUSED)
6125 {
6126 
6127   /* Aside from pure scalable types, small composite types are always
6128      padded upward.  */
6129   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6130     {
6131       HOST_WIDE_INT size;
6132       if (type)
6133 	size = int_size_in_bytes (type);
6134       else
6135 	/* No frontends can create types with variable-sized modes, so we
6136 	   shouldn't be asked to pass or return them.  */
6137 	size = GET_MODE_SIZE (mode).to_constant ();
6138       if (size < 2 * UNITS_PER_WORD)
6139 	{
6140 	  pure_scalable_type_info pst_info;
6141 	  if (pst_info.analyze_registers (type))
6142 	    return false;
6143 	  return true;
6144 	}
6145     }
6146 
6147   /* Otherwise, use the default padding.  */
6148   return !BYTES_BIG_ENDIAN;
6149 }
6150 
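/* Implement TARGET_LIBGCC_CMP_RETURN_MODE.  */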
6151 static scalar_int_mode
6152 aarch64_libgcc_cmp_return_mode (void)
6153 {
6154   return SImode;
6155 }
6156 
6157 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6158 
6159 /* We use the 12-bit shifted immediate arithmetic instructions so values
6160    must be multiples of (1 << 12), i.e. 4096.  */
6161 #define ARITH_FACTOR 4096
6162 
6163 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6164 #error Cannot use simple address calculation for stack probing
6165 #endif
6166 
6167 /* The pair of scratch registers used for stack probing.  */
6168 #define PROBE_STACK_FIRST_REG  R9_REGNUM
6169 #define PROBE_STACK_SECOND_REG R10_REGNUM
6170 
6171 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6172    inclusive.  These are offsets from the current stack pointer.  */
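/* The code below chooses between three strategies: a single probe when
   POLY_SIZE fits within one PROBE_INTERVAL, an unrolled sequence of probes
   for up to four intervals, and a run-time loop (plus a possible residual
   probe) for anything larger.  */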
6173 
6174 static void
6175 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6176 {
6177   HOST_WIDE_INT size;
6178   if (!poly_size.is_constant (&size))
6179     {
6180       sorry ("stack probes for SVE frames");
6181       return;
6182     }
6183 
6184   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
6185 
6186   /* See the same assertion on PROBE_INTERVAL above.  */
6187   gcc_assert ((first % ARITH_FACTOR) == 0);
6188 
6189   /* See if we have a constant small number of probes to generate.  If so,
6190      that's the easy case.  */
6191   if (size <= PROBE_INTERVAL)
6192     {
6193       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6194 
6195       emit_set_insn (reg1,
6196 		     plus_constant (Pmode,
6197 				    stack_pointer_rtx, -(first + base)));
6198       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6199     }
6200 
6201   /* The run-time loop is made up of 8 insns in the generic case while the
6202      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
6203   else if (size <= 4 * PROBE_INTERVAL)
6204     {
6205       HOST_WIDE_INT i, rem;
6206 
6207       emit_set_insn (reg1,
6208 		     plus_constant (Pmode,
6209 				    stack_pointer_rtx,
6210 				    -(first + PROBE_INTERVAL)));
6211       emit_stack_probe (reg1);
6212 
6213       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6214 	 it exceeds SIZE.  If only two probes are needed, this will not
6215 	 generate any code.  Then probe at FIRST + SIZE.  */
6216       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6217 	{
6218 	  emit_set_insn (reg1,
6219 			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6220 	  emit_stack_probe (reg1);
6221 	}
6222 
6223       rem = size - (i - PROBE_INTERVAL);
6224       if (rem > 256)
6225 	{
6226 	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6227 
6228 	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6229 	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6230 	}
6231       else
6232 	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6233     }
6234 
6235   /* Otherwise, do the same as above, but in a loop.  Note that we must be
6236      extra careful with variables wrapping around because we might be at
6237      the very top (or the very bottom) of the address space and we have
6238      to be able to handle this case properly; in particular, we use an
6239      equality test for the loop condition.  */
6240   else
6241     {
6242       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
6243 
6244       /* Step 1: round SIZE to the previous multiple of the interval.  */
6245 
6246       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6247 
6248 
6249       /* Step 2: compute initial and final value of the loop counter.  */
6250 
6251       /* TEST_ADDR = SP + FIRST.  */
6252       emit_set_insn (reg1,
6253 		     plus_constant (Pmode, stack_pointer_rtx, -first));
6254 
6255       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
6256       HOST_WIDE_INT adjustment = - (first + rounded_size);
6257       if (! aarch64_uimm12_shift (adjustment))
6258 	{
6259 	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6260 					  true, Pmode);
6261 	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6262 	}
6263       else
6264 	emit_set_insn (reg2,
6265 		       plus_constant (Pmode, stack_pointer_rtx, adjustment));
6266 
6267       /* Step 3: the loop
6268 
6269 	 do
6270 	   {
6271 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6272 	     probe at TEST_ADDR
6273 	   }
6274 	 while (TEST_ADDR != LAST_ADDR)
6275 
6276 	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6277 	 until it is equal to ROUNDED_SIZE.  */
6278 
6279       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6280 
6281 
6282       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6283 	 that SIZE is equal to ROUNDED_SIZE.  */
6284 
6285       if (size != rounded_size)
6286 	{
6287 	  HOST_WIDE_INT rem = size - rounded_size;
6288 
6289 	  if (rem > 256)
6290 	    {
6291 	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6292 
6293 	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6294 	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6295 	    }
6296 	  else
6297 	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6298 	}
6299     }
6300 
6301   /* Make sure nothing is scheduled before we are done.  */
6302   emit_insn (gen_blockage ());
6303 }
6304 
6305 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
6306    absolute addresses.  */
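/* The emitted sequence is:

     .LPSRL<N>:
	sub	reg1, reg1, <interval>
	str	xzr, [reg1, <offset>]
	cmp	reg1, reg2
	b.ne	.LPSRL<N>

   where <offset> is 0, or STACK_CLASH_CALLER_GUARD when stack clash
   protection is enabled.  */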
6307 
6308 const char *
6309 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6310 {
6311   static int labelno = 0;
6312   char loop_lab[32];
6313   rtx xops[2];
6314 
6315   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6316 
6317   /* Loop.  */
6318   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6319 
6320   HOST_WIDE_INT stack_clash_probe_interval
6321     = 1 << param_stack_clash_protection_guard_size;
6322 
6323   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
6324   xops[0] = reg1;
6325   HOST_WIDE_INT interval;
6326   if (flag_stack_clash_protection)
6327     interval = stack_clash_probe_interval;
6328   else
6329     interval = PROBE_INTERVAL;
6330 
6331   gcc_assert (aarch64_uimm12_shift (interval));
6332   xops[1] = GEN_INT (interval);
6333 
6334   output_asm_insn ("sub\t%0, %0, %1", xops);
6335 
6336   /* If doing stack clash protection then we probe up by the ABI-specified
6337      amount.  We do this because we're dropping full pages at a time in the
6338      loop.  But if we're doing non-stack-clash probing, probe at SP + 0.  */
6339   if (flag_stack_clash_protection)
6340     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6341   else
6342     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6343 
6344   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
6345      by this amount for each iteration.  */
6346   output_asm_insn ("str\txzr, [%0, %1]", xops);
6347 
6348   /* Test if TEST_ADDR == LAST_ADDR.  */
6349   xops[1] = reg2;
6350   output_asm_insn ("cmp\t%0, %1", xops);
6351 
6352   /* Branch.  */
6353   fputs ("\tb.ne\t", asm_out_file);
6354   assemble_name_raw (asm_out_file, loop_lab);
6355   fputc ('\n', asm_out_file);
6356 
6357   return "";
6358 }
6359 
6360 /* Emit the probe loop for doing stack clash probes and stack adjustments for
6361    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6362    of GUARD_SIZE.  Each probe is emitted at most MIN_PROBE_THRESHOLD bytes from
6363    the current BASE, and consecutive probes are at most MIN_PROBE_THRESHOLD
6364    bytes apart.  By the end of this function
6365    BASE = BASE - ADJUSTMENT.  */
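/* The emitted sequence is:

     .SVLPSPL<N>:
	cmp	adjustment, <residual_probe_guard>
	b.lt	.SVLPEND<N>
	sub	base, base, <residual_probe_guard>
	str	xzr, [base, 0]
	sub	adjustment, adjustment, <residual_probe_guard>
	b	.SVLPSPL<N>
     .SVLPEND<N>:
	sub	base, base, adjustment  */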
6366 
6367 const char *
6368 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6369 				      rtx min_probe_threshold, rtx guard_size)
6370 {
6371   /* This function is not allowed to use any instruction generation function
6372      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
6373      so instead emit the code you want using output_asm_insn.  */
6374   gcc_assert (flag_stack_clash_protection);
6375   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6376   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6377 
6378   /* The minimum required allocation before the residual requires probing.  */
6379   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6380 
6381   /* Clamp the value down to the nearest value that can be used with a cmp.  */
6382   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6383   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6384 
6385   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6386   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6387 
6388   static int labelno = 0;
6389   char loop_start_lab[32];
6390   char loop_end_lab[32];
6391   rtx xops[2];
6392 
6393   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6394   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6395 
6396   /* Emit loop start label.  */
6397   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6398 
6399   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
6400   xops[0] = adjustment;
6401   xops[1] = probe_offset_value_rtx;
6402   output_asm_insn ("cmp\t%0, %1", xops);
6403 
6404   /* Branch to end if not enough adjustment to probe.  */
6405   fputs ("\tb.lt\t", asm_out_file);
6406   assemble_name_raw (asm_out_file, loop_end_lab);
6407   fputc ('\n', asm_out_file);
6408 
6409   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
6410   xops[0] = base;
6411   xops[1] = probe_offset_value_rtx;
6412   output_asm_insn ("sub\t%0, %0, %1", xops);
6413 
6414   /* Probe at BASE.  */
6415   xops[1] = const0_rtx;
6416   output_asm_insn ("str\txzr, [%0, %1]", xops);
6417 
6418   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
6419   xops[0] = adjustment;
6420   xops[1] = probe_offset_value_rtx;
6421   output_asm_insn ("sub\t%0, %0, %1", xops);
6422 
6423   /* Branch to start if still more bytes to allocate.  */
6424   fputs ("\tb\t", asm_out_file);
6425   assemble_name_raw (asm_out_file, loop_start_lab);
6426   fputc ('\n', asm_out_file);
6427 
6428   /* Loop exit: the remaining adjustment needs no probe.  */
6429   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6430 
6431   /* BASE = BASE - ADJUSTMENT.  */
6432   xops[0] = base;
6433   xops[1] = adjustment;
6434   output_asm_insn ("sub\t%0, %0, %1", xops);
6435   return "";
6436 }
6437 
6438 /* Determine whether a frame chain needs to be generated.  */
6439 static bool
6440 aarch64_needs_frame_chain (void)
6441 {
6442   /* Force a frame chain for EH returns so the return address is at FP+8.  */
6443   if (frame_pointer_needed || crtl->calls_eh_return)
6444     return true;
6445 
6446   /* A leaf function cannot have calls or write LR.  */
6447   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6448 
6449   /* Don't use a frame chain in leaf functions if leaf frame pointers
6450      are disabled.  */
6451   if (flag_omit_leaf_frame_pointer && is_leaf)
6452     return false;
6453 
6454   return aarch64_use_frame_pointer;
6455 }
6456 
6457 /* Mark the registers that need to be saved by the callee and calculate
6458    the size of the callee-saved registers area and frame record (both FP
6459    and LR may be omitted).  */
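/* The register save area is laid out from offset 0 upwards in the order:
   SVE predicate saves, SVE vector saves, the frame record (FP and LR,
   when a frame chain is needed), the remaining GP saves and finally the
   FP/SIMD saves.  */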
6460 static void
6461 aarch64_layout_frame (void)
6462 {
6463   poly_int64 offset = 0;
6464   int regno, last_fp_reg = INVALID_REGNUM;
6465   machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6466   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6467   bool frame_related_fp_reg_p = false;
6468   aarch64_frame &frame = cfun->machine->frame;
6469 
6470   frame.emit_frame_chain = aarch64_needs_frame_chain ();
6471 
6472   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
6473      the mid-end is doing.  */
6474   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6475 
6476 #define SLOT_NOT_REQUIRED (-2)
6477 #define SLOT_REQUIRED     (-1)
6478 
6479   frame.wb_candidate1 = INVALID_REGNUM;
6480   frame.wb_candidate2 = INVALID_REGNUM;
6481   frame.spare_pred_reg = INVALID_REGNUM;
6482 
6483   /* First mark all the registers that really need to be saved...  */
6484   for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6485     frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
6486 
6487   /* ... that includes the eh data registers (if needed)...  */
6488   if (crtl->calls_eh_return)
6489     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
6490       frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
6491 
6492   /* ... and any callee saved register that dataflow says is live.  */
6493   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6494     if (df_regs_ever_live_p (regno)
6495 	&& !fixed_regs[regno]
6496 	&& (regno == R30_REGNUM
6497 	    || !crtl->abi->clobbers_full_reg_p (regno)))
6498       frame.reg_offset[regno] = SLOT_REQUIRED;
6499 
6500   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6501     if (df_regs_ever_live_p (regno)
6502 	&& !fixed_regs[regno]
6503 	&& !crtl->abi->clobbers_full_reg_p (regno))
6504       {
6505 	frame.reg_offset[regno] = SLOT_REQUIRED;
6506 	last_fp_reg = regno;
6507 	if (aarch64_emit_cfi_for_reg_p (regno))
6508 	  frame_related_fp_reg_p = true;
6509       }
6510 
6511   /* Big-endian SVE frames need a spare predicate register in order
6512      to save Z8-Z15.  Decide which register they should use.  Prefer
6513      an unused argument register if possible, so that we don't force P4
6514      to be saved unnecessarily.  */
6515   if (frame_related_fp_reg_p
6516       && crtl->abi->id () == ARM_PCS_SVE
6517       && BYTES_BIG_ENDIAN)
6518     {
6519       bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6520       bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6521       for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6522 	if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6523 	  break;
6524       gcc_assert (regno <= P7_REGNUM);
6525       frame.spare_pred_reg = regno;
6526       df_set_regs_ever_live (regno, true);
6527     }
6528 
6529   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6530     if (df_regs_ever_live_p (regno)
6531 	&& !fixed_regs[regno]
6532 	&& !crtl->abi->clobbers_full_reg_p (regno))
6533       frame.reg_offset[regno] = SLOT_REQUIRED;
6534 
6535   /* With stack-clash, LR must be saved in non-leaf functions.  */
6536   gcc_assert (crtl->is_leaf
6537 	      || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6538 
6539   /* Now assign stack slots for the registers.  Start with the predicate
6540      registers, since predicate LDR and STR have a relatively small
6541      offset range.  These saves happen below the hard frame pointer.  */
6542   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6543     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6544       {
6545 	frame.reg_offset[regno] = offset;
6546 	offset += BYTES_PER_SVE_PRED;
6547       }
6548 
6549   if (maybe_ne (offset, 0))
6550     {
6551       /* If we have any vector registers to save above the predicate registers,
6552 	 the offset of the vector register save slots needs to be a multiple
6553 	 of the vector size.  This lets us use the immediate forms of LDR/STR
6554 	 (or LD1/ST1 for big-endian).
6555 
6556 	 A vector register is 8 times the size of a predicate register,
6557 	 and we need to save a maximum of 12 predicate registers, so the
6558 	 first vector register will be at either #1, MUL VL or #2, MUL VL.
6559 
6560 	 If we don't have any vector registers to save, and we know how
6561 	 big the predicate save area is, we can just round it up to the
6562 	 next 16-byte boundary.  */
6563       if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6564 	offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6565       else
6566 	{
6567 	  if (known_le (offset, vector_save_size))
6568 	    offset = vector_save_size;
6569 	  else if (known_le (offset, vector_save_size * 2))
6570 	    offset = vector_save_size * 2;
6571 	  else
6572 	    gcc_unreachable ();
6573 	}
6574     }
6575 
6576   /* If we need to save any SVE vector registers, add them next.  */
6577   if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6578     for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6579       if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6580 	{
6581 	  frame.reg_offset[regno] = offset;
6582 	  offset += vector_save_size;
6583 	}
6584 
6585   /* OFFSET is now the offset of the hard frame pointer from the bottom
6586      of the callee save area.  */
6587   bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6588   frame.below_hard_fp_saved_regs_size = offset;
6589   if (frame.emit_frame_chain)
6590     {
6591       /* FP and LR are placed in the linkage record.  */
6592       frame.reg_offset[R29_REGNUM] = offset;
6593       frame.wb_candidate1 = R29_REGNUM;
6594       frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6595       frame.wb_candidate2 = R30_REGNUM;
6596       offset += 2 * UNITS_PER_WORD;
6597     }
6598 
6599   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6600     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6601       {
6602 	frame.reg_offset[regno] = offset;
6603 	if (frame.wb_candidate1 == INVALID_REGNUM)
6604 	  frame.wb_candidate1 = regno;
6605 	else if (frame.wb_candidate2 == INVALID_REGNUM)
6606 	  frame.wb_candidate2 = regno;
6607 	offset += UNITS_PER_WORD;
6608       }
6609 
6610   poly_int64 max_int_offset = offset;
6611   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6612   bool has_align_gap = maybe_ne (offset, max_int_offset);
6613 
6614   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6615     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6616       {
6617 	/* If there is an alignment gap between integer and fp callee-saves,
6618 	   allocate the last fp register to it if possible.  */
6619 	if (regno == last_fp_reg
6620 	    && has_align_gap
6621 	    && known_eq (vector_save_size, 8)
6622 	    && multiple_p (offset, 16))
6623 	  {
6624 	    frame.reg_offset[regno] = max_int_offset;
6625 	    break;
6626 	  }
6627 
6628 	frame.reg_offset[regno] = offset;
6629 	if (frame.wb_candidate1 == INVALID_REGNUM)
6630 	  frame.wb_candidate1 = regno;
6631 	else if (frame.wb_candidate2 == INVALID_REGNUM
6632 		 && frame.wb_candidate1 >= V0_REGNUM)
6633 	  frame.wb_candidate2 = regno;
6634 	offset += vector_save_size;
6635       }
6636 
6637   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6638 
6639   frame.saved_regs_size = offset;
6640 
6641   poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6642 
6643   poly_int64 above_outgoing_args
6644     = aligned_upper_bound (varargs_and_saved_regs_size
6645 			   + get_frame_size (),
6646 			   STACK_BOUNDARY / BITS_PER_UNIT);
6647 
6648   frame.hard_fp_offset
6649     = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6650 
6651   /* Both these values are already aligned.  */
6652   gcc_assert (multiple_p (crtl->outgoing_args_size,
6653 			  STACK_BOUNDARY / BITS_PER_UNIT));
6654   frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6655 
6656   frame.locals_offset = frame.saved_varargs_size;
6657 
6658   frame.initial_adjust = 0;
6659   frame.final_adjust = 0;
6660   frame.callee_adjust = 0;
6661   frame.sve_callee_adjust = 0;
6662   frame.callee_offset = 0;
6663 
6664   HOST_WIDE_INT max_push_offset = 0;
6665   if (frame.wb_candidate2 != INVALID_REGNUM)
6666     max_push_offset = 512;
6667   else if (frame.wb_candidate1 != INVALID_REGNUM)
6668     max_push_offset = 256;
6669 
6670   HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6671   HOST_WIDE_INT const_saved_regs_size;
6672   if (frame.frame_size.is_constant (&const_size)
6673       && const_size < max_push_offset
6674       && known_eq (frame.hard_fp_offset, const_size))
6675     {
6676       /* Simple, small frame with no outgoing arguments:
6677 
6678 	 stp reg1, reg2, [sp, -frame_size]!
6679 	 stp reg3, reg4, [sp, 16]  */
6680       frame.callee_adjust = const_size;
6681     }
6682   else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6683 	   && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6684 	   && const_outgoing_args_size + const_saved_regs_size < 512
6685 	   /* We could handle this case even with outgoing args, provided
6686 	      that the number of args left us with valid offsets for all
6687 	      predicate and vector save slots.  It's such a rare case that
6688 	      it hardly seems worth the effort though.  */
6689 	   && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6690 	   && !(cfun->calls_alloca
6691 		&& frame.hard_fp_offset.is_constant (&const_fp_offset)
6692 		&& const_fp_offset < max_push_offset))
6693     {
6694       /* Frame with small outgoing arguments:
6695 
6696 	 sub sp, sp, frame_size
6697 	 stp reg1, reg2, [sp, outgoing_args_size]
6698 	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
6699       frame.initial_adjust = frame.frame_size;
6700       frame.callee_offset = const_outgoing_args_size;
6701     }
6702   else if (saves_below_hard_fp_p
6703 	   && known_eq (frame.saved_regs_size,
6704 			frame.below_hard_fp_saved_regs_size))
6705     {
6706       /* Frame in which all saves are SVE saves:
6707 
6708 	 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6709 	 save SVE registers relative to SP
6710 	 sub sp, sp, outgoing_args_size  */
6711       frame.initial_adjust = (frame.hard_fp_offset
6712 			      + frame.below_hard_fp_saved_regs_size);
6713       frame.final_adjust = crtl->outgoing_args_size;
6714     }
6715   else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6716 	   && const_fp_offset < max_push_offset)
6717     {
6718       /* Frame with large outgoing arguments or SVE saves, but with
6719 	 a small local area:
6720 
6721 	 stp reg1, reg2, [sp, -hard_fp_offset]!
6722 	 stp reg3, reg4, [sp, 16]
6723 	 [sub sp, sp, below_hard_fp_saved_regs_size]
6724 	 [save SVE registers relative to SP]
6725 	 sub sp, sp, outgoing_args_size  */
6726       frame.callee_adjust = const_fp_offset;
6727       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6728       frame.final_adjust = crtl->outgoing_args_size;
6729     }
6730   else
6731     {
6732       /* Frame with large local area and outgoing arguments or SVE saves,
6733 	 using frame pointer:
6734 
6735 	 sub sp, sp, hard_fp_offset
6736 	 stp x29, x30, [sp, 0]
6737 	 add x29, sp, 0
6738 	 stp reg3, reg4, [sp, 16]
6739 	 [sub sp, sp, below_hard_fp_saved_regs_size]
6740 	 [save SVE registers relative to SP]
6741 	 sub sp, sp, outgoing_args_size  */
6742       frame.initial_adjust = frame.hard_fp_offset;
6743       frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6744       frame.final_adjust = crtl->outgoing_args_size;
6745     }
6746 
6747   /* Make sure the individual adjustments add up to the full frame size.  */
6748   gcc_assert (known_eq (frame.initial_adjust
6749 			+ frame.callee_adjust
6750 			+ frame.sve_callee_adjust
6751 			+ frame.final_adjust, frame.frame_size));
6752 
6753   frame.laid_out = true;
6754 }
6755 
6756 /* Return true if the register REGNO is saved on entry to
6757    the current function.  */
6758 
6759 static bool
6760 aarch64_register_saved_on_entry (int regno)
6761 {
6762   return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6763 }
6764 
6765 /* Return the next register, from REGNO up to LIMIT, that the callee
6766    needs to save.  */
6767 
6768 static unsigned
6769 aarch64_next_callee_save (unsigned regno, unsigned limit)
6770 {
6771   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6772     regno ++;
6773   return regno;
6774 }
6775 
6776 /* Push register number REGNO of mode MODE to the stack, using write-back
6777    to adjust the stack pointer by ADJUSTMENT.  */
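/* For example, for a DImode register this emits a single pre-indexed store
   such as "str x30, [sp, -16]!".  */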
6778 
6779 static void
6780 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6781 			   HOST_WIDE_INT adjustment)
6782  {
6783   rtx base_rtx = stack_pointer_rtx;
6784   rtx insn, reg, mem;
6785 
6786   reg = gen_rtx_REG (mode, regno);
6787   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6788 			    plus_constant (Pmode, base_rtx, -adjustment));
6789   mem = gen_frame_mem (mode, mem);
6790 
6791   insn = emit_move_insn (mem, reg);
6792   RTX_FRAME_RELATED_P (insn) = 1;
6793 }
6794 
6795 /* Generate and return an instruction to store the pair of registers
6796    REG and REG2 of mode MODE to location BASE with write-back adjusting
6797    the stack location BASE by ADJUSTMENT.  */
6798 
6799 static rtx
6800 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6801 			  HOST_WIDE_INT adjustment)
6802 {
6803   switch (mode)
6804     {
6805     case E_DImode:
6806       return gen_storewb_pairdi_di (base, base, reg, reg2,
6807 				    GEN_INT (-adjustment),
6808 				    GEN_INT (UNITS_PER_WORD - adjustment));
6809     case E_DFmode:
6810       return gen_storewb_pairdf_di (base, base, reg, reg2,
6811 				    GEN_INT (-adjustment),
6812 				    GEN_INT (UNITS_PER_WORD - adjustment));
6813     case E_TFmode:
6814       return gen_storewb_pairtf_di (base, base, reg, reg2,
6815 				    GEN_INT (-adjustment),
6816 				    GEN_INT (UNITS_PER_VREG - adjustment));
6817     default:
6818       gcc_unreachable ();
6819     }
6820 }
6821 
6822 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6823    stack pointer by ADJUSTMENT.  */
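/* For a register pair this expands to a single store pair with write-back,
   such as "stp x29, x30, [sp, -96]!"; for a single register it falls back
   to aarch64_pushwb_single_reg.  */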
6824 
6825 static void
6826 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6827 {
6828   rtx_insn *insn;
6829   machine_mode mode = aarch64_reg_save_mode (regno1);
6830 
6831   if (regno2 == INVALID_REGNUM)
6832     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6833 
6834   rtx reg1 = gen_rtx_REG (mode, regno1);
6835   rtx reg2 = gen_rtx_REG (mode, regno2);
6836 
6837   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6838 					      reg2, adjustment));
6839   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6840   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6841   RTX_FRAME_RELATED_P (insn) = 1;
6842 }
6843 
6844 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
6845    adjusting BASE by ADJUSTMENT afterwards.  */
6846 
6847 static rtx
6848 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6849 			 HOST_WIDE_INT adjustment)
6850 {
6851   switch (mode)
6852     {
6853     case E_DImode:
6854       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6855 				   GEN_INT (UNITS_PER_WORD));
6856     case E_DFmode:
6857       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6858 				   GEN_INT (UNITS_PER_WORD));
6859     case E_TFmode:
6860       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6861 				   GEN_INT (UNITS_PER_VREG));
6862     default:
6863       gcc_unreachable ();
6864     }
6865 }
6866 
6867 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6868    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6869    into CFI_OPS.  */
6870 
6871 static void
6872 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6873 		  rtx *cfi_ops)
6874 {
6875   machine_mode mode = aarch64_reg_save_mode (regno1);
6876   rtx reg1 = gen_rtx_REG (mode, regno1);
6877 
6878   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6879 
6880   if (regno2 == INVALID_REGNUM)
6881     {
6882       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6883       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6884       emit_move_insn (reg1, gen_frame_mem (mode, mem));
6885     }
6886   else
6887     {
6888       rtx reg2 = gen_rtx_REG (mode, regno2);
6889       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6890       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6891 					  reg2, adjustment));
6892     }
6893 }
6894 
6895 /* Generate and return a store pair instruction of mode MODE to store
6896    register REG1 to MEM1 and register REG2 to MEM2.  */
6897 
6898 static rtx
6899 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6900 			rtx reg2)
6901 {
6902   switch (mode)
6903     {
6904     case E_DImode:
6905       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6906 
6907     case E_DFmode:
6908       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6909 
6910     case E_TFmode:
6911       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6912 
6913     default:
6914       gcc_unreachable ();
6915     }
6916 }
6917 
6918 /* Generate and return a load pair instruction of mode MODE to load register
6919    REG1 from MEM1 and register REG2 from MEM2.  */
6920 
6921 static rtx
6922 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6923 		       rtx mem2)
6924 {
6925   switch (mode)
6926     {
6927     case E_DImode:
6928       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6929 
6930     case E_DFmode:
6931       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6932 
6933     case E_TFmode:
6934       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6935 
6936     default:
6937       gcc_unreachable ();
6938     }
6939 }
6940 
6941 /* Return TRUE if return address signing should be enabled for the current
6942    function, otherwise return FALSE.  */
6943 
6944 bool
6945 aarch64_return_address_signing_enabled (void)
6946 {
6947   /* This function should only be called after the frame has been laid out.  */
6948   gcc_assert (cfun->machine->frame.laid_out);
6949 
6950   /* Turn return address signing off in any function that uses
6951      __builtin_eh_return.  The address passed to __builtin_eh_return
6952      is not signed, so either it would have to be signed (with the
6953      original SP) or the code path that uses it would have to avoid
6954      authenticating it.  Currently EH return introduces a return-to-
6955      anywhere gadget no matter what we do here, since it uses RET with
6956      a user-provided address.  An ideal fix would be an indirect branch,
6957      which can be protected with BTI j (to some extent).  */
6958   if (crtl->calls_eh_return)
6959     return false;
6960 
6961   /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
6962      function if its LR is pushed onto the stack.  */
6963   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6964 	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6965 	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6966 }
6967 
6968 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
6969 bool
6970 aarch64_bti_enabled (void)
6971 {
6972   return (aarch64_enable_bti == 1);
6973 }
6974 
6975 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6976    register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6977    the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:
6978 
6979      (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6980 	 or LD1D address
6981 
6982      (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6983 	 if the variable isn't already nonnull
6984 
6985    (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6986    Handle this case using a temporary base register that is suitable for
6987    all offsets in that range.  Use ANCHOR_REG as this base register if it
6988    is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */
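/* For example, if OFFSET is 12 * GET_MODE_SIZE (MODE), the anchor is placed
   at BASE + 16 * GET_MODE_SIZE (MODE) and OFFSET becomes
   -4 * GET_MODE_SIZE (MODE), which is within the ST1D/LD1D range.  */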
6989 
6990 static inline void
6991 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6992 				     rtx &anchor_reg, poly_int64 &offset,
6993 				     rtx &ptrue)
6994 {
6995   if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6996     {
6997       /* This is the maximum valid offset of the anchor from the base.
6998 	 Lower values would be valid too.  */
6999       poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7000       if (!anchor_reg)
7001 	{
7002 	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7003 	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7004 				    gen_int_mode (anchor_offset, Pmode)));
7005 	}
7006       base_rtx = anchor_reg;
7007       offset -= anchor_offset;
7008     }
7009   if (!ptrue)
7010     {
7011       int pred_reg = cfun->machine->frame.spare_pred_reg;
7012       emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7013 		      CONSTM1_RTX (VNx16BImode));
7014       ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7015     }
7016 }
7017 
7018 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7019    is saved at BASE + OFFSET.  */
7020 
7021 static void
7022 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7023 			    rtx base, poly_int64 offset)
7024 {
7025   rtx mem = gen_frame_mem (GET_MODE (reg),
7026 			   plus_constant (Pmode, base, offset));
7027   add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7028 }
7029 
7030 /* Emit code to save the callee-saved registers from register number START
7031    to LIMIT to the stack at the location starting at offset START_OFFSET,
7032    skipping any write-back candidates if SKIP_WB is true.  HARD_FP_VALID_P
7033    is true if the hard frame pointer has been set up.  */
7034 
7035 static void
7036 aarch64_save_callee_saves (poly_int64 start_offset,
7037 			   unsigned start, unsigned limit, bool skip_wb,
7038 			   bool hard_fp_valid_p)
7039 {
7040   rtx_insn *insn;
7041   unsigned regno;
7042   unsigned regno2;
7043   rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7044 
7045   for (regno = aarch64_next_callee_save (start, limit);
7046        regno <= limit;
7047        regno = aarch64_next_callee_save (regno + 1, limit))
7048     {
7049       rtx reg, mem;
7050       poly_int64 offset;
7051       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7052 
7053       if (skip_wb
7054 	  && (regno == cfun->machine->frame.wb_candidate1
7055 	      || regno == cfun->machine->frame.wb_candidate2))
7056 	continue;
7057 
7058       if (cfun->machine->reg_is_wrapped_separately[regno])
7059 	continue;
7060 
7061       machine_mode mode = aarch64_reg_save_mode (regno);
7062       reg = gen_rtx_REG (mode, regno);
7063       offset = start_offset + cfun->machine->frame.reg_offset[regno];
7064       rtx base_rtx = stack_pointer_rtx;
7065       poly_int64 sp_offset = offset;
7066 
7067       HOST_WIDE_INT const_offset;
7068       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7069 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7070 					     offset, ptrue);
7071       else if (GP_REGNUM_P (regno)
7072 	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
7073 	{
7074 	  gcc_assert (known_eq (start_offset, 0));
7075 	  poly_int64 fp_offset
7076 	    = cfun->machine->frame.below_hard_fp_saved_regs_size;
7077 	  if (hard_fp_valid_p)
7078 	    base_rtx = hard_frame_pointer_rtx;
7079 	  else
7080 	    {
7081 	      if (!anchor_reg)
7082 		{
7083 		  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7084 		  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7085 					    gen_int_mode (fp_offset, Pmode)));
7086 		}
7087 	      base_rtx = anchor_reg;
7088 	    }
7089 	  offset -= fp_offset;
7090 	}
7091       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7092       bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
7093 
7094       if (!aarch64_sve_mode_p (mode)
7095 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7096 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
7097 	  && known_eq (GET_MODE_SIZE (mode),
7098 		       cfun->machine->frame.reg_offset[regno2]
7099 		       - cfun->machine->frame.reg_offset[regno]))
7100 	{
7101 	  rtx reg2 = gen_rtx_REG (mode, regno2);
7102 	  rtx mem2;
7103 
7104 	  offset += GET_MODE_SIZE (mode);
7105 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7106 	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7107 						    reg2));
7108 
7109 	  /* The first part of a frame-related parallel insn is
7110 	     always assumed to be relevant to the frame
7111 	     calculations; subsequent parts are only
7112 	     frame-related if explicitly marked.  */
7113 	  if (aarch64_emit_cfi_for_reg_p (regno2))
7114 	    {
7115 	      if (need_cfa_note_p)
7116 		aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7117 					    sp_offset + GET_MODE_SIZE (mode));
7118 	      else
7119 		RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7120 	    }
7121 
7122 	  regno = regno2;
7123 	}
7124       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7125 	{
7126 	  insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7127 	  need_cfa_note_p = true;
7128 	}
7129       else if (aarch64_sve_mode_p (mode))
7130 	insn = emit_insn (gen_rtx_SET (mem, reg));
7131       else
7132 	insn = emit_move_insn (mem, reg);
7133 
7134       RTX_FRAME_RELATED_P (insn) = frame_related_p;
7135       if (frame_related_p && need_cfa_note_p)
7136 	aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7137     }
7138 }
7139 
7140 /* Emit code to restore the callee registers from register number START
7141    up to and including LIMIT.  Restore from the stack offset START_OFFSET,
7142    skipping any write-back candidates if SKIP_WB is true.  Write the
7143    appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
7144 
7145 static void
7146 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7147 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
7148 {
7149   unsigned regno;
7150   unsigned regno2;
7151   poly_int64 offset;
7152   rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7153 
7154   for (regno = aarch64_next_callee_save (start, limit);
7155        regno <= limit;
7156        regno = aarch64_next_callee_save (regno + 1, limit))
7157     {
7158       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7159       if (cfun->machine->reg_is_wrapped_separately[regno])
7160 	continue;
7161 
7162       rtx reg, mem;
7163 
7164       if (skip_wb
7165 	  && (regno == cfun->machine->frame.wb_candidate1
7166 	      || regno == cfun->machine->frame.wb_candidate2))
7167 	continue;
7168 
7169       machine_mode mode = aarch64_reg_save_mode (regno);
7170       reg = gen_rtx_REG (mode, regno);
7171       offset = start_offset + cfun->machine->frame.reg_offset[regno];
7172       rtx base_rtx = stack_pointer_rtx;
7173       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7174 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7175 					     offset, ptrue);
7176       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7177 
7178       if (!aarch64_sve_mode_p (mode)
7179 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7180 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
7181 	  && known_eq (GET_MODE_SIZE (mode),
7182 		       cfun->machine->frame.reg_offset[regno2]
7183 		       - cfun->machine->frame.reg_offset[regno]))
7184 	{
7185 	  rtx reg2 = gen_rtx_REG (mode, regno2);
7186 	  rtx mem2;
7187 
7188 	  offset += GET_MODE_SIZE (mode);
7189 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7190 	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7191 
7192 	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7193 	  regno = regno2;
7194 	}
7195       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7196 	emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7197       else if (aarch64_sve_mode_p (mode))
7198 	emit_insn (gen_rtx_SET (reg, mem));
7199       else
7200 	emit_move_insn (reg, mem);
7201       if (frame_related_p)
7202 	*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7203     }
7204 }
7205 
7206 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
7207    of MODE.  */
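/* For example, for an 8-byte mode this accepts multiples of 8 in the
   range [-64, 56].  */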
7208 
7209 static inline bool
7210 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7211 {
7212   HOST_WIDE_INT multiple;
7213   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7214 	  && IN_RANGE (multiple, -8, 7));
7215 }
7216 
7217 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7218    of MODE.  */
7219 
7220 static inline bool
7221 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7222 {
7223   HOST_WIDE_INT multiple;
7224   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7225 	  && IN_RANGE (multiple, 0, 63));
7226 }
7227 
7228 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
7229    of MODE.  */
7230 
7231 bool
7232 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7233 {
7234   HOST_WIDE_INT multiple;
7235   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7236 	  && IN_RANGE (multiple, -64, 63));
7237 }
7238 
7239 /* Return true if OFFSET is a signed 9-bit value.  */
7240 
7241 bool
7242 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7243 				       poly_int64 offset)
7244 {
7245   HOST_WIDE_INT const_offset;
7246   return (offset.is_constant (&const_offset)
7247 	  && IN_RANGE (const_offset, -256, 255));
7248 }
7249 
7250 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
7251    of MODE.  */
7252 
7253 static inline bool
7254 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7255 {
7256   HOST_WIDE_INT multiple;
7257   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7258 	  && IN_RANGE (multiple, -256, 255));
7259 }
7260 
7261 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7262    of MODE.  */
7263 
7264 static inline bool
7265 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7266 {
7267   HOST_WIDE_INT multiple;
7268   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7269 	  && IN_RANGE (multiple, 0, 4095));
7270 }
7271 
7272 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
7273 
7274 static sbitmap
7275 aarch64_get_separate_components (void)
7276 {
7277   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7278   bitmap_clear (components);
7279 
7280   /* The registers we need saved to the frame.  */
7281   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7282     if (aarch64_register_saved_on_entry (regno))
7283       {
7284 	/* Punt on saves and restores that use ST1D and LD1D.  We could
7285 	   try to be smarter, but it would involve making sure that the
7286 	   spare predicate register itself is safe to use at the save
7287 	   and restore points.  Also, when a frame pointer is being used,
7288 	   the slots are often out of reach of ST1D and LD1D anyway.  */
7289 	machine_mode mode = aarch64_reg_save_mode (regno);
7290 	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7291 	  continue;
7292 
7293 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7294 
7295 	/* If the register is saved in the first SVE save slot, we use
7296 	   it as a stack probe for -fstack-clash-protection.  */
7297 	if (flag_stack_clash_protection
7298 	    && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7299 	    && known_eq (offset, 0))
7300 	  continue;
7301 
7302 	/* Get the offset relative to the register we'll use.  */
7303 	if (frame_pointer_needed)
7304 	  offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7305 	else
7306 	  offset += crtl->outgoing_args_size;
7307 
7308 	/* Check that we can access the stack slot of the register with one
7309 	   direct load with no adjustments needed.  */
7310 	if (aarch64_sve_mode_p (mode)
7311 	    ? offset_9bit_signed_scaled_p (mode, offset)
7312 	    : offset_12bit_unsigned_scaled_p (mode, offset))
7313 	  bitmap_set_bit (components, regno);
7314       }
7315 
7316   /* Don't mess with the hard frame pointer.  */
7317   if (frame_pointer_needed)
7318     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7319 
7320   /* If the spare predicate register used by big-endian SVE code
7321      is call-preserved, it must be saved in the main prologue
7322      before any saves that use it.  */
7323   if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7324     bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7325 
7326   unsigned reg1 = cfun->machine->frame.wb_candidate1;
7327   unsigned reg2 = cfun->machine->frame.wb_candidate2;
7328   /* If registers have been chosen to be stored/restored with
7329      writeback, don't interfere with them to avoid having to output explicit
7330      stack adjustment instructions.  */
7331   if (reg2 != INVALID_REGNUM)
7332     bitmap_clear_bit (components, reg2);
7333   if (reg1 != INVALID_REGNUM)
7334     bitmap_clear_bit (components, reg1);
7335 
7336   bitmap_clear_bit (components, LR_REGNUM);
7337   bitmap_clear_bit (components, SP_REGNUM);
7338 
7339   return components;
7340 }
7341 
7342 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
7343 
7344 static sbitmap
7345 aarch64_components_for_bb (basic_block bb)
7346 {
7347   bitmap in = DF_LIVE_IN (bb);
7348   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7349   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7350 
7351   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7352   bitmap_clear (components);
7353 
7354   /* Clobbered registers don't generate values in any meaningful sense,
7355      since nothing after the clobber can rely on their value.  And we can't
7356      say that partially-clobbered registers are unconditionally killed,
7357      because whether they're killed or not depends on the mode of the
7358      value they're holding.  Thus partially call-clobbered registers
7359      appear in neither the kill set nor the gen set.
7360 
7361      Check manually for any calls that clobber more of a register than the
7362      current function can.  */
7363   function_abi_aggregator callee_abis;
7364   rtx_insn *insn;
7365   FOR_BB_INSNS (bb, insn)
7366     if (CALL_P (insn))
7367       callee_abis.note_callee_abi (insn_callee_abi (insn));
7368   HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7369 
7370   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
7371   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7372     if (!fixed_regs[regno]
7373 	&& !crtl->abi->clobbers_full_reg_p (regno)
7374 	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7375 	    || bitmap_bit_p (in, regno)
7376 	    || bitmap_bit_p (gen, regno)
7377 	    || bitmap_bit_p (kill, regno)))
7378       {
7379 	bitmap_set_bit (components, regno);
7380 
7381 	/* If there is a callee-save at an adjacent offset, add it too
7382 	   to increase the use of LDP/STP.  */
7383 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7384 	unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
7385 
7386 	if (regno2 <= LAST_SAVED_REGNUM)
7387 	  {
7388 	    poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7389 	    if (regno < regno2
7390 		? known_eq (offset + 8, offset2)
7391 		: multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
7392 	      bitmap_set_bit (components, regno2);
7393 	  }
7394       }
7395 
7396   return components;
7397 }
7398 
7399 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7400    Nothing to do for aarch64.  */
7401 
7402 static void
7403 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7404 {
7405 }
7406 
7407 /* Return the next set bit in BMP from START onwards.  Return the total number
7408    of bits in BMP if no set bit is found at or after START.  */
7409 
7410 static unsigned int
7411 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7412 {
7413   unsigned int nbits = SBITMAP_SIZE (bmp);
7414   if (start == nbits)
7415     return start;
7416 
7417   gcc_assert (start < nbits);
7418   for (unsigned int i = start; i < nbits; i++)
7419     if (bitmap_bit_p (bmp, i))
7420       return i;
7421 
7422   return nbits;
7423 }
7424 
7425 /* Do the work for aarch64_emit_prologue_components and
7426    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
7427    to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
7428    for these components or the epilogue sequence.  That is, it determines
7429    whether we should emit stores or loads and what kind of CFA notes to attach
7430    to the insns.  Otherwise the logic for the two sequences is very
7431    similar.  */
7432 
7433 static void
7434 aarch64_process_components (sbitmap components, bool prologue_p)
7435 {
7436   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7437 			     ? HARD_FRAME_POINTER_REGNUM
7438 			     : STACK_POINTER_REGNUM);
7439 
7440   unsigned last_regno = SBITMAP_SIZE (components);
7441   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7442   rtx_insn *insn = NULL;
7443 
7444   while (regno != last_regno)
7445     {
7446       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7447       machine_mode mode = aarch64_reg_save_mode (regno);
7448 
7449       rtx reg = gen_rtx_REG (mode, regno);
7450       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7451       if (frame_pointer_needed)
7452 	offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7453       else
7454 	offset += crtl->outgoing_args_size;
7455 
7456       rtx addr = plus_constant (Pmode, ptr_reg, offset);
7457       rtx mem = gen_frame_mem (mode, addr);
7458 
7459       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7460       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7461       /* No more registers to handle after REGNO.
7462 	 Emit a single save/restore and exit.  */
7463       if (regno2 == last_regno)
7464 	{
7465 	  insn = emit_insn (set);
7466 	  if (frame_related_p)
7467 	    {
7468 	      RTX_FRAME_RELATED_P (insn) = 1;
7469 	      if (prologue_p)
7470 		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7471 	      else
7472 		add_reg_note (insn, REG_CFA_RESTORE, reg);
7473 	    }
7474 	  break;
7475 	}
7476 
7477       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7478       /* The next register is not of the same class or its offset is not
7479 	 mergeable with the current one into a pair.  */
7480       if (aarch64_sve_mode_p (mode)
7481 	  || !satisfies_constraint_Ump (mem)
7482 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
7483 	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
7484 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7485 		       GET_MODE_SIZE (mode)))
7486 	{
7487 	  insn = emit_insn (set);
7488 	  if (frame_related_p)
7489 	    {
7490 	      RTX_FRAME_RELATED_P (insn) = 1;
7491 	      if (prologue_p)
7492 		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7493 	      else
7494 		add_reg_note (insn, REG_CFA_RESTORE, reg);
7495 	    }
7496 
7497 	  regno = regno2;
7498 	  continue;
7499 	}
7500 
7501       bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7502 
7503       /* REGNO2 can be saved/restored in a pair with REGNO.  */
7504       rtx reg2 = gen_rtx_REG (mode, regno2);
7505       if (frame_pointer_needed)
7506 	offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7507       else
7508 	offset2 += crtl->outgoing_args_size;
7509       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7510       rtx mem2 = gen_frame_mem (mode, addr2);
7511       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7512 			     : gen_rtx_SET (reg2, mem2);
7513 
7514       if (prologue_p)
7515 	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7516       else
7517 	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7518 
7519       if (frame_related_p || frame_related2_p)
7520 	{
7521 	  RTX_FRAME_RELATED_P (insn) = 1;
7522 	  if (prologue_p)
7523 	    {
7524 	      if (frame_related_p)
7525 		add_reg_note (insn, REG_CFA_OFFSET, set);
7526 	      if (frame_related2_p)
7527 		add_reg_note (insn, REG_CFA_OFFSET, set2);
7528 	    }
7529 	  else
7530 	    {
7531 	      if (frame_related_p)
7532 		add_reg_note (insn, REG_CFA_RESTORE, reg);
7533 	      if (frame_related2_p)
7534 		add_reg_note (insn, REG_CFA_RESTORE, reg2);
7535 	    }
7536 	}
7537 
7538       regno = aarch64_get_next_set_bit (components, regno2 + 1);
7539     }
7540 }
7541 
7542 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
7543 
7544 static void
7545 aarch64_emit_prologue_components (sbitmap components)
7546 {
7547   aarch64_process_components (components, true);
7548 }
7549 
7550 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
7551 
7552 static void
7553 aarch64_emit_epilogue_components (sbitmap components)
7554 {
7555   aarch64_process_components (components, false);
7556 }
7557 
7558 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
7559 
7560 static void
7561 aarch64_set_handled_components (sbitmap components)
7562 {
7563   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7564     if (bitmap_bit_p (components, regno))
7565       cfun->machine->reg_is_wrapped_separately[regno] = true;
7566 }
7567 
7568 /* On AArch64 we have an ABI defined safe buffer.  This constant is used to
7569    determine the probe offset for alloca.  */
7570 
7571 static HOST_WIDE_INT
7572 aarch64_stack_clash_protection_alloca_probe_range (void)
7573 {
7574   return STACK_CLASH_CALLER_GUARD;
7575 }
7576 
7577 
7578 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7579    registers.  If POLY_SIZE is not large enough to require a probe this function
7580    will only adjust the stack.  When allocating the stack space
7581    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7582    FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7583    arguments.  If we are, we ensure that any allocation larger than the ABI
7584    defined buffer needs a probe so that the invariant of having a 1KB buffer is
7585    maintained.
7586 
7587    We emit barriers after each stack adjustment to prevent optimizations from
7588    breaking the invariant that we never drop the stack more than a page.  This
7589    invariant is needed to make it easier to correctly handle asynchronous
7590    events: e.g. if we allowed the stack to be dropped by more than a page
7591    and then emitted multiple probes for it, a signal taken somewhere in
7592    between would leave the signal handler not knowing the state of the stack
7593    and unable to assume which pages have been probed.  */
7594 
7595 static void
7596 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7597 					poly_int64 poly_size,
7598 					bool frame_related_p,
7599 					bool final_adjustment_p)
7600 {
7601   HOST_WIDE_INT guard_size
7602     = 1 << param_stack_clash_protection_guard_size;
7603   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7604   HOST_WIDE_INT min_probe_threshold
7605     = (final_adjustment_p
7606        ? guard_used_by_caller
7607        : guard_size - guard_used_by_caller);
7608   /* When doing the final adjustment for the outgoing arguments, take into
7609      account any unprobed space there is above the current SP.  There are
7610      two cases:
7611 
7612      - When saving SVE registers below the hard frame pointer, we force
7613        the lowest save to take place in the prologue before doing the final
7614        adjustment (i.e. we don't allow the save to be shrink-wrapped).
7615        This acts as a probe at SP, so there is no unprobed space.
7616 
7617      - When there are no SVE register saves, we use the store of the link
7618        register as a probe.  We can't assume that LR was saved at position 0
7619        though, so treat any space below it as unprobed.  */
7620   if (final_adjustment_p
7621       && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7622     {
7623       poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7624       if (known_ge (lr_offset, 0))
7625 	min_probe_threshold -= lr_offset.to_constant ();
7626       else
7627 	gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7628     }
7629 
7630   poly_int64 frame_size = cfun->machine->frame.frame_size;
7631 
7632   /* We should always have a positive probe threshold.  */
7633   gcc_assert (min_probe_threshold > 0);
7634 
7635   if (flag_stack_clash_protection && !final_adjustment_p)
7636     {
7637       poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7638       poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7639       poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7640 
7641       if (known_eq (frame_size, 0))
7642 	{
7643 	  dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7644 	}
7645       else if (known_lt (initial_adjust + sve_callee_adjust,
7646 			 guard_size - guard_used_by_caller)
7647 	       && known_lt (final_adjust, guard_used_by_caller))
7648 	{
7649 	  dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7650 	}
7651     }
7652 
7653   /* If SIZE is not large enough to require probing, just adjust the stack and
7654      exit.  */
7655   if (known_lt (poly_size, min_probe_threshold)
7656       || !flag_stack_clash_protection)
7657     {
7658       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7659       return;
7660     }
7661 
7662   HOST_WIDE_INT size;
7663   /* Handle the SVE non-constant case first.  */
7664   if (!poly_size.is_constant (&size))
7665     {
7666       if (dump_file)
7667 	{
7668 	  fprintf (dump_file, "Stack clash SVE prologue: ");
7669 	  print_dec (poly_size, dump_file);
7670 	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7671 	}
7672 
7673       /* First calculate the number of bytes we're actually spilling.  */
7674       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7675 			  poly_size, temp1, temp2, false, true);
7676 
7677       rtx_insn *insn = get_last_insn ();
7678 
7679       if (frame_related_p)
7680 	{
7681 	  /* This is done to provide unwinding information for the stack
7682 	     adjustments we're about to do.  However, to prevent the optimizers
7683 	     from removing the R11 move and leaving the CFA note (which would be
7684 	     very wrong), we tie the old and new stack pointers together.
7685 	     The tie will expand to nothing but the optimizers will not touch
7686 	     the instruction.  */
7687 	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7688 	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7689 	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7690 
7691 	  /* We want the CFA independent of the stack pointer for the
7692 	     duration of the loop.  */
7693 	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7694 	  RTX_FRAME_RELATED_P (insn) = 1;
7695 	}
7696 
7697       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7698       rtx guard_const = gen_int_mode (guard_size, Pmode);
7699 
7700       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7701 						   stack_pointer_rtx, temp1,
7702 						   probe_const, guard_const));
7703 
7704       /* Now reset the CFA register if needed.  */
7705       if (frame_related_p)
7706 	{
7707 	  add_reg_note (insn, REG_CFA_DEF_CFA,
7708 			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7709 				      gen_int_mode (poly_size, Pmode)));
7710 	  RTX_FRAME_RELATED_P (insn) = 1;
7711 	}
7712 
7713       return;
7714     }
7715 
7716   if (dump_file)
7717     fprintf (dump_file,
7718 	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7719 	     " bytes, probing will be required.\n", size);
7720 
7721   /* Round size to the nearest multiple of guard_size, and calculate the
7722      residual as the difference between the original size and the rounded
7723      size.  */
7724   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7725   HOST_WIDE_INT residual = size - rounded_size;
7726 
7727   /* We can handle a small number of allocations/probes inline.  Otherwise
7728      punt to a loop.  */
7729   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7730     {
7731       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7732 	{
7733 	  aarch64_sub_sp (NULL, temp2, guard_size, true);
7734 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7735 					   guard_used_by_caller));
7736 	  emit_insn (gen_blockage ());
7737 	}
7738       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7739     }
7740   else
7741     {
7742       /* Compute the ending address.  */
7743       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7744 			  temp1, NULL, false, true);
7745       rtx_insn *insn = get_last_insn ();
7746 
7747       /* For the initial allocation, we don't have a frame pointer
7748 	 set up, so we always need CFI notes.  If we're doing the
7749 	 final allocation, then we may have a frame pointer, in which
7750 	 case it is the CFA, otherwise we need CFI notes.
7751 
7752 	 We can determine which allocation we are doing by looking at
7753 	 the value of FRAME_RELATED_P since the final allocations are not
7754 	 frame related.  */
7755       if (frame_related_p)
7756 	{
7757 	  /* We want the CFA independent of the stack pointer for the
7758 	     duration of the loop.  */
7759 	  add_reg_note (insn, REG_CFA_DEF_CFA,
7760 			plus_constant (Pmode, temp1, rounded_size));
7761 	  RTX_FRAME_RELATED_P (insn) = 1;
7762 	}
7763 
7764       /* This allocates and probes the stack.  Note that this re-uses some of
7765 	 the existing Ada stack protection code.  However, we are guaranteed not
7766 	 to enter the non-loop or residual branches of that code.
7767 
7768 	 The non-loop part won't be entered because if our allocation amount
7769 	 doesn't require a loop, the case above would handle it.
7770 
7771 	 The residual amount won't be entered because TEMP1 is a multiple of
7772 	 the allocation size.  The residual will always be 0.  As such, the only
7773 	 part we are actually using from that code is the loop setup.  The
7774 	 actual probing is done in aarch64_output_probe_stack_range.  */
7775       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7776 					       stack_pointer_rtx, temp1));
7777 
7778       /* Now reset the CFA register if needed.  */
7779       if (frame_related_p)
7780 	{
7781 	  add_reg_note (insn, REG_CFA_DEF_CFA,
7782 			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7783 	  RTX_FRAME_RELATED_P (insn) = 1;
7784 	}
7785 
7786       emit_insn (gen_blockage ());
7787       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7788     }
7789 
7790   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
7791      be probed.  This maintains the requirement that each page is probed at
7792      least once.  For initial probing we probe only if the allocation is
7793      more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7794      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
7795      GUARD_SIZE.  This ensures that any allocation large enough to trigger
7796      a probe here gets at least one, and any allocation too small for this
7797      code to emit anything will already have had its page probed by the
7798      saving of FP/LR, either by this function or any callees.  If
7799      we don't have any callees then we won't have more stack adjustments and so
7800      are still safe.  */
7801   if (residual)
7802     {
7803       HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7804       /* If we're doing final adjustments, and we've done any full page
7805 	 allocations then any residual needs to be probed.  */
7806       if (final_adjustment_p && rounded_size != 0)
7807 	min_probe_threshold = 0;
7808       /* If doing a small final adjustment, we always probe at offset 0.
7809 	 This is done to avoid issues when LR is not at position 0 or when
7810 	 the final adjustment is smaller than the probing offset.  */
7811       else if (final_adjustment_p && rounded_size == 0)
7812 	residual_probe_offset = 0;
7813 
7814       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7815       if (residual >= min_probe_threshold)
7816 	{
7817 	  if (dump_file)
7818 	    fprintf (dump_file,
7819 		     "Stack clash AArch64 prologue residuals: "
7820 		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7821 		     "\n", residual);
7822 
7823 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7824 					   residual_probe_offset));
7825 	  emit_insn (gen_blockage ());
7826 	}
7827     }
7828 }
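
/* As a purely illustrative example of the splitting above (the 150000-byte
   request is hypothetical, not taken from any real frame): with the default
   64KB guard,

     rounded_size = ROUND_DOWN (150000, 65536) == 131072
     residual     = 150000 - 131072            == 18928

   so two full guard-sized allocations are emitted, each followed by a probe,
   and the 18928-byte residual is handled separately (and is itself probed
   only if it is at least MIN_PROBE_THRESHOLD).  */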
7829 
7830 /* Return 1 if the register is used by the epilogue.  We need to say the
7831    return register is used, but only after epilogue generation is complete.
7832    Note that in the case of sibcalls, the values "used by the epilogue" are
7833    considered live at the start of the called function.
7834 
7835    For SIMD functions we need to return 1 for FP registers that are saved and
7836    restored by a function but are not zero in call_used_regs.  If we do not
7837    do this, optimizations may remove the restore of the register.  */
7838 
7839 int
7840 aarch64_epilogue_uses (int regno)
7841 {
7842   if (epilogue_completed)
7843     {
7844       if (regno == LR_REGNUM)
7845 	return 1;
7846     }
7847   return 0;
7848 }
7849 
7850 /* AArch64 stack frames generated by this compiler look like:
7851 
7852 	+-------------------------------+
7853 	|                               |
7854 	|  incoming stack arguments     |
7855 	|                               |
7856 	+-------------------------------+
7857 	|                               | <-- incoming stack pointer (aligned)
7858 	|  callee-allocated save area   |
7859 	|  for register varargs         |
7860 	|                               |
7861 	+-------------------------------+
7862 	|  local variables              | <-- frame_pointer_rtx
7863 	|                               |
7864 	+-------------------------------+
7865 	|  padding                      | \
7866 	+-------------------------------+  |
7867 	|  callee-saved registers       |  | frame.saved_regs_size
7868 	+-------------------------------+  |
7869 	|  LR'                          |  |
7870 	+-------------------------------+  |
7871 	|  FP'                          |  |
7872 	+-------------------------------+  |<- hard_frame_pointer_rtx (aligned)
7873 	|  SVE vector registers         |  | \
7874 	+-------------------------------+  |  | below_hard_fp_saved_regs_size
7875 	|  SVE predicate registers      | /  /
7876 	+-------------------------------+
7877 	|  dynamic allocation           |
7878 	+-------------------------------+
7879 	|  padding                      |
7880 	+-------------------------------+
7881 	|  outgoing stack arguments     | <-- arg_pointer
7882 	|                               |
7883 	+-------------------------------+
7884 	|                               | <-- stack_pointer_rtx (aligned)
7885 
7886    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7887    but leave frame_pointer_rtx and hard_frame_pointer_rtx
7888    unchanged.
7889 
7890    By default for stack-clash we assume the guard is at least 64KB, but this
7891    value is configurable to either 4KB or 64KB.  We also force the guard size to
7892    be the same as the probing interval and both values are kept in sync.
7893 
7894    With those assumptions the callee can allocate up to 63KB (or 3KB depending
7895    on the guard size) of stack space without probing.
7896 
7897    When probing is needed, we emit a probe at the start of the prologue
7898    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7899 
7900    We have to track how much space has been allocated and the only stores
7901    to the stack we track as implicit probes are the FP/LR stores.
7902 
7903    For outgoing arguments we probe if the size is larger than 1KB, such that
7904    the ABI specified buffer is maintained for the next callee.
7905 
7906    The following registers are reserved during frame layout and should not be
7907    used for any other purpose:
7908 
7909    - r11: Used by stack clash protection when SVE is enabled, and also
7910 	  as an anchor register when saving and restoring registers
7911    - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7912    - r14 and r15: Used for speculation tracking.
7913    - r16(IP0), r17(IP1): Used by indirect tailcalls.
7914    - r30(LR), r29(FP): Used by standard frame layout.
7915 
7916    These registers must be avoided in frame layout related code unless the
7917    explicit intention is to interact with one of the features listed above.  */
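
/* Note that the 63KB and 3KB figures quoted above are simply
   guard_size - STACK_CLASH_CALLER_GUARD for a 1KB caller buffer:
   64KB - 1KB == 63KB and 4KB - 1KB == 3KB.  */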
7918 
7919 /* Generate the prologue instructions for entry into a function.
7920    Establish the stack frame by decreasing the stack pointer with a
7921    properly calculated size and, if necessary, create a frame record
7922    filled with the values of LR and previous frame pointer.  The
7923    current FP is also set up if it is in use.  */
7924 
7925 void
7926 aarch64_expand_prologue (void)
7927 {
7928   poly_int64 frame_size = cfun->machine->frame.frame_size;
7929   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7930   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7931   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7932   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7933   poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7934   poly_int64 below_hard_fp_saved_regs_size
7935     = cfun->machine->frame.below_hard_fp_saved_regs_size;
7936   unsigned reg1 = cfun->machine->frame.wb_candidate1;
7937   unsigned reg2 = cfun->machine->frame.wb_candidate2;
7938   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7939   rtx_insn *insn;
7940 
7941   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7942     {
7943       /* Fold the SVE allocation into the initial allocation.
7944 	 We don't do this in aarch64_layout_frame to avoid pessimizing
7945 	 the epilogue code.  */
7946       initial_adjust += sve_callee_adjust;
7947       sve_callee_adjust = 0;
7948     }
7949 
7950   /* Sign return address for functions.  */
7951   if (aarch64_return_address_signing_enabled ())
7952     {
7953       switch (aarch64_ra_sign_key)
7954 	{
7955 	  case AARCH64_KEY_A:
7956 	    insn = emit_insn (gen_paciasp ());
7957 	    break;
7958 	  case AARCH64_KEY_B:
7959 	    insn = emit_insn (gen_pacibsp ());
7960 	    break;
7961 	  default:
7962 	    gcc_unreachable ();
7963 	}
7964       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7965       RTX_FRAME_RELATED_P (insn) = 1;
7966     }
7967 
7968   if (flag_stack_usage_info)
7969     current_function_static_stack_size = constant_lower_bound (frame_size);
7970 
7971   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7972     {
7973       if (crtl->is_leaf && !cfun->calls_alloca)
7974 	{
7975 	  if (maybe_gt (frame_size, PROBE_INTERVAL)
7976 	      && maybe_gt (frame_size, get_stack_check_protect ()))
7977 	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
7978 					    (frame_size
7979 					     - get_stack_check_protect ()));
7980 	}
7981       else if (maybe_gt (frame_size, 0))
7982 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7983     }
7984 
7985   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7986   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7987 
7988   /* In theory we should never have both an initial adjustment
7989      and a callee save adjustment.  Verify that is the case since the
7990      code below does not handle it for -fstack-clash-protection.  */
7991   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7992 
7993   /* Will only probe if the initial adjustment is larger than the guard
7994      less the amount of the guard reserved for use by the caller's
7995      outgoing args.  */
7996   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7997 					  true, false);
7998 
7999   if (callee_adjust != 0)
8000     aarch64_push_regs (reg1, reg2, callee_adjust);
8001 
8002   /* The offset of the frame chain record (if any) from the current SP.  */
8003   poly_int64 chain_offset = (initial_adjust + callee_adjust
8004 			     - cfun->machine->frame.hard_fp_offset);
8005   gcc_assert (known_ge (chain_offset, 0));
8006 
8007   /* The offset of the bottom of the save area from the current SP.  */
8008   poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8009 
8010   if (emit_frame_chain)
8011     {
8012       if (callee_adjust == 0)
8013 	{
8014 	  reg1 = R29_REGNUM;
8015 	  reg2 = R30_REGNUM;
8016 	  aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8017 				     false, false);
8018 	}
8019       else
8020 	gcc_assert (known_eq (chain_offset, 0));
8021       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
8022 			  stack_pointer_rtx, chain_offset,
8023 			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
8024       if (frame_pointer_needed && !frame_size.is_constant ())
8025 	{
8026 	  /* Variable-sized frames need to describe the save slot
8027 	     address using DW_CFA_expression rather than DW_CFA_offset.
8028 	     This means that, without taking further action, the
8029 	     locations of the registers that we've already saved would
8030 	     remain based on the stack pointer even after we redefine
8031 	     the CFA based on the frame pointer.  We therefore need new
8032 	     DW_CFA_expressions to re-express the save slots with addresses
8033 	     based on the frame pointer.  */
8034 	  rtx_insn *insn = get_last_insn ();
8035 	  gcc_assert (RTX_FRAME_RELATED_P (insn));
8036 
8037 	  /* Add an explicit CFA definition if this was previously
8038 	     implicit.  */
8039 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8040 	    {
8041 	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
8042 				       callee_offset);
8043 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
8044 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
8045 	    }
8046 
8047 	  /* Change the save slot expressions for the registers that
8048 	     we've already saved.  */
8049 	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8050 				      hard_frame_pointer_rtx, UNITS_PER_WORD);
8051 	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8052 				      hard_frame_pointer_rtx, 0);
8053 	}
8054       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
8055     }
8056 
8057   aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8058 			     callee_adjust != 0 || emit_frame_chain,
8059 			     emit_frame_chain);
8060   if (maybe_ne (sve_callee_adjust, 0))
8061     {
8062       gcc_assert (!flag_stack_clash_protection
8063 		  || known_eq (initial_adjust, 0));
8064       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8065 					      sve_callee_adjust,
8066 					      !frame_pointer_needed, false);
8067       saved_regs_offset += sve_callee_adjust;
8068     }
8069   aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8070 			     false, emit_frame_chain);
8071   aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8072 			     callee_adjust != 0 || emit_frame_chain,
8073 			     emit_frame_chain);
8074 
8075   /* We may need to probe the final adjustment if it is larger than the guard
8076      that is assumed by the callee.  */
8077   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
8078 					  !frame_pointer_needed, true);
8079 }
8080 
8081 /* Return TRUE if we can use a simple_return insn.
8082 
8083    This function checks whether the callee saved stack is empty, which
8084    means no restore actions are needed.  The pro_and_epilogue pass will use
8085    this to check whether the shrink-wrapping optimization is feasible.  */
8086 
8087 bool
8088 aarch64_use_return_insn_p (void)
8089 {
8090   if (!reload_completed)
8091     return false;
8092 
8093   if (crtl->profile)
8094     return false;
8095 
8096   return known_eq (cfun->machine->frame.frame_size, 0);
8097 }
8098 
8099 /* Generate the epilogue instructions for returning from a function.
8100    This is almost exactly the reverse of the prologue sequence, except
8101    that we need to insert barriers to avoid scheduling loads that read
8102    from a deallocated stack, and we optimize the unwind records by
8103    emitting them all together if possible.  */
8104 void
8105 aarch64_expand_epilogue (bool for_sibcall)
8106 {
8107   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8108   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8109   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8110   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8111   poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8112   poly_int64 below_hard_fp_saved_regs_size
8113     = cfun->machine->frame.below_hard_fp_saved_regs_size;
8114   unsigned reg1 = cfun->machine->frame.wb_candidate1;
8115   unsigned reg2 = cfun->machine->frame.wb_candidate2;
8116   rtx cfi_ops = NULL;
8117   rtx_insn *insn;
8118   /* A stack clash protection prologue may not have left EP0_REGNUM or
8119      EP1_REGNUM in a usable state.  The same is true for allocations
8120      with an SVE component, since we then need both temporary registers
8121      for each allocation.  For stack clash we are in a usable state if
8122      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
8123   HOST_WIDE_INT guard_size
8124     = 1 << param_stack_clash_protection_guard_size;
8125   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8126 
8127   /* We can re-use the registers when:
8128 
8129      (a) the deallocation amount is the same as the corresponding
8130 	 allocation amount (which is false if we combine the initial
8131 	 and SVE callee save allocations in the prologue); and
8132 
8133      (b) the allocation amount doesn't need a probe (which is false
8134 	 if the amount is guard_size - guard_used_by_caller or greater).
8135 
8136      In such situations the register should remain live with the correct
8137      value.  */
8138   bool can_inherit_p = (initial_adjust.is_constant ()
8139 			&& final_adjust.is_constant ()
8140 			&& (!flag_stack_clash_protection
8141 			    || (known_lt (initial_adjust,
8142 					  guard_size - guard_used_by_caller)
8143 				&& known_eq (sve_callee_adjust, 0))));
8144 
8145   /* We need to add memory barrier to prevent read from deallocated stack.  */
8146   bool need_barrier_p
8147     = maybe_ne (get_frame_size ()
8148 		+ cfun->machine->frame.saved_varargs_size, 0);
8149 
8150   /* Emit a barrier to prevent loads from a deallocated stack.  */
8151   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8152       || cfun->calls_alloca
8153       || crtl->calls_eh_return)
8154     {
8155       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8156       need_barrier_p = false;
8157     }
8158 
8159   /* Restore the stack pointer from the frame pointer if it may not
8160      be the same as the stack pointer.  */
8161   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8162   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8163   if (frame_pointer_needed
8164       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8165     /* If writeback is used when restoring callee-saves, the CFA
8166        is restored on the instruction doing the writeback.  */
8167     aarch64_add_offset (Pmode, stack_pointer_rtx,
8168 			hard_frame_pointer_rtx,
8169 			-callee_offset - below_hard_fp_saved_regs_size,
8170 			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8171   else
8172      /* The case where we need to re-use the register here is very rare, so
8173 	avoid the complicated condition and just always emit a move if the
8174 	immediate doesn't fit.  */
8175      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8176 
8177   /* Restore the vector registers before the predicate registers,
8178      so that we can use P4 as a temporary for big-endian SVE frames.  */
8179   aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8180 				callee_adjust != 0, &cfi_ops);
8181   aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8182 				false, &cfi_ops);
8183   if (maybe_ne (sve_callee_adjust, 0))
8184     aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8185   aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8186 				R0_REGNUM, R30_REGNUM,
8187 				callee_adjust != 0, &cfi_ops);
8188 
8189   if (need_barrier_p)
8190     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8191 
8192   if (callee_adjust != 0)
8193     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8194 
8195   if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
8196     {
8197       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
8198       insn = get_last_insn ();
8199       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8200       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8201       RTX_FRAME_RELATED_P (insn) = 1;
8202       cfi_ops = NULL;
8203     }
8204 
8205   /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
8206      restrict the emit_move optimization to leaf functions.  */
8207   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8208 		  (!can_inherit_p || !crtl->is_leaf
8209 		   || df_regs_ever_live_p (EP0_REGNUM)));
8210 
8211   if (cfi_ops)
8212     {
8213       /* Emit delayed restores and reset the CFA to be SP.  */
8214       insn = get_last_insn ();
8215       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8216       REG_NOTES (insn) = cfi_ops;
8217       RTX_FRAME_RELATED_P (insn) = 1;
8218     }
8219 
8220   /* We prefer to emit the combined return/authenticate instruction RETAA,
8221      however there are three cases in which we must instead emit an explicit
8222      authentication instruction.
8223 
8224 	1) Sibcalls don't return in a normal way, so if we're about to call one
8225 	   we must authenticate.
8226 
8227 	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8228 	   generating code for !TARGET_ARMV8_3 we can't use it and must
8229 	   explicitly authenticate.
8230 
8231 	3) On an eh_return path we make extra stack adjustments to update the
8232 	   canonical frame address to be the exception handler's CFA.  We want
8233 	   to authenticate using the CFA of the function which calls eh_return.
8234     */
8235   if (aarch64_return_address_signing_enabled ()
8236       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8237     {
8238       switch (aarch64_ra_sign_key)
8239 	{
8240 	  case AARCH64_KEY_A:
8241 	    insn = emit_insn (gen_autiasp ());
8242 	    break;
8243 	  case AARCH64_KEY_B:
8244 	    insn = emit_insn (gen_autibsp ());
8245 	    break;
8246 	  default:
8247 	    gcc_unreachable ();
8248 	}
8249       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8250       RTX_FRAME_RELATED_P (insn) = 1;
8251     }
8252 
8253   /* Stack adjustment for exception handler.  */
8254   if (crtl->calls_eh_return && !for_sibcall)
8255     {
8256       /* We need to unwind the stack by the offset computed by
8257 	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
8258 	 to be SP; letting the CFA move during this adjustment
8259 	 is just as correct as retaining the CFA from the body
8260 	 of the function.  Therefore, do nothing special.  */
8261       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8262     }
8263 
8264   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8265   if (!for_sibcall)
8266     emit_jump_insn (ret_rtx);
8267 }
8268 
8269 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
8270    normally or return to a previous frame after unwinding.
8271 
8272    An EH return uses a single shared return sequence.  The epilogue is
8273    exactly like a normal epilogue except that it has an extra input
8274    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8275    that must be applied after the frame has been destroyed.  An extra label
8276    is inserted before the epilogue which initializes this register to zero,
8277    and this is the entry point for a normal return.
8278 
8279    An actual EH return updates the return address, initializes the stack
8280    adjustment and jumps directly into the epilogue (bypassing the zeroing
8281    of the adjustment).  Since the return address is typically saved on the
8282    stack when a function makes a call, the saved LR must be updated outside
8283    the epilogue.
8284 
8285    This poses problems as the store is generated well before the epilogue,
8286    so the offset of LR is not known yet.  Also optimizations will remove the
8287    store as it appears dead, even after the epilogue is generated (as the
8288    base or offset for loading LR is different in many cases).
8289 
8290    To avoid these problems this implementation forces the frame pointer
8291    in eh_return functions so that the location of LR is fixed and known early.
8292    It also marks the store volatile, so no optimization is permitted to
8293    remove the store.  */
8294 rtx
8295 aarch64_eh_return_handler_rtx (void)
8296 {
8297   rtx tmp = gen_frame_mem (Pmode,
8298     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8299 
8300   /* Mark the store volatile, so no optimization is permitted to remove it.  */
8301   MEM_VOLATILE_P (tmp) = true;
8302   return tmp;
8303 }
8304 
8305 /* Output code to add DELTA to the first argument, and then jump
8306    to FUNCTION.  Used for C++ multiple inheritance.  */
8307 static void
8308 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8309 			 HOST_WIDE_INT delta,
8310 			 HOST_WIDE_INT vcall_offset,
8311 			 tree function)
8312 {
8313   /* The this pointer is always in x0.  Note that this differs from
8314      Arm where the this pointer may be bumped to r1 if r0 is required
8315      to return a pointer to an aggregate.  On AArch64 a result value
8316      pointer will be in x8.  */
8317   int this_regno = R0_REGNUM;
8318   rtx this_rtx, temp0, temp1, addr, funexp;
8319   rtx_insn *insn;
8320   const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
8321 
8322   if (aarch64_bti_enabled ())
8323     emit_insn (gen_bti_c ());
8324 
8325   reload_completed = 1;
8326   emit_note (NOTE_INSN_PROLOGUE_END);
8327 
8328   this_rtx = gen_rtx_REG (Pmode, this_regno);
8329   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8330   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
8331 
8332   if (vcall_offset == 0)
8333     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
8334   else
8335     {
8336       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
8337 
8338       addr = this_rtx;
8339       if (delta != 0)
8340 	{
8341 	  if (delta >= -256 && delta < 256)
8342 	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8343 				       plus_constant (Pmode, this_rtx, delta));
8344 	  else
8345 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8346 				temp1, temp0, false);
8347 	}
8348 
8349       if (Pmode == ptr_mode)
8350 	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8351       else
8352 	aarch64_emit_move (temp0,
8353 			   gen_rtx_ZERO_EXTEND (Pmode,
8354 						gen_rtx_MEM (ptr_mode, addr)));
8355 
8356       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
8357 	  addr = plus_constant (Pmode, temp0, vcall_offset);
8358       else
8359 	{
8360 	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8361 					  Pmode);
8362 	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
8363 	}
8364 
8365       if (Pmode == ptr_mode)
8366 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
8367       else
8368 	aarch64_emit_move (temp1,
8369 			   gen_rtx_SIGN_EXTEND (Pmode,
8370 						gen_rtx_MEM (ptr_mode, addr)));
8371 
8372       emit_insn (gen_add2_insn (this_rtx, temp1));
8373     }
8374 
8375   /* Generate a tail call to the target function.  */
8376   if (!TREE_USED (function))
8377     {
8378       assemble_external (function);
8379       TREE_USED (function) = 1;
8380     }
8381   funexp = XEXP (DECL_RTL (function), 0);
8382   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
8383   rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8384   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
8385   SIBLING_CALL_P (insn) = 1;
8386 
8387   insn = get_insns ();
8388   shorten_branches (insn);
8389 
8390   assemble_start_function (thunk, fnname);
8391   final_start_function (insn, file, 1);
8392   final (insn, file, 1);
8393   final_end_function ();
8394   assemble_end_function (thunk, fnname);
8395 
8396   /* Stop pretending to be a post-reload pass.  */
8397   reload_completed = 0;
8398 }
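
/* For illustration only (this is C++ source being compiled, not compiler
   code): thunks like the above are emitted for hierarchies such as

     struct A { virtual void f (); int a; };
     struct B { virtual void g (); int b; };
     struct C : A, B { void g () override; };

   Calling g through a B* that actually points at a C requires adjusting the
   "this" pointer in x0 by a constant DELTA, so that it addresses the
   enclosing C object rather than the B subobject, before tail-calling C::g;
   virtual bases additionally need the VCALL_OFFSET load handled above.  */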
8399 
8400 static bool
8401 aarch64_tls_referenced_p (rtx x)
8402 {
8403   if (!TARGET_HAVE_TLS)
8404     return false;
8405   subrtx_iterator::array_type array;
8406   FOR_EACH_SUBRTX (iter, array, x, ALL)
8407     {
8408       const_rtx x = *iter;
8409       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
8410 	return true;
8411       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8412 	 TLS offsets, not real symbol references.  */
8413       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8414 	iter.skip_subrtxes ();
8415     }
8416   return false;
8417 }
8418 
8419 
8420 /* Return true if val can be encoded as a 12-bit unsigned immediate with
8421    a left shift of 0 or 12 bits.  */
8422 bool
8423 aarch64_uimm12_shift (HOST_WIDE_INT val)
8424 {
8425   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8426 	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8427 	  );
8428 }
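
/* Illustrative examples (not used by the compiler): 0xabc and 0xabc000
   both satisfy the test above (shift 0 and shift 12 respectively, the two
   forms accepted by ADD/SUB immediates), whereas 0x1abc does not, since
   its set bits straddle the two halves.  */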
8429 
8430 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
8431    that can be created with a left shift of 0 or 12.  */
8432 static HOST_WIDE_INT
8433 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8434 {
8435   /* Check to see if the value fits in 24 bits, as that is the maximum we can
8436      handle correctly.  */
8437   gcc_assert ((val & 0xffffff) == val);
8438 
8439   if (((val & 0xfff) << 0) == val)
8440     return val;
8441 
8442   return val & (0xfff << 12);
8443 }
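
/* For example (hypothetical input): 0x123456 fails the low-half test
   (0x123456 & 0xfff == 0x456), so the function returns
   0x123456 & 0xfff000 == 0x123000, keeping only the bits that fit a
   12-bit immediate shifted left by 12.  */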
8444 
8445 /* Return true if val is an immediate that can be loaded into a
8446    register by a MOVZ instruction.  */
8447 static bool
8448 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
8449 {
8450   if (GET_MODE_SIZE (mode) > 4)
8451     {
8452       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8453 	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8454 	return 1;
8455     }
8456   else
8457     {
8458       /* Ignore sign extension.  */
8459       val &= (HOST_WIDE_INT) 0xffffffff;
8460     }
8461   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8462 	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8463 }
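
/* Illustrative values (not exhaustive): 0x12340000 passes the test above
   via the 0xffff << 16 mask and can be built with MOVZ Wd, #0x1234,
   LSL #16, while 0x12340001 fails because it has set bits in more than
   one 16-bit chunk.  */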
8464 
8465 /* Test whether:
8466 
8467      X = (X & AND_VAL) | IOR_VAL;
8468 
8469    can be implemented using:
8470 
8471      MOVK X, #(IOR_VAL >> shift), LSL #shift
8472 
8473    Return the shift if so, otherwise return -1.  */
8474 int
8475 aarch64_movk_shift (const wide_int_ref &and_val,
8476 		    const wide_int_ref &ior_val)
8477 {
8478   unsigned int precision = and_val.get_precision ();
8479   unsigned HOST_WIDE_INT mask = 0xffff;
8480   for (unsigned int shift = 0; shift < precision; shift += 16)
8481     {
8482       if (and_val == ~mask && (ior_val & mask) == ior_val)
8483 	return shift;
8484       mask <<= 16;
8485     }
8486   return -1;
8487 }
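
/* Worked example (hypothetical operands, 64-bit precision):
   and_val == 0xffffffffffff0000 with ior_val == 0x5678 matches at shift 0,
   so X = (X & and_val) | ior_val can be emitted as MOVK Xd, #0x5678;
   and_val == 0xffffffff0000ffff with ior_val == 0x12340000 matches at
   shift 16 instead.  */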
8488 
8489 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
8490    64-bit (DImode) integer.  */
8491 
8492 static unsigned HOST_WIDE_INT
8493 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8494 {
8495   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8496   while (size < 64)
8497     {
8498       val &= (HOST_WIDE_INT_1U << size) - 1;
8499       val |= val << size;
8500       size *= 2;
8501     }
8502   return val;
8503 }
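
/* For instance (illustrative only): replicating the QImode value 0xa5
   gives 0xa5a5a5a5a5a5a5a5 and replicating the HImode value 0x1234 gives
   0x1234123412341234, while DImode values are returned unchanged because
   the loop never executes.  */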
8504 
8505 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
8506 
8507 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8508   {
8509     0x0000000100000001ull,
8510     0x0001000100010001ull,
8511     0x0101010101010101ull,
8512     0x1111111111111111ull,
8513     0x5555555555555555ull,
8514   };
8515 
8516 
8517 /* Return true if val is a valid bitmask immediate.  */
8518 
8519 bool
8520 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
8521 {
8522   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8523   int bits;
8524 
8525   /* Check for a single sequence of one bits and return quickly if so.
8526      The special cases of all ones and all zeroes return false.  */
8527   val = aarch64_replicate_bitmask_imm (val_in, mode);
8528   tmp = val + (val & -val);
8529 
8530   if (tmp == (tmp & -tmp))
8531     return (val + 1) > 1;
8532 
8533   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
8534   if (mode == SImode)
8535     val = (val << 32) | (val & 0xffffffff);
8536 
8537   /* Invert if the immediate doesn't start with a zero bit - this means we
8538      only need to search for sequences of one bits.  */
8539   if (val & 1)
8540     val = ~val;
8541 
8542   /* Find the first set bit and set tmp to val with the first sequence of one
8543      bits removed.  Return success if there is a single sequence of ones.  */
8544   first_one = val & -val;
8545   tmp = val & (val + first_one);
8546 
8547   if (tmp == 0)
8548     return true;
8549 
8550   /* Find the next set bit and compute the difference in bit position.  */
8551   next_one = tmp & -tmp;
8552   bits = clz_hwi (first_one) - clz_hwi (next_one);
8553   mask = val ^ tmp;
8554 
8555   /* Check the bit position difference is a power of 2, and that the first
8556      sequence of one bits fits within 'bits' bits.  */
8557   if ((mask >> bits) != 0 || bits != (bits & -bits))
8558     return false;
8559 
8560   /* Check the sequence of one bits is repeated 64/bits times.  */
8561   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
8562 }
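
/* A few illustrative DImode inputs for the test above: 0x3ff0 is accepted
   by the quick single-run check, and 0x5555555555555555 is accepted as a
   repeating two-bit pattern, whereas 0x5 is rejected because the pattern
   101 is not a repetition of a single run of ones across the whole
   register.  */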
8563 
8564 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
8565    Assumed precondition: VAL_IN is not zero.  */
8566 
8567 unsigned HOST_WIDE_INT
8568 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8569 {
8570   int lowest_bit_set = ctz_hwi (val_in);
8571   int highest_bit_set = floor_log2 (val_in);
8572   gcc_assert (val_in != 0);
8573 
8574   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8575 	  (HOST_WIDE_INT_1U << lowest_bit_set));
8576 }
8577 
8578 /* Create a constant in which all bits outside the span from the lowest set
8579    bit to the highest set bit of VAL_IN are set to 1.  */
8580 
8581 unsigned HOST_WIDE_INT
8582 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8583 {
8584   return val_in | ~aarch64_and_split_imm1 (val_in);
8585 }
8586 
8587 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
8588 
8589 bool
8590 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8591 {
8592   scalar_int_mode int_mode;
8593   if (!is_a <scalar_int_mode> (mode, &int_mode))
8594     return false;
8595 
8596   if (aarch64_bitmask_imm (val_in, int_mode))
8597     return false;
8598 
8599   if (aarch64_move_imm (val_in, int_mode))
8600     return false;
8601 
8602   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8603 
8604   return aarch64_bitmask_imm (imm2, int_mode);
8605 }
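
/* Illustrative example of the split that the three functions above enable
   (the mask is hypothetical): for val_in == 0x00f00f00,

     aarch64_and_split_imm1 gives 0x00ffff00  (the span of the set bits)
     aarch64_and_split_imm2 gives 0xfffffffffff00fff

   and both results are valid bitmask immediates, so x & 0x00f00f00 can be
   expanded as two AND-immediate instructions:
   (x & 0x00ffff00) & 0xfffffffffff00fff.  */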
8606 
8607 /* Return true if val is an immediate that can be loaded into a
8608    register in a single instruction.  */
8609 bool
8610 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8611 {
8612   scalar_int_mode int_mode;
8613   if (!is_a <scalar_int_mode> (mode, &int_mode))
8614     return false;
8615 
8616   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8617     return 1;
8618   return aarch64_bitmask_imm (val, int_mode);
8619 }
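
/* Examples of DImode values accepted above (for illustration):
   0x12340000 (a single MOVZ), 0xffffffffffff1234 (a single MOVN, since
   its complement 0xedcb fits one 16-bit chunk) and 0x5555555555555555
   (a bitmask immediate, loadable with ORR Xd, XZR, #imm).  */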
8620 
8621 static bool
8622 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8623 {
8624   rtx base, offset;
8625 
8626   if (GET_CODE (x) == HIGH)
8627     return true;
8628 
8629   /* There's no way to calculate VL-based values using relocations.  */
8630   subrtx_iterator::array_type array;
8631   FOR_EACH_SUBRTX (iter, array, x, ALL)
8632     if (GET_CODE (*iter) == CONST_POLY_INT)
8633       return true;
8634 
8635   split_const (x, &base, &offset);
8636   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8637     {
8638       if (aarch64_classify_symbol (base, INTVAL (offset))
8639 	  != SYMBOL_FORCE_TO_MEM)
8640 	return true;
8641       else
8642 	/* Avoid generating a 64-bit relocation in ILP32; leave
8643 	   to aarch64_expand_mov_immediate to handle it properly.  */
8644 	return mode != ptr_mode;
8645     }
8646 
8647   return aarch64_tls_referenced_p (x);
8648 }
8649 
8650 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8651    The expansion for a table switch is quite expensive due to the number
8652    of instructions, the table lookup and the hard-to-predict indirect jump.
8653    When optimizing for speed at -O3 or higher, use the per-core tuning if
8654    set, otherwise use tables for > 16 cases as a tradeoff between size and
8655    performance.  When optimizing for size, use the default setting.  */
8656 
8657 static unsigned int
8658 aarch64_case_values_threshold (void)
8659 {
8660   /* Use the specified limit for the number of cases before using jump
8661      tables at higher optimization levels.  */
8662   if (optimize > 2
8663       && selected_cpu->tune->max_case_values != 0)
8664     return selected_cpu->tune->max_case_values;
8665   else
8666     return optimize_size ? default_case_values_threshold () : 17;
8667 }
8668 
8669 /* Return true if register REGNO is a valid index register.
8670    STRICT_P is true if REG_OK_STRICT is in effect.  */
8671 
8672 bool
8673 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8674 {
8675   if (!HARD_REGISTER_NUM_P (regno))
8676     {
8677       if (!strict_p)
8678 	return true;
8679 
8680       if (!reg_renumber)
8681 	return false;
8682 
8683       regno = reg_renumber[regno];
8684     }
8685   return GP_REGNUM_P (regno);
8686 }
8687 
8688 /* Return true if register REGNO is a valid base register.
8689    STRICT_P is true if REG_OK_STRICT is in effect.  */
8690 
8691 bool
8692 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8693 {
8694   if (!HARD_REGISTER_NUM_P (regno))
8695     {
8696       if (!strict_p)
8697 	return true;
8698 
8699       if (!reg_renumber)
8700 	return false;
8701 
8702       regno = reg_renumber[regno];
8703     }
8704 
8705   /* The fake registers will be eliminated to either the stack or
8706      hard frame pointer, both of which are usually valid base registers.
8707      Reload deals with the cases where the eliminated form isn't valid.  */
8708   return (GP_REGNUM_P (regno)
8709 	  || regno == SP_REGNUM
8710 	  || regno == FRAME_POINTER_REGNUM
8711 	  || regno == ARG_POINTER_REGNUM);
8712 }
8713 
8714 /* Return true if X is a valid base register.
8715    STRICT_P is true if REG_OK_STRICT is in effect.  */
8716 
8717 static bool
8718 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8719 {
8720   if (!strict_p
8721       && GET_CODE (x) == SUBREG
8722       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8723     x = SUBREG_REG (x);
8724 
8725   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8726 }
8727 
8728 /* Return true if address offset is a valid index.  If it is, fill in INFO
8729    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
8730 
8731 static bool
8732 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8733 			machine_mode mode, bool strict_p)
8734 {
8735   enum aarch64_address_type type;
8736   rtx index;
8737   int shift;
8738 
8739   /* (reg:P) */
8740   if ((REG_P (x) || GET_CODE (x) == SUBREG)
8741       && GET_MODE (x) == Pmode)
8742     {
8743       type = ADDRESS_REG_REG;
8744       index = x;
8745       shift = 0;
8746     }
8747   /* (sign_extend:DI (reg:SI)) */
8748   else if ((GET_CODE (x) == SIGN_EXTEND
8749 	    || GET_CODE (x) == ZERO_EXTEND)
8750 	   && GET_MODE (x) == DImode
8751 	   && GET_MODE (XEXP (x, 0)) == SImode)
8752     {
8753       type = (GET_CODE (x) == SIGN_EXTEND)
8754 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8755       index = XEXP (x, 0);
8756       shift = 0;
8757     }
8758   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8759   else if (GET_CODE (x) == MULT
8760 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8761 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8762 	   && GET_MODE (XEXP (x, 0)) == DImode
8763 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8764 	   && CONST_INT_P (XEXP (x, 1)))
8765     {
8766       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8767 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8768       index = XEXP (XEXP (x, 0), 0);
8769       shift = exact_log2 (INTVAL (XEXP (x, 1)));
8770     }
8771   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8772   else if (GET_CODE (x) == ASHIFT
8773 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8774 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8775 	   && GET_MODE (XEXP (x, 0)) == DImode
8776 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8777 	   && CONST_INT_P (XEXP (x, 1)))
8778     {
8779       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8780 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8781       index = XEXP (XEXP (x, 0), 0);
8782       shift = INTVAL (XEXP (x, 1));
8783     }
8784   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8785   else if ((GET_CODE (x) == SIGN_EXTRACT
8786 	    || GET_CODE (x) == ZERO_EXTRACT)
8787 	   && GET_MODE (x) == DImode
8788 	   && GET_CODE (XEXP (x, 0)) == MULT
8789 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8790 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8791     {
8792       type = (GET_CODE (x) == SIGN_EXTRACT)
8793 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8794       index = XEXP (XEXP (x, 0), 0);
8795       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8796       if (INTVAL (XEXP (x, 1)) != 32 + shift
8797 	  || INTVAL (XEXP (x, 2)) != 0)
8798 	shift = -1;
8799     }
8800   /* (and:DI (mult:DI (reg:DI) (const_int scale))
8801      (const_int 0xffffffff<<shift)) */
8802   else if (GET_CODE (x) == AND
8803 	   && GET_MODE (x) == DImode
8804 	   && GET_CODE (XEXP (x, 0)) == MULT
8805 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8806 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8807 	   && CONST_INT_P (XEXP (x, 1)))
8808     {
8809       type = ADDRESS_REG_UXTW;
8810       index = XEXP (XEXP (x, 0), 0);
8811       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8812       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8813 	shift = -1;
8814     }
8815   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8816   else if ((GET_CODE (x) == SIGN_EXTRACT
8817 	    || GET_CODE (x) == ZERO_EXTRACT)
8818 	   && GET_MODE (x) == DImode
8819 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
8820 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8821 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8822     {
8823       type = (GET_CODE (x) == SIGN_EXTRACT)
8824 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8825       index = XEXP (XEXP (x, 0), 0);
8826       shift = INTVAL (XEXP (XEXP (x, 0), 1));
8827       if (INTVAL (XEXP (x, 1)) != 32 + shift
8828 	  || INTVAL (XEXP (x, 2)) != 0)
8829 	shift = -1;
8830     }
8831   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8832      (const_int 0xffffffff<<shift)) */
8833   else if (GET_CODE (x) == AND
8834 	   && GET_MODE (x) == DImode
8835 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
8836 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8837 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8838 	   && CONST_INT_P (XEXP (x, 1)))
8839     {
8840       type = ADDRESS_REG_UXTW;
8841       index = XEXP (XEXP (x, 0), 0);
8842       shift = INTVAL (XEXP (XEXP (x, 0), 1));
8843       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8844 	shift = -1;
8845     }
8846   /* (mult:P (reg:P) (const_int scale)) */
8847   else if (GET_CODE (x) == MULT
8848 	   && GET_MODE (x) == Pmode
8849 	   && GET_MODE (XEXP (x, 0)) == Pmode
8850 	   && CONST_INT_P (XEXP (x, 1)))
8851     {
8852       type = ADDRESS_REG_REG;
8853       index = XEXP (x, 0);
8854       shift = exact_log2 (INTVAL (XEXP (x, 1)));
8855     }
8856   /* (ashift:P (reg:P) (const_int shift)) */
8857   else if (GET_CODE (x) == ASHIFT
8858 	   && GET_MODE (x) == Pmode
8859 	   && GET_MODE (XEXP (x, 0)) == Pmode
8860 	   && CONST_INT_P (XEXP (x, 1)))
8861     {
8862       type = ADDRESS_REG_REG;
8863       index = XEXP (x, 0);
8864       shift = INTVAL (XEXP (x, 1));
8865     }
8866   else
8867     return false;
8868 
8869   if (!strict_p
8870       && GET_CODE (index) == SUBREG
8871       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8872     index = SUBREG_REG (index);
8873 
8874   if (aarch64_sve_data_mode_p (mode))
8875     {
8876       if (type != ADDRESS_REG_REG
8877 	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8878 	return false;
8879     }
8880   else
8881     {
8882       if (shift != 0
8883 	  && !(IN_RANGE (shift, 1, 3)
8884 	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8885 	return false;
8886     }
8887 
8888   if (REG_P (index)
8889       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8890     {
8891       info->type = type;
8892       info->offset = index;
8893       info->shift = shift;
8894       return true;
8895     }
8896 
8897   return false;
8898 }
8899 
8900 /* Return true if MODE is one of the modes for which we
8901    support LDP/STP operations.  */
8902 
8903 static bool
8904 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8905 {
8906   return mode == SImode || mode == DImode
8907 	 || mode == SFmode || mode == DFmode
8908 	 || (aarch64_vector_mode_supported_p (mode)
8909 	     && (known_eq (GET_MODE_SIZE (mode), 8)
8910 		 || (known_eq (GET_MODE_SIZE (mode), 16)
8911 		    && (aarch64_tune_params.extra_tuning_flags
8912 			& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8913 }
8914 
8915 /* Return true if REGNO is a virtual pointer register, or an eliminable
8916    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
8917    include stack_pointer or hard_frame_pointer.  */
8918 static bool
8919 virt_or_elim_regno_p (unsigned regno)
8920 {
8921   return ((regno >= FIRST_VIRTUAL_REGISTER
8922 	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8923 	  || regno == FRAME_POINTER_REGNUM
8924 	  || regno == ARG_POINTER_REGNUM);
8925 }
8926 
8927 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8928    If it is, fill in INFO appropriately.  STRICT_P is true if
8929    REG_OK_STRICT is in effect.  */
8930 
8931 bool
8932 aarch64_classify_address (struct aarch64_address_info *info,
8933 			  rtx x, machine_mode mode, bool strict_p,
8934 			  aarch64_addr_query_type type)
8935 {
8936   enum rtx_code code = GET_CODE (x);
8937   rtx op0, op1;
8938   poly_int64 offset;
8939 
8940   HOST_WIDE_INT const_size;
8941 
8942   /* Whether a vector mode is partial doesn't affect address legitimacy.
8943      Partial vectors like VNx8QImode allow the same indexed addressing
8944      mode and MUL VL addressing mode as full vectors like VNx16QImode;
8945      in both cases, MUL VL counts multiples of GET_MODE_SIZE.  */
8946   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8947   vec_flags &= ~VEC_PARTIAL;
8948 
8949   /* On BE, we use load/store pair for all large int mode load/stores.
8950      TI/TFmode may also use a load/store pair.  */
8951   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8952   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8953 			    || type == ADDR_QUERY_LDP_STP_N
8954 			    || mode == TImode
8955 			    || mode == TFmode
8956 			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8957 
8958   /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
8959      corresponds to the actual size of the memory being loaded/stored and the
8960      mode of the corresponding addressing mode is half of that.  */
8961   if (type == ADDR_QUERY_LDP_STP_N
8962       && known_eq (GET_MODE_SIZE (mode), 16))
8963     mode = DFmode;
8964 
8965   bool allow_reg_index_p = (!load_store_pair_p
8966 			    && (known_lt (GET_MODE_SIZE (mode), 16)
8967 				|| vec_flags == VEC_ADVSIMD
8968 				|| vec_flags & VEC_SVE_DATA));
8969 
8970   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8971      [Rn, #offset, MUL VL].  */
8972   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8973       && (code != REG && code != PLUS))
8974     return false;
8975 
8976   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8977      REG addressing.  */
8978   if (advsimd_struct_p
8979       && !BYTES_BIG_ENDIAN
8980       && (code != POST_INC && code != REG))
8981     return false;
8982 
8983   gcc_checking_assert (GET_MODE (x) == VOIDmode
8984 		       || SCALAR_INT_MODE_P (GET_MODE (x)));
8985 
8986   switch (code)
8987     {
8988     case REG:
8989     case SUBREG:
8990       info->type = ADDRESS_REG_IMM;
8991       info->base = x;
8992       info->offset = const0_rtx;
8993       info->const_offset = 0;
8994       return aarch64_base_register_rtx_p (x, strict_p);
8995 
8996     case PLUS:
8997       op0 = XEXP (x, 0);
8998       op1 = XEXP (x, 1);
8999 
9000       if (! strict_p
9001 	  && REG_P (op0)
9002 	  && virt_or_elim_regno_p (REGNO (op0))
9003 	  && poly_int_rtx_p (op1, &offset))
9004 	{
9005 	  info->type = ADDRESS_REG_IMM;
9006 	  info->base = op0;
9007 	  info->offset = op1;
9008 	  info->const_offset = offset;
9009 
9010 	  return true;
9011 	}
9012 
9013       if (maybe_ne (GET_MODE_SIZE (mode), 0)
9014 	  && aarch64_base_register_rtx_p (op0, strict_p)
9015 	  && poly_int_rtx_p (op1, &offset))
9016 	{
9017 	  info->type = ADDRESS_REG_IMM;
9018 	  info->base = op0;
9019 	  info->offset = op1;
9020 	  info->const_offset = offset;
9021 
9022 	  /* TImode and TFmode values are allowed in both pairs of X
9023 	     registers and individual Q registers.  The available
9024 	     address modes are:
9025 	     X,X: 7-bit signed scaled offset
9026 	     Q:   9-bit signed offset
9027 	     We conservatively require an offset representable in either mode.
9028 	     When performing the check for pairs of X registers i.e.  LDP/STP
9029 	     pass down DImode since that is the natural size of the LDP/STP
9030 	     instruction memory accesses.  */
9031 	  if (mode == TImode || mode == TFmode)
9032 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9033 		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9034 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
9035 
9036 	  /* A 7-bit offset check because OImode will emit an ldp/stp
9037 	     instruction (only big endian will get here).
9038 	     For ldp/stp instructions, the offset is scaled for the size of a
9039 	     single element of the pair.  */
9040 	  if (mode == OImode)
9041 	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9042 
9043 	  /* Three 9/12-bit offset checks because CImode will emit three
9044 	     ldr/str instructions (only big endian will get here).  */
9045 	  if (mode == CImode)
9046 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9047 		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9048 							       offset + 32)
9049 			|| offset_12bit_unsigned_scaled_p (V16QImode,
9050 							   offset + 32)));
9051 
9052 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
9053 	     instructions (only big endian will get here).  */
9054 	  if (mode == XImode)
9055 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9056 		    && aarch64_offset_7bit_signed_scaled_p (TImode,
9057 							    offset + 32));
9058 
9059 	  /* Make "m" use the LD1 offset range for SVE data modes, so
9060 	     that pre-RTL optimizers like ivopts will work to that
9061 	     instead of the wider LDR/STR range.  */
9062 	  if (vec_flags == VEC_SVE_DATA)
9063 	    return (type == ADDR_QUERY_M
9064 		    ? offset_4bit_signed_scaled_p (mode, offset)
9065 		    : offset_9bit_signed_scaled_p (mode, offset));
9066 
9067 	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9068 	    {
9069 	      poly_int64 end_offset = (offset
9070 				       + GET_MODE_SIZE (mode)
9071 				       - BYTES_PER_SVE_VECTOR);
9072 	      return (type == ADDR_QUERY_M
9073 		      ? offset_4bit_signed_scaled_p (mode, offset)
9074 		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9075 			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9076 							 end_offset)));
9077 	    }
9078 
9079 	  if (vec_flags == VEC_SVE_PRED)
9080 	    return offset_9bit_signed_scaled_p (mode, offset);
9081 
9082 	  if (load_store_pair_p)
9083 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
9084 		     || known_eq (GET_MODE_SIZE (mode), 8)
9085 		     || known_eq (GET_MODE_SIZE (mode), 16))
9086 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9087 	  else
9088 	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9089 		    || offset_12bit_unsigned_scaled_p (mode, offset));
9090 	}
9091 
9092       if (allow_reg_index_p)
9093 	{
9094 	  /* Look for base + (scaled/extended) index register.  */
9095 	  if (aarch64_base_register_rtx_p (op0, strict_p)
9096 	      && aarch64_classify_index (info, op1, mode, strict_p))
9097 	    {
9098 	      info->base = op0;
9099 	      return true;
9100 	    }
9101 	  if (aarch64_base_register_rtx_p (op1, strict_p)
9102 	      && aarch64_classify_index (info, op0, mode, strict_p))
9103 	    {
9104 	      info->base = op1;
9105 	      return true;
9106 	    }
9107 	}
9108 
9109       return false;
9110 
9111     case POST_INC:
9112     case POST_DEC:
9113     case PRE_INC:
9114     case PRE_DEC:
9115       info->type = ADDRESS_REG_WB;
9116       info->base = XEXP (x, 0);
9117       info->offset = NULL_RTX;
9118       return aarch64_base_register_rtx_p (info->base, strict_p);
9119 
9120     case POST_MODIFY:
9121     case PRE_MODIFY:
9122       info->type = ADDRESS_REG_WB;
9123       info->base = XEXP (x, 0);
9124       if (GET_CODE (XEXP (x, 1)) == PLUS
9125 	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9126 	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9127 	  && aarch64_base_register_rtx_p (info->base, strict_p))
9128 	{
9129 	  info->offset = XEXP (XEXP (x, 1), 1);
9130 	  info->const_offset = offset;
9131 
9132 	  /* TImode and TFmode values are allowed in both pairs of X
9133 	     registers and individual Q registers.  The available
9134 	     address modes are:
9135 	     X,X: 7-bit signed scaled offset
9136 	     Q:   9-bit signed offset
9137 	     We conservatively require an offset representable in either mode.
9138 	   */
9139 	  if (mode == TImode || mode == TFmode)
9140 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9141 		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9142 
9143 	  if (load_store_pair_p)
9144 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
9145 		     || known_eq (GET_MODE_SIZE (mode), 8)
9146 		     || known_eq (GET_MODE_SIZE (mode), 16))
9147 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9148 	  else
9149 	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9150 	}
9151       return false;
9152 
9153     case CONST:
9154     case SYMBOL_REF:
9155     case LABEL_REF:
9156       /* load literal: pc-relative constant pool entry.  Only supported
9157          for SI mode or larger.  */
9158       info->type = ADDRESS_SYMBOLIC;
9159 
9160       if (!load_store_pair_p
9161 	  && GET_MODE_SIZE (mode).is_constant (&const_size)
9162 	  && const_size >= 4)
9163 	{
9164 	  rtx sym, addend;
9165 
9166 	  split_const (x, &sym, &addend);
9167 	  return ((GET_CODE (sym) == LABEL_REF
9168 		   || (GET_CODE (sym) == SYMBOL_REF
9169 		       && CONSTANT_POOL_ADDRESS_P (sym)
9170 		       && aarch64_pcrelative_literal_loads)));
9171 	}
9172       return false;
9173 
9174     case LO_SUM:
9175       info->type = ADDRESS_LO_SUM;
9176       info->base = XEXP (x, 0);
9177       info->offset = XEXP (x, 1);
9178       if (allow_reg_index_p
9179 	  && aarch64_base_register_rtx_p (info->base, strict_p))
9180 	{
9181 	  rtx sym, offs;
9182 	  split_const (info->offset, &sym, &offs);
9183 	  if (GET_CODE (sym) == SYMBOL_REF
9184 	      && (aarch64_classify_symbol (sym, INTVAL (offs))
9185 		  == SYMBOL_SMALL_ABSOLUTE))
9186 	    {
9187 	      /* The symbol and offset must be aligned to the access size.  */
9188 	      unsigned int align;
9189 
9190 	      if (CONSTANT_POOL_ADDRESS_P (sym))
9191 		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9192 	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9193 		{
9194 		  tree exp = SYMBOL_REF_DECL (sym);
9195 		  align = TYPE_ALIGN (TREE_TYPE (exp));
9196 		  align = aarch64_constant_alignment (exp, align);
9197 		}
9198 	      else if (SYMBOL_REF_DECL (sym))
9199 		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9200 	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9201 		       && SYMBOL_REF_BLOCK (sym) != NULL)
9202 		align = SYMBOL_REF_BLOCK (sym)->alignment;
9203 	      else
9204 		align = BITS_PER_UNIT;
9205 
9206 	      poly_int64 ref_size = GET_MODE_SIZE (mode);
9207 	      if (known_eq (ref_size, 0))
9208 		ref_size = GET_MODE_SIZE (DImode);
9209 
9210 	      return (multiple_p (INTVAL (offs), ref_size)
9211 		      && multiple_p (align / BITS_PER_UNIT, ref_size));
9212 	    }
9213 	}
9214       return false;
9215 
9216     default:
9217       return false;
9218     }
9219 }
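
/* A few illustrative classifications (a sketch, not an exhaustive list):

     (reg:DI x0)				ADDRESS_REG_IMM, offset 0
     (plus:DI (reg:DI x0) (const_int 16))	ADDRESS_REG_IMM
     (plus:DI (reg:DI x0)
	      (ashift:DI (reg:DI x1) (const_int 3)))
						ADDRESS_REG_REG
     (post_inc:DI (reg:DI x0))			ADDRESS_REG_WB
     (lo_sum:DI (reg:DI x0) (symbol_ref "x"))	ADDRESS_LO_SUM
     constant-pool SYMBOL_REF / LABEL_REF	ADDRESS_SYMBOLIC  */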
9220 
9221 /* Return true if the address X is valid for a PRFM instruction.
9222    STRICT_P is true if we should do strict checking with
9223    aarch64_classify_address.  */
9224 
9225 bool
9226 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9227 {
9228   struct aarch64_address_info addr;
9229 
9230   /* PRFM accepts the same addresses as DImode...  */
9231   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9232   if (!res)
9233     return false;
9234 
9235   /* ... except writeback forms.  */
9236   return addr.type != ADDRESS_REG_WB;
9237 }
9238 
9239 bool
9240 aarch64_symbolic_address_p (rtx x)
9241 {
9242   rtx offset;
9243 
9244   split_const (x, &x, &offset);
9245   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
9246 }
9247 
9248 /* Classify the base of symbolic expression X.  */
9249 
9250 enum aarch64_symbol_type
9251 aarch64_classify_symbolic_expression (rtx x)
9252 {
9253   rtx offset;
9254 
9255   split_const (x, &x, &offset);
9256   return aarch64_classify_symbol (x, INTVAL (offset));
9257 }
9258 
9259 
9260 /* Return TRUE if X is a legitimate address for accessing memory in
9261    mode MODE.  */
9262 static bool
9263 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9264 {
9265   struct aarch64_address_info addr;
9266 
9267   return aarch64_classify_address (&addr, x, mode, strict_p);
9268 }
9269 
9270 /* Return TRUE if X is a legitimate address of type TYPE for accessing
9271    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
9272 bool
9273 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9274 			      aarch64_addr_query_type type)
9275 {
9276   struct aarch64_address_info addr;
9277 
9278   return aarch64_classify_address (&addr, x, mode, strict_p, type);
9279 }
9280 
9281 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
9282 
9283 static bool
9284 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9285 					 poly_int64 orig_offset,
9286 					 machine_mode mode)
9287 {
9288   HOST_WIDE_INT size;
9289   if (GET_MODE_SIZE (mode).is_constant (&size))
9290     {
9291       HOST_WIDE_INT const_offset, second_offset;
9292 
9293       /* A general SVE offset is A * VQ + B.  Remove the A component from
9294 	 coefficient 0 in order to get the constant B.  */
9295       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9296 
9297       /* Split an out-of-range address displacement into a base and
9298 	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
9299 	 range otherwise to increase opportunities for sharing the base
9300 	 address of different sizes.  Unaligned accesses use the signed
9301 	 9-bit range, TImode/TFmode use the intersection of signed
9302 	 scaled 7-bit and signed 9-bit offset.  */
9303       if (mode == TImode || mode == TFmode)
9304 	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9305       else if ((const_offset & (size - 1)) != 0)
9306 	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9307       else
9308 	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
9309 
9310       if (second_offset == 0 || known_eq (orig_offset, second_offset))
9311 	return false;
9312 
9313       /* Split the offset into second_offset and the rest.  */
9314       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9315       *offset2 = gen_int_mode (second_offset, Pmode);
9316       return true;
9317     }
9318   else
9319     {
9320       /* Get the mode we should use as the basis of the range.  For structure
9321 	 modes this is the mode of one vector.  */
9322       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9323       machine_mode step_mode
9324 	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9325 
9326       /* Get the "mul vl" multiplier we'd like to use.  */
9327       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9328       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9329       if (vec_flags & VEC_SVE_DATA)
9330 	/* LDR supports a 9-bit range, but the move patterns for
9331 	   structure modes require all vectors to be in range of the
9332 	   same base.  The simplest way of accommodating that while still
9333 	   promoting reuse of anchor points between different modes is
9334 	   to use an 8-bit range unconditionally.  */
9335 	vnum = ((vnum + 128) & 255) - 128;
9336       else
9337 	/* Predicates are only handled singly, so we might as well use
9338 	   the full range.  */
9339 	vnum = ((vnum + 256) & 511) - 256;
9340       if (vnum == 0)
9341 	return false;
9342 
9343       /* Convert the "mul vl" multiplier into a byte offset.  */
9344       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9345       if (known_eq (second_offset, orig_offset))
9346 	return false;
9347 
9348       /* Split the offset into second_offset and the rest.  */
9349       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9350       *offset2 = gen_int_mode (second_offset, Pmode);
9351       return true;
9352     }
9353 }
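
/* Worked example for the constant-size path above (illustrative numbers):
   an SImode access at offset 0x4004 is just outside the 12-bit scaled
   range, so the hook splits it as

     second_offset = 0x4004 & 0x3ffc = 0x4
     *offset1 = 0x4000, *offset2 = 0x4

   which lets nearby accesses share the 0x4000 anchor.  */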
9354 
9355 /* Return the binary representation of floating point constant VALUE in INTVAL.
9356    If the value cannot be converted, return false without setting INTVAL.
9357    The conversion is done in the given MODE.  */
9358 bool
9359 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9360 {
9361 
9362   /* We make a general exception for 0.  */
9363   if (aarch64_float_const_zero_rtx_p (value))
9364     {
9365       *intval = 0;
9366       return true;
9367     }
9368 
9369   scalar_float_mode mode;
9370   if (GET_CODE (value) != CONST_DOUBLE
9371       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
9372       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9373       /* Only support up to DF mode.  */
9374       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9375     return false;
9376 
9377   unsigned HOST_WIDE_INT ival = 0;
9378 
9379   long res[2];
9380   real_to_target (res,
9381 		  CONST_DOUBLE_REAL_VALUE (value),
9382 		  REAL_MODE_FORMAT (mode));
9383 
9384   if (mode == DFmode)
9385     {
9386       int order = BYTES_BIG_ENDIAN ? 1 : 0;
9387       ival = zext_hwi (res[order], 32);
9388       ival |= (zext_hwi (res[1 - order], 32) << 32);
9389     }
9390   else
9391       ival = zext_hwi (res[0], 32);
9392 
9393   *intval = ival;
9394   return true;
9395 }
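
/* For instance (illustrative values), (const_double:DF 1.0) yields
   0x3ff0000000000000 and (const_double:SF 1.0) yields 0x3f800000,
   while anything wider than DFmode (e.g. TFmode) is rejected.  */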
9396 
9397 /* Return TRUE if rtx X is an immediate constant that can be moved using a
9398    single MOV(+MOVK) followed by an FMOV.  */
9399 bool
9400 aarch64_float_const_rtx_p (rtx x)
9401 {
9402   machine_mode mode = GET_MODE (x);
9403   if (mode == VOIDmode)
9404     return false;
9405 
9406   /* Determine whether it's cheaper to materialize float constants as
9407      mov/movk pairs rather than as adrp/ldr literal loads.  */
9408   unsigned HOST_WIDE_INT ival;
9409 
9410   if (GET_CODE (x) == CONST_DOUBLE
9411       && SCALAR_FLOAT_MODE_P (mode)
9412       && aarch64_reinterpret_float_as_int (x, &ival))
9413     {
9414       scalar_int_mode imode = (mode == HFmode
9415 			       ? SImode
9416 			       : int_mode_for_mode (mode).require ());
9417       int num_instr = aarch64_internal_mov_immediate
9418 			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
9419       return num_instr < 3;
9420     }
9421 
9422   return false;
9423 }
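
/* E.g. (const_double:DF 256.0) has the bit pattern 0x4070000000000000,
   which a single MOVZ can materialize, so MOV+FMOV is preferred over a
   literal-pool load (an illustrative case; the exact cut-off comes from
   aarch64_internal_mov_immediate).  */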
9424 
9425 /* Return TRUE if rtx X is immediate constant 0.0 */
9426 bool
9427 aarch64_float_const_zero_rtx_p (rtx x)
9428 {
9429   if (GET_MODE (x) == VOIDmode)
9430     return false;
9431 
9432   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
9433     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
9434   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
9435 }
9436 
9437 /* Return TRUE if rtx X is immediate constant that fits in a single
9438    MOVI immediate operation.  */
9439 bool
9440 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9441 {
9442   if (!TARGET_SIMD)
9443      return false;
9444 
9445   machine_mode vmode;
9446   scalar_int_mode imode;
9447   unsigned HOST_WIDE_INT ival;
9448 
9449   if (GET_CODE (x) == CONST_DOUBLE
9450       && SCALAR_FLOAT_MODE_P (mode))
9451     {
9452       if (!aarch64_reinterpret_float_as_int (x, &ival))
9453 	return false;
9454 
9455       /* We make a general exception for 0.  */
9456       if (aarch64_float_const_zero_rtx_p (x))
9457 	return true;
9458 
9459       imode = int_mode_for_mode (mode).require ();
9460     }
9461   else if (GET_CODE (x) == CONST_INT
9462 	   && is_a <scalar_int_mode> (mode, &imode))
9463     ival = INTVAL (x);
9464   else
9465     return false;
9466 
9467   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
9468      a 128-bit vector mode.  */
9469   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
9470 
9471   vmode = aarch64_simd_container_mode (imode, width);
9472   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9473 
9474   return aarch64_simd_valid_immediate (v_op, NULL);
9475 }
9476 
9477 
9478 /* Return the fixed registers used for condition codes.  */
9479 
9480 static bool
9481 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9482 {
9483   *p1 = CC_REGNUM;
9484   *p2 = INVALID_REGNUM;
9485   return true;
9486 }
9487 
9488 /* This function is used by the call expanders of the machine description.
9489    RESULT is the register in which the result is returned.  It's NULL for
9490    "call" and "sibcall".
9491    MEM is the location of the function call.
9492    CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
9493    SIBCALL indicates whether this function call is a normal call or a sibling
9494    call, and a different pattern is generated accordingly.  */
9495 
9496 void
9497 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
9498 {
9499   rtx call, callee, tmp;
9500   rtvec vec;
9501   machine_mode mode;
9502 
9503   gcc_assert (MEM_P (mem));
9504   callee = XEXP (mem, 0);
9505   mode = GET_MODE (callee);
9506   gcc_assert (mode == Pmode);
9507 
9508   /* Decide if we should generate indirect calls by loading the
9509      address of the callee into a register before performing
9510      the branch-and-link.  */
9511   if (SYMBOL_REF_P (callee)
9512       ? (aarch64_is_long_call_p (callee)
9513 	 || aarch64_is_noplt_call_p (callee))
9514       : !REG_P (callee))
9515     XEXP (mem, 0) = force_reg (mode, callee);
9516 
9517   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9518 
9519   if (result != NULL_RTX)
9520     call = gen_rtx_SET (result, call);
9521 
9522   if (sibcall)
9523     tmp = ret_rtx;
9524   else
9525     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9526 
9527   gcc_assert (CONST_INT_P (callee_abi));
9528   callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9529 			       UNSPEC_CALLEE_ABI);
9530 
9531   vec = gen_rtvec (3, call, callee_abi, tmp);
9532   call = gen_rtx_PARALLEL (VOIDmode, vec);
9533 
9534   aarch64_emit_call_insn (call);
9535 }
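
/* The emitted pattern has the rough shape (a sketch only):

     (parallel [(call (mem:DI ...) (const_int 0))
		(unspec:DI [(const_int abi)] UNSPEC_CALLEE_ABI)
		(clobber (reg LR_REGNUM))])

   wrapped in a SET of RESULT for value-returning calls, and with
   (return) in place of the LR clobber for sibcalls.  */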
9536 
9537 /* Emit call insn with PAT and do aarch64-specific handling.  */
9538 
9539 void
9540 aarch64_emit_call_insn (rtx pat)
9541 {
9542   rtx insn = emit_call_insn (pat);
9543 
9544   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9545   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9546   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9547 }
9548 
9549 machine_mode
9550 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9551 {
9552   machine_mode mode_x = GET_MODE (x);
9553   rtx_code code_x = GET_CODE (x);
9554 
9555   /* All floating point compares return CCFP if it is an equality
9556      comparison, and CCFPE otherwise.  */
9557   if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9558     {
9559       switch (code)
9560 	{
9561 	case EQ:
9562 	case NE:
9563 	case UNORDERED:
9564 	case ORDERED:
9565 	case UNLT:
9566 	case UNLE:
9567 	case UNGT:
9568 	case UNGE:
9569 	case UNEQ:
9570 	  return CCFPmode;
9571 
9572 	case LT:
9573 	case LE:
9574 	case GT:
9575 	case GE:
9576 	case LTGT:
9577 	  return CCFPEmode;
9578 
9579 	default:
9580 	  gcc_unreachable ();
9581 	}
9582     }
9583 
9584   /* Equality comparisons of short modes against zero can be performed
9585      using the TST instruction with the appropriate bitmask.  */
9586   if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9587       && (code == EQ || code == NE)
9588       && (mode_x == HImode || mode_x == QImode))
9589     return CC_NZmode;
9590 
9591   /* Similarly, comparisons of zero_extends from shorter modes can
9592      be performed using an ANDS with an immediate mask.  */
9593   if (y == const0_rtx && code_x == ZERO_EXTEND
9594       && (mode_x == SImode || mode_x == DImode)
9595       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9596       && (code == EQ || code == NE))
9597     return CC_NZmode;
9598 
9599   if ((mode_x == SImode || mode_x == DImode)
9600       && y == const0_rtx
9601       && (code == EQ || code == NE || code == LT || code == GE)
9602       && (code_x == PLUS || code_x == MINUS || code_x == AND
9603 	  || code_x == NEG
9604 	  || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9605 	      && CONST_INT_P (XEXP (x, 2)))))
9606     return CC_NZmode;
9607 
9608   /* A compare with a shifted operand.  Because of canonicalization,
9609      the comparison will have to be swapped when we emit the assembly
9610      code.  */
9611   if ((mode_x == SImode || mode_x == DImode)
9612       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9613       && (code_x == ASHIFT || code_x == ASHIFTRT
9614 	  || code_x == LSHIFTRT
9615 	  || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9616     return CC_SWPmode;
9617 
9618   /* Similarly for a negated operand, but we can only do this for
9619      equalities.  */
9620   if ((mode_x == SImode || mode_x == DImode)
9621       && (REG_P (y) || GET_CODE (y) == SUBREG)
9622       && (code == EQ || code == NE)
9623       && code_x == NEG)
9624     return CC_Zmode;
9625 
9626   /* A test for unsigned overflow from an addition.  */
9627   if ((mode_x == DImode || mode_x == TImode)
9628       && (code == LTU || code == GEU)
9629       && code_x == PLUS
9630       && rtx_equal_p (XEXP (x, 0), y))
9631     return CC_Cmode;
9632 
9633   /* A test for unsigned overflow from an add with carry.  */
9634   if ((mode_x == DImode || mode_x == TImode)
9635       && (code == LTU || code == GEU)
9636       && code_x == PLUS
9637       && CONST_SCALAR_INT_P (y)
9638       && (rtx_mode_t (y, mode_x)
9639 	  == (wi::shwi (1, mode_x)
9640 	      << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9641     return CC_ADCmode;
9642 
9643   /* A test for signed overflow.  */
9644   if ((mode_x == DImode || mode_x == TImode)
9645       && code == NE
9646       && code_x == PLUS
9647       && GET_CODE (y) == SIGN_EXTEND)
9648     return CC_Vmode;
9649 
9650   /* For everything else, return CCmode.  */
9651   return CCmode;
9652 }
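
/* Some illustrative mappings (a sketch, not exhaustive):

     (eq (and:DI x y) (const_int 0))	  CC_NZmode  (TST/ANDS)
     (lt (ashift:SI x n) (reg:SI y))	  CC_SWPmode (swapped comparison)
     (eq (neg:DI x) (reg:DI y))		  CC_Zmode
     (unlt (reg:DF x) (reg:DF y))	  CCFPmode
     (lt (reg:DF x) (reg:DF y))		  CCFPEmode  */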
9653 
9654 static int
9655 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9656 
9657 int
9658 aarch64_get_condition_code (rtx x)
9659 {
9660   machine_mode mode = GET_MODE (XEXP (x, 0));
9661   enum rtx_code comp_code = GET_CODE (x);
9662 
9663   if (GET_MODE_CLASS (mode) != MODE_CC)
9664     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9665   return aarch64_get_condition_code_1 (mode, comp_code);
9666 }
9667 
9668 static int
9669 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9670 {
9671   switch (mode)
9672     {
9673     case E_CCFPmode:
9674     case E_CCFPEmode:
9675       switch (comp_code)
9676 	{
9677 	case GE: return AARCH64_GE;
9678 	case GT: return AARCH64_GT;
9679 	case LE: return AARCH64_LS;
9680 	case LT: return AARCH64_MI;
9681 	case NE: return AARCH64_NE;
9682 	case EQ: return AARCH64_EQ;
9683 	case ORDERED: return AARCH64_VC;
9684 	case UNORDERED: return AARCH64_VS;
9685 	case UNLT: return AARCH64_LT;
9686 	case UNLE: return AARCH64_LE;
9687 	case UNGT: return AARCH64_HI;
9688 	case UNGE: return AARCH64_PL;
9689 	default: return -1;
9690 	}
9691       break;
9692 
9693     case E_CCmode:
9694       switch (comp_code)
9695 	{
9696 	case NE: return AARCH64_NE;
9697 	case EQ: return AARCH64_EQ;
9698 	case GE: return AARCH64_GE;
9699 	case GT: return AARCH64_GT;
9700 	case LE: return AARCH64_LE;
9701 	case LT: return AARCH64_LT;
9702 	case GEU: return AARCH64_CS;
9703 	case GTU: return AARCH64_HI;
9704 	case LEU: return AARCH64_LS;
9705 	case LTU: return AARCH64_CC;
9706 	default: return -1;
9707 	}
9708       break;
9709 
9710     case E_CC_SWPmode:
9711       switch (comp_code)
9712 	{
9713 	case NE: return AARCH64_NE;
9714 	case EQ: return AARCH64_EQ;
9715 	case GE: return AARCH64_LE;
9716 	case GT: return AARCH64_LT;
9717 	case LE: return AARCH64_GE;
9718 	case LT: return AARCH64_GT;
9719 	case GEU: return AARCH64_LS;
9720 	case GTU: return AARCH64_CC;
9721 	case LEU: return AARCH64_CS;
9722 	case LTU: return AARCH64_HI;
9723 	default: return -1;
9724 	}
9725       break;
9726 
9727     case E_CC_NZCmode:
9728       switch (comp_code)
9729 	{
9730 	case NE: return AARCH64_NE; /* = any */
9731 	case EQ: return AARCH64_EQ; /* = none */
9732 	case GE: return AARCH64_PL; /* = nfrst */
9733 	case LT: return AARCH64_MI; /* = first */
9734 	case GEU: return AARCH64_CS; /* = nlast */
9735 	case GTU: return AARCH64_HI; /* = pmore */
9736 	case LEU: return AARCH64_LS; /* = plast */
9737 	case LTU: return AARCH64_CC; /* = last */
9738 	default: return -1;
9739 	}
9740       break;
9741 
9742     case E_CC_NZmode:
9743       switch (comp_code)
9744 	{
9745 	case NE: return AARCH64_NE;
9746 	case EQ: return AARCH64_EQ;
9747 	case GE: return AARCH64_PL;
9748 	case LT: return AARCH64_MI;
9749 	default: return -1;
9750 	}
9751       break;
9752 
9753     case E_CC_Zmode:
9754       switch (comp_code)
9755 	{
9756 	case NE: return AARCH64_NE;
9757 	case EQ: return AARCH64_EQ;
9758 	default: return -1;
9759 	}
9760       break;
9761 
9762     case E_CC_Cmode:
9763       switch (comp_code)
9764 	{
9765 	case LTU: return AARCH64_CS;
9766 	case GEU: return AARCH64_CC;
9767 	default: return -1;
9768 	}
9769       break;
9770 
9771     case E_CC_ADCmode:
9772       switch (comp_code)
9773 	{
9774 	case GEU: return AARCH64_CS;
9775 	case LTU: return AARCH64_CC;
9776 	default: return -1;
9777 	}
9778       break;
9779 
9780     case E_CC_Vmode:
9781       switch (comp_code)
9782 	{
9783 	case NE: return AARCH64_VS;
9784 	case EQ: return AARCH64_VC;
9785 	default: return -1;
9786 	}
9787       break;
9788 
9789     default:
9790       return -1;
9791     }
9792 
9793   return -1;
9794 }
9795 
9796 bool
9797 aarch64_const_vec_all_same_in_range_p (rtx x,
9798 				       HOST_WIDE_INT minval,
9799 				       HOST_WIDE_INT maxval)
9800 {
9801   rtx elt;
9802   return (const_vec_duplicate_p (x, &elt)
9803 	  && CONST_INT_P (elt)
9804 	  && IN_RANGE (INTVAL (elt), minval, maxval));
9805 }
9806 
9807 bool
9808 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9809 {
9810   return aarch64_const_vec_all_same_in_range_p (x, val, val);
9811 }
9812 
9813 /* Return true if VEC is a constant in which every element is in the range
9814    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
9815 
9816 static bool
9817 aarch64_const_vec_all_in_range_p (rtx vec,
9818 				  HOST_WIDE_INT minval,
9819 				  HOST_WIDE_INT maxval)
9820 {
9821   if (GET_CODE (vec) != CONST_VECTOR
9822       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9823     return false;
9824 
9825   int nunits;
9826   if (!CONST_VECTOR_STEPPED_P (vec))
9827     nunits = const_vector_encoded_nelts (vec);
9828   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9829     return false;
9830 
9831   for (int i = 0; i < nunits; i++)
9832     {
9833       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9834       if (!CONST_INT_P (vec_elem)
9835 	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9836 	return false;
9837     }
9838   return true;
9839 }
9840 
9841 /* N Z C V.  */
9842 #define AARCH64_CC_V 1
9843 #define AARCH64_CC_C (1 << 1)
9844 #define AARCH64_CC_Z (1 << 2)
9845 #define AARCH64_CC_N (1 << 3)
9846 
9847 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
9848 static const int aarch64_nzcv_codes[] =
9849 {
9850   0,		/* EQ, Z == 1.  */
9851   AARCH64_CC_Z,	/* NE, Z == 0.  */
9852   0,		/* CS, C == 1.  */
9853   AARCH64_CC_C,	/* CC, C == 0.  */
9854   0,		/* MI, N == 1.  */
9855   AARCH64_CC_N, /* PL, N == 0.  */
9856   0,		/* VS, V == 1.  */
9857   AARCH64_CC_V, /* VC, V == 0.  */
9858   0,		/* HI, C == 1 && Z == 0.  */
9859   AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
9860   AARCH64_CC_V,	/* GE, N == V.  */
9861   0,		/* LT, N != V.  */
9862   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
9863   0,		/* LE, !(Z == 0 && N == V).  */
9864   0,		/* AL, Any.  */
9865   0		/* NV, Any.  */
9866 };
9867 
9868 /* Print floating-point vector immediate operand X to F, negating it
9869    first if NEGATE is true.  Return true on success, false if it isn't
9870    a constant we can handle.  */
9871 
9872 static bool
9873 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9874 {
9875   rtx elt;
9876 
9877   if (!const_vec_duplicate_p (x, &elt))
9878     return false;
9879 
9880   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9881   if (negate)
9882     r = real_value_negate (&r);
9883 
9884   /* Handle the SVE single-bit immediates specially, since they have a
9885      fixed form in the assembly syntax.  */
9886   if (real_equal (&r, &dconst0))
9887     asm_fprintf (f, "0.0");
9888   else if (real_equal (&r, &dconst2))
9889     asm_fprintf (f, "2.0");
9890   else if (real_equal (&r, &dconst1))
9891     asm_fprintf (f, "1.0");
9892   else if (real_equal (&r, &dconsthalf))
9893     asm_fprintf (f, "0.5");
9894   else
9895     {
9896       const int buf_size = 20;
9897       char float_buf[buf_size] = {'\0'};
9898       real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9899 				1, GET_MODE (elt));
9900       asm_fprintf (f, "%s", float_buf);
9901     }
9902 
9903   return true;
9904 }
9905 
9906 /* Return the equivalent letter for size.  */
9907 static char
9908 sizetochar (int size)
9909 {
9910   switch (size)
9911     {
9912     case 64: return 'd';
9913     case 32: return 's';
9914     case 16: return 'h';
9915     case 8 : return 'b';
9916     default: gcc_unreachable ();
9917     }
9918 }
9919 
9920 /* Print operand X to file F in a target specific manner according to CODE.
9921    The acceptable formatting commands given by CODE are:
9922      'c':		An integer or symbol address without a preceding #
9923 			sign.
9924      'C':		Take the duplicated element in a vector constant
9925 			and print it in hex.
9926      'D':		Take the duplicated element in a vector constant
9927 			and print it as an unsigned integer, in decimal.
9928      'e':		Print the sign/zero-extend size as a character 8->b,
9929 			16->h, 32->w.  Can also be used for masks:
9930 			0xff->b, 0xffff->h, 0xffffffff->w.
9931      'I':		If the operand is a duplicated vector constant,
9932 			replace it with the duplicated scalar.  If the
9933 			operand is then a floating-point constant, replace
9934 			it with the integer bit representation.  Print the
9935 			transformed constant as a signed decimal number.
9936      'p':		Prints N such that 2^N == X (X must be power of 2 and
9937 			const int).
9938      'P':		Print the number of non-zero bits in X (a const_int).
9939      'H':		Print the higher numbered register of a pair (TImode)
9940 			of regs.
9941      'm':		Print a condition (eq, ne, etc).
9942      'M':		Same as 'm', but invert condition.
9943      'N':		Take the duplicated element in a vector constant
9944 			and print the negative of it in decimal.
9945      'b/h/s/d/q':	Print a scalar FP/SIMD register name.
9946      'S/T/U/V':		Print a FP/SIMD register name for a register list.
9947 			The register printed is the FP/SIMD register name
9948 			of X + 0/1/2/3 for S/T/U/V.
9949      'R':		Print a scalar Integer/FP/SIMD register name + 1.
9950      'X':		Print bottom 16 bits of integer constant in hex.
9951      'w/x':		Print a general register name or the zero register
9952 			(32-bit or 64-bit).
9953      '0':		Print a normal operand, if it's a general register,
9954 			then we assume DImode.
9955      'k':		Print NZCV for conditional compare instructions.
9956      'A':		Output address constant representing the first
9957 			argument of X, specifying a relocation offset
9958 			if appropriate.
9959      'L':		Output constant address specified by X
9960 			with a relocation offset if appropriate.
9961      'G':		Prints address of X, specifying a PC relative
9962 			relocation mode if appropriate.
9963      'y':		Output address of LDP or STP - this is used for
9964 			some LDP/STPs which don't use a PARALLEL in their
9965 			pattern (so the mode needs to be adjusted).
9966      'z':		Output address of a typical LDP or STP.  */
9967 
9968 static void
9969 aarch64_print_operand (FILE *f, rtx x, int code)
9970 {
9971   rtx elt;
9972   switch (code)
9973     {
9974     case 'c':
9975       switch (GET_CODE (x))
9976 	{
9977 	case CONST_INT:
9978 	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9979 	  break;
9980 
9981 	case SYMBOL_REF:
9982 	  output_addr_const (f, x);
9983 	  break;
9984 
9985 	case CONST:
9986 	  if (GET_CODE (XEXP (x, 0)) == PLUS
9987 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9988 	    {
9989 	      output_addr_const (f, x);
9990 	      break;
9991 	    }
9992 	  /* Fall through.  */
9993 
9994 	default:
9995 	  output_operand_lossage ("unsupported operand for code '%c'", code);
9996 	}
9997       break;
9998 
9999     case 'e':
10000       {
10001 	x = unwrap_const_vec_duplicate (x);
10002 	if (!CONST_INT_P (x))
10003 	  {
10004 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10005 	    return;
10006 	  }
10007 
10008 	HOST_WIDE_INT val = INTVAL (x);
10009 	if ((val & ~7) == 8 || val == 0xff)
10010 	  fputc ('b', f);
10011 	else if ((val & ~7) == 16 || val == 0xffff)
10012 	  fputc ('h', f);
10013 	else if ((val & ~7) == 32 || val == 0xffffffff)
10014 	  fputc ('w', f);
10015 	else
10016 	  {
10017 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10018 	    return;
10019 	  }
10020       }
10021       break;
10022 
10023     case 'p':
10024       {
10025 	int n;
10026 
10027 	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
10028 	  {
10029 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10030 	    return;
10031 	  }
10032 
10033 	asm_fprintf (f, "%d", n);
10034       }
10035       break;
10036 
10037     case 'P':
10038       if (!CONST_INT_P (x))
10039 	{
10040 	  output_operand_lossage ("invalid operand for '%%%c'", code);
10041 	  return;
10042 	}
10043 
10044       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
10045       break;
10046 
10047     case 'H':
10048       if (x == const0_rtx)
10049 	{
10050 	  asm_fprintf (f, "xzr");
10051 	  break;
10052 	}
10053 
10054       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
10055 	{
10056 	  output_operand_lossage ("invalid operand for '%%%c'", code);
10057 	  return;
10058 	}
10059 
10060       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
10061       break;
10062 
10063     case 'I':
10064       {
10065 	x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10066 	if (CONST_INT_P (x))
10067 	  asm_fprintf (f, "%wd", INTVAL (x));
10068 	else
10069 	  {
10070 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10071 	    return;
10072 	  }
10073 	break;
10074       }
10075 
10076     case 'M':
10077     case 'm':
10078       {
10079         int cond_code;
10080 	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
10081 	if (x == const_true_rtx)
10082 	  {
10083 	    if (code == 'M')
10084 	      fputs ("nv", f);
10085 	    return;
10086 	  }
10087 
10088         if (!COMPARISON_P (x))
10089 	  {
10090 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10091 	    return;
10092 	  }
10093 
10094         cond_code = aarch64_get_condition_code (x);
10095         gcc_assert (cond_code >= 0);
10096 	if (code == 'M')
10097 	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
10098 	if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10099 	  fputs (aarch64_sve_condition_codes[cond_code], f);
10100 	else
10101 	  fputs (aarch64_condition_codes[cond_code], f);
10102       }
10103       break;
10104 
10105     case 'N':
10106       if (!const_vec_duplicate_p (x, &elt))
10107 	{
10108 	  output_operand_lossage ("invalid vector constant");
10109 	  return;
10110 	}
10111 
10112       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10113 	asm_fprintf (f, "%wd", -INTVAL (elt));
10114       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10115 	       && aarch64_print_vector_float_operand (f, x, true))
10116 	;
10117       else
10118 	{
10119 	  output_operand_lossage ("invalid vector constant");
10120 	  return;
10121 	}
10122       break;
10123 
10124     case 'b':
10125     case 'h':
10126     case 's':
10127     case 'd':
10128     case 'q':
10129       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10130 	{
10131 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10132 	  return;
10133 	}
10134       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10135       break;
10136 
10137     case 'S':
10138     case 'T':
10139     case 'U':
10140     case 'V':
10141       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10142 	{
10143 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10144 	  return;
10145 	}
10146       asm_fprintf (f, "%c%d",
10147 		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10148 		   REGNO (x) - V0_REGNUM + (code - 'S'));
10149       break;
10150 
10151     case 'R':
10152       if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10153 	asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10154       else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10155 	asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10156       else
10157 	output_operand_lossage ("incompatible register operand for '%%%c'",
10158 				code);
10159       break;
10160 
10161     case 'X':
10162       if (!CONST_INT_P (x))
10163 	{
10164 	  output_operand_lossage ("invalid operand for '%%%c'", code);
10165 	  return;
10166 	}
10167       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10168       break;
10169 
10170     case 'C':
10171       {
10172 	/* Print a replicated constant in hex.  */
10173 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10174 	  {
10175 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10176 	    return;
10177 	  }
10178 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10179 	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10180       }
10181       break;
10182 
10183     case 'D':
10184       {
10185 	/* Print a replicated constant in decimal, treating it as
10186 	   unsigned.  */
10187 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10188 	  {
10189 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10190 	    return;
10191 	  }
10192 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10193 	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10194       }
10195       break;
10196 
10197     case 'w':
10198     case 'x':
10199       if (x == const0_rtx
10200 	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10201 	{
10202 	  asm_fprintf (f, "%czr", code);
10203 	  break;
10204 	}
10205 
10206       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10207 	{
10208 	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10209 	  break;
10210 	}
10211 
10212       if (REG_P (x) && REGNO (x) == SP_REGNUM)
10213 	{
10214 	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10215 	  break;
10216 	}
10217 
10218       /* Fall through */
10219 
10220     case 0:
10221       if (x == NULL)
10222 	{
10223 	  output_operand_lossage ("missing operand");
10224 	  return;
10225 	}
10226 
10227       switch (GET_CODE (x))
10228 	{
10229 	case REG:
10230 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
10231 	    {
10232 	      if (REG_NREGS (x) == 1)
10233 		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10234 	      else
10235 		{
10236 		  char suffix
10237 		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10238 		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
10239 			       REGNO (x) - V0_REGNUM, suffix,
10240 			       END_REGNO (x) - V0_REGNUM - 1, suffix);
10241 		}
10242 	    }
10243 	  else
10244 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10245 	  break;
10246 
10247 	case MEM:
10248 	  output_address (GET_MODE (x), XEXP (x, 0));
10249 	  break;
10250 
10251 	case LABEL_REF:
10252 	case SYMBOL_REF:
10253 	  output_addr_const (asm_out_file, x);
10254 	  break;
10255 
10256 	case CONST_INT:
10257 	  asm_fprintf (f, "%wd", INTVAL (x));
10258 	  break;
10259 
10260 	case CONST:
10261 	  if (!VECTOR_MODE_P (GET_MODE (x)))
10262 	    {
10263 	      output_addr_const (asm_out_file, x);
10264 	      break;
10265 	    }
10266 	  /* fall through */
10267 
10268 	case CONST_VECTOR:
10269 	  if (!const_vec_duplicate_p (x, &elt))
10270 	    {
10271 	      output_operand_lossage ("invalid vector constant");
10272 	      return;
10273 	    }
10274 
10275 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10276 	    asm_fprintf (f, "%wd", INTVAL (elt));
10277 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10278 		   && aarch64_print_vector_float_operand (f, x, false))
10279 	    ;
10280 	  else
10281 	    {
10282 	      output_operand_lossage ("invalid vector constant");
10283 	      return;
10284 	    }
10285 	  break;
10286 
10287 	case CONST_DOUBLE:
10288 	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10289 	     be getting CONST_DOUBLEs holding integers.  */
10290 	  gcc_assert (GET_MODE (x) != VOIDmode);
10291 	  if (aarch64_float_const_zero_rtx_p (x))
10292 	    {
10293 	      fputc ('0', f);
10294 	      break;
10295 	    }
10296 	  else if (aarch64_float_const_representable_p (x))
10297 	    {
10298 #define buf_size 20
10299 	      char float_buf[buf_size] = {'\0'};
10300 	      real_to_decimal_for_mode (float_buf,
10301 					CONST_DOUBLE_REAL_VALUE (x),
10302 					buf_size, buf_size,
10303 					1, GET_MODE (x));
10304 	      asm_fprintf (asm_out_file, "%s", float_buf);
10305 	      break;
10306 #undef buf_size
10307 	    }
10308 	  output_operand_lossage ("invalid constant");
10309 	  return;
10310 	default:
10311 	  output_operand_lossage ("invalid operand");
10312 	  return;
10313 	}
10314       break;
10315 
10316     case 'A':
10317       if (GET_CODE (x) == HIGH)
10318 	x = XEXP (x, 0);
10319 
10320       switch (aarch64_classify_symbolic_expression (x))
10321 	{
10322 	case SYMBOL_SMALL_GOT_4G:
10323 	  asm_fprintf (asm_out_file, ":got:");
10324 	  break;
10325 
10326 	case SYMBOL_SMALL_TLSGD:
10327 	  asm_fprintf (asm_out_file, ":tlsgd:");
10328 	  break;
10329 
10330 	case SYMBOL_SMALL_TLSDESC:
10331 	  asm_fprintf (asm_out_file, ":tlsdesc:");
10332 	  break;
10333 
10334 	case SYMBOL_SMALL_TLSIE:
10335 	  asm_fprintf (asm_out_file, ":gottprel:");
10336 	  break;
10337 
10338 	case SYMBOL_TLSLE24:
10339 	  asm_fprintf (asm_out_file, ":tprel:");
10340 	  break;
10341 
10342 	case SYMBOL_TINY_GOT:
10343 	  gcc_unreachable ();
10344 	  break;
10345 
10346 	default:
10347 	  break;
10348 	}
10349       output_addr_const (asm_out_file, x);
10350       break;
10351 
10352     case 'L':
10353       switch (aarch64_classify_symbolic_expression (x))
10354 	{
10355 	case SYMBOL_SMALL_GOT_4G:
10356 	  asm_fprintf (asm_out_file, ":lo12:");
10357 	  break;
10358 
10359 	case SYMBOL_SMALL_TLSGD:
10360 	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10361 	  break;
10362 
10363 	case SYMBOL_SMALL_TLSDESC:
10364 	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10365 	  break;
10366 
10367 	case SYMBOL_SMALL_TLSIE:
10368 	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
10369 	  break;
10370 
10371 	case SYMBOL_TLSLE12:
10372 	  asm_fprintf (asm_out_file, ":tprel_lo12:");
10373 	  break;
10374 
10375 	case SYMBOL_TLSLE24:
10376 	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10377 	  break;
10378 
10379 	case SYMBOL_TINY_GOT:
10380 	  asm_fprintf (asm_out_file, ":got:");
10381 	  break;
10382 
10383 	case SYMBOL_TINY_TLSIE:
10384 	  asm_fprintf (asm_out_file, ":gottprel:");
10385 	  break;
10386 
10387 	default:
10388 	  break;
10389 	}
10390       output_addr_const (asm_out_file, x);
10391       break;
10392 
10393     case 'G':
10394       switch (aarch64_classify_symbolic_expression (x))
10395 	{
10396 	case SYMBOL_TLSLE24:
10397 	  asm_fprintf (asm_out_file, ":tprel_hi12:");
10398 	  break;
10399 	default:
10400 	  break;
10401 	}
10402       output_addr_const (asm_out_file, x);
10403       break;
10404 
10405     case 'k':
10406       {
10407 	HOST_WIDE_INT cond_code;
10408 
10409 	if (!CONST_INT_P (x))
10410 	  {
10411 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10412 	    return;
10413 	  }
10414 
10415 	cond_code = INTVAL (x);
10416 	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10417 	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
10418       }
10419       break;
10420 
10421     case 'y':
10422     case 'z':
10423       {
10424 	machine_mode mode = GET_MODE (x);
10425 
10426 	if (GET_CODE (x) != MEM
10427 	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
10428 	  {
10429 	    output_operand_lossage ("invalid operand for '%%%c'", code);
10430 	    return;
10431 	  }
10432 
10433 	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10434 					    code == 'y'
10435 					    ? ADDR_QUERY_LDP_STP_N
10436 					    : ADDR_QUERY_LDP_STP))
10437 	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
10438       }
10439       break;
10440 
10441     default:
10442       output_operand_lossage ("invalid operand prefix '%%%c'", code);
10443       return;
10444     }
10445 }
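
/* For reference, typical aarch64.md output templates exercise several of
   the codes above; an illustrative (made-up) template might be:

     "add\t%w0, %w1, %w2"	 'w'  32-bit name of a general register
     "fmov\t%d0, %x1"		 'd'  scalar FP register, 'x' 64-bit GP
     "ccmp\t%w0, %w1, %k2, %m3"	 'k'  NZCV immediate, 'm' condition name  */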
10446 
10447 /* Print address 'x' of a memory access with mode 'mode'.
10448    'type' is the aarch64_addr_query_type context required by
10449    aarch64_classify_address (e.g. ADDR_QUERY_M or ADDR_QUERY_LDP_STP).  */
10450 static bool
10451 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10452 				aarch64_addr_query_type type)
10453 {
10454   struct aarch64_address_info addr;
10455   unsigned int size, vec_flags;
10456 
10457   /* Check all addresses are Pmode - including ILP32.  */
10458   if (GET_MODE (x) != Pmode
10459       && (!CONST_INT_P (x)
10460 	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10461     {
10462       output_operand_lossage ("invalid address mode");
10463       return false;
10464     }
10465 
10466   if (aarch64_classify_address (&addr, x, mode, true, type))
10467     switch (addr.type)
10468       {
10469       case ADDRESS_REG_IMM:
10470 	if (known_eq (addr.const_offset, 0))
10471 	  {
10472 	    asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10473 	    return true;
10474 	  }
10475 
10476 	vec_flags = aarch64_classify_vector_mode (mode);
10477 	if (vec_flags & VEC_ANY_SVE)
10478 	  {
10479 	    HOST_WIDE_INT vnum
10480 	      = exact_div (addr.const_offset,
10481 			   aarch64_vl_bytes (mode, vec_flags)).to_constant ();
10482 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
10483 			 reg_names[REGNO (addr.base)], vnum);
10484 	    return true;
10485 	  }
10486 
10487 	asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10488 		     INTVAL (addr.offset));
10489 	return true;
10490 
10491       case ADDRESS_REG_REG:
10492 	if (addr.shift == 0)
10493 	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
10494 		       reg_names [REGNO (addr.offset)]);
10495 	else
10496 	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
10497 		       reg_names [REGNO (addr.offset)], addr.shift);
10498 	return true;
10499 
10500       case ADDRESS_REG_UXTW:
10501 	if (addr.shift == 0)
10502 	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
10503 		       REGNO (addr.offset) - R0_REGNUM);
10504 	else
10505 	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
10506 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
10507 	return true;
10508 
10509       case ADDRESS_REG_SXTW:
10510 	if (addr.shift == 0)
10511 	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
10512 		       REGNO (addr.offset) - R0_REGNUM);
10513 	else
10514 	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
10515 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
10516 	return true;
10517 
10518       case ADDRESS_REG_WB:
10519 	/* Writeback is only supported for fixed-width modes.  */
10520 	size = GET_MODE_SIZE (mode).to_constant ();
10521 	switch (GET_CODE (x))
10522 	  {
10523 	  case PRE_INC:
10524 	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
10525 	    return true;
10526 	  case POST_INC:
10527 	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
10528 	    return true;
10529 	  case PRE_DEC:
10530 	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
10531 	    return true;
10532 	  case POST_DEC:
10533 	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
10534 	    return true;
10535 	  case PRE_MODIFY:
10536 	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
10537 			 INTVAL (addr.offset));
10538 	    return true;
10539 	  case POST_MODIFY:
10540 	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
10541 			 INTVAL (addr.offset));
10542 	    return true;
10543 	  default:
10544 	    break;
10545 	  }
10546 	break;
10547 
10548       case ADDRESS_LO_SUM:
10549 	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10550 	output_addr_const (f, addr.offset);
10551 	asm_fprintf (f, "]");
10552 	return true;
10553 
10554       case ADDRESS_SYMBOLIC:
10555 	output_addr_const (f, x);
10556 	return true;
10557       }
10558 
10559   return false;
10560 }
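
/* Example renderings for the cases above (illustrative):

     ADDRESS_REG_IMM	[x0], [x0, 16], or [x0, #1, mul vl] for SVE
     ADDRESS_REG_REG	[x0, x1] or [x0, x1, lsl 3]
     ADDRESS_REG_UXTW	[x0, w1, uxtw 2]
     ADDRESS_REG_WB	[x0, 16]! (PRE_INC) or [x0], 16 (POST_INC)
     ADDRESS_LO_SUM	[x0, #:lo12:sym]
     ADDRESS_SYMBOLIC	the constant address itself  */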
10561 
10562 /* Print address 'x' of a memory access with mode 'mode'.  */
10563 static void
10564 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10565 {
10566   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10567     output_addr_const (f, x);
10568 }
10569 
10570 bool
10571 aarch64_label_mentioned_p (rtx x)
10572 {
10573   const char *fmt;
10574   int i;
10575 
10576   if (GET_CODE (x) == LABEL_REF)
10577     return true;
10578 
10579   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10580      referencing instruction, but they are constant offsets, not
10581      symbols.  */
10582   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10583     return false;
10584 
10585   fmt = GET_RTX_FORMAT (GET_CODE (x));
10586   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10587     {
10588       if (fmt[i] == 'E')
10589 	{
10590 	  int j;
10591 
10592 	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10593 	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10594 	      return 1;
10595 	}
10596       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10597 	return 1;
10598     }
10599 
10600   return 0;
10601 }
10602 
10603 /* Implement REGNO_REG_CLASS.  */
10604 
10605 enum reg_class
10606 aarch64_regno_regclass (unsigned regno)
10607 {
10608   if (GP_REGNUM_P (regno))
10609     return GENERAL_REGS;
10610 
10611   if (regno == SP_REGNUM)
10612     return STACK_REG;
10613 
10614   if (regno == FRAME_POINTER_REGNUM
10615       || regno == ARG_POINTER_REGNUM)
10616     return POINTER_REGS;
10617 
10618   if (FP_REGNUM_P (regno))
10619     return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10620 	    : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10621 
10622   if (PR_REGNUM_P (regno))
10623     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10624 
10625   if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10626     return FFR_REGS;
10627 
10628   return NO_REGS;
10629 }
10630 
10631 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10632    If OFFSET is out of range, return an offset of an anchor point
10633    that is in range.  Return 0 otherwise.  */
10634 
10635 static HOST_WIDE_INT
10636 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10637 		       machine_mode mode)
10638 {
10639   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
10640   if (size > 16)
10641     return (offset + 0x400) & ~0x7f0;
10642 
10643   /* For offsets that aren't a multiple of the access size, the limit is
10644      -256...255.  */
10645   if (offset & (size - 1))
10646     {
10647       /* BLKmode typically uses LDP of X-registers.  */
10648       if (mode == BLKmode)
10649 	return (offset + 512) & ~0x3ff;
10650       return (offset + 0x100) & ~0x1ff;
10651     }
10652 
10653   /* Small negative offsets are supported.  */
10654   if (IN_RANGE (offset, -256, 0))
10655     return 0;
10656 
10657   if (mode == TImode || mode == TFmode)
10658     return (offset + 0x100) & ~0x1ff;
10659 
10660   /* Use a 12-bit offset scaled by the access size.  */
10661   return offset & (~0xfff * size);
10662 }
10663 
10664 static rtx
10665 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
10666 {
10667   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10668      where mask is selected by alignment and size of the offset.
10669      We try to pick as large a range for the offset as possible to
10670      maximize the chance of a CSE.  However, for aligned addresses
10671      we limit the range to 4k so that structures with different sized
10672      elements are likely to use the same base.  We need to be careful
10673      not to split a CONST for some forms of address expression, otherwise
10674      it will generate sub-optimal code.  */
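  /* Illustrative example (not taken from the sources): for
     (plus (reg X0) (const_int 0x12344)) accessed in SImode, the code below
     splits the constant so the address becomes
     (plus (plus X0 0x10000) 0x2344); the anchor X0 + 0x10000 can then be
     CSEd between neighbouring accesses while 0x2344 still fits the scaled
     12-bit LDR/STR offset.  */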
10675 
10676   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10677     {
10678       rtx base = XEXP (x, 0);
10679       rtx offset_rtx = XEXP (x, 1);
10680       HOST_WIDE_INT offset = INTVAL (offset_rtx);
10681 
10682       if (GET_CODE (base) == PLUS)
10683 	{
10684 	  rtx op0 = XEXP (base, 0);
10685 	  rtx op1 = XEXP (base, 1);
10686 
10687 	  /* Force any scaling into a temp for CSE.  */
10688 	  op0 = force_reg (Pmode, op0);
10689 	  op1 = force_reg (Pmode, op1);
10690 
10691 	  /* Let the pointer register be in op0.  */
10692 	  if (REG_POINTER (op1))
10693 	    std::swap (op0, op1);
10694 
10695 	  /* If the pointer is virtual or frame related, then we know that
10696 	     virtual register instantiation or register elimination is going
10697 	     to apply a second constant.  We want the two constants folded
10698 	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
10699 	  if (virt_or_elim_regno_p (REGNO (op0)))
10700 	    {
10701 	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10702 				   NULL_RTX, true, OPTAB_DIRECT);
10703 	      return gen_rtx_PLUS (Pmode, base, op1);
10704 	    }
10705 
10706 	  /* Otherwise, in order to encourage CSE (and thence loop strength
10707 	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
10708 	  base = expand_binop (Pmode, add_optab, op0, op1,
10709 			       NULL_RTX, true, OPTAB_DIRECT);
10710 	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10711 	}
10712 
10713       HOST_WIDE_INT size;
10714       if (GET_MODE_SIZE (mode).is_constant (&size))
10715 	{
10716 	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10717 							     mode);
10718 	  if (base_offset != 0)
10719 	    {
10720 	      base = plus_constant (Pmode, base, base_offset);
10721 	      base = force_operand (base, NULL_RTX);
10722 	      return plus_constant (Pmode, base, offset - base_offset);
10723 	    }
10724 	}
10725     }
10726 
10727   return x;
10728 }
10729 
10730 static reg_class_t
10731 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10732 			  reg_class_t rclass,
10733 			  machine_mode mode,
10734 			  secondary_reload_info *sri)
10735 {
10736   /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10737      LDR and STR.  See the comment at the head of aarch64-sve.md for
10738      more details about the big-endian handling.  */
10739   if (reg_class_subset_p (rclass, FP_REGS)
10740       && !((REG_P (x) && HARD_REGISTER_P (x))
10741 	   || aarch64_simd_valid_immediate (x, NULL))
10742       && mode != VNx16QImode)
10743     {
10744       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10745       if ((vec_flags & VEC_SVE_DATA)
10746 	  && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10747 	{
10748 	  sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10749 	  return NO_REGS;
10750 	}
10751     }
10752 
10753   /* If we have to disable direct literal pool loads and stores because the
10754      function is too big, then we need a scratch register.  */
10755   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10756       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10757 	  || targetm.vector_mode_supported_p (GET_MODE (x)))
10758       && !aarch64_pcrelative_literal_loads)
10759     {
10760       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10761       return NO_REGS;
10762     }
10763 
10764   /* Without the TARGET_SIMD instructions we cannot move a Q register
10765      to a Q register directly.  We need a scratch.  */
10766   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10767       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10768       && reg_class_subset_p (rclass, FP_REGS))
10769     {
10770       sri->icode = code_for_aarch64_reload_mov (mode);
10771       return NO_REGS;
10772     }
10773 
10774   /* A TFmode or TImode memory access should be handled via an FP_REG
10775      because AArch64 has richer addressing modes for LDR/STR instructions
10776      than for LDP/STP instructions.  */
10777   if (TARGET_FLOAT && rclass == GENERAL_REGS
10778       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10779     return FP_REGS;
10780 
10781   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10782       return GENERAL_REGS;
10783 
10784   return NO_REGS;
10785 }
10786 
10787 static bool
10788 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10789 {
10790   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10791 
10792   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10793      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
10794   if (frame_pointer_needed)
10795     return to == HARD_FRAME_POINTER_REGNUM;
10796   return true;
10797 }
10798 
10799 poly_int64
10800 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10801 {
10802   if (to == HARD_FRAME_POINTER_REGNUM)
10803     {
10804       if (from == ARG_POINTER_REGNUM)
10805 	return cfun->machine->frame.hard_fp_offset;
10806 
10807       if (from == FRAME_POINTER_REGNUM)
10808 	return cfun->machine->frame.hard_fp_offset
10809 	       - cfun->machine->frame.locals_offset;
10810     }
10811 
10812   if (to == STACK_POINTER_REGNUM)
10813     {
10814       if (from == FRAME_POINTER_REGNUM)
10815 	  return cfun->machine->frame.frame_size
10816 		 - cfun->machine->frame.locals_offset;
10817     }
10818 
10819   return cfun->machine->frame.frame_size;
10820 }
10821 
10822 
10823 /* Get return address without mangling.  */
10824 
10825 rtx
10826 aarch64_return_addr_rtx (void)
10827 {
10828   rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
10829   /* Note: aarch64_return_address_signing_enabled only
10830      works after cfun->machine->frame.laid_out is set,
10831      so here we don't know if the return address will
10832      be signed or not.  */
10833   rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
10834   emit_move_insn (lr, val);
10835   emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
10836   return lr;
10837 }
10838 
10839 
10840 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
10841    previous frame.  */
10842 
10843 rtx
10844 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10845 {
10846   if (count != 0)
10847     return const0_rtx;
10848   return aarch64_return_addr_rtx ();
10849 }
10850 
10851 
10852 static void
10853 aarch64_asm_trampoline_template (FILE *f)
10854 {
10855   int offset1 = 16;
10856   int offset2 = 20;
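  /* Illustrative sketch of the emitted LP64 trampoline without BTI
     (byte offsets; the data words are filled in by aarch64_trampoline_init):

	  0:  ldr   x17, .+16	// load the target function address
	  4:  ldr   x18, .+20	// load the static chain value
	  8:  br    x17
	 12:  <padding word>
	 16:  <function address>
	 24:  <static chain>  */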
10857 
10858   if (aarch64_bti_enabled ())
10859     {
10860       asm_fprintf (f, "\thint\t34 // bti c\n");
10861       offset1 -= 4;
10862       offset2 -= 4;
10863     }
10864 
10865   if (TARGET_ILP32)
10866     {
10867       asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10868       asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10869 		   offset1);
10870     }
10871   else
10872     {
10873       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10874       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10875 		   offset2);
10876     }
10877   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10878 
10879   /* The trampoline needs an extra padding instruction.  If BTI is
10880      enabled, the padding instruction is replaced by the BTI instruction
10881      at the beginning.  */
10882   if (!aarch64_bti_enabled ())
10883     assemble_aligned_integer (4, const0_rtx);
10884 
10885   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10886   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10887 }
10888 
10889 static void
10890 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10891 {
10892   rtx fnaddr, mem, a_tramp;
10893   const int tramp_code_sz = 16;
10894 
10895   /* Don't need to copy the trailing D-words, we fill those in below.  */
10896   emit_block_move (m_tramp, assemble_trampoline_template (),
10897 		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10898   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10899   fnaddr = XEXP (DECL_RTL (fndecl), 0);
10900   if (GET_MODE (fnaddr) != ptr_mode)
10901     fnaddr = convert_memory_address (ptr_mode, fnaddr);
10902   emit_move_insn (mem, fnaddr);
10903 
10904   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10905   emit_move_insn (mem, chain_value);
10906 
10907   /* XXX We should really define a "clear_cache" pattern and use
10908      gen_clear_cache().  */
10909   a_tramp = XEXP (m_tramp, 0);
10910   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10911 		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10912 		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10913 		     ptr_mode);
10914 }
10915 
10916 static unsigned char
10917 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10918 {
10919   /* ??? Logically we should only need to provide a value when
10920      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10921      can hold MODE, but at the moment we need to handle all modes.
10922      Just ignore any runtime parts for registers that can't store them.  */
10923   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10924   unsigned int nregs, vec_flags;
10925   switch (regclass)
10926     {
10927     case TAILCALL_ADDR_REGS:
10928     case POINTER_REGS:
10929     case GENERAL_REGS:
10930     case ALL_REGS:
10931     case POINTER_AND_FP_REGS:
10932     case FP_REGS:
10933     case FP_LO_REGS:
10934     case FP_LO8_REGS:
10935       vec_flags = aarch64_classify_vector_mode (mode);
10936       if ((vec_flags & VEC_SVE_DATA)
10937 	  && constant_multiple_p (GET_MODE_SIZE (mode),
10938 				  aarch64_vl_bytes (mode, vec_flags), &nregs))
10939 	return nregs;
10940       return (vec_flags & VEC_ADVSIMD
10941 	      ? CEIL (lowest_size, UNITS_PER_VREG)
10942 	      : CEIL (lowest_size, UNITS_PER_WORD));
10943     case STACK_REG:
10944     case PR_REGS:
10945     case PR_LO_REGS:
10946     case PR_HI_REGS:
10947     case FFR_REGS:
10948     case PR_AND_FFR_REGS:
10949       return 1;
10950 
10951     case NO_REGS:
10952       return 0;
10953 
10954     default:
10955       break;
10956     }
10957   gcc_unreachable ();
10958 }
10959 
10960 static reg_class_t
10961 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10962 {
10963   if (regclass == POINTER_REGS)
10964     return GENERAL_REGS;
10965 
10966   if (regclass == STACK_REG)
10967     {
10968       if (REG_P(x)
10969 	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10970 	  return regclass;
10971 
10972       return NO_REGS;
10973     }
10974 
10975   /* Register elimination can result in a request for
10976      SP+constant->FP_REGS.  We cannot support such operations, which
10977      use SP as source and an FP_REG as destination, so reject them
10978      right now.  */
10979   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10980     {
10981       rtx lhs = XEXP (x, 0);
10982 
10983       /* Look through a possible SUBREG introduced by ILP32.  */
10984       if (GET_CODE (lhs) == SUBREG)
10985 	lhs = SUBREG_REG (lhs);
10986 
10987       gcc_assert (REG_P (lhs));
10988       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10989 				      POINTER_REGS));
10990       return NO_REGS;
10991     }
10992 
10993   return regclass;
10994 }
10995 
10996 void
10997 aarch64_asm_output_labelref (FILE* f, const char *name)
10998 {
10999   asm_fprintf (f, "%U%s", name);
11000 }
11001 
11002 static void
11003 aarch64_elf_asm_constructor (rtx symbol, int priority)
11004 {
11005   if (priority == DEFAULT_INIT_PRIORITY)
11006     default_ctor_section_asm_out_constructor (symbol, priority);
11007   else
11008     {
11009       section *s;
11010       /* While priority is known to be in the range [0, 65535], and so
11011          18 bytes would be enough, the compiler might not know that.  To
11012          avoid a -Wformat-truncation false positive, use a larger size.  */
11013       char buf[23];
11014       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
11015       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11016       switch_to_section (s);
11017       assemble_align (POINTER_SIZE);
11018       assemble_aligned_integer (POINTER_BYTES, symbol);
11019     }
11020 }
11021 
11022 static void
11023 aarch64_elf_asm_destructor (rtx symbol, int priority)
11024 {
11025   if (priority == DEFAULT_INIT_PRIORITY)
11026     default_dtor_section_asm_out_destructor (symbol, priority);
11027   else
11028     {
11029       section *s;
11030       /* While priority is known to be in the range [0, 65535], and so
11031          18 bytes would be enough, the compiler might not know that.  To
11032          avoid a -Wformat-truncation false positive, use a larger size.  */
11033       char buf[23];
11034       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
11035       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11036       switch_to_section (s);
11037       assemble_align (POINTER_SIZE);
11038       assemble_aligned_integer (POINTER_BYTES, symbol);
11039     }
11040 }
11041 
11042 const char*
11043 aarch64_output_casesi (rtx *operands)
11044 {
11045   char buf[100];
11046   char label[100];
11047   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
11048   int index;
11049   static const char *const patterns[4][2] =
11050   {
11051     {
11052       "ldrb\t%w3, [%0,%w1,uxtw]",
11053       "add\t%3, %4, %w3, sxtb #2"
11054     },
11055     {
11056       "ldrh\t%w3, [%0,%w1,uxtw #1]",
11057       "add\t%3, %4, %w3, sxth #2"
11058     },
11059     {
11060       "ldr\t%w3, [%0,%w1,uxtw #2]",
11061       "add\t%3, %4, %w3, sxtw #2"
11062     },
11063     /* We assume that DImode is only generated when not optimizing and
11064        that we don't really need 64-bit address offsets.  That would
11065        imply an object file with 8GB of code in a single function!  */
11066     {
11067       "ldr\t%w3, [%0,%w1,uxtw #2]",
11068       "add\t%3, %4, %w3, sxtw #2"
11069     }
11070   };
11071 
11072   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11073 
11074   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11075   index = exact_log2 (GET_MODE_SIZE (mode));
11076 
11077   gcc_assert (index >= 0 && index <= 3);
11078 
11079   /* Need to implement table size reduction, by changing the code below.  */
11080   output_asm_insn (patterns[index][0], operands);
11081   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11082   snprintf (buf, sizeof (buf),
11083 	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
11084   output_asm_insn (buf, operands);
11085   output_asm_insn (patterns[index][1], operands);
11086   output_asm_insn ("br\t%3", operands);
11087   assemble_label (asm_out_file, label);
11088   return "";
11089 }
11090 
11091 
11092 /* Return size in bits of an arithmetic operand which is shifted/scaled and
11093    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11094    operator.  */
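/* Illustrative examples (not from the sources): a SHIFT of 2 with
   MASK == 0x3fc matches 0xff << 2 and returns 8 (a UXTB operand), while a
   SHIFT of 0 with MASK == 0xffff returns 16 (UXTH); anything that is not a
   shifted 8-, 16- or 32-bit mask returns 0.  */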
11095 
11096 int
11097 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11098 {
11099   if (shift >= 0 && shift <= 3)
11100     {
11101       int size;
11102       for (size = 8; size <= 32; size *= 2)
11103 	{
11104 	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11105 	  if (mask == bits << shift)
11106 	    return size;
11107 	}
11108     }
11109   return 0;
11110 }
11111 
11112 /* Constant pools are per-function only when PC-relative
11113    literal loads are enabled or we are using the large
11114    memory model.  */
11115 
11116 static inline bool
11117 aarch64_can_use_per_function_literal_pools_p (void)
11118 {
11119   return (aarch64_pcrelative_literal_loads
11120 	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11121 }
11122 
11123 static bool
11124 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11125 {
11126   /* We can't use blocks for constants when we're using a per-function
11127      constant pool.  */
11128   return !aarch64_can_use_per_function_literal_pools_p ();
11129 }
11130 
11131 /* Select appropriate section for constants depending
11132    on where we place literal pools.  */
11133 
11134 static section *
11135 aarch64_select_rtx_section (machine_mode mode,
11136 			    rtx x,
11137 			    unsigned HOST_WIDE_INT align)
11138 {
11139   if (aarch64_can_use_per_function_literal_pools_p ())
11140     return function_section (current_function_decl);
11141 
11142   return default_elf_select_rtx_section (mode, x, align);
11143 }
11144 
11145 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
11146 void
11147 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11148 				  HOST_WIDE_INT offset)
11149 {
11150   /* When using per-function literal pools, we must ensure that any code
11151      section is aligned to the minimal instruction length, lest we get
11152      errors from the assembler re "unaligned instructions".  */
11153   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11154     ASM_OUTPUT_ALIGN (f, 2);
11155 }
11156 
11157 /* Costs.  */
11158 
11159 /* Helper function for rtx cost calculation.  Strip a shift expression
11160    from X.  Returns the inner operand if successful, or the original
11161    expression on failure.  */
11162 static rtx
11163 aarch64_strip_shift (rtx x)
11164 {
11165   rtx op = x;
11166 
11167   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11168      we can convert both to ROR during final output.  */
11169   if ((GET_CODE (op) == ASHIFT
11170        || GET_CODE (op) == ASHIFTRT
11171        || GET_CODE (op) == LSHIFTRT
11172        || GET_CODE (op) == ROTATERT
11173        || GET_CODE (op) == ROTATE)
11174       && CONST_INT_P (XEXP (op, 1)))
11175     return XEXP (op, 0);
11176 
11177   if (GET_CODE (op) == MULT
11178       && CONST_INT_P (XEXP (op, 1))
11179       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11180     return XEXP (op, 0);
11181 
11182   return x;
11183 }
11184 
11185 /* Helper function for rtx cost calculation.  Strip an extend
11186    expression from X.  Returns the inner operand if successful, or the
11187    original expression on failure.  We deal with a number of possible
11188    canonicalization variations here. If STRIP_SHIFT is true, then
11189    we can strip off a shift also.  */
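/* For example (illustrative only), the zero-extend-as-AND form
   (and (mult X (const_int 4)) (const_int 0x3fc)) strips down to X, since
   aarch64_uxt_size (2, 0x3fc) recognises it as a UXTB scaled by 4.  */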
11190 static rtx
11191 aarch64_strip_extend (rtx x, bool strip_shift)
11192 {
11193   scalar_int_mode mode;
11194   rtx op = x;
11195 
11196   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11197     return op;
11198 
11199   /* Zero and sign extraction of a widened value.  */
11200   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
11201       && XEXP (op, 2) == const0_rtx
11202       && GET_CODE (XEXP (op, 0)) == MULT
11203       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
11204 					 XEXP (op, 1)))
11205     return XEXP (XEXP (op, 0), 0);
11206 
11207   /* It can also be represented (for zero-extend) as an AND with an
11208      immediate.  */
11209   if (GET_CODE (op) == AND
11210       && GET_CODE (XEXP (op, 0)) == MULT
11211       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11212       && CONST_INT_P (XEXP (op, 1))
11213       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11214 			   INTVAL (XEXP (op, 1))) != 0)
11215     return XEXP (XEXP (op, 0), 0);
11216 
11217   /* Now handle extended register, as this may also have an optional
11218      left shift by 1..4.  */
11219   if (strip_shift
11220       && GET_CODE (op) == ASHIFT
11221       && CONST_INT_P (XEXP (op, 1))
11222       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11223     op = XEXP (op, 0);
11224 
11225   if (GET_CODE (op) == ZERO_EXTEND
11226       || GET_CODE (op) == SIGN_EXTEND)
11227     op = XEXP (op, 0);
11228 
11229   if (op != x)
11230     return op;
11231 
11232   return x;
11233 }
11234 
11235 /* Return true iff CODE is a shift supported in combination
11236    with arithmetic instructions.  */
11237 
11238 static bool
11239 aarch64_shift_p (enum rtx_code code)
11240 {
11241   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11242 }
11243 
11244 
11245 /* Return true iff X is a cheap shift without a sign extend. */
11246 
11247 static bool
11248 aarch64_cheap_mult_shift_p (rtx x)
11249 {
11250   rtx op0, op1;
11251 
11252   op0 = XEXP (x, 0);
11253   op1 = XEXP (x, 1);
11254 
11255   if (!(aarch64_tune_params.extra_tuning_flags
11256                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11257     return false;
11258 
11259   if (GET_CODE (op0) == SIGN_EXTEND)
11260     return false;
11261 
11262   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11263       && UINTVAL (op1) <= 4)
11264     return true;
11265 
11266   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11267     return false;
11268 
11269   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11270 
11271   if (l2 > 0 && l2 <= 4)
11272     return true;
11273 
11274   return false;
11275 }
11276 
11277 /* Helper function for rtx cost calculation.  Calculate the cost of
11278    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11279    Return the calculated cost of the expression, recursing manually in to
11280    operands where needed.  */
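/* For instance (illustrative only), costing (mult X (const_int 8)) with
   OUTER == PLUS treats the multiply as the shifted operand of a single
   "ADD ..., lsl #3"-style instruction, so only an arith_shift delta plus
   the cost of X is added rather than a full multiply cost.  */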
11281 
11282 static int
11283 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11284 {
11285   rtx op0, op1;
11286   const struct cpu_cost_table *extra_cost
11287     = aarch64_tune_params.insn_extra_cost;
11288   int cost = 0;
11289   bool compound_p = (outer == PLUS || outer == MINUS);
11290   machine_mode mode = GET_MODE (x);
11291 
11292   gcc_checking_assert (code == MULT);
11293 
11294   op0 = XEXP (x, 0);
11295   op1 = XEXP (x, 1);
11296 
11297   if (VECTOR_MODE_P (mode))
11298     mode = GET_MODE_INNER (mode);
11299 
11300   /* Integer multiply/fma.  */
11301   if (GET_MODE_CLASS (mode) == MODE_INT)
11302     {
11303       /* The multiply will be canonicalized as a shift, cost it as such.  */
11304       if (aarch64_shift_p (GET_CODE (x))
11305 	  || (CONST_INT_P (op1)
11306 	      && exact_log2 (INTVAL (op1)) > 0))
11307 	{
11308 	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11309 	                   || GET_CODE (op0) == SIGN_EXTEND;
11310 	  if (speed)
11311 	    {
11312 	      if (compound_p)
11313 	        {
11314 		  /* If the shift is considered cheap,
11315 		     then don't add any cost. */
11316 		  if (aarch64_cheap_mult_shift_p (x))
11317 		    ;
11318 	          else if (REG_P (op1))
11319 		    /* ARITH + shift-by-register.  */
11320 		    cost += extra_cost->alu.arith_shift_reg;
11321 		  else if (is_extend)
11322 		    /* ARITH + extended register.  We don't have a cost field
11323 		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
11324 		    cost += extra_cost->alu.extend_arith;
11325 		  else
11326 		    /* ARITH + shift-by-immediate.  */
11327 		    cost += extra_cost->alu.arith_shift;
11328 		}
11329 	      else
11330 		/* LSL (immediate).  */
11331 	        cost += extra_cost->alu.shift;
11332 
11333 	    }
11334 	  /* Strip extends as we will have costed them in the case above.  */
11335 	  if (is_extend)
11336 	    op0 = aarch64_strip_extend (op0, true);
11337 
11338 	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
11339 
11340 	  return cost;
11341 	}
11342 
11343       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
11344 	 compound and let the below cases handle it.  After all, MNEG is a
11345 	 special-case alias of MSUB.  */
11346       if (GET_CODE (op0) == NEG)
11347 	{
11348 	  op0 = XEXP (op0, 0);
11349 	  compound_p = true;
11350 	}
11351 
11352       /* Integer multiplies or FMAs have zero/sign extending variants.  */
11353       if ((GET_CODE (op0) == ZERO_EXTEND
11354 	   && GET_CODE (op1) == ZERO_EXTEND)
11355 	  || (GET_CODE (op0) == SIGN_EXTEND
11356 	      && GET_CODE (op1) == SIGN_EXTEND))
11357 	{
11358 	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11359 	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
11360 
11361 	  if (speed)
11362 	    {
11363 	      if (compound_p)
11364 		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
11365 		cost += extra_cost->mult[0].extend_add;
11366 	      else
11367 		/* MUL/SMULL/UMULL.  */
11368 		cost += extra_cost->mult[0].extend;
11369 	    }
11370 
11371 	  return cost;
11372 	}
11373 
11374       /* This is either an integer multiply or a MADD.  In both cases
11375 	 we want to recurse and cost the operands.  */
11376       cost += rtx_cost (op0, mode, MULT, 0, speed);
11377       cost += rtx_cost (op1, mode, MULT, 1, speed);
11378 
11379       if (speed)
11380 	{
11381 	  if (compound_p)
11382 	    /* MADD/MSUB.  */
11383 	    cost += extra_cost->mult[mode == DImode].add;
11384 	  else
11385 	    /* MUL.  */
11386 	    cost += extra_cost->mult[mode == DImode].simple;
11387 	}
11388 
11389       return cost;
11390     }
11391   else
11392     {
11393       if (speed)
11394 	{
11395 	  /* Floating-point FMA/FMUL can also support negations of the
11396 	     operands, unless the rounding mode is upward or downward in
11397 	     which case FNMUL is different than FMUL with operand negation.  */
11398 	  bool neg0 = GET_CODE (op0) == NEG;
11399 	  bool neg1 = GET_CODE (op1) == NEG;
11400 	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
11401 	    {
11402 	      if (neg0)
11403 		op0 = XEXP (op0, 0);
11404 	      if (neg1)
11405 		op1 = XEXP (op1, 0);
11406 	    }
11407 
11408 	  if (compound_p)
11409 	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
11410 	    cost += extra_cost->fp[mode == DFmode].fma;
11411 	  else
11412 	    /* FMUL/FNMUL.  */
11413 	    cost += extra_cost->fp[mode == DFmode].mult;
11414 	}
11415 
11416       cost += rtx_cost (op0, mode, MULT, 0, speed);
11417       cost += rtx_cost (op1, mode, MULT, 1, speed);
11418       return cost;
11419     }
11420 }
11421 
11422 static int
11423 aarch64_address_cost (rtx x,
11424 		      machine_mode mode,
11425 		      addr_space_t as ATTRIBUTE_UNUSED,
11426 		      bool speed)
11427 {
11428   enum rtx_code c = GET_CODE (x);
11429   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
11430   struct aarch64_address_info info;
11431   int cost = 0;
11432   info.shift = 0;
11433 
11434   if (!aarch64_classify_address (&info, x, mode, false))
11435     {
11436       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
11437 	{
11438 	  /* This is a CONST or SYMBOL ref which will be split
11439 	     in a different way depending on the code model in use.
11440 	     Cost it through the generic infrastructure.  */
11441 	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
11442 	  /* Divide through by the cost of one instruction to
11443 	     bring it to the same units as the address costs.  */
11444 	  cost_symbol_ref /= COSTS_N_INSNS (1);
11445 	  /* The cost is then the cost of preparing the address,
11446 	     followed by an immediate (possibly 0) offset.  */
11447 	  return cost_symbol_ref + addr_cost->imm_offset;
11448 	}
11449       else
11450 	{
11451 	  /* This is most likely a jump table from a case
11452 	     statement.  */
11453 	  return addr_cost->register_offset;
11454 	}
11455     }
11456 
11457   switch (info.type)
11458     {
11459       case ADDRESS_LO_SUM:
11460       case ADDRESS_SYMBOLIC:
11461       case ADDRESS_REG_IMM:
11462 	cost += addr_cost->imm_offset;
11463 	break;
11464 
11465       case ADDRESS_REG_WB:
11466 	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11467 	  cost += addr_cost->pre_modify;
11468 	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11469 	  cost += addr_cost->post_modify;
11470 	else
11471 	  gcc_unreachable ();
11472 
11473 	break;
11474 
11475       case ADDRESS_REG_REG:
11476 	cost += addr_cost->register_offset;
11477 	break;
11478 
11479       case ADDRESS_REG_SXTW:
11480 	cost += addr_cost->register_sextend;
11481 	break;
11482 
11483       case ADDRESS_REG_UXTW:
11484 	cost += addr_cost->register_zextend;
11485 	break;
11486 
11487       default:
11488 	gcc_unreachable ();
11489     }
11490 
11491 
11492   if (info.shift > 0)
11493     {
11494       /* For the sake of calculating the cost of the shifted register
11495 	 component, we can treat same sized modes in the same way.  */
11496       if (known_eq (GET_MODE_BITSIZE (mode), 16))
11497 	cost += addr_cost->addr_scale_costs.hi;
11498       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11499 	cost += addr_cost->addr_scale_costs.si;
11500       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11501 	cost += addr_cost->addr_scale_costs.di;
11502       else
11503 	/* We can't tell, or this is a 128-bit vector.  */
11504 	cost += addr_cost->addr_scale_costs.ti;
11505     }
11506 
11507   return cost;
11508 }
11509 
11510 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
11511    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
11512    to be taken.  */
11513 
11514 int
11515 aarch64_branch_cost (bool speed_p, bool predictable_p)
11516 {
11517   /* When optimizing for speed, use the cost of unpredictable branches.  */
11518   const struct cpu_branch_cost *branch_costs =
11519     aarch64_tune_params.branch_costs;
11520 
11521   if (!speed_p || predictable_p)
11522     return branch_costs->predictable;
11523   else
11524     return branch_costs->unpredictable;
11525 }
11526 
11527 /* Return true if the RTX X in mode MODE is a zero or sign extract
11528    usable in an ADD or SUB (extended register) instruction.  */
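/* For example (illustrative only), (sign_extend:DI (reg:SI)) is the operand
   form of "ADD Xd, Xn, Wm, sxtw", and the SIGN_EXTRACT/ZERO_EXTRACT case
   below covers the shifted variants generated via MULT.  */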
11529 static bool
11530 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
11531 {
11532   /* Catch add with a sign extract.
11533      This is add_<optab><mode>_multp2.  */
11534   if (GET_CODE (x) == SIGN_EXTRACT
11535       || GET_CODE (x) == ZERO_EXTRACT)
11536     {
11537       rtx op0 = XEXP (x, 0);
11538       rtx op1 = XEXP (x, 1);
11539       rtx op2 = XEXP (x, 2);
11540 
11541       if (GET_CODE (op0) == MULT
11542 	  && CONST_INT_P (op1)
11543 	  && op2 == const0_rtx
11544 	  && CONST_INT_P (XEXP (op0, 1))
11545 	  && aarch64_is_extend_from_extract (mode,
11546 					     XEXP (op0, 1),
11547 					     op1))
11548 	{
11549 	  return true;
11550 	}
11551     }
11552   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11553      No shift.  */
11554   else if (GET_CODE (x) == SIGN_EXTEND
11555 	   || GET_CODE (x) == ZERO_EXTEND)
11556     return REG_P (XEXP (x, 0));
11557 
11558   return false;
11559 }
11560 
11561 static bool
11562 aarch64_frint_unspec_p (unsigned int u)
11563 {
11564   switch (u)
11565     {
11566       case UNSPEC_FRINTZ:
11567       case UNSPEC_FRINTP:
11568       case UNSPEC_FRINTM:
11569       case UNSPEC_FRINTA:
11570       case UNSPEC_FRINTN:
11571       case UNSPEC_FRINTX:
11572       case UNSPEC_FRINTI:
11573         return true;
11574 
11575       default:
11576         return false;
11577     }
11578 }
11579 
11580 /* Return true iff X is an rtx that will match an extr instruction
11581    i.e. as described in the *extr<mode>5_insn family of patterns.
11582    *RES_OP0 and *RES_OP1 will be set to the operands of the shifts
11583    involved on success and will be NULL_RTX otherwise.  */
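/* Example (illustrative only): in DImode,
   (ior (ashift X (const_int 48)) (lshiftrt Y (const_int 16))) satisfies
   48 + 16 == 64, so it matches EXTR with *RES_OP0 == X and *RES_OP1 == Y.  */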
11584 
11585 static bool
11586 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11587 {
11588   rtx op0, op1;
11589   scalar_int_mode mode;
11590   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11591     return false;
11592 
11593   *res_op0 = NULL_RTX;
11594   *res_op1 = NULL_RTX;
11595 
11596   if (GET_CODE (x) != IOR)
11597     return false;
11598 
11599   op0 = XEXP (x, 0);
11600   op1 = XEXP (x, 1);
11601 
11602   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11603       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11604     {
11605      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
11606       if (GET_CODE (op1) == ASHIFT)
11607         std::swap (op0, op1);
11608 
11609       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11610         return false;
11611 
11612       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11613       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11614 
11615       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11616           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11617         {
11618           *res_op0 = XEXP (op0, 0);
11619           *res_op1 = XEXP (op1, 0);
11620           return true;
11621         }
11622     }
11623 
11624   return false;
11625 }
11626 
11627 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11628    storing it in *COST.  Result is true if the total cost of the operation
11629    has now been calculated.  */
11630 static bool
11631 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11632 {
11633   rtx inner;
11634   rtx comparator;
11635   enum rtx_code cmpcode;
11636   const struct cpu_cost_table *extra_cost
11637     = aarch64_tune_params.insn_extra_cost;
11638 
11639   if (COMPARISON_P (op0))
11640     {
11641       inner = XEXP (op0, 0);
11642       comparator = XEXP (op0, 1);
11643       cmpcode = GET_CODE (op0);
11644     }
11645   else
11646     {
11647       inner = op0;
11648       comparator = const0_rtx;
11649       cmpcode = NE;
11650     }
11651 
11652   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11653     {
11654       /* Conditional branch.  */
11655       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11656 	return true;
11657       else
11658 	{
11659 	  if (cmpcode == NE || cmpcode == EQ)
11660 	    {
11661 	      if (comparator == const0_rtx)
11662 		{
11663 		  /* TBZ/TBNZ/CBZ/CBNZ.  */
11664 		  if (GET_CODE (inner) == ZERO_EXTRACT)
11665 		    /* TBZ/TBNZ.  */
11666 		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11667 				       ZERO_EXTRACT, 0, speed);
11668 		  else
11669 		    /* CBZ/CBNZ.  */
11670 		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11671 
11672 		  return true;
11673 		}
11674 	      if (register_operand (inner, VOIDmode)
11675 		  && aarch64_imm24 (comparator, VOIDmode))
11676 		{
11677 		  /* SUB and SUBS.  */
11678 		  *cost += COSTS_N_INSNS (2);
11679 		  if (speed)
11680 		    *cost += extra_cost->alu.arith * 2;
11681 		  return true;
11682 		}
11683 	    }
11684 	  else if (cmpcode == LT || cmpcode == GE)
11685 	    {
11686 	      /* TBZ/TBNZ.  */
11687 	      if (comparator == const0_rtx)
11688 		return true;
11689 	    }
11690 	}
11691     }
11692   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11693     {
11694       /* CCMP.  */
11695       if (GET_CODE (op1) == COMPARE)
11696 	{
11697 	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
11698 	  if (XEXP (op1, 1) == const0_rtx)
11699 	    *cost += 1;
11700 	  if (speed)
11701 	    {
11702 	      machine_mode mode = GET_MODE (XEXP (op1, 0));
11703 	      const struct cpu_cost_table *extra_cost
11704 		= aarch64_tune_params.insn_extra_cost;
11705 
11706 	      if (GET_MODE_CLASS (mode) == MODE_INT)
11707 		*cost += extra_cost->alu.arith;
11708 	      else
11709 		*cost += extra_cost->fp[mode == DFmode].compare;
11710 	    }
11711 	  return true;
11712 	}
11713 
11714       /* It's a conditional operation based on the status flags,
11715 	 so it must be some flavor of CSEL.  */
11716 
11717       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
11718       if (GET_CODE (op1) == NEG
11719           || GET_CODE (op1) == NOT
11720           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11721 	op1 = XEXP (op1, 0);
11722       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11723 	{
11724 	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
11725 	  op1 = XEXP (op1, 0);
11726 	  op2 = XEXP (op2, 0);
11727 	}
11728 
11729       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11730       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11731       return true;
11732     }
11733 
11734   /* We don't know what this is, cost all operands.  */
11735   return false;
11736 }
11737 
11738 /* Check whether X is a bitfield operation of the form shift + extend that
11739    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
11740    operand to which the bitfield operation is applied.  Otherwise return
11741    NULL_RTX.  */
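/* For example (illustrative only),
   (zero_extend:SI (lshiftrt:HI X (const_int 3))) is the UBFX form and
   (sign_extend:DI (ashift:QI X (const_int 2))) the SBFIZ form; in both
   cases the function returns X.  */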
11742 
11743 static rtx
11744 aarch64_extend_bitfield_pattern_p (rtx x)
11745 {
11746   rtx_code outer_code = GET_CODE (x);
11747   machine_mode outer_mode = GET_MODE (x);
11748 
11749   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11750       && outer_mode != SImode && outer_mode != DImode)
11751     return NULL_RTX;
11752 
11753   rtx inner = XEXP (x, 0);
11754   rtx_code inner_code = GET_CODE (inner);
11755   machine_mode inner_mode = GET_MODE (inner);
11756   rtx op = NULL_RTX;
11757 
11758   switch (inner_code)
11759     {
11760       case ASHIFT:
11761 	if (CONST_INT_P (XEXP (inner, 1))
11762 	    && (inner_mode == QImode || inner_mode == HImode))
11763 	  op = XEXP (inner, 0);
11764 	break;
11765       case LSHIFTRT:
11766 	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11767 	    && (inner_mode == QImode || inner_mode == HImode))
11768 	  op = XEXP (inner, 0);
11769 	break;
11770       case ASHIFTRT:
11771 	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11772 	    && (inner_mode == QImode || inner_mode == HImode))
11773 	  op = XEXP (inner, 0);
11774 	break;
11775       default:
11776 	break;
11777     }
11778 
11779   return op;
11780 }
11781 
11782 /* Return true if the mask and a shift amount from an RTX of the form
11783    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11784    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
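/* Worked example (illustrative only): in SImode, SHFT_AMNT == 8 with
   MASK == 0x00ffff00 passes all three checks below (the shifted-down mask
   plus one is a power of two and no mask bits fall below the shift),
   corresponding to something like "UBFIZ Wd, Wn, #8, #16".  */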
11785 
11786 bool
11787 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11788 				    rtx shft_amnt)
11789 {
11790   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11791 	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11792 	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11793 	 && (INTVAL (mask)
11794 	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11795 }
11796 
11797 /* Return true if the masks and a shift amount from an RTX of the form
11798    ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11799    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
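/* Worked example (illustrative only): with SHFT_AMNT == 8 and
   MASK2 == 0xff00 (so MASK1 == ~0xff00), MASK2 + (1 << 8) == 0x10000 is a
   power of two, so the combination is accepted and maps to a BFI that
   inserts an 8-bit field at bit 8.  */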
11800 
11801 bool
11802 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11803 				   unsigned HOST_WIDE_INT mask1,
11804 				   unsigned HOST_WIDE_INT shft_amnt,
11805 				   unsigned HOST_WIDE_INT mask2)
11806 {
11807   unsigned HOST_WIDE_INT t;
11808 
11809   /* Verify that there is no overlap in what bits are set in the two masks.  */
11810   if (mask1 != ~mask2)
11811     return false;
11812 
11813   /* Verify that mask2 is not all zeros or ones.  */
11814   if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11815     return false;
11816 
11817   /* The shift amount should always be less than the mode size.  */
11818   gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11819 
11820   /* Verify that the mask being shifted is contiguous and would be in the
11821      least significant bits after shifting by shft_amnt.  */
11822   t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11823   return (t == (t & -t));
11824 }
11825 
11826 /* Calculate the cost of calculating X, storing it in *COST.  Result
11827    is true if the total cost of the operation has now been calculated.  */
11828 static bool
11829 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11830 		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11831 {
11832   rtx op0, op1, op2;
11833   const struct cpu_cost_table *extra_cost
11834     = aarch64_tune_params.insn_extra_cost;
11835   int code = GET_CODE (x);
11836   scalar_int_mode int_mode;
11837 
11838   /* By default, assume that everything has equivalent cost to the
11839      cheapest instruction.  Any additional costs are applied as a delta
11840      above this default.  */
11841   *cost = COSTS_N_INSNS (1);
11842 
11843   switch (code)
11844     {
11845     case SET:
11846       /* The cost depends entirely on the operands to SET.  */
11847       *cost = 0;
11848       op0 = SET_DEST (x);
11849       op1 = SET_SRC (x);
11850 
11851       switch (GET_CODE (op0))
11852 	{
11853 	case MEM:
11854 	  if (speed)
11855 	    {
11856 	      rtx address = XEXP (op0, 0);
11857 	      if (VECTOR_MODE_P (mode))
11858 		*cost += extra_cost->ldst.storev;
11859 	      else if (GET_MODE_CLASS (mode) == MODE_INT)
11860 		*cost += extra_cost->ldst.store;
11861 	      else if (mode == SFmode)
11862 		*cost += extra_cost->ldst.storef;
11863 	      else if (mode == DFmode)
11864 		*cost += extra_cost->ldst.stored;
11865 
11866 	      *cost +=
11867 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
11868 						     0, speed));
11869 	    }
11870 
11871 	  *cost += rtx_cost (op1, mode, SET, 1, speed);
11872 	  return true;
11873 
11874 	case SUBREG:
11875 	  if (! REG_P (SUBREG_REG (op0)))
11876 	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11877 
11878 	  /* Fall through.  */
11879 	case REG:
11880 	  /* The cost is one per vector-register copied.  */
11881 	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11882 	    {
11883 	      int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11884 	      *cost = COSTS_N_INSNS (nregs);
11885 	    }
11886 	  /* const0_rtx is in general free, but we will use an
11887 	     instruction to set a register to 0.  */
11888 	  else if (REG_P (op1) || op1 == const0_rtx)
11889 	    {
11890 	      /* The cost is 1 per register copied.  */
11891 	      int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11892 	      *cost = COSTS_N_INSNS (nregs);
11893 	    }
11894           else
11895 	    /* Cost is just the cost of the RHS of the set.  */
11896 	    *cost += rtx_cost (op1, mode, SET, 1, speed);
11897 	  return true;
11898 
11899 	case ZERO_EXTRACT:
11900 	case SIGN_EXTRACT:
11901 	  /* Bit-field insertion.  Strip any redundant widening of
11902 	     the RHS to meet the width of the target.  */
11903 	  if (GET_CODE (op1) == SUBREG)
11904 	    op1 = SUBREG_REG (op1);
11905 	  if ((GET_CODE (op1) == ZERO_EXTEND
11906 	       || GET_CODE (op1) == SIGN_EXTEND)
11907 	      && CONST_INT_P (XEXP (op0, 1))
11908 	      && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11909 	      && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11910 	    op1 = XEXP (op1, 0);
11911 
11912           if (CONST_INT_P (op1))
11913             {
11914               /* MOV immediate is assumed to always be cheap.  */
11915               *cost = COSTS_N_INSNS (1);
11916             }
11917           else
11918             {
11919               /* BFM.  */
11920 	      if (speed)
11921 		*cost += extra_cost->alu.bfi;
11922 	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11923             }
11924 
11925 	  return true;
11926 
11927 	default:
11928 	  /* We can't make sense of this, assume default cost.  */
11929           *cost = COSTS_N_INSNS (1);
11930 	  return false;
11931 	}
11932       return false;
11933 
11934     case CONST_INT:
11935       /* If an instruction can incorporate a constant within the
11936 	 instruction, the instruction's expression avoids calling
11937 	 rtx_cost() on the constant.  If rtx_cost() is called on a
11938 	 constant, then it is usually because the constant must be
11939 	 moved into a register by one or more instructions.
11940 
11941 	 The exception is constant 0, which can be expressed
11942 	 as XZR/WZR and is therefore free.  The exception to this is
11943 	 if we have (set (reg) (const0_rtx)) in which case we must cost
11944 	 the move.  However, we can catch that when we cost the SET, so
11945 	 we don't need to consider that here.  */
11946       if (x == const0_rtx)
11947 	*cost = 0;
11948       else
11949 	{
11950 	  /* To an approximation, building any other constant is
11951 	     proportionally expensive to the number of instructions
11952 	     required to build that constant.  This is true whether we
11953 	     are compiling for SPEED or otherwise.  */
11954 	  if (!is_a <scalar_int_mode> (mode, &int_mode))
11955 	    int_mode = word_mode;
11956 	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11957 				 (NULL_RTX, x, false, int_mode));
11958 	}
11959       return true;
11960 
11961     case CONST_DOUBLE:
11962 
11963       /* First determine number of instructions to do the move
11964 	  as an integer constant.  */
11965       if (!aarch64_float_const_representable_p (x)
11966 	   && !aarch64_can_const_movi_rtx_p (x, mode)
11967 	   && aarch64_float_const_rtx_p (x))
11968 	{
11969 	  unsigned HOST_WIDE_INT ival;
11970 	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11971 	  gcc_assert (succeed);
11972 
11973 	  scalar_int_mode imode = (mode == HFmode
11974 				   ? SImode
11975 				   : int_mode_for_mode (mode).require ());
11976 	  int ncost = aarch64_internal_mov_immediate
11977 		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
11978 	  *cost += COSTS_N_INSNS (ncost);
11979 	  return true;
11980 	}
11981 
11982       if (speed)
11983 	{
11984 	  /* mov[df,sf]_aarch64.  */
11985 	  if (aarch64_float_const_representable_p (x))
11986 	    /* FMOV (scalar immediate).  */
11987 	    *cost += extra_cost->fp[mode == DFmode].fpconst;
11988 	  else if (!aarch64_float_const_zero_rtx_p (x))
11989 	    {
11990 	      /* This will be a load from memory.  */
11991 	      if (mode == DFmode)
11992 		*cost += extra_cost->ldst.loadd;
11993 	      else
11994 		*cost += extra_cost->ldst.loadf;
11995 	    }
11996 	  else
11997 	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
11998 	       or MOV v0.s[0], wzr - neither of which are modeled by the
11999 	       cost tables.  Just use the default cost.  */
12000 	    {
12001 	    }
12002 	}
12003 
12004       return true;
12005 
12006     case MEM:
12007       if (speed)
12008 	{
12009 	  /* For loads we want the base cost of a load, plus an
12010 	     approximation for the additional cost of the addressing
12011 	     mode.  */
12012 	  rtx address = XEXP (x, 0);
12013 	  if (VECTOR_MODE_P (mode))
12014 	    *cost += extra_cost->ldst.loadv;
12015 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
12016 	    *cost += extra_cost->ldst.load;
12017 	  else if (mode == SFmode)
12018 	    *cost += extra_cost->ldst.loadf;
12019 	  else if (mode == DFmode)
12020 	    *cost += extra_cost->ldst.loadd;
12021 
12022 	  *cost +=
12023 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
12024 						     0, speed));
12025 	}
12026 
12027       return true;
12028 
12029     case NEG:
12030       op0 = XEXP (x, 0);
12031 
12032       if (VECTOR_MODE_P (mode))
12033 	{
12034 	  if (speed)
12035 	    {
12036 	      /* FNEG.  */
12037 	      *cost += extra_cost->vect.alu;
12038 	    }
12039 	  return false;
12040 	}
12041 
12042       if (GET_MODE_CLASS (mode) == MODE_INT)
12043 	{
12044           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12045               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12046             {
12047               /* CSETM.  */
12048 	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
12049               return true;
12050             }
12051 
12052 	  /* Cost this as SUB wzr, X.  */
12053           op0 = CONST0_RTX (mode);
12054           op1 = XEXP (x, 0);
12055           goto cost_minus;
12056         }
12057 
12058       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12059         {
12060           /* Support (neg(fma...)) as a single instruction only if
12061              sign of zeros is unimportant.  This matches the decision
12062              making in aarch64.md.  */
12063           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12064             {
12065 	      /* FNMADD.  */
12066 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
12067               return true;
12068             }
12069 	  if (GET_CODE (op0) == MULT)
12070 	    {
12071 	      /* FNMUL.  */
12072 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
12073 	      return true;
12074 	    }
12075 	  if (speed)
12076 	    /* FNEG.  */
12077 	    *cost += extra_cost->fp[mode == DFmode].neg;
12078           return false;
12079         }
12080 
12081       return false;
12082 
12083     case CLRSB:
12084     case CLZ:
12085       if (speed)
12086 	{
12087 	  if (VECTOR_MODE_P (mode))
12088 	    *cost += extra_cost->vect.alu;
12089 	  else
12090 	    *cost += extra_cost->alu.clz;
12091 	}
12092 
12093       return false;
12094 
12095     case CTZ:
12096       *cost = COSTS_N_INSNS (2);
12097 
12098       if (speed)
12099 	*cost += extra_cost->alu.clz + extra_cost->alu.rev;
12100       return false;
12101 
12102     case COMPARE:
12103       op0 = XEXP (x, 0);
12104       op1 = XEXP (x, 1);
12105 
12106       if (op1 == const0_rtx
12107 	  && GET_CODE (op0) == AND)
12108 	{
12109 	  x = op0;
12110 	  mode = GET_MODE (op0);
12111 	  goto cost_logic;
12112 	}
12113 
12114       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12115         {
12116           /* TODO: A write to the CC flags possibly costs extra; this
12117 	     needs encoding in the cost tables.  */
12118 
12119 	  mode = GET_MODE (op0);
12120           /* ANDS.  */
12121           if (GET_CODE (op0) == AND)
12122             {
12123               x = op0;
12124               goto cost_logic;
12125             }
12126 
12127           if (GET_CODE (op0) == PLUS)
12128             {
12129 	      /* ADDS (and CMN alias).  */
12130               x = op0;
12131               goto cost_plus;
12132             }
12133 
12134           if (GET_CODE (op0) == MINUS)
12135             {
12136 	      /* SUBS.  */
12137               x = op0;
12138               goto cost_minus;
12139             }
12140 
12141 	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12142 	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12143 	      && CONST_INT_P (XEXP (op0, 2)))
12144 	    {
12145 	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12146 		 Handle it here directly rather than going to cost_logic
12147 		 since we know the immediate generated for the TST is valid
12148 		 so we can avoid creating an intermediate rtx for it only
12149 		 for costing purposes.  */
12150 	      if (speed)
12151 		*cost += extra_cost->alu.logical;
12152 
12153 	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12154 				 ZERO_EXTRACT, 0, speed);
12155 	      return true;
12156 	    }
12157 
12158           if (GET_CODE (op1) == NEG)
12159             {
12160 	      /* CMN.  */
12161 	      if (speed)
12162 		*cost += extra_cost->alu.arith;
12163 
12164 	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12165 	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12166               return true;
12167             }
12168 
12169           /* CMP.
12170 
12171 	     Compare can freely swap the order of operands, and
12172              canonicalization puts the more complex operation first.
12173              But the integer MINUS logic expects the shift/extend
12174              operation in op1.  */
12175           if (! (REG_P (op0)
12176                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12177           {
12178             op0 = XEXP (x, 1);
12179             op1 = XEXP (x, 0);
12180           }
12181           goto cost_minus;
12182         }
12183 
12184       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12185         {
12186 	  /* FCMP.  */
12187 	  if (speed)
12188 	    *cost += extra_cost->fp[mode == DFmode].compare;
12189 
12190           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12191             {
12192 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12193               /* FCMP supports constant 0.0 for no extra cost. */
12194               return true;
12195             }
12196           return false;
12197         }
12198 
12199       if (VECTOR_MODE_P (mode))
12200 	{
12201 	  /* Vector compare.  */
12202 	  if (speed)
12203 	    *cost += extra_cost->vect.alu;
12204 
12205 	  if (aarch64_float_const_zero_rtx_p (op1))
12206 	    {
12207 	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12208 		 cost.  */
12209 	      return true;
12210 	    }
12211 	  return false;
12212 	}
12213       return false;
12214 
12215     case MINUS:
12216       {
12217 	op0 = XEXP (x, 0);
12218 	op1 = XEXP (x, 1);
12219 
12220 cost_minus:
12221 	*cost += rtx_cost (op0, mode, MINUS, 0, speed);
12222 
12223 	/* Detect valid immediates.  */
12224 	if ((GET_MODE_CLASS (mode) == MODE_INT
12225 	     || (GET_MODE_CLASS (mode) == MODE_CC
12226 		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12227 	    && CONST_INT_P (op1)
12228 	    && aarch64_uimm12_shift (INTVAL (op1)))
12229 	  {
12230 	    if (speed)
12231 	      /* SUB(S) (immediate).  */
12232 	      *cost += extra_cost->alu.arith;
12233 	    return true;
12234 	  }
12235 
12236 	/* Look for SUB (extended register).  */
12237 	if (is_a <scalar_int_mode> (mode, &int_mode)
12238 	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
12239 	  {
12240 	    if (speed)
12241 	      *cost += extra_cost->alu.extend_arith;
12242 
12243 	    op1 = aarch64_strip_extend (op1, true);
12244 	    *cost += rtx_cost (op1, VOIDmode,
12245 			       (enum rtx_code) GET_CODE (op1), 0, speed);
12246 	    return true;
12247 	  }
12248 
12249 	rtx new_op1 = aarch64_strip_extend (op1, false);
12250 
12251 	/* Cost this as an FMA-alike operation.  */
12252 	if ((GET_CODE (new_op1) == MULT
12253 	     || aarch64_shift_p (GET_CODE (new_op1)))
12254 	    && code != COMPARE)
12255 	  {
12256 	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12257 					    (enum rtx_code) code,
12258 					    speed);
12259 	    return true;
12260 	  }
12261 
12262 	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12263 
12264 	if (speed)
12265 	  {
12266 	    if (VECTOR_MODE_P (mode))
12267 	      {
12268 		/* Vector SUB.  */
12269 		*cost += extra_cost->vect.alu;
12270 	      }
12271 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
12272 	      {
12273 		/* SUB(S).  */
12274 		*cost += extra_cost->alu.arith;
12275 	      }
12276 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12277 	      {
12278 		/* FSUB.  */
12279 		*cost += extra_cost->fp[mode == DFmode].addsub;
12280 	      }
12281 	  }
12282 	return true;
12283       }
12284 
12285     case PLUS:
12286       {
12287 	rtx new_op0;
12288 
12289 	op0 = XEXP (x, 0);
12290 	op1 = XEXP (x, 1);
12291 
12292 cost_plus:
12293 	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12294 	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12295 	  {
12296 	    /* CSINC.  */
12297 	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12298 	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12299 	    return true;
12300 	  }
12301 
12302 	if (GET_MODE_CLASS (mode) == MODE_INT
12303 	    && (aarch64_plus_immediate (op1, mode)
12304 		|| aarch64_sve_addvl_addpl_immediate (op1, mode)))
12305 	  {
12306 	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);
12307 
12308 	    if (speed)
12309 	      /* ADD (immediate).  */
12310 	      *cost += extra_cost->alu.arith;
12311 	    return true;
12312 	  }
12313 
12314 	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
12315 
12316 	/* Look for ADD (extended register).  */
12317 	if (is_a <scalar_int_mode> (mode, &int_mode)
12318 	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
12319 	  {
12320 	    if (speed)
12321 	      *cost += extra_cost->alu.extend_arith;
12322 
12323 	    op0 = aarch64_strip_extend (op0, true);
12324 	    *cost += rtx_cost (op0, VOIDmode,
12325 			       (enum rtx_code) GET_CODE (op0), 0, speed);
12326 	    return true;
12327 	  }
12328 
12329 	/* Strip any extend, leave shifts behind as we will
12330 	   cost them through mult_cost.  */
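	/* For example (illustrative), (plus (ashift x 2) y), or the
	   equivalent (plus (mult x 4) y), becomes a single ADD with a
	   shifted operand and is costed by aarch64_rtx_mult_cost below.  */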
12331 	new_op0 = aarch64_strip_extend (op0, false);
12332 
12333 	if (GET_CODE (new_op0) == MULT
12334 	    || aarch64_shift_p (GET_CODE (new_op0)))
12335 	  {
12336 	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12337 					    speed);
12338 	    return true;
12339 	  }
12340 
12341 	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
12342 
12343 	if (speed)
12344 	  {
12345 	    if (VECTOR_MODE_P (mode))
12346 	      {
12347 		/* Vector ADD.  */
12348 		*cost += extra_cost->vect.alu;
12349 	      }
12350 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
12351 	      {
12352 		/* ADD.  */
12353 		*cost += extra_cost->alu.arith;
12354 	      }
12355 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12356 	      {
12357 		/* FADD.  */
12358 		*cost += extra_cost->fp[mode == DFmode].addsub;
12359 	      }
12360 	  }
12361 	return true;
12362       }
12363 
12364     case BSWAP:
12365       *cost = COSTS_N_INSNS (1);
12366 
12367       if (speed)
12368 	{
12369 	  if (VECTOR_MODE_P (mode))
12370 	    *cost += extra_cost->vect.alu;
12371 	  else
12372 	    *cost += extra_cost->alu.rev;
12373 	}
12374       return false;
12375 
12376     case IOR:
12377       if (aarch_rev16_p (x))
12378         {
12379           *cost = COSTS_N_INSNS (1);
12380 
12381 	  if (speed)
12382 	    {
12383 	      if (VECTOR_MODE_P (mode))
12384 		*cost += extra_cost->vect.alu;
12385 	      else
12386 		*cost += extra_cost->alu.rev;
12387 	    }
12388 	  return true;
12389         }
12390 
12391       if (aarch64_extr_rtx_p (x, &op0, &op1))
12392         {
12393 	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
12394 	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
12395           if (speed)
12396             *cost += extra_cost->alu.shift;
12397 
12398           return true;
12399         }
12400     /* Fall through.  */
12401     case XOR:
12402     case AND:
12403     cost_logic:
12404       op0 = XEXP (x, 0);
12405       op1 = XEXP (x, 1);
12406 
12407       if (VECTOR_MODE_P (mode))
12408 	{
12409 	  if (speed)
12410 	    *cost += extra_cost->vect.alu;
12411 	  return true;
12412 	}
12413 
12414       if (code == AND
12415           && GET_CODE (op0) == MULT
12416           && CONST_INT_P (XEXP (op0, 1))
12417           && CONST_INT_P (op1)
12418           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12419                                INTVAL (op1)) != 0)
12420         {
12421           /* This is a UBFM/SBFM.  */
12422 	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
12423 	  if (speed)
12424 	    *cost += extra_cost->alu.bfx;
12425           return true;
12426         }
12427 
12428       if (is_int_mode (mode, &int_mode))
12429 	{
12430 	  if (CONST_INT_P (op1))
12431 	    {
12432 	      /* We have a mask + shift version of a UBFIZ
12433 		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
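	      /* For example (illustrative), (and (ashift x 3) 0x7f8)
		 keeps 8 bits of X shifted into place and maps to
		 UBFIZ w0, w1, #3, #8.  */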
12434 	      if (GET_CODE (op0) == ASHIFT
12435 		  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12436 							 XEXP (op0, 1)))
12437 		{
12438 		  *cost += rtx_cost (XEXP (op0, 0), int_mode,
12439 				     (enum rtx_code) code, 0, speed);
12440 		  if (speed)
12441 		    *cost += extra_cost->alu.bfx;
12442 
12443 		  return true;
12444 		}
12445 	      else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
12446 		{
12447 		/* We may get the immediate for free; this is not
12448 		   modelled.  */
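		/* E.g. (and x (const_int 0xff00ff00)) in SImode is a single
		   AND with a logical immediate (illustrative value).  */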
12449 		  *cost += rtx_cost (op0, int_mode,
12450 				     (enum rtx_code) code, 0, speed);
12451 		  if (speed)
12452 		    *cost += extra_cost->alu.logical;
12453 
12454 		  return true;
12455 		}
12456 	    }
12457 	  else
12458 	    {
12459 	      rtx new_op0 = op0;
12460 
12461 	      /* Handle ORN, EON, or BIC.  */
12462 	      if (GET_CODE (op0) == NOT)
12463 		op0 = XEXP (op0, 0);
12464 
12465 	      new_op0 = aarch64_strip_shift (op0);
12466 
12467 	      /* If we had a shift on op0 then this is a logical-shift-
12468 		 by-register/immediate operation.  Otherwise, this is just
12469 		 a logical operation.  */
12470 	      if (speed)
12471 		{
12472 		  if (new_op0 != op0)
12473 		    {
12474 		      /* Shift by immediate.  */
12475 		      if (CONST_INT_P (XEXP (op0, 1)))
12476 			*cost += extra_cost->alu.log_shift;
12477 		      else
12478 			*cost += extra_cost->alu.log_shift_reg;
12479 		    }
12480 		  else
12481 		    *cost += extra_cost->alu.logical;
12482 		}
12483 
12484 	      /* In both cases we want to cost both operands.  */
12485 	      *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12486 				 0, speed);
12487 	      *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12488 				 1, speed);
12489 
12490 	      return true;
12491 	    }
12492 	}
12493       return false;
12494 
12495     case NOT:
12496       x = XEXP (x, 0);
12497       op0 = aarch64_strip_shift (x);
12498 
12499       if (VECTOR_MODE_P (mode))
12500 	{
12501 	  /* Vector NOT.  */
12502 	  *cost += extra_cost->vect.alu;
12503 	  return false;
12504 	}
12505 
12506       /* MVN-shifted-reg.  */
12507       if (op0 != x)
12508         {
12509 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12510 
12511           if (speed)
12512             *cost += extra_cost->alu.log_shift;
12513 
12514           return true;
12515         }
12516       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12517          Handle the second form here taking care that 'a' in the above can
12518          be a shift.  */
12519       else if (GET_CODE (op0) == XOR)
12520         {
12521           rtx newop0 = XEXP (op0, 0);
12522           rtx newop1 = XEXP (op0, 1);
12523           rtx op0_stripped = aarch64_strip_shift (newop0);
12524 
12525 	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12526 	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
12527 
12528           if (speed)
12529             {
12530               if (op0_stripped != newop0)
12531                 *cost += extra_cost->alu.log_shift;
12532               else
12533                 *cost += extra_cost->alu.logical;
12534             }
12535 
12536           return true;
12537         }
12538       /* MVN.  */
12539       if (speed)
12540 	*cost += extra_cost->alu.logical;
12541 
12542       return false;
12543 
12544     case ZERO_EXTEND:
12545 
12546       op0 = XEXP (x, 0);
12547       /* If a value is written in SI mode, then zero extended to DI
12548 	 mode, the operation will in general be free as a write to
12549 	 a 'w' register implicitly zeroes the upper bits of an 'x'
12550 	 register.  However, if this is
12551 
12552 	   (set (reg) (zero_extend (reg)))
12553 
12554 	 we must cost the explicit register move.  */
12555       if (mode == DImode
12556 	  && GET_MODE (op0) == SImode
12557 	  && outer == SET)
12558 	{
12559 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
12560 
12561 	/* If OP_COST is non-zero, then the cost of the zero extend
12562 	   is effectively the cost of the inner operation.  Otherwise
12563 	   we have a MOV instruction and we take the cost from the MOV
12564 	   itself.  This is true independently of whether we are
12565 	   optimizing for space or time.  */
12566 	  if (op_cost)
12567 	    *cost = op_cost;
12568 
12569 	  return true;
12570 	}
12571       else if (MEM_P (op0))
12572 	{
12573 	  /* All loads can zero extend to any size for free.  */
12574 	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12575 	  return true;
12576 	}
12577 
12578       op0 = aarch64_extend_bitfield_pattern_p (x);
12579       if (op0)
12580 	{
12581 	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12582 	  if (speed)
12583 	    *cost += extra_cost->alu.bfx;
12584 	  return true;
12585 	}
12586 
12587       if (speed)
12588 	{
12589 	  if (VECTOR_MODE_P (mode))
12590 	    {
12591 	      /* UMOV.  */
12592 	      *cost += extra_cost->vect.alu;
12593 	    }
12594 	  else
12595 	    {
12596 	      /* We generate an AND instead of UXTB/UXTH.  */
12597 	      *cost += extra_cost->alu.logical;
12598 	    }
12599 	}
12600       return false;
12601 
12602     case SIGN_EXTEND:
12603       if (MEM_P (XEXP (x, 0)))
12604 	{
12605 	  /* LDRSH.  */
12606 	  if (speed)
12607 	    {
12608 	      rtx address = XEXP (XEXP (x, 0), 0);
12609 	      *cost += extra_cost->ldst.load_sign_extend;
12610 
12611 	      *cost +=
12612 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
12613 						     0, speed));
12614 	    }
12615 	  return true;
12616 	}
12617 
12618       op0 = aarch64_extend_bitfield_pattern_p (x);
12619       if (op0)
12620 	{
12621 	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12622 	  if (speed)
12623 	    *cost += extra_cost->alu.bfx;
12624 	  return true;
12625 	}
12626 
12627       if (speed)
12628 	{
12629 	  if (VECTOR_MODE_P (mode))
12630 	    *cost += extra_cost->vect.alu;
12631 	  else
12632 	    *cost += extra_cost->alu.extend;
12633 	}
12634       return false;
12635 
12636     case ASHIFT:
12637       op0 = XEXP (x, 0);
12638       op1 = XEXP (x, 1);
12639 
12640       if (CONST_INT_P (op1))
12641         {
12642 	  if (speed)
12643 	    {
12644 	      if (VECTOR_MODE_P (mode))
12645 		{
12646 		  /* Vector shift (immediate).  */
12647 		  *cost += extra_cost->vect.alu;
12648 		}
12649 	      else
12650 		{
12651 		  /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
12652 		     aliases.  */
12653 		  *cost += extra_cost->alu.shift;
12654 		}
12655 	    }
12656 
12657           /* We can incorporate zero/sign extend for free.  */
12658           if (GET_CODE (op0) == ZERO_EXTEND
12659               || GET_CODE (op0) == SIGN_EXTEND)
12660             op0 = XEXP (op0, 0);
12661 
12662 	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12663           return true;
12664         }
12665       else
12666         {
12667 	  if (VECTOR_MODE_P (mode))
12668 	    {
12669 	      if (speed)
12670 		/* Vector shift (register).  */
12671 		*cost += extra_cost->vect.alu;
12672 	    }
12673 	  else
12674 	    {
12675 	      if (speed)
12676 		/* LSLV.  */
12677 		*cost += extra_cost->alu.shift_reg;
12678 
12679 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12680 		  && CONST_INT_P (XEXP (op1, 1))
12681 		  && known_eq (INTVAL (XEXP (op1, 1)),
12682 			       GET_MODE_BITSIZE (mode) - 1))
12683 		{
12684 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12685 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
12686 		     don't recurse into it.  */
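		  /* The AND is free here: LSLV only uses the shift amount
		     modulo the register width, so masking with
		     GET_MODE_BITSIZE (mode) - 1 changes nothing.  */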
12687 		  return true;
12688 		}
12689 	    }
12690 	  return false;  /* All arguments need to be in registers.  */
12691         }
12692 
12693     case ROTATE:
12694     case ROTATERT:
12695     case LSHIFTRT:
12696     case ASHIFTRT:
12697       op0 = XEXP (x, 0);
12698       op1 = XEXP (x, 1);
12699 
12700       if (CONST_INT_P (op1))
12701 	{
12702 	  /* ASR (immediate) and friends.  */
12703 	  if (speed)
12704 	    {
12705 	      if (VECTOR_MODE_P (mode))
12706 		*cost += extra_cost->vect.alu;
12707 	      else
12708 		*cost += extra_cost->alu.shift;
12709 	    }
12710 
12711 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12712 	  return true;
12713 	}
12714       else
12715 	{
12716 	  if (VECTOR_MODE_P (mode))
12717 	    {
12718 	      if (speed)
12719 		/* Vector shift (register).  */
12720 		*cost += extra_cost->vect.alu;
12721 	    }
12722 	  else
12723 	    {
12724 	      if (speed)
12725 		/* ASR (register) and friends.  */
12726 		*cost += extra_cost->alu.shift_reg;
12727 
12728 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12729 		  && CONST_INT_P (XEXP (op1, 1))
12730 		  && known_eq (INTVAL (XEXP (op1, 1)),
12731 			       GET_MODE_BITSIZE (mode) - 1))
12732 		{
12733 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12734 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
12735 		     don't recurse into it.  */
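		  /* As for LSLV above: ASRV/LSRV/RORV use the shift amount
		     modulo the register width, so the masking AND is free.  */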
12736 		  return true;
12737 		}
12738 	    }
12739 	  return false;  /* All arguments need to be in registers.  */
12740 	}
12741 
12742     case SYMBOL_REF:
12743 
12744       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12745 	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12746 	{
12747 	  /* LDR.  */
12748 	  if (speed)
12749 	    *cost += extra_cost->ldst.load;
12750 	}
12751       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12752 	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12753 	{
12754 	  /* ADRP, followed by ADD.  */
12755 	  *cost += COSTS_N_INSNS (1);
12756 	  if (speed)
12757 	    *cost += 2 * extra_cost->alu.arith;
12758 	}
12759       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12760 	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12761 	{
12762 	  /* ADR.  */
12763 	  if (speed)
12764 	    *cost += extra_cost->alu.arith;
12765 	}
12766 
12767       if (flag_pic)
12768 	{
12769 	  /* One extra load instruction, after accessing the GOT.  */
12770 	  *cost += COSTS_N_INSNS (1);
12771 	  if (speed)
12772 	    *cost += extra_cost->ldst.load;
12773 	}
12774       return true;
12775 
12776     case HIGH:
12777     case LO_SUM:
12778       /* ADRP/ADD (immediate).  */
12779       if (speed)
12780 	*cost += extra_cost->alu.arith;
12781       return true;
12782 
12783     case ZERO_EXTRACT:
12784     case SIGN_EXTRACT:
12785       /* UBFX/SBFX.  */
12786       if (speed)
12787 	{
12788 	  if (VECTOR_MODE_P (mode))
12789 	    *cost += extra_cost->vect.alu;
12790 	  else
12791 	    *cost += extra_cost->alu.bfx;
12792 	}
12793 
12794       /* We can trust that the immediates used will be correct (there
12795 	 are no by-register forms), so we need only cost op0.  */
12796       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12797       return true;
12798 
12799     case MULT:
12800       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12801       /* aarch64_rtx_mult_cost always handles recursion to its
12802 	 operands.  */
12803       return true;
12804 
12805     case MOD:
12806     /* We can expand signed mod by power of 2 using a NEGS, two parallel
12807        ANDs and a CSNEG.  Assume here that a CSNEG costs the same as an
12808        unconditional negate.  This case should only ever be reached through
12809        the set_smod_pow2_cheap check in expmed.c.  */
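    /* For example (illustrative), x % 4 in SImode expands roughly to:
	   negs  w1, w0
	   and   w0, w0, #3
	   and   w1, w1, #3
	   csneg w0, w0, w1, mi  */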
12810       if (CONST_INT_P (XEXP (x, 1))
12811 	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12812 	  && (mode == SImode || mode == DImode))
12813 	{
12814 	  /* We expand to 4 instructions.  Reset the baseline.  */
12815 	  *cost = COSTS_N_INSNS (4);
12816 
12817 	  if (speed)
12818 	    *cost += 2 * extra_cost->alu.logical
12819 		     + 2 * extra_cost->alu.arith;
12820 
12821 	  return true;
12822 	}
12823 
12824     /* Fall-through.  */
12825     case UMOD:
12826       if (speed)
12827 	{
12828 	  /* Slightly prefer UMOD over SMOD.  */
12829 	  if (VECTOR_MODE_P (mode))
12830 	    *cost += extra_cost->vect.alu;
12831 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
12832 	    *cost += (extra_cost->mult[mode == DImode].add
12833 		      + extra_cost->mult[mode == DImode].idiv
12834 		      + (code == MOD ? 1 : 0));
12835 	}
12836       return false;  /* All arguments need to be in registers.  */
12837 
12838     case DIV:
12839     case UDIV:
12840     case SQRT:
12841       if (speed)
12842 	{
12843 	  if (VECTOR_MODE_P (mode))
12844 	    *cost += extra_cost->vect.alu;
12845 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
12846 	    /* There is no integer SQRT, so only DIV and UDIV can get
12847 	       here.  */
12848 	    *cost += (extra_cost->mult[mode == DImode].idiv
12849 		     /* Slightly prefer UDIV over SDIV.  */
12850 		     + (code == DIV ? 1 : 0));
12851 	  else
12852 	    *cost += extra_cost->fp[mode == DFmode].div;
12853 	}
12854       return false;  /* All arguments need to be in registers.  */
12855 
12856     case IF_THEN_ELSE:
12857       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12858 					 XEXP (x, 2), cost, speed);
12859 
12860     case EQ:
12861     case NE:
12862     case GT:
12863     case GTU:
12864     case LT:
12865     case LTU:
12866     case GE:
12867     case GEU:
12868     case LE:
12869     case LEU:
12870 
12871       return false; /* All arguments must be in registers.  */
12872 
12873     case FMA:
12874       op0 = XEXP (x, 0);
12875       op1 = XEXP (x, 1);
12876       op2 = XEXP (x, 2);
12877 
12878       if (speed)
12879 	{
12880 	  if (VECTOR_MODE_P (mode))
12881 	    *cost += extra_cost->vect.alu;
12882 	  else
12883 	    *cost += extra_cost->fp[mode == DFmode].fma;
12884 	}
12885 
12886       /* FMSUB, FNMADD, and FNMSUB are free.  */
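      /* For example, (fma (neg a) b c) is -a*b + c (FMSUB),
	 (fma a b (neg c)) is a*b - c (FNMSUB), and negating both
	 operands gives FNMADD; the NEGs fold into the instruction,
	 so strip them before costing the operands.  */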
12887       if (GET_CODE (op0) == NEG)
12888         op0 = XEXP (op0, 0);
12889 
12890       if (GET_CODE (op2) == NEG)
12891         op2 = XEXP (op2, 0);
12892 
12893       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12894 	 and the by-element operand as operand 0.  */
12895       if (GET_CODE (op1) == NEG)
12896         op1 = XEXP (op1, 0);
12897 
12898       /* Catch vector-by-element operations.  The by-element operand can
12899 	 either be (vec_duplicate (vec_select (x))) or just
12900 	 (vec_select (x)), depending on whether we are multiplying by
12901 	 a vector or a scalar.
12902 
12903 	 Canonicalization is not very good in these cases: FMA4 will put the
12904 	 by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
12905       if (GET_CODE (op0) == VEC_DUPLICATE)
12906 	op0 = XEXP (op0, 0);
12907       else if (GET_CODE (op1) == VEC_DUPLICATE)
12908 	op1 = XEXP (op1, 0);
12909 
12910       if (GET_CODE (op0) == VEC_SELECT)
12911 	op0 = XEXP (op0, 0);
12912       else if (GET_CODE (op1) == VEC_SELECT)
12913 	op1 = XEXP (op1, 0);
12914 
12915       /* If the remaining parameters are not registers,
12916          get the cost to put them into registers.  */
12917       *cost += rtx_cost (op0, mode, FMA, 0, speed);
12918       *cost += rtx_cost (op1, mode, FMA, 1, speed);
12919       *cost += rtx_cost (op2, mode, FMA, 2, speed);
12920       return true;
12921 
12922     case FLOAT:
12923     case UNSIGNED_FLOAT:
12924       if (speed)
12925 	*cost += extra_cost->fp[mode == DFmode].fromint;
12926       return false;
12927 
12928     case FLOAT_EXTEND:
12929       if (speed)
12930 	{
12931 	  if (VECTOR_MODE_P (mode))
12932 	    {
12933 	      /* Vector widening conversion.  */
12934 	      *cost += extra_cost->vect.alu;
12935 	    }
12936 	  else
12937 	    *cost += extra_cost->fp[mode == DFmode].widen;
12938 	}
12939       return false;
12940 
12941     case FLOAT_TRUNCATE:
12942       if (speed)
12943 	{
12944 	  if (VECTOR_MODE_P (mode))
12945 	    {
12946 	      /* Vector narrowing conversion.  */
12947 	      *cost += extra_cost->vect.alu;
12948 	    }
12949 	  else
12950 	    *cost += extra_cost->fp[mode == DFmode].narrow;
12951 	}
12952       return false;
12953 
12954     case FIX:
12955     case UNSIGNED_FIX:
12956       x = XEXP (x, 0);
12957       /* Strip the rounding part.  They will all be implemented
12958          by the fcvt* family of instructions anyway.  */
12959       if (GET_CODE (x) == UNSPEC)
12960         {
12961           unsigned int uns_code = XINT (x, 1);
12962 
12963           if (uns_code == UNSPEC_FRINTA
12964               || uns_code == UNSPEC_FRINTM
12965               || uns_code == UNSPEC_FRINTN
12966               || uns_code == UNSPEC_FRINTP
12967               || uns_code == UNSPEC_FRINTZ)
12968             x = XVECEXP (x, 0, 0);
12969         }
12970 
12971       if (speed)
12972 	{
12973 	  if (VECTOR_MODE_P (mode))
12974 	    *cost += extra_cost->vect.alu;
12975 	  else
12976 	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12977 	}
12978 
12979       /* We can combine fmul by a power of 2 followed by a fcvt into a single
12980 	 fixed-point fcvt.  */
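      /* E.g. (fix (mult x 65536.0)) becomes FCVTZS with 16 fractional
	 bits (illustrative scale factor).  */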
12981       if (GET_CODE (x) == MULT
12982 	  && ((VECTOR_MODE_P (mode)
12983 	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12984 	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12985 	{
12986 	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12987 			     0, speed);
12988 	  return true;
12989 	}
12990 
12991       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12992       return true;
12993 
12994     case ABS:
12995       if (VECTOR_MODE_P (mode))
12996 	{
12997 	  /* ABS (vector).  */
12998 	  if (speed)
12999 	    *cost += extra_cost->vect.alu;
13000 	}
13001       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13002 	{
13003 	  op0 = XEXP (x, 0);
13004 
13005 	  /* FABD, which is analogous to FADD.  */
13006 	  if (GET_CODE (op0) == MINUS)
13007 	    {
13008 	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13009 	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
13010 	      if (speed)
13011 		*cost += extra_cost->fp[mode == DFmode].addsub;
13012 
13013 	      return true;
13014 	    }
13015 	  /* Simple FABS is analogous to FNEG.  */
13016 	  if (speed)
13017 	    *cost += extra_cost->fp[mode == DFmode].neg;
13018 	}
13019       else
13020 	{
13021 	  /* Integer ABS will either be split to
13022 	     two arithmetic instructions, or will be an ABS
13023 	     (scalar), which we don't model.  */
13024 	  *cost = COSTS_N_INSNS (2);
13025 	  if (speed)
13026 	    *cost += 2 * extra_cost->alu.arith;
13027 	}
13028       return false;
13029 
13030     case SMAX:
13031     case SMIN:
13032       if (speed)
13033 	{
13034 	  if (VECTOR_MODE_P (mode))
13035 	    *cost += extra_cost->vect.alu;
13036 	  else
13037 	    {
13038 	      /* FMAXNM/FMINNM/FMAX/FMIN.
13039 	         TODO: This may not be accurate for all implementations, but
13040 	         we do not model this in the cost tables.  */
13041 	      *cost += extra_cost->fp[mode == DFmode].addsub;
13042 	    }
13043 	}
13044       return false;
13045 
13046     case UNSPEC:
13047       /* The floating point round to integer frint* instructions.  */
13048       if (aarch64_frint_unspec_p (XINT (x, 1)))
13049         {
13050           if (speed)
13051             *cost += extra_cost->fp[mode == DFmode].roundint;
13052 
13053           return false;
13054         }
13055 
13056       if (XINT (x, 1) == UNSPEC_RBIT)
13057         {
13058           if (speed)
13059             *cost += extra_cost->alu.rev;
13060 
13061           return false;
13062         }
13063       break;
13064 
13065     case TRUNCATE:
13066 
13067       /* Decompose <su>muldi3_highpart.  */
13068       if (/* (truncate:DI  */
13069 	  mode == DImode
13070 	  /*   (lshiftrt:TI  */
13071           && GET_MODE (XEXP (x, 0)) == TImode
13072           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13073 	  /*      (mult:TI  */
13074           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13075 	  /*        (ANY_EXTEND:TI (reg:DI))
13076 	            (ANY_EXTEND:TI (reg:DI)))  */
13077           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13078                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13079               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13080                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13081           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13082           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13083 	  /*     (const_int 64)  */
13084           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13085           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13086         {
13087           /* UMULH/SMULH.  */
13088 	  if (speed)
13089 	    *cost += extra_cost->mult[mode == DImode].extend;
13090 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13091 			     mode, MULT, 0, speed);
13092 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13093 			     mode, MULT, 1, speed);
13094           return true;
13095         }
13096 
13097       /* Fall through.  */
13098     default:
13099       break;
13100     }
13101 
13102   if (dump_file
13103       && flag_aarch64_verbose_cost)
13104     fprintf (dump_file,
13105       "\nFailed to cost RTX.  Assuming default cost.\n");
13106 
13107   return true;
13108 }
13109 
13110 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
13111    calculated for X.  This cost is stored in *COST.  Returns true
13112    if the total cost of X was calculated.  */
13113 static bool
13114 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
13115 		   int param, int *cost, bool speed)
13116 {
13117   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
13118 
13119   if (dump_file
13120       && flag_aarch64_verbose_cost)
13121     {
13122       print_rtl_single (dump_file, x);
13123       fprintf (dump_file, "\n%s cost: %d (%s)\n",
13124 	       speed ? "Hot" : "Cold",
13125 	       *cost, result ? "final" : "partial");
13126     }
13127 
13128   return result;
13129 }
13130 
13131 static int
13132 aarch64_register_move_cost (machine_mode mode,
13133 			    reg_class_t from_i, reg_class_t to_i)
13134 {
13135   enum reg_class from = (enum reg_class) from_i;
13136   enum reg_class to = (enum reg_class) to_i;
13137   const struct cpu_regmove_cost *regmove_cost
13138     = aarch64_tune_params.regmove_cost;
13139 
13140   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
13141   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
13142     to = GENERAL_REGS;
13143 
13144   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
13145     from = GENERAL_REGS;
13146 
13147   /* Make RDFFR very expensive.  In particular, if we know that the FFR
13148      contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13149      as a way of obtaining a PTRUE.  */
13150   if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13151       && hard_reg_set_subset_p (reg_class_contents[from_i],
13152 				reg_class_contents[FFR_REGS]))
13153     return 80;
13154 
13155   /* Moving between a GPR and the stack register costs the same as GP2GP.  */
13156   if ((from == GENERAL_REGS && to == STACK_REG)
13157       || (to == GENERAL_REGS && from == STACK_REG))
13158     return regmove_cost->GP2GP;
13159 
13160   /* To/From the stack register, we move via the gprs.  */
13161   if (to == STACK_REG || from == STACK_REG)
13162     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13163             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13164 
13165   if (known_eq (GET_MODE_SIZE (mode), 16))
13166     {
13167       /* 128-bit operations on general registers require 2 instructions.  */
13168       if (from == GENERAL_REGS && to == GENERAL_REGS)
13169 	return regmove_cost->GP2GP * 2;
13170       else if (from == GENERAL_REGS)
13171 	return regmove_cost->GP2FP * 2;
13172       else if (to == GENERAL_REGS)
13173 	return regmove_cost->FP2GP * 2;
13174 
13175       /* When AdvSIMD instructions are disabled it is not possible to move
13176 	 a 128-bit value directly between Q registers.  This is handled in
13177 	 secondary reload.  A general register is used as a scratch to move
13178 	 the upper DI value and the lower DI value is moved directly,
13179 	 hence the cost is the sum of three moves. */
13180       if (! TARGET_SIMD)
13181 	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13182 
13183       return regmove_cost->FP2FP;
13184     }
13185 
13186   if (from == GENERAL_REGS && to == GENERAL_REGS)
13187     return regmove_cost->GP2GP;
13188   else if (from == GENERAL_REGS)
13189     return regmove_cost->GP2FP;
13190   else if (to == GENERAL_REGS)
13191     return regmove_cost->FP2GP;
13192 
13193   return regmove_cost->FP2FP;
13194 }
13195 
13196 static int
13197 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13198 			  reg_class_t rclass ATTRIBUTE_UNUSED,
13199 			  bool in ATTRIBUTE_UNUSED)
13200 {
13201   return aarch64_tune_params.memmov_cost;
13202 }
13203 
13204 /* Implement TARGET_INIT_BUILTINS.  */
13205 static void
13206 aarch64_init_builtins ()
13207 {
13208   aarch64_general_init_builtins ();
13209   aarch64_sve::init_builtins ();
13210 }
13211 
13212 /* Implement TARGET_FOLD_BUILTIN.  */
13213 static tree
13214 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13215 {
13216   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13217   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13218   tree type = TREE_TYPE (TREE_TYPE (fndecl));
13219   switch (code & AARCH64_BUILTIN_CLASS)
13220     {
13221     case AARCH64_BUILTIN_GENERAL:
13222       return aarch64_general_fold_builtin (subcode, type, nargs, args);
13223 
13224     case AARCH64_BUILTIN_SVE:
13225       return NULL_TREE;
13226     }
13227   gcc_unreachable ();
13228 }
13229 
13230 /* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
13231 static bool
13232 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13233 {
13234   gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13235   tree fndecl = gimple_call_fndecl (stmt);
13236   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13237   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13238   gimple *new_stmt = NULL;
13239   switch (code & AARCH64_BUILTIN_CLASS)
13240     {
13241     case AARCH64_BUILTIN_GENERAL:
13242       new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13243       break;
13244 
13245     case AARCH64_BUILTIN_SVE:
13246       new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13247       break;
13248     }
13249 
13250   if (!new_stmt)
13251     return false;
13252 
13253   gsi_replace (gsi, new_stmt, true);
13254   return true;
13255 }
13256 
13257 /* Implement TARGET_EXPAND_BUILTIN.  */
13258 static rtx
13259 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13260 {
13261   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13262   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13263   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13264   switch (code & AARCH64_BUILTIN_CLASS)
13265     {
13266     case AARCH64_BUILTIN_GENERAL:
13267       return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13268 
13269     case AARCH64_BUILTIN_SVE:
13270       return aarch64_sve::expand_builtin (subcode, exp, target);
13271     }
13272   gcc_unreachable ();
13273 }
13274 
13275 /* Implement TARGET_BUILTIN_DECL.  */
13276 static tree
13277 aarch64_builtin_decl (unsigned int code, bool initialize_p)
13278 {
13279   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13280   switch (code & AARCH64_BUILTIN_CLASS)
13281     {
13282     case AARCH64_BUILTIN_GENERAL:
13283       return aarch64_general_builtin_decl (subcode, initialize_p);
13284 
13285     case AARCH64_BUILTIN_SVE:
13286       return aarch64_sve::builtin_decl (subcode, initialize_p);
13287     }
13288   gcc_unreachable ();
13289 }
13290 
13291 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13292    to optimize 1.0/sqrt.  */
13293 
13294 static bool
13295 use_rsqrt_p (machine_mode mode)
13296 {
13297   return (!flag_trapping_math
13298 	  && flag_unsafe_math_optimizations
13299 	  && ((aarch64_tune_params.approx_modes->recip_sqrt
13300 	       & AARCH64_APPROX_MODE (mode))
13301 	      || flag_mrecip_low_precision_sqrt));
13302 }
13303 
13304 /* Function to decide when to use the approximate reciprocal square root
13305    builtin.  */
13306 
13307 static tree
13308 aarch64_builtin_reciprocal (tree fndecl)
13309 {
13310   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13311 
13312   if (!use_rsqrt_p (mode))
13313     return NULL_TREE;
13314   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13315   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13316   switch (code & AARCH64_BUILTIN_CLASS)
13317     {
13318     case AARCH64_BUILTIN_GENERAL:
13319       return aarch64_general_builtin_rsqrt (subcode);
13320 
13321     case AARCH64_BUILTIN_SVE:
13322       return NULL_TREE;
13323     }
13324   gcc_unreachable ();
13325 }
13326 
13327 /* Emit code to perform the floating-point operation:
13328 
13329      DST = SRC1 * SRC2
13330 
13331    where all three operands are already known to be registers.
13332    If the operation is an SVE one, PTRUE is a suitable all-true
13333    predicate.  */
13334 
13335 static void
13336 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13337 {
13338   if (ptrue)
13339     emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13340 				 dst, ptrue, src1, src2,
13341 				 gen_int_mode (SVE_RELAXED_GP, SImode)));
13342   else
13343     emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13344 }
13345 
13346 /* Emit instruction sequence to compute either the approximate square root
13347    or its approximate reciprocal, depending on the flag RECP, and return
13348    whether the sequence was emitted or not.  */
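/* The initial FRSQRTE estimate is refined with Newton-Raphson steps:
   FRSQRTS computes (3 - d * x * x) / 2, and each x' = x * FRSQRTS step
   roughly doubles the number of correct bits.  */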
13349 
13350 bool
13351 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
13352 {
13353   machine_mode mode = GET_MODE (dst);
13354 
13355   if (GET_MODE_INNER (mode) == HFmode)
13356     {
13357       gcc_assert (!recp);
13358       return false;
13359     }
13360 
13361   if (!recp)
13362     {
13363       if (!(flag_mlow_precision_sqrt
13364 	    || (aarch64_tune_params.approx_modes->sqrt
13365 		& AARCH64_APPROX_MODE (mode))))
13366 	return false;
13367 
13368       if (!flag_finite_math_only
13369 	  || flag_trapping_math
13370 	  || !flag_unsafe_math_optimizations
13371 	  || optimize_function_for_size_p (cfun))
13372 	return false;
13373     }
13374   else
13375     /* Caller assumes we cannot fail.  */
13376     gcc_assert (use_rsqrt_p (mode));
13377 
13378   rtx pg = NULL_RTX;
13379   if (aarch64_sve_mode_p (mode))
13380     pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13381   machine_mode mmsk = (VECTOR_MODE_P (mode)
13382 		       ? related_int_vector_mode (mode).require ()
13383 		       : int_mode_for_mode (mode).require ());
13384   rtx xmsk = NULL_RTX;
13385   if (!recp)
13386     {
13387       /* When calculating the approximate square root, compare the
13388 	 argument with 0.0 and create a mask.  */
13389       rtx zero = CONST0_RTX (mode);
13390       if (pg)
13391 	{
13392 	  xmsk = gen_reg_rtx (GET_MODE (pg));
13393 	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13394 	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13395 					   xmsk, pg, hint, src, zero));
13396 	}
13397       else
13398 	{
13399 	  xmsk = gen_reg_rtx (mmsk);
13400 	  emit_insn (gen_rtx_SET (xmsk,
13401 				  gen_rtx_NEG (mmsk,
13402 					       gen_rtx_EQ (mmsk, src, zero))));
13403 	}
13404     }
13405 
13406   /* Estimate the approximate reciprocal square root.  */
13407   rtx xdst = gen_reg_rtx (mode);
13408   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
13409 
13410   /* Iterate over the series twice for SF and thrice for DF.  */
13411   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13412 
13413   /* Optionally iterate over the series once less for faster performance,
13414      at the cost of some accuracy.  */
13415   if ((recp && flag_mrecip_low_precision_sqrt)
13416       || (!recp && flag_mlow_precision_sqrt))
13417     iterations--;
13418 
13419   /* Iterate over the series to calculate the approximate reciprocal square
13420      root.  */
13421   rtx x1 = gen_reg_rtx (mode);
13422   while (iterations--)
13423     {
13424       rtx x2 = gen_reg_rtx (mode);
13425       aarch64_emit_mult (x2, pg, xdst, xdst);
13426 
13427       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
13428 
13429       if (iterations > 0)
13430 	aarch64_emit_mult (xdst, pg, xdst, x1);
13431     }
13432 
13433   if (!recp)
13434     {
13435       if (pg)
13436 	/* Multiply nonzero source values by the corresponding intermediate
13437 	   result elements, so that the final calculation is the approximate
13438 	   square root rather than its reciprocal.  Select a zero result for
13439 	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
13440 	   otherwise.  */
13441 	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13442 			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13443       else
13444 	{
13445 	  /* Qualify the approximate reciprocal square root when the
13446 	     argument is 0.0 by squashing the intermediary result to 0.0.  */
13447 	  rtx xtmp = gen_reg_rtx (mmsk);
13448 	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13449 					    gen_rtx_SUBREG (mmsk, xdst, 0)));
13450 	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
13451 
13452 	  /* Calculate the approximate square root.  */
13453 	  aarch64_emit_mult (xdst, pg, xdst, src);
13454 	}
13455     }
13456 
13457   /* Finalize the approximation.  */
13458   aarch64_emit_mult (dst, pg, xdst, x1);
13459 
13460   return true;
13461 }
13462 
13463 /* Emit the instruction sequence to compute the approximation for the division
13464    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
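/* The initial FRECPE estimate is refined with Newton-Raphson steps:
   FRECPS computes 2 - d * x, and each x' = x * FRECPS step roughly
   doubles the number of correct bits of the reciprocal.  */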
13465 
13466 bool
13467 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13468 {
13469   machine_mode mode = GET_MODE (quo);
13470 
13471   if (GET_MODE_INNER (mode) == HFmode)
13472     return false;
13473 
13474   bool use_approx_division_p = (flag_mlow_precision_div
13475 			        || (aarch64_tune_params.approx_modes->division
13476 				    & AARCH64_APPROX_MODE (mode)));
13477 
13478   if (!flag_finite_math_only
13479       || flag_trapping_math
13480       || !flag_unsafe_math_optimizations
13481       || optimize_function_for_size_p (cfun)
13482       || !use_approx_division_p)
13483     return false;
13484 
13485   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13486     return false;
13487 
13488   rtx pg = NULL_RTX;
13489   if (aarch64_sve_mode_p (mode))
13490     pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13491 
13492   /* Estimate the approximate reciprocal.  */
13493   rtx xrcp = gen_reg_rtx (mode);
13494   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
13495 
13496   /* Iterate over the series twice for SF and thrice for DF.  */
13497   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13498 
13499   /* Optionally iterate over the series fewer times for faster performance,
13500      at the cost of some accuracy; the default is 2 for DF and 1 for SF.  */
13501   if (flag_mlow_precision_div)
13502     iterations = (GET_MODE_INNER (mode) == DFmode
13503 		  ? aarch64_double_recp_precision
13504 		  : aarch64_float_recp_precision);
13505 
13506   /* Iterate over the series to calculate the approximate reciprocal.  */
13507   rtx xtmp = gen_reg_rtx (mode);
13508   while (iterations--)
13509     {
13510       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
13511 
13512       if (iterations > 0)
13513 	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
13514     }
13515 
13516   if (num != CONST1_RTX (mode))
13517     {
13518       /* As the approximate reciprocal of DEN is already calculated, only
13519 	 calculate the approximate division when NUM is not 1.0.  */
13520       rtx xnum = force_reg (mode, num);
13521       aarch64_emit_mult (xrcp, pg, xrcp, xnum);
13522     }
13523 
13524   /* Finalize the approximation.  */
13525   aarch64_emit_mult (quo, pg, xrcp, xtmp);
13526   return true;
13527 }
13528 
13529 /* Return the number of instructions that can be issued per cycle.  */
13530 static int
13531 aarch64_sched_issue_rate (void)
13532 {
13533   return aarch64_tune_params.issue_rate;
13534 }
13535 
13536 /* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
13537 static int
13538 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13539 {
13540   if (DEBUG_INSN_P (insn))
13541     return more;
13542 
13543   rtx_code code = GET_CODE (PATTERN (insn));
13544   if (code == USE || code == CLOBBER)
13545     return more;
13546 
13547   if (get_attr_type (insn) == TYPE_NO_INSN)
13548     return more;
13549 
13550   return more - 1;
13551 }
13552 
13553 static int
13554 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13555 {
13556   int issue_rate = aarch64_sched_issue_rate ();
13557 
13558   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13559 }
13560 
13561 
13562 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13563    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
13564    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
13565 
13566 static int
13567 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13568 						    int ready_index)
13569 {
13570   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13571 }
13572 
13573 
13574 /* Vectorizer cost model target hooks.  */
13575 
13576 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
13577 static int
13578 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13579 				    tree vectype,
13580 				    int misalign ATTRIBUTE_UNUSED)
13581 {
13582   unsigned elements;
13583   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13584   bool fp = false;
13585 
13586   if (vectype != NULL)
13587     fp = FLOAT_TYPE_P (vectype);
13588 
13589   switch (type_of_cost)
13590     {
13591       case scalar_stmt:
13592 	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13593 
13594       case scalar_load:
13595 	return costs->scalar_load_cost;
13596 
13597       case scalar_store:
13598 	return costs->scalar_store_cost;
13599 
13600       case vector_stmt:
13601 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13602 
13603       case vector_load:
13604 	return costs->vec_align_load_cost;
13605 
13606       case vector_store:
13607 	return costs->vec_store_cost;
13608 
13609       case vec_to_scalar:
13610 	return costs->vec_to_scalar_cost;
13611 
13612       case scalar_to_vec:
13613 	return costs->scalar_to_vec_cost;
13614 
13615       case unaligned_load:
13616       case vector_gather_load:
13617 	return costs->vec_unalign_load_cost;
13618 
13619       case unaligned_store:
13620       case vector_scatter_store:
13621 	return costs->vec_unalign_store_cost;
13622 
13623       case cond_branch_taken:
13624 	return costs->cond_taken_branch_cost;
13625 
13626       case cond_branch_not_taken:
13627 	return costs->cond_not_taken_branch_cost;
13628 
13629       case vec_perm:
13630 	return costs->vec_permute_cost;
13631 
13632       case vec_promote_demote:
13633 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13634 
13635       case vec_construct:
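	/* Building an N-element vector from scalars is costed at roughly
	   one operation per pair of elements plus one, i.e. N / 2 + 1.  */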
13636 	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
13637 	return elements / 2 + 1;
13638 
13639       default:
13640 	gcc_unreachable ();
13641     }
13642 }
13643 
13644 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
13645    vectors would produce a series of LDP or STP operations.  KIND is the
13646    kind of statement that STMT_INFO represents.  */
13647 static bool
13648 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
13649 			   stmt_vec_info stmt_info)
13650 {
13651   switch (kind)
13652     {
13653     case vector_load:
13654     case vector_store:
13655     case unaligned_load:
13656     case unaligned_store:
13657       break;
13658 
13659     default:
13660       return false;
13661     }
13662 
13663   if (aarch64_tune_params.extra_tuning_flags
13664       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
13665     return false;
13666 
13667   return is_gimple_assign (stmt_info->stmt);
13668 }
13669 
13670 /* Return true if STMT_INFO extends the result of a load.  */
13671 static bool
13672 aarch64_extending_load_p (stmt_vec_info stmt_info)
13673 {
13674   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13675   if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13676     return false;
13677 
13678   tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13679   tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13680   tree rhs_type = TREE_TYPE (rhs);
13681   if (!INTEGRAL_TYPE_P (lhs_type)
13682       || !INTEGRAL_TYPE_P (rhs_type)
13683       || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13684     return false;
13685 
13686   stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
13687   return (def_stmt_info
13688 	  && STMT_VINFO_DATA_REF (def_stmt_info)
13689 	  && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13690 }
13691 
13692 /* Return true if STMT_INFO is an integer truncation.  */
13693 static bool
13694 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13695 {
13696   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13697   if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13698     return false;
13699 
13700   tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13701   tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13702   return (INTEGRAL_TYPE_P (lhs_type)
13703 	  && INTEGRAL_TYPE_P (rhs_type)
13704 	  && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13705 }
13706 
13707 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13708    for STMT_INFO, which has cost kind KIND and which when vectorized would
13709    operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
13710    targets.  */
13711 static unsigned int
13712 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind,
13713 			      stmt_vec_info stmt_info, tree vectype,
13714 			      unsigned int stmt_cost)
13715 {
13716   /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13717      vector register size or number of units.  Integer promotions of this
13718      type therefore map to SXT[BHW] or UXT[BHW].
13719 
13720      Most loads have extending forms that can do the sign or zero extension
13721      on the fly.  Optimistically assume that a load followed by an extension
13722      will fold to this form during combine, and that the extension therefore
13723      comes for free.  */
13724   if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13725     stmt_cost = 0;
13726 
13727   /* For similar reasons, vector_stmt integer truncations are a no-op,
13728      because we can just ignore the unused upper bits of the source.  */
13729   if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13730     stmt_cost = 0;
13731 
13732   /* Advanced SIMD can load and store pairs of registers using LDP and STP,
13733      but there are no equivalent instructions for SVE.  This means that
13734      (all other things being equal) 128-bit SVE needs twice as many load
13735      and store instructions as Advanced SIMD in order to process vector pairs.
13736 
13737      Also, scalar code can often use LDP and STP to access pairs of values,
13738      so it is too simplistic to say that one SVE load or store replaces
13739      VF scalar loads and stores.
13740 
13741      Ideally we would account for this in the scalar and Advanced SIMD
13742      costs by making suitable load/store pairs as cheap as a single
13743      load/store.  However, that would be a very invasive change and in
13744      practice it tends to stress other parts of the cost model too much.
13745      E.g. stores of scalar constants currently count just a store,
13746      whereas stores of vector constants count a store and a vec_init.
13747      This is an artificial distinction for AArch64, where stores of
13748      nonzero scalar constants need the same kind of register invariant
13749      as vector stores.
13750 
13751      An alternative would be to double the cost of any SVE loads and stores
13752      that could be paired in Advanced SIMD (and possibly also paired in
13753      scalar code).  But this tends to stress other parts of the cost model
13754      in the same way.  It also means that we can fall back to Advanced SIMD
13755      even if full-loop predication would have been useful.
13756 
13757      Here we go for a more conservative version: double the costs of SVE
13758      loads and stores if one iteration of the scalar loop processes enough
13759      elements for it to use a whole number of Advanced SIMD LDP or STP
13760      instructions.  This makes it very likely that the VF would be 1 for
13761      Advanced SIMD, and so no epilogue should be needed.  */
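  /* 256 bits is the size of one Advanced SIMD LDP/STP of Q registers,
     e.g. a group of four 64-bit or eight 32-bit elements per scalar
     iteration (illustrative).  */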
13762   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
13763     {
13764       stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
13765       unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
13766       unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
13767       if (multiple_p (count * elt_bits, 256)
13768 	  && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
13769 	stmt_cost *= 2;
13770     }
13771 
13772   return stmt_cost;
13773 }
13774 
13775 /* Implement targetm.vectorize.add_stmt_cost.  */
13776 static unsigned
13777 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13778 		       struct _stmt_vec_info *stmt_info, int misalign,
13779 		       enum vect_cost_model_location where)
13780 {
13781   unsigned *cost = (unsigned *) data;
13782   unsigned retval = 0;
13783 
13784   if (flag_vect_cost_model)
13785     {
13786       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13787       int stmt_cost =
13788 	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13789 
13790       if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13791 	stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, vectype,
13792 						  stmt_cost);
13793 
13794       /* Statements in an inner loop relative to the loop being
13795 	 vectorized are weighted more heavily.  The value here is
13796 	 arbitrary and could potentially be improved with analysis.  */
13797       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13798 	count *= 50; /*  FIXME  */
13799 
13800       retval = (unsigned) (count * stmt_cost);
13801       cost[where] += retval;
13802     }
13803 
13804   return retval;
13805 }
13806 
13807 static void initialize_aarch64_code_model (struct gcc_options *);
13808 
13809 /* Parse the TO_PARSE string and put the architecture struct that it
13810    selects into RES and the architectural features into ISA_FLAGS.
13811    Return an aarch64_parse_opt_result describing the parse result.
13812    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13813    When the TO_PARSE string contains an invalid extension,
13814    a copy of the string is created and stored to INVALID_EXTENSION.  */
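/* For example (illustrative), parsing "armv8.2-a+sve" matches the
   armv8.2-a architecture entry and hands "+sve" on to
   aarch64_parse_extension.  */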
13815 
13816 static enum aarch64_parse_opt_result
13817 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13818 		    uint64_t *isa_flags, std::string *invalid_extension)
13819 {
13820   const char *ext;
13821   const struct processor *arch;
13822   size_t len;
13823 
13824   ext = strchr (to_parse, '+');
13825 
13826   if (ext != NULL)
13827     len = ext - to_parse;
13828   else
13829     len = strlen (to_parse);
13830 
13831   if (len == 0)
13832     return AARCH64_PARSE_MISSING_ARG;
13833 
13834 
13835   /* Loop through the list of supported ARCHes to find a match.  */
13836   for (arch = all_architectures; arch->name != NULL; arch++)
13837     {
13838       if (strlen (arch->name) == len
13839 	  && strncmp (arch->name, to_parse, len) == 0)
13840 	{
13841 	  uint64_t isa_temp = arch->flags;
13842 
13843 	  if (ext != NULL)
13844 	    {
13845 	      /* TO_PARSE string contains at least one extension.  */
13846 	      enum aarch64_parse_opt_result ext_res
13847 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13848 
13849 	      if (ext_res != AARCH64_PARSE_OK)
13850 		return ext_res;
13851 	    }
13852 	  /* Extension parsing was successful.  Confirm the result
13853 	     arch and ISA flags.  */
13854 	  *res = arch;
13855 	  *isa_flags = isa_temp;
13856 	  return AARCH64_PARSE_OK;
13857 	}
13858     }
13859 
13860   /* ARCH name not found in list.  */
13861   return AARCH64_PARSE_INVALID_ARG;
13862 }
13863 
13864 /* Parse the TO_PARSE string and put the result tuning in RES and the
13865    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
13866    describing the parse result.  If there is an error parsing, RES and
13867    ISA_FLAGS are left unchanged.
13868    When the TO_PARSE string contains an invalid extension,
13869    a copy of the string is created and stored to INVALID_EXTENSION.  */
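/* For example, "cortex-a57+nofp" selects the Cortex-A57 entry in all_cores
   and then removes floating-point support from its default ISA flags via the
   "+nofp" modifier.  */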
13870 
13871 static enum aarch64_parse_opt_result
13872 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13873 		   uint64_t *isa_flags, std::string *invalid_extension)
13874 {
13875   const char *ext;
13876   const struct processor *cpu;
13877   size_t len;
13878 
13879   ext = strchr (to_parse, '+');
13880 
13881   if (ext != NULL)
13882     len = ext - to_parse;
13883   else
13884     len = strlen (to_parse);
13885 
13886   if (len == 0)
13887     return AARCH64_PARSE_MISSING_ARG;
13888 
13889 
13890   /* Loop through the list of supported CPUs to find a match.  */
13891   for (cpu = all_cores; cpu->name != NULL; cpu++)
13892     {
13893       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13894 	{
13895 	  uint64_t isa_temp = cpu->flags;
13896 
13897 
13898 	  if (ext != NULL)
13899 	    {
13900 	      /* TO_PARSE string contains at least one extension.  */
13901 	      enum aarch64_parse_opt_result ext_res
13902 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13903 
13904 	      if (ext_res != AARCH64_PARSE_OK)
13905 		return ext_res;
13906 	    }
13907 	  /* Extension parsing was successful.  Confirm the result
13908 	     cpu and ISA flags.  */
13909 	  *res = cpu;
13910 	  *isa_flags = isa_temp;
13911 	  return AARCH64_PARSE_OK;
13912 	}
13913     }
13914 
13915   /* CPU name not found in list.  */
13916   return AARCH64_PARSE_INVALID_ARG;
13917 }
13918 
13919 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13920    Return an aarch64_parse_opt_result describing the parse result.
13921    If the parsing fails, RES does not change.  */
13922 
13923 static enum aarch64_parse_opt_result
13924 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13925 {
13926   const struct processor *cpu;
13927 
13928   /* Loop through the list of supported CPUs to find a match.  */
13929   for (cpu = all_cores; cpu->name != NULL; cpu++)
13930     {
13931       if (strcmp (cpu->name, to_parse) == 0)
13932 	{
13933 	  *res = cpu;
13934 	  return AARCH64_PARSE_OK;
13935 	}
13936     }
13937 
13938   /* CPU name not found in list.  */
13939   return AARCH64_PARSE_INVALID_ARG;
13940 }
13941 
13942 /* Parse TOKEN, which has length LENGTH to see if it is an option
13943    described in FLAG.  If it is, return the index bit for that fusion type.
13944    If not, error (printing OPTION_NAME) and return zero.  */
13945 
13946 static unsigned int
13947 aarch64_parse_one_option_token (const char *token,
13948 				size_t length,
13949 				const struct aarch64_flag_desc *flag,
13950 				const char *option_name)
13951 {
13952   for (; flag->name != NULL; flag++)
13953     {
13954       if (length == strlen (flag->name)
13955 	  && !strncmp (flag->name, token, length))
13956 	return flag->flag;
13957     }
13958 
13959   error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13960   return 0;
13961 }
13962 
13963 /* Parse OPTION which is a comma-separated list of flags to enable.
13964    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13965    default state we inherit from the CPU tuning structures.  OPTION_NAME
13966    gives the top-level option we are parsing in the -moverride string,
13967    for use in error messages.  */
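/* For example, with FLAGS describing the fusible pairs, "adrp+add.cmp+branch"
   enables those two fusion types on top of INITIAL_STATE, while a "none"
   token anywhere in the list first resets the accumulated set.  */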
13968 
13969 static unsigned int
13970 aarch64_parse_boolean_options (const char *option,
13971 			       const struct aarch64_flag_desc *flags,
13972 			       unsigned int initial_state,
13973 			       const char *option_name)
13974 {
13975   const char separator = '.';
13976   const char* specs = option;
13977   const char* ntoken = option;
13978   unsigned int found_flags = initial_state;
13979 
13980   while ((ntoken = strchr (specs, separator)))
13981     {
13982       size_t token_length = ntoken - specs;
13983       unsigned token_ops = aarch64_parse_one_option_token (specs,
13984 							   token_length,
13985 							   flags,
13986 							   option_name);
13987       /* If we find "none" (or, for simplicity's sake, an error) anywhere
13988 	 in the token stream, reset the supported operations.  So:
13989 
13990 	   adrp+add.cmp+branch.none.adrp+add
13991 
13992 	   would have the result of turning on only adrp+add fusion.  */
13993       if (!token_ops)
13994 	found_flags = 0;
13995 
13996       found_flags |= token_ops;
13997       specs = ++ntoken;
13998     }
13999 
14000   /* The string ended with a trailing separator; report it as ill-formed.  */
14001   if (!(*specs))
14002     {
14003       error ("%s string ill-formed", option_name);
14004       return 0;
14005     }
14006 
14007   /* We still have one more token to parse.  */
14008   size_t token_length = strlen (specs);
14009   unsigned token_ops = aarch64_parse_one_option_token (specs,
14010 						       token_length,
14011 						       flags,
14012 						       option_name);
14013   if (!token_ops)
14014     found_flags = 0;
14015 
14016   found_flags |= token_ops;
14017   return found_flags;
14018 }
14019 
14020 /* Support for overriding instruction fusion.  */
14021 
14022 static void
14023 aarch64_parse_fuse_string (const char *fuse_string,
14024 			    struct tune_params *tune)
14025 {
14026   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
14027 						     aarch64_fusible_pairs,
14028 						     tune->fusible_ops,
14029 						     "fuse=");
14030 }
14031 
14032 /* Support for overriding other tuning flags.  */
14033 
14034 static void
14035 aarch64_parse_tune_string (const char *tune_string,
14036 			    struct tune_params *tune)
14037 {
14038   tune->extra_tuning_flags
14039     = aarch64_parse_boolean_options (tune_string,
14040 				     aarch64_tuning_flags,
14041 				     tune->extra_tuning_flags,
14042 				     "tune=");
14043 }
14044 
14045 /* Parse the sve_width tuning moverride string in TUNE_STRING.
14046    Accept the valid SVE vector widths allowed by
14047    aarch64_sve_vector_bits_enum and use it to override sve_width
14048    in TUNE.  */
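/* For example, "-moverride=sve_width=256" makes the tuning code cost and
   vectorize as if SVE vectors were 256 bits wide.  */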
14049 
14050 static void
14051 aarch64_parse_sve_width_string (const char *tune_string,
14052 				struct tune_params *tune)
14053 {
14054   int width = -1;
14055 
14056   int n = sscanf (tune_string, "%d", &width);
14057   if (n != 1)
14058     {
14059       error ("invalid format for sve_width");
14060       return;
14061     }
14062   switch (width)
14063     {
14064     case SVE_128:
14065     case SVE_256:
14066     case SVE_512:
14067     case SVE_1024:
14068     case SVE_2048:
14069       break;
14070     default:
14071       error ("invalid sve_width value: %d", width);
14072     }
14073   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
14074 }
14075 
14076 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
14077    we understand.  If it is, extract the option string and hand it off to
14078    the appropriate function.  */
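/* For example, the token "sve_width=512" is split at the '=' and the
   substring "512" is handed to the matching parser
   (aarch64_parse_sve_width_string above).  */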
14079 
14080 void
14081 aarch64_parse_one_override_token (const char* token,
14082 				  size_t length,
14083 				  struct tune_params *tune)
14084 {
14085   const struct aarch64_tuning_override_function *fn
14086     = aarch64_tuning_override_functions;
14087 
14088   const char *option_part = strchr (token, '=');
14089   if (!option_part)
14090     {
14091       error ("tuning string missing in option (%s)", token);
14092       return;
14093     }
14094 
14095   /* Get the length of the option name.  */
14096   length = option_part - token;
14097   /* Skip the '=' to get to the option string.  */
14098   option_part++;
14099 
14100   for (; fn->name != NULL; fn++)
14101     {
14102       if (!strncmp (fn->name, token, length))
14103 	{
14104 	  fn->parse_override (option_part, tune);
14105 	  return;
14106 	}
14107     }
14108 
14109   error ("unknown tuning option (%s)", token);
14110   return;
14111 }
14112 
14113 /* Set a default TLS size and clamp it to what the code model in OPTS supports.  */
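/* For example, "-mtls-size=32" is accepted as-is under -mcmodel=small but is
   clamped down to 24 under -mcmodel=tiny.  */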
14114 
14115 static void
14116 initialize_aarch64_tls_size (struct gcc_options *opts)
14117 {
14118   if (aarch64_tls_size == 0)
14119     aarch64_tls_size = 24;
14120 
14121   switch (opts->x_aarch64_cmodel_var)
14122     {
14123     case AARCH64_CMODEL_TINY:
14124       /* Both the default and maximum TLS size allowed under tiny are 1M, which
14125 	 needs two instructions to address, so we clamp the size to 24.  */
14126       if (aarch64_tls_size > 24)
14127 	aarch64_tls_size = 24;
14128       break;
14129     case AARCH64_CMODEL_SMALL:
14130       /* The maximum TLS size allowed under small is 4G.  */
14131       if (aarch64_tls_size > 32)
14132 	aarch64_tls_size = 32;
14133       break;
14134     case AARCH64_CMODEL_LARGE:
14135       /* The maximum TLS size allowed under large is 16E.
14136 	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
14137       if (aarch64_tls_size > 48)
14138 	aarch64_tls_size = 48;
14139       break;
14140     default:
14141       gcc_unreachable ();
14142     }
14143 
14144   return;
14145 }
14146 
14147 /* Parse STRING looking for options in the format:
14148      string	:: option:string
14149      option	:: name=substring
14150      name	:: {a-z}
14151      substring	:: defined by option.  */
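/* For example, "-moverride=fuse=adrp+add:sve_width=256" is split at the ':'
   into two name=substring options, each of which is handled by
   aarch64_parse_one_override_token.  */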
14152 
14153 static void
14154 aarch64_parse_override_string (const char* input_string,
14155 			       struct tune_params* tune)
14156 {
14157   const char separator = ':';
14158   size_t string_length = strlen (input_string) + 1;
14159   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
14160   char *string = string_root;
14161   strncpy (string, input_string, string_length);
14162   string[string_length - 1] = '\0';
14163 
14164   char* ntoken = string;
14165 
14166   while ((ntoken = strchr (string, separator)))
14167     {
14168       size_t token_length = ntoken - string;
14169       /* Make this substring look like a string.  */
14170       *ntoken = '\0';
14171       aarch64_parse_one_override_token (string, token_length, tune);
14172       string = ++ntoken;
14173     }
14174 
14175   /* One last option to parse.  */
14176   aarch64_parse_one_override_token (string, strlen (string), tune);
14177   free (string_root);
14178 }
14179 
14180 
14181 static void
14182 aarch64_override_options_after_change_1 (struct gcc_options *opts)
14183 {
14184   if (accepted_branch_protection_string)
14185     {
14186       opts->x_aarch64_branch_protection_string
14187 	= xstrdup (accepted_branch_protection_string);
14188     }
14189 
14190   /* PR 70044: We have to be careful about being called multiple times for the
14191      same function.  This means all changes should be repeatable.  */
14192 
14193   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14194      Disable the frame pointer flag so the mid-end will not use a frame
14195      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14196      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14197      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
14198   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
14199   if (opts->x_flag_omit_frame_pointer == 0)
14200     opts->x_flag_omit_frame_pointer = 2;
14201 
14202   /* If not optimizing for size, set the default
14203      alignment to what the target wants.  */
14204   if (!opts->x_optimize_size)
14205     {
14206       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14207 	opts->x_str_align_loops = aarch64_tune_params.loop_align;
14208       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14209 	opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14210       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14211 	opts->x_str_align_functions = aarch64_tune_params.function_align;
14212     }
14213 
14214   /* We default to no pc-relative literal loads.  */
14215 
14216   aarch64_pcrelative_literal_loads = false;
14217 
14218   /* If -mpc-relative-literal-loads is set on the command line, this
14219      implies that the user asked for PC relative literal loads.  */
14220   if (opts->x_pcrelative_literal_loads == 1)
14221     aarch64_pcrelative_literal_loads = true;
14222 
14223   /* In the tiny memory model it makes no sense to disallow PC relative
14224      literal pool loads.  */
14225   if (aarch64_cmodel == AARCH64_CMODEL_TINY
14226       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14227     aarch64_pcrelative_literal_loads = true;
14228 
14229   /* When enabling the lower precision Newton series for the square root, also
14230      enable it for the reciprocal square root, since the latter is an
14231      intermediary step for the former.  */
14232   if (flag_mlow_precision_sqrt)
14233     flag_mrecip_low_precision_sqrt = true;
14234 }
14235 
14236 /* 'Unpack' the internal tuning structs and update the options
14237     in OPTS.  The caller must have set up selected_tune and selected_arch
14238     as all the other target-specific codegen decisions are
14239     derived from them.  */
14240 
14241 void
14242 aarch64_override_options_internal (struct gcc_options *opts)
14243 {
14244   aarch64_tune_flags = selected_tune->flags;
14245   aarch64_tune = selected_tune->sched_core;
14246   /* Make a copy of the tuning parameters attached to the core, which
14247      we may later overwrite.  */
14248   aarch64_tune_params = *(selected_tune->tune);
14249   aarch64_architecture_version = selected_arch->architecture_version;
14250 
14251   if (opts->x_aarch64_override_tune_string)
14252     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14253 				  &aarch64_tune_params);
14254 
14255   /* This target defaults to strict volatile bitfields.  */
14256   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14257     opts->x_flag_strict_volatile_bitfields = 1;
14258 
14259   if (aarch64_stack_protector_guard == SSP_GLOBAL
14260       && opts->x_aarch64_stack_protector_guard_offset_str)
14261     {
14262       error ("incompatible options %<-mstack-protector-guard=global%> and "
14263 	     "%<-mstack-protector-guard-offset=%s%>",
14264 	     aarch64_stack_protector_guard_offset_str);
14265     }
14266 
14267   if (aarch64_stack_protector_guard == SSP_SYSREG
14268       && !(opts->x_aarch64_stack_protector_guard_offset_str
14269 	   && opts->x_aarch64_stack_protector_guard_reg_str))
14270     {
14271       error ("both %<-mstack-protector-guard-offset%> and "
14272 	     "%<-mstack-protector-guard-reg%> must be used "
14273 	     "with %<-mstack-protector-guard=sysreg%>");
14274     }
14275 
14276   if (opts->x_aarch64_stack_protector_guard_reg_str)
14277     {
14278       if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14279 	  error ("specify a system register with a small string length.");
14280     }
14281 
14282   if (opts->x_aarch64_stack_protector_guard_offset_str)
14283     {
14284       char *end;
14285       const char *str = aarch64_stack_protector_guard_offset_str;
14286       errno = 0;
14287       long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14288       if (!*str || *end || errno)
14289 	error ("%qs is not a valid offset in %qs", str,
14290 	       "-mstack-protector-guard-offset=");
14291       aarch64_stack_protector_guard_offset = offs;
14292     }
14293 
14294   initialize_aarch64_code_model (opts);
14295   initialize_aarch64_tls_size (opts);
14296 
14297   int queue_depth = 0;
14298   switch (aarch64_tune_params.autoprefetcher_model)
14299     {
14300       case tune_params::AUTOPREFETCHER_OFF:
14301 	queue_depth = -1;
14302 	break;
14303       case tune_params::AUTOPREFETCHER_WEAK:
14304 	queue_depth = 0;
14305 	break;
14306       case tune_params::AUTOPREFETCHER_STRONG:
14307 	queue_depth = max_insn_queue_index + 1;
14308 	break;
14309       default:
14310 	gcc_unreachable ();
14311     }
14312 
14313   /* We don't mind passing in global_options_set here as we don't use
14314      the *options_set structs anyway.  */
14315   SET_OPTION_IF_UNSET (opts, &global_options_set,
14316 		       param_sched_autopref_queue_depth, queue_depth);
14317 
14318   /* Set up parameters to be used in prefetching algorithm.  Do not
14319      override the defaults unless we are tuning for a core we have
14320      researched values for.  */
14321   if (aarch64_tune_params.prefetch->num_slots > 0)
14322     SET_OPTION_IF_UNSET (opts, &global_options_set,
14323 			 param_simultaneous_prefetches,
14324 			 aarch64_tune_params.prefetch->num_slots);
14325   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
14326     SET_OPTION_IF_UNSET (opts, &global_options_set,
14327 			 param_l1_cache_size,
14328 			 aarch64_tune_params.prefetch->l1_cache_size);
14329   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
14330     SET_OPTION_IF_UNSET (opts, &global_options_set,
14331 			 param_l1_cache_line_size,
14332 			 aarch64_tune_params.prefetch->l1_cache_line_size);
14333   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
14334     SET_OPTION_IF_UNSET (opts, &global_options_set,
14335 			 param_l2_cache_size,
14336 			 aarch64_tune_params.prefetch->l2_cache_size);
14337   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
14338     SET_OPTION_IF_UNSET (opts, &global_options_set,
14339 			 param_prefetch_dynamic_strides, 0);
14340   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
14341     SET_OPTION_IF_UNSET (opts, &global_options_set,
14342 			 param_prefetch_minimum_stride,
14343 			 aarch64_tune_params.prefetch->minimum_stride);
14344 
14345   /* Use the alternative scheduling-pressure algorithm by default.  */
14346   SET_OPTION_IF_UNSET (opts, &global_options_set,
14347 		       param_sched_pressure_algorithm,
14348 		       SCHED_PRESSURE_MODEL);
14349 
14350   /* Validate the guard size.  */
14351   int guard_size = param_stack_clash_protection_guard_size;
14352 
14353   if (guard_size != 12 && guard_size != 16)
14354     error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14355 	   "size.  Given value %d (%llu KB) is out of range",
14356 	   guard_size, (1ULL << guard_size) / 1024ULL);
14357 
14358   /* Enforce that interval is the same size as size so the mid-end does the
14359      right thing.  */
14360   SET_OPTION_IF_UNSET (opts, &global_options_set,
14361 		       param_stack_clash_protection_probe_interval,
14362 		       guard_size);
14363 
14364   /* The maybe_set calls won't update the value if the user has explicitly set
14365      one.  Which means we need to validate that probing interval and guard size
14366      are equal.  */
14367   int probe_interval
14368     = param_stack_clash_protection_probe_interval;
14369   if (guard_size != probe_interval)
14370     error ("stack clash guard size %<%d%> must be equal to probing interval "
14371 	   "%<%d%>", guard_size, probe_interval);
14372 
14373   /* Enable sw prefetching at specified optimization level for
14374      CPUS that have prefetch.  Lower optimization level threshold by 1
14375      when profiling is enabled.  */
14376   if (opts->x_flag_prefetch_loop_arrays < 0
14377       && !opts->x_optimize_size
14378       && aarch64_tune_params.prefetch->default_opt_level >= 0
14379       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14380     opts->x_flag_prefetch_loop_arrays = 1;
14381 
14382   if (opts->x_aarch64_arch_string == NULL)
14383     opts->x_aarch64_arch_string = selected_arch->name;
14384   if (opts->x_aarch64_cpu_string == NULL)
14385     opts->x_aarch64_cpu_string = selected_cpu->name;
14386   if (opts->x_aarch64_tune_string == NULL)
14387     opts->x_aarch64_tune_string = selected_tune->name;
14388 
14389   aarch64_override_options_after_change_1 (opts);
14390 }
14391 
14392 /* Print a hint with a suggestion for a core or architecture name that
14393    most closely resembles what the user passed in STR.  ARCH is true if
14394    the user is asking for an architecture name.  ARCH is false if the user
14395    is asking for a core name.  */
14396 
14397 static void
14398 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14399 {
14400   auto_vec<const char *> candidates;
14401   const struct processor *entry = arch ? all_architectures : all_cores;
14402   for (; entry->name != NULL; entry++)
14403     candidates.safe_push (entry->name);
14404 
14405 #ifdef HAVE_LOCAL_CPU_DETECT
14406   /* Also add "native" as a possible value.  */
14407   if (arch)
14408     candidates.safe_push ("native");
14409 #endif
14410 
14411   char *s;
14412   const char *hint = candidates_list_and_hint (str, s, candidates);
14413   if (hint)
14414     inform (input_location, "valid arguments are: %s;"
14415 			     " did you mean %qs?", s, hint);
14416   else
14417     inform (input_location, "valid arguments are: %s", s);
14418 
14419   XDELETEVEC (s);
14420 }
14421 
14422 /* Print a hint with a suggestion for a core name that most closely resembles
14423    what the user passed in STR.  */
14424 
14425 inline static void
14426 aarch64_print_hint_for_core (const char *str)
14427 {
14428   aarch64_print_hint_for_core_or_arch (str, false);
14429 }
14430 
14431 /* Print a hint with a suggestion for an architecture name that most closely
14432    resembles what the user passed in STR.  */
14433 
14434 inline static void
14435 aarch64_print_hint_for_arch (const char *str)
14436 {
14437   aarch64_print_hint_for_core_or_arch (str, true);
14438 }
14439 
14440 
14441 /* Print a hint with a suggestion for an extension name
14442    that most closely resembles what the user passed in STR.  */
14443 
14444 void
14445 aarch64_print_hint_for_extensions (const std::string &str)
14446 {
14447   auto_vec<const char *> candidates;
14448   aarch64_get_all_extension_candidates (&candidates);
14449   char *s;
14450   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14451   if (hint)
14452     inform (input_location, "valid arguments are: %s;"
14453 			     " did you mean %qs?", s, hint);
14454   else
14455     inform (input_location, "valid arguments are: %s", s);
14456 
14457   XDELETEVEC (s);
14458 }
14459 
14460 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
14461    specified in STR and throw errors if appropriate.  Put the results if
14462    they are valid in RES and ISA_FLAGS.  Return whether the option is
14463    valid.  */
14464 
14465 static bool
14466 aarch64_validate_mcpu (const char *str, const struct processor **res,
14467 		       uint64_t *isa_flags)
14468 {
14469   std::string invalid_extension;
14470   enum aarch64_parse_opt_result parse_res
14471     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
14472 
14473   if (parse_res == AARCH64_PARSE_OK)
14474     return true;
14475 
14476   switch (parse_res)
14477     {
14478       case AARCH64_PARSE_MISSING_ARG:
14479 	error ("missing cpu name in %<-mcpu=%s%>", str);
14480 	break;
14481       case AARCH64_PARSE_INVALID_ARG:
14482 	error ("unknown value %qs for %<-mcpu%>", str);
14483 	aarch64_print_hint_for_core (str);
14484 	break;
14485       case AARCH64_PARSE_INVALID_FEATURE:
14486 	error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14487 	       invalid_extension.c_str (), str);
14488 	aarch64_print_hint_for_extensions (invalid_extension);
14489 	break;
14490       default:
14491 	gcc_unreachable ();
14492     }
14493 
14494   return false;
14495 }
14496 
14497 /* Parses CONST_STR for branch protection features specified in
14498    aarch64_branch_protect_types, and sets any global variables required.
14499    Returns the parsing result and copies the last processed token from
14500    CONST_STR into LAST_STR so that it can be used for error reporting.  */
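/* For example, "pac-ret+leaf+bti" first matches the "pac-ret" type, then
   consumes its "leaf" subtype, and finally matches the separate "bti"
   type.  */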
14501 
14502 static enum
14503 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
14504 							  char** last_str)
14505 {
14506   char *str_root = xstrdup (const_str);
14507   char* token_save = NULL;
14508   char *str = strtok_r (str_root, "+", &token_save);
14509   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14510   if (!str)
14511     res = AARCH64_PARSE_MISSING_ARG;
14512   else
14513     {
14514       char *next_str = strtok_r (NULL, "+", &token_save);
14515       /* Reset the branch protection features to their defaults.  */
14516       aarch64_handle_no_branch_protection (NULL, NULL);
14517 
14518       while (str && res == AARCH64_PARSE_OK)
14519 	{
14520 	  const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14521 	  bool found = false;
14522 	  /* Search for this type.  */
14523 	  while (type && type->name && !found && res == AARCH64_PARSE_OK)
14524 	    {
14525 	      if (strcmp (str, type->name) == 0)
14526 		{
14527 		  found = true;
14528 		  res = type->handler (str, next_str);
14529 		  str = next_str;
14530 		  next_str = strtok_r (NULL, "+", &token_save);
14531 		}
14532 	      else
14533 		type++;
14534 	    }
14535 	  if (found && res == AARCH64_PARSE_OK)
14536 	    {
14537 	      bool found_subtype = true;
14538 	      /* Loop through each token until we find one that isn't a
14539 		 subtype.  */
14540 	      while (found_subtype)
14541 		{
14542 		  found_subtype = false;
14543 		  const aarch64_branch_protect_type *subtype = type->subtypes;
14544 		  /* Search for the subtype.  */
14545 		  while (str && subtype && subtype->name && !found_subtype
14546 			  && res == AARCH64_PARSE_OK)
14547 		    {
14548 		      if (strcmp (str, subtype->name) == 0)
14549 			{
14550 			  found_subtype = true;
14551 			  res = subtype->handler (str, next_str);
14552 			  str = next_str;
14553 			  next_str = strtok_r (NULL, "+", &token_save);
14554 			}
14555 		      else
14556 			subtype++;
14557 		    }
14558 		}
14559 	    }
14560 	  else if (!found)
14561 	    res = AARCH64_PARSE_INVALID_ARG;
14562 	}
14563     }
14564   /* Copy the last processed token into the argument to pass it back.
14565     Used by option and attribute validation to print the offending token.  */
14566   if (last_str)
14567     {
14568       if (str) strcpy (*last_str, str);
14569       else *last_str = NULL;
14570     }
14571   if (res == AARCH64_PARSE_OK)
14572     {
14573       /* If needed, alloc the accepted string then copy in const_str.
14574 	Used by override_option_after_change_1.  */
14575       if (!accepted_branch_protection_string)
14576 	accepted_branch_protection_string = (char *) xmalloc (
14577 						      BRANCH_PROTECT_STR_MAX
14578 							+ 1);
14579       strncpy (accepted_branch_protection_string, const_str,
14580 		BRANCH_PROTECT_STR_MAX + 1);
14581       /* Forcibly null-terminate.  */
14582       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14583     }
14584   return res;
14585 }
14586 
14587 static bool
14588 aarch64_validate_mbranch_protection (const char *const_str)
14589 {
14590   char *str = (char *) xmalloc (strlen (const_str) + 1);
14591   enum aarch64_parse_opt_result res =
14592     aarch64_parse_branch_protection (const_str, &str);
14593   if (res == AARCH64_PARSE_INVALID_ARG)
14594     error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
14595   else if (res == AARCH64_PARSE_MISSING_ARG)
14596     error ("missing argument for %<-mbranch-protection=%>");
14597   free (str);
14598   return res == AARCH64_PARSE_OK;
14599 }
14600 
14601 /* Validate a command-line -march option.  Parse the arch and extensions
14602    (if any) specified in STR and throw errors if appropriate.  Put the
14603    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
14604    option is valid.  */
14605 
14606 static bool
14607 aarch64_validate_march (const char *str, const struct processor **res,
14608 			 uint64_t *isa_flags)
14609 {
14610   std::string invalid_extension;
14611   enum aarch64_parse_opt_result parse_res
14612     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
14613 
14614   if (parse_res == AARCH64_PARSE_OK)
14615     return true;
14616 
14617   switch (parse_res)
14618     {
14619       case AARCH64_PARSE_MISSING_ARG:
14620 	error ("missing arch name in %<-march=%s%>", str);
14621 	break;
14622       case AARCH64_PARSE_INVALID_ARG:
14623 	error ("unknown value %qs for %<-march%>", str);
14624 	aarch64_print_hint_for_arch (str);
14625 	break;
14626       case AARCH64_PARSE_INVALID_FEATURE:
14627 	error ("invalid feature modifier %qs in %<-march=%s%>",
14628 	       invalid_extension.c_str (), str);
14629 	aarch64_print_hint_for_extensions (invalid_extension);
14630 	break;
14631       default:
14632 	gcc_unreachable ();
14633     }
14634 
14635   return false;
14636 }
14637 
14638 /* Validate a command-line -mtune option.  Parse the cpu
14639    specified in STR and throw errors if appropriate.  Put the
14640    result, if it is valid, in RES.  Return whether the option is
14641    valid.  */
14642 
14643 static bool
14644 aarch64_validate_mtune (const char *str, const struct processor **res)
14645 {
14646   enum aarch64_parse_opt_result parse_res
14647     = aarch64_parse_tune (str, res);
14648 
14649   if (parse_res == AARCH64_PARSE_OK)
14650     return true;
14651 
14652   switch (parse_res)
14653     {
14654       case AARCH64_PARSE_MISSING_ARG:
14655 	error ("missing cpu name in %<-mtune=%s%>", str);
14656 	break;
14657       case AARCH64_PARSE_INVALID_ARG:
14658 	error ("unknown value %qs for %<-mtune%>", str);
14659 	aarch64_print_hint_for_core (str);
14660 	break;
14661       default:
14662 	gcc_unreachable ();
14663     }
14664   return false;
14665 }
14666 
14667 /* Return the CPU corresponding to the enum CPU.
14668    If it doesn't specify a cpu, return the default.  */
14669 
14670 static const struct processor *
14671 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14672 {
14673   if (cpu != aarch64_none)
14674     return &all_cores[cpu];
14675 
14676   /* The & 0x3f is to extract the bottom 6 bits that encode the
14677      default cpu as selected by the --with-cpu GCC configure option
14678      in config.gcc.
14679      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14680      flags mechanism should be reworked to make it more sane.  */
14681   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14682 }
14683 
14684 /* Return the architecture corresponding to the enum ARCH.
14685    If it doesn't specify a valid architecture, return the default.  */
14686 
14687 static const struct processor *
14688 aarch64_get_arch (enum aarch64_arch arch)
14689 {
14690   if (arch != aarch64_no_arch)
14691     return &all_architectures[arch];
14692 
14693   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14694 
14695   return &all_architectures[cpu->arch];
14696 }
14697 
14698 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
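/* For example, -msve-vector-bits=256 yields a constant VG of 4 (four 64-bit
   granules per vector), while -msve-vector-bits=scalable yields the
   runtime-variable poly_uint16 (2, 2).  */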
14699 
14700 static poly_uint16
14701 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14702 {
14703   /* 128-bit SVE and Advanced SIMD modes use different register layouts
14704      on big-endian targets, so we would need to forbid subregs that convert
14705      from one to the other.  By default a reinterpret sequence would then
14706      involve a store to memory in one mode and a load back in the other.
14707      Even if we optimize that sequence using reverse instructions,
14708      it would still be a significant potential overhead.
14709 
14710      For now, it seems better to generate length-agnostic code for that
14711      case instead.  */
14712   if (value == SVE_SCALABLE
14713       || (value == SVE_128 && BYTES_BIG_ENDIAN))
14714     return poly_uint16 (2, 2);
14715   else
14716     return (int) value / 64;
14717 }
14718 
14719 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
14720    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
14721    tuning structs.  In particular it must set selected_tune and
14722    aarch64_isa_flags that define the available ISA features and tuning
14723    decisions.  It must also set selected_arch as this will be used to
14724    output the .arch asm tags for each function.  */
14725 
14726 static void
14727 aarch64_override_options (void)
14728 {
14729   uint64_t cpu_isa = 0;
14730   uint64_t arch_isa = 0;
14731   aarch64_isa_flags = 0;
14732 
14733   bool valid_cpu = true;
14734   bool valid_tune = true;
14735   bool valid_arch = true;
14736 
14737   selected_cpu = NULL;
14738   selected_arch = NULL;
14739   selected_tune = NULL;
14740 
14741   if (aarch64_branch_protection_string)
14742     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
14743 
14744   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14745      If either of -march or -mtune is given, they override their
14746      respective component of -mcpu.  */
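/* For example, "-mcpu=cortex-a72 -march=armv8.1-a" keeps the Cortex-A72
   tuning but takes its ISA flags from Armv8.1-A, with a warning below
   because the two disagree on the architecture.  */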
14747   if (aarch64_cpu_string)
14748     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
14749 					&cpu_isa);
14750 
14751   if (aarch64_arch_string)
14752     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
14753 					  &arch_isa);
14754 
14755   if (aarch64_tune_string)
14756     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14757 
14758 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14759   SUBTARGET_OVERRIDE_OPTIONS;
14760 #endif
14761 
14762   /* If the user did not specify a processor, choose the default
14763      one for them.  This will be the CPU set during configuration using
14764      --with-cpu, otherwise it is "generic".  */
14765   if (!selected_cpu)
14766     {
14767       if (selected_arch)
14768 	{
14769 	  selected_cpu = &all_cores[selected_arch->ident];
14770 	  aarch64_isa_flags = arch_isa;
14771 	  explicit_arch = selected_arch->arch;
14772 	}
14773       else
14774 	{
14775 	  /* Get default configure-time CPU.  */
14776 	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14777 	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14778 	}
14779 
14780       if (selected_tune)
14781 	explicit_tune_core = selected_tune->ident;
14782     }
14783   /* If both -mcpu and -march are specified check that they are architecturally
14784      compatible, warn if they're not and prefer the -march ISA flags.  */
14785   else if (selected_arch)
14786     {
14787       if (selected_arch->arch != selected_cpu->arch)
14788 	{
14789 	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14790 		       aarch64_cpu_string,
14791 		       aarch64_arch_string);
14792 	}
14793       aarch64_isa_flags = arch_isa;
14794       explicit_arch = selected_arch->arch;
14795       explicit_tune_core = selected_tune ? selected_tune->ident
14796 					  : selected_cpu->ident;
14797     }
14798   else
14799     {
14800       /* -mcpu but no -march.  */
14801       aarch64_isa_flags = cpu_isa;
14802       explicit_tune_core = selected_tune ? selected_tune->ident
14803 					  : selected_cpu->ident;
14804       gcc_assert (selected_cpu);
14805       selected_arch = &all_architectures[selected_cpu->arch];
14806       explicit_arch = selected_arch->arch;
14807     }
14808 
14809   /* Set the arch as well, as we will need it when outputting
14810      the .arch directive in assembly.  */
14811   if (!selected_arch)
14812     {
14813       gcc_assert (selected_cpu);
14814       selected_arch = &all_architectures[selected_cpu->arch];
14815     }
14816 
14817   if (!selected_tune)
14818     selected_tune = selected_cpu;
14819 
14820   if (aarch64_enable_bti == 2)
14821     {
14822 #ifdef TARGET_ENABLE_BTI
14823       aarch64_enable_bti = 1;
14824 #else
14825       aarch64_enable_bti = 0;
14826 #endif
14827     }
14828 
14829   /* Return address signing is currently not supported for ILP32 targets.  For
14830      LP64 targets use the configured option in the absence of a command-line
14831      option for -mbranch-protection.  */
14832   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14833     {
14834 #ifdef TARGET_ENABLE_PAC_RET
14835       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14836 #else
14837       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14838 #endif
14839     }
14840 
14841 #ifndef HAVE_AS_MABI_OPTION
14842   /* The compiler may have been configured with 2.23.* binutils, which does
14843      not have support for ILP32.  */
14844   if (TARGET_ILP32)
14845     error ("assembler does not support %<-mabi=ilp32%>");
14846 #endif
14847 
14848   /* Convert -msve-vector-bits to a VG count.  */
14849   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14850 
14851   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14852     sorry ("return address signing is only supported for %<-mabi=lp64%>");
14853 
14854   /* Make sure we properly set up the explicit options.  */
14855   if ((aarch64_cpu_string && valid_cpu)
14856        || (aarch64_tune_string && valid_tune))
14857     gcc_assert (explicit_tune_core != aarch64_none);
14858 
14859   if ((aarch64_cpu_string && valid_cpu)
14860        || (aarch64_arch_string && valid_arch))
14861     gcc_assert (explicit_arch != aarch64_no_arch);
14862 
14863   /* The pass to insert speculation tracking runs before
14864      shrink-wrapping and the latter does not know how to update the
14865      tracking status.  So disable it in this case.  */
14866   if (aarch64_track_speculation)
14867     flag_shrink_wrap = 0;
14868 
14869   aarch64_override_options_internal (&global_options);
14870 
14871   /* Save these options as the default ones in case we push and pop them later
14872      while processing functions with potential target attributes.  */
14873   target_option_default_node = target_option_current_node
14874       = build_target_option_node (&global_options);
14875 }
14876 
14877 /* Implement targetm.override_options_after_change.  */
14878 
14879 static void
14880 aarch64_override_options_after_change (void)
14881 {
14882   aarch64_override_options_after_change_1 (&global_options);
14883 }
14884 
14885 static struct machine_function *
14886 aarch64_init_machine_status (void)
14887 {
14888   struct machine_function *machine;
14889   machine = ggc_cleared_alloc<machine_function> ();
14890   return machine;
14891 }
14892 
14893 void
14894 aarch64_init_expanders (void)
14895 {
14896   init_machine_status = aarch64_init_machine_status;
14897 }
14898 
14899 /* Derive the effective code model from OPTS, accounting for PIC and the ABI.  */
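/* For example, -mcmodel=small together with -fPIC becomes
   AARCH64_CMODEL_SMALL_PIC (or AARCH64_CMODEL_SMALL_SPIC for -fpic when the
   assembler supports the small PIC relocations).  */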
14900 static void
14901 initialize_aarch64_code_model (struct gcc_options *opts)
14902 {
14903   aarch64_cmodel = opts->x_aarch64_cmodel_var;
14904   switch (opts->x_aarch64_cmodel_var)
14905     {
14906     case AARCH64_CMODEL_TINY:
14907       if (opts->x_flag_pic)
14908 	aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14909       break;
14910     case AARCH64_CMODEL_SMALL:
14911       if (opts->x_flag_pic)
14912 	{
14913 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14914 	  aarch64_cmodel = (flag_pic == 2
14915 			    ? AARCH64_CMODEL_SMALL_PIC
14916 			    : AARCH64_CMODEL_SMALL_SPIC);
14917 #else
14918 	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14919 #endif
14920 	}
14921       break;
14922     case AARCH64_CMODEL_LARGE:
14923       if (opts->x_flag_pic)
14924 	sorry ("code model %qs with %<-f%s%>", "large",
14925 	       opts->x_flag_pic > 1 ? "PIC" : "pic");
14926       if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
14927 	sorry ("code model %qs not supported in ilp32 mode", "large");
14928       break;
14929     case AARCH64_CMODEL_TINY_PIC:
14930     case AARCH64_CMODEL_SMALL_PIC:
14931     case AARCH64_CMODEL_SMALL_SPIC:
14932       gcc_unreachable ();
14933     }
14934 }
14935 
14936 /* Implement TARGET_OPTION_SAVE.  */
14937 
14938 static void
14939 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14940 {
14941   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14942   ptr->x_aarch64_branch_protection_string
14943     = opts->x_aarch64_branch_protection_string;
14944 }
14945 
14946 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
14947    using the information saved in PTR.  */
14948 
14949 static void
14950 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14951 {
14952   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14953   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14954   opts->x_explicit_arch = ptr->x_explicit_arch;
14955   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14956   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14957   opts->x_aarch64_branch_protection_string
14958     = ptr->x_aarch64_branch_protection_string;
14959   if (opts->x_aarch64_branch_protection_string)
14960     {
14961       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14962 					NULL);
14963     }
14964 
14965   aarch64_override_options_internal (opts);
14966 }
14967 
14968 /* Implement TARGET_OPTION_PRINT.  */
14969 
14970 static void
14971 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14972 {
14973   const struct processor *cpu
14974     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14975   uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14976   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14977   std::string extension
14978     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14979 
14980   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14981   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14982 	   arch->name, extension.c_str ());
14983 }
14984 
14985 static GTY(()) tree aarch64_previous_fndecl;
14986 
14987 void
14988 aarch64_reset_previous_fndecl (void)
14989 {
14990   aarch64_previous_fndecl = NULL;
14991 }
14992 
14993 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14994    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14995    make sure optab availability predicates are recomputed when necessary.  */
14996 
14997 void
14998 aarch64_save_restore_target_globals (tree new_tree)
14999 {
15000   if (TREE_TARGET_GLOBALS (new_tree))
15001     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
15002   else if (new_tree == target_option_default_node)
15003     restore_target_globals (&default_target_globals);
15004   else
15005     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
15006 }
15007 
15008 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
15009    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
15010    of the function, if such exists.  This function may be called multiple
15011    times on a single function so use aarch64_previous_fndecl to avoid
15012    setting up identical state.  */
15013 
15014 static void
15015 aarch64_set_current_function (tree fndecl)
15016 {
15017   if (!fndecl || fndecl == aarch64_previous_fndecl)
15018     return;
15019 
15020   tree old_tree = (aarch64_previous_fndecl
15021 		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
15022 		   : NULL_TREE);
15023 
15024   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15025 
15026   /* If current function has no attributes but the previous one did,
15027      use the default node.  */
15028   if (!new_tree && old_tree)
15029     new_tree = target_option_default_node;
15030 
15031   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
15032      the default have been handled by aarch64_save_restore_target_globals from
15033      aarch64_pragma_target_parse.  */
15034   if (old_tree == new_tree)
15035     return;
15036 
15037   aarch64_previous_fndecl = fndecl;
15038 
15039   /* First set the target options.  */
15040   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
15041 
15042   aarch64_save_restore_target_globals (new_tree);
15043 }
15044 
15045 /* Enum describing the various ways we can handle attributes.
15046    In many cases we can reuse the generic option handling machinery.  */
15047 
15048 enum aarch64_attr_opt_type
15049 {
15050   aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
15051   aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
15052   aarch64_attr_enum,	/* Attribute sets an enum variable.  */
15053   aarch64_attr_custom	/* Attribute requires a custom handling function.  */
15054 };
15055 
15056 /* All the information needed to handle a target attribute.
15057    NAME is the name of the attribute.
15058    ATTR_TYPE specifies the type of behavior of the attribute as described
15059    in the definition of enum aarch64_attr_opt_type.
15060    ALLOW_NEG is true if the attribute supports a "no-" form.
15061    HANDLER is the function that takes the attribute string as an argument.
15062    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
15063    OPT_NUM is the enum specifying the option that the attribute modifies.
15064    This is needed for attributes that mirror the behavior of a command-line
15065    option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
15066    aarch64_attr_enum.  */
15067 
15068 struct aarch64_attribute_info
15069 {
15070   const char *name;
15071   enum aarch64_attr_opt_type attr_type;
15072   bool allow_neg;
15073   bool (*handler) (const char *);
15074   enum opt_code opt_num;
15075 };
15076 
15077 /* Handle the ARCH_STR argument to the arch= target attribute.  */
15078 
15079 static bool
15080 aarch64_handle_attr_arch (const char *str)
15081 {
15082   const struct processor *tmp_arch = NULL;
15083   std::string invalid_extension;
15084   enum aarch64_parse_opt_result parse_res
15085     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
15086 
15087   if (parse_res == AARCH64_PARSE_OK)
15088     {
15089       gcc_assert (tmp_arch);
15090       selected_arch = tmp_arch;
15091       explicit_arch = selected_arch->arch;
15092       return true;
15093     }
15094 
15095   switch (parse_res)
15096     {
15097       case AARCH64_PARSE_MISSING_ARG:
15098 	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
15099 	break;
15100       case AARCH64_PARSE_INVALID_ARG:
15101 	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
15102 	aarch64_print_hint_for_arch (str);
15103 	break;
15104       case AARCH64_PARSE_INVALID_FEATURE:
15105 	error ("invalid feature modifier %s of value (\"%s\") in "
15106 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15107 	aarch64_print_hint_for_extensions (invalid_extension);
15108 	break;
15109       default:
15110 	gcc_unreachable ();
15111     }
15112 
15113   return false;
15114 }
15115 
15116 /* Handle the argument CPU_STR to the cpu= target attribute.  */
15117 
15118 static bool
15119 aarch64_handle_attr_cpu (const char *str)
15120 {
15121   const struct processor *tmp_cpu = NULL;
15122   std::string invalid_extension;
15123   enum aarch64_parse_opt_result parse_res
15124     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
15125 
15126   if (parse_res == AARCH64_PARSE_OK)
15127     {
15128       gcc_assert (tmp_cpu);
15129       selected_tune = tmp_cpu;
15130       explicit_tune_core = selected_tune->ident;
15131 
15132       selected_arch = &all_architectures[tmp_cpu->arch];
15133       explicit_arch = selected_arch->arch;
15134       return true;
15135     }
15136 
15137   switch (parse_res)
15138     {
15139       case AARCH64_PARSE_MISSING_ARG:
15140 	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
15141 	break;
15142       case AARCH64_PARSE_INVALID_ARG:
15143 	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
15144 	aarch64_print_hint_for_core (str);
15145 	break;
15146       case AARCH64_PARSE_INVALID_FEATURE:
15147 	error ("invalid feature modifier %s of value (\"%s\") in "
15148 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15149 	aarch64_print_hint_for_extensions (invalid_extension);
15150 	break;
15151       default:
15152 	gcc_unreachable ();
15153     }
15154 
15155   return false;
15156 }
15157 
15158 /* Handle the argument STR to the branch-protection= attribute.  */
15159 
15160  static bool
15161  aarch64_handle_attr_branch_protection (const char* str)
15162  {
15163   char *err_str = (char *) xmalloc (strlen (str) + 1);
15164   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
15165 								      &err_str);
15166   bool success = false;
15167   switch (res)
15168     {
15169      case AARCH64_PARSE_MISSING_ARG:
15170        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
15171 	      " attribute");
15172        break;
15173      case AARCH64_PARSE_INVALID_ARG:
15174        error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
15175 	      "=\")%> pragma or attribute", err_str);
15176        break;
15177      case AARCH64_PARSE_OK:
15178        success = true;
15179       /* Fall through.  */
15180      case AARCH64_PARSE_INVALID_FEATURE:
15181        break;
15182      default:
15183        gcc_unreachable ();
15184     }
15185   free (err_str);
15186   return success;
15187  }
15188 
15189 /* Handle the argument STR to the tune= target attribute.  */
15190 
15191 static bool
15192 aarch64_handle_attr_tune (const char *str)
15193 {
15194   const struct processor *tmp_tune = NULL;
15195   enum aarch64_parse_opt_result parse_res
15196     = aarch64_parse_tune (str, &tmp_tune);
15197 
15198   if (parse_res == AARCH64_PARSE_OK)
15199     {
15200       gcc_assert (tmp_tune);
15201       selected_tune = tmp_tune;
15202       explicit_tune_core = selected_tune->ident;
15203       return true;
15204     }
15205 
15206   switch (parse_res)
15207     {
15208       case AARCH64_PARSE_INVALID_ARG:
15209 	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
15210 	aarch64_print_hint_for_core (str);
15211 	break;
15212       default:
15213 	gcc_unreachable ();
15214     }
15215 
15216   return false;
15217 }
15218 
15219 /* Parse an architecture extensions target attribute string specified in STR.
15220    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
15221    if successful.  Update aarch64_isa_flags to reflect the ISA features
15222    modified.  */
15223 
15224 static bool
15225 aarch64_handle_attr_isa_flags (char *str)
15226 {
15227   enum aarch64_parse_opt_result parse_res;
15228   uint64_t isa_flags = aarch64_isa_flags;
15229 
15230   /* We allow "+nothing" in the beginning to clear out all architectural
15231      features if the user wants to handpick specific features.  */
15232   if (strncmp ("+nothing", str, 8) == 0)
15233     {
15234       isa_flags = 0;
15235       str += 8;
15236     }
15237 
15238   std::string invalid_extension;
15239   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
15240 
15241   if (parse_res == AARCH64_PARSE_OK)
15242     {
15243       aarch64_isa_flags = isa_flags;
15244       return true;
15245     }
15246 
15247   switch (parse_res)
15248     {
15249       case AARCH64_PARSE_MISSING_ARG:
15250 	error ("missing value in %<target()%> pragma or attribute");
15251 	break;
15252 
15253       case AARCH64_PARSE_INVALID_FEATURE:
15254 	error ("invalid feature modifier %s of value (\"%s\") in "
15255 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15256 	break;
15257 
15258       default:
15259 	gcc_unreachable ();
15260     }
15261 
15262  return false;
15263 }
15264 
15265 /* The target attributes that we support.  On top of these we also support just
15266    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
15267    handled explicitly in aarch64_process_one_target_attr.  */
15268 
15269 static const struct aarch64_attribute_info aarch64_attributes[] =
15270 {
15271   { "general-regs-only", aarch64_attr_mask, false, NULL,
15272      OPT_mgeneral_regs_only },
15273   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15274      OPT_mfix_cortex_a53_835769 },
15275   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15276      OPT_mfix_cortex_a53_843419 },
15277   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
15278   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
15279   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15280      OPT_momit_leaf_frame_pointer },
15281   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15282   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15283      OPT_march_ },
15284   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15285   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15286      OPT_mtune_ },
15287   { "branch-protection", aarch64_attr_custom, false,
15288      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
15289   { "sign-return-address", aarch64_attr_enum, false, NULL,
15290      OPT_msign_return_address_ },
15291   { "outline-atomics", aarch64_attr_bool, true, NULL,
15292      OPT_moutline_atomics},
15293   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15294 };
15295 
15296 /* Parse ARG_STR which contains the definition of one target attribute.
15297    Show appropriate errors if any or return true if the attribute is valid.  */
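/* For example, "arch=armv8.2-a" is routed to aarch64_handle_attr_arch,
   "no-omit-leaf-frame-pointer" clears the corresponding boolean option, and
   a string starting with '+' is treated purely as ISA flag modifiers.  */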
15298 
15299 static bool
15300 aarch64_process_one_target_attr (char *arg_str)
15301 {
15302   bool invert = false;
15303 
15304   size_t len = strlen (arg_str);
15305 
15306   if (len == 0)
15307     {
15308       error ("malformed %<target()%> pragma or attribute");
15309       return false;
15310     }
15311 
15312   char *str_to_check = (char *) alloca (len + 1);
15313   strcpy (str_to_check, arg_str);
15314 
15315   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15316      It is easier to detect and handle it explicitly here rather than going
15317      through the machinery for the rest of the target attributes in this
15318      function.  */
15319   if (*str_to_check == '+')
15320     return aarch64_handle_attr_isa_flags (str_to_check);
15321 
15322   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15323     {
15324       invert = true;
15325       str_to_check += 3;
15326     }
15327   char *arg = strchr (str_to_check, '=');
15328 
15329   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15330      and point ARG to "foo".  */
15331   if (arg)
15332     {
15333       *arg = '\0';
15334       arg++;
15335     }
15336   const struct aarch64_attribute_info *p_attr;
15337   bool found = false;
15338   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15339     {
15340       /* If the names don't match up, or the user has given an argument
15341 	 to an attribute that doesn't accept one, or didn't give an argument
15342 	 to an attribute that expects one, fail to match.  */
15343       if (strcmp (str_to_check, p_attr->name) != 0)
15344 	continue;
15345 
15346       found = true;
15347       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15348 			      || p_attr->attr_type == aarch64_attr_enum;
15349 
15350       if (attr_need_arg_p ^ (arg != NULL))
15351 	{
15352 	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
15353 	  return false;
15354 	}
15355 
15356       /* If the name matches but the attribute does not allow "no-" versions
15357 	 then we can't match.  */
15358       if (invert && !p_attr->allow_neg)
15359 	{
15360 	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
15361 	  return false;
15362 	}
15363 
15364       switch (p_attr->attr_type)
15365 	{
15366 	/* Has a custom handler registered.
15367 	   For example, cpu=, arch=, tune=.  */
15368 	  case aarch64_attr_custom:
15369 	    gcc_assert (p_attr->handler);
15370 	    if (!p_attr->handler (arg))
15371 	      return false;
15372 	    break;
15373 
15374 	  /* Either set or unset a boolean option.  */
15375 	  case aarch64_attr_bool:
15376 	    {
15377 	      struct cl_decoded_option decoded;
15378 
15379 	      generate_option (p_attr->opt_num, NULL, !invert,
15380 			       CL_TARGET, &decoded);
15381 	      aarch64_handle_option (&global_options, &global_options_set,
15382 				      &decoded, input_location);
15383 	      break;
15384 	    }
15385 	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
15386 	     should know what mask to apply given the option number.  */
15387 	  case aarch64_attr_mask:
15388 	    {
15389 	      struct cl_decoded_option decoded;
15390 	      /* We only need to specify the option number.
15391 		 aarch64_handle_option will know which mask to apply.  */
15392 	      decoded.opt_index = p_attr->opt_num;
15393 	      decoded.value = !invert;
15394 	      aarch64_handle_option (&global_options, &global_options_set,
15395 				      &decoded, input_location);
15396 	      break;
15397 	    }
15398 	  /* Use the option setting machinery to set an option to an enum.  */
15399 	  case aarch64_attr_enum:
15400 	    {
15401 	      gcc_assert (arg);
15402 	      bool valid;
15403 	      int value;
15404 	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15405 					      &value, CL_TARGET);
15406 	      if (valid)
15407 		{
15408 		  set_option (&global_options, NULL, p_attr->opt_num, value,
15409 			      NULL, DK_UNSPECIFIED, input_location,
15410 			      global_dc);
15411 		}
15412 	      else
15413 		{
15414 		  error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
15415 		}
15416 	      break;
15417 	    }
15418 	  default:
15419 	    gcc_unreachable ();
15420 	}
15421     }
15422 
15423   /* If we reached here we either have found an attribute and validated
15424      it or didn't match any.  If we matched an attribute but its arguments
15425      were malformed we will have returned false already.  */
15426   return found;
15427 }
15428 
15429 /* Count how many times the character C appears in
15430    NULL-terminated string STR.  */
15431 
15432 static unsigned int
15433 num_occurences_in_str (char c, char *str)
15434 {
15435   unsigned int res = 0;
15436   while (*str != '\0')
15437     {
15438       if (*str == c)
15439 	res++;
15440 
15441       str++;
15442     }
15443 
15444   return res;
15445 }
15446 
15447 /* Parse the tree in ARGS that contains the target attribute information
15448    and update the global target options space.  */
15449 
15450 bool
15451 aarch64_process_target_attr (tree args)
15452 {
15453   if (TREE_CODE (args) == TREE_LIST)
15454     {
15455       do
15456 	{
15457 	  tree head = TREE_VALUE (args);
15458 	  if (head)
15459 	    {
15460 	      if (!aarch64_process_target_attr (head))
15461 		return false;
15462 	    }
15463 	  args = TREE_CHAIN (args);
15464 	} while (args);
15465 
15466       return true;
15467     }
15468 
15469   if (TREE_CODE (args) != STRING_CST)
15470     {
15471       error ("attribute %<target%> argument not a string");
15472       return false;
15473     }
15474 
15475   size_t len = strlen (TREE_STRING_POINTER (args));
15476   char *str_to_check = (char *) alloca (len + 1);
15477   strcpy (str_to_check, TREE_STRING_POINTER (args));
15478 
15479   if (len == 0)
15480     {
15481       error ("malformed %<target()%> pragma or attribute");
15482       return false;
15483     }
15484 
15485   /* Used to catch empty strings between commas, i.e.
15486      attribute ((target ("attr1,,attr2"))).  */
15487   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15488 
15489   /* Handle multiple target attributes separated by ','.  */
15490   char *token = strtok_r (str_to_check, ",", &str_to_check);
15491 
15492   unsigned int num_attrs = 0;
15493   while (token)
15494     {
15495       num_attrs++;
15496       if (!aarch64_process_one_target_attr (token))
15497 	{
15498 	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
15499 	  return false;
15500 	}
15501 
15502       token = strtok_r (NULL, ",", &str_to_check);
15503     }
15504 
15505   if (num_attrs != num_commas + 1)
15506     {
15507       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
15508       return false;
15509     }
15510 
15511   return true;
15512 }
15513 
15514 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
15515    process attribute ((target ("..."))).  */
15516 
15517 static bool
15518 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15519 {
15520   struct cl_target_option cur_target;
15521   bool ret;
15522   tree old_optimize;
15523   tree new_target, new_optimize;
15524   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15525 
15526   /* If what we're processing is the current pragma string then the
15527      target option node is already stored in target_option_current_node
15528      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
15529      having to re-parse the string.  This is especially useful to keep
15530      arm_neon.h compile times down since that header contains a lot
15531      of intrinsics enclosed in pragmas.  */
15532   if (!existing_target && args == current_target_pragma)
15533     {
15534       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15535       return true;
15536     }
15537   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15538 
15539   old_optimize = build_optimization_node (&global_options);
15540   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15541 
15542   /* If the function changed the optimization levels as well as setting
15543      target options, start with the optimizations specified.  */
15544   if (func_optimize && func_optimize != old_optimize)
15545     cl_optimization_restore (&global_options,
15546 			     TREE_OPTIMIZATION (func_optimize));
15547 
15548   /* Save the current target options to restore at the end.  */
15549   cl_target_option_save (&cur_target, &global_options);
15550 
15551   /* If fndecl already has some target attributes applied to it, unpack
15552      them so that we add this attribute on top of them, rather than
15553      overwriting them.  */
15554   if (existing_target)
15555     {
15556       struct cl_target_option *existing_options
15557 	= TREE_TARGET_OPTION (existing_target);
15558 
15559       if (existing_options)
15560 	cl_target_option_restore (&global_options, existing_options);
15561     }
15562   else
15563     cl_target_option_restore (&global_options,
15564 			TREE_TARGET_OPTION (target_option_current_node));
15565 
15566   ret = aarch64_process_target_attr (args);
15567 
15568   /* Set up any additional state.  */
15569   if (ret)
15570     {
15571       aarch64_override_options_internal (&global_options);
15572       /* Initialize SIMD builtins if we haven't already.
15573 	 Set current_target_pragma to NULL for the duration so that
15574 	 the builtin initialization code doesn't try to tag the functions
15575 	 being built with the attributes specified by any current pragma, thus
15576 	 going into an infinite recursion.  */
15577       if (TARGET_SIMD)
15578 	{
15579 	  tree saved_current_target_pragma = current_target_pragma;
15580 	  current_target_pragma = NULL;
15581 	  aarch64_init_simd_builtins ();
15582 	  current_target_pragma = saved_current_target_pragma;
15583 	}
15584       new_target = build_target_option_node (&global_options);
15585     }
15586   else
15587     new_target = NULL;
15588 
15589   new_optimize = build_optimization_node (&global_options);
15590 
15591   if (fndecl && ret)
15592     {
15593       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15594 
15595       if (old_optimize != new_optimize)
15596 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15597     }
15598 
15599   cl_target_option_restore (&global_options, &cur_target);
15600 
15601   if (old_optimize != new_optimize)
15602     cl_optimization_restore (&global_options,
15603 			     TREE_OPTIMIZATION (old_optimize));
15604   return ret;
15605 }
15606 
15607 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
15608    tri-bool options (yes, no, don't care) and the default value is
15609    DEF, determine whether to reject inlining.  */
15610 
15611 static bool
15612 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15613 				     int dont_care, int def)
15614 {
15615   /* If the callee doesn't care, always allow inlining.  */
15616   if (callee == dont_care)
15617     return true;
15618 
15619   /* If the caller doesn't care, always allow inlining.  */
15620   if (caller == dont_care)
15621     return true;
15622 
15623   /* Otherwise, allow inlining if either the callee and caller values
15624      agree, or if the callee is using the default value.  */
15625   return (callee == caller || callee == def);
15626 }
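
/* Worked example of the tri-bool logic above (2 == "don't care", as used by
   aarch64_can_inline_p):

     aarch64_tribools_ok_for_inlining_p (0, 2, 2, 1) -> true  (callee doesn't care)
     aarch64_tribools_ok_for_inlining_p (2, 1, 2, 1) -> true  (caller doesn't care)
     aarch64_tribools_ok_for_inlining_p (0, 1, 2, 1) -> true  (callee matches DEF)
     aarch64_tribools_ok_for_inlining_p (0, 1, 2, 0) -> false (mismatch, not default)  */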
15627 
15628 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
15629    to inline CALLEE into CALLER based on target-specific info.
15630    Make sure that the caller and callee have compatible architectural
15631    features.  Then go through the other possible target attributes
15632    and see if they can block inlining.  Try not to reject always_inline
15633    callees unless they are incompatible architecturally.  */
15634 
15635 static bool
15636 aarch64_can_inline_p (tree caller, tree callee)
15637 {
15638   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15639   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15640 
15641   struct cl_target_option *caller_opts
15642 	= TREE_TARGET_OPTION (caller_tree ? caller_tree
15643 					   : target_option_default_node);
15644 
15645   struct cl_target_option *callee_opts
15646 	= TREE_TARGET_OPTION (callee_tree ? callee_tree
15647 					   : target_option_default_node);
15648 
15649   /* Callee's ISA flags should be a subset of the caller's.  */
15650   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15651        != callee_opts->x_aarch64_isa_flags)
15652     return false;
15653 
15654   /* Allow non-strict-aligned functions to be inlined into
15655      strict-aligned ones.  */
15656   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15657        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15658       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15659 	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15660     return false;
15661 
15662   bool always_inline = lookup_attribute ("always_inline",
15663 					  DECL_ATTRIBUTES (callee));
15664 
15665   /* If the architectural features match up and the callee is always_inline
15666      then the other attributes don't matter.  */
15667   if (always_inline)
15668     return true;
15669 
15670   if (caller_opts->x_aarch64_cmodel_var
15671       != callee_opts->x_aarch64_cmodel_var)
15672     return false;
15673 
15674   if (caller_opts->x_aarch64_tls_dialect
15675       != callee_opts->x_aarch64_tls_dialect)
15676     return false;
15677 
15678   /* Honour explicit requests to workaround errata.  */
15679   if (!aarch64_tribools_ok_for_inlining_p (
15680 	  caller_opts->x_aarch64_fix_a53_err835769,
15681 	  callee_opts->x_aarch64_fix_a53_err835769,
15682 	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15683     return false;
15684 
15685   if (!aarch64_tribools_ok_for_inlining_p (
15686 	  caller_opts->x_aarch64_fix_a53_err843419,
15687 	  callee_opts->x_aarch64_fix_a53_err843419,
15688 	  2, TARGET_FIX_ERR_A53_843419))
15689     return false;
15690 
15691   /* If the user explicitly specified -momit-leaf-frame-pointer for the
15692      caller and callee and they don't match up, reject inlining.  */
15693   if (!aarch64_tribools_ok_for_inlining_p (
15694 	  caller_opts->x_flag_omit_leaf_frame_pointer,
15695 	  callee_opts->x_flag_omit_leaf_frame_pointer,
15696 	  2, 1))
15697     return false;
15698 
15699   /* If the callee has specific tuning overrides, respect them.  */
15700   if (callee_opts->x_aarch64_override_tune_string != NULL
15701       && caller_opts->x_aarch64_override_tune_string == NULL)
15702     return false;
15703 
15704   /* If the user specified tuning override strings for the
15705      caller and callee and they don't match up, reject inlining.
15706      We just do a string compare here, we don't analyze the meaning
15707      of the string, as it would be too costly for little gain.  */
15708   if (callee_opts->x_aarch64_override_tune_string
15709       && caller_opts->x_aarch64_override_tune_string
15710       && (strcmp (callee_opts->x_aarch64_override_tune_string,
15711 		  caller_opts->x_aarch64_override_tune_string) != 0))
15712     return false;
15713 
15714   return true;
15715 }
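
/* For illustration (hypothetical user code): the ISA-subset rule above means
   that, in a translation unit compiled with -march=armv8-a+simd,

     __attribute__ ((target ("+nothing+fp"), always_inline))
     static inline double id (double x) { return x; }

   remains inlinable because its ISA flags are a subset of the caller's,
   whereas a callee marked target ("+sve") could not be inlined into a
   caller compiled without SVE.  */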
15716 
15717 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
15718    hasn't been already.  */
15719 
15720 unsigned int
15721 aarch64_tlsdesc_abi_id ()
15722 {
15723   predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
15724   if (!tlsdesc_abi.initialized_p ())
15725     {
15726       HARD_REG_SET full_reg_clobbers;
15727       CLEAR_HARD_REG_SET (full_reg_clobbers);
15728       SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
15729       SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
15730       for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
15731 	SET_HARD_REG_BIT (full_reg_clobbers, regno);
15732       tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
15733     }
15734   return tlsdesc_abi.id ();
15735 }
15736 
15737 /* Return true if SYMBOL_REF X binds locally.  */
15738 
15739 static bool
15740 aarch64_symbol_binds_local_p (const_rtx x)
15741 {
15742   return (SYMBOL_REF_DECL (x)
15743 	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
15744 	  : SYMBOL_REF_LOCAL_P (x));
15745 }
15746 
15747 /* Return true if SYMBOL_REF X is thread local.  */
15748 static bool
15749 aarch64_tls_symbol_p (rtx x)
15750 {
15751   if (! TARGET_HAVE_TLS)
15752     return false;
15753 
15754   if (GET_CODE (x) != SYMBOL_REF)
15755     return false;
15756 
15757   return SYMBOL_REF_TLS_MODEL (x) != 0;
15758 }
15759 
15760 /* Classify a TLS symbol into one of the TLS kinds.  */
15761 enum aarch64_symbol_type
15762 aarch64_classify_tls_symbol (rtx x)
15763 {
15764   enum tls_model tls_kind = tls_symbolic_operand_type (x);
15765 
15766   switch (tls_kind)
15767     {
15768     case TLS_MODEL_GLOBAL_DYNAMIC:
15769     case TLS_MODEL_LOCAL_DYNAMIC:
15770       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15771 
15772     case TLS_MODEL_INITIAL_EXEC:
15773       switch (aarch64_cmodel)
15774 	{
15775 	case AARCH64_CMODEL_TINY:
15776 	case AARCH64_CMODEL_TINY_PIC:
15777 	  return SYMBOL_TINY_TLSIE;
15778 	default:
15779 	  return SYMBOL_SMALL_TLSIE;
15780 	}
15781 
15782     case TLS_MODEL_LOCAL_EXEC:
15783       if (aarch64_tls_size == 12)
15784 	return SYMBOL_TLSLE12;
15785       else if (aarch64_tls_size == 24)
15786 	return SYMBOL_TLSLE24;
15787       else if (aarch64_tls_size == 32)
15788 	return SYMBOL_TLSLE32;
15789       else if (aarch64_tls_size == 48)
15790 	return SYMBOL_TLSLE48;
15791       else
15792 	gcc_unreachable ();
15793 
15794     case TLS_MODEL_EMULATED:
15795     case TLS_MODEL_NONE:
15796       return SYMBOL_FORCE_TO_MEM;
15797 
15798     default:
15799       gcc_unreachable ();
15800     }
15801 }
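
/* For example (illustrative; default -mtls-dialect=desc and the small code
   model): a reference to

     __thread int counter;

   in position-independent code uses the global-dynamic model and is
   classified as SYMBOL_SMALL_TLSDESC, while compiling the same reference
   with -ftls-model=initial-exec yields SYMBOL_SMALL_TLSIE (or
   SYMBOL_TINY_TLSIE under -mcmodel=tiny).  */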
15802 
15803 /* Return the correct method for accessing X + OFFSET, where X is either
15804    a SYMBOL_REF or LABEL_REF.  */
15805 
15806 enum aarch64_symbol_type
15807 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15808 {
15809   if (GET_CODE (x) == LABEL_REF)
15810     {
15811       switch (aarch64_cmodel)
15812 	{
15813 	case AARCH64_CMODEL_LARGE:
15814 	  return SYMBOL_FORCE_TO_MEM;
15815 
15816 	case AARCH64_CMODEL_TINY_PIC:
15817 	case AARCH64_CMODEL_TINY:
15818 	  return SYMBOL_TINY_ABSOLUTE;
15819 
15820 	case AARCH64_CMODEL_SMALL_SPIC:
15821 	case AARCH64_CMODEL_SMALL_PIC:
15822 	case AARCH64_CMODEL_SMALL:
15823 	  return SYMBOL_SMALL_ABSOLUTE;
15824 
15825 	default:
15826 	  gcc_unreachable ();
15827 	}
15828     }
15829 
15830   if (GET_CODE (x) == SYMBOL_REF)
15831     {
15832       if (aarch64_tls_symbol_p (x))
15833 	return aarch64_classify_tls_symbol (x);
15834 
15835       switch (aarch64_cmodel)
15836 	{
15837 	case AARCH64_CMODEL_TINY:
15838 	  /* When we retrieve symbol + offset address, we have to make sure
15839 	     the offset does not cause overflow of the final address.  But
15840 	     we have no way of knowing the address of symbol at compile time
15841 	     so we can't accurately say if the distance between the PC and
15842 	     symbol + offset is outside the addressable range of +/-1MB in the
15843 	     TINY code model.  So we limit the maximum offset to +/-64KB and
15844 	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15845 	     If offset_within_block_p is true we allow larger offsets.
15846 	     Furthermore force to memory if the symbol is a weak reference to
15847 	     something that doesn't resolve to a symbol in this module.  */
15848 
15849 	  if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15850 	    return SYMBOL_FORCE_TO_MEM;
15851 	  if (!(IN_RANGE (offset, -0x10000, 0x10000)
15852 		|| offset_within_block_p (x, offset)))
15853 	    return SYMBOL_FORCE_TO_MEM;
15854 
15855 	  return SYMBOL_TINY_ABSOLUTE;
15856 
15857 	case AARCH64_CMODEL_SMALL:
15858 	  /* Same reasoning as the tiny code model, but the offset cap here is
15859 	     1MB, allowing +/-3.9GB for the offset to the symbol.  */
15860 
15861 	  if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15862 	    return SYMBOL_FORCE_TO_MEM;
15863 	  if (!(IN_RANGE (offset, -0x100000, 0x100000)
15864 		|| offset_within_block_p (x, offset)))
15865 	    return SYMBOL_FORCE_TO_MEM;
15866 
15867 	  return SYMBOL_SMALL_ABSOLUTE;
15868 
15869 	case AARCH64_CMODEL_TINY_PIC:
15870 	  if (!aarch64_symbol_binds_local_p (x))
15871 	    return SYMBOL_TINY_GOT;
15872 	  return SYMBOL_TINY_ABSOLUTE;
15873 
15874 	case AARCH64_CMODEL_SMALL_SPIC:
15875 	case AARCH64_CMODEL_SMALL_PIC:
15876 	  if (!aarch64_symbol_binds_local_p (x))
15877 	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15878 		    ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15879 	  return SYMBOL_SMALL_ABSOLUTE;
15880 
15881 	case AARCH64_CMODEL_LARGE:
15882 	  /* This is alright even in PIC code as the constant
15883 	     pool reference is always PC relative and within
15884 	     the same translation unit.  */
15885 	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15886 	    return SYMBOL_SMALL_ABSOLUTE;
15887 	  else
15888 	    return SYMBOL_FORCE_TO_MEM;
15889 
15890 	default:
15891 	  gcc_unreachable ();
15892 	}
15893     }
15894 
15895   /* By default push everything into the constant pool.  */
15896   return SYMBOL_FORCE_TO_MEM;
15897 }
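
/* Illustrative example of the offset capping above, under -mcmodel=small
   (hypothetical user code):

     extern char table[];
     char *near (void) { return &table[0x1000]; }   // SYMBOL_SMALL_ABSOLUTE
     char *far (void) { return &table[0x200000]; }  // SYMBOL_FORCE_TO_MEM

   Nothing is known about TABLE's size, so offset_within_block_p cannot
   rescue the 2MB offset in the second function.  */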
15898 
15899 bool
15900 aarch64_constant_address_p (rtx x)
15901 {
15902   return (CONSTANT_P (x) && memory_address_p (DImode, x));
15903 }
15904 
15905 bool
15906 aarch64_legitimate_pic_operand_p (rtx x)
15907 {
15908   if (GET_CODE (x) == SYMBOL_REF
15909       || (GET_CODE (x) == CONST
15910 	  && GET_CODE (XEXP (x, 0)) == PLUS
15911 	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15912      return false;
15913 
15914   return true;
15915 }
15916 
15917 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
15918    that should be rematerialized rather than spilled.  */
15919 
15920 static bool
15921 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15922 {
15923   /* Support CSE and rematerialization of common constants.  */
15924   if (CONST_INT_P (x)
15925       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15926       || GET_CODE (x) == CONST_VECTOR)
15927     return true;
15928 
15929   /* Do not allow vector struct mode constants for Advanced SIMD.
15930      We could support 0 and -1 easily, but they need support in
15931      aarch64-simd.md.  */
15932   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15933   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15934     return false;
15935 
15936   /* Only accept variable-length vector constants if they can be
15937      handled directly.
15938 
15939      ??? It would be possible to handle rematerialization of other
15940      constants via secondary reloads.  */
15941   if (vec_flags & VEC_ANY_SVE)
15942     return aarch64_simd_valid_immediate (x, NULL);
15943 
15944   if (GET_CODE (x) == HIGH)
15945     x = XEXP (x, 0);
15946 
15947   /* Accept polynomial constants that can be calculated by using the
15948      destination of a move as the sole temporary.  Constants that
15949      require a second temporary cannot be rematerialized (they can't be
15950      forced to memory and also aren't legitimate constants).  */
15951   poly_int64 offset;
15952   if (poly_int_rtx_p (x, &offset))
15953     return aarch64_offset_temporaries (false, offset) <= 1;
15954 
15955   /* If an offset is being added to something else, we need to allow the
15956      base to be moved into the destination register, meaning that there
15957      are no free temporaries for the offset.  */
15958   x = strip_offset (x, &offset);
15959   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15960     return false;
15961 
15962   /* Do not allow const (plus (anchor_symbol, const_int)).  */
15963   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15964     return false;
15965 
15966   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
15967      so spilling them is better than rematerialization.  */
15968   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15969     return true;
15970 
15971   /* Label references are always constant.  */
15972   if (GET_CODE (x) == LABEL_REF)
15973     return true;
15974 
15975   return false;
15976 }
15977 
15978 rtx
15979 aarch64_load_tp (rtx target)
15980 {
15981   if (!target
15982       || GET_MODE (target) != Pmode
15983       || !register_operand (target, Pmode))
15984     target = gen_reg_rtx (Pmode);
15985 
15986   /* Can return in any reg.  */
15987   emit_insn (gen_aarch64_load_tp_hard (target));
15988   return target;
15989 }
15990 
15991 /* On AAPCS systems, this is the "struct __va_list".  */
15992 static GTY(()) tree va_list_type;
15993 
15994 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15995    Return the type to use as __builtin_va_list.
15996 
15997    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15998 
15999    struct __va_list
16000    {
16001      void *__stack;
16002      void *__gr_top;
16003      void *__vr_top;
16004      int   __gr_offs;
16005      int   __vr_offs;
16006    };  */
16007 
16008 static tree
16009 aarch64_build_builtin_va_list (void)
16010 {
16011   tree va_list_name;
16012   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16013 
16014   /* Create the type.  */
16015   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
16016   /* Give it the required name.  */
16017   va_list_name = build_decl (BUILTINS_LOCATION,
16018 			     TYPE_DECL,
16019 			     get_identifier ("__va_list"),
16020 			     va_list_type);
16021   DECL_ARTIFICIAL (va_list_name) = 1;
16022   TYPE_NAME (va_list_type) = va_list_name;
16023   TYPE_STUB_DECL (va_list_type) = va_list_name;
16024 
16025   /* Create the fields.  */
16026   f_stack = build_decl (BUILTINS_LOCATION,
16027 			FIELD_DECL, get_identifier ("__stack"),
16028 			ptr_type_node);
16029   f_grtop = build_decl (BUILTINS_LOCATION,
16030 			FIELD_DECL, get_identifier ("__gr_top"),
16031 			ptr_type_node);
16032   f_vrtop = build_decl (BUILTINS_LOCATION,
16033 			FIELD_DECL, get_identifier ("__vr_top"),
16034 			ptr_type_node);
16035   f_groff = build_decl (BUILTINS_LOCATION,
16036 			FIELD_DECL, get_identifier ("__gr_offs"),
16037 			integer_type_node);
16038   f_vroff = build_decl (BUILTINS_LOCATION,
16039 			FIELD_DECL, get_identifier ("__vr_offs"),
16040 			integer_type_node);
16041 
16042   /* Tell the tree-stdarg pass about our internal offset fields.
16043      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
16044      purposes, to identify whether the code is updating the va_list internal
16045      offset fields in an irregular way.  */
16046   va_list_gpr_counter_field = f_groff;
16047   va_list_fpr_counter_field = f_vroff;
16048 
16049   DECL_ARTIFICIAL (f_stack) = 1;
16050   DECL_ARTIFICIAL (f_grtop) = 1;
16051   DECL_ARTIFICIAL (f_vrtop) = 1;
16052   DECL_ARTIFICIAL (f_groff) = 1;
16053   DECL_ARTIFICIAL (f_vroff) = 1;
16054 
16055   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
16056   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
16057   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
16058   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
16059   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
16060 
16061   TYPE_FIELDS (va_list_type) = f_stack;
16062   DECL_CHAIN (f_stack) = f_grtop;
16063   DECL_CHAIN (f_grtop) = f_vrtop;
16064   DECL_CHAIN (f_vrtop) = f_groff;
16065   DECL_CHAIN (f_groff) = f_vroff;
16066 
16067   /* Compute its layout.  */
16068   layout_type (va_list_type);
16069 
16070   return va_list_type;
16071 }
16072 
16073 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
16074 static void
16075 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
16076 {
16077   const CUMULATIVE_ARGS *cum;
16078   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16079   tree stack, grtop, vrtop, groff, vroff;
16080   tree t;
16081   int gr_save_area_size = cfun->va_list_gpr_size;
16082   int vr_save_area_size = cfun->va_list_fpr_size;
16083   int vr_offset;
16084 
16085   cum = &crtl->args.info;
16086   if (cfun->va_list_gpr_size)
16087     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
16088 			     cfun->va_list_gpr_size);
16089   if (cfun->va_list_fpr_size)
16090     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
16091 			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
16092 
16093   if (!TARGET_FLOAT)
16094     {
16095       gcc_assert (cum->aapcs_nvrn == 0);
16096       vr_save_area_size = 0;
16097     }
16098 
16099   f_stack = TYPE_FIELDS (va_list_type_node);
16100   f_grtop = DECL_CHAIN (f_stack);
16101   f_vrtop = DECL_CHAIN (f_grtop);
16102   f_groff = DECL_CHAIN (f_vrtop);
16103   f_vroff = DECL_CHAIN (f_groff);
16104 
16105   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
16106 		  NULL_TREE);
16107   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
16108 		  NULL_TREE);
16109   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
16110 		  NULL_TREE);
16111   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
16112 		  NULL_TREE);
16113   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
16114 		  NULL_TREE);
16115 
16116   /* Emit code to initialize STACK, which points to the next varargs stack
16117      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
16118      by named arguments.  STACK is 8-byte aligned.  */
16119   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
16120   if (cum->aapcs_stack_size > 0)
16121     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
16122   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
16123   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16124 
16125   /* Emit code to initialize GRTOP, the top of the GR save area.
16126      virtual_incoming_args_rtx should have been 16 byte aligned.  */
16127   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
16128   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
16129   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16130 
16131   /* Emit code to initialize VRTOP, the top of the VR save area.
16132      This address is gr_save_area_bytes below GRTOP, rounded
16133      down to the next 16-byte boundary.  */
16134   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
16135   vr_offset = ROUND_UP (gr_save_area_size,
16136 			STACK_BOUNDARY / BITS_PER_UNIT);
16137 
16138   if (vr_offset)
16139     t = fold_build_pointer_plus_hwi (t, -vr_offset);
16140   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
16141   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16142 
16143   /* Emit code to initialize GROFF, the offset from GRTOP of the
16144      next GPR argument.  */
16145   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
16146 	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
16147   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16148 
16149   /* Likewise emit code to initialize VROFF, the offset from VRTOP
16150      of the next VR argument.  */
16151   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
16152 	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
16153   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16154 }
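
/* Worked example (illustrative; assumes TARGET_FLOAT and that the
   tree-stdarg pass does not shrink the save areas): for

     #include <stdarg.h>
     int
     sum (int first, ...)
     {
       va_list ap;
       va_start (ap, first);
       ...
     }

   only one GPR (w0) is named, so va_start leaves __gr_offs = -56
   (7 remaining GPRs of 8 bytes each) and __vr_offs = -128 (8 FP/SIMD
   registers of 16 bytes each), with __gr_top and __vr_top pointing just
   past their respective save areas.  */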
16155 
16156 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
16157 
16158 static tree
16159 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
16160 			      gimple_seq *post_p ATTRIBUTE_UNUSED)
16161 {
16162   tree addr;
16163   bool indirect_p;
16164   bool is_ha;		/* is HFA or HVA.  */
16165   bool dw_align;	/* double-word align.  */
16166   machine_mode ag_mode = VOIDmode;
16167   int nregs;
16168   machine_mode mode;
16169 
16170   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16171   tree stack, f_top, f_off, off, arg, roundup, on_stack;
16172   HOST_WIDE_INT size, rsize, adjust, align;
16173   tree t, u, cond1, cond2;
16174 
16175   indirect_p = pass_va_arg_by_reference (type);
16176   if (indirect_p)
16177     type = build_pointer_type (type);
16178 
16179   mode = TYPE_MODE (type);
16180 
16181   f_stack = TYPE_FIELDS (va_list_type_node);
16182   f_grtop = DECL_CHAIN (f_stack);
16183   f_vrtop = DECL_CHAIN (f_grtop);
16184   f_groff = DECL_CHAIN (f_vrtop);
16185   f_vroff = DECL_CHAIN (f_groff);
16186 
16187   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
16188 		  f_stack, NULL_TREE);
16189   size = int_size_in_bytes (type);
16190 
16191   bool abi_break;
16192   align
16193     = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
16194 
16195   dw_align = false;
16196   adjust = 0;
16197   if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
16198 					       &is_ha, false))
16199     {
16200       /* No frontends can create types with variable-sized modes, so we
16201 	 shouldn't be asked to pass or return them.  */
16202       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16203 
16204       /* TYPE passed in fp/simd registers.  */
16205       if (!TARGET_FLOAT)
16206 	aarch64_err_no_fpadvsimd (mode);
16207 
16208       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16209 		      unshare_expr (valist), f_vrtop, NULL_TREE);
16210       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16211 		      unshare_expr (valist), f_vroff, NULL_TREE);
16212 
16213       rsize = nregs * UNITS_PER_VREG;
16214 
16215       if (is_ha)
16216 	{
16217 	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16218 	    adjust = UNITS_PER_VREG - ag_size;
16219 	}
16220       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16221 	       && size < UNITS_PER_VREG)
16222 	{
16223 	  adjust = UNITS_PER_VREG - size;
16224 	}
16225     }
16226   else
16227     {
16228       /* TYPE passed in general registers.  */
16229       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16230 		      unshare_expr (valist), f_grtop, NULL_TREE);
16231       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16232 		      unshare_expr (valist), f_groff, NULL_TREE);
16233       rsize = ROUND_UP (size, UNITS_PER_WORD);
16234       nregs = rsize / UNITS_PER_WORD;
16235 
16236       if (align > 8)
16237 	{
16238 	  if (abi_break && warn_psabi)
16239 	    inform (input_location, "parameter passing for argument of type "
16240 		    "%qT changed in GCC 9.1", type);
16241 	  dw_align = true;
16242 	}
16243 
16244       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16245 	  && size < UNITS_PER_WORD)
16246 	{
16247 	  adjust = UNITS_PER_WORD  - size;
16248 	}
16249     }
16250 
16251   /* Get a local temporary for the field value.  */
16252   off = get_initialized_tmp_var (f_off, pre_p, NULL);
16253 
16254   /* Emit code to branch if off >= 0.  */
16255   t = build2 (GE_EXPR, boolean_type_node, off,
16256 	      build_int_cst (TREE_TYPE (off), 0));
16257   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16258 
16259   if (dw_align)
16260     {
16261       /* Emit: offs = (offs + 15) & -16.  */
16262       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16263 		  build_int_cst (TREE_TYPE (off), 15));
16264       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16265 		  build_int_cst (TREE_TYPE (off), -16));
16266       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16267     }
16268   else
16269     roundup = NULL;
16270 
16271   /* Update ap.__[g|v]r_offs  */
16272   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16273 	      build_int_cst (TREE_TYPE (off), rsize));
16274   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16275 
16276   /* String up.  */
16277   if (roundup)
16278     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16279 
16280   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
16281   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16282 	      build_int_cst (TREE_TYPE (f_off), 0));
16283   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16284 
16285   /* String up: make sure the assignment happens before the use.  */
16286   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16287   COND_EXPR_ELSE (cond1) = t;
16288 
16289   /* Prepare the trees handling the argument that is passed on the stack;
16290      the top-level node will be stored in ON_STACK.  */
16291   arg = get_initialized_tmp_var (stack, pre_p, NULL);
16292   if (align > 8)
16293     {
16294       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
16295       t = fold_build_pointer_plus_hwi (arg, 15);
16296       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16297 		  build_int_cst (TREE_TYPE (t), -16));
16298       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16299     }
16300   else
16301     roundup = NULL;
16302   /* Advance ap.__stack  */
16303   t = fold_build_pointer_plus_hwi (arg, size + 7);
16304   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16305 	      build_int_cst (TREE_TYPE (t), -8));
16306   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16307   /* String up roundup and advance.  */
16308   if (roundup)
16309     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16310   /* String up with arg */
16311   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16312   /* Big-endianness related address adjustment.  */
16313   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16314       && size < UNITS_PER_WORD)
16315   {
16316     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16317 		size_int (UNITS_PER_WORD - size));
16318     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16319   }
16320 
16321   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16322   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16323 
16324   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
16325   t = off;
16326   if (adjust)
16327     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16328 		build_int_cst (TREE_TYPE (off), adjust));
16329 
16330   t = fold_convert (sizetype, t);
16331   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16332 
16333   if (is_ha)
16334     {
16335       /* type ha; // treat as "struct {ftype field[n];}"
16336          ... [computing offs]
16337          for (i = 0; i <nregs; ++i, offs += 16)
16338 	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16339 	 return ha;  */
16340       int i;
16341       tree tmp_ha, field_t, field_ptr_t;
16342 
16343       /* Declare a local variable.  */
16344       tmp_ha = create_tmp_var_raw (type, "ha");
16345       gimple_add_tmp_var (tmp_ha);
16346 
16347       /* Establish the base type.  */
16348       switch (ag_mode)
16349 	{
16350 	case E_SFmode:
16351 	  field_t = float_type_node;
16352 	  field_ptr_t = float_ptr_type_node;
16353 	  break;
16354 	case E_DFmode:
16355 	  field_t = double_type_node;
16356 	  field_ptr_t = double_ptr_type_node;
16357 	  break;
16358 	case E_TFmode:
16359 	  field_t = long_double_type_node;
16360 	  field_ptr_t = long_double_ptr_type_node;
16361 	  break;
16362 	case E_HFmode:
16363 	  field_t = aarch64_fp16_type_node;
16364 	  field_ptr_t = aarch64_fp16_ptr_type_node;
16365 	  break;
16366 	case E_BFmode:
16367 	  field_t = aarch64_bf16_type_node;
16368 	  field_ptr_t = aarch64_bf16_ptr_type_node;
16369 	  break;
16370 	case E_V2SImode:
16371 	case E_V4SImode:
16372 	    {
16373 	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16374 	      field_t = build_vector_type_for_mode (innertype, ag_mode);
16375 	      field_ptr_t = build_pointer_type (field_t);
16376 	    }
16377 	  break;
16378 	default:
16379 	  gcc_assert (0);
16380 	}
16381 
16382       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area);  */
16383       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16384       addr = t;
16385       t = fold_convert (field_ptr_t, addr);
16386       t = build2 (MODIFY_EXPR, field_t,
16387 		  build1 (INDIRECT_REF, field_t, tmp_ha),
16388 		  build1 (INDIRECT_REF, field_t, t));
16389 
16390       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
16391       for (i = 1; i < nregs; ++i)
16392 	{
16393 	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16394 	  u = fold_convert (field_ptr_t, addr);
16395 	  u = build2 (MODIFY_EXPR, field_t,
16396 		      build2 (MEM_REF, field_t, tmp_ha,
16397 			      build_int_cst (field_ptr_t,
16398 					     (i *
16399 					      int_size_in_bytes (field_t)))),
16400 		      build1 (INDIRECT_REF, field_t, u));
16401 	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16402 	}
16403 
16404       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16405       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16406     }
16407 
16408   COND_EXPR_ELSE (cond2) = t;
16409   addr = fold_convert (build_pointer_type (type), cond1);
16410   addr = build_va_arg_indirect_ref (addr);
16411 
16412   if (indirect_p)
16413     addr = build_va_arg_indirect_ref (addr);
16414 
16415   return addr;
16416 }
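
/* Illustrative sketch (not generated verbatim) of the code built above for
   a non-HA, non-indirect argument of type T passed in general registers:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (addr + size + 7) & -8;
   done:
     result = *(T *) addr;

   with extra rounding when the type requires 16-byte alignment and an
   address adjustment for big-endian targets.  */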
16417 
16418 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
16419 
16420 static void
16421 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16422 				const function_arg_info &arg,
16423 				int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
16424 {
16425   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16426   CUMULATIVE_ARGS local_cum;
16427   int gr_saved = cfun->va_list_gpr_size;
16428   int vr_saved = cfun->va_list_fpr_size;
16429 
16430   /* The caller has advanced CUM up to, but not beyond, the last named
16431      argument.  Advance a local copy of CUM past the last "real" named
16432      argument, to find out how many registers are left over.  */
16433   local_cum = *cum;
16434   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
16435 
16436   /* Find out how many registers we need to save.
16437      Honor the tree-stdarg analysis results.  */
16438   if (cfun->va_list_gpr_size)
16439     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16440 		    cfun->va_list_gpr_size / UNITS_PER_WORD);
16441   if (cfun->va_list_fpr_size)
16442     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16443 		    cfun->va_list_fpr_size / UNITS_PER_VREG);
16444 
16445   if (!TARGET_FLOAT)
16446     {
16447       gcc_assert (local_cum.aapcs_nvrn == 0);
16448       vr_saved = 0;
16449     }
16450 
16451   if (!no_rtl)
16452     {
16453       if (gr_saved > 0)
16454 	{
16455 	  rtx ptr, mem;
16456 
16457 	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
16458 	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16459 			       - gr_saved * UNITS_PER_WORD);
16460 	  mem = gen_frame_mem (BLKmode, ptr);
16461 	  set_mem_alias_set (mem, get_varargs_alias_set ());
16462 
16463 	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16464 			       mem, gr_saved);
16465 	}
16466       if (vr_saved > 0)
16467 	{
16468 	  /* We can't use move_block_from_reg, because it will use
16469 	     the wrong mode, storing D regs only.  */
16470 	  machine_mode mode = TImode;
16471 	  int off, i, vr_start;
16472 
16473 	  /* Set OFF to the offset from virtual_incoming_args_rtx of
16474 	     the first vector register.  The VR save area lies below
16475 	     the GR one, and is aligned to 16 bytes.  */
16476 	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16477 			   STACK_BOUNDARY / BITS_PER_UNIT);
16478 	  off -= vr_saved * UNITS_PER_VREG;
16479 
16480 	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16481 	  for (i = 0; i < vr_saved; ++i)
16482 	    {
16483 	      rtx ptr, mem;
16484 
16485 	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16486 	      mem = gen_frame_mem (mode, ptr);
16487 	      set_mem_alias_set (mem, get_varargs_alias_set ());
16488 	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
16489 	      off += UNITS_PER_VREG;
16490 	    }
16491 	}
16492     }
16493 
16494   /* We don't save the size into *PRETEND_SIZE because we want to avoid
16495      any complication of having crtl->args.pretend_args_size changed.  */
16496   cfun->machine->frame.saved_varargs_size
16497     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16498 		 STACK_BOUNDARY / BITS_PER_UNIT)
16499        + vr_saved * UNITS_PER_VREG);
16500 }
16501 
16502 static void
16503 aarch64_conditional_register_usage (void)
16504 {
16505   int i;
16506   if (!TARGET_FLOAT)
16507     {
16508       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16509 	{
16510 	  fixed_regs[i] = 1;
16511 	  call_used_regs[i] = 1;
16512 	}
16513     }
16514   if (!TARGET_SVE)
16515     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16516       {
16517 	fixed_regs[i] = 1;
16518 	call_used_regs[i] = 1;
16519       }
16520 
16521   /* Only allow the FFR and FFRT to be accessed via special patterns.  */
16522   CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16523   CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16524 
16525   /* When tracking speculation, we need a couple of call-clobbered registers
16526      to track the speculation state.  It would be nice to just use
16527      IP0 and IP1, but currently there are numerous places that just
16528      assume these registers are free for other uses (e.g. pointer
16529      authentication).  */
16530   if (aarch64_track_speculation)
16531     {
16532       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16533       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16534       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16535       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16536     }
16537 }
16538 
16539 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK.  */
16540 
16541 bool
16542 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16543 {
16544   /* For records we're passed a FIELD_DECL, for arrays we're passed
16545      an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
16546   const_tree type = TREE_TYPE (field_or_array);
16547 
16548   /* Assign BLKmode to anything that contains multiple SVE predicates.
16549      For structures, the "multiple" case is indicated by MODE being
16550      VOIDmode.  */
16551   unsigned int num_zr, num_pr;
16552   if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16553     {
16554       if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16555 	return !simple_cst_equal (TYPE_SIZE (field_or_array),
16556 				  TYPE_SIZE (type));
16557       return mode == VOIDmode;
16558     }
16559 
16560   return default_member_type_forces_blk (field_or_array, mode);
16561 }
16562 
16563 /* Bitmasks that indicate whether earlier versions of GCC would have
16564    taken a different path through the ABI logic.  This should result in
16565    a -Wpsabi warning if the earlier path led to a different ABI decision.
16566 
16567    WARN_PSABI_EMPTY_CXX17_BASE
16568       Indicates that the type includes an artificial empty C++17 base field
16569       that, prior to GCC 10.1, would prevent the type from being treated as
16570       a HFA or HVA.  See PR94383 for details.
16571 
16572    WARN_PSABI_NO_UNIQUE_ADDRESS
16573       Indicates that the type includes an empty [[no_unique_address]] field
16574       that, prior to GCC 10.1, would prevent the type from being treated as
16575       a HFA or HVA.  */
16576 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
16577 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
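
/* For example (see PR94383; hypothetical C++ code):

     struct empty {};
     struct derived : empty { double x, y; };

   GCC 10.1 and later ignore the artificial field created for the empty
   base and classify 'derived' as a homogeneous floating-point aggregate
   of two doubles; aapcs_vfp_sub_candidate sets WARN_PSABI_EMPTY_CXX17_BASE
   so that -Wpsabi can report the change from earlier releases.  */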
16578 
16579 /* Walk down the type tree of TYPE counting consecutive base elements.
16580    If *MODEP is VOIDmode, then set it to the first valid floating point
16581    type.  If a non-floating point type is found, or if a floating point
16582    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
16583    otherwise return the count in the sub-tree.
16584 
16585    The WARN_PSABI_FLAGS argument allows the caller to check whether this
16586    function has changed its behavior relative to earlier versions of GCC.
16587    Normally the argument should be nonnull and point to a zero-initialized
16588    variable.  The function then records whether the ABI decision might
16589    be affected by a known fix to the ABI logic, setting the associated
16590    WARN_PSABI_* bits if so.
16591 
16592    When the argument is instead a null pointer, the function tries to
16593    simulate the behavior of GCC before all such ABI fixes were made.
16594    This is useful to check whether the function returns something
16595    different after the ABI fixes.  */
16596 static int
16597 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
16598 			 unsigned int *warn_psabi_flags)
16599 {
16600   machine_mode mode;
16601   HOST_WIDE_INT size;
16602 
16603   if (aarch64_sve::builtin_type_p (type))
16604     return -1;
16605 
16606   switch (TREE_CODE (type))
16607     {
16608     case REAL_TYPE:
16609       mode = TYPE_MODE (type);
16610       if (mode != DFmode && mode != SFmode
16611 	  && mode != TFmode && mode != HFmode)
16612 	return -1;
16613 
16614       if (*modep == VOIDmode)
16615 	*modep = mode;
16616 
16617       if (*modep == mode)
16618 	return 1;
16619 
16620       break;
16621 
16622     case COMPLEX_TYPE:
16623       mode = TYPE_MODE (TREE_TYPE (type));
16624       if (mode != DFmode && mode != SFmode
16625 	  && mode != TFmode && mode != HFmode)
16626 	return -1;
16627 
16628       if (*modep == VOIDmode)
16629 	*modep = mode;
16630 
16631       if (*modep == mode)
16632 	return 2;
16633 
16634       break;
16635 
16636     case VECTOR_TYPE:
16637       /* Use V2SImode and V4SImode as representatives of all 64-bit
16638 	 and 128-bit vector types.  */
16639       size = int_size_in_bytes (type);
16640       switch (size)
16641 	{
16642 	case 8:
16643 	  mode = V2SImode;
16644 	  break;
16645 	case 16:
16646 	  mode = V4SImode;
16647 	  break;
16648 	default:
16649 	  return -1;
16650 	}
16651 
16652       if (*modep == VOIDmode)
16653 	*modep = mode;
16654 
16655       /* Vector modes are considered to be opaque: two vectors are
16656 	 equivalent for the purposes of being homogeneous aggregates
16657 	 if they are the same size.  */
16658       if (*modep == mode)
16659 	return 1;
16660 
16661       break;
16662 
16663     case ARRAY_TYPE:
16664       {
16665 	int count;
16666 	tree index = TYPE_DOMAIN (type);
16667 
16668 	/* Can't handle incomplete types nor sizes that are not
16669 	   fixed.  */
16670 	if (!COMPLETE_TYPE_P (type)
16671 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16672 	  return -1;
16673 
16674 	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
16675 					 warn_psabi_flags);
16676 	if (count == -1
16677 	    || !index
16678 	    || !TYPE_MAX_VALUE (index)
16679 	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
16680 	    || !TYPE_MIN_VALUE (index)
16681 	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
16682 	    || count < 0)
16683 	  return -1;
16684 
16685 	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16686 		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
16687 
16688 	/* There must be no padding.  */
16689 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16690 		      count * GET_MODE_BITSIZE (*modep)))
16691 	  return -1;
16692 
16693 	return count;
16694       }
16695 
16696     case RECORD_TYPE:
16697       {
16698 	int count = 0;
16699 	int sub_count;
16700 	tree field;
16701 
16702 	/* Can't handle incomplete types nor sizes that are not
16703 	   fixed.  */
16704 	if (!COMPLETE_TYPE_P (type)
16705 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16706 	  return -1;
16707 
16708 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16709 	  {
16710 	    if (TREE_CODE (field) != FIELD_DECL)
16711 	      continue;
16712 
16713 	    if (DECL_FIELD_ABI_IGNORED (field))
16714 	      {
16715 		/* See whether this is something that earlier versions of
16716 		   GCC failed to ignore.  */
16717 		unsigned int flag;
16718 		if (lookup_attribute ("no_unique_address",
16719 				      DECL_ATTRIBUTES (field)))
16720 		  flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
16721 		else if (cxx17_empty_base_field_p (field))
16722 		  flag = WARN_PSABI_EMPTY_CXX17_BASE;
16723 		else
16724 		  /* No compatibility problem.  */
16725 		  continue;
16726 
16727 		/* Simulate the old behavior when WARN_PSABI_FLAGS is null.  */
16728 		if (warn_psabi_flags)
16729 		  {
16730 		    *warn_psabi_flags |= flag;
16731 		    continue;
16732 		  }
16733 	      }
16734 
16735 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
16736 						 warn_psabi_flags);
16737 	    if (sub_count < 0)
16738 	      return -1;
16739 	    count += sub_count;
16740 	  }
16741 
16742 	/* There must be no padding.  */
16743 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16744 		      count * GET_MODE_BITSIZE (*modep)))
16745 	  return -1;
16746 
16747 	return count;
16748       }
16749 
16750     case UNION_TYPE:
16751     case QUAL_UNION_TYPE:
16752       {
16753 	/* These aren't very interesting except in a degenerate case.  */
16754 	int count = 0;
16755 	int sub_count;
16756 	tree field;
16757 
16758 	/* Can't handle incomplete types nor sizes that are not
16759 	   fixed.  */
16760 	if (!COMPLETE_TYPE_P (type)
16761 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16762 	  return -1;
16763 
16764 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16765 	  {
16766 	    if (TREE_CODE (field) != FIELD_DECL)
16767 	      continue;
16768 
16769 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
16770 						 warn_psabi_flags);
16771 	    if (sub_count < 0)
16772 	      return -1;
16773 	    count = count > sub_count ? count : sub_count;
16774 	  }
16775 
16776 	/* There must be no padding.  */
16777 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16778 		      count * GET_MODE_BITSIZE (*modep)))
16779 	  return -1;
16780 
16781 	return count;
16782       }
16783 
16784     default:
16785       break;
16786     }
16787 
16788   return -1;
16789 }
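
/* Illustrative examples (hypothetical user types; the vector one assumes
   arm_neon.h):

     struct hfa   { double a, b, c; };        // returns 3, *MODEP == DFmode
     struct mixed { double a; float b; };     // returns -1: base types differ
     struct hva   { float32x4_t lo, hi; };    // returns 2: same-size vectors

   Only the first and third can be treated as HFA/HVA candidates by
   aarch64_vfp_is_call_or_return_candidate (the count must also not exceed
   HA_MAX_NUM_FLDS).  */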
16790 
16791 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16792    type as described in AAPCS64 \S 4.1.2.
16793 
16794    See the comment above aarch64_composite_type_p for the notes on MODE.  */
16795 
16796 static bool
16797 aarch64_short_vector_p (const_tree type,
16798 			machine_mode mode)
16799 {
16800   poly_int64 size = -1;
16801 
16802   if (type && TREE_CODE (type) == VECTOR_TYPE)
16803     {
16804       if (aarch64_sve::builtin_type_p (type))
16805 	return false;
16806       size = int_size_in_bytes (type);
16807     }
16808   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16809 	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16810     {
16811       /* Rely only on the type, not the mode, when processing SVE types.  */
16812       if (type && aarch64_some_values_include_pst_objects_p (type))
16813 	gcc_assert (aarch64_sve_mode_p (mode));
16814       else
16815 	size = GET_MODE_SIZE (mode);
16816     }
16817   if (known_eq (size, 8) || known_eq (size, 16))
16818     {
16819       /* 64-bit and 128-bit vectors should only acquire an SVE mode if
16820 	 they are being treated as scalable AAPCS64 types.  */
16821       gcc_assert (!aarch64_sve_mode_p (mode));
16822       return true;
16823     }
16824   return false;
16825 }
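
/* For example (hypothetical user types): the GNU vector type

     typedef int v2si __attribute__ ((vector_size (8)));

   is a short vector (total size 8 bytes), as is int32x4_t from arm_neon.h
   (16 bytes), whereas an SVE ACLE type such as svint32_t is rejected above
   even though its mode is a vector mode.  */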
16826 
16827 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16828    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
16829    array types.  The C99 floating-point complex types are also considered
16830    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
16831    types, which are GCC extensions and out of the scope of AAPCS64, are
16832    treated as composite types here as well.
16833 
16834    Note that MODE itself is not sufficient in determining whether a type
16835    is such a composite type or not.  This is because
16836    stor-layout.c:compute_record_mode may have already changed the MODE
16837    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
16838    structure with only one field may have its MODE set to the mode of the
16839    field.  Also an integer mode whose size matches the size of the
16840    RECORD_TYPE type may be used to substitute the original mode
16841    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
16842    solely relied on.  */
16843 
16844 static bool
16845 aarch64_composite_type_p (const_tree type,
16846 			  machine_mode mode)
16847 {
16848   if (aarch64_short_vector_p (type, mode))
16849     return false;
16850 
16851   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
16852     return true;
16853 
16854   if (mode == BLKmode
16855       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16856       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16857     return true;
16858 
16859   return false;
16860 }
16861 
16862 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16863    shall be passed or returned in simd/fp register(s) (provided these
16864    parameter passing registers are available).
16865 
16866    Upon successful return, *COUNT returns the number of needed registers,
16867    *BASE_MODE returns the mode of the individual register and, when IS_HA
16868    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16869    floating-point aggregate or a homogeneous short-vector aggregate.
16870 
16871    SILENT_P is true if the function should refrain from reporting any
16872    diagnostics.  This should only be used if the caller is certain that
16873    any ABI decisions would eventually come through this function with
16874    SILENT_P set to false.  */
16875 
16876 static bool
16877 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16878 					 const_tree type,
16879 					 machine_mode *base_mode,
16880 					 int *count,
16881 					 bool *is_ha,
16882 					 bool silent_p)
16883 {
16884   if (is_ha != NULL) *is_ha = false;
16885 
16886   machine_mode new_mode = VOIDmode;
16887   bool composite_p = aarch64_composite_type_p (type, mode);
16888 
16889   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16890       || aarch64_short_vector_p (type, mode))
16891     {
16892       *count = 1;
16893       new_mode = mode;
16894     }
16895   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16896     {
16897       if (is_ha != NULL) *is_ha = true;
16898       *count = 2;
16899       new_mode = GET_MODE_INNER (mode);
16900     }
16901   else if (type && composite_p)
16902     {
16903       unsigned int warn_psabi_flags = 0;
16904       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
16905 					      &warn_psabi_flags);
16906       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16907 	{
16908 	  static unsigned last_reported_type_uid;
16909 	  unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
16910 	  int alt;
16911 	  if (!silent_p
16912 	      && warn_psabi
16913 	      && warn_psabi_flags
16914 	      && uid != last_reported_type_uid
16915 	      && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
16916 		  != ag_count))
16917 	    {
16918 	      const char *url
16919 		= CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
16920 	      gcc_assert (alt == -1);
16921 	      last_reported_type_uid = uid;
16922 	      /* Use TYPE_MAIN_VARIANT to strip any redundant const
16923 		 qualification.  */
16924 	      if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
16925 		inform (input_location, "parameter passing for argument of "
16926 			"type %qT with %<[[no_unique_address]]%> members "
16927 			"changed %{in GCC 10.1%}",
16928 			TYPE_MAIN_VARIANT (type), url);
16929 	      else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
16930 		inform (input_location, "parameter passing for argument of "
16931 			"type %qT when C++17 is enabled changed to match "
16932 			"C++14 %{in GCC 10.1%}",
16933 			TYPE_MAIN_VARIANT (type), url);
16934 	    }
16935 
16936 	  if (is_ha != NULL) *is_ha = true;
16937 	  *count = ag_count;
16938 	}
16939       else
16940 	return false;
16941     }
16942   else
16943     return false;
16944 
16945   gcc_assert (!aarch64_sve_mode_p (new_mode));
16946   *base_mode = new_mode;
16947   return true;
16948 }
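
/* Rough worked example: a call argument of type

     struct rgb { float r, g, b; };

   reaches the composite case above; aapcs_vfp_sub_candidate returns 3
   with SFmode, so the function succeeds with *COUNT == 3,
   *BASE_MODE == SFmode and, if requested, *IS_HA set to true.  A
   _Complex double argument instead takes the MODE_COMPLEX_FLOAT path,
   giving *COUNT == 2 and *BASE_MODE == DFmode.  */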
16949 
16950 /* Implement TARGET_STRUCT_VALUE_RTX.  */
16951 
16952 static rtx
16953 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16954 			  int incoming ATTRIBUTE_UNUSED)
16955 {
16956   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16957 }
16958 
16959 /* Implements target hook vector_mode_supported_p.  */
16960 static bool
16961 aarch64_vector_mode_supported_p (machine_mode mode)
16962 {
16963   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16964   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16965 }
16966 
16967 /* Return the full-width SVE vector mode for element mode MODE, if one
16968    exists.  */
16969 opt_machine_mode
16970 aarch64_full_sve_mode (scalar_mode mode)
16971 {
16972   switch (mode)
16973     {
16974     case E_DFmode:
16975       return VNx2DFmode;
16976     case E_SFmode:
16977       return VNx4SFmode;
16978     case E_HFmode:
16979       return VNx8HFmode;
16980     case E_BFmode:
16981       return VNx8BFmode;
16982     case E_DImode:
16983       return VNx2DImode;
16984     case E_SImode:
16985       return VNx4SImode;
16986     case E_HImode:
16987       return VNx8HImode;
16988     case E_QImode:
16989       return VNx16QImode;
16990     default:
16991       return opt_machine_mode ();
16992     }
16993 }
16994 
16995 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16996    if it exists.  */
16997 opt_machine_mode
16998 aarch64_vq_mode (scalar_mode mode)
16999 {
17000   switch (mode)
17001     {
17002     case E_DFmode:
17003       return V2DFmode;
17004     case E_SFmode:
17005       return V4SFmode;
17006     case E_HFmode:
17007       return V8HFmode;
17008     case E_BFmode:
17009       return V8BFmode;
17010     case E_SImode:
17011       return V4SImode;
17012     case E_HImode:
17013       return V8HImode;
17014     case E_QImode:
17015       return V16QImode;
17016     case E_DImode:
17017       return V2DImode;
17018     default:
17019       return opt_machine_mode ();
17020     }
17021 }
17022 
17023 /* Return appropriate SIMD container
17024    for MODE within a vector of WIDTH bits.  */
17025 static machine_mode
17026 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
17027 {
17028   if (TARGET_SVE
17029       && maybe_ne (width, 128)
17030       && known_eq (width, BITS_PER_SVE_VECTOR))
17031     return aarch64_full_sve_mode (mode).else_mode (word_mode);
17032 
17033   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
17034   if (TARGET_SIMD)
17035     {
17036       if (known_eq (width, 128))
17037 	return aarch64_vq_mode (mode).else_mode (word_mode);
17038       else
17039 	switch (mode)
17040 	  {
17041 	  case E_SFmode:
17042 	    return V2SFmode;
17043 	  case E_HFmode:
17044 	    return V4HFmode;
17045 	  case E_BFmode:
17046 	    return V4BFmode;
17047 	  case E_SImode:
17048 	    return V2SImode;
17049 	  case E_HImode:
17050 	    return V4HImode;
17051 	  case E_QImode:
17052 	    return V8QImode;
17053 	  default:
17054 	    break;
17055 	  }
17056     }
17057   return word_mode;
17058 }
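
/* For example, (SImode, 128) gives V4SImode and (SImode, 64) gives
   V2SImode when TARGET_SIMD is available, while a request for
   BITS_PER_SVE_VECTOR on an SVE target (when that is not known to be
   exactly 128 bits, e.g. with -msve-vector-bits=512) gives VNx4SImode.
   If no suitable vector mode exists, the fallback is word_mode.  */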
17059 
17060 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
17061 static machine_mode
17062 aarch64_preferred_simd_mode (scalar_mode mode)
17063 {
17064   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
17065   return aarch64_simd_container_mode (mode, bits);
17066 }
17067 
17068 /* Return a list of possible vector sizes for the vectorizer
17069    to iterate over.  */
17070 static unsigned int
17071 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
17072 {
17073   static const machine_mode sve_modes[] = {
17074     /* Try using full vectors for all element types.  */
17075     VNx16QImode,
17076 
17077     /* Try using 16-bit containers for 8-bit elements and full vectors
17078        for wider elements.  */
17079     VNx8QImode,
17080 
17081     /* Try using 32-bit containers for 8-bit and 16-bit elements and
17082        full vectors for wider elements.  */
17083     VNx4QImode,
17084 
17085     /* Try using 64-bit containers for all element types.  */
17086     VNx2QImode
17087   };
17088 
17089   static const machine_mode advsimd_modes[] = {
17090     /* Try using 128-bit vectors for all element types.  */
17091     V16QImode,
17092 
17093     /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
17094        for wider elements.  */
17095     V8QImode,
17096 
17097     /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
17098        for wider elements.
17099 
17100        TODO: We could support a limited form of V4QImode too, so that
17101        we use 32-bit vectors for 8-bit elements.  */
17102     V4HImode,
17103 
17104     /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
17105        for 64-bit elements.
17106 
17107        TODO: We could similarly support limited forms of V2QImode and V2HImode
17108        for this case.  */
17109     V2SImode
17110   };
17111 
17112   /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
17113      This is because:
17114 
17115      - If we can't use N-byte Advanced SIMD vectors then the placement
17116        doesn't matter; we'll just continue as though the Advanced SIMD
17117        entry didn't exist.
17118 
17119      - If an SVE main loop with N bytes ends up being cheaper than an
17120        Advanced SIMD main loop with N bytes then by default we'll replace
17121        the Advanced SIMD version with the SVE one.
17122 
17123      - If an Advanced SIMD main loop with N bytes ends up being cheaper
17124        than an SVE main loop with N bytes then by default we'll try to
17125        use the SVE loop to vectorize the epilogue instead.  */
17126   unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
17127   unsigned int advsimd_i = 0;
17128   while (advsimd_i < ARRAY_SIZE (advsimd_modes))
17129     {
17130       if (sve_i < ARRAY_SIZE (sve_modes)
17131 	  && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
17132 		       GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
17133 	modes->safe_push (sve_modes[sve_i++]);
17134       else
17135 	modes->safe_push (advsimd_modes[advsimd_i++]);
17136     }
17137   while (sve_i < ARRAY_SIZE (sve_modes))
17138     modes->safe_push (sve_modes[sve_i++]);
17139 
17140   unsigned int flags = 0;
17141   /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
17142      can compare SVE against Advanced SIMD and so that we can compare
17143      multiple SVE vectorization approaches against each other.  There's
17144      not really any point doing this for Advanced SIMD only, since the
17145      first mode that works should always be the best.  */
17146   if (TARGET_SVE && aarch64_sve_compare_costs)
17147     flags |= VECT_COMPARE_COSTS;
17148   return flags;
17149 }
17150 
17151 /* Implement TARGET_MANGLE_TYPE.  */
17152 
17153 static const char *
17154 aarch64_mangle_type (const_tree type)
17155 {
17156   /* The AArch64 ABI documents say that "__va_list" has to be
17157      mangled as if it is in the "std" namespace.  */
17158   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
17159     return "St9__va_list";
17160 
17161   /* Half-precision floating point types.  */
17162   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
17163     {
17164       if (TYPE_MODE (type) == BFmode)
17165 	return "u6__bf16";
17166       else
17167 	return "Dh";
17168     }
17169 
17170   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
17171      builtin types.  */
17172   if (TYPE_NAME (type) != NULL)
17173     {
17174       const char *res;
17175       if ((res = aarch64_general_mangle_builtin_type (type))
17176 	  || (res = aarch64_sve::mangle_builtin_type (type)))
17177 	return res;
17178     }
17179 
17180   /* Use the default mangling.  */
17181   return NULL;
17182 }
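
/* As a concrete example, given

     __fp16 f (__fp16);
     __bf16 g (__bf16);

   the parameter types mangle as "Dh" and "u6__bf16", so the functions
   mangle as _Z1fDh and _Z1gu6__bf16, and a __builtin_va_list parameter
   contributes St9__va_list to the mangled name.  */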
17183 
17184 /* Implement TARGET_VERIFY_TYPE_CONTEXT.  */
17185 
17186 static bool
17187 aarch64_verify_type_context (location_t loc, type_context_kind context,
17188 			     const_tree type, bool silent_p)
17189 {
17190   return aarch64_sve::verify_type_context (loc, context, type, silent_p);
17191 }
17192 
17193 /* Find the first rtx_insn before insn that will generate an assembly
17194    instruction.  */
17195 
17196 static rtx_insn *
17197 aarch64_prev_real_insn (rtx_insn *insn)
17198 {
17199   if (!insn)
17200     return NULL;
17201 
17202   do
17203     {
17204       insn = prev_real_insn (insn);
17205     }
17206   while (insn && recog_memoized (insn) < 0);
17207 
17208   return insn;
17209 }
17210 
17211 static bool
17212 is_madd_op (enum attr_type t1)
17213 {
17214   unsigned int i;
17215   /* A number of these may be AArch32 only.  */
17216   enum attr_type mlatypes[] = {
17217     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
17218     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
17219     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
17220   };
17221 
17222   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
17223     {
17224       if (t1 == mlatypes[i])
17225 	return true;
17226     }
17227 
17228   return false;
17229 }
17230 
17231 /* Check if there is a register dependency between a load and the insn
17232    for which we hold recog_data.  */
17233 
17234 static bool
17235 dep_between_memop_and_curr (rtx memop)
17236 {
17237   rtx load_reg;
17238   int opno;
17239 
17240   gcc_assert (GET_CODE (memop) == SET);
17241 
17242   if (!REG_P (SET_DEST (memop)))
17243     return false;
17244 
17245   load_reg = SET_DEST (memop);
17246   for (opno = 1; opno < recog_data.n_operands; opno++)
17247     {
17248       rtx operand = recog_data.operand[opno];
17249       if (REG_P (operand)
17250           && reg_overlap_mentioned_p (load_reg, operand))
17251         return true;
17252 
17253     }
17254   return false;
17255 }
17256 
17257 
17258 /* When working around the Cortex-A53 erratum 835769,
17259    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
17260    instruction and has a preceding memory instruction such that a NOP
17261    should be inserted between them.  */
17262 
17263 bool
17264 aarch64_madd_needs_nop (rtx_insn* insn)
17265 {
17266   enum attr_type attr_type;
17267   rtx_insn *prev;
17268   rtx body;
17269 
17270   if (!TARGET_FIX_ERR_A53_835769)
17271     return false;
17272 
17273   if (!INSN_P (insn) || recog_memoized (insn) < 0)
17274     return false;
17275 
17276   attr_type = get_attr_type (insn);
17277   if (!is_madd_op (attr_type))
17278     return false;
17279 
17280   prev = aarch64_prev_real_insn (insn);
17281   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
17282      Restore recog state to INSN to avoid state corruption.  */
17283   extract_constrain_insn_cached (insn);
17284 
17285   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
17286     return false;
17287 
17288   body = single_set (prev);
17289 
17290   /* If the previous insn is a memory op and there is no dependency between
17291      it and the DImode madd, emit a NOP between them.  If body is NULL then we
17292      have a complex memory operation, probably a load/store pair.
17293      Be conservative for now and emit a NOP.  */
17294   if (GET_MODE (recog_data.operand[0]) == DImode
17295       && (!body || !dep_between_memop_and_curr (body)))
17296     return true;
17297 
17298   return false;
17299 
17300 }
17301 
17302 
17303 /* Implement FINAL_PRESCAN_INSN.  */
17304 
17305 void
17306 aarch64_final_prescan_insn (rtx_insn *insn)
17307 {
17308   if (aarch64_madd_needs_nop (insn))
17309     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17310 }
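
/* With -mfix-cortex-a53-835769 the net effect on the output is along
   the lines of:

     ldr   x3, [x2]
     nop   // between mem op and mult-accumulate
     madd  x0, x0, x1, x4

   i.e. a NOP separates the memory access from a following 64-bit
   multiply-accumulate whenever aarch64_madd_needs_nop fires.  */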
17311 
17312 
17313 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17314    instruction.  */
17315 
17316 bool
17317 aarch64_sve_index_immediate_p (rtx base_or_step)
17318 {
17319   return (CONST_INT_P (base_or_step)
17320 	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
17321 }
17322 
17323 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17324    when applied to mode MODE.  Negate X first if NEGATE_P is true.  */
17325 
17326 bool
17327 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
17328 {
17329   rtx elt = unwrap_const_vec_duplicate (x);
17330   if (!CONST_INT_P (elt))
17331     return false;
17332 
17333   HOST_WIDE_INT val = INTVAL (elt);
17334   if (negate_p)
17335     val = -val;
17336   val &= GET_MODE_MASK (GET_MODE_INNER (mode));
17337 
17338   if (val & 0xff)
17339     return IN_RANGE (val, 0, 0xff);
17340   return IN_RANGE (val, 0, 0xff00);
17341 }
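
/* Worked example for VNx4SImode: a duplicated 3 is accepted via
   IN_RANGE (3, 0, 0xff), a duplicated 0x300 is accepted via the
   0..0xff00 range (an ADD/SUB immediate with LSL #8), while 0x101 is
   rejected because its low byte is nonzero and the value exceeds
   0xff.  */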
17342 
17343 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
17344    instructions when applied to mode MODE.  Negate X first if NEGATE_P
17345    is true.  */
17346 
17347 bool
17348 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
17349 {
17350   if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
17351     return false;
17352 
17353   /* After the optional negation, the immediate must be nonnegative.
17354      E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17355      instead of SQADD Zn.B, Zn.B, #129.  */
17356   rtx elt = unwrap_const_vec_duplicate (x);
17357   return negate_p == (INTVAL (elt) < 0);
17358 }
17359 
17360 /* Return true if X is a valid immediate operand for an SVE logical
17361    instruction such as AND.  */
17362 
17363 bool
17364 aarch64_sve_bitmask_immediate_p (rtx x)
17365 {
17366   rtx elt;
17367 
17368   return (const_vec_duplicate_p (x, &elt)
17369 	  && CONST_INT_P (elt)
17370 	  && aarch64_bitmask_imm (INTVAL (elt),
17371 				  GET_MODE_INNER (GET_MODE (x))));
17372 }
17373 
17374 /* Return true if X is a valid immediate for the SVE DUP and CPY
17375    instructions.  */
17376 
17377 bool
17378 aarch64_sve_dup_immediate_p (rtx x)
17379 {
17380   x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17381   if (!CONST_INT_P (x))
17382     return false;
17383 
17384   HOST_WIDE_INT val = INTVAL (x);
17385   if (val & 0xff)
17386     return IN_RANGE (val, -0x80, 0x7f);
17387   return IN_RANGE (val, -0x8000, 0x7f00);
17388 }
17389 
17390 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
17391    SIGNED_P says whether the operand is signed rather than unsigned.  */
17392 
17393 bool
17394 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17395 {
17396   x = unwrap_const_vec_duplicate (x);
17397   return (CONST_INT_P (x)
17398 	  && (signed_p
17399 	      ? IN_RANGE (INTVAL (x), -16, 15)
17400 	      : IN_RANGE (INTVAL (x), 0, 127)));
17401 }
17402 
17403 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17404    instruction.  Negate X first if NEGATE_P is true.  */
17405 
17406 bool
17407 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17408 {
17409   rtx elt;
17410   REAL_VALUE_TYPE r;
17411 
17412   if (!const_vec_duplicate_p (x, &elt)
17413       || GET_CODE (elt) != CONST_DOUBLE)
17414     return false;
17415 
17416   r = *CONST_DOUBLE_REAL_VALUE (elt);
17417 
17418   if (negate_p)
17419     r = real_value_negate (&r);
17420 
17421   if (real_equal (&r, &dconst1))
17422     return true;
17423   if (real_equal (&r, &dconsthalf))
17424     return true;
17425   return false;
17426 }
17427 
17428 /* Return true if X is a valid immediate operand for an SVE FMUL
17429    instruction.  */
17430 
17431 bool
17432 aarch64_sve_float_mul_immediate_p (rtx x)
17433 {
17434   rtx elt;
17435 
17436   return (const_vec_duplicate_p (x, &elt)
17437 	  && GET_CODE (elt) == CONST_DOUBLE
17438 	  && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17439 	      || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
17440 }
17441 
17442 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17443    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
17444    is nonnull, use it to describe valid immediates.  */
17445 static bool
17446 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17447 				    simd_immediate_info *info,
17448 				    enum simd_immediate_check which,
17449 				    simd_immediate_info::insn_type insn)
17450 {
17451   /* Try a 4-byte immediate with LSL.  */
17452   for (unsigned int shift = 0; shift < 32; shift += 8)
17453     if ((val32 & (0xff << shift)) == val32)
17454       {
17455 	if (info)
17456 	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
17457 				       simd_immediate_info::LSL, shift);
17458 	return true;
17459       }
17460 
17461   /* Try a 2-byte immediate with LSL.  */
17462   unsigned int imm16 = val32 & 0xffff;
17463   if (imm16 == (val32 >> 16))
17464     for (unsigned int shift = 0; shift < 16; shift += 8)
17465       if ((imm16 & (0xff << shift)) == imm16)
17466 	{
17467 	  if (info)
17468 	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17469 					 simd_immediate_info::LSL, shift);
17470 	  return true;
17471 	}
17472 
17473   /* Try a 4-byte immediate with MSL, except for cases that MVN
17474      can handle.  */
17475   if (which == AARCH64_CHECK_MOV)
17476     for (unsigned int shift = 8; shift < 24; shift += 8)
17477       {
17478 	unsigned int low = (1 << shift) - 1;
17479 	if (((val32 & (0xff << shift)) | low) == val32)
17480 	  {
17481 	    if (info)
17482 	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
17483 					   simd_immediate_info::MSL, shift);
17484 	    return true;
17485 	  }
17486       }
17487 
17488   return false;
17489 }
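
/* For example, val32 == 0x00ab0000 is matched by the first loop with
   shift == 16 (a MOVI/MVNI of 0xab with LSL #16), val32 == 0x0000abff
   is matched by the MSL loop with shift == 8 (MOVI with MSL #8), and a
   value such as 0x00ab00cd matches none of the forms and is
   rejected.  */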
17490 
17491 /* Return true if replicating VAL64 is a valid immediate for the
17492    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
17493    use it to describe valid immediates.  */
17494 static bool
17495 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17496 				 simd_immediate_info *info,
17497 				 enum simd_immediate_check which)
17498 {
17499   unsigned int val32 = val64 & 0xffffffff;
17500   unsigned int val16 = val64 & 0xffff;
17501   unsigned int val8 = val64 & 0xff;
17502 
17503   if (val32 == (val64 >> 32))
17504     {
17505       if ((which & AARCH64_CHECK_ORR) != 0
17506 	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17507 						 simd_immediate_info::MOV))
17508 	return true;
17509 
17510       if ((which & AARCH64_CHECK_BIC) != 0
17511 	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17512 						 simd_immediate_info::MVN))
17513 	return true;
17514 
17515       /* Try using a replicated byte.  */
17516       if (which == AARCH64_CHECK_MOV
17517 	  && val16 == (val32 >> 16)
17518 	  && val8 == (val16 >> 8))
17519 	{
17520 	  if (info)
17521 	    *info = simd_immediate_info (QImode, val8);
17522 	  return true;
17523 	}
17524     }
17525 
17526   /* Try using a bit-to-bytemask.  */
17527   if (which == AARCH64_CHECK_MOV)
17528     {
17529       unsigned int i;
17530       for (i = 0; i < 64; i += 8)
17531 	{
17532 	  unsigned char byte = (val64 >> i) & 0xff;
17533 	  if (byte != 0 && byte != 0xff)
17534 	    break;
17535 	}
17536       if (i == 64)
17537 	{
17538 	  if (info)
17539 	    *info = simd_immediate_info (DImode, val64);
17540 	  return true;
17541 	}
17542     }
17543   return false;
17544 }
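
/* The bit-to-bytemask case accepts constants such as
   0x00ff00ff00ff00ff, in which every byte is either 0x00 or 0xff;
   these map onto the 64-bit form of MOVI, where each bit of the 8-bit
   immediate expands to a full byte of the result.  */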
17545 
17546 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17547    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
17548 
17549 static bool
17550 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17551 			     simd_immediate_info *info)
17552 {
17553   scalar_int_mode mode = DImode;
17554   unsigned int val32 = val64 & 0xffffffff;
17555   if (val32 == (val64 >> 32))
17556     {
17557       mode = SImode;
17558       unsigned int val16 = val32 & 0xffff;
17559       if (val16 == (val32 >> 16))
17560 	{
17561 	  mode = HImode;
17562 	  unsigned int val8 = val16 & 0xff;
17563 	  if (val8 == (val16 >> 8))
17564 	    mode = QImode;
17565 	}
17566     }
17567   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17568   if (IN_RANGE (val, -0x80, 0x7f))
17569     {
17570       /* DUP with no shift.  */
17571       if (info)
17572 	*info = simd_immediate_info (mode, val);
17573       return true;
17574     }
17575   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17576     {
17577       /* DUP with LSL #8.  */
17578       if (info)
17579 	*info = simd_immediate_info (mode, val);
17580       return true;
17581     }
17582   if (aarch64_bitmask_imm (val64, mode))
17583     {
17584       /* DUPM.  */
17585       if (info)
17586 	*info = simd_immediate_info (mode, val);
17587       return true;
17588     }
17589   return false;
17590 }
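
/* For instance, 0x0101010101010101 narrows all the way to QImode with
   value 1 and is handled by the plain DUP case, 0x0300030003000300
   narrows to HImode and is handled by the DUP-with-LSL-#8 case, and a
   pattern such as 0x00ff00ff00ff00ff falls through to the
   aarch64_bitmask_imm check and is handled as DUPM.  */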
17591 
17592 /* Return true if X is an UNSPEC_PTRUE constant of the form:
17593 
17594        (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17595 
17596    where PATTERN is the svpattern as a CONST_INT and where ZERO
17597    is a zero constant of the required PTRUE mode (which can have
17598    fewer elements than X's mode, if zero bits are significant).
17599 
17600    If so, and if INFO is nonnull, describe the immediate in INFO.  */
17601 bool
17602 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17603 {
17604   if (GET_CODE (x) != CONST)
17605     return false;
17606 
17607   x = XEXP (x, 0);
17608   if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17609     return false;
17610 
17611   if (info)
17612     {
17613       aarch64_svpattern pattern
17614 	= (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17615       machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17616       scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17617       *info = simd_immediate_info (int_mode, pattern);
17618     }
17619   return true;
17620 }
17621 
17622 /* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
17623    it to describe valid immediates.  */
17624 
17625 static bool
17626 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17627 {
17628   if (aarch64_sve_ptrue_svpattern_p (x, info))
17629     return true;
17630 
17631   if (x == CONST0_RTX (GET_MODE (x)))
17632     {
17633       if (info)
17634 	*info = simd_immediate_info (DImode, 0);
17635       return true;
17636     }
17637 
17638   /* Analyze the value as a VNx16BImode.  This should be relatively
17639      efficient, since rtx_vector_builder has enough built-in capacity
17640      to store all VLA predicate constants without needing the heap.  */
17641   rtx_vector_builder builder;
17642   if (!aarch64_get_sve_pred_bits (builder, x))
17643     return false;
17644 
17645   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
17646   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
17647     {
17648       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
17649       aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
17650       if (pattern != AARCH64_NUM_SVPATTERNS)
17651 	{
17652 	  if (info)
17653 	    {
17654 	      scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
17655 	      *info = simd_immediate_info (int_mode, pattern);
17656 	    }
17657 	  return true;
17658 	}
17659     }
17660   return false;
17661 }
17662 
17663 /* Return true if OP is a valid SIMD immediate for the operation
17664    described by WHICH.  If INFO is nonnull, use it to describe valid
17665    immediates.  */
17666 bool
17667 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
17668 			      enum simd_immediate_check which)
17669 {
17670   machine_mode mode = GET_MODE (op);
17671   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17672   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
17673     return false;
17674 
17675   if (vec_flags & VEC_SVE_PRED)
17676     return aarch64_sve_pred_valid_immediate (op, info);
17677 
17678   scalar_mode elt_mode = GET_MODE_INNER (mode);
17679   rtx base, step;
17680   unsigned int n_elts;
17681   if (GET_CODE (op) == CONST_VECTOR
17682       && CONST_VECTOR_DUPLICATE_P (op))
17683     n_elts = CONST_VECTOR_NPATTERNS (op);
17684   else if ((vec_flags & VEC_SVE_DATA)
17685 	   && const_vec_series_p (op, &base, &step))
17686     {
17687       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
17688       if (!aarch64_sve_index_immediate_p (base)
17689 	  || !aarch64_sve_index_immediate_p (step))
17690 	return false;
17691 
17692       if (info)
17693 	{
17694 	  /* Get the corresponding container mode.  E.g. an INDEX on V2SI
17695 	     should yield two integer values per 128-bit block, meaning
17696 	     that we need to treat it in the same way as V2DI and then
17697 	     ignore the upper 32 bits of each element.  */
17698 	  elt_mode = aarch64_sve_container_int_mode (mode);
17699 	  *info = simd_immediate_info (elt_mode, base, step);
17700 	}
17701       return true;
17702     }
17703   else if (GET_CODE (op) == CONST_VECTOR
17704 	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
17705     /* N_ELTS set above.  */;
17706   else
17707     return false;
17708 
17709   scalar_float_mode elt_float_mode;
17710   if (n_elts == 1
17711       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
17712     {
17713       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
17714       if (aarch64_float_const_zero_rtx_p (elt)
17715 	  || aarch64_float_const_representable_p (elt))
17716 	{
17717 	  if (info)
17718 	    *info = simd_immediate_info (elt_float_mode, elt);
17719 	  return true;
17720 	}
17721     }
17722 
17723   /* If all elements in an SVE vector have the same value, we have a free
17724      choice between using the element mode and using the container mode.
17725      Using the element mode means that unused parts of the vector are
17726      duplicates of the used elements, while using the container mode means
17727      that the unused parts are an extension of the used elements.  Using the
17728      element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
17729      for its container mode VNx4SI while 0x00000101 isn't.
17730 
17731      If not all elements in an SVE vector have the same value, we need the
17732      transition from one element to the next to occur at container boundaries.
17733      E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
17734      in the same way as a VNx4SI containing { 1, 2, 3, 4 }.  */
17735   scalar_int_mode elt_int_mode;
17736   if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
17737     elt_int_mode = aarch64_sve_container_int_mode (mode);
17738   else
17739     elt_int_mode = int_mode_for_mode (elt_mode).require ();
17740 
17741   unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
17742   if (elt_size > 8)
17743     return false;
17744 
17745   /* Expand the vector constant out into a byte vector, with the least
17746      significant byte of the register first.  */
17747   auto_vec<unsigned char, 16> bytes;
17748   bytes.reserve (n_elts * elt_size);
17749   for (unsigned int i = 0; i < n_elts; i++)
17750     {
17751       /* The vector is provided in gcc endian-neutral fashion.
17752 	 For aarch64_be Advanced SIMD, it must be laid out in the vector
17753 	 register in reverse order.  */
17754       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
17755       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
17756 
17757       if (elt_mode != elt_int_mode)
17758 	elt = gen_lowpart (elt_int_mode, elt);
17759 
17760       if (!CONST_INT_P (elt))
17761 	return false;
17762 
17763       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
17764       for (unsigned int byte = 0; byte < elt_size; byte++)
17765 	{
17766 	  bytes.quick_push (elt_val & 0xff);
17767 	  elt_val >>= BITS_PER_UNIT;
17768 	}
17769     }
17770 
17771   /* The immediate must repeat every eight bytes.  */
17772   unsigned int nbytes = bytes.length ();
17773   for (unsigned i = 8; i < nbytes; ++i)
17774     if (bytes[i] != bytes[i - 8])
17775       return false;
17776 
17777   /* Get the repeating 8-byte value as an integer.  No endian correction
17778      is needed here because bytes is already in lsb-first order.  */
17779   unsigned HOST_WIDE_INT val64 = 0;
17780   for (unsigned int i = 0; i < 8; i++)
17781     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
17782 	      << (i * BITS_PER_UNIT));
17783 
17784   if (vec_flags & VEC_SVE_DATA)
17785     return aarch64_sve_valid_immediate (val64, info);
17786   else
17787     return aarch64_advsimd_valid_immediate (val64, info, which);
17788 }
17789 
17790 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17791    has a step in the range of INDEX.  Return the index expression if so,
17792    otherwise return null.  */
17793 rtx
17794 aarch64_check_zero_based_sve_index_immediate (rtx x)
17795 {
17796   rtx base, step;
17797   if (const_vec_series_p (x, &base, &step)
17798       && base == const0_rtx
17799       && aarch64_sve_index_immediate_p (step))
17800     return step;
17801   return NULL_RTX;
17802 }
17803 
17804 /* Check whether immediate shift constants are within range.  */
17805 bool
17806 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
17807 {
17808   x = unwrap_const_vec_duplicate (x);
17809   if (!CONST_INT_P (x))
17810     return false;
17811   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
17812   if (left)
17813     return IN_RANGE (INTVAL (x), 0, bit_width - 1);
17814   else
17815     return IN_RANGE (INTVAL (x), 1, bit_width);
17816 }
17817 
17818 /* Return the bitmask CONST_INT to select the bits required by a zero extract
17819    operation of width WIDTH at bit position POS.  */
17820 
17821 rtx
17822 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
17823 {
17824   gcc_assert (CONST_INT_P (width));
17825   gcc_assert (CONST_INT_P (pos));
17826 
17827   unsigned HOST_WIDE_INT mask
17828     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
17829   return GEN_INT (mask << UINTVAL (pos));
17830 }
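
/* E.g. for WIDTH == 8 and POS == 4 the result is GEN_INT (0xff0),
   i.e. ((1 << 8) - 1) << 4, selecting bits 4..11 just as a zero_extract
   of width 8 at position 4 would.  */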
17831 
17832 bool
17833 aarch64_mov_operand_p (rtx x, machine_mode mode)
17834 {
17835   if (GET_CODE (x) == HIGH
17836       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
17837     return true;
17838 
17839   if (CONST_INT_P (x))
17840     return true;
17841 
17842   if (VECTOR_MODE_P (GET_MODE (x)))
17843     {
17844       /* Require predicate constants to be VNx16BI before RA, so that we
17845 	 force everything to have a canonical form.  */
17846       if (!lra_in_progress
17847 	  && !reload_completed
17848 	  && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
17849 	  && GET_MODE (x) != VNx16BImode)
17850 	return false;
17851 
17852       return aarch64_simd_valid_immediate (x, NULL);
17853     }
17854 
17855   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
17856     return true;
17857 
17858   if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
17859     return true;
17860 
17861   return aarch64_classify_symbolic_expression (x)
17862     == SYMBOL_TINY_ABSOLUTE;
17863 }
17864 
17865 /* Return a const_int vector of VAL.  */
17866 rtx
17867 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
17868 {
17869   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
17870   return gen_const_vec_duplicate (mode, c);
17871 }
17872 
17873 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
17874 
17875 bool
17876 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
17877 {
17878   machine_mode vmode;
17879 
17880   vmode = aarch64_simd_container_mode (mode, 64);
17881   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
17882   return aarch64_simd_valid_immediate (op_v, NULL);
17883 }
17884 
17885 /* Construct and return a PARALLEL RTX vector with elements numbering the
17886    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
17887    the vector - from the perspective of the architecture.  This does not
17888    line up with GCC's perspective on lane numbers, so we end up with
17889    different masks depending on our target endian-ness.  The diagram
17890    below may help.  We must draw the distinction when building masks
17891    which select one half of the vector.  An instruction selecting
17892    architectural low-lanes for a big-endian target must be described using
17893    a mask selecting GCC high-lanes.
17894 
17895                  Big-Endian             Little-Endian
17896 
17897 GCC             0   1   2   3           3   2   1   0
17898               | x | x | x | x |       | x | x | x | x |
17899 Architecture    3   2   1   0           3   2   1   0
17900 
17901 Low Mask:         { 2, 3 }                { 0, 1 }
17902 High Mask:        { 0, 1 }                { 2, 3 }
17903 
17904    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
17905 
17906 rtx
17907 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17908 {
17909   rtvec v = rtvec_alloc (nunits / 2);
17910   int high_base = nunits / 2;
17911   int low_base = 0;
17912   int base;
17913   rtx t1;
17914   int i;
17915 
17916   if (BYTES_BIG_ENDIAN)
17917     base = high ? low_base : high_base;
17918   else
17919     base = high ? high_base : low_base;
17920 
17921   for (i = 0; i < nunits / 2; i++)
17922     RTVEC_ELT (v, i) = GEN_INT (base + i);
17923 
17924   t1 = gen_rtx_PARALLEL (mode, v);
17925   return t1;
17926 }
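
/* Continuing the diagram above for V4SImode (NUNITS == 4):
   aarch64_simd_vect_par_cnst_half (V4SImode, 4, true) produces
   (parallel [2 3]) for little-endian but (parallel [0 1]) for
   big-endian, matching the "High Mask" row of the table.  */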
17927 
17928 /* Check OP for validity as a PARALLEL RTX vector with elements
17929    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17930    from the perspective of the architecture.  See the diagram above
17931    aarch64_simd_vect_par_cnst_half for more details.  */
17932 
17933 bool
17934 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17935 				       bool high)
17936 {
17937   int nelts;
17938   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17939     return false;
17940 
17941   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17942   HOST_WIDE_INT count_op = XVECLEN (op, 0);
17943   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17944   int i = 0;
17945 
17946   if (count_op != count_ideal)
17947     return false;
17948 
17949   for (i = 0; i < count_ideal; i++)
17950     {
17951       rtx elt_op = XVECEXP (op, 0, i);
17952       rtx elt_ideal = XVECEXP (ideal, 0, i);
17953 
17954       if (!CONST_INT_P (elt_op)
17955 	  || INTVAL (elt_ideal) != INTVAL (elt_op))
17956 	return false;
17957     }
17958   return true;
17959 }
17960 
17961 /* Return a PARALLEL containing NELTS elements, with element I equal
17962    to BASE + I * STEP.  */
17963 
17964 rtx
17965 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17966 {
17967   rtvec vec = rtvec_alloc (nelts);
17968   for (unsigned int i = 0; i < nelts; ++i)
17969     RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17970   return gen_rtx_PARALLEL (VOIDmode, vec);
17971 }
17972 
17973 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17974    series with step STEP.  */
17975 
17976 bool
17977 aarch64_stepped_int_parallel_p (rtx op, int step)
17978 {
17979   if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17980     return false;
17981 
17982   unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17983   for (int i = 1; i < XVECLEN (op, 0); ++i)
17984     if (!CONST_INT_P (XVECEXP (op, 0, i))
17985 	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17986       return false;
17987 
17988   return true;
17989 }
17990 
17991 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
17992    HIGH (exclusive).  */
17993 void
17994 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17995 			  const_tree exp)
17996 {
17997   HOST_WIDE_INT lane;
17998   gcc_assert (CONST_INT_P (operand));
17999   lane = INTVAL (operand);
18000 
18001   if (lane < low || lane >= high)
18002   {
18003     if (exp)
18004       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
18005     else
18006       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
18007   }
18008 }
18009 
18010 /* Perform endian correction on lane number N, which indexes a vector
18011    of mode MODE, and return the result as an SImode rtx.  */
18012 
18013 rtx
18014 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
18015 {
18016   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
18017 }
18018 
18019 /* Return TRUE if OP is a MEM operand with a valid vector addressing mode.  */
18020 
18021 bool
18022 aarch64_simd_mem_operand_p (rtx op)
18023 {
18024   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
18025 			|| REG_P (XEXP (op, 0)));
18026 }
18027 
18028 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
18029 
18030 bool
18031 aarch64_sve_ld1r_operand_p (rtx op)
18032 {
18033   struct aarch64_address_info addr;
18034   scalar_mode mode;
18035 
18036   return (MEM_P (op)
18037 	  && is_a <scalar_mode> (GET_MODE (op), &mode)
18038 	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
18039 	  && addr.type == ADDRESS_REG_IMM
18040 	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
18041 }
18042 
18043 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
18044    where the size of the read data is specified by `mode` and the size of the
18045    vector elements is specified by `elem_mode`.  */
18046 bool
18047 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
18048 				   scalar_mode elem_mode)
18049 {
18050   struct aarch64_address_info addr;
18051   if (!MEM_P (op)
18052       || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
18053     return false;
18054 
18055   if (addr.type == ADDRESS_REG_IMM)
18056     return offset_4bit_signed_scaled_p (mode, addr.const_offset);
18057 
18058   if (addr.type == ADDRESS_REG_REG)
18059     return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
18060 
18061   return false;
18062 }
18063 
18064 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
18065 bool
18066 aarch64_sve_ld1rq_operand_p (rtx op)
18067 {
18068   return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
18069 					    GET_MODE_INNER (GET_MODE (op)));
18070 }
18071 
18072 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
18073    accessing a vector where the element size is specified by `elem_mode`.  */
18074 bool
18075 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
18076 {
18077   return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
18078 }
18079 
18080 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction.  */
18081 bool
18082 aarch64_sve_ldff1_operand_p (rtx op)
18083 {
18084   if (!MEM_P (op))
18085     return false;
18086 
18087   struct aarch64_address_info addr;
18088   if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
18089     return false;
18090 
18091   if (addr.type == ADDRESS_REG_IMM)
18092     return known_eq (addr.const_offset, 0);
18093 
18094   return addr.type == ADDRESS_REG_REG;
18095 }
18096 
18097 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction.  */
18098 bool
18099 aarch64_sve_ldnf1_operand_p (rtx op)
18100 {
18101   struct aarch64_address_info addr;
18102 
18103   return (MEM_P (op)
18104 	  && aarch64_classify_address (&addr, XEXP (op, 0),
18105 				       GET_MODE (op), false)
18106 	  && addr.type == ADDRESS_REG_IMM);
18107 }
18108 
18109 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
18110    The conditions for STR are the same.  */
18111 bool
18112 aarch64_sve_ldr_operand_p (rtx op)
18113 {
18114   struct aarch64_address_info addr;
18115 
18116   return (MEM_P (op)
18117 	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
18118 				       false, ADDR_QUERY_ANY)
18119 	  && addr.type == ADDRESS_REG_IMM);
18120 }
18121 
18122 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
18123    addressing memory of mode MODE.  */
18124 bool
18125 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
18126 {
18127   struct aarch64_address_info addr;
18128   if (!aarch64_classify_address (&addr, op, mode, false))
18129     return false;
18130 
18131   if (addr.type == ADDRESS_REG_IMM)
18132     return known_eq (addr.const_offset, 0);
18133 
18134   return addr.type == ADDRESS_REG_REG;
18135 }
18136 
18137 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
18138    We need to be able to access the individual pieces, so the range
18139    is different from LD[234] and ST[234].  */
18140 bool
18141 aarch64_sve_struct_memory_operand_p (rtx op)
18142 {
18143   if (!MEM_P (op))
18144     return false;
18145 
18146   machine_mode mode = GET_MODE (op);
18147   struct aarch64_address_info addr;
18148   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
18149 				 ADDR_QUERY_ANY)
18150       || addr.type != ADDRESS_REG_IMM)
18151     return false;
18152 
18153   poly_int64 first = addr.const_offset;
18154   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
18155   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
18156 	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
18157 }
18158 
18159 /* Emit a register copy from operand to operand, taking care not to
18160    early-clobber source registers in the process.
18161 
18162    COUNT is the number of components into which the copy needs to be
18163    decomposed.  */
18164 void
18165 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
18166 				unsigned int count)
18167 {
18168   unsigned int i;
18169   int rdest = REGNO (operands[0]);
18170   int rsrc = REGNO (operands[1]);
18171 
18172   if (!reg_overlap_mentioned_p (operands[0], operands[1])
18173       || rdest < rsrc)
18174     for (i = 0; i < count; i++)
18175       emit_move_insn (gen_rtx_REG (mode, rdest + i),
18176 		      gen_rtx_REG (mode, rsrc + i));
18177   else
18178     for (i = 0; i < count; i++)
18179       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
18180 		      gen_rtx_REG (mode, rsrc + count - i - 1));
18181 }
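
/* The direction matters when the register lists overlap.  Copying an
   OImode value from V8-V9 to V9-V10, for example, must move V9 to V10
   before moving V8 to V9, which is why the code above falls through to
   the backwards loop when the operands overlap and rdest > rsrc.  */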
18182 
18183 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
18184    one of VSTRUCT modes: OI, CI, or XI.  */
18185 int
18186 aarch64_simd_attr_length_rglist (machine_mode mode)
18187 {
18188   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
18189   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
18190 }
18191 
18192 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
18193    alignment of a vector to 128 bits.  SVE predicates have an alignment of
18194    16 bits.  */
18195 static HOST_WIDE_INT
18196 aarch64_simd_vector_alignment (const_tree type)
18197 {
18198   /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
18199      be set for non-predicate vectors of booleans.  Modes are the most
18200      direct way we have of identifying real SVE predicate types.  */
18201   if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
18202     return 16;
18203   widest_int min_size
18204     = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
18205   return wi::umin (min_size, 128).to_uhwi ();
18206 }
18207 
18208 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
18209 static poly_uint64
18210 aarch64_vectorize_preferred_vector_alignment (const_tree type)
18211 {
18212   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
18213     {
18214       /* If the length of the vector is fixed, try to align to that length,
18215 	 otherwise don't try to align at all.  */
18216       HOST_WIDE_INT result;
18217       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
18218 	result = TYPE_ALIGN (TREE_TYPE (type));
18219       return result;
18220     }
18221   return TYPE_ALIGN (type);
18222 }
18223 
18224 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
18225 static bool
18226 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
18227 {
18228   if (is_packed)
18229     return false;
18230 
18231   /* For fixed-length vectors, check that the vectorizer will aim for
18232      full-vector alignment.  This isn't true for generic GCC vectors
18233      that are wider than the ABI maximum of 128 bits.  */
18234   poly_uint64 preferred_alignment =
18235     aarch64_vectorize_preferred_vector_alignment (type);
18236   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
18237       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
18238 		   preferred_alignment))
18239     return false;
18240 
18241   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
18242   return true;
18243 }
18244 
18245 /* Return true if the vector misalignment factor is supported by the
18246    target.  */
18247 static bool
18248 aarch64_builtin_support_vector_misalignment (machine_mode mode,
18249 					     const_tree type, int misalignment,
18250 					     bool is_packed)
18251 {
18252   if (TARGET_SIMD && STRICT_ALIGNMENT)
18253     {
18254       /* Return if movmisalign pattern is not supported for this mode.  */
18255       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
18256         return false;
18257 
18258       /* Misalignment factor is unknown at compile time.  */
18259       if (misalignment == -1)
18260 	return false;
18261     }
18262   return default_builtin_support_vector_misalignment (mode, type, misalignment,
18263 						      is_packed);
18264 }
18265 
18266 /* If VALS is a vector constant that can be loaded into a register
18267    using DUP, generate instructions to do so and return an RTX to
18268    assign to the register.  Otherwise return NULL_RTX.  */
18269 static rtx
18270 aarch64_simd_dup_constant (rtx vals)
18271 {
18272   machine_mode mode = GET_MODE (vals);
18273   machine_mode inner_mode = GET_MODE_INNER (mode);
18274   rtx x;
18275 
18276   if (!const_vec_duplicate_p (vals, &x))
18277     return NULL_RTX;
18278 
18279   /* We can load this constant by using DUP and a constant in a
18280      single ARM register.  This will be cheaper than a vector
18281      load.  */
18282   x = copy_to_mode_reg (inner_mode, x);
18283   return gen_vec_duplicate (mode, x);
18284 }
18285 
18286 
18287 /* Generate code to load VALS, which is a PARALLEL containing only
18288    constants (for vec_init) or CONST_VECTOR, efficiently into a
18289    register.  Returns an RTX to copy into the register, or NULL_RTX
18290    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
18291 static rtx
18292 aarch64_simd_make_constant (rtx vals)
18293 {
18294   machine_mode mode = GET_MODE (vals);
18295   rtx const_dup;
18296   rtx const_vec = NULL_RTX;
18297   int n_const = 0;
18298   int i;
18299 
18300   if (GET_CODE (vals) == CONST_VECTOR)
18301     const_vec = vals;
18302   else if (GET_CODE (vals) == PARALLEL)
18303     {
18304       /* A CONST_VECTOR must contain only CONST_INTs and
18305 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18306 	 Only store valid constants in a CONST_VECTOR.  */
18307       int n_elts = XVECLEN (vals, 0);
18308       for (i = 0; i < n_elts; ++i)
18309 	{
18310 	  rtx x = XVECEXP (vals, 0, i);
18311 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18312 	    n_const++;
18313 	}
18314       if (n_const == n_elts)
18315 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18316     }
18317   else
18318     gcc_unreachable ();
18319 
18320   if (const_vec != NULL_RTX
18321       && aarch64_simd_valid_immediate (const_vec, NULL))
18322     /* Load using MOVI/MVNI.  */
18323     return const_vec;
18324   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18325     /* Loaded using DUP.  */
18326     return const_dup;
18327   else if (const_vec != NULL_RTX)
18328     /* Load from constant pool. We cannot take advantage of single-cycle
18329        LD1 because we need a PC-relative addressing mode.  */
18330     return const_vec;
18331   else
18332     /* A PARALLEL containing something not valid inside CONST_VECTOR.
18333        We cannot construct an initializer.  */
18334     return NULL_RTX;
18335 }
18336 
18337 /* Expand a vector initialisation sequence, such that TARGET is
18338    initialised to contain VALS.  */
18339 
18340 void
18341 aarch64_expand_vector_init (rtx target, rtx vals)
18342 {
18343   machine_mode mode = GET_MODE (target);
18344   scalar_mode inner_mode = GET_MODE_INNER (mode);
18345   /* The number of vector elements.  */
18346   int n_elts = XVECLEN (vals, 0);
18347   /* The number of vector elements which are not constant.  */
18348   int n_var = 0;
18349   rtx any_const = NULL_RTX;
18350   /* The first element of vals.  */
18351   rtx v0 = XVECEXP (vals, 0, 0);
18352   bool all_same = true;
18353 
18354   /* This is a special vec_init<M><N> where N is not an element mode but a
18355      vector mode with half the elements of M.  We expect to find two entries
18356      of mode N in VALS and we must put their concatenation into TARGET.  */
18357   if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18358     {
18359       gcc_assert (known_eq (GET_MODE_SIZE (mode),
18360 		  2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18361       rtx lo = XVECEXP (vals, 0, 0);
18362       rtx hi = XVECEXP (vals, 0, 1);
18363       machine_mode narrow_mode = GET_MODE (lo);
18364       gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18365       gcc_assert (narrow_mode == GET_MODE (hi));
18366 
18367       /* When we want to concatenate a half-width vector with zeroes we can
18368 	 use the aarch64_combinez[_be] patterns.  Just make sure that the
18369 	 zeroes are in the right half.  */
18370       if (BYTES_BIG_ENDIAN
18371 	  && aarch64_simd_imm_zero (lo, narrow_mode)
18372 	  && general_operand (hi, narrow_mode))
18373 	emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18374       else if (!BYTES_BIG_ENDIAN
18375 	       && aarch64_simd_imm_zero (hi, narrow_mode)
18376 	       && general_operand (lo, narrow_mode))
18377 	emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18378       else
18379 	{
18380 	  /* Else create the two half-width registers and combine them.  */
18381 	  if (!REG_P (lo))
18382 	    lo = force_reg (GET_MODE (lo), lo);
18383 	  if (!REG_P (hi))
18384 	    hi = force_reg (GET_MODE (hi), hi);
18385 
18386 	  if (BYTES_BIG_ENDIAN)
18387 	    std::swap (lo, hi);
18388 	  emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18389 	}
18390       return;
18391     }
18392 
18393   /* Count the number of variable elements to initialise.  */
18394   for (int i = 0; i < n_elts; ++i)
18395     {
18396       rtx x = XVECEXP (vals, 0, i);
18397       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
18398 	++n_var;
18399       else
18400 	any_const = x;
18401 
18402       all_same &= rtx_equal_p (x, v0);
18403     }
18404 
18405   /* No variable elements, hand off to aarch64_simd_make_constant which knows
18406      how best to handle this.  */
18407   if (n_var == 0)
18408     {
18409       rtx constant = aarch64_simd_make_constant (vals);
18410       if (constant != NULL_RTX)
18411 	{
18412 	  emit_move_insn (target, constant);
18413 	  return;
18414 	}
18415     }
18416 
18417   /* Splat a single non-constant element if we can.  */
18418   if (all_same)
18419     {
18420       rtx x = copy_to_mode_reg (inner_mode, v0);
18421       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18422       return;
18423     }
18424 
18425   enum insn_code icode = optab_handler (vec_set_optab, mode);
18426   gcc_assert (icode != CODE_FOR_nothing);
18427 
18428   /* If there are only variable elements, try to optimize
18429      the insertion using dup for the most common element
18430      followed by insertions.  */
18431 
18432   /* The algorithm will fill matches[*][0] with the earliest matching element,
18433      and matches[X][1] with the count of duplicate elements (if X is the
18434      earliest element which has duplicates).  */
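  /* For example, for V4SI VALS = {x, y, x, x} the loop below sets
     matches[0] = {0, 3}, matches[1] = {1, 1}, matches[2] = {0, 0} and
     matches[3] = {0, 0}, so element 0 (x) is chosen for the initial
     duplicate and only y needs a separate lane insertion afterwards.  */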
18435 
18436   if (n_var == n_elts && n_elts <= 16)
18437     {
18438       int matches[16][2] = {0};
18439       for (int i = 0; i < n_elts; i++)
18440 	{
18441 	  for (int j = 0; j <= i; j++)
18442 	    {
18443 	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18444 		{
18445 		  matches[i][0] = j;
18446 		  matches[j][1]++;
18447 		  break;
18448 		}
18449 	    }
18450 	}
18451       int maxelement = 0;
18452       int maxv = 0;
18453       for (int i = 0; i < n_elts; i++)
18454 	if (matches[i][1] > maxv)
18455 	  {
18456 	    maxelement = i;
18457 	    maxv = matches[i][1];
18458 	  }
18459 
18460       /* Create a duplicate of the most common element, unless all elements
18461 	 are equally useless to us, in which case just immediately set the
18462 	 vector register using the first element.  */
18463 
18464       if (maxv == 1)
18465 	{
18466 	  /* For vectors of two 64-bit elements, we can do even better.  */
18467 	  if (n_elts == 2
18468 	      && (inner_mode == E_DImode
18469 		  || inner_mode == E_DFmode))
18470 
18471 	    {
18472 	      rtx x0 = XVECEXP (vals, 0, 0);
18473 	      rtx x1 = XVECEXP (vals, 0, 1);
18474 	      /* Combine can pick up this case, but handling it directly
18475 		 here leaves clearer RTL.
18476 
18477 		 This is load_pair_lanes<mode>, and also gives us a clean-up
18478 		 for store_pair_lanes<mode>.  */
18479 	      if (memory_operand (x0, inner_mode)
18480 		  && memory_operand (x1, inner_mode)
18481 		  && !STRICT_ALIGNMENT
18482 		  && rtx_equal_p (XEXP (x1, 0),
18483 				  plus_constant (Pmode,
18484 						 XEXP (x0, 0),
18485 						 GET_MODE_SIZE (inner_mode))))
18486 		{
18487 		  rtx t;
18488 		  if (inner_mode == DFmode)
18489 		    t = gen_load_pair_lanesdf (target, x0, x1);
18490 		  else
18491 		    t = gen_load_pair_lanesdi (target, x0, x1);
18492 		  emit_insn (t);
18493 		  return;
18494 		}
18495 	    }
18496 	  /* The subreg-move sequence below will move into lane zero of the
18497 	     vector register.  For big-endian we want that position to hold
18498 	     the last element of VALS.  */
18499 	  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18500 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18501 	  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18502 	}
18503       else
18504 	{
18505 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18506 	  aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18507 	}
18508 
18509       /* Insert the rest.  */
18510       for (int i = 0; i < n_elts; i++)
18511 	{
18512 	  rtx x = XVECEXP (vals, 0, i);
18513 	  if (matches[i][0] == maxelement)
18514 	    continue;
18515 	  x = copy_to_mode_reg (inner_mode, x);
18516 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18517 	}
18518       return;
18519     }
18520 
18521   /* Initialise a vector which is part-variable.  We want to first try
18522      to build those lanes which are constant in the most efficient way we
18523      can.  */
18524   if (n_var != n_elts)
18525     {
18526       rtx copy = copy_rtx (vals);
18527 
18528       /* Load constant part of vector.  We really don't care what goes into the
18529 	 parts we will overwrite, but we're more likely to be able to load the
18530 	 constant efficiently if it has fewer, larger, repeating parts
18531 	 (see aarch64_simd_valid_immediate).  */
18532       for (int i = 0; i < n_elts; i++)
18533 	{
18534 	  rtx x = XVECEXP (vals, 0, i);
18535 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18536 	    continue;
18537 	  rtx subst = any_const;
18538 	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
18539 	    {
18540 	      /* Look in the copied vector, as more elements are const.  */
18541 	      rtx test = XVECEXP (copy, 0, i ^ bit);
18542 	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18543 		{
18544 		  subst = test;
18545 		  break;
18546 		}
18547 	    }
18548 	  XVECEXP (copy, 0, i) = subst;
18549 	}
18550       aarch64_expand_vector_init (target, copy);
18551     }
18552 
18553   /* Insert the variable lanes directly.  */
18554   for (int i = 0; i < n_elts; i++)
18555     {
18556       rtx x = XVECEXP (vals, 0, i);
18557       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18558 	continue;
18559       x = copy_to_mode_reg (inner_mode, x);
18560       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18561     }
18562 }
18563 
18564 /* Emit RTL corresponding to:
18565    insr TARGET, ELEM.  */
18566 
18567 static void
18568 emit_insr (rtx target, rtx elem)
18569 {
18570   machine_mode mode = GET_MODE (target);
18571   scalar_mode elem_mode = GET_MODE_INNER (mode);
18572   elem = force_reg (elem_mode, elem);
18573 
18574   insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18575   gcc_assert (icode != CODE_FOR_nothing);
18576   emit_insn (GEN_FCN (icode) (target, target, elem));
18577 }
18578 
18579 /* Subroutine of aarch64_sve_expand_vector_init for handling
18580    trailing constants.
18581    This function works as follows:
18582    (a) Create a new vector consisting of trailing constants.
18583    (b) Initialize TARGET with the constant vector using emit_move_insn.
18584    (c) Insert remaining elements in TARGET using insr.
18585    NELTS is the total number of elements in the original vector, while
18586    NELTS_REQD is the number of elements that are actually
18587    significant.
18588 
18589    ??? The heuristic used is to do the above only if the number of constants
18590    is at least half the total number of elements.  May need fine tuning.  */
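/* For example, with BUILDER = {x, y, 1, 2} (x and y variable) and
   NELTS == NELTS_REQD == 4, the two trailing constants qualify: TARGET is
   first set to the constant vector {1, 2, 0, 0} and the variable elements
   are then shifted in with "insr TARGET, y" followed by "insr TARGET, x",
   giving {x, y, 1, 2}.  */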
18591 
18592 static bool
18593 aarch64_sve_expand_vector_init_handle_trailing_constants
18594  (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18595 {
18596   machine_mode mode = GET_MODE (target);
18597   scalar_mode elem_mode = GET_MODE_INNER (mode);
18598   int n_trailing_constants = 0;
18599 
18600   for (int i = nelts_reqd - 1;
18601        i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
18602        i--)
18603     n_trailing_constants++;
18604 
18605   if (n_trailing_constants >= nelts_reqd / 2)
18606     {
18607       /* Try to use the natural pattern of BUILDER to extend the trailing
18608 	 constant elements to a full vector.  Replace any variables in the
18609 	 extra elements with zeros.
18610 
18611 	 ??? It would be better if the builders supported "don't care"
18612 	     elements, with the builder filling in whichever elements
18613 	     give the most compact encoding.  */
18614       rtx_vector_builder v (mode, nelts, 1);
18615       for (int i = 0; i < nelts; i++)
18616 	{
18617 	  rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
18618 	  if (!valid_for_const_vector_p (elem_mode, x))
18619 	    x = const0_rtx;
18620 	  v.quick_push (x);
18621 	}
18622       rtx const_vec = v.build ();
18623       emit_move_insn (target, const_vec);
18624 
18625       for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18626 	emit_insr (target, builder.elt (i));
18627 
18628       return true;
18629     }
18630 
18631   return false;
18632 }
18633 
18634 /* Subroutine of aarch64_sve_expand_vector_init.
18635    Works as follows:
18636    (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
18637    (b) Skip trailing elements from BUILDER, which are the same as
18638        element NELTS_REQD - 1.
18639    (c) Insert earlier elements in reverse order in TARGET using insr.  */
18640 
18641 static void
18642 aarch64_sve_expand_vector_init_insert_elems (rtx target,
18643 					     const rtx_vector_builder &builder,
18644 					     int nelts_reqd)
18645 {
18646   machine_mode mode = GET_MODE (target);
18647   scalar_mode elem_mode = GET_MODE_INNER (mode);
18648 
18649   struct expand_operand ops[2];
18650   enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
18651   gcc_assert (icode != CODE_FOR_nothing);
18652 
18653   create_output_operand (&ops[0], target, mode);
18654   create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
18655   expand_insn (icode, 2, ops);
18656 
18657   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18658   for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
18659     emit_insr (target, builder.elt (i));
18660 }
18661 
18662 /* Subroutine of aarch64_sve_expand_vector_init to handle case
18663    when all trailing elements of builder are same.
18664    This works as follows:
18665    (a) Use expand_insn interface to broadcast last vector element in TARGET.
18666    (b) Insert remaining elements in TARGET using insr.
18667 
18668    ??? The heuristic used is to do the above if the number of identical
18669    trailing elements is at least 3/4 of the total number of elements,
18670    loosely based on the heuristic in mostly_zeros_p.  May need fine-tuning.  */
18671 
18672 static bool
18673 aarch64_sve_expand_vector_init_handle_trailing_same_elem
18674  (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
18675 {
18676   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18677   if (ndups >= (3 * nelts_reqd) / 4)
18678     {
18679       aarch64_sve_expand_vector_init_insert_elems (target, builder,
18680 						   nelts_reqd - ndups + 1);
18681       return true;
18682     }
18683 
18684   return false;
18685 }
18686 
18687 /* Initialize register TARGET from BUILDER. NELTS is the constant number
18688    of elements in BUILDER.
18689 
18690    The function tries to initialize TARGET from BUILDER if it fits one
18691    of the special cases outlined below.
18692 
18693    Failing that, the function divides BUILDER into two sub-vectors:
18694    v_even = even elements of BUILDER;
18695    v_odd = odd elements of BUILDER;
18696 
18697    and recursively calls itself with v_even and v_odd.
18698 
18699    if (recursive call succeeded for v_even or v_odd)
18700      TARGET = zip (v_even, v_odd)
18701 
18702    The function returns true if it managed to build TARGET from BUILDER
18703    with one of the special cases, false otherwise.
18704 
18705    Example: {a, 1, b, 2, c, 3, d, 4}
18706 
18707    The vector gets divided into:
18708    v_even = {a, b, c, d}
18709    v_odd = {1, 2, 3, 4}
18710 
18711    aarch64_sve_expand_vector_init(v_odd) hits case 1 and
18712    initializes tmp2 from the constant vector v_odd using emit_move_insn.
18713 
18714    aarch64_sve_expand_vector_init(v_even) fails since v_even matches none
18715    of the special cases, so we construct tmp1 from v_even using insr:
18716    tmp1 = dup(d)
18717    insr tmp1, c
18718    insr tmp1, b
18719    insr tmp1, a
18720 
18721    And finally:
18722    TARGET = zip (tmp1, tmp2)
18723    which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */
18724 
18725 static bool
18726 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
18727 				int nelts, int nelts_reqd)
18728 {
18729   machine_mode mode = GET_MODE (target);
18730 
18731   /* Case 1: Vector contains trailing constants.  */
18732 
18733   if (aarch64_sve_expand_vector_init_handle_trailing_constants
18734        (target, builder, nelts, nelts_reqd))
18735     return true;
18736 
18737   /* Case 2: Vector contains leading constants.  */
18738 
18739   rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
18740   for (int i = 0; i < nelts_reqd; i++)
18741     rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
18742   rev_builder.finalize ();
18743 
18744   if (aarch64_sve_expand_vector_init_handle_trailing_constants
18745        (target, rev_builder, nelts, nelts_reqd))
18746     {
18747       emit_insn (gen_aarch64_sve_rev (mode, target, target));
18748       return true;
18749     }
18750 
18751   /* Case 3: Vector contains trailing same element.  */
18752 
18753   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18754        (target, builder, nelts_reqd))
18755     return true;
18756 
18757   /* Case 4: Vector contains leading same element.  */
18758 
18759   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18760        (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
18761     {
18762       emit_insn (gen_aarch64_sve_rev (mode, target, target));
18763       return true;
18764     }
18765 
18766   /* Avoid recursing below 4-elements.
18767      ??? The threshold 4 may need fine-tuning.  */
18768 
18769   if (nelts_reqd <= 4)
18770     return false;
18771 
18772   rtx_vector_builder v_even (mode, nelts, 1);
18773   rtx_vector_builder v_odd (mode, nelts, 1);
18774 
18775   for (int i = 0; i < nelts * 2; i += 2)
18776     {
18777       v_even.quick_push (builder.elt (i));
18778       v_odd.quick_push (builder.elt (i + 1));
18779     }
18780 
18781   v_even.finalize ();
18782   v_odd.finalize ();
18783 
18784   rtx tmp1 = gen_reg_rtx (mode);
18785   bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
18786 						    nelts, nelts_reqd / 2);
18787 
18788   rtx tmp2 = gen_reg_rtx (mode);
18789   bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
18790 						   nelts, nelts_reqd / 2);
18791 
18792   if (!did_even_p && !did_odd_p)
18793     return false;
18794 
18795   /* Initialize v_even and v_odd using INSR if it didn't match any of the
18796      special cases and zip v_even, v_odd.  */
18797 
18798   if (!did_even_p)
18799     aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
18800 
18801   if (!did_odd_p)
18802     aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
18803 
18804   rtvec v = gen_rtvec (2, tmp1, tmp2);
18805   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
18806   return true;
18807 }
18808 
18809 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
18810 
18811 void
18812 aarch64_sve_expand_vector_init (rtx target, rtx vals)
18813 {
18814   machine_mode mode = GET_MODE (target);
18815   int nelts = XVECLEN (vals, 0);
18816 
18817   rtx_vector_builder v (mode, nelts, 1);
18818   for (int i = 0; i < nelts; i++)
18819     v.quick_push (XVECEXP (vals, 0, i));
18820   v.finalize ();
18821 
18822   /* If neither sub-vector of v could be initialized specially,
18823      then use INSR to insert all elements from v into TARGET.
18824      ??? This might not be optimal for vectors with large
18825      initializers like 16-element or above.
18826      For nelts < 4, it probably isn't useful to handle specially.  */
18827 
18828   if (nelts < 4
18829       || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
18830     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
18831 }
18832 
18833 /* Check whether VALUE is a vector constant in which every element
18834    is either a power of 2 or a negated power of 2.  If so, return
18835    a constant vector of log2s, and flip CODE between PLUS and MINUS
18836    if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
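/* For example, a multiplication by {8, 8, 8, 8} becomes a left shift by
   {3, 3, 3, 3} with CODE unchanged, while a multiplication by
   {-4, -4, -4, -4} becomes a left shift by {2, 2, 2, 2} with CODE flipped
   between PLUS and MINUS.  */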
18837 
18838 static rtx
18839 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
18840 {
18841   if (GET_CODE (value) != CONST_VECTOR)
18842     return NULL_RTX;
18843 
18844   rtx_vector_builder builder;
18845   if (!builder.new_unary_operation (GET_MODE (value), value, false))
18846     return NULL_RTX;
18847 
18848   scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
18849   /* 1 if the result of the multiplication must be negated,
18850      0 if it mustn't, or -1 if we don't yet care.  */
18851   int negate = -1;
18852   unsigned int encoded_nelts = const_vector_encoded_nelts (value);
18853   for (unsigned int i = 0; i < encoded_nelts; ++i)
18854     {
18855       rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
18856       if (!CONST_SCALAR_INT_P (elt))
18857 	return NULL_RTX;
18858       rtx_mode_t val (elt, int_mode);
18859       wide_int pow2 = wi::neg (val);
18860       if (val != pow2)
18861 	{
18862 	  /* It matters whether we negate or not.  Make that choice,
18863 	     and make sure that it's consistent with previous elements.  */
18864 	  if (negate == !wi::neg_p (val))
18865 	    return NULL_RTX;
18866 	  negate = wi::neg_p (val);
18867 	  if (!negate)
18868 	    pow2 = val;
18869 	}
18870       /* POW2 is now the value that we want to be a power of 2.  */
18871       int shift = wi::exact_log2 (pow2);
18872       if (shift < 0)
18873 	return NULL_RTX;
18874       builder.quick_push (gen_int_mode (shift, int_mode));
18875     }
18876   if (negate == -1)
18877     /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
18878     code = PLUS;
18879   else if (negate == 1)
18880     code = code == PLUS ? MINUS : PLUS;
18881   return builder.build ();
18882 }
18883 
18884 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
18885    CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
18886    operands array, in the same order as for fma_optab.  Return true if
18887    the function emitted all the necessary instructions, false if the caller
18888    should generate the pattern normally with the new OPERANDS array.  */
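/* For example, an integer multiply-add whose multiplier operand is the
   constant vector {4, 4, ...} is emitted as a vector shift left by 2
   followed by an add (or a subtract, had the multiplier been a negated
   power of 2).  */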
18889 
18890 bool
18891 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
18892 {
18893   machine_mode mode = GET_MODE (operands[0]);
18894   if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
18895     {
18896       rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
18897 				  NULL_RTX, true, OPTAB_DIRECT);
18898       force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
18899 			  operands[3], product, operands[0], true,
18900 			  OPTAB_DIRECT);
18901       return true;
18902     }
18903   operands[2] = force_reg (mode, operands[2]);
18904   return false;
18905 }
18906 
18907 /* Likewise, but for a conditional pattern.  */
18908 
18909 bool
18910 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
18911 {
18912   machine_mode mode = GET_MODE (operands[0]);
18913   if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
18914     {
18915       rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
18916 				  NULL_RTX, true, OPTAB_DIRECT);
18917       emit_insn (gen_cond (code, mode, operands[0], operands[1],
18918 			   operands[4], product, operands[5]));
18919       return true;
18920     }
18921   operands[3] = force_reg (mode, operands[3]);
18922   return false;
18923 }
18924 
18925 static unsigned HOST_WIDE_INT
18926 aarch64_shift_truncation_mask (machine_mode mode)
18927 {
18928   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18929     return 0;
18930   return GET_MODE_UNIT_BITSIZE (mode) - 1;
18931 }
18932 
18933 /* Select a format to encode pointers in exception handling data.  */
18934 int
18935 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18936 {
18937    int type;
18938    switch (aarch64_cmodel)
18939      {
18940      case AARCH64_CMODEL_TINY:
18941      case AARCH64_CMODEL_TINY_PIC:
18942      case AARCH64_CMODEL_SMALL:
18943      case AARCH64_CMODEL_SMALL_PIC:
18944      case AARCH64_CMODEL_SMALL_SPIC:
18945        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
18946 	  for everything.  */
18947        type = DW_EH_PE_sdata4;
18948        break;
18949      default:
18950        /* No assumptions here.  8-byte relocs required.  */
18951        type = DW_EH_PE_sdata8;
18952        break;
18953      }
18954    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18955 }
18956 
18957 /* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
18958 
18959 static void
18960 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18961 {
18962   if (TREE_CODE (decl) == FUNCTION_DECL)
18963     {
18964       arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18965       if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18966 	{
18967 	  fprintf (stream, "\t.variant_pcs\t");
18968 	  assemble_name (stream, name);
18969 	  fprintf (stream, "\n");
18970 	}
18971     }
18972 }
18973 
18974 /* The last .arch and .tune assembly strings that we printed.  */
18975 static std::string aarch64_last_printed_arch_string;
18976 static std::string aarch64_last_printed_tune_string;
18977 
18978 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
18979    by the function fndecl.  */
18980 
18981 void
18982 aarch64_declare_function_name (FILE *stream, const char* name,
18983 				tree fndecl)
18984 {
18985   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18986 
18987   struct cl_target_option *targ_options;
18988   if (target_parts)
18989     targ_options = TREE_TARGET_OPTION (target_parts);
18990   else
18991     targ_options = TREE_TARGET_OPTION (target_option_current_node);
18992   gcc_assert (targ_options);
18993 
18994   const struct processor *this_arch
18995     = aarch64_get_arch (targ_options->x_explicit_arch);
18996 
18997   uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18998   std::string extension
18999     = aarch64_get_extension_string_for_isa_flags (isa_flags,
19000 						  this_arch->flags);
19001   /* Only update the assembler .arch string if it is distinct from the last
19002      such string we printed.  */
19003   std::string to_print = this_arch->name + extension;
19004   if (to_print != aarch64_last_printed_arch_string)
19005     {
19006       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
19007       aarch64_last_printed_arch_string = to_print;
19008     }
19009 
19010   /* Print the CPU name we're tuning for in a comment; it might be
19011      useful to readers of the generated asm.  Do it only when it changes
19012      from function to function and verbose assembly is requested.  */
19013   const struct processor *this_tune
19014     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
19015 
19016   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
19017     {
19018       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
19019 		   this_tune->name);
19020       aarch64_last_printed_tune_string = this_tune->name;
19021     }
19022 
19023   aarch64_asm_output_variant_pcs (stream, fndecl, name);
19024 
19025   /* Don't forget the type directive for ELF.  */
19026   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
19027   ASM_OUTPUT_LABEL (stream, name);
19028 
19029   cfun->machine->label_is_assembled = true;
19030 }
19031 
19032 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY.  Check if the patch area is after
19033    the function label and emit a BTI if necessary.  */
19034 
19035 void
19036 aarch64_print_patchable_function_entry (FILE *file,
19037 					unsigned HOST_WIDE_INT patch_area_size,
19038 					bool record_p)
19039 {
19040   if (cfun->machine->label_is_assembled
19041       && aarch64_bti_enabled ()
19042       && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
19043     {
19044       /* Remove the BTI that follows the patch area and insert a new BTI
19045 	 before the patch area right after the function label.  */
19046       rtx_insn *insn = next_real_nondebug_insn (get_insns ());
19047       if (insn
19048 	  && INSN_P (insn)
19049 	  && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19050 	  && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
19051 	delete_insn (insn);
19052       asm_fprintf (file, "\thint\t34 // bti c\n");
19053     }
19054 
19055   default_print_patchable_function_entry (file, patch_area_size, record_p);
19056 }
19057 
19058 /* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */
19059 
19060 void
19061 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
19062 {
19063   const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
19064   const char *value = IDENTIFIER_POINTER (target);
19065   aarch64_asm_output_variant_pcs (stream, decl, name);
19066   ASM_OUTPUT_DEF (stream, name, value);
19067 }
19068 
19069 /* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
19070    function symbol references.  */
19071 
19072 void
19073 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
19074 {
19075   default_elf_asm_output_external (stream, decl, name);
19076   aarch64_asm_output_variant_pcs (stream, decl, name);
19077 }
19078 
19079 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
19080    Used to output the .cfi_b_key_frame directive when signing the current
19081    function with the B key.  */
19082 
19083 void
19084 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
19085 {
19086   if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
19087       && aarch64_ra_sign_key == AARCH64_KEY_B)
19088 	asm_fprintf (f, "\t.cfi_b_key_frame\n");
19089 }
19090 
19091 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
19092 
19093 static void
19094 aarch64_start_file (void)
19095 {
19096   struct cl_target_option *default_options
19097     = TREE_TARGET_OPTION (target_option_default_node);
19098 
19099   const struct processor *default_arch
19100     = aarch64_get_arch (default_options->x_explicit_arch);
19101   uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
19102   std::string extension
19103     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
19104 						  default_arch->flags);
19105 
19106    aarch64_last_printed_arch_string = default_arch->name + extension;
19107    aarch64_last_printed_tune_string = "";
19108    asm_fprintf (asm_out_file, "\t.arch %s\n",
19109 		aarch64_last_printed_arch_string.c_str ());
19110 
19111    default_file_start ();
19112 }
19113 
19114 /* Emit load exclusive.  */
19115 
19116 static void
19117 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
19118 			     rtx mem, rtx model_rtx)
19119 {
19120   if (mode == TImode)
19121     emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
19122 						gen_highpart (DImode, rval),
19123 						mem, model_rtx));
19124   else
19125     emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
19126 }
19127 
19128 /* Emit store exclusive.  */
19129 
19130 static void
19131 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
19132 			      rtx mem, rtx rval, rtx model_rtx)
19133 {
19134   if (mode == TImode)
19135     emit_insn (gen_aarch64_store_exclusive_pair
19136 	       (bval, mem, operand_subword (rval, 0, 0, TImode),
19137 		operand_subword (rval, 1, 0, TImode), model_rtx));
19138   else
19139     emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
19140 }
19141 
19142 /* Mark the previous jump instruction as unlikely.  */
19143 
19144 static void
19145 aarch64_emit_unlikely_jump (rtx insn)
19146 {
19147   rtx_insn *jump = emit_jump_insn (insn);
19148   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
19149 }
19150 
19151 /* We store the names of the various atomic helpers in a 5x4 array.
19152    Return the libcall function given MODE, MODEL and NAMES.  */
19153 
19154 rtx
19155 aarch64_atomic_ool_func (machine_mode mode, rtx model_rtx,
19156 			const atomic_ool_names *names)
19157 {
19158   memmodel model = memmodel_base (INTVAL (model_rtx));
19159   int mode_idx, model_idx;
19160 
19161   switch (mode)
19162     {
19163     case E_QImode:
19164       mode_idx = 0;
19165       break;
19166     case E_HImode:
19167       mode_idx = 1;
19168       break;
19169     case E_SImode:
19170       mode_idx = 2;
19171       break;
19172     case E_DImode:
19173       mode_idx = 3;
19174       break;
19175     case E_TImode:
19176       mode_idx = 4;
19177       break;
19178     default:
19179       gcc_unreachable ();
19180     }
19181 
19182   switch (model)
19183     {
19184     case MEMMODEL_RELAXED:
19185       model_idx = 0;
19186       break;
19187     case MEMMODEL_CONSUME:
19188     case MEMMODEL_ACQUIRE:
19189       model_idx = 1;
19190       break;
19191     case MEMMODEL_RELEASE:
19192       model_idx = 2;
19193       break;
19194     case MEMMODEL_ACQ_REL:
19195     case MEMMODEL_SEQ_CST:
19196       model_idx = 3;
19197       break;
19198     default:
19199       gcc_unreachable ();
19200     }
19201 
19202   return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
19203 				      VISIBILITY_HIDDEN);
19204 }
19205 
19206 #define DEF0(B, N) \
19207   { "__aarch64_" #B #N "_relax", \
19208     "__aarch64_" #B #N "_acq", \
19209     "__aarch64_" #B #N "_rel", \
19210     "__aarch64_" #B #N "_acq_rel" }
19211 
19212 #define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
19213 		 { NULL, NULL, NULL, NULL }
19214 #define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
19215 
19216 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
19217 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
19218 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
19219 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
19220 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
19221 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
19222 
19223 #undef DEF0
19224 #undef DEF4
19225 #undef DEF5
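/* With the tables above, an SImode (mode_idx 2) compare and swap with
   acquire semantics (model_idx 1) resolves to the out-of-line helper
   "__aarch64_cas4_acq".  */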
19226 
19227 /* Expand a compare and swap pattern.  */
19228 
19229 void
19230 aarch64_expand_compare_and_swap (rtx operands[])
19231 {
19232   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
19233   machine_mode mode, r_mode;
19234 
19235   bval = operands[0];
19236   rval = operands[1];
19237   mem = operands[2];
19238   oldval = operands[3];
19239   newval = operands[4];
19240   is_weak = operands[5];
19241   mod_s = operands[6];
19242   mod_f = operands[7];
19243   mode = GET_MODE (mem);
19244 
19245   /* Normally the succ memory model must be stronger than fail, but in the
19246      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
19247      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
19248   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
19249       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
19250     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
19251 
19252   r_mode = mode;
19253   if (mode == QImode || mode == HImode)
19254     {
19255       r_mode = SImode;
19256       rval = gen_reg_rtx (r_mode);
19257     }
19258 
19259   if (TARGET_LSE)
19260     {
19261       /* The CAS insn requires oldval and rval overlap, but we need to
19262 	 have a copy of oldval saved across the operation to tell if
19263 	 the operation is successful.  */
19264       if (reg_overlap_mentioned_p (rval, oldval))
19265         rval = copy_to_mode_reg (r_mode, oldval);
19266       else
19267 	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
19268 
19269       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
19270 						   newval, mod_s));
19271       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19272     }
19273   else if (TARGET_OUTLINE_ATOMICS)
19274     {
19275       /* Oldval must satisfy compare afterward.  */
19276       if (!aarch64_plus_operand (oldval, mode))
19277 	oldval = force_reg (mode, oldval);
19278       rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
19279       rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
19280 				      oldval, mode, newval, mode,
19281 				      XEXP (mem, 0), Pmode);
19282       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19283     }
19284   else
19285     {
19286       /* The oldval predicate varies by mode.  Test it and force to reg.  */
19287       insn_code code = code_for_aarch64_compare_and_swap (mode);
19288       if (!insn_data[code].operand[2].predicate (oldval, mode))
19289 	oldval = force_reg (mode, oldval);
19290 
19291       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
19292 				 is_weak, mod_s, mod_f));
19293       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
19294     }
19295 
19296   if (r_mode != mode)
19297     rval = gen_lowpart (mode, rval);
19298   emit_move_insn (operands[1], rval);
19299 
19300   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
19301   emit_insn (gen_rtx_SET (bval, x));
19302 }
19303 
19304 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
19305    sequence implementing an atomic operation.  */
19306 
19307 static void
19308 aarch64_emit_post_barrier (enum memmodel model)
19309 {
19310   const enum memmodel base_model = memmodel_base (model);
19311 
19312   if (is_mm_sync (model)
19313       && (base_model == MEMMODEL_ACQUIRE
19314 	  || base_model == MEMMODEL_ACQ_REL
19315 	  || base_model == MEMMODEL_SEQ_CST))
19316     {
19317       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19318     }
19319 }
19320 
19321 /* Split a compare and swap pattern.  */
19322 
19323 void
19324 aarch64_split_compare_and_swap (rtx operands[])
19325 {
19326   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
19327   gcc_assert (epilogue_completed);
19328 
19329   rtx rval, mem, oldval, newval, scratch, x, model_rtx;
19330   machine_mode mode;
19331   bool is_weak;
19332   rtx_code_label *label1, *label2;
19333   enum memmodel model;
19334 
19335   rval = operands[0];
19336   mem = operands[1];
19337   oldval = operands[2];
19338   newval = operands[3];
19339   is_weak = (operands[4] != const0_rtx);
19340   model_rtx = operands[5];
19341   scratch = operands[7];
19342   mode = GET_MODE (mem);
19343   model = memmodel_from_int (INTVAL (model_rtx));
19344 
19345   /* When OLDVAL is zero and we want the strong version we can emit a tighter
19346     loop:
19347     .label1:
19348 	LD[A]XR	rval, [mem]
19349 	CBNZ	rval, .label2
19350 	ST[L]XR	scratch, newval, [mem]
19351 	CBNZ	scratch, .label1
19352     .label2:
19353 	CMP	rval, 0.  */
19354   bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
19355 			oldval == const0_rtx && mode != TImode);
19356 
19357   label1 = NULL;
19358   if (!is_weak)
19359     {
19360       label1 = gen_label_rtx ();
19361       emit_label (label1);
19362     }
19363   label2 = gen_label_rtx ();
19364 
19365   /* The initial load can be relaxed for a __sync operation since a final
19366      barrier will be emitted to stop code hoisting.  */
19367   if (is_mm_sync (model))
19368     aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
19369   else
19370     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
19371 
19372   if (strong_zero_p)
19373     x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
19374   else
19375     {
19376       rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19377       x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
19378     }
19379   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19380 			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19381   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19382 
19383   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
19384 
19385   if (!is_weak)
19386     {
19387       if (aarch64_track_speculation)
19388 	{
19389 	  /* Emit an explicit compare instruction, so that we can correctly
19390 	     track the condition codes.  */
19391 	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19392 	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19393 	}
19394       else
19395 	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19396 
19397       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19398 				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
19399       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19400     }
19401   else
19402     aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19403 
19404   emit_label (label2);
19405 
19406   /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
19407      to set the condition flags.  If this is not used it will be removed by
19408      later passes.  */
19409   if (strong_zero_p)
19410     aarch64_gen_compare_reg (NE, rval, const0_rtx);
19411 
19412   /* Emit any final barrier needed for a __sync operation.  */
19413   if (is_mm_sync (model))
19414     aarch64_emit_post_barrier (model);
19415 }
19416 
19417 /* Split an atomic operation.  */
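/* The split form is a load-exclusive/store-exclusive retry loop; for a
   32-bit fetch-and-add with relaxed ordering it is roughly:

     .L1: ldxr  w_old, [mem]
          add   w_new, w_old, w_value
          stxr  w_tmp, w_new, [mem]
          cbnz  w_tmp, .L1  */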
19418 
19419 void
19420 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
19421 			 rtx value, rtx model_rtx, rtx cond)
19422 {
19423   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
19424   gcc_assert (epilogue_completed);
19425 
19426   machine_mode mode = GET_MODE (mem);
19427   machine_mode wmode = (mode == DImode ? DImode : SImode);
19428   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19429   const bool is_sync = is_mm_sync (model);
19430   rtx_code_label *label;
19431   rtx x;
19432 
19433   /* Split the atomic operation into a sequence.  */
19434   label = gen_label_rtx ();
19435   emit_label (label);
19436 
19437   if (new_out)
19438     new_out = gen_lowpart (wmode, new_out);
19439   if (old_out)
19440     old_out = gen_lowpart (wmode, old_out);
19441   else
19442     old_out = new_out;
19443   value = simplify_gen_subreg (wmode, value, mode, 0);
19444 
19445   /* The initial load can be relaxed for a __sync operation since a final
19446      barrier will be emitted to stop code hoisting.  */
19447   if (is_sync)
19448     aarch64_emit_load_exclusive (mode, old_out, mem,
19449 				 GEN_INT (MEMMODEL_RELAXED));
19450   else
19451     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
19452 
19453   switch (code)
19454     {
19455     case SET:
19456       new_out = value;
19457       break;
19458 
19459     case NOT:
19460       x = gen_rtx_AND (wmode, old_out, value);
19461       emit_insn (gen_rtx_SET (new_out, x));
19462       x = gen_rtx_NOT (wmode, new_out);
19463       emit_insn (gen_rtx_SET (new_out, x));
19464       break;
19465 
19466     case MINUS:
19467       if (CONST_INT_P (value))
19468 	{
19469 	  value = GEN_INT (-INTVAL (value));
19470 	  code = PLUS;
19471 	}
19472       /* Fall through.  */
19473 
19474     default:
19475       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
19476       emit_insn (gen_rtx_SET (new_out, x));
19477       break;
19478     }
19479 
19480   aarch64_emit_store_exclusive (mode, cond, mem,
19481 				gen_lowpart (mode, new_out), model_rtx);
19482 
19483   if (aarch64_track_speculation)
19484     {
19485       /* Emit an explicit compare instruction, so that we can correctly
19486 	 track the condition codes.  */
19487       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19488       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19489     }
19490   else
19491     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19492 
19493   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19494 			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
19495   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19496 
19497   /* Emit any final barrier needed for a __sync operation.  */
19498   if (is_sync)
19499     aarch64_emit_post_barrier (model);
19500 }
19501 
19502 static void
19503 aarch64_init_libfuncs (void)
19504 {
19505    /* Half-precision float operations.  The compiler handles all operations
19506      with NULL libfuncs by converting to SFmode.  */
19507 
19508   /* Conversions.  */
19509   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19510   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19511 
19512   /* Arithmetic.  */
19513   set_optab_libfunc (add_optab, HFmode, NULL);
19514   set_optab_libfunc (sdiv_optab, HFmode, NULL);
19515   set_optab_libfunc (smul_optab, HFmode, NULL);
19516   set_optab_libfunc (neg_optab, HFmode, NULL);
19517   set_optab_libfunc (sub_optab, HFmode, NULL);
19518 
19519   /* Comparisons.  */
19520   set_optab_libfunc (eq_optab, HFmode, NULL);
19521   set_optab_libfunc (ne_optab, HFmode, NULL);
19522   set_optab_libfunc (lt_optab, HFmode, NULL);
19523   set_optab_libfunc (le_optab, HFmode, NULL);
19524   set_optab_libfunc (ge_optab, HFmode, NULL);
19525   set_optab_libfunc (gt_optab, HFmode, NULL);
19526   set_optab_libfunc (unord_optab, HFmode, NULL);
19527 }
19528 
19529 /* Target hook for c_mode_for_suffix.  */
19530 static machine_mode
19531 aarch64_c_mode_for_suffix (char suffix)
19532 {
19533   if (suffix == 'q')
19534     return TFmode;
19535 
19536   return VOIDmode;
19537 }
19538 
19539 /* We can only represent floating point constants which will fit in
19540    "quarter-precision" values.  These values are characterised by
19541    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
19542    by:
19543 
19544    (-1)^s * (n/16) * 2^r
19545 
19546    Where:
19547      's' is the sign bit.
19548      'n' is an integer in the range 16 <= n <= 31.
19549      'r' is an integer in the range -3 <= r <= 4.  */
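/* For example, 0.25 is representable as (-1)^0 * (16/16) * 2^-2
   (s = 0, n = 16, r = -2), whereas 1.0/3.0 has no such encoding and
   0.0 is explicitly rejected below.  */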
19550 
19551 /* Return true iff X can be represented by a quarter-precision
19552    floating point immediate operand.  Note, we cannot represent 0.0.  */
19553 bool
19554 aarch64_float_const_representable_p (rtx x)
19555 {
19556   /* This represents our current view of how many bits
19557      make up the mantissa.  */
19558   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
19559   int exponent;
19560   unsigned HOST_WIDE_INT mantissa, mask;
19561   REAL_VALUE_TYPE r, m;
19562   bool fail;
19563 
19564   x = unwrap_const_vec_duplicate (x);
19565   if (!CONST_DOUBLE_P (x))
19566     return false;
19567 
19568   if (GET_MODE (x) == VOIDmode
19569       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
19570     return false;
19571 
19572   r = *CONST_DOUBLE_REAL_VALUE (x);
19573 
19574   /* We cannot represent infinities, NaNs or +/-zero.  We won't
19575      know if we have +zero until we analyse the mantissa, but we
19576      can reject the other invalid values.  */
19577   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19578       || REAL_VALUE_MINUS_ZERO (r))
19579     return false;
19580 
19581   /* Extract exponent.  */
19582   r = real_value_abs (&r);
19583   exponent = REAL_EXP (&r);
19584 
19585   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
19586      highest (sign) bit, with a fixed binary point at bit point_pos.
19587      the low part of the mantissa lands in W's element 0, the high in element 1.
19588      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
19589      bits for the mantissa, this can fail (low bits will be lost).  */
19590   real_ldexp (&m, &r, point_pos - exponent);
19591   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
19592 
19593   /* If the low part of the mantissa has bits set we cannot represent
19594      the value.  */
19595   if (w.ulow () != 0)
19596     return false;
19597   /* We have rejected the lower HOST_WIDE_INT, so update our
19598      understanding of how many bits lie in the mantissa and
19599      look only at the high HOST_WIDE_INT.  */
19600   mantissa = w.elt (1);
19601   point_pos -= HOST_BITS_PER_WIDE_INT;
19602 
19603   /* We can only represent values with a mantissa of the form 1.xxxx.  */
19604   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19605   if ((mantissa & mask) != 0)
19606     return false;
19607 
19608   /* Having filtered unrepresentable values, we may now remove all
19609      but the highest 5 bits.  */
19610   mantissa >>= point_pos - 5;
19611 
19612   /* We cannot represent the value 0.0, so reject it.  This is handled
19613      elsewhere.  */
19614   if (mantissa == 0)
19615     return false;
19616 
19617   /* Then, as bit 4 is always set, we can mask it off, leaving
19618      the mantissa in the range [0, 15].  */
19619   mantissa &= ~(1 << 4);
19620   gcc_assert (mantissa <= 15);
19621 
19622   /* GCC internally does not use IEEE754-like encoding (where normalized
19623      significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
19624      Our mantissa values are shifted 4 places to the left relative to
19625      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19626      by 5 places to correct for GCC's representation.  */
19627   exponent = 5 - exponent;
19628 
19629   return (exponent >= 0 && exponent <= 7);
19630 }
19631 
19632 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
19633    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
19634    output MOVI/MVNI, ORR or BIC immediate.  */
19635 char*
19636 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
19637 				   enum simd_immediate_check which)
19638 {
19639   bool is_valid;
19640   static char templ[40];
19641   const char *mnemonic;
19642   const char *shift_op;
19643   unsigned int lane_count = 0;
19644   char element_char;
19645 
19646   struct simd_immediate_info info;
19647 
19648   /* This will return true to show const_vector is legal for use as either
19649      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
19650      It will also update INFO to show how the immediate should be generated.
19651      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
19652   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
19653   gcc_assert (is_valid);
19654 
19655   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19656   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
19657 
19658   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19659     {
19660       gcc_assert (info.insn == simd_immediate_info::MOV
19661 		  && info.u.mov.shift == 0);
19662       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
19663 	 move immediate path.  */
19664       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19665         info.u.mov.value = GEN_INT (0);
19666       else
19667 	{
19668 	  const unsigned int buf_size = 20;
19669 	  char float_buf[buf_size] = {'\0'};
19670 	  real_to_decimal_for_mode (float_buf,
19671 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19672 				    buf_size, buf_size, 1, info.elt_mode);
19673 
19674 	  if (lane_count == 1)
19675 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
19676 	  else
19677 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
19678 		      lane_count, element_char, float_buf);
19679 	  return templ;
19680 	}
19681     }
19682 
19683   gcc_assert (CONST_INT_P (info.u.mov.value));
19684 
19685   if (which == AARCH64_CHECK_MOV)
19686     {
19687       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
19688       shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
19689 		  ? "msl" : "lsl");
19690       if (lane_count == 1)
19691 	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
19692 		  mnemonic, UINTVAL (info.u.mov.value));
19693       else if (info.u.mov.shift)
19694 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19695 		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
19696 		  element_char, UINTVAL (info.u.mov.value), shift_op,
19697 		  info.u.mov.shift);
19698       else
19699 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19700 		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
19701 		  element_char, UINTVAL (info.u.mov.value));
19702     }
19703   else
19704     {
19705       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
19706       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
19707       if (info.u.mov.shift)
19708 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19709 		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
19710 		  element_char, UINTVAL (info.u.mov.value), "lsl",
19711 		  info.u.mov.shift);
19712       else
19713 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19714 		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
19715 		  element_char, UINTVAL (info.u.mov.value));
19716     }
19717   return templ;
19718 }
19719 
19720 char*
19721 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
19722 {
19723 
19724   /* If a floating point number was passed and we desire to use it in an
19725      integer mode, do the conversion to integer.  */
19726   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
19727     {
19728       unsigned HOST_WIDE_INT ival;
19729       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
19730 	  gcc_unreachable ();
19731       immediate = gen_int_mode (ival, mode);
19732     }
19733 
19734   machine_mode vmode;
19735   /* Use a 64-bit container mode for everything except DI/DF mode, where we
19736      use a 128-bit vector mode.  */
19737   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
19738 
19739   vmode = aarch64_simd_container_mode (mode, width);
19740   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
19741   return aarch64_output_simd_mov_immediate (v_op, width);
19742 }
19743 
19744 /* Return the output string to use for moving immediate CONST_VECTOR
19745    into an SVE register.  */
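/* For example, a duplicated integer constant yields the template
   "mov\t%0.s, #5", a linear series "index\t%0.s, #0, #1" and an
   all-false predicate "pfalse\t%0.b".  */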
19746 
19747 char *
19748 aarch64_output_sve_mov_immediate (rtx const_vector)
19749 {
19750   static char templ[40];
19751   struct simd_immediate_info info;
19752   char element_char;
19753 
19754   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
19755   gcc_assert (is_valid);
19756 
19757   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19758 
19759   machine_mode vec_mode = GET_MODE (const_vector);
19760   if (aarch64_sve_pred_mode_p (vec_mode))
19761     {
19762       static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
19763       if (info.insn == simd_immediate_info::MOV)
19764 	{
19765 	  gcc_assert (info.u.mov.value == const0_rtx);
19766 	  snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
19767 	}
19768       else
19769 	{
19770 	  gcc_assert (info.insn == simd_immediate_info::PTRUE);
19771 	  unsigned int total_bytes;
19772 	  if (info.u.pattern == AARCH64_SV_ALL
19773 	      && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
19774 	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
19775 		      total_bytes / GET_MODE_SIZE (info.elt_mode));
19776 	  else
19777 	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
19778 		      svpattern_token (info.u.pattern));
19779 	}
19780       return buf;
19781     }
19782 
19783   if (info.insn == simd_immediate_info::INDEX)
19784     {
19785       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
19786 		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
19787 		element_char, INTVAL (info.u.index.base),
19788 		INTVAL (info.u.index.step));
19789       return templ;
19790     }
19791 
19792   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19793     {
19794       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19795 	info.u.mov.value = GEN_INT (0);
19796       else
19797 	{
19798 	  const int buf_size = 20;
19799 	  char float_buf[buf_size] = {};
19800 	  real_to_decimal_for_mode (float_buf,
19801 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19802 				    buf_size, buf_size, 1, info.elt_mode);
19803 
19804 	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
19805 		    element_char, float_buf);
19806 	  return templ;
19807 	}
19808     }
19809 
19810   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
19811 	    element_char, INTVAL (info.u.mov.value));
19812   return templ;
19813 }
19814 
19815 /* Return the asm template for a PTRUES.  CONST_UNSPEC is the
19816    aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19817    pattern.  */
19818 
19819 char *
19820 aarch64_output_sve_ptrues (rtx const_unspec)
19821 {
19822   static char templ[40];
19823 
19824   struct simd_immediate_info info;
19825   bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
19826   gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
19827 
19828   char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19829   snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
19830 	    svpattern_token (info.u.pattern));
19831   return templ;
19832 }
19833 
19834 /* Split operands into moves from op[1] + op[2] into op[0].  */
19835 
19836 void
19837 aarch64_split_combinev16qi (rtx operands[3])
19838 {
19839   unsigned int dest = REGNO (operands[0]);
19840   unsigned int src1 = REGNO (operands[1]);
19841   unsigned int src2 = REGNO (operands[2]);
19842   machine_mode halfmode = GET_MODE (operands[1]);
19843   unsigned int halfregs = REG_NREGS (operands[1]);
19844   rtx destlo, desthi;
19845 
19846   gcc_assert (halfmode == V16QImode);
19847 
19848   if (src1 == dest && src2 == dest + halfregs)
19849     {
19850       /* No-op move.  Can't split to nothing; emit something.  */
19851       emit_note (NOTE_INSN_DELETED);
19852       return;
19853     }
19854 
19855   /* Preserve register attributes for variable tracking.  */
19856   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
19857   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
19858 			       GET_MODE_SIZE (halfmode));
19859 
19860   /* Special case of reversed high/low parts.  */
19861   if (reg_overlap_mentioned_p (operands[2], destlo)
19862       && reg_overlap_mentioned_p (operands[1], desthi))
19863     {
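      /* Swap the two inputs in place using three XORs, which avoids the
	 need for a scratch register.  */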
19864       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19865       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
19866       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19867     }
19868   else if (!reg_overlap_mentioned_p (operands[2], destlo))
19869     {
19870       /* Try to avoid unnecessary moves if part of the result
19871 	 is in the right place already.  */
19872       if (src1 != dest)
19873 	emit_move_insn (destlo, operands[1]);
19874       if (src2 != dest + halfregs)
19875 	emit_move_insn (desthi, operands[2]);
19876     }
19877   else
19878     {
19879       if (src2 != dest + halfregs)
19880 	emit_move_insn (desthi, operands[2]);
19881       if (src1 != dest)
19882 	emit_move_insn (destlo, operands[1]);
19883     }
19884 }
19885 
19886 /* vec_perm support.  */
19887 
19888 struct expand_vec_perm_d
19889 {
19890   rtx target, op0, op1;
19891   vec_perm_indices perm;
19892   machine_mode vmode;
19893   unsigned int vec_flags;
19894   bool one_vector_p;
19895   bool testing_p;
19896 };
19897 
19898 /* Generate a variable permutation.  */
19899 
19900 static void
19901 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
19902 {
19903   machine_mode vmode = GET_MODE (target);
19904   bool one_vector_p = rtx_equal_p (op0, op1);
19905 
19906   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
19907   gcc_checking_assert (GET_MODE (op0) == vmode);
19908   gcc_checking_assert (GET_MODE (op1) == vmode);
19909   gcc_checking_assert (GET_MODE (sel) == vmode);
19910   gcc_checking_assert (TARGET_SIMD);
19911 
19912   if (one_vector_p)
19913     {
19914       if (vmode == V8QImode)
19915 	{
19916 	  /* Expand the argument to a V16QI mode by duplicating it.  */
19917 	  rtx pair = gen_reg_rtx (V16QImode);
19918 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
19919 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19920 	}
19921       else
19922 	{
19923 	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
19924 	}
19925     }
19926   else
19927     {
19928       rtx pair;
19929 
19930       if (vmode == V8QImode)
19931 	{
19932 	  pair = gen_reg_rtx (V16QImode);
19933 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
19934 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
19935 	}
19936       else
19937 	{
19938 	  pair = gen_reg_rtx (OImode);
19939 	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
19940 	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
19941 	}
19942     }
19943 }
19944 
19945 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
19946    NELT is the number of elements in the vector.  */
19947 
19948 void
19949 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
19950 			 unsigned int nelt)
19951 {
19952   machine_mode vmode = GET_MODE (target);
19953   bool one_vector_p = rtx_equal_p (op0, op1);
19954   rtx mask;
19955 
19956   /* The TBL instruction does not use a modulo index, so we must take care
19957      of that ourselves.  */
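  /* For example, with two V16QI inputs (NELT == 16) the valid index range
     is 0...31, so each selector element is ANDed with 31; with a single
     input it is ANDed with 15.  */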
19958   mask = aarch64_simd_gen_const_vector_dup (vmode,
19959       one_vector_p ? nelt - 1 : 2 * nelt - 1);
19960   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19961 
19962   /* For big-endian, we also need to reverse the index within the vector
19963      (but not which vector).  */
19964   if (BYTES_BIG_ENDIAN)
19965     {
19966       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
19967       if (!one_vector_p)
19968         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19969       sel = expand_simple_binop (vmode, XOR, sel, mask,
19970 				 NULL, 0, OPTAB_LIB_WIDEN);
19971     }
19972   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
19973 }
19974 
19975 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
19976 
19977 static void
19978 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19979 {
19980   emit_insn (gen_rtx_SET (target,
19981 			  gen_rtx_UNSPEC (GET_MODE (target),
19982 					  gen_rtvec (2, op0, op1), code)));
19983 }
19984 
19985 /* Expand an SVE vec_perm with the given operands.  */
19986 
19987 void
19988 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19989 {
19990   machine_mode data_mode = GET_MODE (target);
19991   machine_mode sel_mode = GET_MODE (sel);
19992   /* Enforced by the pattern condition.  */
19993   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19994 
19995   /* Note: vec_perm indices are supposed to wrap when they go beyond the
19996      size of the two value vectors, i.e. the upper bits of the indices
19997      are effectively ignored.  SVE TBL instead produces 0 for any
19998      out-of-range indices, so we need to modulo all the vec_perm indices
19999      to ensure they are all in range.  */
20000   rtx sel_reg = force_reg (sel_mode, sel);
20001 
20002   /* Check if the sel only references the first values vector.  */
20003   if (GET_CODE (sel) == CONST_VECTOR
20004       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
20005     {
20006       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
20007       return;
20008     }
20009 
20010   /* Check if the two values vectors are the same.  */
20011   if (rtx_equal_p (op0, op1))
20012     {
20013       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
20014       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20015 					 NULL, 0, OPTAB_DIRECT);
20016       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
20017       return;
20018     }
20019 
20020   /* Run a TBL lookup on each value vector and combine the results.  */
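  /* An index in [0, NUNITS - 1] is in range for the OP0 lookup and, after
     NUNITS is subtracted, becomes a large out-of-range value for the OP1
     lookup, so the second TBL contributes zero for it.  Conversely, an index
     in [NUNITS, 2 * NUNITS - 1] gives zero from the first TBL but selects the
     right element of OP1 from the second.  ORing the two results therefore
     yields the full permutation.  */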
20021 
20022   rtx res0 = gen_reg_rtx (data_mode);
20023   rtx res1 = gen_reg_rtx (data_mode);
20024   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
20025   if (GET_CODE (sel) != CONST_VECTOR
20026       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
20027     {
20028       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
20029 						       2 * nunits - 1);
20030       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20031 				     NULL, 0, OPTAB_DIRECT);
20032     }
20033   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
20034   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
20035 				     NULL, 0, OPTAB_DIRECT);
20036   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
20037   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
20038     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
20039   else
20040     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
20041 }
20042 
20043 /* Recognize patterns suitable for the TRN instructions.  */
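/* For example, with V4SI operands the selector { 0, 4, 2, 6 } matches TRN1
   and { 1, 5, 3, 7 } matches TRN2.  */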
20044 static bool
20045 aarch64_evpc_trn (struct expand_vec_perm_d *d)
20046 {
20047   HOST_WIDE_INT odd;
20048   poly_uint64 nelt = d->perm.length ();
20049   rtx out, in0, in1, x;
20050   machine_mode vmode = d->vmode;
20051 
20052   if (GET_MODE_UNIT_SIZE (vmode) > 8)
20053     return false;
20054 
20055   /* Note that these are little-endian tests.
20056      We correct for big-endian later.  */
20057   if (!d->perm[0].is_constant (&odd)
20058       || (odd != 0 && odd != 1)
20059       || !d->perm.series_p (0, 2, odd, 2)
20060       || !d->perm.series_p (1, 2, nelt + odd, 2))
20061     return false;
20062 
20063   /* Success!  */
20064   if (d->testing_p)
20065     return true;
20066 
20067   in0 = d->op0;
20068   in1 = d->op1;
20069   /* We don't need a big-endian lane correction for SVE; see the comment
20070      at the head of aarch64-sve.md for details.  */
20071   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20072     {
20073       x = in0, in0 = in1, in1 = x;
20074       odd = !odd;
20075     }
20076   out = d->target;
20077 
20078   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20079 				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
20080   return true;
20081 }
20082 
20083 /* Recognize patterns suitable for the UZP instructions.  */
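/* For example, with V4SI operands the selector { 0, 2, 4, 6 } matches UZP1
   and { 1, 3, 5, 7 } matches UZP2.  */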
20084 static bool
20085 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
20086 {
20087   HOST_WIDE_INT odd;
20088   rtx out, in0, in1, x;
20089   machine_mode vmode = d->vmode;
20090 
20091   if (GET_MODE_UNIT_SIZE (vmode) > 8)
20092     return false;
20093 
20094   /* Note that these are little-endian tests.
20095      We correct for big-endian later.  */
20096   if (!d->perm[0].is_constant (&odd)
20097       || (odd != 0 && odd != 1)
20098       || !d->perm.series_p (0, 1, odd, 2))
20099     return false;
20100 
20101   /* Success!  */
20102   if (d->testing_p)
20103     return true;
20104 
20105   in0 = d->op0;
20106   in1 = d->op1;
20107   /* We don't need a big-endian lane correction for SVE; see the comment
20108      at the head of aarch64-sve.md for details.  */
20109   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20110     {
20111       x = in0, in0 = in1, in1 = x;
20112       odd = !odd;
20113     }
20114   out = d->target;
20115 
20116   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20117 				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
20118   return true;
20119 }
20120 
20121 /* Recognize patterns suitable for the ZIP instructions.  */
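/* For example, with V4SI operands the selector { 0, 4, 1, 5 } matches ZIP1
   and { 2, 6, 3, 7 } matches ZIP2.  */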
20122 static bool
20123 aarch64_evpc_zip (struct expand_vec_perm_d *d)
20124 {
20125   unsigned int high;
20126   poly_uint64 nelt = d->perm.length ();
20127   rtx out, in0, in1, x;
20128   machine_mode vmode = d->vmode;
20129 
20130   if (GET_MODE_UNIT_SIZE (vmode) > 8)
20131     return false;
20132 
20133   /* Note that these are little-endian tests.
20134      We correct for big-endian later.  */
20135   poly_uint64 first = d->perm[0];
20136   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
20137       || !d->perm.series_p (0, 2, first, 1)
20138       || !d->perm.series_p (1, 2, first + nelt, 1))
20139     return false;
20140   high = maybe_ne (first, 0U);
20141 
20142   /* Success!  */
20143   if (d->testing_p)
20144     return true;
20145 
20146   in0 = d->op0;
20147   in1 = d->op1;
20148   /* We don't need a big-endian lane correction for SVE; see the comment
20149      at the head of aarch64-sve.md for details.  */
20150   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20151     {
20152       x = in0, in0 = in1, in1 = x;
20153       high = !high;
20154     }
20155   out = d->target;
20156 
20157   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20158 				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
20159   return true;
20160 }
20161 
20162 /* Recognize patterns for the EXT insn.  */
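/* For example, with V4SI operands the selector { 1, 2, 3, 4 } takes the last
   three elements of the first vector followed by the first element of the
   second, i.e. an EXT with an element offset of 1.  */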
20163 
20164 static bool
20165 aarch64_evpc_ext (struct expand_vec_perm_d *d)
20166 {
20167   HOST_WIDE_INT location;
20168   rtx offset;
20169 
20170   /* The first element always refers to the first vector.
20171      Check if the extracted indices are increasing by one.  */
20172   if (d->vec_flags == VEC_SVE_PRED
20173       || !d->perm[0].is_constant (&location)
20174       || !d->perm.series_p (0, 1, location, 1))
20175     return false;
20176 
20177   /* Success! */
20178   if (d->testing_p)
20179     return true;
20180 
20181   /* The case where (location == 0) is a no-op for both big- and little-endian,
20182      and is removed by the mid-end at optimization levels -O1 and higher.
20183 
20184      We don't need a big-endian lane correction for SVE; see the comment
20185      at the head of aarch64-sve.md for details.  */
20186   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
20187     {
20188       /* After setup, we want the high elements of the first vector (stored
20189          at the LSB end of the register), and the low elements of the second
20190          vector (stored at the MSB end of the register). So swap.  */
20191       std::swap (d->op0, d->op1);
20192       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
20193 	 to_constant () is safe since this is restricted to Advanced SIMD
20194 	 vectors.  */
20195       location = d->perm.length ().to_constant () - location;
20196     }
20197 
20198   offset = GEN_INT (location);
20199   emit_set_insn (d->target,
20200 		 gen_rtx_UNSPEC (d->vmode,
20201 				 gen_rtvec (3, d->op0, d->op1, offset),
20202 				 UNSPEC_EXT));
20203   return true;
20204 }
20205 
20206 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
20207    within each 64-bit, 32-bit or 16-bit granule.  */
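/* For example, a V8HI selector of { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the
   16-bit elements within each 64-bit granule and so maps to REV64.  */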
20208 
20209 static bool
20210 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
20211 {
20212   HOST_WIDE_INT diff;
20213   unsigned int i, size, unspec;
20214   machine_mode pred_mode;
20215 
20216   if (d->vec_flags == VEC_SVE_PRED
20217       || !d->one_vector_p
20218       || !d->perm[0].is_constant (&diff)
20219       || !diff)
20220     return false;
20221 
20222   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
20223   if (size == 8)
20224     {
20225       unspec = UNSPEC_REV64;
20226       pred_mode = VNx2BImode;
20227     }
20228   else if (size == 4)
20229     {
20230       unspec = UNSPEC_REV32;
20231       pred_mode = VNx4BImode;
20232     }
20233   else if (size == 2)
20234     {
20235       unspec = UNSPEC_REV16;
20236       pred_mode = VNx8BImode;
20237     }
20238   else
20239     return false;
20240 
20241   unsigned int step = diff + 1;
20242   for (i = 0; i < step; ++i)
20243     if (!d->perm.series_p (i, step, diff - i, step))
20244       return false;
20245 
20246   /* Success! */
20247   if (d->testing_p)
20248     return true;
20249 
20250   if (d->vec_flags == VEC_SVE_DATA)
20251     {
20252       machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
20253       rtx target = gen_reg_rtx (int_mode);
20254       if (BYTES_BIG_ENDIAN)
20255 	/* The act of taking a subreg between INT_MODE and d->vmode
20256 	   is itself a reversing operation on big-endian targets;
20257 	   see the comment at the head of aarch64-sve.md for details.
20258 	   First reinterpret OP0 as INT_MODE without using a subreg
20259 	   and without changing the contents.  */
20260 	emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
20261       else
20262 	{
20263 	  /* For SVE we use REV[BHW] unspecs derived from the element size
20264 	     of d->vmode and vector modes whose elements have SIZE bytes.
20265 	     This ensures that the vector modes match the predicate modes.  */
20266 	  int unspec = aarch64_sve_rev_unspec (d->vmode);
20267 	  rtx pred = aarch64_ptrue_reg (pred_mode);
20268 	  emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
20269 				       gen_lowpart (int_mode, d->op0)));
20270 	}
20271       emit_move_insn (d->target, gen_lowpart (d->vmode, target));
20272       return true;
20273     }
20274   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
20275   emit_set_insn (d->target, src);
20276   return true;
20277 }
20278 
20279 /* Recognize patterns for the REV insn, which reverses elements within
20280    a full vector.  */
20281 
20282 static bool
20283 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
20284 {
20285   poly_uint64 nelt = d->perm.length ();
20286 
20287   if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
20288     return false;
20289 
20290   if (!d->perm.series_p (0, 1, nelt - 1, -1))
20291     return false;
20292 
20293   /* Success! */
20294   if (d->testing_p)
20295     return true;
20296 
20297   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
20298   emit_set_insn (d->target, src);
20299   return true;
20300 }
20301 
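/* Recognize selectors that broadcast a single element, i.e. selectors whose
   encoding consists of one constant index repeated for every element, and
   implement them using a DUP of that lane.  */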
20302 static bool
20303 aarch64_evpc_dup (struct expand_vec_perm_d *d)
20304 {
20305   rtx out = d->target;
20306   rtx in0;
20307   HOST_WIDE_INT elt;
20308   machine_mode vmode = d->vmode;
20309   rtx lane;
20310 
20311   if (d->vec_flags == VEC_SVE_PRED
20312       || d->perm.encoding ().encoded_nelts () != 1
20313       || !d->perm[0].is_constant (&elt))
20314     return false;
20315 
20316   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
20317     return false;
20318 
20319   /* Success! */
20320   if (d->testing_p)
20321     return true;
20322 
20323   /* The generic preparation in aarch64_expand_vec_perm_const_1
20324      swaps the operand order and the permute indices if it finds
20325      d->perm[0] to be in the second operand.  Thus, we can always
20326      use d->op0 and need not do any extra arithmetic to get the
20327      correct lane number.  */
20328   in0 = d->op0;
20329   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
20330 
20331   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20332   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20333   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
20334   return true;
20335 }
20336 
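/* Fall back to a general Advanced SIMD TBL lookup: force the constant
   selector into a register of byte indices and let aarch64_expand_vec_perm_1
   emit the TBL sequence.  */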
20337 static bool
20338 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20339 {
20340   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
20341   machine_mode vmode = d->vmode;
20342 
20343   /* Make sure that the indices are constant.  */
20344   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20345   for (unsigned int i = 0; i < encoded_nelts; ++i)
20346     if (!d->perm[i].is_constant ())
20347       return false;
20348 
20349   if (d->testing_p)
20350     return true;
20351 
20352   /* Generic code will try constant permutation twice: once with the
20353      original mode and again with the elements lowered to QImode.
20354      So wait and don't do the selector expansion ourselves.  */
20355   if (vmode != V8QImode && vmode != V16QImode)
20356     return false;
20357 
20358   /* to_constant is safe since this routine is specific to Advanced SIMD
20359      vectors.  */
20360   unsigned int nelt = d->perm.length ().to_constant ();
20361   for (unsigned int i = 0; i < nelt; ++i)
20362     /* If big-endian and two vectors we end up with a weird mixed-endian
20363        mode on NEON.  Reverse the index within each word but not the word
20364        itself.  to_constant is safe because we checked is_constant above.  */
20365     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20366 			? d->perm[i].to_constant () ^ (nelt - 1)
20367 			: d->perm[i].to_constant ());
20368 
20369   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20370   sel = force_reg (vmode, sel);
20371 
20372   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20373   return true;
20374 }
20375 
20376 /* Try to implement D using an SVE TBL instruction.  */
20377 
20378 static bool
20379 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20380 {
20381   unsigned HOST_WIDE_INT nelt;
20382 
20383   /* Permuting two variable-length vectors could overflow the
20384      index range.  */
20385   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20386     return false;
20387 
20388   if (d->testing_p)
20389     return true;
20390 
20391   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
20392   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
20393   if (d->one_vector_p)
20394     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20395   else
20396     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
20397   return true;
20398 }
20399 
20400 /* Try to implement D using SVE SEL instruction.  */
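/* This matches selectors in which element I is either I (take the element
   from the first input) or LEN + I (take it from the second input), which is
   an element-wise select under a constant predicate.  */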
20401 
20402 static bool
20403 aarch64_evpc_sel (struct expand_vec_perm_d *d)
20404 {
20405   machine_mode vmode = d->vmode;
20406   int unit_size = GET_MODE_UNIT_SIZE (vmode);
20407 
20408   if (d->vec_flags != VEC_SVE_DATA
20409       || unit_size > 8)
20410     return false;
20411 
20412   int n_patterns = d->perm.encoding ().npatterns ();
20413   poly_int64 vec_len = d->perm.length ();
20414 
20415   for (int i = 0; i < n_patterns; ++i)
20416     if (!known_eq (d->perm[i], i)
20417 	&& !known_eq (d->perm[i], vec_len + i))
20418       return false;
20419 
20420   for (int i = n_patterns; i < n_patterns * 2; i++)
20421     if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20422 	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20423       return false;
20424 
20425   if (d->testing_p)
20426     return true;
20427 
20428   machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
20429 
20430   /* Build a predicate that is true when op0 elements should be used.  */
20431   rtx_vector_builder builder (pred_mode, n_patterns, 2);
20432   for (int i = 0; i < n_patterns * 2; i++)
20433     {
20434       rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20435 					  : CONST0_RTX (BImode);
20436       builder.quick_push (elem);
20437     }
20438 
20439   rtx const_vec = builder.build ();
20440   rtx pred = force_reg (pred_mode, const_vec);
20441   /* TARGET = PRED ? OP0 : OP1.  */
20442   emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
20443   return true;
20444 }
20445 
20446 static bool
20447 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20448 {
20449   /* The pattern matching functions above are written to look for a small
20450      number to begin the sequence (0, 1, N/2).  If we begin with an index
20451      from the second operand, we can swap the operands.  */
20452   poly_int64 nelt = d->perm.length ();
20453   if (known_ge (d->perm[0], nelt))
20454     {
20455       d->perm.rotate_inputs (1);
20456       std::swap (d->op0, d->op1);
20457     }
20458 
20459   if ((d->vec_flags == VEC_ADVSIMD
20460        || d->vec_flags == VEC_SVE_DATA
20461        || d->vec_flags == VEC_SVE_PRED)
20462       && known_gt (nelt, 1))
20463     {
20464       if (aarch64_evpc_rev_local (d))
20465 	return true;
20466       else if (aarch64_evpc_rev_global (d))
20467 	return true;
20468       else if (aarch64_evpc_ext (d))
20469 	return true;
20470       else if (aarch64_evpc_dup (d))
20471 	return true;
20472       else if (aarch64_evpc_zip (d))
20473 	return true;
20474       else if (aarch64_evpc_uzp (d))
20475 	return true;
20476       else if (aarch64_evpc_trn (d))
20477 	return true;
20478       else if (aarch64_evpc_sel (d))
20479 	return true;
20480       if (d->vec_flags == VEC_SVE_DATA)
20481 	return aarch64_evpc_sve_tbl (d);
20482       else if (d->vec_flags == VEC_ADVSIMD)
20483 	return aarch64_evpc_tbl (d);
20484     }
20485   return false;
20486 }
20487 
20488 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
20489 
20490 static bool
20491 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20492 				  rtx op1, const vec_perm_indices &sel)
20493 {
20494   struct expand_vec_perm_d d;
20495 
20496   /* Check whether the mask can be applied to a single vector.  */
20497   if (sel.ninputs () == 1
20498       || (op0 && rtx_equal_p (op0, op1)))
20499     d.one_vector_p = true;
20500   else if (sel.all_from_input_p (0))
20501     {
20502       d.one_vector_p = true;
20503       op1 = op0;
20504     }
20505   else if (sel.all_from_input_p (1))
20506     {
20507       d.one_vector_p = true;
20508       op0 = op1;
20509     }
20510   else
20511     d.one_vector_p = false;
20512 
20513   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
20514 		     sel.nelts_per_input ());
20515   d.vmode = vmode;
20516   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
20517   d.target = target;
20518   d.op0 = op0;
20519   d.op1 = op1;
20520   d.testing_p = !target;
20521 
20522   if (!d.testing_p)
20523     return aarch64_expand_vec_perm_const_1 (&d);
20524 
20525   rtx_insn *last = get_last_insn ();
20526   bool ret = aarch64_expand_vec_perm_const_1 (&d);
20527   gcc_assert (last == get_last_insn ());
20528 
20529   return ret;
20530 }
20531 
20532 /* Generate a byte permute mask for a register of mode MODE,
20533    which has NUNITS units.  */
20534 
20535 rtx
20536 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
20537 {
20538   /* We have to reverse each vector because we don't have
20539      a permuted load that can reverse-load according to ABI rules.  */
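  /* For example, for V8HImode the generated mask is { 1, 0, 3, 2, ..., 15, 14 },
     i.e. the two bytes within each 16-bit element are swapped.  */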
20540   rtx mask;
20541   rtvec v = rtvec_alloc (16);
20542   unsigned int i, j;
20543   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
20544 
20545   gcc_assert (BYTES_BIG_ENDIAN);
20546   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
20547 
20548   for (i = 0; i < nunits; i++)
20549     for (j = 0; j < usize; j++)
20550       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
20551   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
20552   return force_reg (V16QImode, mask);
20553 }
20554 
20555 /* Expand an SVE integer comparison using the SVE equivalent of:
20556 
20557      (set TARGET (CODE OP0 OP1)).  */
20558 
20559 void
20560 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
20561 {
20562   machine_mode pred_mode = GET_MODE (target);
20563   machine_mode data_mode = GET_MODE (op0);
20564   rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
20565 				      op0, op1);
20566   if (!rtx_equal_p (target, res))
20567     emit_move_insn (target, res);
20568 }
20569 
20570 /* Return the UNSPEC_COND_* code for comparison CODE.  */
20571 
20572 static unsigned int
20573 aarch64_unspec_cond_code (rtx_code code)
20574 {
20575   switch (code)
20576     {
20577     case NE:
20578       return UNSPEC_COND_FCMNE;
20579     case EQ:
20580       return UNSPEC_COND_FCMEQ;
20581     case LT:
20582       return UNSPEC_COND_FCMLT;
20583     case GT:
20584       return UNSPEC_COND_FCMGT;
20585     case LE:
20586       return UNSPEC_COND_FCMLE;
20587     case GE:
20588       return UNSPEC_COND_FCMGE;
20589     case UNORDERED:
20590       return UNSPEC_COND_FCMUO;
20591     default:
20592       gcc_unreachable ();
20593     }
20594 }
20595 
20596 /* Emit:
20597 
20598       (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20599 
20600    where <X> is the operation associated with comparison CODE.
20601    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
20602 
20603 static void
20604 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
20605 			  bool known_ptrue_p, rtx op0, rtx op1)
20606 {
20607   rtx flag = gen_int_mode (known_ptrue_p, SImode);
20608   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
20609 			       gen_rtvec (4, pred, flag, op0, op1),
20610 			       aarch64_unspec_cond_code (code));
20611   emit_set_insn (target, unspec);
20612 }
20613 
20614 /* Emit the SVE equivalent of:
20615 
20616       (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
20617       (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
20618       (set TARGET (ior:PRED_MODE TMP1 TMP2))
20619 
20620    where <Xi> is the operation associated with comparison CODEi.
20621    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
20622 
20623 static void
20624 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
20625 			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
20626 {
20627   machine_mode pred_mode = GET_MODE (pred);
20628   rtx tmp1 = gen_reg_rtx (pred_mode);
20629   aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
20630   rtx tmp2 = gen_reg_rtx (pred_mode);
20631   aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
20632   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
20633 }
20634 
20635 /* Emit the SVE equivalent of:
20636 
20637       (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20638       (set TARGET (not TMP))
20639 
20640    where <X> is the operation associated with comparison CODE.
20641    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
20642 
20643 static void
20644 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
20645 				 bool known_ptrue_p, rtx op0, rtx op1)
20646 {
20647   machine_mode pred_mode = GET_MODE (pred);
20648   rtx tmp = gen_reg_rtx (pred_mode);
20649   aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
20650   aarch64_emit_unop (target, one_cmpl_optab, tmp);
20651 }
20652 
20653 /* Expand an SVE floating-point comparison using the SVE equivalent of:
20654 
20655      (set TARGET (CODE OP0 OP1))
20656 
20657    If CAN_INVERT_P is true, the caller can also handle inverted results;
20658    return true if the result is in fact inverted.  */
20659 
20660 bool
20661 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
20662 				  rtx op0, rtx op1, bool can_invert_p)
20663 {
20664   machine_mode pred_mode = GET_MODE (target);
20665   machine_mode data_mode = GET_MODE (op0);
20666 
20667   rtx ptrue = aarch64_ptrue_reg (pred_mode);
20668   switch (code)
20669     {
20670     case UNORDERED:
20671       /* UNORDERED has no immediate form.  */
20672       op1 = force_reg (data_mode, op1);
20673       /* fall through */
20674     case LT:
20675     case LE:
20676     case GT:
20677     case GE:
20678     case EQ:
20679     case NE:
20680       {
20681 	/* There is native support for the comparison.  */
20682 	aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
20683 	return false;
20684       }
20685 
20686     case LTGT:
20687       /* This is a trapping operation (LT or GT).  */
20688       aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
20689       return false;
20690 
20691     case UNEQ:
20692       if (!flag_trapping_math)
20693 	{
20694 	  /* This would trap for signaling NaNs.  */
20695 	  op1 = force_reg (data_mode, op1);
20696 	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
20697 					ptrue, true, op0, op1);
20698 	  return false;
20699 	}
20700       /* fall through */
20701     case UNLT:
20702     case UNLE:
20703     case UNGT:
20704     case UNGE:
20705       if (flag_trapping_math)
20706 	{
20707 	  /* Work out which elements are ordered.  */
20708 	  rtx ordered = gen_reg_rtx (pred_mode);
20709 	  op1 = force_reg (data_mode, op1);
20710 	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
20711 					   ptrue, true, op0, op1);
20712 
20713 	  /* Test the opposite condition for the ordered elements,
20714 	     then invert the result.  */
20715 	  if (code == UNEQ)
20716 	    code = NE;
20717 	  else
20718 	    code = reverse_condition_maybe_unordered (code);
20719 	  if (can_invert_p)
20720 	    {
20721 	      aarch64_emit_sve_fp_cond (target, code,
20722 					ordered, false, op0, op1);
20723 	      return true;
20724 	    }
20725 	  aarch64_emit_sve_invert_fp_cond (target, code,
20726 					   ordered, false, op0, op1);
20727 	  return false;
20728 	}
20729       break;
20730 
20731     case ORDERED:
20732       /* ORDERED has no immediate form.  */
20733       op1 = force_reg (data_mode, op1);
20734       break;
20735 
20736     default:
20737       gcc_unreachable ();
20738     }
20739 
20740   /* There is native support for the inverse comparison.  */
20741   code = reverse_condition_maybe_unordered (code);
20742   if (can_invert_p)
20743     {
20744       aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
20745       return true;
20746     }
20747   aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
20748   return false;
20749 }
20750 
20751 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
20752    of the data being selected and CMP_MODE is the mode of the values being
20753    compared.  */
20754 
20755 void
20756 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
20757 			  rtx *ops)
20758 {
20759   machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
20760   rtx pred = gen_reg_rtx (pred_mode);
20761   if (FLOAT_MODE_P (cmp_mode))
20762     {
20763       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
20764 					    ops[4], ops[5], true))
20765 	std::swap (ops[1], ops[2]);
20766     }
20767   else
20768     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
20769 
20770   if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
20771     ops[1] = force_reg (data_mode, ops[1]);
20772   /* The "false" value can only be zero if the "true" value is a constant.  */
20773   if (register_operand (ops[1], data_mode)
20774       || !aarch64_simd_reg_or_zero (ops[2], data_mode))
20775     ops[2] = force_reg (data_mode, ops[2]);
20776 
20777   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
20778   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
20779 }
20780 
20781 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
20782    true.  However, due to issues with register allocation it is preferable
20783    to avoid tying integer scalar and FP scalar modes.  Executing integer
20784    operations in general registers is better than treating them as scalar
20785    vector operations.  This reduces latency and avoids redundant int<->FP
20786    moves.  So tie modes if they are either the same class, or vector modes
20787    with other vector modes, vector structs or any scalar mode.  */
20788 
20789 static bool
20790 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
20791 {
20792   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
20793     return true;
20794 
20795   /* We specifically want to allow elements of "structure" modes to
20796      be tieable to the structure.  This more general condition allows
20797      other rarer situations too.  The reason we don't extend this to
20798      predicate modes is that there are no predicate structure modes
20799      nor any specific instructions for extracting part of a predicate
20800      register.  */
20801   if (aarch64_vector_data_mode_p (mode1)
20802       && aarch64_vector_data_mode_p (mode2))
20803     return true;
20804 
20805   /* Also allow any scalar modes with vectors.  */
20806   if (aarch64_vector_mode_supported_p (mode1)
20807       || aarch64_vector_mode_supported_p (mode2))
20808     return true;
20809 
20810   return false;
20811 }
20812 
20813 /* Return a new RTX holding the result of moving POINTER forward by
20814    AMOUNT bytes.  */
20815 
20816 static rtx
20817 aarch64_move_pointer (rtx pointer, poly_int64 amount)
20818 {
20819   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
20820 
20821   return adjust_automodify_address (pointer, GET_MODE (pointer),
20822 				    next, amount);
20823 }
20824 
20825 /* Return a new RTX holding the result of moving POINTER forward by the
20826    size of the mode it points to.  */
20827 
20828 static rtx
20829 aarch64_progress_pointer (rtx pointer)
20830 {
20831   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
20832 }
20833 
20834 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
20835    MODE bytes.  */
20836 
20837 static void
20838 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
20839 					      machine_mode mode)
20840 {
20841   rtx reg = gen_reg_rtx (mode);
20842 
20843   /* "Cast" the pointers to the correct mode.  */
20844   *src = adjust_address (*src, mode, 0);
20845   *dst = adjust_address (*dst, mode, 0);
20846   /* Emit the memcpy.  */
20847   emit_move_insn (reg, *src);
20848   emit_move_insn (*dst, reg);
20849   /* Move the pointers forward.  */
20850   *src = aarch64_progress_pointer (*src);
20851   *dst = aarch64_progress_pointer (*dst);
20852 }
20853 
20854 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
20855    we succeed, otherwise return false.  */
20856 
20857 bool
20858 aarch64_expand_cpymem (rtx *operands)
20859 {
20860   int n, mode_bits;
20861   rtx dst = operands[0];
20862   rtx src = operands[1];
20863   rtx base;
20864   machine_mode cur_mode = BLKmode, next_mode;
20865   bool speed_p = !optimize_function_for_size_p (cfun);
20866 
20867   /* When optimizing for size, give a better estimate of the length of a
20868      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
20869      will always require an even number of instructions.  Each operation
20870      requires both a load and a store, so divide the maximum number by 2.  */
20871   int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
20872 
20873   /* We can't do anything smart if the amount to copy is not constant.  */
20874   if (!CONST_INT_P (operands[2]))
20875     return false;
20876 
20877   n = INTVAL (operands[2]);
20878 
20879   /* Try to keep the number of instructions low.  For all cases we will do at
20880      most two moves for the residual amount, since we'll always overlap the
20881      remainder.  */
20882   if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
20883     return false;
20884 
20885   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
20886   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
20887 
20888   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
20889   src = adjust_automodify_address (src, VOIDmode, base, 0);
20890 
20891   /* Convert n to bits to make the rest of the code simpler.  */
20892   n = n * BITS_PER_UNIT;
20893 
20894   /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
20895      larger than TImode, but we should not use them for loads/stores here.  */
20896   const int copy_limit = GET_MODE_BITSIZE (TImode);
20897 
20898   while (n > 0)
20899     {
20900       /* Find the largest mode in which to do the copy without over-reading
20901 	 or over-writing.  */
20902       opt_scalar_int_mode mode_iter;
20903       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
20904 	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
20905 	  cur_mode = mode_iter.require ();
20906 
20907       gcc_assert (cur_mode != BLKmode);
20908 
20909       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
20910       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
20911 
20912       n -= mode_bits;
20913 
20914       /* Do certain trailing copies as overlapping if it's going to be
20915 	 cheaper, i.e. fewer instructions.  For instance, for a 15-byte copy
20916 	 it is more efficient to do two overlapping 8-byte copies than
20917 	 8 + 6 + 1.  */
20918       if (n > 0 && n <= 8 * BITS_PER_UNIT)
20919 	{
20920 	  next_mode = smallest_mode_for_size (n, MODE_INT);
20921 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
20922 	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
20923 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
20924 	  n = n_bits;
20925 	}
20926     }
20927 
20928   return true;
20929 }
20930 
20931 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
20932    SImode stores.  Handle the case when the constant has identical
20933    bottom and top halves.  This is beneficial when the two stores can be
20934    merged into an STP and we avoid synthesising potentially expensive
20935    immediates twice.  Return true if such a split is possible.  */
20936 
20937 bool
20938 aarch64_split_dimode_const_store (rtx dst, rtx src)
20939 {
20940   rtx lo = gen_lowpart (SImode, src);
20941   rtx hi = gen_highpart_mode (SImode, DImode, src);
20942 
20943   bool size_p = optimize_function_for_size_p (cfun);
20944 
20945   if (!rtx_equal_p (lo, hi))
20946     return false;
20947 
20948   unsigned int orig_cost
20949     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
20950   unsigned int lo_cost
20951     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
20952 
20953   /* We want to transform:
20954      MOV	x1, 49370
20955      MOVK	x1, 0x140, lsl 16
20956      MOVK	x1, 0xc0da, lsl 32
20957      MOVK	x1, 0x140, lsl 48
20958      STR	x1, [x0]
20959    into:
20960      MOV	w1, 49370
20961      MOVK	w1, 0x140, lsl 16
20962      STP	w1, w1, [x0]
20963    So we want to perform this only when we save two instructions
20964    or more.  When optimizing for size, however, accept any code size
20965    savings we can.  */
20966   if (size_p && orig_cost <= lo_cost)
20967     return false;
20968 
20969   if (!size_p
20970       && (orig_cost <= lo_cost + 1))
20971     return false;
20972 
20973   rtx mem_lo = adjust_address (dst, SImode, 0);
20974   if (!aarch64_mem_pair_operand (mem_lo, SImode))
20975     return false;
20976 
20977   rtx tmp_reg = gen_reg_rtx (SImode);
20978   aarch64_expand_mov_immediate (tmp_reg, lo);
20979   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20980   /* Don't emit an explicit store pair as this may not always be profitable.
20981      Let the sched-fusion logic decide whether to merge them.  */
20982   emit_move_insn (mem_lo, tmp_reg);
20983   emit_move_insn (mem_hi, tmp_reg);
20984 
20985   return true;
20986 }
20987 
20988 /* Generate RTL for a conditional branch with rtx comparison CODE in
20989    mode CC_MODE.  The destination of the unlikely conditional branch
20990    is LABEL_REF.  */
20991 
20992 void
20993 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20994 			      rtx label_ref)
20995 {
20996   rtx x;
20997   x = gen_rtx_fmt_ee (code, VOIDmode,
20998 		      gen_rtx_REG (cc_mode, CC_REGNUM),
20999 		      const0_rtx);
21000 
21001   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21002 			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
21003 			    pc_rtx);
21004   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21005 }
21006 
21007 /* Generate DImode scratch registers for 128-bit (TImode) addition.
21008 
21009    OP1 represents the TImode destination operand 1
21010    OP2 represents the TImode destination operand 2
21011    LOW_DEST represents the low half (DImode) of TImode operand 0
21012    LOW_IN1 represents the low half (DImode) of TImode operand 1
21013    LOW_IN2 represents the low half (DImode) of TImode operand 2
21014    HIGH_DEST represents the high half (DImode) of TImode operand 0
21015    HIGH_IN1 represents the high half (DImode) of TImode operand 1
21016    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
21017 
21018 void
21019 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21020 			    rtx *low_in1, rtx *low_in2,
21021 			    rtx *high_dest, rtx *high_in1,
21022 			    rtx *high_in2)
21023 {
21024   *low_dest = gen_reg_rtx (DImode);
21025   *low_in1 = gen_lowpart (DImode, op1);
21026   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21027 				  subreg_lowpart_offset (DImode, TImode));
21028   *high_dest = gen_reg_rtx (DImode);
21029   *high_in1 = gen_highpart (DImode, op1);
21030   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21031 				   subreg_highpart_offset (DImode, TImode));
21032 }
21033 
21034 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
21035 
21036    This function differs from 'aarch64_addti_scratch_regs' in that
21037    OP1 can be an immediate constant (zero). We must call
21038    subreg_highpart_offset with DImode and TImode arguments, otherwise
21039    VOIDmode will be used for the const_int, which generates an internal
21040    error from subreg_size_highpart_offset, which does not expect a size of zero.
21041 
21042    OP1 represents the TImode destination operand 1
21043    OP2 represents the TImode destination operand 2
21044    LOW_DEST represents the low half (DImode) of TImode operand 0
21045    LOW_IN1 represents the low half (DImode) of TImode operand 1
21046    LOW_IN2 represents the low half (DImode) of TImode operand 2
21047    HIGH_DEST represents the high half (DImode) of TImode operand 0
21048    HIGH_IN1 represents the high half (DImode) of TImode operand 1
21049    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
21050 
21051 
21052 void
21053 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21054 			     rtx *low_in1, rtx *low_in2,
21055 			     rtx *high_dest, rtx *high_in1,
21056 			     rtx *high_in2)
21057 {
21058   *low_dest = gen_reg_rtx (DImode);
21059   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
21060 				  subreg_lowpart_offset (DImode, TImode));
21061 
21062   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21063 				  subreg_lowpart_offset (DImode, TImode));
21064   *high_dest = gen_reg_rtx (DImode);
21065 
21066   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
21067 				   subreg_highpart_offset (DImode, TImode));
21068   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21069 				   subreg_highpart_offset (DImode, TImode));
21070 }
21071 
21072 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
21073 
21074    OP0 represents the TImode destination operand 0
21075    LOW_DEST represents the low half (DImode) of TImode operand 0
21076    LOW_IN1 represents the low half (DImode) of TImode operand 1
21077    LOW_IN2 represents the low half (DImode) of TImode operand 2
21078    HIGH_DEST represents the high half (DImode) of TImode operand 0
21079    HIGH_IN1 represents the high half (DImode) of TImode operand 1
21080    HIGH_IN2 represents the high half (DImode) of TImode operand 2
21081    UNSIGNED_P is true if the operation is being performed on unsigned
21082    values.  */
21083 void
21084 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
21085 		       rtx low_in2, rtx high_dest, rtx high_in1,
21086 		       rtx high_in2, bool unsigned_p)
21087 {
21088   if (low_in2 == const0_rtx)
21089     {
21090       low_dest = low_in1;
21091       high_in2 = force_reg (DImode, high_in2);
21092       if (unsigned_p)
21093 	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
21094       else
21095 	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
21096     }
21097   else
21098     {
21099       if (aarch64_plus_immediate (low_in2, DImode))
21100 	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
21101 					    GEN_INT (-INTVAL (low_in2))));
21102       else
21103 	{
21104 	  low_in2 = force_reg (DImode, low_in2);
21105 	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
21106 	}
21107       high_in2 = force_reg (DImode, high_in2);
21108 
21109       if (unsigned_p)
21110 	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
21111       else
21112 	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
21113     }
21114 
21115   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
21116   emit_move_insn (gen_highpart (DImode, op0), high_dest);
21117 
21118 }
21119 
21120 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
21121 
21122 static unsigned HOST_WIDE_INT
21123 aarch64_asan_shadow_offset (void)
21124 {
21125   if (TARGET_ILP32)
21126     return (HOST_WIDE_INT_1 << 29);
21127   else
21128     return (HOST_WIDE_INT_1 << 36);
21129 }
21130 
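/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare chain: emit preparation statements to *PREP_SEQ and
   the comparison itself to *GEN_SEQ, and return the comparison rtx that
   subsequent CCMPs will test, or NULL_RTX if the comparison cannot be
   handled.  */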
21131 static rtx
21132 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
21133 			int code, tree treeop0, tree treeop1)
21134 {
21135   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21136   rtx op0, op1;
21137   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21138   insn_code icode;
21139   struct expand_operand ops[4];
21140 
21141   start_sequence ();
21142   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21143 
21144   op_mode = GET_MODE (op0);
21145   if (op_mode == VOIDmode)
21146     op_mode = GET_MODE (op1);
21147 
21148   switch (op_mode)
21149     {
21150     case E_QImode:
21151     case E_HImode:
21152     case E_SImode:
21153       cmp_mode = SImode;
21154       icode = CODE_FOR_cmpsi;
21155       break;
21156 
21157     case E_DImode:
21158       cmp_mode = DImode;
21159       icode = CODE_FOR_cmpdi;
21160       break;
21161 
21162     case E_SFmode:
21163       cmp_mode = SFmode;
21164       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21165       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
21166       break;
21167 
21168     case E_DFmode:
21169       cmp_mode = DFmode;
21170       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21171       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
21172       break;
21173 
21174     default:
21175       end_sequence ();
21176       return NULL_RTX;
21177     }
21178 
21179   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
21180   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
21181   if (!op0 || !op1)
21182     {
21183       end_sequence ();
21184       return NULL_RTX;
21185     }
21186   *prep_seq = get_insns ();
21187   end_sequence ();
21188 
21189   create_fixed_operand (&ops[0], op0);
21190   create_fixed_operand (&ops[1], op1);
21191 
21192   start_sequence ();
21193   if (!maybe_expand_insn (icode, 2, ops))
21194     {
21195       end_sequence ();
21196       return NULL_RTX;
21197     }
21198   *gen_seq = get_insns ();
21199   end_sequence ();
21200 
21201   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
21202 			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
21203 }
21204 
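/* Implement TARGET_GEN_CCMP_NEXT.  Expand a conditional compare that chains
   onto the comparison PREV, combining it with the comparison of TREEOP0 and
   TREEOP1 according to BIT_CODE (AND or IOR), and return the resulting
   comparison rtx, or NULL_RTX on failure.  */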
21205 static rtx
21206 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
21207 		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
21208 {
21209   rtx op0, op1, target;
21210   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21211   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21212   insn_code icode;
21213   struct expand_operand ops[6];
21214   int aarch64_cond;
21215 
21216   push_to_sequence (*prep_seq);
21217   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21218 
21219   op_mode = GET_MODE (op0);
21220   if (op_mode == VOIDmode)
21221     op_mode = GET_MODE (op1);
21222 
21223   switch (op_mode)
21224     {
21225     case E_QImode:
21226     case E_HImode:
21227     case E_SImode:
21228       cmp_mode = SImode;
21229       break;
21230 
21231     case E_DImode:
21232       cmp_mode = DImode;
21233       break;
21234 
21235     case E_SFmode:
21236       cmp_mode = SFmode;
21237       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21238       break;
21239 
21240     case E_DFmode:
21241       cmp_mode = DFmode;
21242       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21243       break;
21244 
21245     default:
21246       end_sequence ();
21247       return NULL_RTX;
21248     }
21249 
21250   icode = code_for_ccmp (cc_mode, cmp_mode);
21251 
21252   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
21253   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
21254   if (!op0 || !op1)
21255     {
21256       end_sequence ();
21257       return NULL_RTX;
21258     }
21259   *prep_seq = get_insns ();
21260   end_sequence ();
21261 
21262   target = gen_rtx_REG (cc_mode, CC_REGNUM);
21263   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
21264 
21265   if (bit_code != AND)
21266     {
21267       /* Treat the ccmp patterns as canonical and use them where possible,
21268 	 but fall back to ccmp_rev patterns if there's no other option.  */
21269       rtx_code prev_code = GET_CODE (prev);
21270       machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
21271       if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
21272 	  && !(prev_code == EQ
21273 	       || prev_code == NE
21274 	       || prev_code == ORDERED
21275 	       || prev_code == UNORDERED))
21276 	icode = code_for_ccmp_rev (cc_mode, cmp_mode);
21277       else
21278 	{
21279 	  rtx_code code = reverse_condition (prev_code);
21280 	  prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
21281 	}
21282       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
21283     }
21284 
21285   create_fixed_operand (&ops[0], XEXP (prev, 0));
21286   create_fixed_operand (&ops[1], target);
21287   create_fixed_operand (&ops[2], op0);
21288   create_fixed_operand (&ops[3], op1);
21289   create_fixed_operand (&ops[4], prev);
21290   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
21291 
21292   push_to_sequence (*gen_seq);
21293   if (!maybe_expand_insn (icode, 6, ops))
21294     {
21295       end_sequence ();
21296       return NULL_RTX;
21297     }
21298 
21299   *gen_seq = get_insns ();
21300   end_sequence ();
21301 
21302   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
21303 }
21304 
21305 #undef TARGET_GEN_CCMP_FIRST
21306 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21307 
21308 #undef TARGET_GEN_CCMP_NEXT
21309 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21310 
21311 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
21312    instruction fusion of some sort.  */
21313 
21314 static bool
21315 aarch64_macro_fusion_p (void)
21316 {
21317   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
21318 }
21319 
21320 
21321 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
21322    should be kept together during scheduling.  */
21323 
21324 static bool
21325 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21326 {
21327   rtx set_dest;
21328   rtx prev_set = single_set (prev);
21329   rtx curr_set = single_set (curr);
21330   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
21331   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21332 
21333   if (!aarch64_macro_fusion_p ())
21334     return false;
21335 
21336   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
21337     {
21338       /* We are trying to match:
21339          prev (mov)  == (set (reg r0) (const_int imm16))
21340          curr (movk) == (set (zero_extract (reg r0)
21341                                            (const_int 16)
21342                                            (const_int 16))
21343                              (const_int imm16_1))  */
21344 
21345       set_dest = SET_DEST (curr_set);
21346 
21347       if (GET_CODE (set_dest) == ZERO_EXTRACT
21348           && CONST_INT_P (SET_SRC (curr_set))
21349           && CONST_INT_P (SET_SRC (prev_set))
21350           && CONST_INT_P (XEXP (set_dest, 2))
21351           && INTVAL (XEXP (set_dest, 2)) == 16
21352           && REG_P (XEXP (set_dest, 0))
21353           && REG_P (SET_DEST (prev_set))
21354           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
21355         {
21356           return true;
21357         }
21358     }
21359 
21360   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
21361     {
21362 
21363       /*  We're trying to match:
21364           prev (adrp) == (set (reg r1)
21365                               (high (symbol_ref ("SYM"))))
21366           curr (add) == (set (reg r0)
21367                              (lo_sum (reg r1)
21368                                      (symbol_ref ("SYM"))))
21369           Note that r0 need not necessarily be the same as r1, especially
21370           during pre-regalloc scheduling.  */
21371 
21372       if (satisfies_constraint_Ush (SET_SRC (prev_set))
21373           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21374         {
21375           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
21376               && REG_P (XEXP (SET_SRC (curr_set), 0))
21377               && REGNO (XEXP (SET_SRC (curr_set), 0))
21378                  == REGNO (SET_DEST (prev_set))
21379               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
21380                               XEXP (SET_SRC (curr_set), 1)))
21381             return true;
21382         }
21383     }
21384 
21385   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
21386     {
21387 
21388       /* We're trying to match:
21389          prev (movk) == (set (zero_extract (reg r0)
21390                                            (const_int 16)
21391                                            (const_int 32))
21392                              (const_int imm16_1))
21393          curr (movk) == (set (zero_extract (reg r0)
21394                                            (const_int 16)
21395                                            (const_int 48))
21396                              (const_int imm16_2))  */
21397 
21398       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
21399           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
21400           && REG_P (XEXP (SET_DEST (prev_set), 0))
21401           && REG_P (XEXP (SET_DEST (curr_set), 0))
21402           && REGNO (XEXP (SET_DEST (prev_set), 0))
21403              == REGNO (XEXP (SET_DEST (curr_set), 0))
21404           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
21405           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
21406           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
21407           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
21408           && CONST_INT_P (SET_SRC (prev_set))
21409           && CONST_INT_P (SET_SRC (curr_set)))
21410         return true;
21411 
21412     }
21413   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
21414     {
21415       /* We're trying to match:
21416           prev (adrp) == (set (reg r0)
21417                               (high (symbol_ref ("SYM"))))
21418           curr (ldr) == (set (reg r1)
21419                              (mem (lo_sum (reg r0)
21420                                              (symbol_ref ("SYM")))))
21421                  or
21422           curr (ldr) == (set (reg r1)
21423                              (zero_extend (mem
21424                                            (lo_sum (reg r0)
21425                                                    (symbol_ref ("SYM"))))))  */
21426       if (satisfies_constraint_Ush (SET_SRC (prev_set))
21427           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21428         {
21429           rtx curr_src = SET_SRC (curr_set);
21430 
21431           if (GET_CODE (curr_src) == ZERO_EXTEND)
21432             curr_src = XEXP (curr_src, 0);
21433 
21434           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
21435               && REG_P (XEXP (XEXP (curr_src, 0), 0))
21436               && REGNO (XEXP (XEXP (curr_src, 0), 0))
21437                  == REGNO (SET_DEST (prev_set))
21438               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
21439                               XEXP (SET_SRC (prev_set), 0)))
21440               return true;
21441         }
21442     }
21443 
21444   /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch.  */
21445   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
21446       && prev_set && curr_set && any_condjump_p (curr)
21447       && GET_CODE (SET_SRC (prev_set)) == COMPARE
21448       && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
21449       && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
21450     return true;
21451 
21452   /* Fuse flag-setting ALU instructions and conditional branch.  */
21453   if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
21454       && any_condjump_p (curr))
21455     {
21456       unsigned int condreg1, condreg2;
21457       rtx cc_reg_1;
21458       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
21459       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
21460 
21461       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
21462 	  && prev
21463 	  && modified_in_p (cc_reg_1, prev))
21464 	{
21465 	  enum attr_type prev_type = get_attr_type (prev);
21466 
21467 	  /* FIXME: this misses some instructions that ThunderX considers
21468 	     simple arithmetic; simple shifts are also missed here.  */
21469 	  if (prev_type == TYPE_ALUS_SREG
21470 	      || prev_type == TYPE_ALUS_IMM
21471 	      || prev_type == TYPE_LOGICS_REG
21472 	      || prev_type == TYPE_LOGICS_IMM)
21473 	    return true;
21474 	}
21475     }
21476 
21477   /* Fuse ALU instructions and CBZ/CBNZ.  */
21478   if (prev_set
21479       && curr_set
21480       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
21481       && any_condjump_p (curr))
21482     {
21483       /* We're trying to match:
21484 	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
21485 	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
21486 							 (const_int 0))
21487 						 (label_ref ("SYM"))
21488 						 (pc))  */
21489       if (SET_DEST (curr_set) == (pc_rtx)
21490 	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
21491 	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
21492 	  && REG_P (SET_DEST (prev_set))
21493 	  && REGNO (SET_DEST (prev_set))
21494 	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
21495 	{
21496 	  /* Fuse ALU operations followed by conditional branch instruction.  */
21497 	  switch (get_attr_type (prev))
21498 	    {
21499 	    case TYPE_ALU_IMM:
21500 	    case TYPE_ALU_SREG:
21501 	    case TYPE_ADC_REG:
21502 	    case TYPE_ADC_IMM:
21503 	    case TYPE_ADCS_REG:
21504 	    case TYPE_ADCS_IMM:
21505 	    case TYPE_LOGIC_REG:
21506 	    case TYPE_LOGIC_IMM:
21507 	    case TYPE_CSEL:
21508 	    case TYPE_ADR:
21509 	    case TYPE_MOV_IMM:
21510 	    case TYPE_SHIFT_REG:
21511 	    case TYPE_SHIFT_IMM:
21512 	    case TYPE_BFM:
21513 	    case TYPE_RBIT:
21514 	    case TYPE_REV:
21515 	    case TYPE_EXTEND:
21516 	      return true;
21517 
21518 	    default:;
21519 	    }
21520 	}
21521     }
21522 
21523   return false;
21524 }
21525 
21526 /* Return true iff the instruction fusion described by OP is enabled.  */
21527 
21528 bool
21529 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
21530 {
21531   return (aarch64_tune_params.fusible_ops & op) != 0;
21532 }
21533 
21534 /* If MEM is in the form [base+offset], extract the two parts of the
21535    address and store them in BASE and OFFSET; otherwise return false
21536    after clearing BASE and OFFSET.  */
21537 
21538 bool
21539 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
21540 {
21541   rtx addr;
21542 
21543   gcc_assert (MEM_P (mem));
21544 
21545   addr = XEXP (mem, 0);
21546 
21547   if (REG_P (addr))
21548     {
21549       *base = addr;
21550       *offset = const0_rtx;
21551       return true;
21552     }
21553 
21554   if (GET_CODE (addr) == PLUS
21555       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
21556     {
21557       *base = XEXP (addr, 0);
21558       *offset = XEXP (addr, 1);
21559       return true;
21560     }
21561 
21562   *base = NULL_RTX;
21563   *offset = NULL_RTX;
21564 
21565   return false;
21566 }
21567 
21568 /* Types for scheduling fusion.  */
21569 enum sched_fusion_type
21570 {
21571   SCHED_FUSION_NONE = 0,
21572   SCHED_FUSION_LD_SIGN_EXTEND,
21573   SCHED_FUSION_LD_ZERO_EXTEND,
21574   SCHED_FUSION_LD,
21575   SCHED_FUSION_ST,
21576   SCHED_FUSION_NUM
21577 };
21578 
21579 /* If INSN is a load or store whose address is in the form [base+offset],
21580    extract the two parts into BASE and OFFSET.  Return the scheduling
21581    fusion type of this INSN.  */
21582 
21583 static enum sched_fusion_type
21584 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
21585 {
21586   rtx x, dest, src;
21587   enum sched_fusion_type fusion = SCHED_FUSION_LD;
21588 
21589   gcc_assert (INSN_P (insn));
21590   x = PATTERN (insn);
21591   if (GET_CODE (x) != SET)
21592     return SCHED_FUSION_NONE;
21593 
21594   src = SET_SRC (x);
21595   dest = SET_DEST (x);
21596 
21597   machine_mode dest_mode = GET_MODE (dest);
21598 
21599   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
21600     return SCHED_FUSION_NONE;
21601 
21602   if (GET_CODE (src) == SIGN_EXTEND)
21603     {
21604       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
21605       src = XEXP (src, 0);
21606       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21607 	return SCHED_FUSION_NONE;
21608     }
21609   else if (GET_CODE (src) == ZERO_EXTEND)
21610     {
21611       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
21612       src = XEXP (src, 0);
21613       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21614 	return SCHED_FUSION_NONE;
21615     }
21616 
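  /* Classify the insn as a load (register <- memory) or a store
     (memory <- register or zero) and extract the base and offset
     of its address.  */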
21617   if (GET_CODE (src) == MEM && REG_P (dest))
21618     extract_base_offset_in_addr (src, base, offset);
21619   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
21620     {
21621       fusion = SCHED_FUSION_ST;
21622       extract_base_offset_in_addr (dest, base, offset);
21623     }
21624   else
21625     return SCHED_FUSION_NONE;
21626 
21627   if (*base == NULL_RTX || *offset == NULL_RTX)
21628     fusion = SCHED_FUSION_NONE;
21629 
21630   return fusion;
21631 }
21632 
21633 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
21634 
21635    Currently we only support fusing ldr and str instructions, so FUSION_PRI
21636    and PRI are only calculated for these instructions.  For other instructions,
21637    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
21638    types of instruction fusion can be added by returning different priorities.
21639 
21640    It's important that irrelevant instructions get the largest FUSION_PRI.  */
21641 
21642 static void
21643 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
21644 			       int *fusion_pri, int *pri)
21645 {
21646   int tmp, off_val;
21647   rtx base, offset;
21648   enum sched_fusion_type fusion;
21649 
21650   gcc_assert (INSN_P (insn));
21651 
21652   tmp = max_pri - 1;
21653   fusion = fusion_load_store (insn, &base, &offset);
21654   if (fusion == SCHED_FUSION_NONE)
21655     {
21656       *pri = tmp;
21657       *fusion_pri = tmp;
21658       return;
21659     }
21660 
21661   /* Set FUSION_PRI according to fusion type and base register.  */
21662   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
21663 
21664   /* Calculate PRI.  */
21665   tmp /= 2;
21666 
21667   /* INSN with smaller offset goes first.  */
21668   off_val = (int)(INTVAL (offset));
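  /* Only the low 20 bits of the offset contribute to the priority.  */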
21669   if (off_val >= 0)
21670     tmp -= (off_val & 0xfffff);
21671   else
21672     tmp += ((- off_val) & 0xfffff);
21673 
21674   *pri = tmp;
21675   return;
21676 }
21677 
21678 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
21679    Adjust priority of sha1h instructions so they are scheduled before
21680    other SHA1 instructions.  */
21681 
21682 static int
21683 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
21684 {
21685   rtx x = PATTERN (insn);
21686 
21687   if (GET_CODE (x) == SET)
21688     {
21689       x = SET_SRC (x);
21690 
21691       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
21692 	return priority + 10;
21693     }
21694 
21695   return priority;
21696 }
21697 
21698 /* Given OPERANDS of consecutive load/store, check if we can merge
21699    them into ldp/stp.  LOAD is true if they are load instructions.
21700    MODE is the mode of memory operands.  */
21701 
21702 bool
21703 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
21704 				machine_mode mode)
21705 {
21706   HOST_WIDE_INT offval_1, offval_2, msize;
21707   enum reg_class rclass_1, rclass_2;
21708   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
21709 
21710   if (load)
21711     {
21712       mem_1 = operands[1];
21713       mem_2 = operands[3];
21714       reg_1 = operands[0];
21715       reg_2 = operands[2];
21716       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
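      /* An LDP cannot load into the same register twice.  */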
21717       if (REGNO (reg_1) == REGNO (reg_2))
21718 	return false;
21719     }
21720   else
21721     {
21722       mem_1 = operands[0];
21723       mem_2 = operands[2];
21724       reg_1 = operands[1];
21725       reg_2 = operands[3];
21726     }
21727 
21728   /* The mems cannot be volatile.  */
21729   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
21730     return false;
21731 
21732   /* If we have SImode and slow unaligned ldp,
21733      check that the alignment is at least 8 bytes.  */
21734   if (mode == SImode
21735       && (aarch64_tune_params.extra_tuning_flags
21736           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21737       && !optimize_size
21738       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
21739     return false;
21740 
21741   /* Check if the addresses are in the form of [base+offset].  */
21742   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21743   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
21744     return false;
21745   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21746   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
21747     return false;
21748 
21749   /* Check if the bases are the same.  */
21750   if (!rtx_equal_p (base_1, base_2))
21751     return false;
21752 
21753   /* The operands must be of the same size.  */
21754   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
21755 			 GET_MODE_SIZE (GET_MODE (mem_2))));
21756 
21757   offval_1 = INTVAL (offset_1);
21758   offval_2 = INTVAL (offset_2);
21759   /* We should only be trying this for fixed-sized modes.  There is no
21760      SVE LDP/STP instruction.  */
21761   msize = GET_MODE_SIZE (mode).to_constant ();
21762   /* Check if the offsets are consecutive.  */
21763   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
21764     return false;
21765 
21766   /* Check if the addresses are clobbered by the load.  */
21767   if (load)
21768     {
21769       if (reg_mentioned_p (reg_1, mem_1))
21770 	return false;
21771 
21772       /* In increasing order, the last load can clobber the address.  */
21773       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
21774 	return false;
21775     }
21776 
21777   /* One of the memory accesses must be a mempair operand.
21778      If it is not the first one, they need to be swapped by the
21779      peephole.  */
21780   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
21781        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
21782     return false;
21783 
21784   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
21785     rclass_1 = FP_REGS;
21786   else
21787     rclass_1 = GENERAL_REGS;
21788 
21789   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
21790     rclass_2 = FP_REGS;
21791   else
21792     rclass_2 = GENERAL_REGS;
21793 
21794   /* Check if the registers are of the same class.  */
21795   if (rclass_1 != rclass_2)
21796     return false;
21797 
21798   return true;
21799 }
21800 
21801 /* Given OPERANDS of consecutive load/store that can be merged,
21802    swap them if they are not in ascending order.  */
21803 void
21804 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
21805 {
21806   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
21807   HOST_WIDE_INT offval_1, offval_2;
21808 
21809   if (load)
21810     {
21811       mem_1 = operands[1];
21812       mem_2 = operands[3];
21813     }
21814   else
21815     {
21816       mem_1 = operands[0];
21817       mem_2 = operands[2];
21818     }
21819 
21820   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21821   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21822 
21823   offval_1 = INTVAL (offset_1);
21824   offval_2 = INTVAL (offset_2);
21825 
21826   if (offval_1 > offval_2)
21827     {
21828       /* Irrespective of whether this is a load or a store,
21829 	 we do the same swap.  */
21830       std::swap (operands[0], operands[2]);
21831       std::swap (operands[1], operands[3]);
21832     }
21833 }
21834 
21835 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
21836    comparison between the two.  */
21837 int
21838 aarch64_host_wide_int_compare (const void *x, const void *y)
21839 {
21840   return wi::cmps (* ((const HOST_WIDE_INT *) x),
21841 		   * ((const HOST_WIDE_INT *) y));
21842 }
21843 
21844 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
21845    other pointing to a REG rtx containing an offset, compare the offsets
21846    of the two pairs.
21847 
21848    Return:
21849 
21850 	1 iff offset (X) > offset (Y)
21851 	0 iff offset (X) == offset (Y)
21852 	-1 iff offset (X) < offset (Y)  */
21853 int
21854 aarch64_ldrstr_offset_compare (const void *x, const void *y)
21855 {
21856   const rtx * operands_1 = (const rtx *) x;
21857   const rtx * operands_2 = (const rtx *) y;
21858   rtx mem_1, mem_2, base, offset_1, offset_2;
21859 
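  /* Each operand pair is either (mem, reg) or (reg, mem); pick out the
     MEM from each pair before comparing offsets.  */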
21860   if (MEM_P (operands_1[0]))
21861     mem_1 = operands_1[0];
21862   else
21863     mem_1 = operands_1[1];
21864 
21865   if (MEM_P (operands_2[0]))
21866     mem_2 = operands_2[0];
21867   else
21868     mem_2 = operands_2[1];
21869 
21870   /* Extract the offsets.  */
21871   extract_base_offset_in_addr (mem_1, &base, &offset_1);
21872   extract_base_offset_in_addr (mem_2, &base, &offset_2);
21873 
21874   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
21875 
21876   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
21877 }
21878 
21879 /* Given OPERANDS of consecutive load/store, check if we can merge
21880    them into ldp/stp by adjusting the offset.  LOAD is true if they
21881    are load instructions.  MODE is the mode of memory operands.
21882 
21883    Given the following consecutive stores:
21884 
21885      str  w1, [xb, 0x100]
21886      str  w1, [xb, 0x104]
21887      str  w1, [xb, 0x108]
21888      str  w1, [xb, 0x10c]
21889 
21890    Though the offsets are out of the range supported by stp, we can
21891    still pair them after adjusting the offset, like:
21892 
21893      add  scratch, xb, 0x100
21894      stp  w1, w1, [scratch]
21895      stp  w1, w1, [scratch, 0x8]
21896 
21897    The peephole patterns detecting this opportunity should guarantee
21898    the scratch register is available.  */
21899 
21900 bool
21901 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
21902 				       scalar_mode mode)
21903 {
21904   const int num_insns = 4;
21905   enum reg_class rclass;
21906   HOST_WIDE_INT offvals[num_insns], msize;
21907   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
21908 
21909   if (load)
21910     {
21911       for (int i = 0; i < num_insns; i++)
21912 	{
21913 	  reg[i] = operands[2 * i];
21914 	  mem[i] = operands[2 * i + 1];
21915 
21916 	  gcc_assert (REG_P (reg[i]));
21917 	}
21918 
21919       /* Do not attempt to merge the loads if the loads clobber each other.  */
21920       for (int i = 0; i < 8; i += 2)
21921 	for (int j = i + 2; j < 8; j += 2)
21922 	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
21923 	    return false;
21924     }
21925   else
21926     for (int i = 0; i < num_insns; i++)
21927       {
21928 	mem[i] = operands[2 * i];
21929 	reg[i] = operands[2 * i + 1];
21930       }
21931 
21932   /* Skip if memory operand is by itself valid for ldp/stp.  */
21933   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
21934     return false;
21935 
21936   for (int i = 0; i < num_insns; i++)
21937     {
21938       /* The mems cannot be volatile.  */
21939       if (MEM_VOLATILE_P (mem[i]))
21940 	return false;
21941 
21942       /* Check if the addresses are in the form of [base+offset].  */
21943       extract_base_offset_in_addr (mem[i], base + i, offset + i);
21944       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
21945 	return false;
21946     }
21947 
21948   /* Check if the registers are of the same class.  */
21949   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
21950     ? FP_REGS : GENERAL_REGS;
21951 
21952   for (int i = 1; i < num_insns; i++)
21953     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
21954       {
21955 	if (rclass != FP_REGS)
21956 	  return false;
21957       }
21958     else
21959       {
21960 	if (rclass != GENERAL_REGS)
21961 	  return false;
21962       }
21963 
21964   /* Only the last register in the order in which they occur
21965      may be clobbered by the load.  */
21966   if (rclass == GENERAL_REGS && load)
21967     for (int i = 0; i < num_insns - 1; i++)
21968       if (reg_mentioned_p (reg[i], mem[i]))
21969 	return false;
21970 
21971   /* Check if the bases are the same.  */
21972   for (int i = 0; i < num_insns - 1; i++)
21973     if (!rtx_equal_p (base[i], base[i + 1]))
21974       return false;
21975 
21976   for (int i = 0; i < num_insns; i++)
21977     offvals[i] = INTVAL (offset[i]);
21978 
21979   msize = GET_MODE_SIZE (mode);
21980 
21981   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
21982   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21983 	 aarch64_host_wide_int_compare);
21984 
21985   if (!(offvals[1] == offvals[0] + msize
21986 	&& offvals[3] == offvals[2] + msize))
21987     return false;
21988 
21989   /* Check that offsets are within range of each other.  The ldp/stp
21990      instructions have 7 bit immediate offsets, so use 0x80.  */
21991   if (offvals[2] - offvals[0] >= msize * 0x80)
21992     return false;
21993 
21994   /* The offsets must be aligned with respect to each other.  */
21995   if (offvals[0] % msize != offvals[2] % msize)
21996     return false;
21997 
21998   /* If we have SImode and slow unaligned ldp,
21999      check that the alignment is at least 8 bytes.  */
22000   if (mode == SImode
22001       && (aarch64_tune_params.extra_tuning_flags
22002 	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22003       && !optimize_size
22004       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
22005     return false;
22006 
22007   return true;
22008 }
22009 
22010 /* Given OPERANDS of consecutive load/store, this function pairs them
22011    into LDP/STP after adjusting the offset.  It depends on the fact
22012    that the operands can be sorted so the offsets are correct for STP.
22013    MODE is the mode of memory operands.  CODE is the rtl operator
22014    which should be applied to all memory operands; it is SIGN_EXTEND,
22015    ZERO_EXTEND or UNKNOWN.  */
22016 
22017 bool
22018 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
22019 			     scalar_mode mode, RTX_CODE code)
22020 {
22021   rtx base, offset_1, offset_3, t1, t2;
22022   rtx mem_1, mem_2, mem_3, mem_4;
22023   rtx temp_operands[8];
22024   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
22025 		stp_off_upper_limit, stp_off_lower_limit, msize;
22026 
22027   /* We make changes on a copy as we may still bail out.  */
22028   for (int i = 0; i < 8; i ++)
22029     temp_operands[i] = operands[i];
22030 
22031   /* Sort the operands.  */
22032   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
22033 
22034   /* Copy the memory operands so that if we have to bail for some
22035      reason the original addresses are unchanged.  */
22036   if (load)
22037     {
22038       mem_1 = copy_rtx (temp_operands[1]);
22039       mem_2 = copy_rtx (temp_operands[3]);
22040       mem_3 = copy_rtx (temp_operands[5]);
22041       mem_4 = copy_rtx (temp_operands[7]);
22042     }
22043   else
22044     {
22045       mem_1 = copy_rtx (temp_operands[0]);
22046       mem_2 = copy_rtx (temp_operands[2]);
22047       mem_3 = copy_rtx (temp_operands[4]);
22048       mem_4 = copy_rtx (temp_operands[6]);
22049       gcc_assert (code == UNKNOWN);
22050     }
22051 
22052   extract_base_offset_in_addr (mem_1, &base, &offset_1);
22053   extract_base_offset_in_addr (mem_3, &base, &offset_3);
22054   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
22055 	      && offset_3 != NULL_RTX);
22056 
22057   /* Adjust offset so it can fit in LDP/STP instruction.  */
22058   msize = GET_MODE_SIZE (mode);
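  /* LDP/STP take a 7-bit signed immediate offset scaled by the access
     size, i.e. offsets in the range [-64 * msize, 63 * msize].  */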
22059   stp_off_upper_limit = msize * (0x40 - 1);
22060   stp_off_lower_limit = - msize * 0x40;
22061 
22062   off_val_1 = INTVAL (offset_1);
22063   off_val_3 = INTVAL (offset_3);
22064 
22065   /* The base offset is optimally half way between the two STP/LDP offsets.  */
22066   if (msize <= 4)
22067     base_off = (off_val_1 + off_val_3) / 2;
22068   else
22069     /* However, due to issues with negative LDP/STP offset generation for
22070        larger modes (DF, DI and vector modes), we must not use negative
22071        addresses beyond what 9 signed unadjusted bits can store.  This
22072        provides the most range in this case.  */
22073     base_off = off_val_1;
22074 
22075   /* Adjust the base so that it is aligned with the addresses but still
22076      optimal.  */
22077   if (base_off % msize != off_val_1 % msize)
22078     /* Fix the offset, bearing in mind we want to make it bigger not
22079        smaller.  */
22080     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22081   else if (msize <= 4)
22082     /* The negative range of LDP/STP is one larger than the positive range.  */
22083     base_off += msize;
22084 
22085   /* Check if base offset is too big or too small.  We can attempt to resolve
22086      this issue by setting it to the maximum value and seeing if the offsets
22087      still fit.  */
22088   if (base_off >= 0x1000)
22089     {
22090       base_off = 0x1000 - 1;
22091       /* We must still make sure that the base offset is aligned with respect
22092 	 to the address.  But it may not be made any bigger.  */
22093       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22094     }
22095 
22096   /* Likewise for the case where the base is too small.  */
22097   if (base_off <= -0x1000)
22098     {
22099       base_off = -0x1000 + 1;
22100       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22101     }
22102 
22103   /* Offset of the first STP/LDP.  */
22104   new_off_1 = off_val_1 - base_off;
22105 
22106   /* Offset of the second STP/LDP.  */
22107   new_off_3 = off_val_3 - base_off;
22108 
22109   /* The offsets must be within the range of the LDP/STP instructions.  */
22110   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
22111       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
22112     return false;
22113 
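  /* Rewrite the addresses to use the scratch base register (operands[8])
     plus the small adjusted offsets.  */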
22114   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
22115 						  new_off_1), true);
22116   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
22117 						  new_off_1 + msize), true);
22118   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
22119 						  new_off_3), true);
22120   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
22121 						  new_off_3 + msize), true);
22122 
22123   if (!aarch64_mem_pair_operand (mem_1, mode)
22124       || !aarch64_mem_pair_operand (mem_3, mode))
22125     return false;
22126 
22127   if (code == ZERO_EXTEND)
22128     {
22129       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
22130       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
22131       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
22132       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
22133     }
22134   else if (code == SIGN_EXTEND)
22135     {
22136       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
22137       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
22138       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
22139       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
22140     }
22141 
22142   if (load)
22143     {
22144       operands[0] = temp_operands[0];
22145       operands[1] = mem_1;
22146       operands[2] = temp_operands[2];
22147       operands[3] = mem_2;
22148       operands[4] = temp_operands[4];
22149       operands[5] = mem_3;
22150       operands[6] = temp_operands[6];
22151       operands[7] = mem_4;
22152     }
22153   else
22154     {
22155       operands[0] = mem_1;
22156       operands[1] = temp_operands[1];
22157       operands[2] = mem_2;
22158       operands[3] = temp_operands[3];
22159       operands[4] = mem_3;
22160       operands[5] = temp_operands[5];
22161       operands[6] = mem_4;
22162       operands[7] = temp_operands[7];
22163     }
22164 
22165   /* Emit adjusting instruction.  */
22166   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
22167   /* Emit ldp/stp instructions.  */
22168   t1 = gen_rtx_SET (operands[0], operands[1]);
22169   t2 = gen_rtx_SET (operands[2], operands[3]);
22170   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22171   t1 = gen_rtx_SET (operands[4], operands[5]);
22172   t2 = gen_rtx_SET (operands[6], operands[7]);
22173   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22174   return true;
22175 }
22176 
22177 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
22178    it isn't worth branching around empty masked ops (including masked
22179    stores).  */
22180 
22181 static bool
22182 aarch64_empty_mask_is_expensive (unsigned)
22183 {
22184   return false;
22185 }
22186 
22187 /* Return true if a pseudo register should be created and used to hold
22188    the GOT address for PIC code.  */
22189 
22190 bool
22191 aarch64_use_pseudo_pic_reg (void)
22192 {
22193   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
22194 }
22195 
22196 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
22197 
22198 static int
22199 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
22200 {
22201   switch (XINT (x, 1))
22202     {
22203     case UNSPEC_GOTSMALLPIC:
22204     case UNSPEC_GOTSMALLPIC28K:
22205     case UNSPEC_GOTTINYPIC:
22206       return 0;
22207     default:
22208       break;
22209     }
22210 
22211   return default_unspec_may_trap_p (x, flags);
22212 }
22213 
22214 
22215 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
22216    return the log2 of that value.  Otherwise return -1.  */
22217 
22218 int
22219 aarch64_fpconst_pow_of_2 (rtx x)
22220 {
22221   const REAL_VALUE_TYPE *r;
22222 
22223   if (!CONST_DOUBLE_P (x))
22224     return -1;
22225 
22226   r = CONST_DOUBLE_REAL_VALUE (x);
22227 
22228   if (REAL_VALUE_NEGATIVE (*r)
22229       || REAL_VALUE_ISNAN (*r)
22230       || REAL_VALUE_ISINF (*r)
22231       || !real_isinteger (r, DFmode))
22232     return -1;
22233 
22234   return exact_log2 (real_to_integer (r));
22235 }
22236 
22237 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
22238    power of 2 (i.e. 1/2^n), return the exponent n.  Otherwise return -1.  */
22240 
22241 int
22242 aarch64_fpconst_pow2_recip (rtx x)
22243 {
22244   REAL_VALUE_TYPE r0;
22245 
22246   if (!CONST_DOUBLE_P (x))
22247     return -1;
22248 
22249   r0 = *CONST_DOUBLE_REAL_VALUE (x);
22250   if (exact_real_inverse (DFmode, &r0)
22251       && !REAL_VALUE_NEGATIVE (r0))
22252     {
22253 	int ret = exact_log2 (real_to_integer (&r0));
22254 	if (ret >= 1 && ret <= 32)
22255 	    return ret;
22256     }
22257   return -1;
22258 }
22259 
22260 /* If X is a vector of equal CONST_DOUBLE values and that value is
22261    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
22262 
22263 int
22264 aarch64_vec_fpconst_pow_of_2 (rtx x)
22265 {
22266   int nelts;
22267   if (GET_CODE (x) != CONST_VECTOR
22268       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
22269     return -1;
22270 
22271   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
22272     return -1;
22273 
22274   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
22275   if (firstval <= 0)
22276     return -1;
22277 
22278   for (int i = 1; i < nelts; i++)
22279     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
22280       return -1;
22281 
22282   return firstval;
22283 }
22284 
22285 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
22286    to float.
22287 
22288    __fp16 always promotes through this hook.
22289    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
22290    through the generic excess precision logic rather than here.  */
22291 
22292 static tree
22293 aarch64_promoted_type (const_tree t)
22294 {
22295   if (SCALAR_FLOAT_TYPE_P (t)
22296       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
22297     return float_type_node;
22298 
22299   return NULL_TREE;
22300 }
22301 
22302 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
22303 
22304 static bool
22305 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
22306 			   optimization_type opt_type)
22307 {
22308   switch (op)
22309     {
22310     case rsqrt_optab:
22311       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
22312 
22313     default:
22314       return true;
22315     }
22316 }
22317 
22318 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
22319 
22320 static unsigned int
22321 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22322 					int *offset)
22323 {
22324   /* Polynomial invariant 1 == (VG / 2) - 1.  */
22325   gcc_assert (i == 1);
22326   *factor = 2;
22327   *offset = 1;
22328   return AARCH64_DWARF_VG;
22329 }
22330 
22331 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22332    if MODE is HFmode, and punt to the generic implementation otherwise.  */
22333 
22334 static bool
22335 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
22336 {
22337   return (mode == HFmode
22338 	  ? true
22339 	  : default_libgcc_floating_mode_supported_p (mode));
22340 }
22341 
22342 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22343    if MODE is HFmode, and punt to the generic implementation otherwise.  */
22344 
22345 static bool
22346 aarch64_scalar_mode_supported_p (scalar_mode mode)
22347 {
22348   return (mode == HFmode
22349 	  ? true
22350 	  : default_scalar_mode_supported_p (mode));
22351 }
22352 
22353 /* Set the value of FLT_EVAL_METHOD.
22354    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
22355 
22356     0: evaluate all operations and constants, whose semantic type has at
22357        most the range and precision of type float, to the range and
22358        precision of float; evaluate all other operations and constants to
22359        the range and precision of the semantic type;
22360 
22361     N, where _FloatN is a supported interchange floating type:
22362        evaluate all operations and constants, whose semantic type has at
22363        most the range and precision of _FloatN type, to the range and
22364        precision of the _FloatN type; evaluate all other operations and
22365        constants to the range and precision of the semantic type;
22366 
22367    If we have the ARMv8.2-A extensions then we support _Float16 in native
22368    precision, so we should set this to 16.  Otherwise, we support the type,
22369    but want to evaluate expressions in float precision, so set this to
22370    0.  */
22371 
22372 static enum flt_eval_method
22373 aarch64_excess_precision (enum excess_precision_type type)
22374 {
22375   switch (type)
22376     {
22377       case EXCESS_PRECISION_TYPE_FAST:
22378       case EXCESS_PRECISION_TYPE_STANDARD:
22379 	/* We can calculate either in 16-bit range and precision or
22380 	   32-bit range and precision.  Make that decision based on whether
22381 	   we have native support for the ARMv8.2-A 16-bit floating-point
22382 	   instructions or not.  */
22383 	return (TARGET_FP_F16INST
22384 		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
22385 		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
22386       case EXCESS_PRECISION_TYPE_IMPLICIT:
22387 	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
22388       default:
22389 	gcc_unreachable ();
22390     }
22391   return FLT_EVAL_METHOD_UNPREDICTABLE;
22392 }
22393 
22394 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
22395    scheduled for speculative execution.  Reject the long-running division
22396    and square-root instructions.  */
22397 
22398 static bool
22399 aarch64_sched_can_speculate_insn (rtx_insn *insn)
22400 {
22401   switch (get_attr_type (insn))
22402     {
22403       case TYPE_SDIV:
22404       case TYPE_UDIV:
22405       case TYPE_FDIVS:
22406       case TYPE_FDIVD:
22407       case TYPE_FSQRTS:
22408       case TYPE_FSQRTD:
22409       case TYPE_NEON_FP_SQRT_S:
22410       case TYPE_NEON_FP_SQRT_D:
22411       case TYPE_NEON_FP_SQRT_S_Q:
22412       case TYPE_NEON_FP_SQRT_D_Q:
22413       case TYPE_NEON_FP_DIV_S:
22414       case TYPE_NEON_FP_DIV_D:
22415       case TYPE_NEON_FP_DIV_S_Q:
22416       case TYPE_NEON_FP_DIV_D_Q:
22417 	return false;
22418       default:
22419 	return true;
22420     }
22421 }
22422 
22423 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
22424 
22425 static int
22426 aarch64_compute_pressure_classes (reg_class *classes)
22427 {
22428   int i = 0;
22429   classes[i++] = GENERAL_REGS;
22430   classes[i++] = FP_REGS;
22431   /* PR_REGS isn't a useful pressure class because many predicate pseudo
22432      registers need to go in PR_LO_REGS at some point during their
22433      lifetime.  Splitting it into two halves has the effect of making
22434      all predicates count against PR_LO_REGS, so that we try whenever
22435      possible to restrict the number of live predicates to 8.  This
22436      greatly reduces the amount of spilling in certain loops.  */
22437   classes[i++] = PR_LO_REGS;
22438   classes[i++] = PR_HI_REGS;
22439   return i;
22440 }
22441 
22442 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
22443 
22444 static bool
22445 aarch64_can_change_mode_class (machine_mode from,
22446 			       machine_mode to, reg_class_t)
22447 {
22448   unsigned int from_flags = aarch64_classify_vector_mode (from);
22449   unsigned int to_flags = aarch64_classify_vector_mode (to);
22450 
22451   bool from_sve_p = (from_flags & VEC_ANY_SVE);
22452   bool to_sve_p = (to_flags & VEC_ANY_SVE);
22453 
22454   bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
22455   bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
22456 
22457   bool from_pred_p = (from_flags & VEC_SVE_PRED);
22458   bool to_pred_p = (to_flags & VEC_SVE_PRED);
22459 
22460   /* Don't allow changes between predicate modes and other modes.
22461      Only predicate registers can hold predicate modes and only
22462      non-predicate registers can hold non-predicate modes, so any
22463      attempt to mix them would require a round trip through memory.  */
22464   if (from_pred_p != to_pred_p)
22465     return false;
22466 
22467   /* Don't allow changes between partial SVE modes and other modes.
22468      The contents of partial SVE modes are distributed evenly across
22469      the register, whereas GCC expects them to be clustered together.  */
22470   if (from_partial_sve_p != to_partial_sve_p)
22471     return false;
22472 
22473   /* Similarly reject changes between partial SVE modes that have
22474      different patterns of significant and insignificant bits.  */
22475   if (from_partial_sve_p
22476       && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
22477 	  || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
22478     return false;
22479 
22480   if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
22481     {
22482       /* Don't allow changes between SVE modes and other modes that might
22483 	 be bigger than 128 bits.  In particular, OImode, CImode and XImode
22484 	 divide into 128-bit quantities while SVE modes divide into
22485 	 BITS_PER_SVE_VECTOR quantities.  */
22486       if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
22487 	return false;
22488       if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
22489 	return false;
22490     }
22491 
22492   if (BYTES_BIG_ENDIAN)
22493     {
22494       /* Don't allow changes between SVE data modes and non-SVE modes.
22495 	 See the comment at the head of aarch64-sve.md for details.  */
22496       if (from_sve_p != to_sve_p)
22497 	return false;
22498 
22499       /* Don't allow changes in element size: lane 0 of the new vector
22500 	 would not then be lane 0 of the old vector.  See the comment
22501 	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
22502 	 description.
22503 
22504 	 In the worst case, this forces a register to be spilled in
22505 	 one mode and reloaded in the other, which handles the
22506 	 endianness correctly.  */
22507       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
22508 	return false;
22509     }
22510   return true;
22511 }
22512 
22513 /* Implement TARGET_EARLY_REMAT_MODES.  */
22514 
22515 static void
22516 aarch64_select_early_remat_modes (sbitmap modes)
22517 {
22518   /* SVE values are not normally live across a call, so it should be
22519      worth doing early rematerialization even in VL-specific mode.  */
22520   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
22521     if (aarch64_sve_mode_p ((machine_mode) i))
22522       bitmap_set_bit (modes, i);
22523 }
22524 
22525 /* Override the default target speculation_safe_value.  */
22526 static rtx
22527 aarch64_speculation_safe_value (machine_mode mode,
22528 				rtx result, rtx val, rtx failval)
22529 {
22530   /* Maybe we should warn if falling back to hard barriers.  They are
22531      likely to be noticeably more expensive than the alternative below.  */
22532   if (!aarch64_track_speculation)
22533     return default_speculation_safe_value (mode, result, val, failval);
22534 
22535   if (!REG_P (val))
22536     val = copy_to_mode_reg (mode, val);
22537 
22538   if (!aarch64_reg_or_zero (failval, mode))
22539     failval = copy_to_mode_reg (mode, failval);
22540 
22541   emit_insn (gen_despeculate_copy (mode, result, val, failval));
22542   return result;
22543 }
22544 
22545 /* Implement TARGET_ESTIMATED_POLY_VALUE.
22546    Look into the tuning structure for an estimate.
22547    VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
22548    Advanced SIMD 128 bits.  */
22549 
22550 static HOST_WIDE_INT
22551 aarch64_estimated_poly_value (poly_int64 val)
22552 {
22553   enum aarch64_sve_vector_bits_enum width_source
22554     = aarch64_tune_params.sve_width;
22555 
22556   /* If we still don't have an estimate, use the default.  */
22557   if (width_source == SVE_SCALABLE)
22558     return default_estimated_poly_value (val);
22559 
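  /* Otherwise the enumerator value is the estimated vector width in bits.  */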
22560   HOST_WIDE_INT over_128 = width_source - 128;
22561   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
22562 }
22563 
22564 
22565 /* Return true for types that could be supported as SIMD return or
22566    argument types.  */
22567 
22568 static bool
22569 supported_simd_type (tree t)
22570 {
22571   if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
22572     {
22573       HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
22574       return s == 1 || s == 2 || s == 4 || s == 8;
22575     }
22576   return false;
22577 }
22578 
22579 /* Return true for types that currently are supported as SIMD return
22580    or argument types.  */
22581 
22582 static bool
22583 currently_supported_simd_type (tree t, tree b)
22584 {
22585   if (COMPLEX_FLOAT_TYPE_P (t))
22586     return false;
22587 
22588   if (TYPE_SIZE (t) != TYPE_SIZE (b))
22589     return false;
22590 
22591   return supported_simd_type (t);
22592 }
22593 
22594 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
22595 
22596 static int
22597 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
22598 					struct cgraph_simd_clone *clonei,
22599 					tree base_type, int num)
22600 {
22601   tree t, ret_type, arg_type;
22602   unsigned int elt_bits, vec_bits, count;
22603 
22604   if (!TARGET_SIMD)
22605     return 0;
22606 
22607   if (clonei->simdlen
22608       && (clonei->simdlen < 2
22609 	  || clonei->simdlen > 1024
22610 	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
22611     {
22612       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22613 		  "unsupported simdlen %d", clonei->simdlen);
22614       return 0;
22615     }
22616 
22617   ret_type = TREE_TYPE (TREE_TYPE (node->decl));
22618   if (TREE_CODE (ret_type) != VOID_TYPE
22619       && !currently_supported_simd_type (ret_type, base_type))
22620     {
22621       if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
22622 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22623 		    "GCC does not currently support mixed size types "
22624 		    "for %<simd%> functions");
22625       else if (supported_simd_type (ret_type))
22626 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22627 		    "GCC does not currently support return type %qT "
22628 		    "for %<simd%> functions", ret_type);
22629       else
22630 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22631 		    "unsupported return type %qT for %<simd%> functions",
22632 		    ret_type);
22633       return 0;
22634     }
22635 
22636   for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
22637     {
22638       arg_type = TREE_TYPE (t);
22639 
22640       if (!currently_supported_simd_type (arg_type, base_type))
22641 	{
22642 	  if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
22643 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22644 			"GCC does not currently support mixed size types "
22645 			"for %<simd%> functions");
22646 	  else
22647 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22648 			"GCC does not currently support argument type %qT "
22649 			"for %<simd%> functions", arg_type);
22650 	  return 0;
22651 	}
22652     }
22653 
22654   clonei->vecsize_mangle = 'n';
22655   clonei->mask_mode = VOIDmode;
22656   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
22657   if (clonei->simdlen == 0)
22658     {
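      /* No simdlen was specified: create both a 64-bit and a 128-bit
	 clone; NUM selects which of the two is being produced.  */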
22659       count = 2;
22660       vec_bits = (num == 0 ? 64 : 128);
22661       clonei->simdlen = vec_bits / elt_bits;
22662     }
22663   else
22664     {
22665       count = 1;
22666       vec_bits = clonei->simdlen * elt_bits;
22667       if (vec_bits != 64 && vec_bits != 128)
22668 	{
22669 	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22670 		      "GCC does not currently support simdlen %d for type %qT",
22671 		      clonei->simdlen, base_type);
22672 	  return 0;
22673 	}
22674     }
22675   clonei->vecsize_int = vec_bits;
22676   clonei->vecsize_float = vec_bits;
22677   return count;
22678 }
22679 
22680 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
22681 
22682 static void
22683 aarch64_simd_clone_adjust (struct cgraph_node *node)
22684 {
22685   /* Add aarch64_vector_pcs target attribute to SIMD clones so they
22686      use the correct ABI.  */
22687 
22688   tree t = TREE_TYPE (node->decl);
22689   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
22690 					TYPE_ATTRIBUTES (t));
22691 }
22692 
22693 /* Implement TARGET_SIMD_CLONE_USABLE.  */
22694 
22695 static int
22696 aarch64_simd_clone_usable (struct cgraph_node *node)
22697 {
22698   switch (node->simdclone->vecsize_mangle)
22699     {
22700     case 'n':
22701       if (!TARGET_SIMD)
22702 	return -1;
22703       return 0;
22704     default:
22705       gcc_unreachable ();
22706     }
22707 }
22708 
22709 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
22710 
22711 static int
22712 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
22713 {
22714   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
22715       != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
22716     return 0;
22717   return 1;
22718 }
22719 
22720 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
22721 
22722 static const char *
22723 aarch64_get_multilib_abi_name (void)
22724 {
22725   if (TARGET_BIG_END)
22726     return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
22727   return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
22728 }
22729 
22730 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
22731    global-variable-based guard, use the default; otherwise
22732    return a null tree.  */
22733 static tree
22734 aarch64_stack_protect_guard (void)
22735 {
22736   if (aarch64_stack_protector_guard == SSP_GLOBAL)
22737     return default_stack_protect_guard ();
22738 
22739   return NULL_TREE;
22740 }
22741 
22742 /* Return the diagnostic message string if conversion from FROMTYPE to
22743    TOTYPE is not allowed, NULL otherwise.  */
22744 
22745 static const char *
22746 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
22747 {
22748   if (element_mode (fromtype) != element_mode (totype))
22749     {
22750       /* Do not allow conversions to/from BFmode scalar types.  */
22751       if (TYPE_MODE (fromtype) == BFmode)
22752 	return N_("invalid conversion from type %<bfloat16_t%>");
22753       if (TYPE_MODE (totype) == BFmode)
22754 	return N_("invalid conversion to type %<bfloat16_t%>");
22755     }
22756 
22757   /* Conversion allowed.  */
22758   return NULL;
22759 }
22760 
22761 /* Return the diagnostic message string if the unary operation OP is
22762    not permitted on TYPE, NULL otherwise.  */
22763 
22764 static const char *
22765 aarch64_invalid_unary_op (int op, const_tree type)
22766 {
22767   /* Reject all single-operand operations on BFmode except for &.  */
22768   if (element_mode (type) == BFmode && op != ADDR_EXPR)
22769     return N_("operation not permitted on type %<bfloat16_t%>");
22770 
22771   /* Operation allowed.  */
22772   return NULL;
22773 }
22774 
22775 /* Return the diagnostic message string if the binary operation OP is
22776    not permitted on TYPE1 and TYPE2, NULL otherwise.  */
22777 
22778 static const char *
22779 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
22780 			   const_tree type2)
22781 {
22782   /* Reject all 2-operand operations on BFmode.  */
22783   if (element_mode (type1) == BFmode
22784       || element_mode (type2) == BFmode)
22785     return N_("operation not permitted on type %<bfloat16_t%>");
22786 
22787   if (VECTOR_TYPE_P (type1)
22788       && VECTOR_TYPE_P (type2)
22789       && !TYPE_INDIVISIBLE_P (type1)
22790       && !TYPE_INDIVISIBLE_P (type2)
22791       && (aarch64_sve::builtin_type_p (type1)
22792 	  != aarch64_sve::builtin_type_p (type2)))
22793     return N_("cannot combine GNU and SVE vectors in a binary operation");
22794 
22795   /* Operation allowed.  */
22796   return NULL;
22797 }
22798 
22799 /* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
22800    section at the end if needed.  */
22801 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
22802 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
22803 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
22804 void
22805 aarch64_file_end_indicate_exec_stack ()
22806 {
22807   file_end_indicate_exec_stack ();
22808 
22809   unsigned feature_1_and = 0;
22810   if (aarch64_bti_enabled ())
22811     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
22812 
22813   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
22814     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
22815 
22816   if (feature_1_and)
22817     {
22818       /* Generate .note.gnu.property section.  */
22819       switch_to_section (get_section (".note.gnu.property",
22820 				      SECTION_NOTYPE, NULL));
22821 
22822       /* PT_NOTE header: namesz, descsz, type.
22823 	 namesz = 4 ("GNU\0")
22824 	 descsz = 16 (Size of the program property array)
22825 		  [(12 + padding) * Number of array elements]
22826 	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
22827       assemble_align (POINTER_SIZE);
22828       assemble_integer (GEN_INT (4), 4, 32, 1);
22829       assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
22830       assemble_integer (GEN_INT (5), 4, 32, 1);
22831 
22832       /* PT_NOTE name.  */
22833       assemble_string ("GNU", 4);
22834 
22835       /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
22836 	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
22837 	 datasz = 4
22838 	 data   = feature_1_and.  */
22839       assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
22840       assemble_integer (GEN_INT (4), 4, 32, 1);
22841       assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
22842 
22843       /* Pad the size of the note to the required alignment.  */
22844       assemble_align (POINTER_SIZE);
22845     }
22846 }
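
/* For reference only (assuming an LP64 target, so POINTER_BYTES == 8, with
   both BTI and return-address signing enabled), the note emitted above
   corresponds roughly to the following assembly:

	.section	.note.gnu.property
	.p2align	3
	.word	4		// namesz ("GNU\0")
	.word	16		// descsz (12 rounded up to POINTER_BYTES)
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.p2align	3		// pad descriptor to 8 bytes

   The exact directives depend on the assembler; this sketch is only an
   illustration of the layout.  */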
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND

/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}
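
/* Note (an assumption about the build setup, not taken from this file):
   these selftests only exist when GCC is built with checking enabled
   (CHECKING_P) and are exercised through GCC's selftest framework, e.g. the
   "make selftest" target, which runs the compiler with -fself-test pointing
   at the selftest data directory that provides aarch64/times-two.rtl.  */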

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_STACK_PROTECT_GUARD
#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef  TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef  TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef  TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS  aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_INVALID_CONVERSION
#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion

#undef TARGET_INVALID_UNARY_OP
#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op

#undef TARGET_INVALID_BINARY_OP
#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op

#undef TARGET_VERIFY_TYPE_CONTEXT
#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef  TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  aarch64_autovectorize_vector_modes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
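
/* For illustration (not from the original comments): -256 and 4095 mirror
   the AArch64 addressing-mode limits for single-byte accesses, i.e. the
   signed 9-bit unscaled LDUR/STUR range (-256..255) and the unsigned 12-bit
   scaled LDRB/STRB range (0..4095); anchors outside this window would force
   an extra address computation.  */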

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_INSN_CALLEE_ABI
#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#undef TARGET_FNTYPE_ABI
#define TARGET_FNTYPE_ABI aarch64_fntype_abi

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true

#undef TARGET_MD_ASM_ADJUST
#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"