;; ARM Cortex-A53 pipeline description
;; Copyright (C) 2013-2019 Free Software Foundation, Inc.
;;
;; Contributed by ARM Ltd.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

(define_automaton "cortex_a53")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; General-purpose functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; We use slot0 and slot1 to model constraints on which instructions may
;; dual-issue.

(define_cpu_unit "cortex_a53_slot0" "cortex_a53")
(define_cpu_unit "cortex_a53_slot1" "cortex_a53")
(final_presence_set "cortex_a53_slot1" "cortex_a53_slot0")

(define_reservation "cortex_a53_slot_any"
                    "cortex_a53_slot0\
                     |cortex_a53_slot1")

(define_reservation "cortex_a53_single_issue"
                    "cortex_a53_slot0\
                     +cortex_a53_slot1")

;; Used to model load and store pipelines.  Load/store instructions
;; can dual-issue with other instructions, but two load/stores cannot
;; simultaneously issue.

(define_cpu_unit "cortex_a53_store" "cortex_a53")
(define_cpu_unit "cortex_a53_load" "cortex_a53")
(define_cpu_unit "cortex_a53_ls_agen" "cortex_a53")

;; Used to model a branch pipeline.  Branches can dual-issue with other
;; instructions (except when those instructions take multiple cycles
;; to issue).

(define_cpu_unit "cortex_a53_branch" "cortex_a53")

;; Used to model an integer divide pipeline.

(define_cpu_unit "cortex_a53_idiv" "cortex_a53")

;; Used to model an integer multiply/multiply-accumulate pipeline.

(define_cpu_unit "cortex_a53_imul" "cortex_a53")

;; Model general structural hazards, for wherever we need them.

(define_cpu_unit "cortex_a53_hazard" "cortex_a53")
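
;; Editorial note (a reading aid, not part of the original description):
;; in the reservation strings used throughout this file, "|" picks either
;; of two alternatives, "+" reserves units in the same cycle, "," advances
;; to the next cycle and "unit*n" keeps a unit busy for n consecutive
;; cycles.  So "cortex_a53_slot_any" lets an instruction issue down either
;; slot, while "cortex_a53_single_issue" claims both slots and therefore
;; blocks dual-issue for that cycle.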

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_shift" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "adr,shift_imm,mov_imm,mvn_imm,mov_shift"))
  "cortex_a53_slot_any")

(define_insn_reservation "cortex_a53_shift_reg" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "shift_reg,mov_shift_reg"))
  "cortex_a53_slot_any+cortex_a53_hazard")

(define_insn_reservation "cortex_a53_alu" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,
                        alu_sreg,alus_sreg,logic_reg,logics_reg,
                        adc_imm,adcs_imm,adc_reg,adcs_reg,
                        csel,clz,rbit,rev,alu_dsp_reg,
                        mov_reg,mvn_reg,mrs,multiple,no_insn"))
  "cortex_a53_slot_any")

(define_insn_reservation "cortex_a53_alu_shift" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_shift_imm,alus_shift_imm,
                        crc,logic_shift_imm,logics_shift_imm,
                        alu_ext,alus_ext,bfm,bfx,extend,mvn_shift"))
  "cortex_a53_slot_any")

(define_insn_reservation "cortex_a53_alu_shift_reg" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_shift_reg,alus_shift_reg,
                        logic_shift_reg,logics_shift_reg,
                        mvn_shift_reg"))
  "cortex_a53_slot_any+cortex_a53_hazard")

(define_insn_reservation "cortex_a53_alu_extr" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "rotate_imm"))
  "cortex_a53_slot1|cortex_a53_single_issue")

(define_insn_reservation "cortex_a53_mul" 4
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "mul32" "yes")
            (eq_attr "widen_mul64" "yes")))
  "cortex_a53_slot_any+cortex_a53_imul")

;; From the perspective of the GCC scheduling state machine, if we wish to
;; model an instruction as serialising other instructions, we are best to do
;; so by modelling it as taking very few cycles.  Scheduling many other
;; instructions underneath it at the cost of freedom to pick from the
;; ready list is likely to hurt us more than it helps.  However, we do
;; want to model some resource and latency cost for divide instructions in
;; order to avoid divides ending up too lumpy.

(define_insn_reservation "cortex_a53_div" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "udiv,sdiv"))
  "cortex_a53_slot0,cortex_a53_idiv*2")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/store instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: load_<n> is not prescriptive about how much data is to be loaded.
;; This is most obvious for LDRD from AArch32 and LDP (X register) from
;; AArch64: both are tagged load_8, but LDP loads 128 bits where LDRD
;; loads 64 bits.
;;
;; For the below, we assume AArch64 X-registers for load_8, and AArch32
;; registers for load_12/load_16.
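
;; Editorial note (a reading aid, not part of the original description):
;; the reservations below read as "use an issue slot together with the
;; address-generation unit in the first cycle, then the load or store
;; pipe in the following cycle(s)".  The leading number of each
;; define_insn_reservation (for example 4 for cortex_a53_load1) is the
;; latency consumers should assume before the result is available.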

(define_insn_reservation "cortex_a53_load1" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load_byte,load_4,load_acq"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_store1" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store_4,store_rel"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store")

;; Model AArch64-sized LDP Xm, Xn, [Xa]

(define_insn_reservation "cortex_a53_load2" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load_8"))
  "cortex_a53_single_issue+cortex_a53_ls_agen,
   cortex_a53_load+cortex_a53_slot0,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_store2" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store_8"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store")

;; Model AArch32-sized LDM Ra, {Rm, Rn, Ro}

(define_insn_reservation "cortex_a53_load3plus" 6
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load_12,load_16"))
  "cortex_a53_single_issue+cortex_a53_ls_agen,
   cortex_a53_load+cortex_a53_slot0,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_store3plus" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store_12,store_16"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store+cortex_a53_slot0,
   cortex_a53_store")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Model all branches as dual-issuable from either execution slot, which
;; is not strictly true for all cases (indirect branches).

(define_insn_reservation "cortex_a53_branch" 0
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "branch,call"))
  "cortex_a53_slot_any+cortex_a53_branch")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; General-purpose register bypasses.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Model bypasses for ALU to ALU instructions.

(define_bypass 0 "cortex_a53_shift*"
                 "cortex_a53_alu")

(define_bypass 1 "cortex_a53_shift*"
                 "cortex_a53_shift*,cortex_a53_alu_*")

(define_bypass 1 "cortex_a53_alu*"
                 "cortex_a53_alu")

(define_bypass 1 "cortex_a53_alu*"
                 "cortex_a53_alu_shift*"
                 "arm_no_early_alu_shift_dep")

(define_bypass 2 "cortex_a53_alu*"
                 "cortex_a53_alu_*,cortex_a53_shift*")

;; Model a bypass from MUL/MLA to MLA instructions.

(define_bypass 1 "cortex_a53_mul"
                 "cortex_a53_mul"
                 "aarch_accumulator_forwarding")

;; Model a bypass from MUL/MLA to ALU instructions.

(define_bypass 2 "cortex_a53_mul"
                 "cortex_a53_alu")

(define_bypass 3 "cortex_a53_mul"
                 "cortex_a53_alu_*,cortex_a53_shift*")

;; Model bypasses for loads which are to be consumed by the ALU.

(define_bypass 2 "cortex_a53_load1"
                 "cortex_a53_alu")

(define_bypass 3 "cortex_a53_load1"
                 "cortex_a53_alu_*,cortex_a53_shift*")

(define_bypass 3 "cortex_a53_load2"
                 "cortex_a53_alu")

;; Model a bypass for ALU instructions feeding stores.

(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
                 "cortex_a53_store*"
                 "arm_no_early_store_addr_dep")
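
;; Editorial note: an assumed illustration of the guard above, not part of
;; the original description.  For a pair such as
;;
;;   add   r0, r1, r2
;;   str   r0, [r3]
;;
;; the ALU result feeds only the store data, so arm_no_early_store_addr_dep
;; holds and the zero-cycle bypass applies; if r0 instead formed part of
;; the store address, the normal ALU latency would be used.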

;; Model a bypass for load and multiply instructions feeding stores.

(define_bypass 1 "cortex_a53_mul,
                  cortex_a53_load*"
                 "cortex_a53_store*"
                 "arm_no_early_store_addr_dep")

;; Model a bypass for load to load/store address.

(define_bypass 3 "cortex_a53_load1"
                 "cortex_a53_load*"
                 "arm_early_load_addr_dep_ptr")

(define_bypass 3 "cortex_a53_load1"
                 "cortex_a53_store*"
                 "arm_early_store_addr_dep_ptr")

;; Model a GP->FP register move as similar to stores.

(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
                 "cortex_a53_r2f")

(define_bypass 1 "cortex_a53_mul,
                  cortex_a53_load1,
                  cortex_a53_load2"
                 "cortex_a53_r2f")

(define_bypass 2 "cortex_a53_alu*"
                 "cortex_a53_r2f_cvt")

(define_bypass 3 "cortex_a53_mul,
                  cortex_a53_load1,
                  cortex_a53_load2"
                 "cortex_a53_r2f_cvt")

;; Model flag forwarding to branches.

(define_bypass 0 "cortex_a53_alu*,cortex_a53_shift*"
                 "cortex_a53_branch")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point/Advanced SIMD.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_automaton "cortex_a53_advsimd")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Broad Advanced SIMD type categorisation.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_attr "cortex_a53_advsimd_type"
  "advsimd_alu, advsimd_alu_q,
   advsimd_mul, advsimd_mul_q,
   advsimd_div_s, advsimd_div_s_q,
   advsimd_div_d, advsimd_div_d_q,
   advsimd_load_64, advsimd_store_64,
   advsimd_load_128, advsimd_store_128,
   advsimd_load_lots, advsimd_store_lots,
   unknown"
  (cond [
    (eq_attr "type" "neon_add, neon_qadd, neon_add_halve, neon_sub, neon_qsub,\
                     neon_sub_halve, neon_abs, neon_neg, neon_qneg,\
                     neon_qabs, neon_abd, neon_minmax, neon_compare,\
                     neon_compare_zero, neon_arith_acc, neon_reduc_add,\
                     neon_reduc_add_acc, neon_reduc_minmax,\
                     neon_logic, neon_tst, neon_shift_imm,\
                     neon_shift_reg, neon_shift_acc, neon_sat_shift_imm,\
                     neon_sat_shift_reg, neon_ins, neon_move,\
                     neon_permute, neon_zip, neon_tbl1,\
                     neon_tbl2, neon_tbl3, neon_tbl4, neon_bsl,\
                     neon_cls, neon_cnt, neon_dup,\
                     neon_ext, neon_rbit, neon_rev,\
                     neon_fp_abd_s, neon_fp_abd_d,\
                     neon_fp_abs_s, neon_fp_abs_d,\
                     neon_fp_addsub_s, neon_fp_addsub_d, neon_fp_compare_s,\
                     neon_fp_compare_d, neon_fp_minmax_s,\
                     neon_fp_minmax_d, neon_fp_neg_s, neon_fp_neg_d,\
                     neon_fp_reduc_add_s, neon_fp_reduc_add_d,\
                     neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_d,\
                     neon_fp_cvt_widen_h, neon_fp_to_int_s, neon_fp_to_int_d,\
                     neon_int_to_fp_s, neon_int_to_fp_d, neon_fp_round_s,\
                     neon_fp_recpe_s, neon_fp_recpe_d, neon_fp_recps_s,\
                     neon_fp_recps_d, neon_fp_recpx_s, neon_fp_recpx_d,\
                     neon_fp_rsqrte_s, neon_fp_rsqrte_d, neon_fp_rsqrts_s,\
                     neon_fp_rsqrts_d")
      (const_string "advsimd_alu")
    (eq_attr "type" "neon_add_q, neon_add_widen, neon_add_long,\
                     neon_qadd_q, neon_add_halve_q, neon_add_halve_narrow_q,\
                     neon_sub_q, neon_sub_widen, neon_sub_long,\
                     neon_qsub_q, neon_sub_halve_q, neon_sub_halve_narrow_q,\
                     neon_abs_q, neon_neg_q, neon_qneg_q, neon_qabs_q,\
                     neon_abd_q, neon_abd_long, neon_minmax_q,\
                     neon_compare_q, neon_compare_zero_q,\
                     neon_arith_acc_q, neon_reduc_add_q,\
                     neon_reduc_add_long, neon_reduc_add_acc_q,\
                     neon_reduc_minmax_q, neon_logic_q, neon_tst_q,\
                     neon_shift_imm_q, neon_shift_imm_narrow_q,\
                     neon_shift_imm_long, neon_shift_reg_q,\
                     neon_shift_acc_q, neon_sat_shift_imm_q,\
                     neon_sat_shift_imm_narrow_q, neon_sat_shift_reg_q,\
                     neon_ins_q, neon_move_q, neon_move_narrow_q,\
                     neon_permute_q, neon_zip_q,\
                     neon_tbl1_q, neon_tbl2_q, neon_tbl3_q,\
                     neon_tbl4_q, neon_bsl_q, neon_cls_q, neon_cnt_q,\
                     neon_dup_q, neon_ext_q, neon_rbit_q,\
                     neon_rev_q, neon_fp_abd_s_q, neon_fp_abd_d_q,\
                     neon_fp_abs_s_q, neon_fp_abs_d_q,\
                     neon_fp_addsub_s_q, neon_fp_addsub_d_q,\
                     neon_fp_compare_s_q, neon_fp_compare_d_q,\
                     neon_fp_minmax_s_q, neon_fp_minmax_d_q,\
                     neon_fp_cvt_widen_s, neon_fp_neg_s_q, neon_fp_neg_d_q,\
                     neon_fp_reduc_add_s_q, neon_fp_reduc_add_d_q,\
                     neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d_q,\
                     neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
                     neon_fp_to_int_s_q, neon_fp_to_int_d_q,\
                     neon_int_to_fp_s_q, neon_int_to_fp_d_q,\
                     neon_fp_round_s_q,\
                     neon_fp_recpe_s_q, neon_fp_recpe_d_q,\
                     neon_fp_recps_s_q, neon_fp_recps_d_q,\
                     neon_fp_recpx_s_q, neon_fp_recpx_d_q,\
                     neon_fp_rsqrte_s_q, neon_fp_rsqrte_d_q,\
                     neon_fp_rsqrts_s_q, neon_fp_rsqrts_d_q")
      (const_string "advsimd_alu_q")
    (eq_attr "type" "neon_mul_b, neon_mul_h, neon_mul_s,\
                     neon_mul_h_scalar, neon_mul_s_scalar,\
                     neon_sat_mul_b, neon_sat_mul_h, neon_sat_mul_s,\
                     neon_sat_mul_h_scalar, neon_sat_mul_s_scalar,\
                     neon_mla_b, neon_mla_h, neon_mla_s,\
                     neon_mla_h_scalar, neon_mla_s_scalar,\
                     neon_fp_mul_s, neon_fp_mul_s_scalar,\
                     neon_fp_mul_d, neon_fp_mla_s,\
                     neon_fp_mla_s_scalar, neon_fp_mla_d")
      (const_string "advsimd_mul")
    (eq_attr "type" "neon_mul_b_q, neon_mul_h_q, neon_mul_s_q,\
                     neon_mul_b_long, neon_mul_h_long, neon_mul_s_long,\
                     neon_mul_d_long, neon_mul_h_scalar_q,\
                     neon_mul_s_scalar_q, neon_mul_h_scalar_long,\
                     neon_mul_s_scalar_long, neon_sat_mul_b_q,\
                     neon_sat_mul_h_q, neon_sat_mul_s_q,\
                     neon_sat_mul_b_long, neon_sat_mul_h_long,\
                     neon_sat_mul_s_long, neon_sat_mul_h_scalar_q,\
                     neon_sat_mul_s_scalar_q, neon_sat_mul_h_scalar_long,\
                     neon_sat_mul_s_scalar_long, crypto_pmull, neon_mla_b_q,\
                     neon_mla_h_q, neon_mla_s_q, neon_mla_b_long,\
                     neon_mla_h_long, neon_mla_s_long,\
                     neon_mla_h_scalar_q, neon_mla_s_scalar_q,\
                     neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
                     neon_sat_mla_b_long, neon_sat_mla_h_long,\
                     neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
                     neon_sat_mla_s_scalar_long,\
                     neon_fp_mul_s_q, neon_fp_mul_s_scalar_q,\
                     neon_fp_mul_d_q, neon_fp_mul_d_scalar_q,\
                     neon_fp_mla_s_q, neon_fp_mla_s_scalar_q,\
                     neon_fp_mla_d_q, neon_fp_mla_d_scalar_q")
      (const_string "advsimd_mul_q")
    (eq_attr "type" "neon_fp_sqrt_s, neon_fp_div_s")
      (const_string "advsimd_div_s")
    (eq_attr "type" "neon_fp_sqrt_s_q, neon_fp_div_s_q")
      (const_string "advsimd_div_s_q")
    (eq_attr "type" "neon_fp_sqrt_d, neon_fp_div_d")
      (const_string "advsimd_div_d")
    (eq_attr "type" "neon_fp_sqrt_d_q, neon_fp_div_d_q")
      (const_string "advsimd_div_d_q")
    (eq_attr "type" "neon_ldr, neon_load1_1reg,\
                     neon_load1_all_lanes, neon_load1_all_lanes_q,\
                     neon_load1_one_lane, neon_load1_one_lane_q")
      (const_string "advsimd_load_64")
    (eq_attr "type" "neon_str, neon_store1_1reg,\
                     neon_store1_one_lane, neon_store1_one_lane_q")
      (const_string "advsimd_store_64")
    (eq_attr "type" "neon_load1_1reg_q, neon_load1_2reg,\
                     neon_load2_2reg,\
                     neon_load2_all_lanes, neon_load2_all_lanes_q,\
                     neon_load2_one_lane, neon_load2_one_lane_q")
      (const_string "advsimd_load_128")
    (eq_attr "type" "neon_store1_1reg_q, neon_store1_2reg,\
                     neon_store2_2reg,\
                     neon_store2_one_lane, neon_store2_one_lane_q")
      (const_string "advsimd_store_128")
    (eq_attr "type" "neon_load1_2reg_q, neon_load1_3reg, neon_load1_3reg_q,\
                     neon_load1_4reg, neon_load1_4reg_q,\
                     neon_load2_2reg_q, neon_load2_4reg,\
                     neon_load2_4reg_q, neon_load3_3reg,\
                     neon_load3_3reg_q, neon_load3_all_lanes,\
                     neon_load3_all_lanes_q, neon_load3_one_lane,\
                     neon_load3_one_lane_q, neon_load4_4reg,\
                     neon_load4_4reg_q, neon_load4_all_lanes,\
                     neon_load4_all_lanes_q, neon_load4_one_lane,\
                     neon_load4_one_lane_q, neon_ldp, neon_ldp_q")
      (const_string "advsimd_load_lots")
    (eq_attr "type" "neon_store1_2reg_q, neon_store1_3reg,\
                     neon_store1_3reg_q, neon_store1_4reg,\
                     neon_store1_4reg_q, neon_store2_2reg_q,\
                     neon_store2_4reg, neon_store2_4reg_q,\
                     neon_store3_3reg, neon_store3_3reg_q,\
                     neon_store3_one_lane, neon_store3_one_lane_q,\
                     neon_store4_4reg, neon_store4_4reg_q,\
                     neon_store4_one_lane, neon_store4_one_lane_q,\
                     neon_stp, neon_stp_q")
      (const_string "advsimd_store_lots")]
    (const_string "unknown")))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point/Advanced SIMD functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; We model the Advanced SIMD unit as two 64-bit units, each with three
;; pipes (FP_ALU, FP_MUL and FP_DIV).  We also give convenient reservations
;; for 128-bit Advanced SIMD instructions, which use both units.

;; The floating-point/Advanced SIMD ALU pipelines.

(define_cpu_unit "cortex_a53_fp_alu_lo,\
                  cortex_a53_fp_alu_hi"
                 "cortex_a53_advsimd")

(define_reservation "cortex_a53_fp_alu"
                    "cortex_a53_fp_alu_lo\
                     |cortex_a53_fp_alu_hi")

(define_reservation "cortex_a53_fp_alu_q"
                    "cortex_a53_fp_alu_lo\
                     +cortex_a53_fp_alu_hi")

;; The floating-point/Advanced SIMD multiply/multiply-accumulate
;; pipelines.

(define_cpu_unit "cortex_a53_fp_mul_lo,\
                  cortex_a53_fp_mul_hi"
                 "cortex_a53_advsimd")

(define_reservation "cortex_a53_fp_mul"
                    "cortex_a53_fp_mul_lo\
                     |cortex_a53_fp_mul_hi")

(define_reservation "cortex_a53_fp_mul_q"
                    "cortex_a53_fp_mul_lo\
                     +cortex_a53_fp_mul_hi")

;; Floating-point/Advanced SIMD divide/square root.

(define_cpu_unit "cortex_a53_fp_div_lo,\
                  cortex_a53_fp_div_hi"
                 "cortex_a53_advsimd")

;; Once we choose a pipe, stick with it for three simulated cycles.

(define_reservation "cortex_a53_fp_div"
                    "(cortex_a53_fp_div_lo*3)\
                     |(cortex_a53_fp_div_hi*3)")

(define_reservation "cortex_a53_fp_div_q"
                    "(cortex_a53_fp_div_lo*3)\
                     +(cortex_a53_fp_div_hi*3)")
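
;; Editorial note (an interpretation, not part of the original
;; description): the "*3" repetitions above keep whichever divide pipe
;; was chosen busy for three consecutive model cycles, far shorter than
;; a real divide or square root, presumably for the reason given for the
;; integer divider earlier: model enough resource cost to spread divides
;; out without serialising the schedule around them.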

;; Cryptographic extensions.

(define_cpu_unit "cortex_a53_crypto"
                 "cortex_a53_advsimd")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point arithmetic.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_fpalu" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fmov,
                        f_cvt, fcmps, fcmpd, fccmps, fccmpd, fcsel,
                        f_rints, f_rintd, f_minmaxs, f_minmaxd"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_fconst" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_fpmul" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fmuls,fmuld"))
  "cortex_a53_slot_any,cortex_a53_fp_mul")

;; For multiply-accumulate, model the add (accumulate) as being issued
;; after the multiply completes.

(define_insn_reservation "cortex_a53_fpmac" 8
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fmacs,fmacd,ffmas,ffmad"))
  "cortex_a53_slot_any,cortex_a53_fp_mul,
   nothing*3, cortex_a53_fp_alu")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point to/from core transfers.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_r2f" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_mcr,f_mcrr"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_f2r" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_mrc,f_mrrc"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_r2f_cvt" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_cvti2f, neon_from_gp, neon_from_gp_q"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_f2r_cvt" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_cvtf2i, neon_to_gp, neon_to_gp_q"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point flag transfer.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_f_flags" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_flag"))
  "cortex_a53_slot_any")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point load/store.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_f_load_64" 3
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "type" "f_loads,f_loadd")
            (eq_attr "cortex_a53_advsimd_type"
                     "advsimd_load_64")))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_f_load_many" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type"
                "advsimd_load_128,advsimd_load_lots"))
  "cortex_a53_single_issue+cortex_a53_ls_agen,
   cortex_a53_load+cortex_a53_slot0,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_f_store_64" 0
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "type" "f_stores,f_stored")
            (eq_attr "cortex_a53_advsimd_type"
                     "advsimd_store_64")))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store")

(define_insn_reservation "cortex_a53_f_store_many" 0
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type"
                "advsimd_store_128,advsimd_store_lots"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store+cortex_a53_slot0,
   cortex_a53_store")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Advanced SIMD.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Either we want to model use of the ALU pipe, the multiply pipe or the
;; divide/sqrt pipe.  In all cases we need to check if we are a 64-bit
;; operation (in which case we model dual-issue without penalty) or a
;; 128-bit operation, in which case we require in our model that we
;; issue from slot 0.

(define_insn_reservation "cortex_a53_advsimd_alu" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_alu"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_advsimd_alu_q" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_alu_q"))
  "cortex_a53_slot0,cortex_a53_fp_alu_q")

(define_insn_reservation "cortex_a53_advsimd_mul" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_mul"))
  "cortex_a53_slot_any,cortex_a53_fp_mul")

(define_insn_reservation "cortex_a53_advsimd_mul_q" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_mul_q"))
  "cortex_a53_slot0,cortex_a53_fp_mul_q")
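
;; Editorial note (a summary, not part of the original description): the
;; reservations above mirror the dual-issue slot model for Advanced SIMD:
;; 64-bit (D-form) operations may issue from either slot and use either
;; 64-bit half of the relevant pipe, while 128-bit (Q-form) operations are
;; restricted to slot 0 and reserve both halves via the "_q" reservations.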

;; SIMD Dividers.

(define_insn_reservation "cortex_a53_advsimd_div_s" 14
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "type" "fdivs,fsqrts")
            (eq_attr "cortex_a53_advsimd_type" "advsimd_div_s")))
  "cortex_a53_slot0,cortex_a53_fp_mul,
   cortex_a53_fp_div")

(define_insn_reservation "cortex_a53_advsimd_div_d" 29
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "type" "fdivd,fsqrtd")
            (eq_attr "cortex_a53_advsimd_type" "advsimd_div_d")))
  "cortex_a53_slot0,cortex_a53_fp_mul,
   cortex_a53_fp_div")

(define_insn_reservation "cortex_a53_advsimd_div_s_q" 14
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_div_s_q"))
  "cortex_a53_single_issue,cortex_a53_fp_mul_q,
   cortex_a53_fp_div_q")

(define_insn_reservation "cortex_a53_advsimd_divd_q" 29
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_div_d_q"))
  "cortex_a53_single_issue,cortex_a53_fp_mul_q,
   cortex_a53_fp_div_q")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ARMv8-A Cryptographic extensions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; We want AESE and AESMC to end up consecutive to one another.

(define_insn_reservation "cortex_a53_crypto_aese" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_aese"))
  "cortex_a53_slot0")

(define_insn_reservation "cortex_a53_crypto_aesmc" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_aesmc"))
  "cortex_a53_slot_any")

;; SHA1H

(define_insn_reservation "cortex_a53_crypto_sha1_fast" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_fast"))
  "cortex_a53_slot_any,cortex_a53_crypto")

(define_insn_reservation "cortex_a53_crypto_sha256_fast" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha256_fast"))
  "cortex_a53_slot0,cortex_a53_crypto")

(define_insn_reservation "cortex_a53_crypto_sha1_xor" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_xor"))
  "cortex_a53_slot0,cortex_a53_crypto")

(define_insn_reservation "cortex_a53_crypto_sha_slow" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
  "cortex_a53_slot0,cortex_a53_crypto")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point/Advanced SIMD register bypasses.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Model the late use of the accumulator operand for floating-point
;; multiply-accumulate operations as a bypass reducing the latency
;; of producing instructions to near zero.

(define_bypass 1 "cortex_a53_fpalu,
                  cortex_a53_fpmul,
                  cortex_a53_r2f,
                  cortex_a53_r2f_cvt,
                  cortex_a53_fconst,
                  cortex_a53_f_load*"
                 "cortex_a53_fpmac"
                 "aarch_accumulator_forwarding")

(define_bypass 4 "cortex_a53_fpmac"
                 "cortex_a53_fpmac"
                 "aarch_accumulator_forwarding")

;; We want AESE and AESMC to end up consecutive to one another.

(define_bypass 0 "cortex_a53_crypto_aese"
                 "cortex_a53_crypto_aesmc"
                 "aarch_crypto_can_dual_issue")
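
;; Editorial note: an assumed illustration of aarch_accumulator_forwarding,
;; not part of the original description.  In a sequence such as
;;
;;   fmul  d4, d1, d2
;;   fmadd d0, d5, d6, d4
;;
;; the first result is consumed only as the accumulator operand of the
;; FMADD, so the reduced bypass latency above applies; a dependence
;; through d5 or d6 would instead see the producer's full latency.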