;; ARM Cortex-R4 scheduling description.
;; Copyright (C) 2007-2013 Free Software Foundation, Inc.
;; Contributed by CodeSourcery.

;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published
;; by the Free Software Foundation; either version 3, or (at your
;; option) any later version.

;; GCC is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
;; License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Single DFA automaton covering the whole Cortex-R4 pipeline model.
(define_automaton "cortex_r4")

;; We approximate the dual-issue constraints of this core using four
;; "issue units" and a reservation matrix as follows.  The numbers indicate
;; the instruction groups' preferences in order.  Multiple entries for
;; the same numbered preference indicate units that must be reserved
;; together.
;;
;; Issue unit:		A	B	C	ALU
;;
;; ALU w/o reg shift	1st	2nd		1st and 2nd
;; ALU w/ reg shift	1st	2nd	2nd	1st and 2nd
;; Moves		1st	2nd		2nd
;; Multiplication	1st			1st
;; Division		1st			1st
;; Load/store single	1st		1st
;; Other load/store	1st	1st
;; Branches			1st

;; One CPU unit per column of the matrix above; reservations below
;; combine them per the preferences in the table.
(define_cpu_unit "cortex_r4_issue_a" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_b" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_c" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_alu" "cortex_r4")

;; Plain ALU op: (A+ALU) preferred, else (B+ALU).
(define_reservation "cortex_r4_alu"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
                     (cortex_r4_issue_b+cortex_r4_issue_alu)")
;; ALU op with register-specified shift additionally ties up unit C
;; when issued in the second slot.
(define_reservation "cortex_r4_alu_shift_reg"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
                     (cortex_r4_issue_b+cortex_r4_issue_c+\
                      cortex_r4_issue_alu)")
;; Moves: A alone, or B plus the ALU unit.
(define_reservation "cortex_r4_mov"
                    "cortex_r4_issue_a|(cortex_r4_issue_b+\
                     cortex_r4_issue_alu)")
;; Single-cycle and two-cycle multiply reservations.
(define_reservation "cortex_r4_mul" "cortex_r4_issue_a+cortex_r4_issue_alu")
(define_reservation "cortex_r4_mul_2"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)*2")
;; Division instructions execute out-of-order with respect to the
;; rest of the pipeline and only require reservations on their first and
;; final cycles.
(define_reservation "cortex_r4_div_9"
                    "cortex_r4_issue_a+cortex_r4_issue_alu,\
                     nothing*7,\
                     cortex_r4_issue_a+cortex_r4_issue_alu")
(define_reservation "cortex_r4_div_10"
                    "cortex_r4_issue_a+cortex_r4_issue_alu,\
                     nothing*8,\
                     cortex_r4_issue_a+cortex_r4_issue_alu")
;; Single load/store uses A+C; multi-register load/store uses A+B for
;; two cycles (the "Other load/store" row above).
(define_reservation "cortex_r4_load_store"
                    "cortex_r4_issue_a+cortex_r4_issue_c")
(define_reservation "cortex_r4_load_store_2"
                    "(cortex_r4_issue_a+cortex_r4_issue_b)*2")
(define_reservation "cortex_r4_branch" "cortex_r4_issue_b")

;; We assume that all instructions are unconditional.

;; Data processing instructions.  Moves without shifts are kept separate
;; for the purposes of the dual-issue constraints above.
;; Non-move ALU ops: latency 2.
(define_insn_reservation "cortex_r4_alu" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (and (eq_attr "type" "alu_reg,simple_alu_imm")
            (not (eq_attr "insn" "mov"))))
  "cortex_r4_alu")

;; Moves are separated out so the dual-issue matrix above can give them
;; their own unit preferences.
(define_insn_reservation "cortex_r4_mov" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (and (eq_attr "type" "alu_reg,simple_alu_imm")
            (eq_attr "insn" "mov")))
  "cortex_r4_mov")

;; ALU op with immediate shift: same units as a plain ALU op.
(define_insn_reservation "cortex_r4_alu_shift" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "simple_alu_shift,alu_shift"))
  "cortex_r4_alu")

;; ALU op with register-specified shift: needs the wider reservation.
(define_insn_reservation "cortex_r4_alu_shift_reg" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "alu_shift_reg"))
  "cortex_r4_alu_shift_reg")

;; An ALU instruction followed by an ALU instruction with no early dep.
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu")
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; In terms of availabilities, a consumer mov could theoretically be
;; issued together with a producer ALU instruction, without stalls.
;; In practice this cannot happen because mov;add (in that order) is not
;; eligible for dual issue and furthermore dual issue is not permitted
;; when a dependency is involved.  We therefore note it as latency one.
;; A mov followed by another of the same is also latency one.
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_mov")

;; qadd, qdadd, qsub and qdsub are not currently emitted, and neither are
;; media data processing instructions nor sad instructions.

;; Multiplication instructions.
;; 32x32->32 multiplies: latency 4, two issue cycles.
(define_insn_reservation "cortex_r4_mul_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "insn" "mul,smmul"))
  "cortex_r4_mul_2")

;; Halfword/dual multiplies: latency 3, single issue cycle.
(define_insn_reservation "cortex_r4_mul_3" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "insn" "smulxy,smulwy,smuad,smusd"))
  "cortex_r4_mul")

;; Multiply-accumulate forms mirror the plain multiplies above.
(define_insn_reservation "cortex_r4_mla_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "insn" "mla,smmla"))
  "cortex_r4_mul_2")

(define_insn_reservation "cortex_r4_mla_3" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "insn" "smlaxy,smlawy,smlad,smlsd"))
  "cortex_r4_mul")

(define_insn_reservation "cortex_r4_smlald" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "insn" "smlald,smlsld"))
  "cortex_r4_mul")

;; Long multiplies (64-bit result): latency 4, two issue cycles.
(define_insn_reservation "cortex_r4_mull" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "insn" "smull,umull,umlal,umaal"))
  "cortex_r4_mul_2")

;; A multiply or an MLA with a single-register result, followed by an
;; MLA with an accumulator dependency, has its result forwarded.
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3"
               "cortex_r4_mla_3,cortex_r4_mla_4"
               "arm_mac_accumulator_is_mul_result")

(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4"
               "cortex_r4_mla_3,cortex_r4_mla_4"
               "arm_mac_accumulator_is_mul_result")

;; A multiply followed by an ALU instruction needing the multiply
;; result only at ALU has lower latency than one needing it at Shift.
;; Latency-3 multiply producers: latency 2 to an ALU consumer.
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu")
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")
;; Latency-4 multiply producers: latency 3 to an ALU consumer.
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu")
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; A multiply followed by a mov has one cycle lower latency again.
(define_bypass 1 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_mov")
(define_bypass 2 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_mov")

;; We guess that division of A/B using sdiv or udiv, on average,
;; is performed with B having ten more leading zeros than A.
;; This gives a latency of nine for udiv and ten for sdiv.
(define_insn_reservation "cortex_r4_udiv" 9
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "insn" "udiv"))
  "cortex_r4_div_9")

(define_insn_reservation "cortex_r4_sdiv" 10
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "insn" "sdiv"))
  "cortex_r4_div_10")

;; Branches.  We assume correct prediction.

(define_insn_reservation "cortex_r4_branch" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "branch"))
  "cortex_r4_branch")

;; Call latencies are not predictable.  A semi-arbitrary very large
;; number is used as "positive infinity" so that everything should be
;; finished by the time of return.
;; Calls reserve no units; latency 32 stands in for "unknown but long".
(define_insn_reservation "cortex_r4_call" 32
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "call"))
  "nothing")

;; Status register access instructions are not currently emitted.

;; Load instructions.
;; We do not model the "addr_md_3cycle" cases and assume that
;; accesses following are correctly aligned.

;; Loads of one or two registers: latency 3.
(define_insn_reservation "cortex_r4_load_1_2" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "load1,load2"))
  "cortex_r4_load_store")

;; Loads of three or four registers: latency 4, two issue cycles.
(define_insn_reservation "cortex_r4_load_3_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "load3,load4"))
  "cortex_r4_load_store_2")

;; If a producing load is followed by an instruction consuming only
;; as a Normal Reg, there is one fewer cycle of latency.

(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu")
(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu")
(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; If a producing load is followed by an instruction consuming only
;; as a Late Reg, there are two fewer cycles of latency.  Such consumer
;; instructions are moves and stores.

(define_bypass 1 "cortex_r4_load_1_2"
               "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")
(define_bypass 2 "cortex_r4_load_3_4"
               "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")

;; If a producer's result is required as the base or offset of a load,
;; there is an extra cycle latency.
;; Base/offset consumers pay one cycle more than the producer's
;; nominal latency (2, 3 and 4 respectively above).
(define_bypass 3 "cortex_r4_alu,cortex_r4_mov,cortex_r4_alu_shift,\
                  cortex_r4_alu_shift_reg"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

(define_bypass 4 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

(define_bypass 5 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

;; Store instructions.

;; Stores produce no register result, hence latency 0.
(define_insn_reservation "cortex_r4_store_1_2" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "store1,store2"))
  "cortex_r4_load_store")

(define_insn_reservation "cortex_r4_store_3_4" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "store3,store4"))
  "cortex_r4_load_store_2")