;; ARM 1136J[F]-S Pipeline Description
;; Copyright (C) 2003-2014 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.  */

;; These descriptions are based on the information contained in the
;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;

;; This automaton provides a pipeline description for the ARM
;; 1136J-S and 1136JF-S cores.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.

;; Single automaton to which every CPU unit declared in this file belongs.
(define_automaton "arm1136jfs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; There are three distinct execution pipelines, fed by a common decode
;; pipeline (page 1-26 and following):
;;
;; - A 4-stage decode pipeline, shared by all three.  It has fetch (1),
;;   fetch (2), decode, and issue stages.  Since this is always involved,
;;   we do not model it in the scheduler.
;;
;; - A 4-stage ALU pipeline.  It has shifter, ALU (main integer operations),
;;   and saturation stages.  The fourth stage is writeback; see below.
;;
;; - A 4-stage multiply-accumulate pipeline.  It has three stages, called
;;   MAC1 through MAC3, and a fourth writeback stage.
;;
;;   The 4th-stage writeback is shared between the ALU and MAC pipelines,
;;   which operate in lockstep.  Results from either pipeline will be
;;   moved into the writeback stage.  Because the two pipelines operate
;;   in lockstep, we schedule them as a single "execute" pipeline.
;;
;; - A 4-stage LSU pipeline.  It has address generation, data cache (1),
;;   data cache (2), and writeback stages.  (Note that this pipeline,
;;   including the writeback stage, is independent from the ALU & MAC pipes.)

;; Units of the combined ALU/MAC "execute" pipeline, one per stage.
(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs")     ; ALU and MAC
; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3
;; Units of the load/store pipeline, one per stage; the names follow the
;; stage list above: l_a = address generation, l_dc1 = data cache (1),
;; l_dc2 = data cache (2), l_wb = LSU writeback.
(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ALU instructions require eight cycles to execute, and use the ALU
;; pipeline in each of the eight stages.  The results are available
;; after the alu stage has finished.
;; NOTE(review): "eight" presumably counts the four decode stages (not
;; modelled) plus the four execute stages; the ALU reservations in this
;; file only occupy the four units e_1, e_2, e_3 and e_wb -- confirm
;; against the TRM.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modelled here.

;; Reading the constructs below:
;;   (define_insn_reservation NAME LATENCY CONDITION RESERVATION)
;;     insns matching CONDITION have default result latency LATENCY and
;;     occupy the units listed in RESERVATION ("," = next cycle,
;;     "u*N" = unit u for N consecutive cycles).
;;   (define_bypass LATENCY PRODUCERS CONSUMERS [GUARD])
;;     overrides the default latency between the named reservations; when
;;     GUARD is given the override applies only if that function returns
;;     nonzero for the insn pair.

;; ALU operations with no shifted operand
(define_insn_reservation "11_alu_op" 2
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
                       alu_reg,alus_reg,logic_reg,logics_reg,\
                       adc_imm,adcs_imm,adc_reg,adcs_reg,\
                       adr,bfm,rev,\
                       shift_imm,shift_reg,\
                       mov_imm,mov_reg,mvn_imm,mvn_reg,\
                       multiple,no_insn"))
 "e_1,e_2,e_3,e_wb")

;; ALU operations with a shift-by-constant operand
;; (same latency and reservation as 11_alu_op; kept separate so that
;; bypasses can treat the shifted operand differently)
(define_insn_reservation "11_alu_shift_op" 2
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_shift_imm,alus_shift_imm,\
                       logic_shift_imm,logics_shift_imm,\
                       extend,mov_shift,mvn_shift"))
 "e_1,e_2,e_3,e_wb")

;; ALU operations with a shift-by-register operand
;; These really stall in the decoder, in order to read
;; the shift value in a second cycle.  Pretend we take two cycles in
;; the shift stage (hence "e_1*2" and a latency one higher than above).
(define_insn_reservation "11_alu_shift_reg_op" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_shift_reg,alus_shift_reg,\
                       logic_shift_reg,logics_shift_reg,\
                       mov_shift_reg,mvn_shift_reg"))
 "e_1*2,e_2,e_3,e_wb")

;; alu_ops can start sooner, if there is no shifter dependency
;; Producers with default latency 2 are bypassed to 1 ...
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_op")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
;; ... and the shift-by-register producer (default latency 3) to 2.
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_op")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")

;; The same one-cycle reduction applies when the consumer is a multiply,
;; guarded by arm_no_early_mul_dep.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the first two execute stages until
;; the instruction has been passed through the multiplier array enough
;; times.

;; Multiply and multiply-accumulate results are available after four stages.
(define_insn_reservation "11_mult1" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "mul,mla"))
 "e_1*2,e_2,e_3,e_wb")

;; The *S variants set the condition flags, which requires three more cycles.
;; NOTE(review): the latency (4) and the reservation string here are
;; identical to 11_mult1, so the extra cycles mentioned above are not
;; reflected in the model -- confirm against the TRM whether that is
;; intentional.
(define_insn_reservation "11_mult2" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "muls,mlas"))
 "e_1*2,e_2,e_3,e_wb")

;; Results of 11_mult1/11_mult2 reach the consumers below one cycle
;; earlier (3 instead of 4); the guarded forms apply only when the named
;; guard function accepts the insn pair.
(define_bypass 3 "11_mult1,11_mult2"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_op")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_store1"
               "arm_no_early_store_addr_dep")

;; Signed and unsigned multiply long results are available across two cycles;
;; the less significant word is available one cycle before the more significant
;; word.  Here we conservatively wait until both are available, which is
;; after three iterations and the memory cycle.  The same is also true of
;; the two multiply-accumulate instructions.
;; Long multiplies and long multiply-accumulates: three cycles in e_1 and
;; two in the shared writeback unit, default latency 5.
(define_insn_reservation "11_mult3" 5
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "smull,umull,smlal,umlal"))
 "e_1*3,e_2,e_3,e_wb*2")

;; The *S variants set the condition flags, which requires three more cycles.
;; NOTE(review): the latency (5) and the reservation string here are
;; identical to 11_mult3, so the extra cycles mentioned above are not
;; reflected in the model -- confirm against the TRM whether that is
;; intentional.
(define_insn_reservation "11_mult4" 5
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "smulls,umulls,smlals,umlals"))
 "e_1*3,e_2,e_3,e_wb*2")

;; Results of 11_mult3/11_mult4 reach the consumers below one cycle
;; earlier (4 instead of 5); a trailing function name is a guard that must
;; return nonzero for the bypass to apply.
(define_bypass 4 "11_mult3,11_mult4"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_op")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_store1"
               "arm_no_early_store_addr_dep")

;; Various 16x16->32 multiplies and multiply-accumulates, using combinations
;; of high and low halves of the argument registers.  They take a single
;; pass through the pipeline and make the result available after three
;; cycles.
(define_insn_reservation "11_mult5" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,\
                       smusd,smusdx,smlsd,smlsdx"))
 "e_1,e_2,e_3,e_wb")

;; Results of 11_mult5 reach the consumers below after 2 cycles instead
;; of the default 3.
(define_bypass 2 "11_mult5"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 2 "11_mult5"
               "11_alu_op")
(define_bypass 2 "11_mult5"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_mult5"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_mult5"
               "11_store1"
               "arm_no_early_store_addr_dep")

;; The same idea, then the 32-bit result is added to a 64-bit quantity.
;; smlalxy: two cycles in e_1 and two in the shared writeback unit.
(define_insn_reservation "11_mult6" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "smlalxy"))
 "e_1*2,e_2,e_3,e_wb*2")

;; Signed 32x32 multiply, then the most significant 32 bits are extracted
;; and are available after the memory stage.
(define_insn_reservation "11_mult7" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "smmul,smmulr"))
 "e_1*2,e_2,e_3,e_wb")

;; Results of 11_mult6/11_mult7 reach the consumers below after 3 cycles
;; instead of the default 4; a trailing function name is a guard that
;; must return nonzero for the bypass to apply.
(define_bypass 3 "11_mult6,11_mult7"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_op")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_store1"
               "arm_no_early_store_addr_dep")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; These vary greatly depending on their arguments and the results of
;; branch prediction.  Cycle count ranges from zero (unconditional branch,
;; folded dynamic prediction) to seven (incorrect predictions, etc).  We
;; assume an optimal case for now, because the cost of a cache miss
;; overwhelms the cost of everything else anyhow.

;; Latency 0 and no unit reserved ("nothing"): the optimal case.
(define_insn_reservation "11_branches" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "branch"))
 "nothing")

;; Call latencies are not predictable.  A semi-arbitrary very large
;; number is used as "positive infinity" so that everything should be
;; finished by the time of return.
(define_insn_reservation "11_call" 32
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "call"))
 "nothing")

;; Branches are predicted.  A correctly predicted branch will be no
;; cost, but we're conservative here, and use the timings a
;; late-register would give us.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_branches")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_branches")
(define_bypass 2 "11_load1,11_load2"
               "11_branches")
(define_bypass 3 "11_load34"
               "11_branches")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; the difference between operations with and without a base register
;; writeback.  These models assume that all memory references hit in
;; dcache.  Also, if the PC is one of the registers involved, there are
;; additional stalls not modelled here.  Addressing modes are also not
;; modelled.
;;
;; In every load/store reservation the first cycle is "l_a+e_1": the
;; address-generation unit and the first execute unit are reserved
;; together, so e_1 is unavailable to ALU/MAC insns in that cycle.

(define_insn_reservation "11_load1" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load1"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load byte results are not available until the writeback stage, where
;; the correct byte is extracted.

(define_insn_reservation "11_loadb" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load_byte"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Stores are given latency 0; they walk the same four LSU units as a
;; single-word load.
(define_insn_reservation "11_store1" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store1"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load/store double words into adjacent registers.  The timing and
;; latencies are different depending on whether the address is 64-bit
;; aligned.  This model assumes that it is.
;; Double-word load: latency 3, one cycle in each LSU unit (e_1 is
;; reserved together with l_a in the first cycle).
(define_insn_reservation "11_load2" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load2"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Double-word store: latency 0, same unit usage as 11_load2.
(define_insn_reservation "11_store2" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store2"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load/store multiple registers.  Two registers are stored per cycle.
;; Actual timing depends on how many registers are affected, so we
;; optimistically schedule a low latency.
;; (Both reservations hold l_dc1 for two cycles: "l_dc1*2".)
(define_insn_reservation "11_load34" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load3,load4"))
 "l_a+e_1,l_dc1*2,l_dc2,l_wb")

(define_insn_reservation "11_store34" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store3,store4"))
 "l_a+e_1,l_dc1*2,l_dc2,l_wb")

;; A store can start immediately after an alu op, if that alu op does
;; not provide part of the address to access.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_store1"
               "arm_no_early_store_addr_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_store1"
               "arm_no_early_store_addr_dep")

;; An alu op can start sooner after a load, if that alu op does not
;; have an early register dependency on the load
;; NOTE(review): none of the bypasses in this group names 11_load2 as a
;; producer, so its default latency applies to ALU consumers -- confirm
;; that this is intended.
(define_bypass 2 "11_load1"
               "11_alu_op")
(define_bypass 2 "11_load1"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_load1"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")

(define_bypass 3 "11_loadb"
               "11_alu_op")
(define_bypass 3 "11_loadb"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_loadb"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")

;; A mul op can start sooner after a load, if that mul op does not
;; have an early multiply dependency
(define_bypass 2 "11_load1"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
;; Load-to-multiply bypasses for the 4-cycle loads: latency 3 instead of
;; the default 4 when the guard arm_no_early_mul_dep returns nonzero.
(define_bypass 3 "11_load34"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_loadb"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")

;; A store can start sooner after a load, if that load does not
;; produce part of the address to access
(define_bypass 2 "11_load1"
               "11_store1"
               "arm_no_early_store_addr_dep")
(define_bypass 3 "11_loadb"
               "11_store1"
               "arm_no_early_store_addr_dep")