;; ARM 1026EJ-S Pipeline Description
;; Copyright (C) 2003 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING.  If not, write to the Free
;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
;; 02110-1301, USA.

;; These descriptions are based on the information contained in the
;; ARM1026EJ-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.

;; This automaton provides a pipeline description for the ARM
;; 1026EJ-S core.
;;
;; The model assumes that the condition of every conditional
;; instruction is "true", i.e., that every instruction is actually
;; executed.

(define_automaton "arm1026ejs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; There are two pipelines:
;;
;; - An Arithmetic Logic Unit (ALU) pipeline.
;;
;;   The ALU pipeline has fetch, issue, decode, execute, memory, and
;;   write stages.  We only need to model the execute, memory and
;;   write stages.
;;
;; - A Load-Store Unit (LSU) pipeline.
;;
;;   The LSU pipeline has decode, execute, memory, and write stages.
;;   We only model the execute, memory and write stages.

;; Modeled stages of the ALU pipeline: execute (a_e), memory (a_m)
;; and write (a_w).
(define_cpu_unit "a_e,a_m,a_w" "arm1026ejs")

;; Modeled stages of the LSU pipeline: execute (l_e), memory (l_m)
;; and write (l_w).
(define_cpu_unit "l_e,l_m,l_w" "arm1026ejs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ALU instructions require three cycles to execute, and use the ALU
;; pipeline in each of the three stages.  The results are available
;; after the execute stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modeled here.

;; ALU operations with no shifted operand.
(define_insn_reservation "alu_op" 1
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "alu"))
  "a_e,a_m,a_w")

;; ALU operations with a shift-by-constant operand.
(define_insn_reservation "alu_shift_op" 1
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "alu_shift"))
  "a_e,a_m,a_w")

;; ALU operations with a shift-by-register operand.
;; These really stall in the decoder, in order to read the shift
;; value in a second cycle.  The decoder is not modeled, so pretend
;; instead that the instruction spends two cycles in the execute
;; stage.
(define_insn_reservation "alu_shift_reg_op" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "alu_shift_reg"))
  "a_e*2,a_m,a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the execute stage until the
;; instruction has been passed through the multiplier array enough
;; times.

;; The result of the "smul" and "smulw" instructions is not available
;; until after the memory stage.
(define_insn_reservation "mult1" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "smulxy,smulwy"))
  "a_e,a_m,a_w")

;; The "smlaxy" and "smlawx" instructions require two iterations through
;; the execute stage; the result is available immediately following
;; the execute stage.
;;
;; "smlalxy" is intentionally not matched here: it is covered by
;; "mult3" below, whose result only becomes available after the
;; memory stage.  Listing it in both reservations made their
;; conditions overlap, with conflicting latencies (2 versus 3).
(define_insn_reservation "mult2" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "smlaxy,smlawx"))
  "a_e*2,a_m,a_w")

;; The "smlalxy", "mul", and "mla" instructions require two iterations
;; through the execute stage; the result is not available until after
;; the memory stage.
(define_insn_reservation "mult3" 3
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "smlalxy,mul,mla"))
  "a_e*2,a_m,a_w")

;; The "muls" and "mlas" instructions loop in the execute stage for
;; four iterations in order to set the flags.  The value result is
;; available after three iterations.
(define_insn_reservation "mult4" 3
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "muls,mlas"))
  "a_e*4,a_m,a_w")

;; Long multiply instructions that produce two registers of
;; output (such as umull) make their results available in two cycles;
;; the least significant word is available before the most significant
;; word.  That fact is not modeled; instead, the instructions are
;; described as if the entire result was available at the end of the
;; cycle in which both words are available.

;; The "umull", "umlal", "smull", and "smlal" instructions all take
;; three iterations through the execute cycle, and make their results
;; available after the memory cycle.
(define_insn_reservation "mult5" 4
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "umull,umlal,smull,smlal"))
  "a_e*3,a_m,a_w")

;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in
;; the execute stage for five iterations in order to set the flags.
;; The value result is available after four iterations.
(define_insn_reservation "mult6" 4
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "insn" "umulls,umlals,smulls,smlals"))
  "a_e*5,a_m,a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; how operations with a base register writeback (such as "ldm!")
;; differ.  These models assume that all memory references hit in
;; dcache.

;; LSU instructions require six cycles to execute.  They use the ALU
;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
;; three through six.
;; Loads and stores which use a scaled register offset or scaled
;; register pre-indexed addressing mode take three cycles EXCEPT for
;; those that are base + offset with LSL of 0 or 2, or base - offset
;; with LSL of zero.  The remainder take 1 cycle to execute.
;; For 4-byte loads there is a bypass from the load stage.

;; Single-register loads, including byte loads.
(define_insn_reservation "load1_op" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "load_byte,load1"))
  "a_e+l_e,l_m,a_w+l_w")

;; Single-register stores; they occupy the same units as the loads
;; above but are given a latency of zero.
(define_insn_reservation "store1_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "store1"))
  "a_e+l_e,l_m,a_w+l_w")

;; A load's result can be stored by an immediately following store:
;; the load1_op -> store1_op latency drops to 1 whenever the guard
;; arm_no_early_store_addr_dep (defined outside this file) accepts
;; the instruction pair.
(define_bypass 1 "load1_op" "store1_op" "arm_no_early_store_addr_dep")

;; On a LDM/STM operation, the LSU pipeline iterates until all of the
;; registers have been processed.
;;
;; The time it takes to load the data depends on whether or not the
;; base address is 64-bit aligned; if it is not, an additional cycle
;; is required.  This model assumes that the address is always 64-bit
;; aligned.
;; Because the processor can load two registers per cycle,
;; that assumption means that we use the same instruction reservations
;; for loading 2k and 2k - 1 registers.
;;
;; The ALU pipeline is stalled until the completion of the last memory
;; stage in the LSU pipeline.  That is modeled by keeping the ALU
;; execute stage busy until that point.
;;
;; As with ALU operations, if one of the destination registers is the
;; PC, there are additional stalls; that is not modeled.

;; Two-register loads.
(define_insn_reservation "load2_op" 2
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "load2"))
  "a_e+l_e,l_m,a_w+l_w")

;; Two-register stores.
(define_insn_reservation "store2_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "store2"))
  "a_e+l_e,l_m,a_w+l_w")

;; Three- and four-register loads share one reservation; a_e is held
;; for three cycles while the LSU stages iterate.
(define_insn_reservation "load34_op" 3
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "load3,load4"))
  "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")

;; Three- and four-register stores use the same unit pattern as the
;; corresponding loads.
(define_insn_reservation "store34_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "store3,store4"))
  "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch and Call Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Branch instructions are difficult to model accurately.  The ARM
;; core can predict most branches.  If the branch is predicted
;; correctly, and predicted early enough, the branch can be completely
;; eliminated from the instruction stream.  Some branches can
;; therefore appear to require zero cycles to execute.  We assume that
;; all branches are predicted correctly, and that the latency is
;; therefore the minimum value.

(define_insn_reservation "branch_op" 0
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "branch"))
  "nothing")

;; The latency for a call is not predictable.
;; Therefore, we use 32 as roughly equivalent to positive infinity.
;; Calls reserve no functional unit ("nothing"); only the latency
;; value is used.

(define_insn_reservation "call_op" 32
  (and (eq_attr "tune" "arm1026ejs")
       (eq_attr "type" "call"))
  "nothing")