;; ARM 1026EJ-S Pipeline Description
;; Copyright (C) 2003 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING.  If not, write to the Free
;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
;; 02110-1301, USA.

;; These descriptions are based on the information contained in the
;; ARM1026EJ-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;

;; This automaton provides a pipeline description for the ARM
;; 1026EJ-S core.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.
(define_automaton "arm1026ejs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Two pipelines are described:
;;
;; - The Arithmetic Logic Unit (ALU) pipeline, whose stages are fetch,
;;   issue, decode, execute, memory, and write.  Only the execute,
;;   memory and write stages need to be modeled.
;;
;; - The Load-Store Unit (LSU) pipeline, whose stages are decode,
;;   execute, memory, and write.  Only the execute, memory and write
;;   stages are modeled.

;; One unit per modeled ALU pipeline stage.
(define_cpu_unit "a_e,a_m,a_w" "arm1026ejs")
;; One unit per modeled LSU pipeline stage.
(define_cpu_unit "l_e,l_m,l_w" "arm1026ejs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; An ALU instruction takes three cycles to execute and occupies the
;; ALU pipeline in each of its three stages.  Its result is available
;; once the execute stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modeled here.
;; ALU operations whose operands are not shifted.
(define_insn_reservation "alu_op" 1
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "alu"))
  "a_e,a_m,a_w")

;; ALU operations with an operand shifted by a constant.
(define_insn_reservation "alu_shift_op" 1
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "alu_shift"))
  "a_e,a_m,a_w")

;; ALU operations with an operand shifted by a register.
;; These really stall in the decoder so that the shift value can be
;; read in a second cycle.  The decoder is not modeled, so pretend
;; instead that the instruction spends two cycles in the execute stage.
(define_insn_reservation "alu_shift_reg_op" 2
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "alu_shift_reg"))
  "a_e*2,a_m,a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; A multiplication instruction loops in the execute stage until it
;; has been passed through the multiplier array enough times.

;; The result of the "smul" and "smulw" instructions is not available
;; until after the memory stage.
;; Single pass through the execute stage; latency of two cycles.
(define_insn_reservation "mult1" 2
 (and (eq_attr "tune" "arm1026ejs")
      (eq_attr "insn" "smulxy,smulwy"))
 "a_e,a_m,a_w")

;; The "smlaxy" and "smlawx" instructions require two iterations through
;; the execute stage; the result is available immediately following
;; the execute stage.
;;
;; "smlalxy" is deliberately NOT listed here: its result is not
;; available until after the memory stage, so it belongs to "mult3"
;; below.  Listing it in both reservations makes the match ambiguous
;; and would let the shorter latency of this reservation apply to it.
(define_insn_reservation "mult2" 2
 (and (eq_attr "tune" "arm1026ejs")
      (eq_attr "insn" "smlaxy,smlawx"))
 "a_e*2,a_m,a_w")

;; The "smlalxy", "mul", and "mla" instructions require two iterations
;; through the execute stage; the result is not available until after
;; the memory stage.
(define_insn_reservation "mult3" 3
 (and (eq_attr "tune" "arm1026ejs")
      (eq_attr "insn" "smlalxy,mul,mla"))
 "a_e*2,a_m,a_w")

;; The "muls" and "mlas" instructions loop in the execute stage for
;; four iterations in order to set the flags.  The value result is
;; available after three iterations.
(define_insn_reservation "mult4" 3
 (and (eq_attr "tune" "arm1026ejs")
      (eq_attr "insn" "muls,mlas"))
 "a_e*4,a_m,a_w")

;; Long multiply instructions that produce two registers of
;; output (such as umull) make their results available in two cycles;
;; the least significant word is available before the most significant
;; word.  That fact is not modeled; instead, the instructions are
;; described as if the entire result was available at the end of the
;; cycle in which both words are available.

;; The "umull", "umlal", "smull", and "smlal" instructions all take
;; three iterations through the execute cycle, and make their results
;; available after the memory cycle.
(define_insn_reservation "mult5" 4
 (and (eq_attr "tune" "arm1026ejs")
      (eq_attr "insn" "umull,umlal,smull,smlal"))
 "a_e*3,a_m,a_w")

;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in
;; the execute stage for five iterations in order to set the flags.
;; The value result is available after four iterations.
(define_insn_reservation "mult6" 4
 (and (eq_attr "tune" "arm1026ejs")
      (eq_attr "insn" "umulls,umlals,smulls,smlals"))
 "a_e*5,a_m,a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback
;; (such as "ldm!").  These models assume that all memory references
;; hit in dcache.

;; LSU instructions require six cycles to execute.  They use the ALU
;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
;; three through six.
;; Loads and stores that use a scaled register offset or a scaled
;; register pre-indexed addressing mode take three cycles, EXCEPT for
;; those that are base + offset with LSL of 0 or 2, or base - offset
;; with LSL of zero.  The remainder take 1 cycle to execute.
;; For 4byte loads there is a bypass from the load stage

(define_insn_reservation "load1_op" 2
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "load_byte,load1"))
  "a_e+l_e,l_m,a_w+l_w")

(define_insn_reservation "store1_op" 0
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "store1"))
  "a_e+l_e,l_m,a_w+l_w")

;; A store that immediately follows a load can store that load's result.
(define_bypass 1 "load1_op" "store1_op" "arm_no_early_store_addr_dep")

;; For an LDM/STM operation, the LSU pipeline iterates until every
;; register has been processed.
;;
;; How long the data takes to load depends on whether the base address
;; is 64-bit aligned; if it is not, one additional cycle is required.
;; This model assumes that the address is always 64-bit aligned.
;; Because the processor can load two registers per cycle, that
;; assumption means the same instruction reservations are used for
;; loading 2k and 2k - 1 registers.
;;
;; The ALU pipeline is stalled until the last memory stage in the LSU
;; pipeline completes.  That is modeled by keeping the ALU execute
;; stage busy until that point.
;;
;; As with ALU operations, there are additional stalls if one of the
;; destination registers is the PC; that is not modeled.

(define_insn_reservation "load2_op" 2
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "load2"))
  "a_e+l_e,l_m,a_w+l_w")

(define_insn_reservation "store2_op" 0
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "store2"))
  "a_e+l_e,l_m,a_w+l_w")

(define_insn_reservation "load34_op" 3
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "load3,load4"))
  "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")

(define_insn_reservation "store34_op" 0
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "store3,store4"))
  "a_e+l_e,a_e+l_e+l_m,a_e+l_m,a_w+l_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch and Call Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Branch instructions are difficult to model accurately.  The ARM
;; core can predict most branches.  A branch that is predicted
;; correctly, and early enough, can be eliminated from the instruction
;; stream entirely, so some branches can appear to require zero cycles
;; to execute.  We assume that all branches are predicted correctly,
;; and that the latency is therefore the minimum value.

(define_insn_reservation "branch_op" 0
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "branch"))
  "nothing")

;; The latency for a call is not predictable.  Therefore, we use 32 as
;; roughly equivalent to positive infinity.

(define_insn_reservation "call_op" 32
  (and (eq_attr "tune" "arm1026ejs") (eq_attr "type" "call"))
  "nothing")