;; Faraday FA726TE Pipeline Description
;; Copyright (C) 2010-2018 Free Software Foundation, Inc.
;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it under
;; the terms of the GNU General Public License as published by the Free
;; Software Foundation; either version 3, or (at your option) any later
;; version.
;;
;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
;; for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.  */

;; These descriptions are based on the information contained in the
;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.

;; This automaton provides a pipeline description for the Faraday
;; FA726TE core.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.

;; The single automaton that every unit declared for this core is
;; attached to.
(define_automaton "fa726te")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The ALU pipeline has fetch, decode, execute, memory, and
;; write stages.  We only need to model the execute, memory and write
;; stages.
;;      E1        E2        E3        E4        E5        WB
;;______________________________________________________
;;
;;      <-------------- LD/ST ----------->
;;    shifter + LU      <-- AU -->
;;      <-- AU -->     shifter + LU    CPSR    (Pipe 0)
;;______________________________________________________
;;
;;      <---------- MUL --------->
;;    shifter + LU      <-- AU -->
;;      <-- AU -->     shifter + LU    CPSR    (Pipe 1)


;; Execution resources: two ALU pipes, one MAC pipe, and the "e" and "w"
;; parts of the load/store unit.
(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
(define_cpu_unit "fa726te_mac_pipe" "fa726te")
(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")

;; Pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
;; improve code quality.
(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")

;; The two issue slots.
(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")

;; Reservation that takes either one of the issue slots.
(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
;; Reservation to restrict issue to 1 (takes both issue slots at once).
(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ALU instructions require three cycles to execute, and use the ALU
;; pipeline in each of the three stages.  The results are available
;; after the execute stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modeled here.

;; Move instructions.
(define_insn_reservation "726te_shift_op" 1
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "mov_imm,mov_reg,mov_shift,mov_shift_reg,\
                       mvn_imm,mvn_reg,mvn_shift,mvn_shift_reg"))
 "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")

;; ALU operations with no shifted operand will finish in 1 cycle.
;; Other ALU instructions 2 cycles.
;; Latency 1; takes one issue slot plus either ALU pipe.
(define_insn_reservation "726te_alu_op" 1
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
                       alu_sreg,alus_sreg,logic_reg,logics_reg,\
                       adc_imm,adcs_imm,adc_reg,adcs_reg,\
                       adr,bfm,rev,\
                       shift_imm,shift_reg,\
                       mrs,multiple,no_insn"))
 "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")

;; ALU operations with a shift-by-register operand.
;; These really stall in the decoder, in order to read the shift value
;; in the first cycle.  If the instruction uses both shifter and AU,
;; it takes 3 cycles.

;; Extends and the shift-by-immediate ALU/logic types.
(define_insn_reservation "726te_alu_shift_op" 3
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "extend,alu_shift_imm,alus_shift_imm,\
                       logic_shift_imm,logics_shift_imm"))
 "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")

;; The shift-by-register ALU/logic types.
(define_insn_reservation "726te_alu_shift_reg_op" 3
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "alu_shift_reg,alus_shift_reg,\
                       logic_shift_reg,logics_shift_reg"))
 "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the execute stage until the
;; instruction has been passed through the multiplier array enough
;; times.
;; Multiply operations occur in both the execute and memory
;; stages of the pipeline.

;; Latency 3; takes one issue slot plus the MAC pipe.
(define_insn_reservation "726te_mult_op" 3
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
                       umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
 "fa726te_issue+fa726te_mac_pipe")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback
;; (such as "ldm!").  These models assume that all memory references
;; hit in dcache.

;; Loads with a shifted offset take 3 cycles, and are (a) probably the
;; most common and (b) the pessimistic assumption will lead to fewer stalls.

;; Scalar loads are pipelined in FA726TE LSU pipe.
;; Here we model the resource conflict between Load@E3-stage & Store@W-stage.
;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the
;; same "bundle", the 2nd load will introduce another ISSUE stall but is
;; still ok to execute (and may be beneficial sometimes).

;; Either use the first LSU, or use the second LSU and then take both
;; issue slots for the following cycle.
(define_insn_reservation "726te_load1_op" 3
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "load_4,load_byte"))
 "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
  | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")

;; A single store takes both issue slots for two cycles.
(define_insn_reservation "726te_store1_op" 1
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "store_4"))
 "fa726te_blockage*2")

;; Load/Store Multiple blocks all pipelines in EX stages until WB.
;; No other instructions can be issued together.  Since they essentially
;; prevent all scheduling opportunities, we model them together here.
;; An LDM is broken into multiple load instructions; the later instruction
;; in pipe 1 is stalled.
(define_insn_reservation "726te_ldm2_op" 4
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "load_8,load_12"))
 "fa726te_blockage*4")

(define_insn_reservation "726te_ldm3_op" 5
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "load_16"))
 "fa726te_blockage*5")

(define_insn_reservation "726te_stm2_op" 2
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "store_8,store_12"))
 "fa726te_blockage*3")

(define_insn_reservation "726te_stm3_op" 3
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "store_16"))
 "fa726te_blockage*4")

;; Bypasses.  Each form is
;;   (define_bypass LATENCY PRODUCERS CONSUMERS [GUARD])
;; and replaces the producer's default latency with LATENCY for the
;; listed consumer reservations, subject to GUARD when one is given.

;; Loads and ALU/multiply results feeding a store, when the guard
;; arm_no_early_store_addr_dep holds.
(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\
  726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep")
(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
  726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
  "arm_no_early_store_addr_dep")

;; Forwarding between ALU, shifted-ALU and multiply operations.
(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
  "726te_shift_op,726te_alu_op")
(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
  "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
  "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")

;; A load feeding a multiply costs one cycle more than the load's
;; default latency.
(define_bypass 4 "726te_load1_op" "726te_mult_op")
(define_bypass 5 "726te_ldm2_op" "726te_mult_op")
(define_bypass 6 "726te_ldm3_op" "726te_mult_op")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch and Call Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Branch instructions are difficult to model accurately.
;; The FA726TE core can predict most branches.  If the branch is predicted
;; correctly, and predicted early enough, the branch can be completely
;; eliminated from the instruction stream.  Some branches can
;; therefore appear to require zero cycles to execute.  We assume that
;; all branches are predicted correctly, and that the latency is
;; therefore the minimum value.

;; Latency 0; a branch takes both issue slots for one cycle.
(define_insn_reservation "726te_branch_op" 0
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "branch"))
 "fa726te_blockage")

;; The latency for a call is actually the latency when the result is available.
;; i.e. R0 is ready for int return value.
(define_insn_reservation "726te_call_op" 1
 (and (eq_attr "tune" "fa726te")
      (eq_attr "type" "call"))
 "fa726te_blockage")