1;; Faraday FA726TE Pipeline Description
2;; Copyright (C) 2010-2018 Free Software Foundation, Inc.
3;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
4;;
5;; This file is part of GCC.
6;;
7;; GCC is free software; you can redistribute it and/or modify it under
8;; the terms of the GNU General Public License as published by the Free
9;; Software Foundation; either version 3, or (at your option) any later
10;; version.
11;;
12;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
14;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15;; for more details.
16;;
17;; You should have received a copy of the GNU General Public License
18;; along with GCC; see the file COPYING3.  If not see
19;; <http://www.gnu.org/licenses/>.  */
20
21;; These descriptions are based on the information contained in the
22;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
23
24;; This automaton provides a pipeline description for the Faraday
25;; FA726TE core.
26;;
27;; The model given here assumes that the condition for all conditional
28;; instructions is "true", i.e., that all of the instructions are
29;; actually executed.
30
;; Declare the automaton named "fa726te"; every unit declared below is
;; placed in it.
31(define_automaton "fa726te")
32
33;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
34;; Pipelines
35;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
36
37;;   The ALU pipeline has fetch, decode, execute, memory, and
38;;   write stages.  We only need to model the execute, memory and write
39;;   stages.
40
41;;	E1	E2	E3	E4	E5	WB
42;;______________________________________________________
43;;
44;;      <-------------- LD/ST ----------->
45;;    shifter + LU      <-- AU -->
46;;      <-- AU -->     shifter + LU    CPSR     (Pipe 0)
47;;______________________________________________________
48;;
49;;      <---------- MUL --------->
50;;    shifter + LU      <-- AU -->
51;;      <-- AU -->     shifter + LU    CPSR     (Pipe 1)
52
53
;; Execution resources of the "fa726te" automaton: two ALU pipes, one
;; multiply-accumulate pipe, and two units for the load/store pipe.
;; NOTE(review): the _e/_w suffixes presumably name the execute and
;; write-back ends of the LSU pipe -- confirm against the design note.
54(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
55(define_cpu_unit "fa726te_mac_pipe" "fa726te")
56(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")
57
58;; Pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
59;; improve code quality.
;; These units are declared with define_query_cpu_unit rather than
;; define_cpu_unit.  Among the reservations in this description they are
;; used only by the second alternative of 726te_load1_op.
60(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
;; The two issue slots; the fa726te_issue and fa726te_blockage
;; reservations are built from them.
61(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")
62
;; Reserve one issue slot: either fa726te_is0 or fa726te_is1.
63(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
64;; Reservation to restrict issue to 1.
;; It takes both issue slots in the same cycle, so nothing that needs
;; fa726te_issue can be issued alongside an insn using it.
65(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
66
67;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
68;; ALU Instructions
69;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
70
71;; ALU instructions require three cycles to execute, and use the ALU
72;; pipeline in each of the three stages.  The results are available
73;; after the execute stage has finished.
74;;
75;; If the destination register is the PC, the pipelines are stalled
76;; for several cycles.  That case is not modeled here.
77
78;; Move instructions.
;; Matches the mov_* and mvn_* types listed below when tuning for fa726te.
;; Default latency is 1 cycle.  The insn takes one issue slot together
;; with either ALU pipe, for a single cycle.
79(define_insn_reservation "726te_shift_op" 1
80  (and (eq_attr "tune" "fa726te")
81       (eq_attr "type" "mov_imm,mov_reg,mov_shift,mov_shift_reg,\
82                        mvn_imm,mvn_reg,mvn_shift,mvn_shift_reg"))
83  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
84
85;; ALU operations with no shifted operand will finish in 1 cycle.
86;; Other ALU instructions 2 cycles.
;; NOTE(review): the shifted-operand reservations (726te_alu_shift_op and
;; 726te_alu_shift_reg_op) declare a default latency of 3, not 2 -- confirm
;; which figure is intended.
;;
;; This reservation also matches adr, bfm, rev, shift_imm, shift_reg, mrs,
;; multiple and no_insn.  Default latency is 1 cycle; the insn takes one
;; issue slot together with either ALU pipe, for a single cycle.
87(define_insn_reservation "726te_alu_op" 1
88 (and (eq_attr "tune" "fa726te")
89      (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
90                       alu_sreg,alus_sreg,logic_reg,logics_reg,\
91                       adc_imm,adcs_imm,adc_reg,adcs_reg,\
92                       adr,bfm,rev,\
93                       shift_imm,shift_reg,\
94                       mrs,multiple,no_insn"))
95  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
96
97;; ALU operations with a shift-by-register operand.
98;; These really stall in the decoder, in order to read the shift value
99;; in the first cycle.  If the instruction uses both shifter and AU,
100;; it takes 3 cycles.
;; NOTE(review): despite the "shift-by-register" heading just above, the
;; types matched here are "extend" and the shift-by-immediate forms; the
;; shift-by-register forms are matched by 726te_alu_shift_reg_op.  Confirm
;; where that heading belongs.
;; Default latency is 3 cycles; the insn takes one issue slot together
;; with either ALU pipe, for a single cycle.
101(define_insn_reservation "726te_alu_shift_op" 3
102 (and (eq_attr "tune" "fa726te")
103      (eq_attr "type" "extend,alu_shift_imm,alus_shift_imm,\
104                       logic_shift_imm,logics_shift_imm"))
105  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
106
;; ALU and logic operations whose operand is shifted by a register
;; (the *_shift_reg types).  Default latency is 3 cycles; the insn takes
;; one issue slot together with either ALU pipe, for a single cycle.
107(define_insn_reservation "726te_alu_shift_reg_op" 3
108 (and (eq_attr "tune" "fa726te")
109      (eq_attr "type" "alu_shift_reg,alus_shift_reg,\
110                       logic_shift_reg,logics_shift_reg"))
111  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
112;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
113;; Multiplication Instructions
114;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
115
116;; Multiplication instructions loop in the execute stage until the
117;; instruction has been passed through the multiplier array enough
118;; times.  Multiply operations occur in both the execute and memory
119;; stages of the pipeline
120
;; All multiply and multiply-accumulate types share this reservation.
;; Default latency is 3 cycles.  The insn takes one issue slot and the MAC
;; pipe for a single cycle only, so this model does not make a following
;; multiply wait for the MAC pipe beyond that cycle.
121(define_insn_reservation "726te_mult_op" 3
122 (and (eq_attr "tune" "fa726te")
123      (eq_attr "type" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
124                       umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
125 "fa726te_issue+fa726te_mac_pipe")
126
127;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
128;; Load/Store Instructions
129;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
130
131;; The models for load/store instructions do not accurately describe
132;; the difference between operations with a base register writeback
133;; (such as "ldm!").  These models assume that all memory references
134;; hit in dcache.
135
136;; Loads with a shifted offset take 3 cycles, and are (a) probably the
137;; most common and (b) the pessimistic assumption will lead to fewer stalls.
138
139;; Scalar loads are pipelined in FA726TE LSU pipe.
140;; Here we model the resource conflict between Load@E3-stage & Store@W-stage.
141;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the
142;; same "bundle", the 2nd load will introduce another ISSUE stall but is
143;; still ok to execute (and may be beneficial sometimes).
144
;; Single-register loads (load_4, load_byte); default latency 3 cycles.
;; The reservation has two alternatives:
;;   1. one issue slot plus both units of the first LSU, for one cycle;
;;   2. one issue slot plus both units of the second LSU (lsu1) for one
;;      cycle, followed by a cycle in which both issue slots are taken
;;      (fa726te_blockage).
145(define_insn_reservation "726te_load1_op" 3
146 (and (eq_attr "tune" "fa726te")
147      (eq_attr "type" "load_4,load_byte"))
148 "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
149  | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")
150
;; Single-register store (store_4); default latency 1 cycle.
;; The store takes both issue slots for two consecutive cycles.  It does
;; not reserve any LSU unit itself; it conflicts with loads only through
;; the issue slots.
151(define_insn_reservation "726te_store1_op" 1
152 (and (eq_attr "tune" "fa726te")
153      (eq_attr "type" "store_4"))
154 "fa726te_blockage*2")
155
156;; Load/Store Multiple blocks all pipelines in EX stages until WB.
157;; No other instructions can be issued together.  Since they essentially
158;; prevent all scheduling opportunities, we model them together here.
159
160;; The LDM is broken into multiple load instructions; later instructions in
161;; pipe 1 are stalled.
;; Loads of two or three registers (load_8, load_12): default latency is
;; 4 cycles, and both issue slots are taken for 4 consecutive cycles.
162(define_insn_reservation "726te_ldm2_op" 4
163 (and (eq_attr "tune" "fa726te")
164      (eq_attr "type" "load_8,load_12"))
165 "fa726te_blockage*4")
166
;; Loads of type load_16: default latency is 5 cycles, and both issue
;; slots are taken for 5 consecutive cycles.
167(define_insn_reservation "726te_ldm3_op" 5
168 (and (eq_attr "tune" "fa726te")
169      (eq_attr "type" "load_16"))
170 "fa726te_blockage*5")
171
;; Stores of type store_8 or store_12: default latency is 2 cycles, and
;; both issue slots are taken for 3 consecutive cycles.
172(define_insn_reservation "726te_stm2_op" 2
173 (and (eq_attr "tune" "fa726te")
174      (eq_attr "type" "store_8,store_12"))
175 "fa726te_blockage*3")
176
;; Stores of type store_16: default latency is 3 cycles, and both issue
;; slots are taken for 4 consecutive cycles.
177(define_insn_reservation "726te_stm3_op" 3
178 (and (eq_attr "tune" "fa726te")
179      (eq_attr "type" "store_16"))
180 "fa726te_blockage*4")
181
;; Bypasses.  Each one replaces the producer's default latency with the
;; given value for the listed consumers; when a guard function is named,
;; the bypass applies only to insn pairs the guard accepts.
;; NOTE(review): the arm_no_early_* guards are defined outside this file;
;; their names suggest they test for the absence of an early dependency
;; on the producer's result -- confirm against their definitions.
;;
;; Any load -> any store: latency 1, subject to the store-address guard.
182(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op" "726te_store1_op,\
183                  726te_stm2_op,726te_stm3_op" "arm_no_early_store_addr_dep")
;; Any move/ALU/multiply -> single store: latency 0, same guard.
184(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
185                 726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
186                 "arm_no_early_store_addr_dep")
;; Move/simple ALU -> move/simple ALU: latency 0, unconditional.
187(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
;; Shifted-operand ALU -> move/simple ALU: latency 1 instead of 3.
188(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
189                 "726te_shift_op,726te_alu_op")
;; Shifted-operand ALU or multiply -> shift-by-immediate ALU: latency 1
;; when arm_no_early_alu_shift_dep accepts the pair.
190(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
191                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
;; Same producers -> shift-by-register ALU: latency 1 when
;; arm_no_early_alu_shift_value_dep accepts the pair.
192(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
193                 "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
;; Multiply -> move/simple ALU: latency 1 instead of 3.
194(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")
195
;; Load -> multiply: each bypass is one cycle longer than the producing
;; load's default latency (3 -> 4, 4 -> 5, 5 -> 6), so a multiply that
;; consumes a loaded value is scheduled one cycle later than other
;; consumers.
196(define_bypass 4 "726te_load1_op" "726te_mult_op")
197(define_bypass 5 "726te_ldm2_op" "726te_mult_op")
198(define_bypass 6 "726te_ldm3_op" "726te_mult_op")
199
200;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
201;; Branch and Call Instructions
202;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
203
204;; Branch instructions are difficult to model accurately.  The FA726TE
205;; core can predict most branches.  If the branch is predicted
206;; correctly, and predicted early enough, the branch can be completely
207;; eliminated from the instruction stream.  Some branches can
208;; therefore appear to require zero cycles to execute.  We assume that
209;; all branches are predicted correctly, and that the latency is
210;; therefore the minimum value.
211
;; Branches: default latency 0.  The branch takes both issue slots for one
;; cycle, so nothing that needs an issue slot is issued in the same cycle.
212(define_insn_reservation "726te_branch_op" 0
213 (and (eq_attr "tune" "fa726te")
214      (eq_attr "type" "branch"))
215 "fa726te_blockage")
216
217;; The latency for a call is actually the latency when the result is available.
218;; i.e. R0 is ready for int return value.
;; Calls: default latency 1.  The call takes both issue slots for one
;; cycle.
219(define_insn_reservation "726te_call_op" 1
220 (and (eq_attr "tune" "fa726te")
221      (eq_attr "type" "call"))
222 "fa726te_blockage")
223
224