1;; Copyright (C) 2002-2021 Free Software Foundation, Inc. 2;; 3;; This file is part of GCC. 4;; 5;; GCC is free software; you can redistribute it and/or modify 6;; it under the terms of the GNU General Public License as published by 7;; the Free Software Foundation; either version 3, or (at your option) 8;; any later version. 9;; 10;; GCC is distributed in the hope that it will be useful, 11;; but WITHOUT ANY WARRANTY; without even the implied warranty of 12;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13;; GNU General Public License for more details. 14;; 15;; You should have received a copy of the GNU General Public License 16;; along with GCC; see the file COPYING3. If not see 17;; <http://www.gnu.org/licenses/>. 18;; 19;; ......................... 20;; 21;; DFA-based pipeline description for Sandcraft SR3 (MIPS64 based) 22;; 23;; The SR3 is described as: 24;; - nine-stage pipeline, insn buffering with out-of-order issue to 25;; multiple function units, with an average dispatch rate of 2 26;; insn.s per cycle (max 6 insns: 2 fpu, 4 cpu). 27;; 28;; The details on this are scant except for a diagram in 29;; Chap. 6 of Rev. 1.0 SR3 Spec. 30;; 31;; The model employed below is designed to closely approximate the 32;; published latencies. Emulation of out-of-order issue and the insn 33;; buffering is done via a VLIW dispatch style (with a packing of 6 insns); 34;; the function unit reservations restrictions (define_*_set) are 35;; contrived to support published timings. 36;; 37;; Reference: 38;; "SR3 Microprocessor Specification, System development information," 39;; Revision 1.0, 13 December 2000. 40;; 41;; 42;; Reservation model is based on: 43;; 1) Figure 6-1, from the 1.0 specification. 44;; 2) Chapter 19, from the 1.0 specification. 45;; 3) following questions(Red Hat)/answers(Sandcraft): 46;; RH> From Section 19.1 47;; RH> 1) In terms of figure 6-1, are all the instructions in 48;; RH> table 19-1 restricted 49;; RH> to ALUx? 
;;    RH>      When ALUx is not in use for an instruction in table 19-1, is
;;    RH>      it fully compatible with all insns that issue to ALUy?
;;
;; Yes, all the instructions in Table 19-1 only go to ALUX, and all the
;; instructions that can be issued to ALUY can also be issued to ALUX.
;;
;;
;;    RH> From Section 19.2
;;    RH>  2) Explain conditional moves execution path (in terms of
;;    RH>      figure 6-1)
;;
;; Conditional move of integer registers (based on floating point condition
;; codes or integer register value) go to ALUX or ALUY.
;;
;;    RH>  3) Explain floating point store execution path (in terms of
;;    RH>      figure 6-1)
;;
;; Floating point stores go to Ld/St and go to MOV in the floating point
;; pipeline.
;;
;; Floating point loads go to Ld/St and go to LOAD in the floating point
;; pipeline.
;;
;;    RH>  4) Explain branch on floating condition (in terms of figure 6-1)
;;
;; Branch on floating condition go to BRU.
;;
;;    RH>  5) Is the column for single RECIP instruction latency correct?
;;    RH>      What about for RSQRT single and double?
;;
;; The latency/repeat for RECIP and RSQRT are correct.
;;

;;
;; Six automata are declared below.  Splitting the units across several
;; automata isolates the long latency operations and reduces the
;; complexity (and therefore the space) of a combined cpu+fpu automaton.
;;
(define_automaton "sr71_cpu, sr71_cpu1, sr71_cp1, sr71_cp2, sr71_fextra, sr71_imacc")

;; Issue slots: feeders for the CPU function units and feeders for the
;; fpu (CP1 interface).  All six live in the "sr71_cpu" automaton.
(define_cpu_unit "sr_iss0,sr_iss1,sr_iss2,sr_iss3,sr_iss4,sr_iss5" "sr71_cpu")

;; CPU function units.  The multiply/divide iteration unit is kept in
;; its own automaton ("sr71_imacc").
(define_cpu_unit "ipu_bru"       "sr71_cpu1")
(define_cpu_unit "ipu_alux"      "sr71_cpu1")
(define_cpu_unit "ipu_aluy"      "sr71_cpu1")
(define_cpu_unit "ipu_ldst"      "sr71_cpu1")
(define_cpu_unit "ipu_macc_iter" "sr71_imacc")


;; Floating-point unit (Co-processor interface 1).
(define_cpu_unit "fpu_mov"  "sr71_cp1")
(define_cpu_unit "fpu_load" "sr71_cp1")
(define_cpu_unit "fpu_fpu"  "sr71_cp2")

;; Fictitious unit, placed in a separate automaton, used to track long
;; float insns.
(define_cpu_unit "fpu_iter" "sr71_fextra")


;;
;; Common execution path (reservation) combinations.
;;

;; Any one of the four issue slots sr_iss0 .. sr_iss3.
(define_reservation "cpu_iss" "sr_iss0|sr_iss1|sr_iss2|sr_iss3")

;; Two cycles are used for instructions using the fpu, as it runs at
;; half the clock speed of the cpu.  By adding an extra cycle to the
;; issue units, the default/minimum "repeat" dispatch delay is accounted
;; for in all insns.
(define_reservation "cp1_iss" "(sr_iss4*2)|(sr_iss5*2)")

;; Every issue slot at once, so nothing else can be dispatched in the
;; same cycle.
(define_reservation "serial_dispatch" "sr_iss0+sr_iss1+sr_iss2+sr_iss3+sr_iss4+sr_iss5")

;; Simulate a 6 insn VLIW dispatch: 1 cycle in dispatch, followed by the
;; reservation of a function unit.
(define_reservation "ri_insns"  "cpu_iss,(ipu_alux|ipu_aluy)")
(define_reservation "ri_mem"    "cpu_iss,ipu_ldst")
(define_reservation "ri_alux"   "cpu_iss,ipu_alux")
(define_reservation "ri_branch" "cpu_iss,ipu_bru")

;; Floating-point counterparts: the two-cycle cp1 issue, followed by the
;; fpu or the fpu load unit.
(define_reservation "rf_insn"  "cp1_iss,fpu_fpu")
(define_reservation "rf_ldmem" "cp1_iss,fpu_load")

;; Simultaneous reservation of the pseudo-unit keeps the cp1 fpu tied
;; up until a long cycle insn is finished...
(define_reservation "rf_multi1" "rf_insn+fpu_iter")

;;
;; The instruction-execution-path/resource-usage descriptions (also
;; known as reservation RTL) are roughly ordered according to the
;; define attribute RTL for the "type" classification.
;; When modifying, remember that the first test that matches is the
;; reservation used!
;;


;; unknown, atomic and syncloop insns occupy all six issue slots.
(define_insn_reservation "ir_sr70_unknown" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "unknown,atomic,syncloop"))
  "serial_dispatch")


;; Assume prediction fails.
(define_insn_reservation "ir_sr70_branch" 6
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "branch,jump,call"))
  "ri_branch")

;; Integer loads and stores share the same reservation and differ only
;; in latency.
(define_insn_reservation "ir_sr70_load" 2
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "load"))
  "ri_mem")

(define_insn_reservation "ir_sr70_store" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "store"))
  "ri_mem")


;;
;; Float loads/stores flow through both cpu and cp1, so both an integer
;; issue slot and a cp1 issue slot are reserved at the same time.
;;
(define_insn_reservation "ir_sr70_fload" 9
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fpload,fpidxload"))
  "(cpu_iss+cp1_iss),(ri_mem+rf_ldmem)")

(define_insn_reservation "ir_sr70_fstore" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fpstore,fpidxstore"))
  "(cpu_iss+cp1_iss),(fpu_mov+ri_mem)")


;; This reservation is for conditional moves based on an integer
;; or floating point CC.
(define_insn_reservation "ir_sr70_condmove" 4
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "condmove"))
  "ri_insns")

;; Try to discriminate move-from-cp1 versus move-to-cp1 as latencies
;; are different.
;; Like float load/store, these insns use multiple
;; resources simultaneously.
(define_insn_reservation "ir_sr70_xfer_from" 6
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "mfc"))
  "(cpu_iss+cp1_iss),(fpu_mov+ri_mem)")

(define_insn_reservation "ir_sr70_xfer_to" 9
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "mtc"))
  "(cpu_iss+cp1_iss),(ri_mem+rf_ldmem)")

;; HI/LO moves and the simple integer operations below all have a
;; latency of 1 and may use either ALU.
(define_insn_reservation "ir_sr70_hilo" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "mthi,mtlo,mfhi,mflo"))
  "ri_insns")

(define_insn_reservation "ir_sr70_arith" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "arith,shift,signext,slt,clz,const,logical,move,trap"))
  "ri_insns")

;; Emulate repeat (dispatch stall) by spending extra cycle(s) in the
;; iter unit.
(define_insn_reservation "ir_sr70_imul_si" 4
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "imul,imul3,imadd")
            (eq_attr "mode" "SI")))
  "ri_alux,ipu_alux,ipu_macc_iter")

(define_insn_reservation "ir_sr70_imul_di" 6
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "imul,imul3,imadd")
            (eq_attr "mode" "DI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*3)")

;; Divide algorithm is early out with best latency of 7 pcycles.
;; Use worst case for scheduling purposes.
(define_insn_reservation "ir_sr70_idiv_si" 41
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*38)")

(define_insn_reservation "ir_sr70_idiv_di" 73
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*70)")

;; The extra reservations of fpu_fpu are for repeat latency.
(define_insn_reservation "ir_sr70_fadd_sf" 8
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fadd")
            (eq_attr "mode" "SF")))
  "rf_insn,fpu_fpu")

(define_insn_reservation "ir_sr70_fadd_df" 10
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fadd")
            (eq_attr "mode" "DF")))
  "rf_insn,fpu_fpu")

;; Latencies for MADD, MSUB, NMADD, NMSUB assume the multiply is fused
;; with the sub or add.
(define_insn_reservation "ir_sr70_fmul_sf" 8
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fmul,fmadd")
            (eq_attr "mode" "SF")))
  "rf_insn,fpu_fpu")

;; Tie up the fpu unit to emulate the balance for the "repeat
;; rate" of 8 (2 are spent in the iss unit).
(define_insn_reservation "ir_sr70_fmul_df" 16
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fmul,fmadd")
            (eq_attr "mode" "DF")))
  "rf_insn,fpu_fpu*6")


;; RECIP insn uses same type attr as div, and for SR3, has same
;; timings for double.  However, single RECIP has a latency of
;; 28 -- only way to fix this is to introduce new insn attrs.
;; Cycles spent in the iter unit are designed to satisfy the balance
;; of "repeat" latency after the insn uses up the rf_multi1 reservation.
(define_insn_reservation "ir_sr70_fdiv_sf" 60
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*51)")

(define_insn_reservation "ir_sr70_fdiv_df" 120
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*109)")

;; Short floating-point operations: cp1 issue followed by the fpu unit.
(define_insn_reservation "ir_sr70_fabs" 4
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fabs,fneg,fmove"))
  "rf_insn,fpu_fpu")

(define_insn_reservation "ir_sr70_fcmp" 10
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fcmp"))
  "rf_insn,fpu_fpu")

;; The "fcvt" type attribute covers a number of different insns; most
;; have the same latency descriptions, a few vary.  We use the most
;; common timing (which is also worst case).
(define_insn_reservation "ir_sr70_fcvt" 12
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fcvt"))
  "rf_insn,fpu_fpu*4")

;; Square root and reciprocal square root are modelled like divide:
;; rf_multi1 together with a long stay in the fpu_iter pseudo-unit.
(define_insn_reservation "ir_sr70_fsqrt_sf" 62
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*53)")

(define_insn_reservation "ir_sr70_fsqrt_df" 122
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*111)")

(define_insn_reservation "ir_sr70_frsqrt_sf" 48
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*39)")

(define_insn_reservation "ir_sr70_frsqrt_df" 240
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*229)")

;; "multi" insns occupy all six issue slots, like the unknown/atomic
;; reservation.
(define_insn_reservation "ir_sr70_multi" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "multi"))
  "serial_dispatch")

;; A nop is scheduled like a simple integer insn: one issue slot, then
;; either ALU, with a latency of 1.
(define_insn_reservation "ir_sr70_nop" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "nop"))
  "ri_insns")