;; .........................
;;
;; DFA-based pipeline description for Sandcraft SR3 (MIPS64 based)
;;
;; The SR3 is described as:
;;     - nine-stage pipeline, insn buffering with out-of-order issue to
;;       multiple function units, with an average dispatch rate of 2
;;       insn.s per cycle (max 6 insns: 2 fpu, 4 cpu).
;;
;;  The details on this are scant except for a diagram in
;;  Chap. 6 of Rev. 1.0 SR3 Spec.
;;
;;  The model employed below is designed to closely approximate the
;;  published latencies. Emulation of out-of-order issue and the insn
;;  buffering is done via a VLIW dispatch style (with a packing of 6 insns);
;;  the function unit reservation restrictions (define_*_set) are
;;  contrived to support published timings.
;;
;; Reference:
;;   "SR3 Microprocessor Specification, System development information,"
;;   Revision 1.0, 13 December 2000.
;;
;;
;; Reservation model is based on:
;;   1) Figure 6-1, from the 1.0 specification.
;;   2) Chapter 19, from the 1.0 specification.
;;   3) following questions(Red Hat)/answers(Sandcraft):
;;     RH> From Section 19.1
;;     RH>      1) In terms of figure 6-1, are all the instructions in
;;     RH>         table 19-1 restricted to ALUx?  When ALUx is not in use
;;     RH>         for an instruction in table 19-1 is it fully compatible
;;     RH>         with all insns that issue to ALUy?
;;
;;     Yes, all the instructions in Table 19-1 only go to ALUX, and all the
;;     instructions that can be issued to ALUY can also be issued to ALUX.
;;
;;
;;     RH> From Section 19.2
;;     RH>      2) Explain conditional moves execution path (in terms of
;;     RH>      figure 6-1)
;;
;;     Conditional move of integer registers (based on floating point condition
;;     codes or integer register value) go to ALUX or ALUY.
;;
;;     RH>      3) Explain floating point store execution path (in terms of
;;     RH>      figure 6-1)
;;
;;     Floating point stores go to Ld/St and go to MOV in the floating point
;;     pipeline.
;;
;;     Floating point loads go to Ld/St and go to LOAD in the floating point
;;     pipeline.
;;
;;     RH>      4) Explain branch on floating condition (in terms of
;;     RH>      figure 6-1)
;;
;;     Branch on floating condition go to BRU.
;;
;;     RH>      5) Is the column for single RECIP instruction latency correct?
;;     RH>      What about for RSQRT single and double?
;;
;;     The latency/repeat for RECIP and RSQRT are correct.
;;

;;
;; Use six automata to isolate long latency operations, and to
;; reduce the complexity of cpu+fpu, reducing space.
;;
(define_automaton "sr71_cpu, sr71_cpu1, sr71_cp1, sr71_cp2, sr71_fextra, sr71_imacc")

;; feeders for CPU function units and feeders for fpu (CP1 interface)
;; All six issue slots live in the "sr71_cpu" automaton.
(define_cpu_unit "sr_iss0,sr_iss1,sr_iss2,sr_iss3,sr_iss4,sr_iss5" "sr71_cpu")

;; CPU function units
(define_cpu_unit "ipu_bru"       "sr71_cpu1")
(define_cpu_unit "ipu_alux"      "sr71_cpu1")
(define_cpu_unit "ipu_aluy"      "sr71_cpu1")
(define_cpu_unit "ipu_ldst"      "sr71_cpu1")
;; The multiply/divide iteration unit gets its own automaton ("sr71_imacc").
(define_cpu_unit "ipu_macc_iter" "sr71_imacc")


;; Floating-point unit (Co-processor interface 1).
(define_cpu_unit "fpu_mov"       "sr71_cp1")
(define_cpu_unit "fpu_load"      "sr71_cp1")
(define_cpu_unit "fpu_fpu"       "sr71_cp2")

;; fictitious unit to track long float insns with separate automaton
(define_cpu_unit "fpu_iter"      "sr71_fextra")


;;
;; Define common execution path (reservation) combinations
;;

;; Any one of the four issue slots sr_iss0..sr_iss3.
(define_reservation "cpu_iss" "sr_iss0|sr_iss1|sr_iss2|sr_iss3")

;; two cycles are used for instruction using the fpu as it runs
;; at half the clock speed of the cpu.  By adding an extra cycle
;; to the issue units, the default/minimum "repeat" dispatch delay is
;; accounted for all insn.s
(define_reservation "cp1_iss" "(sr_iss4*2)|(sr_iss5*2)")

;; All six issue slots reserved in the same cycle.
(define_reservation "serial_dispatch" "sr_iss0+sr_iss1+sr_iss2+sr_iss3+sr_iss4+sr_iss5")

;; Simulate a 6 insn VLIW dispatch, 1 cycle in dispatch followed by
;; reservation of function unit.
;; Integer-side paths: one cycle in a CPU issue slot, then the named
;; function unit on the following cycle ("," advances one cycle,
;; "|" selects an alternative unit).
(define_reservation "ri_insns"  "cpu_iss,(ipu_alux|ipu_aluy)")
(define_reservation "ri_mem"    "cpu_iss,ipu_ldst")
(define_reservation "ri_alux"   "cpu_iss,ipu_alux")
(define_reservation "ri_branch" "cpu_iss,ipu_bru")

;; Floating-point paths: the CP1 issue reservation, then the FPU unit.
(define_reservation "rf_insn"   "cp1_iss,fpu_fpu")
(define_reservation "rf_ldmem"  "cp1_iss,fpu_load")

;; simultaneous reservation of pseudo-unit keeps cp1 fpu tied
;; up until long cycle insn is finished...
(define_reservation "rf_multi1" "rf_insn+fpu_iter")

;;
;; The ordering of the instruction-execution-path/resource-usage
;; descriptions (also known as reservation RTL) is roughly ordered
;; based on the define attribute RTL for the "type" classification.
;; When modifying, remember that the first test that matches is the
;; reservation used!
;;


;; Unknown insns reserve every issue slot (serial_dispatch).
(define_insn_reservation "ir_sr70_unknown"
  1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "unknown"))
  "serial_dispatch")


;; Assume prediction fails.
(define_insn_reservation "ir_sr70_branch"
  6
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "branch,jump,call"))
  "ri_branch")

;; Loads whose mode is not SF, DF or FPSW.
(define_insn_reservation "ir_sr70_load"
  2
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "load")
            (eq_attr "mode" "!SF,DF,FPSW")))
  "ri_mem")

;; Stores whose mode is not SF, DF or FPSW.
(define_insn_reservation "ir_sr70_store"
  1
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "store")
            (eq_attr "mode" "!SF,DF,FPSW")))
  "ri_mem")


;;
;; float loads/stores flow through both cpu and cp1...
;;
;; SF/DF loads: a CPU and a CP1 issue reservation together, then the
;; integer memory path together with the CP1 load path.
(define_insn_reservation "ir_sr70_fload"
  9
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "load")
            (eq_attr "mode" "SF,DF")))
  "(cpu_iss+cp1_iss),(ri_mem+rf_ldmem)")

;; SF/DF stores: same dual issue, then fpu_mov together with the
;; integer memory path.
(define_insn_reservation "ir_sr70_fstore"
  1
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "store")
            (eq_attr "mode" "SF,DF")))
  "(cpu_iss+cp1_iss),(fpu_mov+ri_mem)")


;; This reservation is for conditional move based on integer
;; or floating point CC.
(define_insn_reservation "ir_sr70_condmove"
  4
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "condmove"))
  "ri_insns")

;; Try to discriminate move-from-cp1 versus move-to-cp1 as latencies
;; are different.  Like float load/store, these insns use multiple
;; resources simultaneously
(define_insn_reservation "ir_sr70_xfer_from"
  6
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "xfer")
            (eq_attr "mode" "!SF,DF,FPSW")))
  "(cpu_iss+cp1_iss),(fpu_mov+ri_mem)")

(define_insn_reservation "ir_sr70_xfer_to"
  9
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "xfer")
            (eq_attr "mode" "SF,DF")))
  "(cpu_iss+cp1_iss),(ri_mem+rf_ldmem)")

(define_insn_reservation "ir_sr70_hilo"
  1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "hilo"))
  "ri_insns")

(define_insn_reservation "ir_sr70_arith"
  1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "move,arith,darith,const"))
  "ri_insns")

;; emulate repeat (dispatch stall) by spending extra cycle(s) in
;; iter unit
(define_insn_reservation "ir_sr70_imul_si"
  4
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "imul,imadd")
            (eq_attr "mode" "SI")))
  "ri_alux,ipu_alux,ipu_macc_iter")

(define_insn_reservation "ir_sr70_imul_di"
  6
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "imul,imadd")
            (eq_attr "mode" "DI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*3)")

;; Divide algorithm is early out with best latency of 7 pcycles.
;; Use worst case for scheduling purposes.
(define_insn_reservation "ir_sr70_idiv_si"
  41
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*38)")

(define_insn_reservation "ir_sr70_idiv_di"
  73
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*70)")

(define_insn_reservation "ir_sr70_icmp"
  1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "icmp"))
  "ri_insns")

;; extra reservations of fpu_fpu are for repeat latency
(define_insn_reservation "ir_sr70_fadd_sf"
  8
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fadd")
            (eq_attr "mode" "SF")))
  "rf_insn,fpu_fpu")

(define_insn_reservation "ir_sr70_fadd_df"
  10
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fadd")
            (eq_attr "mode" "DF")))
  "rf_insn,fpu_fpu")

;; Latencies for MADD,MSUB, NMADD, NMSUB assume the Multiply is fused
;; with the sub or add.
(define_insn_reservation "ir_sr70_fmul_sf"
  8
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fmul,fmadd")
            (eq_attr "mode" "SF")))
  "rf_insn,fpu_fpu")

;; tie up the fpu unit to emulate the balance for the "repeat
;; rate" of 8 (2 are spent in the iss unit)
(define_insn_reservation "ir_sr70_fmul_df"
  16
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fmul,fmadd")
            (eq_attr "mode" "DF")))
  "rf_insn,fpu_fpu*6")


;; RECIP insn uses same type attr as div, and for SR3, has same
;; timings for double.  However, single RECIP has a latency of
;; 28 -- only way to fix this is to introduce new insn attrs.
;; cycles spent in iter unit are designed to satisfy balance
;; of "repeat" latency after insn uses up rf_multi1 reservation
(define_insn_reservation "ir_sr70_fdiv_sf"
  60
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fdiv")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*51)")

(define_insn_reservation "ir_sr70_fdiv_df"
  120
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fdiv")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*109)")

(define_insn_reservation "ir_sr70_fabs"
  4
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fabs,fneg"))
  "rf_insn,fpu_fpu")

(define_insn_reservation "ir_sr70_fcmp"
  10
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fcmp"))
  "rf_insn,fpu_fpu")

;; "fcvt" type attribute covers a number of diff insns, most have the same
;; latency descriptions, a few vary.  We use the
;; most common timing (which is also worst case).
(define_insn_reservation "ir_sr70_fcvt"
  12
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fcvt"))
  "rf_insn,fpu_fpu*4")

;; Square root and reciprocal square root use the same shape as the
;; divides: rf_multi1 plus a run of cycles in the fpu_iter pseudo-unit.
(define_insn_reservation "ir_sr70_fsqrt_sf"
  62
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*53)")

(define_insn_reservation "ir_sr70_fsqrt_df"
  122
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*111)")

(define_insn_reservation "ir_sr70_frsqrt_sf"
  48
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*39)")

(define_insn_reservation "ir_sr70_frsqrt_df"
  240
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*229)")

;; "multi" insns reserve every issue slot (serial_dispatch).
(define_insn_reservation "ir_sr70_multi"
  1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "multi"))
  "serial_dispatch")

(define_insn_reservation "ir_sr70_nop"
  1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "nop"))
  "ri_insns")