;; .........................
;;
;; DFA-based pipeline description for Sandcraft SR3 (MIPS64 based)
;;
;; The SR3 is described as:
;;     - nine-stage pipeline, insn buffering with out-of-order issue to
;;       multiple function units, with an average dispatch rate of 2
;;       insn.s per cycle (max 6 insns: 2 fpu, 4 cpu).
;;
;;  The details on this are scant except for a diagram in
;;  Chap. 6 of Rev. 1.0 SR3 Spec.
;;
;;  The model employed below is designed to closely approximate the
;;  published latencies. Emulation of out-of-order issue and the insn
;;  buffering is done via a VLIW dispatch style (with a packing of 6 insns);
;;  the function unit reservations restrictions (define_*_set) are
;;  contrived to support published timings.
;;
;; Reference:
;;   "SR3 Microprocessor Specification, System development information,"
;;   Revision 1.0, 13 December 2000.
;;
;;
;; Reservation model is based on:
;;   1) Figure 6-1, from the 1.0 specification.
;;   2) Chapter 19, from the 1.0 specification.
;;   3) following questions(Red Hat)/answers(Sandcraft):
;;     RH> From Section 19.1
;;     RH>      1) In terms of figure 6-1, are all the instructions in
;;     RH>         table 19-1 restricted to ALUx?  When ALUx is not in use
;;     RH>         for an instruction in table 19-1 is it fully compatible
;;     RH>         with all insns that issue to ALUy?
;;
;;     Yes, all the instructions in Table 19-1 only go to ALUX, and all the
;;     instructions that can be issued to ALUY can also be issued to ALUX.
;;
;;
;;     RH> From Section 19.2
;;     RH>      2) Explain conditional moves execution path (in terms of
;;     RH>      figure 6-1)
;;
;;     Conditional move of integer registers (based on floating point condition
;;     codes or integer register value) go to ALUX or ALUY.
;;
;;     RH>      3) Explain floating point store execution path (in terms of
;;     RH>      figure 6-1)
;;
;;     Floating point stores go to Ld/St and go to MOV in the floating point
;;     pipeline.
;;
;;     Floating point loads go to Ld/St and go to LOAD in the floating point
;;     pipeline.
;;
;;     RH>      4) Explain branch on floating condition (in terms of figure 6-1)
;;
;;     Branch on floating condition go to BRU.
;;
;;     RH>      5) Is the column for single RECIP instruction latency correct?
;;     RH>      What about for RSQRT single and double?
;;
;;     The latency/repeat for RECIP and RSQRT are correct.
;;

;;
;; Use separate automata (six are declared below) to isolate long latency
;; operations, and to reduce the complexity of cpu+fpu, reducing space.
;;
(define_automaton "sr71_cpu, sr71_cpu1, sr71_cp1, sr71_cp2, sr71_fextra, sr71_imacc")

;;  feeders for CPU function units and feeders for fpu (CP1 interface)
(define_cpu_unit "sr_iss0,sr_iss1,sr_iss2,sr_iss3,sr_iss4,sr_iss5" "sr71_cpu")

;; CPU function units
(define_cpu_unit "ipu_bru"       "sr71_cpu1")
(define_cpu_unit "ipu_alux"      "sr71_cpu1")
(define_cpu_unit "ipu_aluy"      "sr71_cpu1")
(define_cpu_unit "ipu_ldst"      "sr71_cpu1")
;; Iteration unit for integer multiply/divide; it lives in its own
;; automaton (sr71_imacc).
(define_cpu_unit "ipu_macc_iter" "sr71_imacc")


;; Floating-point unit (Co-processor interface 1).
(define_cpu_unit "fpu_mov"          "sr71_cp1")
(define_cpu_unit "fpu_load"         "sr71_cp1")
(define_cpu_unit "fpu_fpu"          "sr71_cp2")

;; fictitious unit to track long float insns with separate automaton
(define_cpu_unit "fpu_iter"         "sr71_fextra")


;;
;; Define common execution path (reservation) combinations
;;

;; Integer-side issue: any one of the four cpu issue slots.
(define_reservation "cpu_iss"         "sr_iss0|sr_iss1|sr_iss2|sr_iss3")

;; two cycles are used for instruction using the fpu as it runs
;; at half the clock speed of the cpu.  By adding an extra cycle
;; to the issue units, the default/minimum "repeat" dispatch delay is
;; accounted for all insn.s
(define_reservation "cp1_iss"         "(sr_iss4*2)|(sr_iss5*2)")

;; Reserve all six issue slots in the same cycle.
(define_reservation "serial_dispatch" "sr_iss0+sr_iss1+sr_iss2+sr_iss3+sr_iss4+sr_iss5")

;; Simulate a 6 insn VLIW dispatch, 1 cycle in dispatch followed by
;; reservation of function unit.
;; Integer-side paths: one cycle in a cpu issue slot, then the function unit.
(define_reservation "ri_insns"         "cpu_iss,(ipu_alux|ipu_aluy)")
(define_reservation "ri_mem"           "cpu_iss,ipu_ldst")
(define_reservation "ri_alux"          "cpu_iss,ipu_alux")
(define_reservation "ri_branch"        "cpu_iss,ipu_bru")

;; Floating-point-side paths: cp1 issue, then the fpu or fpu load unit.
(define_reservation "rf_insn"          "cp1_iss,fpu_fpu")
(define_reservation "rf_ldmem"         "cp1_iss,fpu_load")

; simultaneous reservation of pseudo-unit keeps cp1 fpu tied
; up until long cycle insn is finished...
(define_reservation "rf_multi1"        "rf_insn+fpu_iter")

;;
;; The ordering of the instruction-execution-path/resource-usage
;; descriptions (also known as reservation RTL) is roughly ordered
;; based on the define attribute RTL for the "type" classification.
;; When modifying, remember that the first test that matches is the
;; reservation used!
;;


;; Insns of unknown type reserve every issue slot (serial_dispatch).
(define_insn_reservation "ir_sr70_unknown" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "unknown"))
  "serial_dispatch")


;; Assume prediction fails.
(define_insn_reservation "ir_sr70_branch" 6
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "branch,jump,call"))
  "ri_branch")

;; Integer loads and stores both use the Ld/St path (ri_mem).
(define_insn_reservation "ir_sr70_load" 2
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "load"))
  "ri_mem")

(define_insn_reservation "ir_sr70_store" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "store"))
  "ri_mem")


;;
;; float loads/stores flow through both cpu and cp1...
;;
(define_insn_reservation "ir_sr70_fload" 9
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fpload,fpidxload"))
  "(cpu_iss+cp1_iss),(ri_mem+rf_ldmem)")

(define_insn_reservation "ir_sr70_fstore" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fpstore,fpidxstore"))
  "(cpu_iss+cp1_iss),(fpu_mov+ri_mem)")


;; This reservation is for conditional move based on integer
;; or floating point CC.
;; Conditional moves issue to either integer ALU (ri_insns).
(define_insn_reservation "ir_sr70_condmove" 4
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "condmove"))
  "ri_insns")

;; Try to discriminate move-from-cp1 versus move-to-cp1 as latencies
;; are different.  Like float load/store, these insns use multiple
;; resources simultaneously
;; The leading "!" negates the whole list: this matches any mode that
;; is not SF, DF or FPSW.
(define_insn_reservation "ir_sr70_xfer_from" 6
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "xfer")
            (eq_attr "mode" "!SF,DF,FPSW")))
  "(cpu_iss+cp1_iss),(fpu_mov+ri_mem)")

(define_insn_reservation "ir_sr70_xfer_to" 9
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "xfer")
            (eq_attr "mode" "SF,DF")))
  "(cpu_iss+cp1_iss),(ri_mem+rf_ldmem)")

;; HI/LO moves and simple integer operations: latency 1 on either ALU.
(define_insn_reservation "ir_sr70_hilo" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "mthilo,mfhilo"))
  "ri_insns")

(define_insn_reservation "ir_sr70_arith" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "arith,shift,slt,clz,const,trap"))
  "ri_insns")

;; emulate repeat (dispatch stall) by spending extra cycle(s) in
;; iter unit
(define_insn_reservation "ir_sr70_imul_si" 4
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "imul,imul3,imadd")
            (eq_attr "mode" "SI")))
  "ri_alux,ipu_alux,ipu_macc_iter")

(define_insn_reservation "ir_sr70_imul_di" 6
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "imul,imul3,imadd")
            (eq_attr "mode" "DI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*3)")

;; Divide algorithm is early out with best latency of 7 pcycles.
;; Use worst case for scheduling purposes.
;; Integer divides go through ALUX and then hold the iteration unit
;; (ipu_macc_iter) for the remaining cycles.
(define_insn_reservation "ir_sr70_idiv_si" 41
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*38)")

(define_insn_reservation "ir_sr70_idiv_di" 73
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "ri_alux,ipu_alux,(ipu_macc_iter*70)")

;; extra reservations of fpu_fpu are for repeat latency
(define_insn_reservation "ir_sr70_fadd_sf" 8
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fadd")
            (eq_attr "mode" "SF")))
  "rf_insn,fpu_fpu")

(define_insn_reservation "ir_sr70_fadd_df" 10
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fadd")
            (eq_attr "mode" "DF")))
  "rf_insn,fpu_fpu")

;; Latencies for MADD,MSUB, NMADD, NMSUB assume the Multiply is fused
;; with the sub or add.
(define_insn_reservation "ir_sr70_fmul_sf" 8
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fmul,fmadd")
            (eq_attr "mode" "SF")))
  "rf_insn,fpu_fpu")

;; tie up the fpu unit to emulate the balance for the "repeat
;; rate" of 8 (2 are spent in the iss unit)
(define_insn_reservation "ir_sr70_fmul_df" 16
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fmul,fmadd")
            (eq_attr "mode" "DF")))
  "rf_insn,fpu_fpu*6")


;; RECIP insn uses same type attr as div, and for SR3, has same
;; timings for double.  However, single RECIP has a latency of
;; 28 -- only way to fix this is to introduce new insn attrs.
;; cycles spent in iter unit are designed to satisfy balance
;; of "repeat" latency after insn uses up rf_multi1 reservation
(define_insn_reservation "ir_sr70_fdiv_sf" 60
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*51)")

(define_insn_reservation "ir_sr70_fdiv_df" 120
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*109)")

;; Short floating-point operations: issue through cp1, then fpu_fpu.
(define_insn_reservation "ir_sr70_fabs" 4
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fabs,fneg,fmove"))
  "rf_insn,fpu_fpu")

(define_insn_reservation "ir_sr70_fcmp" 10
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fcmp"))
  "rf_insn,fpu_fpu")

;; "fcvt" type attribute covers a number of diff insns, most have the same
;; latency descriptions, a few vary.  We use the
;; most common timing (which is also worst case).
(define_insn_reservation "ir_sr70_fcvt" 12
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "fcvt"))
  "rf_insn,fpu_fpu*4")

;; Square root and reciprocal square root use the same rf_multi1 +
;; fpu_iter pattern as the divides above.
(define_insn_reservation "ir_sr70_fsqrt_sf" 62
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*53)")

(define_insn_reservation "ir_sr70_fsqrt_df" 122
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*111)")

(define_insn_reservation "ir_sr70_frsqrt_sf" 48
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "SF")))
  "rf_multi1+(fpu_iter*39)")

(define_insn_reservation "ir_sr70_frsqrt_df" 240
  (and (eq_attr "cpu" "sr71000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "DF")))
  "rf_multi1+(fpu_iter*229)")

;; "multi" insns reserve every issue slot (serial_dispatch).
(define_insn_reservation "ir_sr70_multi" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "multi"))
  "serial_dispatch")

;; A nop takes the ordinary integer path: latency 1 on either ALU (ri_insns).
(define_insn_reservation "ir_sr70_nop" 1
  (and (eq_attr "cpu" "sr71000")
       (eq_attr "type" "nop"))
  "ri_insns")
