;; DFA scheduling description for SH4.
;; Copyright (C) 2004 Free Software Foundation, Inc.

;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING.  If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;; Load and store instructions save a cycle if they are aligned on a
;; four byte boundary.  Using a function unit for stores encourages
;; gcc to separate load and store instructions by one instruction,
;; which makes it more likely that the linker will be able to word
;; align them when relaxing.

;; The following description models the SH4 pipeline using the DFA based
;; scheduler.  The DFA based description is a better way to model a
;; superscalar pipeline as compared to the function unit reservation model.
;; 1. The function unit based model is oriented to describe at most one
;;    unit reservation by each insn.  It is difficult to model unit
;;    reservations in multiple pipeline units by the same insn.  This can
;;    be done using the DFA based description.
;; 2. The execution performance of the DFA based scheduler does not depend
;;    on processor complexity.
;; 3. Writing all unit reservations for an instruction class is a more natural
;;    description of the pipeline and makes the interface to the hazard
;;    recognizer simpler than the old function unit based model.
;; 4.
;;    The DFA model is richer and is a part of greater overall framework
;;    of RCSP.


;; The description is factored into two automata, to reduce the number
;; of states which a single large automaton would have.

(define_automaton "inst_pipeline,fpu_pipe")

;; Basically the decode unit of the processor.  Since SH4 is a dual issue
;; machine, it is modelled as if there were two units, so that any insn
;; can be processed by either one of the decoding units.

(define_cpu_unit "pipe_01,pipe_02" "inst_pipeline")


;; The fixed point arithmetic calculator (?? EX Unit).

(define_cpu_unit "int" "inst_pipeline")

;; f1_1 and f1_2 are floating point units.  Actually there is an f1 unit
;; which can overlap with another f1 unit, but not with an F1 unit.
;; It is as though there were two f1 units.

(define_cpu_unit "f1_1,f1_2" "fpu_pipe")

;; The floating point units (except FS - F2 always precedes it).

(define_cpu_unit "F0,F1,F2,F3" "fpu_pipe")

;; Basically the MA unit of SH4, used in the LOAD/STORE pipeline.

(define_cpu_unit "memory" "inst_pipeline")

;; However, there are LS group insns that don't use it, even ones that
;; complete in 0 cycles.  So an extra unit is used for the issue of LS insns.
(define_cpu_unit "load_store" "inst_pipeline")

;; The address calculator used for branch instructions.
;; It is reserved after "issue" of branch instructions, to make sure
;; that no two branch instructions can be issued in parallel.

(define_cpu_unit "pcr_addrcalc" "inst_pipeline")

;; ----------------------------------------------------
;; This reservation simplifies the dual issue description.

(define_reservation "issue" "pipe_01|pipe_02")

;; This is to express the locking of the D stage.
;; Note that the issue of a CO group insn also effectively locks the D stage.
(define_reservation "d_lock" "pipe_01+pipe_02")

;; Every FE instruction but fipr / ftrv starts with issue and this.
(define_reservation "F01" "F0+F1")

;; Shorthand to simplify descriptions where F1,F2,FS
;; are used simultaneously.

(define_reservation "fpu" "F1+F2")

;; This highlights the fact that f1 cannot overlap with F1.

(exclusion_set "f1_1,f1_2" "F1")

(define_insn_reservation "nil" 0 (eq_attr "type" "nil") "nothing")

;; Although reg moves have a latency of zero, we need to highlight
;; that they use the D stage for one cycle.

;; Group:	MT

(define_insn_reservation "reg_mov" 0
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "move"))
  "issue")

;; Group:	LS

(define_insn_reservation "freg_mov" 0
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fmove"))
  "issue+load_store")

;; Not all pipeline stages are modelled.  The issue ('D') stage is modelled
;; inasmuch as only two instructions are allowed to issue simultaneously,
;; and CO instructions prevent any simultaneous issue of another instruction.
;; (This uses pipe_01 and pipe_02).
;; Double issue of EX insns is prevented by using the int unit in the EX stage.
;; Double issue of EX / BR insns is prevented by using the int unit /
;; pcr_addrcalc unit in the EX stage.
;; Double issue of BR / LS instructions is prevented by using the
;; pcr_addrcalc / load_store unit in the issue cycle.
;; Double issue of FE instructions is prevented by using F0 in the first
;; pipeline stage after the first D stage.
;; There is no need to describe the [ES]X / [MN]A / S stages after a D stage
;; (except in the cases outlined above), nor to describe the FS stage after
;; the F2 stage.
;; Other MT group instructions (1 step operations)
;; Group:	MT
;; Latency:	1
;; Issue Rate:	1

(define_insn_reservation "mt" 1
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "mt_group"))
  "issue")

;; Fixed Point Arithmetic Instructions (1 step operations)
;; Group:	EX
;; Latency:	1
;; Issue Rate:	1

(define_insn_reservation "sh4_simple_arith" 1
  (and (eq_attr "pipe_model" "sh4") (eq_attr "insn_class" "ex_group"))
  "issue,int")

;; Load and store instructions have no alignment peculiarities for the SH4,
;; but they use the load-store unit, which they share with the fmove type
;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg).
;; Loads have a latency of two.
;; However, call insns can only be paired with a preceding insn, and have
;; a delay slot, so that we want two more insns to be scheduled between the
;; load of the function address and the call.  This is equivalent to a
;; latency of three.
;; ADJUST_COST can only properly handle reductions of the cost, so we
;; use a latency of three here, which gets multiplied by 10 to yield 30.
;; We only do this for SImode loads of general registers, to make the work
;; for ADJUST_COST easier.

;; Load Store instructions.  (MOV.[BWL]@(d,GBR))
;; Group:	LS
;; Latency:	2
;; Issue Rate:	1

(define_insn_reservation "sh4_load" 2
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "load,pcload"))
  "issue+load_store,nothing,memory")

;; calls / sfuncs need an extra instruction for their delay slot.
;; Moreover, estimating the latency for SImode loads as 3 will also allow
;; adjust_cost to meaningfully bump it back up to 3 if they load the shift
;; count of a dynamic shift.
(define_insn_reservation "sh4_load_si" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "load_si,pcload_si"))
  "issue+load_store,nothing,memory")

;; (define_bypass 2 "sh4_load_si" "!sh4_call")

;; The load latency is upped to three higher if the dependent insn does
;; double precision computation.  The 'default' latency should reflect
;; that increased latency, because otherwise the insn priorities won't
;; allow proper scheduling.
(define_insn_reservation "sh4_fload" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fload,pcfload"))
  "issue+load_store,nothing,memory")

;; (define_bypass 2 "sh4_fload" "!")

(define_insn_reservation "sh4_store" 1
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "store"))
  "issue+load_store,nothing,memory")

;; Load Store instructions.
;; Group:	LS
;; Latency:	1
;; Issue Rate:	1

(define_insn_reservation "sh4_gp_fpul" 1
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "gp_fpul"))
  "issue+load_store")

;; Load Store instructions.
;; Group:	LS
;; Latency:	3
;; Issue Rate:	1

(define_insn_reservation "sh4_fpul_gp" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fpul_gp"))
  "issue+load_store")

;; Branch (BF,BF/S,BT,BT/S,BRA)
;; Group:	BR
;; Latency when taken:	2 (or 1)
;; Issue Rate:	1
;; The latency is 1 when displacement is 0.
;; We can't really do much with the latency, even if we could express it,
;; but the pairing restrictions are useful to take into account.
;; ??? If the branch is likely, we might want to fill the delay slot;
;; if the branch is likely, but not very likely, should we pretend to use
;; a resource that CO instructions use, to get a pairable delay slot insn?
(define_insn_reservation "sh4_branch" 1
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "cbranch,jump"))
  "issue+pcr_addrcalc")

;; Branch Far (JMP,RTS,BRAF)
;; Group:	CO
;; Latency:	3
;; Issue Rate:	2
;; ??? Scheduling happens before branch shortening, and hence jmp and braf
;; can't be distinguished from bra for the "jump" pattern.

(define_insn_reservation "sh4_return" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "return,jump_ind"))
  "d_lock*2")

;; RTE
;; Group:	CO
;; Latency:	5
;; Issue Rate:	5
;; This instruction can be executed in any of the pipelines
;; and blocks the pipeline for the next 4 stages.

(define_insn_reservation "sh4_return_from_exp" 5
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "rte"))
  "d_lock*5")

;; OCBP, OCBWB
;; Group:	CO
;; Latency:	1-5
;; Issue Rate:	1

;; cwb is used for the sequence ocbwb @%0; extu.w %0,%2; or %1,%2; mov.l %0,@%2
;; ocbwb on its own would be "d_lock,nothing,memory*5"
(define_insn_reservation "ocbwb" 6
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "cwb"))
  "d_lock*2,(d_lock+memory)*3,issue+load_store+memory,memory*2")

;; LDS to PR,JSR
;; Group:	CO
;; Latency:	3
;; Issue Rate:	2
;; The SX stage is blocked for last 2 cycles.
;; OTOH, the only time that has an effect for insns generated by the compiler
;; is when lds to PR is followed by sts from PR - and that is highly unlikely -
;; or when we are doing a function call - and we don't do inter-function
;; scheduling.  For the function call case, it's really best that we end with
;; something that models an rts.

(define_insn_reservation "sh4_lds_to_pr" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "prset"))
  "d_lock*2")

;; calls introduce a longish delay that is likely to flush the pipelines
;; of the caller's instructions.
;; Ordinary functions tend to end with a
;; load to restore a register (in the delay slot of rts), while sfuncs
;; tend to end with an EX or MT insn.  But that is not actually relevant,
;; since there are no instructions that contend for memory access early.
;; We could, of course, provide exact scheduling information for specific
;; sfuncs, if that should prove useful.

(define_insn_reservation "sh4_call" 16
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "call,sfunc"))
  "d_lock*16")

;; LDS.L to PR
;; Group:	CO
;; Latency:	3
;; Issue Rate:	2
;; The SX unit is blocked for last 2 cycles.

(define_insn_reservation "ldsmem_to_pr" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "pload"))
  "d_lock*2")

;; STS from PR
;; Group:	CO
;; Latency:	2
;; Issue Rate:	2
;; The SX unit in second and third cycles.

(define_insn_reservation "sts_from_pr" 2
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "prget"))
  "d_lock*2")

;; STS.L from PR
;; Group:	CO
;; Latency:	2
;; Issue Rate:	2

(define_insn_reservation "sh4_prstore_mem" 2
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "pstore"))
  "d_lock*2,nothing,memory")

;; LDS to FPSCR
;; Group:	CO
;; Latency:	4
;; Issue Rate:	1
;; F1 is blocked for last three cycles.

(define_insn_reservation "fpscr_load" 4
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "gp_fpscr"))
  "d_lock,nothing,F1*3")

;; LDS.L to FPSCR
;; Group:	CO
;; Latency:	1 / 4
;; Latency to update Rn is 1 and latency to update FPSCR is 4
;; Issue Rate:	1
;; F1 is blocked for last three cycles.
(define_insn_reservation "fpscr_load_mem" 4
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "mem_fpscr"))
  "d_lock,nothing,(F1+memory),F1*2")


;; Fixed point multiplication (DMULS.L DMULU.L MUL.L MULS.W,MULU.W)
;; Group:	CO
;; Latency:	4 / 4
;; Issue Rate:	1

(define_insn_reservation "multi" 4
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "smpy,dmpy"))
  "d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2")

;; Fixed STS from MACL / MACH
;; Group:	CO
;; Latency:	3
;; Issue Rate:	1

(define_insn_reservation "sh4_mac_gp" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "mac_gp"))
  "d_lock")


;; Single precision floating point computation FCMP/EQ,
;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRVHG, FSCHG
;; Group:	FE
;; Latency:	3/4
;; Issue Rate:	1

(define_insn_reservation "fp_arith" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fp"))
  "issue,F01,F2")

;; Same latency and reservation as fp_arith, but kept as a separate
;; reservation so that the bypass below can name it.
(define_insn_reservation "fp_arith_ftrc" 3
  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "ftrc_s"))
  "issue,F01,F2")

(define_bypass 1 "fp_arith_ftrc" "sh4_fpul_gp")

;; Single Precision FDIV/SQRT
;; Group:	FE
;; Latency:	12/13 (FDIV); 11/12 (FSQRT)
;; Issue Rate:	1
;; We describe fdiv here; fsqrt is actually one cycle faster.
(define_insn_reservation "fp_div" 12
  (and (eq_attr "pipe_model" "sh4")
       (eq_attr "type" "fdiv"))
  "issue,F01+F3,F2+F3,F3*7,F1+F3,F2")

;; Double Precision floating point computation
;; (FCNVDS, FCNVSD, FLOAT, FTRC)
;; Group:	FE
;; Latency:	(3,4)/5
;; Issue Rate:	1

(define_insn_reservation "dp_float" 4
  (and (eq_attr "pipe_model" "sh4")
       (eq_attr "type" "dfp_conv"))
  "issue,F01,F1+F2,F2")

;; Double-precision floating-point (FADD,FMUL,FSUB)
;; Group:	FE
;; Latency:	(7,8)/9
;; Issue Rate:	1

(define_insn_reservation "fp_double_arith" 8
  (and (eq_attr "pipe_model" "sh4")
       (eq_attr "type" "dfp_arith"))
  "issue,F01,F1+F2,fpu*4,F2")

;; Double-precision FCMP (FCMP/EQ,FCMP/GT)
;; Group:	CO
;; Latency:	3/5
;; Issue Rate:	2

(define_insn_reservation "fp_double_cmp" 3
  (and (eq_attr "pipe_model" "sh4")
       (eq_attr "type" "dfp_cmp"))
  "d_lock,(d_lock+F01),F1+F2,F2")

;; Double precision FDIV/SQRT
;; Group:	FE
;; Latency:	(24,25)/26
;; Issue Rate:	1

(define_insn_reservation "dp_div" 25
  (and (eq_attr "pipe_model" "sh4")
       (eq_attr "type" "dfdiv"))
  "issue,F01+F3,F1+F2+F3,F2+F3,F3*16,F1+F3,(fpu+F3)*2,F2")


;; Use the branch-not-taken case to model arith3 insns.  For the branch taken
;; case, we'd get a d_lock instead of issue at the end.
(define_insn_reservation "arith3" 3
  (and (eq_attr "pipe_model" "sh4")
       (eq_attr "type" "arith3"))
  "issue,d_lock+pcr_addrcalc,issue")

;; arith3b insns schedule the same no matter if the branch is taken or not.
;; The condition must select type "arith3b": testing "arith3" here would
;; overlap with the "arith3" reservation above and leave arith3b insns
;; without a matching reservation among those in this description.
(define_insn_reservation "arith3b" 2
  (and (eq_attr "pipe_model" "sh4")
       (eq_attr "type" "arith3b"))
  "issue,d_lock+pcr_addrcalc")