1*38fd1498Szrj;; Scheduling for the Intel P6 family of processors 2*38fd1498Szrj;; Copyright (C) 2004-2018 Free Software Foundation, Inc. 3*38fd1498Szrj;; 4*38fd1498Szrj;; This file is part of GCC. 5*38fd1498Szrj;; 6*38fd1498Szrj;; GCC is free software; you can redistribute it and/or modify 7*38fd1498Szrj;; it under the terms of the GNU General Public License as published by 8*38fd1498Szrj;; the Free Software Foundation; either version 3, or (at your option) 9*38fd1498Szrj;; any later version. 10*38fd1498Szrj;; 11*38fd1498Szrj;; GCC is distributed in the hope that it will be useful, 12*38fd1498Szrj;; but WITHOUT ANY WARRANTY; without even the implied warranty of 13*38fd1498Szrj;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14*38fd1498Szrj;; GNU General Public License for more details. 15*38fd1498Szrj;; 16*38fd1498Szrj;; You should have received a copy of the GNU General Public License 17*38fd1498Szrj;; along with GCC; see the file COPYING3. If not see 18*38fd1498Szrj;; <http://www.gnu.org/licenses/>. */ 19*38fd1498Szrj 20*38fd1498Szrj;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron 21*38fd1498Szrj;; and Xeon lines of CPUs. The DFA scheduler description in this file is 22*38fd1498Szrj;; based on information that can be found in the following three documents: 23*38fd1498Szrj;; 24*38fd1498Szrj;; "P6 Family of Processors Hardware Developer's Manual", 25*38fd1498Szrj;; Intel, September 1999. 26*38fd1498Szrj;; 27*38fd1498Szrj;; "Intel Architecture Optimization Manual", 28*38fd1498Szrj;; Intel, 1999 (Order Number: 245127-001). 29*38fd1498Szrj;; 30*38fd1498Szrj;; "How to optimize for the Pentium family of microprocessors", 31*38fd1498Szrj;; by Agner Fog, PhD. 
32*38fd1498Szrj;; 33*38fd1498Szrj;; The P6 pipeline has three major components: 34*38fd1498Szrj;; 1) the FETCH/DECODE unit, an in-order issue front-end 35*38fd1498Szrj;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core 36*38fd1498Szrj;; 3) the RETIRE unit, an in-order retirement unit 37*38fd1498Szrj;; 38*38fd1498Szrj;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and 39*38fd1498Szrj;; retirement unit are naturally in-order. 40*38fd1498Szrj;; 41*38fd1498Szrj;; BUS INTERFACE UNIT 42*38fd1498Szrj;; / \ 43*38fd1498Szrj;; L1 ICACHE L1 DCACHE 44*38fd1498Szrj;; / | \ | \ 45*38fd1498Szrj;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE 46*38fd1498Szrj;; \ | / | | 47*38fd1498Szrj;; INSTRUCTION POOL __________|_______/ 48*38fd1498Szrj;; (inc. reorder buffer) 49*38fd1498Szrj;; 50*38fd1498Szrj;; Since the P6 CPUs execute instructions out-of-order, the most important 51*38fd1498Szrj;; consideration in performance tuning is making sure enough micro-ops are 52*38fd1498Szrj;; ready for execution in the out-of-order core, while not stalling the 53*38fd1498Szrj;; decoder. 54*38fd1498Szrj;; 55*38fd1498Szrj;; TODO: 56*38fd1498Szrj;; - Find a less crude way to model complex instructions, in 57*38fd1498Szrj;; particular how many cycles they take to be decoded. 58*38fd1498Szrj;; - Include decoder latencies in the total reservation latencies. 59*38fd1498Szrj;; This isn't necessary right now because we assume for every 60*38fd1498Szrj;; instruction that it never blocks a decoder. 61*38fd1498Szrj;; - Figure out where the p0 and p1 reservations come from. These 62*38fd1498Szrj;; appear not to be in the manual 63*38fd1498Szrj;; - Lots more because I'm sure this is still far from optimal :-) 64*38fd1498Szrj 65*38fd1498Szrj;; The ppro_idiv and ppro_fdiv automata are used to model issue 66*38fd1498Szrj;; latencies of idiv and fdiv type insns. 
(define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")

;; Decoder model.
;;
;; Uop counts per instruction form:
;;  - register-register simple instructions: one uop.
;;  - loads: one uop.
;;  - stores: two uops.
;;  - simple read-modify instructions: two uops.
;;  - register-memory simple instructions: two to three uops.
;;  - simple read-modify-write instructions: four uops.
;;
;; Decoder rules:
;;  - an instruction with 1 uop can be decoded by any of the three
;;    decoders in one cycle.
;;  - an instruction with 1 to 4 uops can be decoded only by decoder 0
;;    but still in only one cycle.
;;  - a complex (microcode) instruction can also only be decoded by
;;    decoder 0, and this takes an unspecified number of cycles.
;;
;; The goal is to schedule such that we have a few-one-one uops sequence
;; in each cycle, to decode as many instructions per cycle as possible.
(define_cpu_unit "decoder0" "ppro_decoder")
(define_cpu_unit "decoder1" "ppro_decoder")
(define_cpu_unit "decoder2" "ppro_decoder")

;; An instruction for decoder0 should be found first, so decoder1 and
;; decoder2 may only be reserved once decoder0 has been reserved.
(presence_set "decoder1" "decoder0")
(presence_set "decoder2" "decoder0")

;; Most instructions can be decoded on any of the three decoders.
(define_reservation "decodern" "(decoder0|decoder1|decoder2)")

;; The out-of-order core has five pipelines.  During each cycle, the core
;; may dispatch zero or one uop on the port of any of the five pipelines
;; so the maximum number of dispatched uops per cycle is 5.  In practice,
;; 3 uops per cycle is more realistic.
;;
;; Two of the five pipelines contain several execution units:
;;
;;   Port 0    Port 1    Port 2    Port 3    Port 4
;;   ALU       ALU       LOAD      SAC       SDA
;;   FPU       JUE
;;   AGU       MMX
;;   MMX       P3FPU
;;   P3FPU
;;
;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
;;  JUE = Jump Execution Unit, AGU = Address Generation Unit)
;;
(define_cpu_unit "p0,p1" "ppro_core")
(define_cpu_unit "p2" "ppro_load")
(define_cpu_unit "p3,p4" "ppro_store")
(define_cpu_unit "idiv" "ppro_idiv")
(define_cpu_unit "fdiv" "ppro_fdiv")

;; Only the irregular instructions have to be modeled here.  A load
;; increases the latency by 2 or 3, or by nothing if the manual gives
;; a latency already.  Store latencies are not accounted for.
;;
;; The simple instructions follow a very regular pattern of 1 uop per
;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
;; on port 4 and port 3.  These instructions are modelled at the bottom
;; of this file.
;;
;; For microcoded instructions we don't know how many uops are produced.
;; These instructions are the "complex" ones in the Intel manuals.  All
;; we _do_ know is that they typically produce four or more uops, so
;; they can only be decoded on decoder0.  Modelling their latencies
;; doesn't make sense because we don't know how these instructions are
;; executed in the core.  So we just model that they can only be decoded
;; on decoder 0, and say that it takes a little while before the result
;; is available.
(define_insn_reservation "ppro_complex_insn" 6
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "other,multi,call,callv,str"))
  "decoder0")

;; imov with memory operands does not use the integer units.
(define_insn_reservation "ppro_imov" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imov")))
  "decodern,(p0|p1)")

(define_insn_reservation "ppro_imov_load" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "imov")))
  "decodern,p2")

(define_insn_reservation "ppro_imov_store" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "imov")))
  "decoder0,p4+p3")

;; imovx always decodes to one uop, and also doesn't use the integer
;; units if it has memory operands.
(define_insn_reservation "ppro_imovx" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imovx")))
  "decodern,(p0|p1)")

(define_insn_reservation "ppro_imovx_load" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "imovx")))
  "decodern,p2")

;; lea executes on port 0 with latency one and throughput 1.
(define_insn_reservation "ppro_lea" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "lea")))
  "decodern,p0")

;; Shift and rotate execute on port 0 with latency and throughput 1.
;; The load and store units need to be reserved when memory operands
;; are involved.
(define_insn_reservation "ppro_shift_rotate" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
  "decodern,p0")

;; Any memory operand: load on p2 together with p0, then the store
;; ports p4 and p3.
(define_insn_reservation "ppro_shift_rotate_mem" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
  "decoder0,p2+p0,p4+p3")


;; The P6 has a sophisticated branch prediction mechanism to minimize
;; latencies due to branching.  In particular, it has a fast way to
;; execute branches that are taken multiple times (such as in loops).
;; Branches not taken suffer no penalty, and correctly predicted
;; branches cost only one fetch cycle.  Mispredicted branches are very
;; costly: typically 15 cycles and possibly as many as 26 cycles.
;;
;; Unfortunately all this makes it quite difficult to properly model
;; the latencies for the compiler.  Here I've made the choice to be
;; optimistic and assume branches are often predicted correctly, so
;; they have latency 1, and the decoders are not blocked.
;;
;; In addition, the model assumes a branch always decodes to only 1 uop,
;; which is not exactly true because there are a few instructions that
;; decode to 2 uops or microcode.  But this probably gives the best
;; results because we can assume these instructions can decode on all
;; decoders.
(define_insn_reservation "ppro_branch" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "ibr")))
  "decodern,p1")

;; ??? Indirect branches probably have worse latency than this.
(define_insn_reservation "ppro_indirect_branch" 6
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "ibr")))
  "decoder0,p2+p1")

(define_insn_reservation "ppro_leave" 4
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "leave"))
  "decoder0,p2+(p0|p1),(p0|p1)")

;; imul has throughput one, but latency 4, and can only execute on port 0.
(define_insn_reservation "ppro_imul" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imul")))
  "decodern,p0")

(define_insn_reservation "ppro_imul_mem" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "imul")))
  "decoder0,p2+p0")

;; div and idiv are very similar, so we model them the same.
;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
;; These issue latencies are modelled via the ppro_idiv automaton: each
;; reservation below keeps the "idiv" unit busy for that many cycles.
(define_insn_reservation "ppro_idiv_QI" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "QI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")

(define_insn_reservation "ppro_idiv_QI_load" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "QI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")

(define_insn_reservation "ppro_idiv_HI" 23
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "HI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")

(define_insn_reservation "ppro_idiv_HI_load" 23
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "HI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")

(define_insn_reservation "ppro_idiv_SI" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")

(define_insn_reservation "ppro_idiv_SI_load" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")

;; Floating point operations always execute on port 0.
;; ??? where do these latencies come from? fadd has latency 3 and
;;     has throughput "1/cycle (align with FADD)".  What do they
;;     mean and how can we model that?
(define_insn_reservation "ppro_fop" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none,unknown")
            (eq_attr "type" "fop")))
  "decodern,p0")

(define_insn_reservation "ppro_fop_load" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fop")))
  "decoder0,p2+p0,p0")

(define_insn_reservation "ppro_fop_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "fop")))
  "decoder0,p0,p0,p0+p4+p3")

(define_insn_reservation "ppro_fop_both" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "both")
            (eq_attr "type" "fop")))
  "decoder0,p2+p0,p0+p4+p3")

;; Sign manipulation; matched regardless of the "memory" attribute.
(define_insn_reservation "ppro_fsgn" 1
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "fsgn"))
  "decodern,p0")

;; fistp: two cycles on p0 followed by the store ports.
(define_insn_reservation "ppro_fistp" 5
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "fistp"))
  "decoder0,p0*2,p4+p3")

(define_insn_reservation "ppro_fcmov" 2
  (and (eq_attr "cpu" "pentiumpro")
       (eq_attr "type" "fcmov"))
  "decoder0,p0*2")

(define_insn_reservation "ppro_fcmp" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fcmp")))
  "decodern,p0")

(define_insn_reservation "ppro_fcmp_load" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fcmp")))
  "decoder0,p2+p0")

;; fmov: XFmode loads and stores get their own, longer reservations;
;; every other mode uses the single-uop forms.
(define_insn_reservation "ppro_fmov" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fmov")))
  "decodern,p0")

(define_insn_reservation "ppro_fmov_load" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "!XF")
                 (eq_attr "type" "fmov"))))
  "decodern,p2")

(define_insn_reservation "ppro_fmov_XF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fmov"))))
  "decoder0,(p2+p0)*2")

(define_insn_reservation "ppro_fmov_store" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "!XF")
                 (eq_attr "type" "fmov"))))
  "decodern,p0")

(define_insn_reservation "ppro_fmov_XF_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fmov"))))
  "decoder0,(p0+p4),(p0+p3)")

;; fmul executes on port 0 with latency 5.  It has issue latency 2,
;; but we don't model this.
(define_insn_reservation "ppro_fmul" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fmul")))
  "decoder0,p0*2")

(define_insn_reservation "ppro_fmul_load" 6
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fmul")))
  "decoder0,p2+p0,p0")

;; fdiv latencies depend on the mode of the operands.  XFmode gives
;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
;; Division by a power of 2 takes only 9 cycles, but we cannot model
;; that.  Throughput is equal to latency - 1, which we model using the
;; ppro_fdiv automaton: each reservation keeps the "fdiv" unit busy for
;; latency - 1 cycles (register forms).
(define_insn_reservation "ppro_fdiv_SF" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*16")

(define_insn_reservation "ppro_fdiv_SF_load" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*16")

(define_insn_reservation "ppro_fdiv_DF" 32
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "DF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*30")

(define_insn_reservation "ppro_fdiv_DF_load" 33
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "DF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*30")

(define_insn_reservation "ppro_fdiv_XF" 38
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*36")

(define_insn_reservation "ppro_fdiv_XF_load" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*36")

;; MMX instructions can execute on either port 0 or port 1 with a
;; throughput of 1/cycle.
;;   on port 0:	- ALU (latency 1)
;;		- Multiplier Unit (latency 3)
;;   on port 1:	- ALU (latency 1)
;;		- Shift Unit (latency 1)
;;
;; MMX instructions are either of the type reg-reg, or read-modify, and
;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
;; so they behave as "simple" instructions that need no special modelling.
;; We only have to model mmxshft and mmxmul.
;; MMX shift, register form: one uop on port 1.
(define_insn_reservation "ppro_mmx_shft" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "mmxshft")))
  "decodern,p1")

;; MMX shift with a load operand.  The condition must test for "load":
;; testing "none" would duplicate ppro_mmx_shft's condition, so this
;; reservation (which occupies the load port p2) could never be chosen.
(define_insn_reservation "ppro_mmx_shft_load" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxshft")))
  "decoder0,p2+p1")

;; MMX multiply, register form: one uop on port 0.
(define_insn_reservation "ppro_mmx_mul" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "mmxmul")))
  "decodern,p0")

;; MMX multiply with a load operand.  As above, the condition must test
;; for "load" so that it does not merely duplicate ppro_mmx_mul.
(define_insn_reservation "ppro_mmx_mul_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxmul")))
  "decoder0,p2+p0")

(define_insn_reservation "ppro_sse_mmxcvt" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "DI")
            (eq_attr "type" "mmxcvt")))
  "decodern,p1")

;; FIXME: These are Pentium III only, but we cannot tell here if
;; we're generating code for PentiumPro/Pentium II or Pentium III
;; (define_insn_reservation "ppro_sse_mmxshft" 2
;;			 (and (eq_attr "cpu" "pentiumpro")
;;			      (and (eq_attr "mode" "DI")
;;				   (eq_attr "type" "mmxshft")))
;;			 "decodern,p0")

;; SSE is very complicated, and takes a bit more effort.
;; ??? I assumed that all SSE instructions decode on decoder0,
;;     but is this correct?

;; The sfence instruction.
(define_insn_reservation "ppro_sse_sfence" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "unknown")
            (eq_attr "type" "sse")))
  "decoder0,p4+p3")

;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
(define_insn_reservation "ppro_sse_SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "SF")
            (eq_attr "type" "sse")))
  "decodern,p0")

;; Scalar (SFmode) add.
(define_insn_reservation "ppro_sse_add_SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "sseadd,sseadd1"))))
  "decodern,p1")

(define_insn_reservation "ppro_sse_add_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "sseadd,sseadd1"))))
  "decoder0,p2+p1")

;; Scalar (SFmode) compare.
(define_insn_reservation "ppro_sse_cmp_SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p1")

(define_insn_reservation "ppro_sse_cmp_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p2+p1")

;; Scalar (SFmode) comi.
(define_insn_reservation "ppro_sse_comi_SF" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecomi"))))
  "decodern,p0")

(define_insn_reservation "ppro_sse_comi_SF_load" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecomi"))))
  "decoder0,p2+p0")

;; Scalar (SFmode) multiply.
(define_insn_reservation "ppro_sse_mul_SF" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemul"))))
  "decodern,p0")

(define_insn_reservation "ppro_sse_mul_SF_load" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,p2+p0")

;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
(define_insn_reservation "ppro_sse_div_SF" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,p0*17")

;; Load form of the scalar divide.  The condition must test for "load";
;; testing "none" would duplicate ppro_sse_div_SF's condition, so this
;; reservation (which occupies the load port p2) could never be chosen.
(define_insn_reservation "ppro_sse_div_SF_load" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0),p0*16")

(define_insn_reservation "ppro_sse_icvt_SF" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "SF")
            (eq_attr "type" "sseicvt")))
  "decoder0,(p2+p1)*2")

(define_insn_reservation "ppro_sse_icvt_SI" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "SI")
            (eq_attr "type" "sseicvt")))
  "decoder0,(p2+p1)")

(define_insn_reservation "ppro_sse_mov_SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p0|p1)")

(define_insn_reservation "ppro_sse_mov_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p2+(p0|p1)")

(define_insn_reservation "ppro_sse_mov_SF_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p4+p3")

;; Packed (V4SFmode) reservations.  Most of these hold their execution
;; port for two cycles.
(define_insn_reservation "ppro_sse_V4SF" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "mode" "V4SF")
            (eq_attr "type" "sse")))
  "decoder0,p1*2")

(define_insn_reservation "ppro_sse_add_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sseadd,sseadd1"))))
  "decoder0,p1*2")

(define_insn_reservation "ppro_sse_add_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sseadd,sseadd1"))))
  "decoder0,(p2+p1)*2")

(define_insn_reservation "ppro_sse_cmp_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p1*2")

(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,(p2+p1)*2")

(define_insn_reservation "ppro_sse_cvt_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1*2")

;; Complement of ppro_sse_cvt_V4SF: every "memory" value other than
;; none/unknown.  The type tested here is "ssecvt" to match that
;; sibling; it previously read "ssecmp", which left V4SF conversions
;; with memory operands without a dedicated reservation.
;; NOTE(review): confirm "ssecvt" is the intended type.
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1,p4+p3")

(define_insn_reservation "ppro_sse_mul_V4SF" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,p0*2")

(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,(p2+p0)*2")

;; FIXME: p0 really closed this long???
(define_insn_reservation "ppro_sse_div_V4SF" 48
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,p0*34")

(define_insn_reservation "ppro_sse_div_V4SF_load" 48
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0)*2,p0*32")

(define_insn_reservation "ppro_sse_log_V4SF" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
  "decodern,p1")

(define_insn_reservation "ppro_sse_log_V4SF_load" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
  "decoder0,(p2+p1)")

(define_insn_reservation "ppro_sse_mov_V4SF" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p0|p1)*2")

(define_insn_reservation "ppro_sse_mov_V4SF_load" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p2*2")

(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p4+p3)*2")

;; All other instructions are modelled as simple instructions.
;; We have already modelled all i387 floating point instructions, so all
;; other instructions execute on either port 0 or port 1.  This includes
;; the ALU units, and the MMX units.
;;
;; reg-reg instructions produce 1 uop so they can be decoded on any of
;; the three decoders.
;; Register-only (or unknown-memory) form of the simple instructions:
;; any decoder, then either port 0 or port 1.
(define_insn_reservation "ppro_insn" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "none,unknown")
            (eq_attr "type"
                     "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
  "decodern,(p0|p1)")

;; Read-modify and register-memory instructions consist of two or three
;; uops, which forces them onto decoder0.
(define_insn_reservation "ppro_insn_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type"
                     "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,p2+(p0|p1)")

;; Store form: port 0 or port 1 first, then p4 and p3 together.
(define_insn_reservation "ppro_insn_store" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "store")
            (eq_attr "type"
                     "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,(p0|p1),p4+p3")

;; Read-modify-store instructions expand to four uops, so they too can
;; only be decoded on decoder0.
(define_insn_reservation "ppro_insn_both" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "both")
            (eq_attr "type"
                     "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,p2+(p0|p1),p4+p3")