1;; Scheduling description for cell processor.
2;; Copyright (C) 2001-2021 Free Software Foundation, Inc.
3;; Contributed by Sony Computer Entertainment, Inc.,
4
5
6;; This file is free software; you can redistribute it and/or modify it under
7;; the terms of the GNU General Public License as published by the Free
8;; Software Foundation; either version 3 of the License, or (at your option)
9;; any later version.
10
11;; This file is distributed in the hope that it will be useful, but WITHOUT
12;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14;; for more details.
15
16;; You should have received a copy of the GNU General Public License
17;; along with GCC; see the file COPYING3.  If not see
18;; <http://www.gnu.org/licenses/>.
19
20;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
21
22;; BE Architecture *DD3.0 and DD3.1*
23;; This file simulates the PPU processor unit backend of the pipeline, manual P24.
24;; manual P27, stall and flush points
25;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
26;;  order, the grouped address are aligned by 8
27;; This file only simulates the single-thread situation
28;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
29;;   and load/store unit)
30;; VSU executes all scalar floating points insn(a float unit),
31;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
32
33;; Dual issue combination
34
35;;	FXU	LSU	BR 	        VMX	               VMX
36;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
37;;FXU	X
38;;LSU		X               	X               	X
39;;BR			X
40;;VMX(sx,cx,vsu_fp,fp_arth)		X
41;;VMX(perm,vsu_ls, fp_ls)					X
42;;    X marks an illegal combination.
43
44;; Dual issue exceptions:
45;;(1) non-pipelined FXU instr in slot 0
46;;(2) non-pipelined FPU inst in slot 0
47;; CSI instr(context-synchronizing insn)
48;; Microcode insn
49
50;; BRU unit: bru(none register stall), bru_cr(cr register stall)
51;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
52;;  vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
53;;  nonpipelined simulation
54;; microcoded insns will stall at least 7 cycles to get the first instr from ROM,
55;;  micro instructions are not dual issued.
56
57;; slot0 is older than slot1
58;; non-pipelined insns need to be in slot1 to avoid a 1-cycle stall
59
60;; There are different stall points
61;; IB2, only stall one thread if stall here, so try to stall here as much as
62;; we can
63;; condition(1) insert nop, OR and ORI instruction form
64;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
65;;   CR0-access while stdcx, or stwcx
66;; IS2 stall ;; Page91 for details
67;; VQ8 stall
68;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
69;;  the vsu issue queue
70
;; Disabled single-automaton declarations (apparently an earlier formulation,
;; kept commented out):
71;;(define_automaton "cellxu")
72
73;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
74
75;; ndfa
;; Four separate automata are declared; every cpu unit below is assigned to
;; one of them by the second operand of its define_cpu_unit.
76(define_automaton "cellxu,cellvsu,cellbru,cell_mis")
77
;; Execution resources: fxu_cell and lsu_cell live in "cellxu", bru_cell in
;; "cellbru", and vsu1_cell and vsu2_cell in "cellvsu".
78(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
79(define_cpu_unit "bru_cell" "cellbru")
80(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
81
;; The two issue slots of a dispatch group.
82(define_cpu_unit "slot0,slot1" "cell_mis")
83
;; slot0 may only be reserved on a cycle in which slot1 is not reserved.
;; NOTE(review): presumably this keeps a group in program order (slot0 holds
;; the older insn) -- confirm against the dispatch rules in the manual.
84(absence_set "slot0" "slot1")
85
;; "nonpipeline" reserves fxu_cell, lsu_cell, vsu1_cell and vsu2_cell together
;; (bru_cell is not part of it); "slot01" accepts either issue slot.
86(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
87(define_reservation "slot01" "slot0|slot1")
88
89
90;; Load/store
91;; lmw, lswi, lswx are only generated when optimizing for space, MC,
92;;   these instrs are not simulated
;; Insns of type "load" with no sign extension and no update: default
;; latency 2; an issue slot is taken on the first cycle, lsu_cell on the next.
93(define_insn_reservation "cell-load" 2
94  (and (eq_attr "type" "load")
95       (eq_attr "sign_extend" "no")
96       (eq_attr "update" "no")
97       (eq_attr "cpu" "cell"))
98  "slot01,lsu_cell")
99
100;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
101;;  if with 32bytes alignment, CMC
;; Update-form loads without sign extension: same default latency (2) as
;; "cell-load", but fxu_cell is reserved together with lsu_cell on the
;; cycle after issue.
102(define_insn_reservation "cell-load-ux" 2
103  (and (eq_attr "type" "load")
104       (eq_attr "sign_extend" "no")
105       (eq_attr "update" "yes")
106       (eq_attr "cpu" "cell"))
107  "slot01,fxu_cell+lsu_cell")
108
109;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
110;;   11/7, 11/8, 11/12
;; Sign-extending loads; the "update" attribute is not tested, so both the
;; update and non-update forms match.  Modelled with default latency 2 and
;; fxu_cell+lsu_cell reserved on the cycle after issue.
111(define_insn_reservation "cell-load-ext" 2
112  (and (eq_attr "type" "load")
113       (eq_attr "sign_extend" "yes")
114       (eq_attr "cpu" "cell"))
115  "slot01,fxu_cell+lsu_cell")
116
117;;lfs,lfsx,lfd,lfdx, 1 cycle
;; Non-update FP loads: default latency 1; vsu2_cell, lsu_cell and an issue
;; slot are all reserved in the same single cycle.
118(define_insn_reservation "cell-fpload" 1
119  (and (eq_attr "type" "fpload")
120       (eq_attr "update" "no")
121       (eq_attr "cpu" "cell"))
122  "vsu2_cell+lsu_cell+slot01")
123
124;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)
;; Update-form FP loads: like "cell-fpload" but fxu_cell is reserved as well.
;; Only the 1-cycle default latency is expressed here; the 2-cycle figure for
;; the updated gpr mentioned above is not modelled by this reservation.
125(define_insn_reservation "cell-fpload-update" 1
126  (and (eq_attr "type" "fpload")
127       (eq_attr "update" "yes")
128       (eq_attr "cpu" "cell"))
129  "fxu_cell+vsu2_cell+lsu_cell+slot01")
130
;; Vector loads: default latency 2; an issue slot on the first cycle, then
;; vsu2_cell and lsu_cell together on the next.
131(define_insn_reservation "cell-vecload" 2
132  (and (eq_attr "type" "vecload")
133       (eq_attr "cpu" "cell"))
134  "slot01,vsu2_cell+lsu_cell")
135
136;;st? stw(MC)
;; Non-update insns of type "store": default latency 1, with lsu_cell and an
;; issue slot reserved in the same cycle.
137(define_insn_reservation "cell-store" 1
138  (and (eq_attr "type" "store")
139       (eq_attr "update" "no")
140       (eq_attr "cpu" "cell"))
141  "lsu_cell+slot01")
142
143;;stdux, stdu, (hardware breaks into store and add) 2 for update reg
;; Update-form stores: like "cell-store" but fxu_cell is reserved in the same
;; cycle too.  The default latency is 1; the "2 for update reg" figure above
;; is not expressed by this reservation.
144(define_insn_reservation "cell-store-update" 1
145  (and (eq_attr "type" "store")
146       (eq_attr "update" "yes")
147       (eq_attr "cpu" "cell"))
148  "fxu_cell+lsu_cell+slot01")
149
;; Non-update FP stores: default latency 1; vsu2_cell, lsu_cell and an issue
;; slot are reserved in the same cycle.
150(define_insn_reservation "cell-fpstore" 1
151  (and (eq_attr "type" "fpstore")
152       (eq_attr "update" "no")
153       (eq_attr "cpu" "cell"))
154  "vsu2_cell+lsu_cell+slot01")
155
;; Update-form FP stores: same as "cell-fpstore" with fxu_cell reserved in
;; the same cycle as well.
156(define_insn_reservation "cell-fpstore-update" 1
157  (and (eq_attr "type" "fpstore")
158       (eq_attr "update" "yes")
159       (eq_attr "cpu" "cell"))
160  "vsu2_cell+fxu_cell+lsu_cell+slot01")
161
;; Vector stores: default latency 1; uses the same unit combination as
;; "cell-fpstore" (vsu2_cell, lsu_cell and an issue slot in one cycle).
162(define_insn_reservation "cell-vecstore" 1
163  (and (eq_attr "type" "vecstore")
164       (eq_attr "cpu" "cell"))
165  "vsu2_cell+lsu_cell+slot01")
166
167;; Integer latency is 2 cycles
;; Matches three groups of insns: types integer/trap/cntlz/isel
;; unconditionally; add/logical/shift/exts only when "dot" is "no"; and
;; "insert" only when "size" is "64".  An issue slot is taken on the first
;; cycle and fxu_cell on the next.
168(define_insn_reservation "cell-integer" 2
169  (and (ior (eq_attr "type" "integer,trap,cntlz,isel")
170	    (and (eq_attr "type" "add,logical,shift,exts")
171		 (eq_attr "dot" "no"))
172	    (and (eq_attr "type" "insert")
173		 (eq_attr "size" "64")))
174       (eq_attr "cpu" "cell"))
175  "slot01,fxu_cell")
176
177;; Two integer latency is 4 cycles
;; After the issue cycle fxu_cell is held for three consecutive cycles, so
;; the reservation spans the same 4 cycles as the default latency.
178(define_insn_reservation "cell-two" 4
179  (and (eq_attr "type" "two")
180       (eq_attr "cpu" "cell"))
181  "slot01,fxu_cell,fxu_cell*2")
182
183;; Three integer latency is 6 cycles
;; After the issue cycle fxu_cell is held for five consecutive cycles, so
;; the reservation spans the same 6 cycles as the default latency.
184(define_insn_reservation "cell-three" 6
185  (and (eq_attr "type" "three")
186       (eq_attr "cpu" "cell"))
187  "slot01,fxu_cell,fxu_cell*4")
188
189;; rlwimi, alter cr0
;; 32-bit "insert" insns (the 64-bit ones are matched by "cell-integer").
;; Latency and unit usage are the same as "cell-integer": an issue slot, then
;; fxu_cell on the following cycle.
190(define_insn_reservation "cell-insert" 2
191  (and (eq_attr "type" "insert")
192       (eq_attr "size" "32")
193       (eq_attr "cpu" "cell"))
194 "slot01,fxu_cell")
195
196;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
;; Insns of type "cmp": default latency 1, with fxu_cell and an issue slot
;; reserved in the same single cycle.
197(define_insn_reservation "cell-cmp" 1
198  (and (eq_attr "type" "cmp")
199       (eq_attr "cpu" "cell"))
200  "fxu_cell+slot01")
201
202;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
;; Dot-form add/logical/shift/exts insns whose "cell_micro" attribute is
;; "not": scheduled like "cell-integer" (latency 2, issue slot then fxu_cell).
;; NOTE(review): dot-form insns with a "cell_micro" value other than "not" or
;; "always" match neither this reservation nor "cell-cmp-microcoded" --
;; confirm whether that is intended.
203(define_insn_reservation "cell-fast-cmp" 2
204  (and (eq_attr "type" "add,logical,shift,exts")
205       (eq_attr "dot" "yes")
206       (eq_attr "cpu" "cell")
207       (eq_attr "cell_micro" "not"))
208  "slot01,fxu_cell")
209
;; Dot-form add/logical/shift/exts insns whose "cell_micro" attribute is
;; "always".  Both issue slots are reserved on the first cycle (the insn is
;; never paired with another), then fxu_cell is held for eight consecutive
;; cycles, giving the 9-cycle default latency.
210(define_insn_reservation "cell-cmp-microcoded" 9
211  (and (eq_attr "type" "add,logical,shift,exts")
212       (eq_attr "dot" "yes")
213       (eq_attr "cpu" "cell")
214       (eq_attr "cell_micro" "always"))
215  "slot0+slot1,fxu_cell,fxu_cell*7")
216
217;; mulld
;; 64-bit multiply without record form: issued in slot1 only, then the
;; "nonpipeline" reservation (all XU and VSU units) is held for 14 cycles,
;; matching the 15-cycle default latency.
218(define_insn_reservation "cell-lmul" 15
219  (and (eq_attr "type" "mul")
220       (eq_attr "dot" "no")
221       (eq_attr "size" "64")
222       (eq_attr "cpu" "cell"))
223  "slot1,nonpipeline,nonpipeline*13")
224
225;; mulld. is microcoded
;; 64-bit record-form multiply: both issue slots are reserved on the first
;; cycle, then "nonpipeline" is held for 21 cycles (22-cycle default latency).
226(define_insn_reservation "cell-lmul-cmp" 22
227  (and (eq_attr "type" "mul")
228       (eq_attr "dot" "yes")
229       (eq_attr "size" "64")
230       (eq_attr "cpu" "cell"))
231  "slot0+slot1,nonpipeline,nonpipeline*20")
232
233;; mulli, 6 cycles
;; Multiplies with "size" 8 or 16; the "dot" attribute is not tested.
;; Issued in slot1 only, then "nonpipeline" is held for 5 cycles.
234(define_insn_reservation "cell-imul23" 6
235  (and (eq_attr "type" "mul")
236       (eq_attr "size" "8,16")
237       (eq_attr "cpu" "cell"))
238  "slot1,nonpipeline,nonpipeline*4")
239
240;; mullw, 9
;; 32-bit multiply without record form: issued in slot1 only, then
;; "nonpipeline" is held for 8 cycles (9-cycle default latency).
;; NOTE(review): a "mul" insn with size "32" and dot "yes" is not matched by
;; any of the multiply reservations visible here -- confirm whether it is
;; covered elsewhere or intentionally left unmodelled.
241(define_insn_reservation "cell-imul" 9
242  (and (eq_attr "type" "mul")
243       (eq_attr "dot" "no")
244       (eq_attr "size" "32")
245       (eq_attr "cpu" "cell"))
246  "slot1,nonpipeline,nonpipeline*7")
247
248;; divide
;; 32-bit divide: issued in slot1 only, then "nonpipeline" is held for
;; 31 cycles (32-cycle default latency).
249(define_insn_reservation "cell-idiv" 32
250  (and (eq_attr "type" "div")
251       (eq_attr "size" "32")
252       (eq_attr "cpu" "cell"))
253  "slot1,nonpipeline,nonpipeline*30")
254
;; 64-bit divide: issued in slot1 only, then "nonpipeline" is held for
;; 63 cycles (64-cycle default latency).
255(define_insn_reservation "cell-ldiv" 64
256  (and (eq_attr "type" "div")
257       (eq_attr "size" "64")
258       (eq_attr "cpu" "cell"))
259  "slot1,nonpipeline,nonpipeline*62")
260
261;;mflr and mfctr are pipelined
;; Default latency 1; bru_cell and an issue slot are reserved in the same
;; single cycle.
262(define_insn_reservation "cell-mfjmpr" 1
263  (and (eq_attr "type" "mfjmpr")
264       (eq_attr "cpu" "cell"))
265  "slot01+bru_cell")
266
267;;mtlr and mtctr,
268;;mtspr fully pipelined
;; Default latency 1 with bru_cell and an issue slot reserved in one cycle.
;; A define_bypass later in this file raises the latency to 10 when the
;; consumer is "cell-branchreg".
269(define_insn_reservation "cell-mtjmpr" 1
270 (and (eq_attr "type" "mtjmpr")
271       (eq_attr "cpu" "cell"))
272  "bru_cell+slot01")
273
274;; Branches
275;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
276;; bcctr, bcctrl, latency 2, actually adjust by be to 4
;; Both branch reservations use default latency 1 and reserve bru_cell
;; together with slot1 specifically (not "slot01"), so a branch is only ever
;; placed in the second issue slot.  The latency 2/4 figures quoted above are
;; not expressed by these reservations.
277(define_insn_reservation "cell-branch" 1
278  (and (eq_attr "type" "branch")
279       (eq_attr "cpu" "cell"))
280  "bru_cell+slot1")
281
;; Register-indirect branches (type "jmpreg"); same modelling as "cell-branch".
282(define_insn_reservation "cell-branchreg" 1
283  (and (eq_attr "type" "jmpreg")
284       (eq_attr "cpu" "cell"))
285  "bru_cell+slot1")
286
287;; cr hazard
288;; page 90, special cases for CR hazard, only one instr can access cr per cycle
289;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
;; CR-logical insns: default latency 1; bru_cell and an issue slot are
;; reserved in the same cycle.  The stwcx stall described above is not
;; expressed by this reservation.
290(define_insn_reservation "cell-crlogical" 1
291  (and (eq_attr "type" "cr_logical")
292       (eq_attr "cpu" "cell"))
293  "bru_cell+slot01")
294
295;; mfcrf and mfcr are about 34 cycles and nonpipelined
;; Issued in slot1 only, then "nonpipeline" is held for 33 cycles, matching
;; the 34-cycle default latency.
296(define_insn_reservation "cell-mfcr" 34
297  (and (eq_attr "type" "mfcrf,mfcr")
298       (eq_attr "cpu" "cell"))
299   "slot1,nonpipeline,nonpipeline*32")
300
301;; mtcrf (1 field)
;; Insns of type "mtcr": default latency 1; fxu_cell and an issue slot are
;; reserved in the same single cycle.
302(define_insn_reservation "cell-mtcrf" 1
303  (and (eq_attr "type" "mtcr")
304       (eq_attr "cpu" "cell"))
305  "fxu_cell+slot01")
306
307; Basic FP latency is 10 cycles, throughput is 1/cycle
;; Types fp, fpsimple and dmul: an issue slot on the first cycle, then
;; vsu1_cell on each of the following nine cycles.
;; NOTE(review): because vsu1_cell stays reserved for nine consecutive
;; cycles, the automaton will not start another vsu1_cell insn during that
;; time, which does not look like the 1/cycle throughput stated above --
;; confirm whether the long reservation is deliberate.
308(define_insn_reservation "cell-fp" 10
309  (and (eq_attr "type" "fp,fpsimple,dmul")
310       (eq_attr "cpu" "cell"))
311  "slot01,vsu1_cell,vsu1_cell*8")
312
;; FP compares: default latency 1, with vsu1_cell and an issue slot reserved
;; in the same cycle.  A define_bypass later in this file raises the latency
;; to 6 for the branch, mfcr and cr-logical consumers.
313(define_insn_reservation "cell-fpcompare" 1
314  (and (eq_attr "type" "fpcompare")
315       (eq_attr "cpu" "cell"))
316  "vsu1_cell+slot01")
317
318;; sdiv throughput 1/74, not pipelined but only in the FPU
;; Covers both sdiv and ddiv.  Issued in slot1 only, then "nonpipeline" is
;; held for 73 cycles (74-cycle default latency).
;; NOTE(review): "nonpipeline" also reserves fxu_cell and lsu_cell, which is
;; broader than "only in the FPU" -- confirm this over-approximation is
;; intended.
319(define_insn_reservation "cell-sdiv" 74
320  (and (eq_attr "type" "sdiv,ddiv")
321       (eq_attr "cpu" "cell"))
322  "slot1,nonpipeline,nonpipeline*72")
323
324;; fsqrt throughput 1/84, not pipelined but only in the FPU
;; Covers both ssqrt and dsqrt.  Issued in slot1 only, then "nonpipeline" is
;; held for 83 cycles (84-cycle default latency).
;; NOTE(review): as with "cell-sdiv", "nonpipeline" blocks fxu_cell and
;; lsu_cell too, not just the FPU -- confirm this is intended.
325(define_insn_reservation "cell-sqrt" 84
326  (and (eq_attr "type" "ssqrt,dsqrt")
327       (eq_attr "cpu" "cell"))
328  "slot1,nonpipeline,nonpipeline*82")
329
330; VMX
;; Types vecsimple, veclogical and vecmove: default latency 4; an issue slot
;; on the first cycle, then vsu1_cell for three consecutive cycles.
331(define_insn_reservation "cell-vecsimple" 4
332  (and (eq_attr "type" "vecsimple,veclogical,vecmove")
333       (eq_attr "cpu" "cell"))
334  "slot01,vsu1_cell,vsu1_cell*2")
335
336;; mult, div, madd
;; Default latency 10; an issue slot on the first cycle, then vsu1_cell for
;; nine consecutive cycles.
337(define_insn_reservation "cell-veccomplex" 10
338  (and (eq_attr "type" "veccomplex")
339       (eq_attr "cpu" "cell"))
340  "slot01,vsu1_cell,vsu1_cell*8")
341
342;; TODO: add support for recording instructions
;; Types veccmp and veccmpfx: modelled exactly like "cell-vecsimple"
;; (latency 4, issue slot then vsu1_cell for three cycles).
343(define_insn_reservation "cell-veccmp" 4
344  (and (eq_attr "type" "veccmp,veccmpfx")
345       (eq_attr "cpu" "cell"))
346  "slot01,vsu1_cell,vsu1_cell*2")
347
;; Vector float insns: default latency 12; an issue slot on the first cycle,
;; then vsu1_cell for eleven consecutive cycles.
348(define_insn_reservation "cell-vecfloat" 12
349  (and (eq_attr "type" "vecfloat")
350       (eq_attr "cpu" "cell"))
351  "slot01,vsu1_cell,vsu1_cell*10")
352
;; Vector permutes: default latency 4.  Unlike the other VMX arithmetic
;; reservations this one uses vsu2_cell (for three consecutive cycles after
;; the issue cycle), not vsu1_cell.
353(define_insn_reservation "cell-vecperm" 4
354  (and (eq_attr "type" "vecperm")
355       (eq_attr "cpu" "cell"))
356  "slot01,vsu2_cell,vsu2_cell*2")
357
358;; New for 4.2, syncs
359
;; The four reservations below (sync, isync, load_l, store_c) are identical
;; apart from the insn type they match: default latency 11, an issue slot on
;; the first cycle, then lsu_cell held for ten consecutive cycles.
360(define_insn_reservation "cell-sync" 11
361  (and (eq_attr "type" "sync")
362       (eq_attr "cpu" "cell"))
363  "slot01,lsu_cell,lsu_cell*9")
364
365(define_insn_reservation "cell-isync" 11
366  (and (eq_attr "type" "isync")
367       (eq_attr "cpu" "cell"))
368  "slot01,lsu_cell,lsu_cell*9")
369
370(define_insn_reservation "cell-load_l" 11
371  (and (eq_attr "type" "load_l")
372       (eq_attr "cpu" "cell"))
373  "slot01,lsu_cell,lsu_cell*9")
374
375(define_insn_reservation "cell-store_c" 11
376  (and (eq_attr "type" "store_c")
377       (eq_attr "cpu" "cell"))
378  "slot01,lsu_cell,lsu_cell*9")
379
380;; RAW register dependency
381
382;; addi r3, r3, 1
383;; lw r4,offset(r3)
384;; there is a 5 cycle delay for r3 bypassing
385;; there is a 5 cycle delay for a dependent load after a load
;; Each bypass replaces the producer's default latency of 2 with 5 when the
;; consumer is one of the listed load reservations.
;; NOTE(review): "cell-load-ux" appears in none of these lists, as producer
;; or consumer -- confirm whether update-form loads should be covered too.
386(define_bypass 5 "cell-integer" "cell-load")
387(define_bypass 5 "cell-integer" "cell-load-ext")
388(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
389
390;; there is a 6 cycle delay after a fp compare until you can use the cr.
;; Overrides the 1-cycle default latency of "cell-fpcompare" for the listed
;; consumers.
391(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
392
393;; VXU float RAW
;; A vecfloat result consumed by another vecfloat insn is available after
;; 11 cycles, one fewer than the 12-cycle default latency.
394(define_bypass 11 "cell-vecfloat" "cell-vecfloat")
395
396;; VXU and FPU
;; veccomplex result feeding vecsimple: 6 cycles instead of the default 10.
397(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;; NOTE(review): the disabled bypass below names "cell-veccompare"; the
;; reservation visible in this file is called "cell-veccmp".
398;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
;; vecfloat result feeding veccomplex: 3 cycles instead of the default 12.
399(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
400; this is not correct,
401;;  this is a stall in general and not dependent on result
;; Raises the vecstore -> fpstore latency from the default 1 to 13.
402(define_bypass 13 "cell-vecstore" "cell-fpstore")
403; this is not correct, this can never be true, not dependent on result
;; Lowers the fp -> fpload latency from the default 10 to 7.
404(define_bypass 7 "cell-fp" "cell-fpload")
405;; vsu1 should avoid writing to the same target register as vsu2 insn
406;;   within 12 cycles.
407
408;; WAW hazard
409
410;; the target of VSU estimate should not be reused within 10 dispatch groups
411;; the target of VSU float should not be reused within 8 dispatch groups
412;; the target of VSU complex should not be reused within 5 dispatch groups
413;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
414
415;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
416;;  ex4 stage(10 cycles)
;; Raises the mtjmpr -> branchreg latency from the default 1 to 10.
417(define_bypass 10 "cell-mtjmpr" "cell-branchreg")
418
419;;Things that are not simulated:
420;; update instruction, update address gpr are not simulated
421;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
422;;  insns
423
424