xref: /dragonfly/contrib/gcc-8.0/gcc/config/i386/k6.md (revision 5e83d98b)
1;; AMD K6/K6-2 Scheduling
2;; Copyright (C) 2002-2018 Free Software Foundation, Inc.
3;;
4;; This file is part of GCC.
5;;
6;; GCC is free software; you can redistribute it and/or modify
7;; it under the terms of the GNU General Public License as published by
8;; the Free Software Foundation; either version 3, or (at your option)
9;; any later version.
10;;
11;; GCC is distributed in the hope that it will be useful,
12;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14;; GNU General Public License for more details.
15;;
16;; You should have received a copy of the GNU General Public License
17;; along with GCC; see the file COPYING3.  If not see
18;; <http://www.gnu.org/licenses/>.
19;;
;; The K6 architecture is quite similar to PPro.  The important difference is
;; that there are only two decoders and they seem to be much slower than
;; any of the execution units.  So we have to pay much more attention to
;; proper scheduling for the decoders.
24;; FIXME: We don't do that right now.  A good start would be to sort the
25;;        instructions based on length.
26;;
27;; This description is based on data from the following documents:
28;;
29;;    "AMD-K6 Processor Data Sheet (Preliminary information)"
30;;    Advanced Micro Devices, Inc., 1998.
31;;
32;;    "AMD-K6 Processor Code Optimization Application Note"
33;;    Advanced Micro Devices, Inc., 2000.
34;;
35;; CPU execution units of the K6:
36;;
37;; store	describes the Store unit.  This unit is not modelled
38;;		completely and it is only used to model lea operation.
39;;		Otherwise it lies outside of any critical path.
40;; load		describes the Load unit
41;; alux		describes the Integer X unit
42;; mm		describes the Multimedia unit, which shares a pipe
43;;		with the Integer X unit.  This unit is used for MMX,
44;;		which is not implemented for K6.
45;; aluy		describes the Integer Y unit
46;; fpu		describes the FPU unit
47;; branch	describes the Branch unit
48;;
;; The fp unit is not pipelined, and it can only do one operation per two
;; cycles, including fxch.
51;;
52;; Generally this is a very poor description, but at least no worse than
53;; the old description, and a lot easier to extend to something more
54;; reasonable if anyone still cares enough about this architecture in 2004.
55;;
56;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
57
;; One automaton per modelled resource class: the decoders, the load unit,
;; the store unit, the two integer units, the FPU and the branch unit.
(define_automaton
  "k6_decoder,k6_load_unit,k6_store_unit,k6_integer_units,k6_fpu_unit,k6_branch_unit")
59
;; K6 instruction decoding begins before the on-chip instruction cache is
;; filled.  Depending on instruction length, either two simple instructions
;; are decoded in the two parallel short decoders, or one complex instruction
;; is decoded in the long or the vector decoder.  For all practical purposes
;; the long and vector decoders can be modelled as a single decoder.

;; The two short decoders.
(define_cpu_unit "k6_decode_short0,k6_decode_short1" "k6_decoder")

;; The combined long/vector decoder.  The exclusion set states that it
;; cannot be reserved together with either short decoder.
(define_cpu_unit "k6_decode_long" "k6_decoder")
(exclusion_set "k6_decode_long" "k6_decode_short0,k6_decode_short1")

;; Names used by the insn reservations below: a short decode takes either
;; short decoder, a vector decode takes the long decoder.
(define_reservation "k6_decode_short" "k6_decode_short0|k6_decode_short1")
(define_reservation "k6_decode_vector" "k6_decode_long")
71
;; Execution units, each declared in the automaton of its resource class.
(define_cpu_unit "k6_store" "k6_store_unit")
(define_cpu_unit "k6_load" "k6_load_unit")
;; The two integer pipes share a single automaton.
(define_cpu_unit "k6_alux" "k6_integer_units")
(define_cpu_unit "k6_aluy" "k6_integer_units")
(define_cpu_unit "k6_fpu" "k6_fpu_unit")
(define_cpu_unit "k6_branch" "k6_branch_unit")
77
;; Shift instructions and certain arithmetic are issued only on Integer X.
;; Register-only form: short decode, then one cycle on Integer X.
;; Default latency is 1.
(define_insn_reservation "k6_alux_only" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_alux")
84
;; Integer-X-only operations with a memory source: short decode, then the
;; load unit, then Integer X.  Default latency is 3.
(define_insn_reservation "k6_alux_only_load" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load,k6_alux")
90
;; Integer-X-only operations whose "memory" attribute is store, both or
;; unknown: long decode, then load, Integer X and store in successive
;; cycles.  Default latency is 3.
(define_insn_reservation "k6_alux_only_store" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_long,k6_load,k6_alux,k6_store")
96
;; Integer divide and multiply can only be issued on Integer X, too.
;; Register-only multiply: vector decode, then Integer X is held for three
;; cycles.  Default latency is 2.
;; The "memory" test restricts this reservation to the no-memory case so
;; that its condition does not overlap k6_alu_imul_load and
;; k6_alu_imul_store; when several reservation conditions are true for one
;; insn, the reservation that gets used is not defined.
(define_insn_reservation "k6_alu_imul" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_alux*3")
102
;; Multiply with a memory source: vector decode, one cycle on the load
;; unit, then three cycles on Integer X.  Default latency is 4.
(define_insn_reservation "k6_alu_imul_load" 4
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "load")))
  "k6_decode_vector,k6_load,k6_alux*3")
108
;; Multiply whose "memory" attribute is store, both or unknown: as the
;; load form, followed by a cycle on the store unit.  Default latency is 4.
(define_insn_reservation "k6_alu_imul_store" 4
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_vector,k6_load,k6_alux*3,k6_store")
114
;; ??? Guessed latencies based on the old pipeline description.
;; Register-only divide: vector decode, then Integer X is held for 17
;; cycles.  Default latency is 17.
(define_insn_reservation "k6_alu_idiv" 17
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "idiv")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_alux*17")
121
;; Divide with any memory access ("memory" is anything but none): vector
;; decode, one cycle on the load unit, then 17 cycles on Integer X.
;; Default latency is 19.
(define_insn_reservation "k6_alu_idiv_mem" 19
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "idiv")
            (eq_attr "memory" "!none")))
  "k6_decode_vector,k6_load,k6_alux*17")
127
;; Basic word and doubleword ALU ops can be issued on both Integer units.
;; Register-only form: short decode, then one cycle on either Integer X or
;; Integer Y.  Default latency is 1.
;; NOTE(review): alu1 and negnot are also listed by k6_alux_only with the
;; same "memory" test, so the two conditions overlap for those types --
;; confirm which reservation is intended for them.
(define_insn_reservation "k6_alu" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_alux|k6_aluy")
134
;; Basic ALU ops with a memory source: short decode, the load unit, then
;; either integer unit.  Default latency is 3.
;; NOTE(review): alu1 and negnot with a load also satisfy the condition of
;; k6_alux_only_load -- confirm which reservation is intended for them.
(define_insn_reservation "k6_alu_load" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load,k6_alux|k6_aluy")
140
;; Basic ALU ops whose "memory" attribute is store, both or unknown: long
;; decode, the load unit, either integer unit, then the store unit.
;; Default latency is 3.
;; NOTE(review): alu1 and negnot with these memory values also satisfy the
;; condition of k6_alux_only_store -- confirm which one is intended.
(define_insn_reservation "k6_alu_store" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_long,k6_load,k6_alux|k6_aluy,k6_store")
146
;; A "load immediate" operation does not require execution at all,
;; it is available immediately after decoding.  That case is special-cased
;; by k6_alu_imov_imm; this reservation covers the remaining no-memory
;; moves, i.e. those whose operand 1 satisfies nonimmediate_operand.
;; They take a short decode and one cycle on either integer unit.
;; Default latency is 1.
(define_insn_reservation "k6_alu_imov" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (and (eq_attr "memory" "none")
                 (match_operand 1 "nonimmediate_operand"))))
  "k6_decode_short,k6_alux|k6_aluy")
155
;; Move of an immediate (operand 1 satisfies immediate_operand) with no
;; memory access.  Only a short decoder is reserved, no execution unit.
;; Default latency is 0.
(define_insn_reservation "k6_alu_imov_imm" 0
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (and (eq_attr "memory" "none")
                 (match_operand 1 "immediate_operand"))))
  "k6_decode_short")
162
;; Integer move from memory: short decode, then the load unit.
;; Default latency is 2.
(define_insn_reservation "k6_alu_imov_load" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load")
168
;; Integer move to memory: short decode, then the store unit.
;; Default latency is 1.
(define_insn_reservation "k6_alu_imov_store" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "store")))
  "k6_decode_short,k6_store")
174
;; Integer move whose "memory" attribute is both or unknown: long decode,
;; the load unit, then either integer unit.  Default latency is 2.
(define_insn_reservation "k6_alu_imov_both" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "both,unknown")))
  "k6_decode_long,k6_load,k6_alux|k6_aluy")
180
;; The branch unit.
;; Calls (types call and callv): vector decode, then the branch unit.
;; Default latency is 1.
(define_insn_reservation "k6_branch_call" 1
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "call,callv"))
  "k6_decode_vector,k6_branch")
186
;; Branches (type ibr): short decode, then the branch unit.
;; Default latency is 1.
(define_insn_reservation "k6_branch_branch" 1
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "ibr"))
  "k6_decode_short,k6_branch")
191
;; The load and store units have two pipeline stages.  The load latency is
;; two cycles.
;; This reservation covers pop, plus any insn whose "memory" attribute is
;; load or both: short decode, then the load unit.  Default latency is 3.
;; NOTE(review): the memory test is not restricted by type, so this
;; condition also holds for insns described by the other load reservations
;; in this file (for example k6_load_str and k6_fpu_load) -- confirm that
;; the resulting overlap is intended.
(define_insn_reservation "k6_load_pop" 3
  (and (eq_attr "cpu" "k6")
       (ior (eq_attr "type" "pop")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load")
199
;; leave: long decode, the load unit, then two cycles on either integer
;; unit.  Default latency is 5.
(define_insn_reservation "k6_load_leave" 5
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "leave"))
  "k6_decode_long,k6_load,(k6_alux|k6_aluy)*2")
204
;; ??? From the old pipeline description.  Egad!
;; ??? Apparently we take care of this reservation in adjust_cost.
;; String insns whose "memory" attribute is load or both: vector decode,
;; then the load unit is held for ten cycles.  Default latency is 10.
(define_insn_reservation "k6_load_str" 10
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "str")
            (eq_attr "memory" "load,both")))
  "k6_decode_vector,k6_load*10")
212
;; The store unit handles lea and push.  It is otherwise unmodelled.
;; lea: short decode, the store unit, then either integer unit.
;; Default latency is 2.
(define_insn_reservation "k6_store_lea" 2
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "lea"))
  "k6_decode_short,k6_store,k6_alux|k6_aluy")
218
;; push, plus any insn whose "memory" attribute is store or both: short
;; decode, then the store unit.  Default latency is 2.
;; NOTE(review): the memory test is not restricted by type, so this
;; condition also holds for insns described by other store reservations in
;; this file (for example k6_fpu_store) -- confirm that is intended.
(define_insn_reservation "k6_store_push" 2
  (and (eq_attr "cpu" "k6")
       (ior (eq_attr "type" "push")
            (eq_attr "memory" "store,both")))
  "k6_decode_short,k6_store")
224
;; String insns: the store unit is held for ten cycles.  No decoder is
;; reserved here.  Default latency is 10.
;; NOTE(review): the condition has no "memory" test, so it also holds for
;; the str insns described by k6_load_str -- confirm that is intended.
(define_insn_reservation "k6_store_str" 10
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "str"))
  "k6_store*10")
229
;; Most FPU instructions have latency 2 and throughput 2.
;; Register-only fop, fmov, fcmp and fistp: vector decode, then the FPU is
;; held for two cycles.  Default latency is 2.
(define_insn_reservation "k6_fpu" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_fpu*2")
236
;; fop, fmov, fcmp and fistp whose "memory" attribute is load or both:
;; short decode, the load unit, then two cycles on the FPU.
;; Default latency is 6.
(define_insn_reservation "k6_fpu_load" 6
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load,k6_fpu*2")
242
;; fop, fmov, fcmp and fistp that store to memory: short decode, the store
;; unit, then two cycles on the FPU.  Default latency is 6.
(define_insn_reservation "k6_fpu_store" 6
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "store")))
  "k6_decode_short,k6_store,k6_fpu*2")
248
;; Register-only floating-point multiply: short decode, then two cycles on
;; the FPU.  Default latency is 2.
(define_insn_reservation "k6_fpu_fmul" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fmul")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_fpu*2")
254
;; Floating-point multiply whose "memory" attribute is load or both: short
;; decode, the load unit, then two cycles on the FPU.
;; Default latency is 2.
(define_insn_reservation "k6_fpu_fmul_load" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fmul")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load,k6_fpu*2")
260
;; ??? Guessed latencies from the old pipeline description.
;; fdiv and fpspc: short decode, then the FPU is held for 56 cycles.
;; Default latency is 56.
(define_insn_reservation "k6_fpu_expensive" 56
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "fdiv,fpspc"))
  "k6_decode_short,k6_fpu*56")
266
267