1 /* Costs of operations of individual x86 CPUs.
2    Copyright (C) 1988-2021 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10 
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
19 
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 <http://www.gnu.org/licenses/>.  */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
27 
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29 
30 static stringop_algs ix86_size_memcpy[2] = {
31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
33 static stringop_algs ix86_size_memset[2] = {
34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36 
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39   {
40   /* Start of register allocator costs.  integer->integer move cost is 2. */
41   2,				     /* cost for loading QImode using movzbl */
42   {2, 2, 2},				/* cost of loading integer registers
43 					   in QImode, HImode and SImode.
44 					   Relative to reg-reg move (2).  */
45   {2, 2, 2},				/* cost of storing integer registers */
46   2,					/* cost of reg,reg fld/fst */
47   {2, 2, 2},				/* cost of loading fp registers
48 					   in SFmode, DFmode and XFmode */
49   {2, 2, 2},				/* cost of storing fp registers
50 					   in SFmode, DFmode and XFmode */
51   3,					/* cost of moving MMX register */
52   {3, 3},				/* cost of loading MMX registers
53 					   in SImode and DImode */
54   {3, 3},				/* cost of storing MMX registers
55 					   in SImode and DImode */
56   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
57   {3, 3, 3, 3, 3},			/* cost of loading SSE registers
58 					   in 32,64,128,256 and 512-bit */
59   {3, 3, 3, 3, 3},			/* cost of storing SSE registers
60 					   in 32,64,128,256 and 512-bit */
61   3, 3,				/* SSE->integer and integer->SSE moves */
62   3, 3,				/* mask->integer and integer->mask moves */
63   {2, 2, 2},				/* cost of loading mask register
64 					   in QImode, HImode, SImode.  */
65   {2, 2, 2},				/* cost if storing mask register
66 					   in QImode, HImode, SImode.  */
67   2,					/* cost of moving mask register.  */
68   /* End of register allocator costs.  */
69   },
70 
71   COSTS_N_BYTES (2),			/* cost of an add instruction */
72   COSTS_N_BYTES (3),			/* cost of a lea instruction */
73   COSTS_N_BYTES (2),			/* variable shift costs */
74   COSTS_N_BYTES (3),			/* constant shift costs */
75   {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
76    COSTS_N_BYTES (3),			/*				 HI */
77    COSTS_N_BYTES (3),			/*				 SI */
78    COSTS_N_BYTES (3),			/*				 DI */
79    COSTS_N_BYTES (5)},			/*			      other */
80   0,					/* cost of multiply per each bit set */
81   {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
82    COSTS_N_BYTES (3),			/*			    HI */
83    COSTS_N_BYTES (3),			/*			    SI */
84    COSTS_N_BYTES (3),			/*			    DI */
85    COSTS_N_BYTES (5)},			/*			    other */
86   COSTS_N_BYTES (3),			/* cost of movsx */
87   COSTS_N_BYTES (3),			/* cost of movzx */
88   0,					/* "large" insn */
89   2,					/* MOVE_RATIO */
90   2,					/* CLEAR_RATIO */
91   {2, 2, 2},				/* cost of loading integer registers
92 					   in QImode, HImode and SImode.
93 					   Relative to reg-reg move (2).  */
94   {2, 2, 2},				/* cost of storing integer registers */
95   {3, 3, 3, 3, 3},			/* cost of loading SSE register
96 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
97   {3, 3, 3, 3, 3},			/* cost of storing SSE register
98 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
99   {3, 3, 3, 3, 3},			/* cost of unaligned SSE load
100 					   in 128bit, 256bit and 512bit */
101   {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
102 					   in 128bit, 256bit and 512bit */
103   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
104   3,					/* cost of moving SSE register to integer.  */
105   5, 0,					/* Gather load static, per_elt.  */
106   5, 0,					/* Gather store static, per_elt.  */
107   0,					/* size of l1 cache  */
108   0,					/* size of l2 cache  */
109   0,					/* size of prefetch block */
110   0,					/* number of parallel prefetches */
111   2,					/* Branch cost */
112   COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
113   COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
114   COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
115   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
116   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
117   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
118 
119   COSTS_N_BYTES (2),			/* cost of cheap SSE instruction.  */
120   COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
121   COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
122   COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
123   COSTS_N_BYTES (2),			/* cost of FMA SS instruction.  */
124   COSTS_N_BYTES (2),			/* cost of FMA SD instruction.  */
125   COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
126   COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
127   COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
128   COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
129   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
130   ix86_size_memcpy,
131   ix86_size_memset,
132   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
133   COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
134   NULL,					/* Loop alignment.  */
135   NULL,					/* Jump alignment.  */
136   NULL,					/* Label alignment.  */
137   NULL,					/* Func alignment.  */
138 };
139 
140 /* Processor costs (relative to an add) */
141 static stringop_algs i386_memcpy[2] = {
142   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
143   DUMMY_STRINGOP_ALGS};
144 static stringop_algs i386_memset[2] = {
145   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
146   DUMMY_STRINGOP_ALGS};
147 
148 static const
149 struct processor_costs i386_cost = {	/* 386 specific costs */
150   {
151   /* Start of register allocator costs.  integer->integer move cost is 2. */
152   4,				     /* cost for loading QImode using movzbl */
153   {2, 4, 2},				/* cost of loading integer registers
154 					   in QImode, HImode and SImode.
155 					   Relative to reg-reg move (2).  */
156   {2, 4, 2},				/* cost of storing integer registers */
157   2,					/* cost of reg,reg fld/fst */
158   {8, 8, 8},				/* cost of loading fp registers
159 					   in SFmode, DFmode and XFmode */
160   {8, 8, 8},				/* cost of storing fp registers
161 					   in SFmode, DFmode and XFmode */
162   2,					/* cost of moving MMX register */
163   {4, 8},				/* cost of loading MMX registers
164 					   in SImode and DImode */
165   {4, 8},				/* cost of storing MMX registers
166 					   in SImode and DImode */
167   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
168   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
169 					   in 32,64,128,256 and 512-bit */
170   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
171 					   in 32,64,128,256 and 512-bit */
172   3, 3,				/* SSE->integer and integer->SSE moves */
173   3, 3,				/* mask->integer and integer->mask moves */
174   {2, 4, 2},				/* cost of loading mask register
175 					   in QImode, HImode, SImode.  */
176   {2, 4, 2},				/* cost if storing mask register
177 					   in QImode, HImode, SImode.  */
178   2,					/* cost of moving mask register.  */
179   /* End of register allocator costs.  */
180   },
181 
182   COSTS_N_INSNS (1),			/* cost of an add instruction */
183   COSTS_N_INSNS (1),			/* cost of a lea instruction */
184   COSTS_N_INSNS (3),			/* variable shift costs */
185   COSTS_N_INSNS (2),			/* constant shift costs */
186   {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
187    COSTS_N_INSNS (6),			/*				 HI */
188    COSTS_N_INSNS (6),			/*				 SI */
189    COSTS_N_INSNS (6),			/*				 DI */
190    COSTS_N_INSNS (6)},			/*			      other */
191   COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
192   {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
193    COSTS_N_INSNS (23),			/*			    HI */
194    COSTS_N_INSNS (23),			/*			    SI */
195    COSTS_N_INSNS (23),			/*			    DI */
196    COSTS_N_INSNS (23)},			/*			    other */
197   COSTS_N_INSNS (3),			/* cost of movsx */
198   COSTS_N_INSNS (2),			/* cost of movzx */
199   15,					/* "large" insn */
200   3,					/* MOVE_RATIO */
201   3,					/* CLEAR_RATIO */
202   {2, 4, 2},				/* cost of loading integer registers
203 					   in QImode, HImode and SImode.
204 					   Relative to reg-reg move (2).  */
205   {2, 4, 2},				/* cost of storing integer registers */
206   {4, 8, 16, 32, 64},			/* cost of loading SSE register
207 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
208   {4, 8, 16, 32, 64},			/* cost of storing SSE register
209 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
210   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
211   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
212   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
213   3,					/* cost of moving SSE register to integer.  */
214   4, 4,					/* Gather load static, per_elt.  */
215   4, 4,					/* Gather store static, per_elt.  */
216   0,					/* size of l1 cache  */
217   0,					/* size of l2 cache  */
218   0,					/* size of prefetch block */
219   0,					/* number of parallel prefetches */
220   1,					/* Branch cost */
221   COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
222   COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
223   COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
224   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
225   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
226   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
227 
228   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
229   COSTS_N_INSNS (23),			/* cost of ADDSS/SD SUBSS/SD insns.  */
230   COSTS_N_INSNS (27),			/* cost of MULSS instruction.  */
231   COSTS_N_INSNS (27),			/* cost of MULSD instruction.  */
232   COSTS_N_INSNS (27),			/* cost of FMA SS instruction.  */
233   COSTS_N_INSNS (27),			/* cost of FMA SD instruction.  */
234   COSTS_N_INSNS (88),			/* cost of DIVSS instruction.  */
235   COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
236   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
237   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
238   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
239   i386_memcpy,
240   i386_memset,
241   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
242   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
243   "4",					/* Loop alignment.  */
244   "4",					/* Jump alignment.  */
245   NULL,					/* Label alignment.  */
246   "4",					/* Func alignment.  */
247 };
248 
249 static stringop_algs i486_memcpy[2] = {
250   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
251   DUMMY_STRINGOP_ALGS};
252 static stringop_algs i486_memset[2] = {
253   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
254   DUMMY_STRINGOP_ALGS};
255 
256 static const
257 struct processor_costs i486_cost = {	/* 486 specific costs */
258   {
259   /* Start of register allocator costs.  integer->integer move cost is 2. */
260   4,				     /* cost for loading QImode using movzbl */
261   {2, 4, 2},				/* cost of loading integer registers
262 					   in QImode, HImode and SImode.
263 					   Relative to reg-reg move (2).  */
264   {2, 4, 2},				/* cost of storing integer registers */
265   2,					/* cost of reg,reg fld/fst */
266   {8, 8, 8},				/* cost of loading fp registers
267 					   in SFmode, DFmode and XFmode */
268   {8, 8, 8},				/* cost of storing fp registers
269 					   in SFmode, DFmode and XFmode */
270   2,					/* cost of moving MMX register */
271   {4, 8},				/* cost of loading MMX registers
272 					   in SImode and DImode */
273   {4, 8},				/* cost of storing MMX registers
274 					   in SImode and DImode */
275   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
276   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
277 					   in 32,64,128,256 and 512-bit */
278   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
279 					   in 32,64,128,256 and 512-bit */
280   3, 3,				/* SSE->integer and integer->SSE moves */
281   3, 3,				/* mask->integer and integer->mask moves */
282   {2, 4, 2},				/* cost of loading mask register
283 					   in QImode, HImode, SImode.  */
284   {2, 4, 2},				/* cost if storing mask register
285 					   in QImode, HImode, SImode.  */
286   2,					/* cost of moving mask register.  */
287   /* End of register allocator costs.  */
288   },
289 
290   COSTS_N_INSNS (1),			/* cost of an add instruction */
291   COSTS_N_INSNS (1),			/* cost of a lea instruction */
292   COSTS_N_INSNS (3),			/* variable shift costs */
293   COSTS_N_INSNS (2),			/* constant shift costs */
294   {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
295    COSTS_N_INSNS (12),			/*				 HI */
296    COSTS_N_INSNS (12),			/*				 SI */
297    COSTS_N_INSNS (12),			/*				 DI */
298    COSTS_N_INSNS (12)},			/*			      other */
299   1,					/* cost of multiply per each bit set */
300   {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
301    COSTS_N_INSNS (40),			/*			    HI */
302    COSTS_N_INSNS (40),			/*			    SI */
303    COSTS_N_INSNS (40),			/*			    DI */
304    COSTS_N_INSNS (40)},			/*			    other */
305   COSTS_N_INSNS (3),			/* cost of movsx */
306   COSTS_N_INSNS (2),			/* cost of movzx */
307   15,					/* "large" insn */
308   3,					/* MOVE_RATIO */
309   3,					/* CLEAR_RATIO */
310   {2, 4, 2},				/* cost of loading integer registers
311 					   in QImode, HImode and SImode.
312 					   Relative to reg-reg move (2).  */
313   {2, 4, 2},				/* cost of storing integer registers */
314   {4, 8, 16, 32, 64},			/* cost of loading SSE register
315 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
316   {4, 8, 16, 32, 64},			/* cost of storing SSE register
317 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
318   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
319   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
320   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
321   3,					/* cost of moving SSE register to integer.  */
322   4, 4,					/* Gather load static, per_elt.  */
323   4, 4,					/* Gather store static, per_elt.  */
324   4,					/* size of l1 cache.  486 has 8kB cache
325 					   shared for code and data, so 4kB is
326 					   not really precise.  */
327   4,					/* size of l2 cache  */
328   0,					/* size of prefetch block */
329   0,					/* number of parallel prefetches */
330   1,					/* Branch cost */
331   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
332   COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
333   COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
334   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
335   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
336   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
337 
338   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
339   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
340   COSTS_N_INSNS (16),			/* cost of MULSS instruction.  */
341   COSTS_N_INSNS (16),			/* cost of MULSD instruction.  */
342   COSTS_N_INSNS (16),			/* cost of FMA SS instruction.  */
343   COSTS_N_INSNS (16),			/* cost of FMA SD instruction.  */
344   COSTS_N_INSNS (73),			/* cost of DIVSS instruction.  */
345   COSTS_N_INSNS (74),			/* cost of DIVSD instruction.  */
346   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
347   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
348   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
349   i486_memcpy,
350   i486_memset,
351   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
352   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
353   "16",					/* Loop alignment.  */
354   "16",					/* Jump alignment.  */
355   "0:0:8",				/* Label alignment.  */
356   "16",					/* Func alignment.  */
357 };
358 
359 static stringop_algs pentium_memcpy[2] = {
360   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
361   DUMMY_STRINGOP_ALGS};
362 static stringop_algs pentium_memset[2] = {
363   {libcall, {{-1, rep_prefix_4_byte, false}}},
364   DUMMY_STRINGOP_ALGS};
365 
366 static const
367 struct processor_costs pentium_cost = {
368   {
369   /* Start of register allocator costs.  integer->integer move cost is 2. */
370   6,				     /* cost for loading QImode using movzbl */
371   {2, 4, 2},				/* cost of loading integer registers
372 					   in QImode, HImode and SImode.
373 					   Relative to reg-reg move (2).  */
374   {2, 4, 2},				/* cost of storing integer registers */
375   2,					/* cost of reg,reg fld/fst */
376   {2, 2, 6},				/* cost of loading fp registers
377 					   in SFmode, DFmode and XFmode */
378   {4, 4, 6},				/* cost of storing fp registers
379 					   in SFmode, DFmode and XFmode */
380   8,					/* cost of moving MMX register */
381   {8, 8},				/* cost of loading MMX registers
382 					   in SImode and DImode */
383   {8, 8},				/* cost of storing MMX registers
384 					   in SImode and DImode */
385   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
386   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
387 					   in 32,64,128,256 and 512-bit */
388   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
389 					   in 32,64,128,256 and 512-bit */
390   3, 3,				/* SSE->integer and integer->SSE moves */
391   3, 3,				/* mask->integer and integer->mask moves */
392   {2, 4, 2},				/* cost of loading mask register
393 					   in QImode, HImode, SImode.  */
394   {2, 4, 2},				/* cost if storing mask register
395 					   in QImode, HImode, SImode.  */
396   2,					/* cost of moving mask register.  */
397   /* End of register allocator costs.  */
398   },
399 
400   COSTS_N_INSNS (1),			/* cost of an add instruction */
401   COSTS_N_INSNS (1),			/* cost of a lea instruction */
402   COSTS_N_INSNS (4),			/* variable shift costs */
403   COSTS_N_INSNS (1),			/* constant shift costs */
404   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
405    COSTS_N_INSNS (11),			/*				 HI */
406    COSTS_N_INSNS (11),			/*				 SI */
407    COSTS_N_INSNS (11),			/*				 DI */
408    COSTS_N_INSNS (11)},			/*			      other */
409   0,					/* cost of multiply per each bit set */
410   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
411    COSTS_N_INSNS (25),			/*			    HI */
412    COSTS_N_INSNS (25),			/*			    SI */
413    COSTS_N_INSNS (25),			/*			    DI */
414    COSTS_N_INSNS (25)},			/*			    other */
415   COSTS_N_INSNS (3),			/* cost of movsx */
416   COSTS_N_INSNS (2),			/* cost of movzx */
417   8,					/* "large" insn */
418   6,					/* MOVE_RATIO */
419   6,					/* CLEAR_RATIO */
420   {2, 4, 2},				/* cost of loading integer registers
421 					   in QImode, HImode and SImode.
422 					   Relative to reg-reg move (2).  */
423   {2, 4, 2},				/* cost of storing integer registers */
424   {4, 8, 16, 32, 64},			/* cost of loading SSE register
425 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
426   {4, 8, 16, 32, 64},			/* cost of storing SSE register
427 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
428   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
429   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
430   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
431   3,					/* cost of moving SSE register to integer.  */
432   4, 4,					/* Gather load static, per_elt.  */
433   4, 4,					/* Gather store static, per_elt.  */
434   8,					/* size of l1 cache.  */
435   8,					/* size of l2 cache  */
436   0,					/* size of prefetch block */
437   0,					/* number of parallel prefetches */
438   2,					/* Branch cost */
439   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
440   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
441   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
442   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
443   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
444   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
445 
446   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
447   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
448   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
449   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
450   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
451   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
452   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
453   COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
454   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
455   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
456   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
457   pentium_memcpy,
458   pentium_memset,
459   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
460   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
461   "16:8:8",				/* Loop alignment.  */
462   "16:8:8",				/* Jump alignment.  */
463   "0:0:8",				/* Label alignment.  */
464   "16",					/* Func alignment.  */
465 };
466 
467 static const
468 struct processor_costs lakemont_cost = {
469   {
470   /* Start of register allocator costs.  integer->integer move cost is 2. */
471   6,				     /* cost for loading QImode using movzbl */
472   {2, 4, 2},				/* cost of loading integer registers
473 					   in QImode, HImode and SImode.
474 					   Relative to reg-reg move (2).  */
475   {2, 4, 2},				/* cost of storing integer registers */
476   2,					/* cost of reg,reg fld/fst */
477   {2, 2, 6},				/* cost of loading fp registers
478 					   in SFmode, DFmode and XFmode */
479   {4, 4, 6},				/* cost of storing fp registers
480 					   in SFmode, DFmode and XFmode */
481   8,					/* cost of moving MMX register */
482   {8, 8},				/* cost of loading MMX registers
483 					   in SImode and DImode */
484   {8, 8},				/* cost of storing MMX registers
485 					   in SImode and DImode */
486   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
487   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
488 					   in 32,64,128,256 and 512-bit */
489   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
490 					   in 32,64,128,256 and 512-bit */
491   3, 3,				/* SSE->integer and integer->SSE moves */
492   3, 3,				/* mask->integer and integer->mask moves */
493   {2, 4, 2},				/* cost of loading mask register
494 					   in QImode, HImode, SImode.  */
495   {2, 4, 2},				/* cost if storing mask register
496 					   in QImode, HImode, SImode.  */
497   2,					/* cost of moving mask register.  */
498   /* End of register allocator costs.  */
499   },
500 
501   COSTS_N_INSNS (1),			/* cost of an add instruction */
502   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
503   COSTS_N_INSNS (1),			/* variable shift costs */
504   COSTS_N_INSNS (1),			/* constant shift costs */
505   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
506    COSTS_N_INSNS (11),			/*				 HI */
507    COSTS_N_INSNS (11),			/*				 SI */
508    COSTS_N_INSNS (11),			/*				 DI */
509    COSTS_N_INSNS (11)},			/*			      other */
510   0,					/* cost of multiply per each bit set */
511   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
512    COSTS_N_INSNS (25),			/*			    HI */
513    COSTS_N_INSNS (25),			/*			    SI */
514    COSTS_N_INSNS (25),			/*			    DI */
515    COSTS_N_INSNS (25)},			/*			    other */
516   COSTS_N_INSNS (3),			/* cost of movsx */
517   COSTS_N_INSNS (2),			/* cost of movzx */
518   8,					/* "large" insn */
519   17,					/* MOVE_RATIO */
520   6,					/* CLEAR_RATIO */
521   {2, 4, 2},				/* cost of loading integer registers
522 					   in QImode, HImode and SImode.
523 					   Relative to reg-reg move (2).  */
524   {2, 4, 2},				/* cost of storing integer registers */
525   {4, 8, 16, 32, 64},			/* cost of loading SSE register
526 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
527   {4, 8, 16, 32, 64},			/* cost of storing SSE register
528 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
529   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
530   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
531   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
532   3,					/* cost of moving SSE register to integer.  */
533   4, 4,					/* Gather load static, per_elt.  */
534   4, 4,					/* Gather store static, per_elt.  */
535   8,					/* size of l1 cache.  */
536   8,					/* size of l2 cache  */
537   0,					/* size of prefetch block */
538   0,					/* number of parallel prefetches */
539   2,					/* Branch cost */
540   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
541   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
542   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
543   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
544   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
545   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
546 
547   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
548   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
549   COSTS_N_INSNS (5),			/* cost of MULSS instruction.  */
550   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
551   COSTS_N_INSNS (10),			/* cost of FMA SS instruction.  */
552   COSTS_N_INSNS (10),			/* cost of FMA SD instruction.  */
553   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
554   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
555   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
556   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
557   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
558   pentium_memcpy,
559   pentium_memset,
560   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
561   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
562   "16:8:8",				/* Loop alignment.  */
563   "16:8:8",				/* Jump alignment.  */
564   "0:0:8",				/* Label alignment.  */
565   "16",					/* Func alignment.  */
566 };
567 
568 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
569    (we ensure the alignment).  For small blocks inline loop is still a
570    noticeable win, for bigger blocks either rep movsl or rep movsb is
571    way to go.  Rep movsb has apparently more expensive startup time in CPU,
572    but after 4K the difference is down in the noise.  */
573 static stringop_algs pentiumpro_memcpy[2] = {
574   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
575                        {8192, rep_prefix_4_byte, false},
576                        {-1, rep_prefix_1_byte, false}}},
577   DUMMY_STRINGOP_ALGS};
578 static stringop_algs pentiumpro_memset[2] = {
579   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
580                        {8192, rep_prefix_4_byte, false},
581                        {-1, libcall, false}}},
582   DUMMY_STRINGOP_ALGS};
583 static const
584 struct processor_costs pentiumpro_cost = {
585   {
586   /* Start of register allocator costs.  integer->integer move cost is 2. */
587   2,				     /* cost for loading QImode using movzbl */
588   {4, 4, 4},				/* cost of loading integer registers
589 					   in QImode, HImode and SImode.
590 					   Relative to reg-reg move (2).  */
591   {2, 2, 2},				/* cost of storing integer registers */
592   2,					/* cost of reg,reg fld/fst */
593   {2, 2, 6},				/* cost of loading fp registers
594 					   in SFmode, DFmode and XFmode */
595   {4, 4, 6},				/* cost of storing fp registers
596 					   in SFmode, DFmode and XFmode */
597   2,					/* cost of moving MMX register */
598   {2, 2},				/* cost of loading MMX registers
599 					   in SImode and DImode */
600   {2, 2},				/* cost of storing MMX registers
601 					   in SImode and DImode */
602   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
603   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
604 					   in 32,64,128,256 and 512-bit */
605   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
606 					   in 32,64,128,256 and 512-bit */
607   3, 3,				/* SSE->integer and integer->SSE moves */
608   3, 3,				/* mask->integer and integer->mask moves */
609   {4, 4, 4},				/* cost of loading mask register
610 					   in QImode, HImode, SImode.  */
611   {2, 2, 2},				/* cost if storing mask register
612 					   in QImode, HImode, SImode.  */
613   2,					/* cost of moving mask register.  */
614   /* End of register allocator costs.  */
615   },
616 
617   COSTS_N_INSNS (1),			/* cost of an add instruction */
618   COSTS_N_INSNS (1),			/* cost of a lea instruction */
619   COSTS_N_INSNS (1),			/* variable shift costs */
620   COSTS_N_INSNS (1),			/* constant shift costs */
621   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
622    COSTS_N_INSNS (4),			/*				 HI */
623    COSTS_N_INSNS (4),			/*				 SI */
624    COSTS_N_INSNS (4),			/*				 DI */
625    COSTS_N_INSNS (4)},			/*			      other */
626   0,					/* cost of multiply per each bit set */
627   {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
628    COSTS_N_INSNS (17),			/*			    HI */
629    COSTS_N_INSNS (17),			/*			    SI */
630    COSTS_N_INSNS (17),			/*			    DI */
631    COSTS_N_INSNS (17)},			/*			    other */
632   COSTS_N_INSNS (1),			/* cost of movsx */
633   COSTS_N_INSNS (1),			/* cost of movzx */
634   8,					/* "large" insn */
635   6,					/* MOVE_RATIO */
636   6,					/* CLEAR_RATIO */
637   {4, 4, 4},				/* cost of loading integer registers
638 					   in QImode, HImode and SImode.
639 					   Relative to reg-reg move (2).  */
640   {2, 2, 2},				/* cost of storing integer registers */
641   {4, 8, 16, 32, 64},			/* cost of loading SSE register
642 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
643   {4, 8, 16, 32, 64},			/* cost of storing SSE register
644 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
645   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
646   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
647   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
648   3,					/* cost of moving SSE register to integer.  */
649   4, 4,					/* Gather load static, per_elt.  */
650   4, 4,					/* Gather store static, per_elt.  */
651   8,					/* size of l1 cache.  */
652   256,					/* size of l2 cache  */
653   32,					/* size of prefetch block */
654   6,					/* number of parallel prefetches */
655   2,					/* Branch cost */
656   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
657   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
658   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
659   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
660   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
661   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
662 
663   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
664   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
665   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
666   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
667   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
668   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
669   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
670   COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
671   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
672   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
673   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
674   pentiumpro_memcpy,
675   pentiumpro_memset,
676   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
677   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
678   "16",					/* Loop alignment.  */
679   "16:11:8",				/* Jump alignment.  */
680   "0:0:8",				/* Label alignment.  */
681   "16",					/* Func alignment.  */
682 };
683 
684 static stringop_algs geode_memcpy[2] = {
685   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
686   DUMMY_STRINGOP_ALGS};
687 static stringop_algs geode_memset[2] = {
688   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
689   DUMMY_STRINGOP_ALGS};
690 static const
691 struct processor_costs geode_cost = {
692   {
693   /* Start of register allocator costs.  integer->integer move cost is 2. */
694   2,				     /* cost for loading QImode using movzbl */
695   {2, 2, 2},				/* cost of loading integer registers
696 					   in QImode, HImode and SImode.
697 					   Relative to reg-reg move (2).  */
698   {2, 2, 2},				/* cost of storing integer registers */
699   2,					/* cost of reg,reg fld/fst */
700   {2, 2, 2},				/* cost of loading fp registers
701 					   in SFmode, DFmode and XFmode */
702   {4, 6, 6},				/* cost of storing fp registers
703 					   in SFmode, DFmode and XFmode */
704   2,					/* cost of moving MMX register */
705   {2, 2},				/* cost of loading MMX registers
706 					   in SImode and DImode */
707   {2, 2},				/* cost of storing MMX registers
708 					   in SImode and DImode */
709   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
710   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
711 					   in 32,64,128,256 and 512-bit */
712   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
713 					   in 32,64,128,256 and 512-bit */
714   6, 6,				/* SSE->integer and integer->SSE moves */
715   6, 6,				/* mask->integer and integer->mask moves */
716   {2, 2, 2},				/* cost of loading mask register
717 					   in QImode, HImode, SImode.  */
718   {2, 2, 2},				/* cost if storing mask register
719 					   in QImode, HImode, SImode.  */
720   2,					/* cost of moving mask register.  */
721   /* End of register allocator costs.  */
722   },
723 
724   COSTS_N_INSNS (1),			/* cost of an add instruction */
725   COSTS_N_INSNS (1),			/* cost of a lea instruction */
726   COSTS_N_INSNS (2),			/* variable shift costs */
727   COSTS_N_INSNS (1),			/* constant shift costs */
728   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
729    COSTS_N_INSNS (4),			/*				 HI */
730    COSTS_N_INSNS (7),			/*				 SI */
731    COSTS_N_INSNS (7),			/*				 DI */
732    COSTS_N_INSNS (7)},			/*			      other */
733   0,					/* cost of multiply per each bit set */
734   {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
735    COSTS_N_INSNS (23),			/*			    HI */
736    COSTS_N_INSNS (39),			/*			    SI */
737    COSTS_N_INSNS (39),			/*			    DI */
738    COSTS_N_INSNS (39)},			/*			    other */
739   COSTS_N_INSNS (1),			/* cost of movsx */
740   COSTS_N_INSNS (1),			/* cost of movzx */
741   8,					/* "large" insn */
742   4,					/* MOVE_RATIO */
743   4,					/* CLEAR_RATIO */
744   {2, 2, 2},				/* cost of loading integer registers
745 					   in QImode, HImode and SImode.
746 					   Relative to reg-reg move (2).  */
747   {2, 2, 2},				/* cost of storing integer registers */
748   {2, 2, 8, 16, 32},			/* cost of loading SSE register
749 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
750   {2, 2, 8, 16, 32},			/* cost of storing SSE register
751 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
752   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
753   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
754   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
755   6,					/* cost of moving SSE register to integer.  */
756   2, 2,					/* Gather load static, per_elt.  */
757   2, 2,					/* Gather store static, per_elt.  */
758   64,					/* size of l1 cache.  */
759   128,					/* size of l2 cache.  */
760   32,					/* size of prefetch block */
761   1,					/* number of parallel prefetches */
762   1,					/* Branch cost */
763   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
764   COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
765   COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
766   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
767   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
768   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
769 
770   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
771   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
772   COSTS_N_INSNS (11),			/* cost of MULSS instruction.  */
773   COSTS_N_INSNS (11),			/* cost of MULSD instruction.  */
774   COSTS_N_INSNS (17),			/* cost of FMA SS instruction.  */
775   COSTS_N_INSNS (17),			/* cost of FMA SD instruction.  */
776   COSTS_N_INSNS (47),			/* cost of DIVSS instruction.  */
777   COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
778   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
779   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
780   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
781   geode_memcpy,
782   geode_memset,
783   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
784   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
785   NULL,					/* Loop alignment.  */
786   NULL,					/* Jump alignment.  */
787   NULL,					/* Label alignment.  */
788   NULL,					/* Func alignment.  */
789 };
790 
791 static stringop_algs k6_memcpy[2] = {
792   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
793   DUMMY_STRINGOP_ALGS};
794 static stringop_algs k6_memset[2] = {
795   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
796   DUMMY_STRINGOP_ALGS};
797 static const
798 struct processor_costs k6_cost = {
799   {
800   /* Start of register allocator costs.  integer->integer move cost is 2. */
801   3,				     /* cost for loading QImode using movzbl */
802   {4, 5, 4},				/* cost of loading integer registers
803 					   in QImode, HImode and SImode.
804 					   Relative to reg-reg move (2).  */
805   {2, 3, 2},				/* cost of storing integer registers */
806   4,					/* cost of reg,reg fld/fst */
807   {6, 6, 6},				/* cost of loading fp registers
808 					   in SFmode, DFmode and XFmode */
809   {4, 4, 4},				/* cost of storing fp registers
810 					   in SFmode, DFmode and XFmode */
811   2,					/* cost of moving MMX register */
812   {2, 2},				/* cost of loading MMX registers
813 					   in SImode and DImode */
814   {2, 2},				/* cost of storing MMX registers
815 					   in SImode and DImode */
816   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
817   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
818 					   in 32,64,128,256 and 512-bit */
819   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
820 					   in 32,64,128,256 and 512-bit */
821   6, 6,				/* SSE->integer and integer->SSE moves */
822   6, 6,				/* mask->integer and integer->mask moves */
823   {4, 5, 4},				/* cost of loading mask register
824 					   in QImode, HImode, SImode.  */
825   {2, 3, 2},				/* cost if storing mask register
826 					   in QImode, HImode, SImode.  */
827   2,					/* cost of moving mask register.  */
828   /* End of register allocator costs.  */
829   },
830 
831   COSTS_N_INSNS (1),			/* cost of an add instruction */
832   COSTS_N_INSNS (2),			/* cost of a lea instruction */
833   COSTS_N_INSNS (1),			/* variable shift costs */
834   COSTS_N_INSNS (1),			/* constant shift costs */
835   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
836    COSTS_N_INSNS (3),			/*				 HI */
837    COSTS_N_INSNS (3),			/*				 SI */
838    COSTS_N_INSNS (3),			/*				 DI */
839    COSTS_N_INSNS (3)},			/*			      other */
840   0,					/* cost of multiply per each bit set */
841   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
842    COSTS_N_INSNS (18),			/*			    HI */
843    COSTS_N_INSNS (18),			/*			    SI */
844    COSTS_N_INSNS (18),			/*			    DI */
845    COSTS_N_INSNS (18)},			/*			    other */
846   COSTS_N_INSNS (2),			/* cost of movsx */
847   COSTS_N_INSNS (2),			/* cost of movzx */
848   8,					/* "large" insn */
849   4,					/* MOVE_RATIO */
850   4,					/* CLEAR_RATIO */
851   {4, 5, 4},				/* cost of loading integer registers
852 					   in QImode, HImode and SImode.
853 					   Relative to reg-reg move (2).  */
854   {2, 3, 2},				/* cost of storing integer registers */
855   {2, 2, 8, 16, 32},			/* cost of loading SSE register
856 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
857   {2, 2, 8, 16, 32},			/* cost of storing SSE register
858 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
859   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
860   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
861   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
862   6,					/* cost of moving SSE register to integer.  */
863   2, 2,					/* Gather load static, per_elt.  */
864   2, 2,					/* Gather store static, per_elt.  */
865   32,					/* size of l1 cache.  */
866   32,					/* size of l2 cache.  Some models
867 					   have integrated l2 cache, but
868 					   optimizing for k6 is not important
869 					   enough to worry about that.  */
870   32,					/* size of prefetch block */
871   1,					/* number of parallel prefetches */
872   1,					/* Branch cost */
873   COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
874   COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
875   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
876   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
877   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
878   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
879 
880   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
881   COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
882   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
883   COSTS_N_INSNS (2),			/* cost of MULSD instruction.  */
884   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
885   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
886   COSTS_N_INSNS (56),			/* cost of DIVSS instruction.  */
887   COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
888   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
889   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
890   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
891   k6_memcpy,
892   k6_memset,
893   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
894   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
895   "32:8:8",				/* Loop alignment.  */
896   "32:8:8",				/* Jump alignment.  */
897   "0:0:8",				/* Label alignment.  */
898   "32",					/* Func alignment.  */
899 };
900 
901 /* For some reason, Athlon deals better with REP prefix (relative to loops)
902    compared to K8. Alignment becomes important after 8 bytes for memcpy and
903    128 bytes for memset.  */
904 static stringop_algs athlon_memcpy[2] = {
905   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
906   DUMMY_STRINGOP_ALGS};
907 static stringop_algs athlon_memset[2] = {
908   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
909   DUMMY_STRINGOP_ALGS};
910 static const
911 struct processor_costs athlon_cost = {
912   {
913   /* Start of register allocator costs.  integer->integer move cost is 2. */
914   4,				     /* cost for loading QImode using movzbl */
915   {3, 4, 3},				/* cost of loading integer registers
916 					   in QImode, HImode and SImode.
917 					   Relative to reg-reg move (2).  */
918   {3, 4, 3},				/* cost of storing integer registers */
919   4,					/* cost of reg,reg fld/fst */
920   {4, 4, 12},				/* cost of loading fp registers
921 					   in SFmode, DFmode and XFmode */
922   {6, 6, 8},				/* cost of storing fp registers
923 					   in SFmode, DFmode and XFmode */
924   2,					/* cost of moving MMX register */
925   {4, 4},				/* cost of loading MMX registers
926 					   in SImode and DImode */
927   {4, 4},				/* cost of storing MMX registers
928 					   in SImode and DImode */
929   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
930   {4, 4, 12, 12, 24},			/* cost of loading SSE registers
931 					   in 32,64,128,256 and 512-bit */
932   {4, 4, 10, 10, 20},			/* cost of storing SSE registers
933 					   in 32,64,128,256 and 512-bit */
934   5, 5,				/* SSE->integer and integer->SSE moves */
935   5, 5,				/* mask->integer and integer->mask moves */
936   {3, 4, 3},				/* cost of loading mask register
937 					   in QImode, HImode, SImode.  */
938   {3, 4, 3},				/* cost if storing mask register
939 					   in QImode, HImode, SImode.  */
940   2,					/* cost of moving mask register.  */
941   /* End of register allocator costs.  */
942   },
943 
944   COSTS_N_INSNS (1),			/* cost of an add instruction */
945   COSTS_N_INSNS (2),			/* cost of a lea instruction */
946   COSTS_N_INSNS (1),			/* variable shift costs */
947   COSTS_N_INSNS (1),			/* constant shift costs */
948   {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
949    COSTS_N_INSNS (5),			/*				 HI */
950    COSTS_N_INSNS (5),			/*				 SI */
951    COSTS_N_INSNS (5),			/*				 DI */
952    COSTS_N_INSNS (5)},			/*			      other */
953   0,					/* cost of multiply per each bit set */
954   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
955    COSTS_N_INSNS (26),			/*			    HI */
956    COSTS_N_INSNS (42),			/*			    SI */
957    COSTS_N_INSNS (74),			/*			    DI */
958    COSTS_N_INSNS (74)},			/*			    other */
959   COSTS_N_INSNS (1),			/* cost of movsx */
960   COSTS_N_INSNS (1),			/* cost of movzx */
961   8,					/* "large" insn */
962   9,					/* MOVE_RATIO */
963   6,					/* CLEAR_RATIO */
964   {3, 4, 3},				/* cost of loading integer registers
965 					   in QImode, HImode and SImode.
966 					   Relative to reg-reg move (2).  */
967   {3, 4, 3},				/* cost of storing integer registers */
968   {4, 4, 12, 12, 24},			/* cost of loading SSE register
969 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
970   {4, 4, 10, 10, 20},			/* cost of storing SSE register
971 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
972   {4, 4, 12, 12, 24},			/* cost of unaligned loads.  */
973   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
974   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
975   5,					/* cost of moving SSE register to integer.  */
976   4, 4,					/* Gather load static, per_elt.  */
977   4, 4,					/* Gather store static, per_elt.  */
978   64,					/* size of l1 cache.  */
979   256,					/* size of l2 cache.  */
980   64,					/* size of prefetch block */
981   6,					/* number of parallel prefetches */
982   5,					/* Branch cost */
983   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
984   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
985   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
986   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
987   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
988   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
989 
990   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
991   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
992   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
993   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
994   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
995   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
996   /* 11-16  */
997   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
998   COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
999   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1000   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
1001   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1002   athlon_memcpy,
1003   athlon_memset,
1004   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1005   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1006   "16:8:8",				/* Loop alignment.  */
1007   "16:8:8",				/* Jump alignment.  */
1008   "0:0:8",				/* Label alignment.  */
1009   "16",					/* Func alignment.  */
1010 };
1011 
1012 /* K8 has optimized REP instruction for medium sized blocks, but for very
1013    small blocks it is better to use loop. For large blocks, libcall can
1014    do nontemporary accesses and beat inline considerably.  */
1015 static stringop_algs k8_memcpy[2] = {
1016   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1017              {-1, rep_prefix_4_byte, false}}},
1018   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1019              {-1, libcall, false}}}};
1020 static stringop_algs k8_memset[2] = {
1021   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1022              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1023   {libcall, {{48, unrolled_loop, false},
1024              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1025 static const
1026 struct processor_costs k8_cost = {
1027   {
1028   /* Start of register allocator costs.  integer->integer move cost is 2. */
1029   4,				     /* cost for loading QImode using movzbl */
1030   {3, 4, 3},				/* cost of loading integer registers
1031 					   in QImode, HImode and SImode.
1032 					   Relative to reg-reg move (2).  */
1033   {3, 4, 3},				/* cost of storing integer registers */
1034   4,					/* cost of reg,reg fld/fst */
1035   {4, 4, 12},				/* cost of loading fp registers
1036 					   in SFmode, DFmode and XFmode */
1037   {6, 6, 8},				/* cost of storing fp registers
1038 					   in SFmode, DFmode and XFmode */
1039   2,					/* cost of moving MMX register */
1040   {3, 3},				/* cost of loading MMX registers
1041 					   in SImode and DImode */
1042   {4, 4},				/* cost of storing MMX registers
1043 					   in SImode and DImode */
1044   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1045   {4, 3, 12, 12, 24},			/* cost of loading SSE registers
1046 					   in 32,64,128,256 and 512-bit */
1047   {4, 4, 10, 10, 20},			/* cost of storing SSE registers
1048 					   in 32,64,128,256 and 512-bit */
1049   5, 5,				/* SSE->integer and integer->SSE moves */
1050   5, 5,				/* mask->integer and integer->mask moves */
1051   {3, 4, 3},				/* cost of loading mask register
1052 					   in QImode, HImode, SImode.  */
1053   {3, 4, 3},				/* cost if storing mask register
1054 					   in QImode, HImode, SImode.  */
1055   2,					/* cost of moving mask register.  */
1056   /* End of register allocator costs.  */
1057   },
1058 
1059   COSTS_N_INSNS (1),			/* cost of an add instruction */
1060   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1061   COSTS_N_INSNS (1),			/* variable shift costs */
1062   COSTS_N_INSNS (1),			/* constant shift costs */
1063   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1064    COSTS_N_INSNS (4),			/*				 HI */
1065    COSTS_N_INSNS (3),			/*				 SI */
1066    COSTS_N_INSNS (4),			/*				 DI */
1067    COSTS_N_INSNS (5)},			/*			      other */
1068   0,					/* cost of multiply per each bit set */
1069   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1070    COSTS_N_INSNS (26),			/*			    HI */
1071    COSTS_N_INSNS (42),			/*			    SI */
1072    COSTS_N_INSNS (74),			/*			    DI */
1073    COSTS_N_INSNS (74)},			/*			    other */
1074   COSTS_N_INSNS (1),			/* cost of movsx */
1075   COSTS_N_INSNS (1),			/* cost of movzx */
1076   8,					/* "large" insn */
1077   9,					/* MOVE_RATIO */
1078   6,					/* CLEAR_RATIO */
1079   {3, 4, 3},				/* cost of loading integer registers
1080 					   in QImode, HImode and SImode.
1081 					   Relative to reg-reg move (2).  */
1082   {3, 4, 3},				/* cost of storing integer registers */
1083   {4, 3, 12, 12, 24},			/* cost of loading SSE register
1084 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1085   {4, 4, 10, 10, 20},			/* cost of storing SSE register
1086 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1087   {4, 3, 12, 12, 24},			/* cost of unaligned loads.  */
1088   {4, 4, 10, 10, 20},			/* cost of unaligned stores.  */
1089   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1090   5,					/* cost of moving SSE register to integer.  */
1091   4, 4,					/* Gather load static, per_elt.  */
1092   4, 4,					/* Gather store static, per_elt.  */
1093   64,					/* size of l1 cache.  */
1094   512,					/* size of l2 cache.  */
1095   64,					/* size of prefetch block */
1096   /* New AMD processors never drop prefetches; if they cannot be performed
1097      immediately, they are queued.  We set number of simultaneous prefetches
1098      to a large constant to reflect this (it probably is not a good idea not
1099      to limit number of prefetches at all, as their execution also takes some
1100      time).  */
1101   100,					/* number of parallel prefetches */
1102   3,					/* Branch cost */
1103   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1104   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1105   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1106   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1107   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1108   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1109 
1110   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1111   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1112   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1113   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1114   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
1115   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
1116   /* 11-16  */
1117   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
1118   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
1119   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1120   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
1121   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1122   k8_memcpy,
1123   k8_memset,
1124   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1125   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1126   "16:8:8",				/* Loop alignment.  */
1127   "16:8:8",				/* Jump alignment.  */
1128   "0:0:8",				/* Label alignment.  */
1129   "16",					/* Func alignment.  */
1130 };
1131 
1132 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1133    very small blocks it is better to use loop. For large blocks, libcall can
1134    do nontemporary accesses and beat inline considerably.  */
1135 static stringop_algs amdfam10_memcpy[2] = {
1136   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1137              {-1, rep_prefix_4_byte, false}}},
1138   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1139              {-1, libcall, false}}}};
1140 static stringop_algs amdfam10_memset[2] = {
1141   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1142              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1143   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1144              {-1, libcall, false}}}};
1145 struct processor_costs amdfam10_cost = {
1146   {
1147   /* Start of register allocator costs.  integer->integer move cost is 2. */
1148   4,				     /* cost for loading QImode using movzbl */
1149   {3, 4, 3},				/* cost of loading integer registers
1150 					   in QImode, HImode and SImode.
1151 					   Relative to reg-reg move (2).  */
1152   {3, 4, 3},				/* cost of storing integer registers */
1153   4,					/* cost of reg,reg fld/fst */
1154   {4, 4, 12},				/* cost of loading fp registers
1155 		   			   in SFmode, DFmode and XFmode */
1156   {6, 6, 8},				/* cost of storing fp registers
1157  		   			   in SFmode, DFmode and XFmode */
1158   2,					/* cost of moving MMX register */
1159   {3, 3},				/* cost of loading MMX registers
1160 					   in SImode and DImode */
1161   {4, 4},				/* cost of storing MMX registers
1162 					   in SImode and DImode */
1163   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1164   {4, 4, 3, 6, 12},			/* cost of loading SSE registers
1165 					   in 32,64,128,256 and 512-bit */
1166   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
1167 					   in 32,64,128,256 and 512-bit */
1168   3, 3,				/* SSE->integer and integer->SSE moves */
1169   3, 3,				/* mask->integer and integer->mask moves */
1170   {3, 4, 3},				/* cost of loading mask register
1171 					   in QImode, HImode, SImode.  */
1172   {3, 4, 3},				/* cost if storing mask register
1173 					   in QImode, HImode, SImode.  */
1174   2,					/* cost of moving mask register.  */
1175 
1176   					/* On K8:
1177   					    MOVD reg64, xmmreg Double FSTORE 4
1178 					    MOVD reg32, xmmreg Double FSTORE 4
1179 					   On AMDFAM10:
1180 					    MOVD reg64, xmmreg Double FADD 3
1181 							       1/1  1/1
1182 					    MOVD reg32, xmmreg Double FADD 3
1183 							       1/1  1/1 */
1184   /* End of register allocator costs.  */
1185   },
1186 
1187   COSTS_N_INSNS (1),			/* cost of an add instruction */
1188   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1189   COSTS_N_INSNS (1),			/* variable shift costs */
1190   COSTS_N_INSNS (1),			/* constant shift costs */
1191   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1192    COSTS_N_INSNS (4),			/*				 HI */
1193    COSTS_N_INSNS (3),			/*				 SI */
1194    COSTS_N_INSNS (4),			/*				 DI */
1195    COSTS_N_INSNS (5)},			/*			      other */
1196   0,					/* cost of multiply per each bit set */
1197   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1198    COSTS_N_INSNS (35),			/*			    HI */
1199    COSTS_N_INSNS (51),			/*			    SI */
1200    COSTS_N_INSNS (83),			/*			    DI */
1201    COSTS_N_INSNS (83)},			/*			    other */
1202   COSTS_N_INSNS (1),			/* cost of movsx */
1203   COSTS_N_INSNS (1),			/* cost of movzx */
1204   8,					/* "large" insn */
1205   9,					/* MOVE_RATIO */
1206   6,					/* CLEAR_RATIO */
1207   {3, 4, 3},				/* cost of loading integer registers
1208 					   in QImode, HImode and SImode.
1209 					   Relative to reg-reg move (2).  */
1210   {3, 4, 3},				/* cost of storing integer registers */
1211   {4, 4, 3, 6, 12},			/* cost of loading SSE register
1212 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1213   {4, 4, 5, 10, 20},			/* cost of storing SSE register
1214 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1215   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  */
1216   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
1217   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1218   3,					/* cost of moving SSE register to integer.  */
1219   4, 4,					/* Gather load static, per_elt.  */
1220   4, 4,					/* Gather store static, per_elt.  */
1221   64,					/* size of l1 cache.  */
1222   512,					/* size of l2 cache.  */
1223   64,					/* size of prefetch block */
1224   /* New AMD processors never drop prefetches; if they cannot be performed
1225      immediately, they are queued.  We set number of simultaneous prefetches
1226      to a large constant to reflect this (it probably is not a good idea not
1227      to limit number of prefetches at all, as their execution also takes some
1228      time).  */
1229   100,					/* number of parallel prefetches */
1230   2,					/* Branch cost */
1231   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1232   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1233   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1234   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1235   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1236   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1237 
1238   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1239   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1240   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1241   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1242   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
1243   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
1244   /* 11-16  */
1245   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
1246   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
1247   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
1248   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
1249   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1250   amdfam10_memcpy,
1251   amdfam10_memset,
1252   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1253   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1254   "32:25:8",				/* Loop alignment.  */
1255   "32:8:8",				/* Jump alignment.  */
1256   "0:0:8",				/* Label alignment.  */
1257   "32",					/* Func alignment.  */
1258 };
1259 
1260 /*  BDVER has optimized REP instruction for medium sized blocks, but for
1261     very small blocks it is better to use loop. For large blocks, libcall
1262     can do nontemporary accesses and beat inline considerably.  */
1263 static stringop_algs bdver_memcpy[2] = {
1264   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1265              {-1, rep_prefix_4_byte, false}}},
1266   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1267              {-1, libcall, false}}}};
1268 static stringop_algs bdver_memset[2] = {
1269   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1270              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1271   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1272              {-1, libcall, false}}}};
1273 
1274 const struct processor_costs bdver_cost = {
1275   {
1276   /* Start of register allocator costs.  integer->integer move cost is 2. */
1277   8,				     /* cost for loading QImode using movzbl */
1278   {8, 8, 8},				/* cost of loading integer registers
1279 					   in QImode, HImode and SImode.
1280 					   Relative to reg-reg move (2).  */
1281   {8, 8, 8},				/* cost of storing integer registers */
1282   4,					/* cost of reg,reg fld/fst */
1283   {12, 12, 28},				/* cost of loading fp registers
1284 		   			   in SFmode, DFmode and XFmode */
1285   {10, 10, 18},				/* cost of storing fp registers
1286  		   			   in SFmode, DFmode and XFmode */
1287   4,					/* cost of moving MMX register */
1288   {12, 12},				/* cost of loading MMX registers
1289 					   in SImode and DImode */
1290   {10, 10},				/* cost of storing MMX registers
1291 					   in SImode and DImode */
1292   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1293   {12, 12, 10, 40, 60},			/* cost of loading SSE registers
1294 					   in 32,64,128,256 and 512-bit */
1295   {10, 10, 10, 40, 60},			/* cost of storing SSE registers
1296 					   in 32,64,128,256 and 512-bit */
1297   16, 20,				/* SSE->integer and integer->SSE moves */
1298   16, 20,				/* mask->integer and integer->mask moves */
1299   {8, 8, 8},				/* cost of loading mask register
1300 					   in QImode, HImode, SImode.  */
1301   {8, 8, 8},				/* cost if storing mask register
1302 					   in QImode, HImode, SImode.  */
1303   2,					/* cost of moving mask register.  */
1304   /* End of register allocator costs.  */
1305   },
1306 
1307   COSTS_N_INSNS (1),			/* cost of an add instruction */
1308   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1309   COSTS_N_INSNS (1),			/* variable shift costs */
1310   COSTS_N_INSNS (1),			/* constant shift costs */
1311   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1312    COSTS_N_INSNS (4),			/*				 HI */
1313    COSTS_N_INSNS (4),			/*				 SI */
1314    COSTS_N_INSNS (6),			/*				 DI */
1315    COSTS_N_INSNS (6)},			/*			      other */
1316   0,					/* cost of multiply per each bit set */
1317   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1318    COSTS_N_INSNS (35),			/*			    HI */
1319    COSTS_N_INSNS (51),			/*			    SI */
1320    COSTS_N_INSNS (83),			/*			    DI */
1321    COSTS_N_INSNS (83)},			/*			    other */
1322   COSTS_N_INSNS (1),			/* cost of movsx */
1323   COSTS_N_INSNS (1),			/* cost of movzx */
1324   8,					/* "large" insn */
1325   9,					/* MOVE_RATIO */
1326   6,					/* CLEAR_RATIO */
1327   {8, 8, 8},				/* cost of loading integer registers
1328 					   in QImode, HImode and SImode.
1329 					   Relative to reg-reg move (2).  */
1330   {8, 8, 8},				/* cost of storing integer registers */
1331   {12, 12, 10, 40, 60},			/* cost of loading SSE register
1332 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1333   {10, 10, 10, 40, 60},			/* cost of storing SSE register
1334 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1335   {12, 12, 10, 40, 60},			/* cost of unaligned loads.  */
1336   {10, 10, 10, 40, 60},			/* cost of unaligned stores.  */
1337   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1338   16,					/* cost of moving SSE register to integer.  */
1339   12, 12,				/* Gather load static, per_elt.  */
1340   10, 10,				/* Gather store static, per_elt.  */
1341   16,					/* size of l1 cache.  */
1342   2048,					/* size of l2 cache.  */
1343   64,					/* size of prefetch block */
1344   /* New AMD processors never drop prefetches; if they cannot be performed
1345      immediately, they are queued.  We set number of simultaneous prefetches
1346      to a large constant to reflect this (it probably is not a good idea not
1347      to limit number of prefetches at all, as their execution also takes some
1348      time).  */
1349   100,					/* number of parallel prefetches */
1350   2,					/* Branch cost */
1351   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1352   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1353   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1354   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1355   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1356   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1357 
1358   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1359   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1360   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1361   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1362   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1363   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1364   /* 9-24  */
1365   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1366   /* 9-27  */
1367   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1368   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1369   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1370   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1371   bdver_memcpy,
1372   bdver_memset,
1373   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1374   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1375   "16:11:8",				/* Loop alignment.  */
1376   "16:8:8",				/* Jump alignment.  */
1377   "0:0:8",				/* Label alignment.  */
1378   "11",					/* Func alignment.  */
1379 };
1380 
1381 
1382 /*  ZNVER1 has optimized REP instruction for medium sized blocks, but for
1383     very small blocks it is better to use loop.  For large blocks, libcall
1384     can do nontemporary accesses and beat inline considerably.  */
1385 static stringop_algs znver1_memcpy[2] = {
1386   /* 32-bit tuning.  */
1387   {libcall, {{6, loop, false},
1388 	     {14, unrolled_loop, false},
1389 	     {-1, libcall, false}}},
1390   /* 64-bit tuning.  */
1391   {libcall, {{16, loop, false},
1392 	     {128, rep_prefix_8_byte, false},
1393 	     {-1, libcall, false}}}};
1394 static stringop_algs znver1_memset[2] = {
1395   /* 32-bit tuning.  */
1396   {libcall, {{8, loop, false},
1397 	     {24, unrolled_loop, false},
1398 	     {128, rep_prefix_4_byte, false},
1399 	     {-1, libcall, false}}},
1400   /* 64-bit tuning.  */
1401   {libcall, {{48, unrolled_loop, false},
1402 	     {128, rep_prefix_8_byte, false},
1403 	     {-1, libcall, false}}}};
1404 struct processor_costs znver1_cost = {
1405   {
1406   /* Start of register allocator costs.  integer->integer move cost is 2. */
1407 
1408   /* reg-reg moves are done by renaming and thus they are even cheaper than
1409      1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
1410      to doubles of latencies, we do not model this correctly.  It does not
1411      seem to make practical difference to bump prices up even more.  */
1412   6,					/* cost for loading QImode using
1413 					   movzbl.  */
1414   {6, 6, 6},				/* cost of loading integer registers
1415 					   in QImode, HImode and SImode.
1416 					   Relative to reg-reg move (2).  */
1417   {8, 8, 8},				/* cost of storing integer
1418 					   registers.  */
1419   2,					/* cost of reg,reg fld/fst.  */
1420   {6, 6, 16},				/* cost of loading fp registers
1421 		   			   in SFmode, DFmode and XFmode.  */
1422   {8, 8, 16},				/* cost of storing fp registers
1423  		   			   in SFmode, DFmode and XFmode.  */
1424   2,					/* cost of moving MMX register.  */
1425   {6, 6},				/* cost of loading MMX registers
1426 					   in SImode and DImode.  */
1427   {8, 8},				/* cost of storing MMX registers
1428 					   in SImode and DImode.  */
1429   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1430   {6, 6, 6, 12, 24},			/* cost of loading SSE registers
1431 					   in 32,64,128,256 and 512-bit.  */
1432   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
1433 					   in 32,64,128,256 and 512-bit.  */
1434   6, 6,				/* SSE->integer and integer->SSE moves.  */
1435   8, 8,				/* mask->integer and integer->mask moves */
1436   {6, 6, 6},				/* cost of loading mask register
1437 					   in QImode, HImode, SImode.  */
1438   {8, 8, 8},				/* cost if storing mask register
1439 					   in QImode, HImode, SImode.  */
1440   2,					/* cost of moving mask register.  */
1441   /* End of register allocator costs.  */
1442   },
1443 
1444   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1445   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1446   COSTS_N_INSNS (1),			/* variable shift costs.  */
1447   COSTS_N_INSNS (1),			/* constant shift costs.  */
1448   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1449    COSTS_N_INSNS (3),			/*				 HI.  */
1450    COSTS_N_INSNS (3),			/*				 SI.  */
1451    COSTS_N_INSNS (3),			/*				 DI.  */
1452    COSTS_N_INSNS (3)},			/*			      other.  */
1453   0,					/* cost of multiply per each bit
1454 					    set.  */
1455    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1456       bound.  */
1457   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1458    COSTS_N_INSNS (22),			/*			    HI.  */
1459    COSTS_N_INSNS (30),			/*			    SI.  */
1460    COSTS_N_INSNS (45),			/*			    DI.  */
1461    COSTS_N_INSNS (45)},			/*			    other.  */
1462   COSTS_N_INSNS (1),			/* cost of movsx.  */
1463   COSTS_N_INSNS (1),			/* cost of movzx.  */
1464   8,					/* "large" insn.  */
1465   9,					/* MOVE_RATIO.  */
1466   6,					/* CLEAR_RATIO */
1467   {6, 6, 6},				/* cost of loading integer registers
1468 					   in QImode, HImode and SImode.
1469 					   Relative to reg-reg move (2).  */
1470   {8, 8, 8},				/* cost of storing integer
1471 					   registers.  */
1472   {6, 6, 6, 12, 24},			/* cost of loading SSE register
1473 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1474   {8, 8, 8, 16, 32},			/* cost of storing SSE register
1475 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1476   {6, 6, 6, 12, 24},			/* cost of unaligned loads.  */
1477   {8, 8, 8, 16, 32},			/* cost of unaligned stores.  */
1478   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1479   6,					/* cost of moving SSE register to integer.  */
1480   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1481      throughput 12.  Approx 9 uops do not depend on vector size and every load
1482      is 7 uops.  */
1483   18, 8,				/* Gather load static, per_elt.  */
1484   18, 10,				/* Gather store static, per_elt.  */
1485   32,					/* size of l1 cache.  */
1486   512,					/* size of l2 cache.  */
1487   64,					/* size of prefetch block.  */
1488   /* New AMD processors never drop prefetches; if they cannot be performed
1489      immediately, they are queued.  We set number of simultaneous prefetches
1490      to a large constant to reflect this (it probably is not a good idea not
1491      to limit number of prefetches at all, as their execution also takes some
1492      time).  */
1493   100,					/* number of parallel prefetches.  */
1494   3,					/* Branch cost.  */
1495   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1496   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1497   /* Latency of fdiv is 8-15.  */
1498   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1499   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1500   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1501   /* Latency of fsqrt is 4-10.  */
1502   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1503 
1504   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1505   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1506   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1507   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1508   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1509   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1510   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1511   /* 9-13  */
1512   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1513   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1514   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1515   /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1516      and it can execute 2 integer additions and 2 multiplications thus
1517      reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
1518      that 4 works better than 6 probably due to register pressure.
1519 
1520      Integer vector operations are taken by FP unit and execute 3 vector
1521      plus/minus operations per cycle but only one multiply.  This is adjusted
1522      in ix86_reassociation_width.  */
1523   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1524   znver1_memcpy,
1525   znver1_memset,
1526   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1527   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1528   "16",					/* Loop alignment.  */
1529   "16",					/* Jump alignment.  */
1530   "0:0:8",				/* Label alignment.  */
1531   "16",					/* Func alignment.  */
1532 };
1533 
1534 /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
1535     very small blocks it is better to use loop.  For large blocks, libcall
1536     can do nontemporary accesses and beat inline considerably.  */
1537 static stringop_algs znver2_memcpy[2] = {
1538   /* 32-bit tuning.  */
1539   {libcall, {{6, loop, false},
1540 	     {14, unrolled_loop, false},
1541 	     {-1, libcall, false}}},
1542   /* 64-bit tuning.  */
1543   {libcall, {{16, loop, false},
1544 	     {64, rep_prefix_4_byte, false},
1545 	     {-1, libcall, false}}}};
1546 static stringop_algs znver2_memset[2] = {
1547   /* 32-bit tuning.  */
1548   {libcall, {{8, loop, false},
1549 	     {24, unrolled_loop, false},
1550 	     {128, rep_prefix_4_byte, false},
1551 	     {-1, libcall, false}}},
1552   /* 64-bit tuning.  */
1553   {libcall, {{24, rep_prefix_4_byte, false},
1554 	     {128, rep_prefix_8_byte, false},
1555 	     {-1, libcall, false}}}};
1556 
1557 struct processor_costs znver2_cost = {
1558   {
1559   /* Start of register allocator costs.  integer->integer move cost is 2. */
1560 
1561   /* reg-reg moves are done by renaming and thus they are even cheaper than
1562      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1563      to doubles of latencies, we do not model this correctly.  It does not
1564      seem to make practical difference to bump prices up even more.  */
1565   6,					/* cost for loading QImode using
1566 					   movzbl.  */
1567   {6, 6, 6},				/* cost of loading integer registers
1568 					   in QImode, HImode and SImode.
1569 					   Relative to reg-reg move (2).  */
1570   {8, 8, 8},				/* cost of storing integer
1571 					   registers.  */
1572   2,					/* cost of reg,reg fld/fst.  */
1573   {6, 6, 16},				/* cost of loading fp registers
1574 					   in SFmode, DFmode and XFmode.  */
1575   {8, 8, 16},				/* cost of storing fp registers
1576 					   in SFmode, DFmode and XFmode.  */
1577   2,					/* cost of moving MMX register.  */
1578   {6, 6},				/* cost of loading MMX registers
1579 					   in SImode and DImode.  */
1580   {8, 8},				/* cost of storing MMX registers
1581 					   in SImode and DImode.  */
1582   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1583 					   register.  */
1584   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1585 					   in 32,64,128,256 and 512-bit.  */
1586   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1587 					   in 32,64,128,256 and 512-bit.  */
1588   6, 6,					/* SSE->integer and integer->SSE
1589 					   moves.  */
1590   8, 8,				/* mask->integer and integer->mask moves */
1591   {6, 6, 6},				/* cost of loading mask register
1592 					   in QImode, HImode, SImode.  */
1593   {8, 8, 8},				/* cost if storing mask register
1594 					   in QImode, HImode, SImode.  */
1595   2,					/* cost of moving mask register.  */
1596   /* End of register allocator costs.  */
1597   },
1598 
1599   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1600   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1601   COSTS_N_INSNS (1),			/* variable shift costs.  */
1602   COSTS_N_INSNS (1),			/* constant shift costs.  */
1603   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1604    COSTS_N_INSNS (3),			/* 				 HI.  */
1605    COSTS_N_INSNS (3),			/*				 SI.  */
1606    COSTS_N_INSNS (3),			/*				 DI.  */
1607    COSTS_N_INSNS (3)},			/*			other.  */
1608   0,					/* cost of multiply per each bit
1609 					   set.  */
1610    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1611       bound.  */
1612   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1613    COSTS_N_INSNS (22),			/* 			    HI.  */
1614    COSTS_N_INSNS (30),			/*			    SI.  */
1615    COSTS_N_INSNS (45),			/*			    DI.  */
1616    COSTS_N_INSNS (45)},			/*			    other.  */
1617   COSTS_N_INSNS (1),			/* cost of movsx.  */
1618   COSTS_N_INSNS (1),			/* cost of movzx.  */
1619   8,					/* "large" insn.  */
1620   9,					/* MOVE_RATIO.  */
1621   6,					/* CLEAR_RATIO */
1622   {6, 6, 6},				/* cost of loading integer registers
1623 					   in QImode, HImode and SImode.
1624 					   Relative to reg-reg move (2).  */
1625   {8, 8, 8},				/* cost of storing integer
1626 					   registers.  */
1627   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1628 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1629   {8, 8, 8, 8, 16},			/* cost of storing SSE register
1630 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1631   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
1632   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1633   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1634 					   register.  */
1635   6,					/* cost of moving SSE register to integer.  */
1636   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1637      throughput 12.  Approx 9 uops do not depend on vector size and every load
1638      is 7 uops.  */
1639   18, 8,				/* Gather load static, per_elt.  */
1640   18, 10,				/* Gather store static, per_elt.  */
1641   32,					/* size of l1 cache.  */
1642   512,					/* size of l2 cache.  */
1643   64,					/* size of prefetch block.  */
1644   /* New AMD processors never drop prefetches; if they cannot be performed
1645      immediately, they are queued.  We set number of simultaneous prefetches
1646      to a large constant to reflect this (it probably is not a good idea not
1647      to limit number of prefetches at all, as their execution also takes some
1648      time).  */
1649   100,					/* number of parallel prefetches.  */
1650   3,					/* Branch cost.  */
1651   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1652   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1653   /* Latency of fdiv is 8-15.  */
1654   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1655   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1656   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1657   /* Latency of fsqrt is 4-10.  */
1658   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1659 
1660   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1661   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1662   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1663   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
1664   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1665   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1666   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1667   /* 9-13.  */
1668   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1669   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1670   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1671   /* Zen can execute 4 integer operations per cycle.  FP operations
1672      take 3 cycles and it can execute 2 integer additions and 2
1673      multiplications thus reassociation may make sense up to with of 6.
1674      SPEC2k6 bencharks suggests
1675      that 4 works better than 6 probably due to register pressure.
1676 
1677      Integer vector operations are taken by FP unit and execute 3 vector
1678      plus/minus operations per cycle but only one multiply.  This is adjusted
1679      in ix86_reassociation_width.  */
1680   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1681   znver2_memcpy,
1682   znver2_memset,
1683   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1684   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1685   "16",					/* Loop alignment.  */
1686   "16",					/* Jump alignment.  */
1687   "0:0:8",				/* Label alignment.  */
1688   "16",					/* Func alignment.  */
1689 };
1690 
1691 struct processor_costs znver3_cost = {
1692   {
1693   /* Start of register allocator costs.  integer->integer move cost is 2. */
1694 
1695   /* reg-reg moves are done by renaming and thus they are even cheaper than
1696      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1697      to doubles of latencies, we do not model this correctly.  It does not
1698      seem to make practical difference to bump prices up even more.  */
1699   6,					/* cost for loading QImode using
1700 					   movzbl.  */
1701   {6, 6, 6},				/* cost of loading integer registers
1702 					   in QImode, HImode and SImode.
1703 					   Relative to reg-reg move (2).  */
1704   {8, 8, 8},				/* cost of storing integer
1705 					   registers.  */
1706   2,					/* cost of reg,reg fld/fst.  */
1707   {6, 6, 16},				/* cost of loading fp registers
1708 					   in SFmode, DFmode and XFmode.  */
1709   {8, 8, 16},				/* cost of storing fp registers
1710 					   in SFmode, DFmode and XFmode.  */
1711   2,					/* cost of moving MMX register.  */
1712   {6, 6},				/* cost of loading MMX registers
1713 					   in SImode and DImode.  */
1714   {8, 8},				/* cost of storing MMX registers
1715 					   in SImode and DImode.  */
1716   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1717 					   register.  */
1718   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1719 					   in 32,64,128,256 and 512-bit.  */
1720   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1721 					   in 32,64,128,256 and 512-bit.  */
1722   6, 6,					/* SSE->integer and integer->SSE
1723 					   moves.  */
1724   8, 8,				/* mask->integer and integer->mask moves */
1725   {6, 6, 6},				/* cost of loading mask register
1726 					   in QImode, HImode, SImode.  */
1727   {8, 8, 8},				/* cost if storing mask register
1728 					   in QImode, HImode, SImode.  */
1729   2,					/* cost of moving mask register.  */
1730   /* End of register allocator costs.  */
1731   },
1732 
1733   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1734   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1735   COSTS_N_INSNS (1),			/* variable shift costs.  */
1736   COSTS_N_INSNS (1),			/* constant shift costs.  */
1737   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1738    COSTS_N_INSNS (3),			/* 				 HI.  */
1739    COSTS_N_INSNS (3),			/*				 SI.  */
1740    COSTS_N_INSNS (3),			/*				 DI.  */
1741    COSTS_N_INSNS (3)},			/*			other.  */
1742   0,					/* cost of multiply per each bit
1743 					   set.  */
1744   {COSTS_N_INSNS (9),			/* cost of a divide/mod for QI.  */
1745    COSTS_N_INSNS (10),			/* 			    HI.  */
1746    COSTS_N_INSNS (12),			/*			    SI.  */
1747    COSTS_N_INSNS (17),			/*			    DI.  */
1748    COSTS_N_INSNS (17)},			/*			    other.  */
1749   COSTS_N_INSNS (1),			/* cost of movsx.  */
1750   COSTS_N_INSNS (1),			/* cost of movzx.  */
1751   8,					/* "large" insn.  */
1752   9,					/* MOVE_RATIO.  */
1753   6,					/* CLEAR_RATIO */
1754   {6, 6, 6},				/* cost of loading integer registers
1755 					   in QImode, HImode and SImode.
1756 					   Relative to reg-reg move (2).  */
1757   {8, 8, 8},				/* cost of storing integer
1758 					   registers.  */
1759   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
1760 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1761   {8, 8, 8, 8, 16},			/* cost of storing SSE register
1762 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1763   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
1764   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1765   2, 2, 3,				/* cost of moving XMM,YMM,ZMM
1766 					   register.  */
1767   6,					/* cost of moving SSE register to integer.  */
1768   /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1769      throughput 9.  Approx 7 uops do not depend on vector size and every load
1770      is 4 uops.  */
1771   14, 8,				/* Gather load static, per_elt.  */
1772   14, 10,				/* Gather store static, per_elt.  */
1773   32,					/* size of l1 cache.  */
1774   512,					/* size of l2 cache.  */
1775   64,					/* size of prefetch block.  */
1776   /* New AMD processors never drop prefetches; if they cannot be performed
1777      immediately, they are queued.  We set number of simultaneous prefetches
1778      to a large constant to reflect this (it probably is not a good idea not
1779      to limit number of prefetches at all, as their execution also takes some
1780      time).  */
1781   100,					/* number of parallel prefetches.  */
1782   3,					/* Branch cost.  */
1783   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1784   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1785   /* Latency of fdiv is 8-15.  */
1786   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1787   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1788   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1789   /* Latency of fsqrt is 4-10.  */
1790   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1791 
1792   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1793   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1794   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1795   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
1796   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1797   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1798   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1799   /* 9-13.  */
1800   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1801   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1802   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1803   /* Zen can execute 4 integer operations per cycle.  FP operations
1804      take 3 cycles and it can execute 2 integer additions and 2
1805      multiplications thus reassociation may make sense up to with of 6.
1806      SPEC2k6 bencharks suggests
1807      that 4 works better than 6 probably due to register pressure.
1808 
1809      Integer vector operations are taken by FP unit and execute 3 vector
1810      plus/minus operations per cycle but only one multiply.  This is adjusted
1811      in ix86_reassociation_width.  */
1812   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1813   znver2_memcpy,
1814   znver2_memset,
1815   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1816   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1817   "16",					/* Loop alignment.  */
1818   "16",					/* Jump alignment.  */
1819   "0:0:8",				/* Label alignment.  */
1820   "16",					/* Func alignment.  */
1821 };
1822 
1823 /* skylake_cost should produce code tuned for Skylake familly of CPUs.  */
1824 static stringop_algs skylake_memcpy[2] =   {
1825   {libcall,
1826    {{256, rep_prefix_1_byte, true},
1827     {256, loop, false},
1828     {-1, libcall, false}}},
1829   {libcall,
1830    {{256, rep_prefix_1_byte, true},
1831     {256, loop, false},
1832     {-1, libcall, false}}}};
1833 
1834 static stringop_algs skylake_memset[2] = {
1835   {libcall,
1836    {{256, rep_prefix_1_byte, true},
1837     {256, loop, false},
1838     {-1, libcall, false}}},
1839   {libcall,
1840    {{256, rep_prefix_1_byte, true},
1841     {256, loop, false},
1842     {-1, libcall, false}}}};
1843 
1844 static const
1845 struct processor_costs skylake_cost = {
1846   {
1847   /* Start of register allocator costs.  integer->integer move cost is 2. */
1848   6,				     /* cost for loading QImode using movzbl */
1849   {4, 4, 4},				/* cost of loading integer registers
1850 					   in QImode, HImode and SImode.
1851 					   Relative to reg-reg move (2).  */
1852   {6, 6, 6},				/* cost of storing integer registers */
1853   2,					/* cost of reg,reg fld/fst */
1854   {6, 6, 8},				/* cost of loading fp registers
1855 					   in SFmode, DFmode and XFmode */
1856   {6, 6, 10},				/* cost of storing fp registers
1857 					   in SFmode, DFmode and XFmode */
1858   2,					/* cost of moving MMX register */
1859   {6, 6},				/* cost of loading MMX registers
1860 					   in SImode and DImode */
1861   {6, 6},				/* cost of storing MMX registers
1862 					   in SImode and DImode */
1863   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1864   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1865 					   in 32,64,128,256 and 512-bit */
1866   {8, 8, 8, 12, 24},			/* cost of storing SSE registers
1867 					   in 32,64,128,256 and 512-bit */
1868   6, 6,				/* SSE->integer and integer->SSE moves */
1869   5, 5,				/* mask->integer and integer->mask moves */
1870   {8, 8, 8},				/* cost of loading mask register
1871 					   in QImode, HImode, SImode.  */
1872   {6, 6, 6},				/* cost if storing mask register
1873 					   in QImode, HImode, SImode.  */
1874   3,					/* cost of moving mask register.  */
1875   /* End of register allocator costs.  */
1876   },
1877 
1878   COSTS_N_INSNS (1),			/* cost of an add instruction */
1879   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
1880   COSTS_N_INSNS (1),			/* variable shift costs */
1881   COSTS_N_INSNS (1),			/* constant shift costs */
1882   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1883    COSTS_N_INSNS (4),			/*				 HI */
1884    COSTS_N_INSNS (3),			/*				 SI */
1885    COSTS_N_INSNS (3),			/*				 DI */
1886    COSTS_N_INSNS (3)},			/*			      other */
1887   0,					/* cost of multiply per each bit set */
1888   /* Expanding div/mod currently doesn't consider parallelism. So the cost
1889      model is not realistic. We compensate by increasing the latencies a bit.  */
1890   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
1891    COSTS_N_INSNS (11),			/*			    HI */
1892    COSTS_N_INSNS (14),			/*			    SI */
1893    COSTS_N_INSNS (76),			/*			    DI */
1894    COSTS_N_INSNS (76)},			/*			    other */
1895   COSTS_N_INSNS (1),			/* cost of movsx */
1896   COSTS_N_INSNS (0),			/* cost of movzx */
1897   8,					/* "large" insn */
1898   17,					/* MOVE_RATIO */
1899   17,					/* CLEAR_RATIO */
1900   {4, 4, 4},				/* cost of loading integer registers
1901 					   in QImode, HImode and SImode.
1902 					   Relative to reg-reg move (2).  */
1903   {6, 6, 6},				/* cost of storing integer registers */
1904   {6, 6, 6, 10, 20},			/* cost of loading SSE register
1905 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1906   {8, 8, 8, 12, 24},			/* cost of storing SSE register
1907 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
1908   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
1909   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1910   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1911   6,					/* cost of moving SSE register to integer.  */
1912   20, 8,				/* Gather load static, per_elt.  */
1913   22, 10,				/* Gather store static, per_elt.  */
1914   64,					/* size of l1 cache.  */
1915   512,					/* size of l2 cache.  */
1916   64,					/* size of prefetch block */
1917   6,					/* number of parallel prefetches */
1918   3,					/* Branch cost */
1919   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
1920   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1921   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1922   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1923   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1924   COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
1925 
1926   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1927   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1928   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1929   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1930   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
1931   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
1932   COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
1933   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
1934   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
1935   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
1936   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
1937   skylake_memcpy,
1938   skylake_memset,
1939   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1940   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1941   "16:11:8",				/* Loop alignment.  */
1942   "16:11:8",				/* Jump alignment.  */
1943   "0:0:8",				/* Label alignment.  */
1944   "16",					/* Func alignment.  */
1945 };
1946 
1947 /* icelake_cost should produce code tuned for Icelake family of CPUs.
1948    NB: rep_prefix_1_byte is used only for known size. */
1949 
1950 static stringop_algs icelake_memcpy[2] =   {
1951   {libcall,
1952    {{256, rep_prefix_1_byte, true},
1953     {256, loop, false},
1954     {-1, libcall, false}}},
1955   {libcall,
1956    {{256, rep_prefix_1_byte, true},
1957     {256, loop, false},
1958     {-1, libcall, false}}}};
1959 
1960 static stringop_algs icelake_memset[2] = {
1961   {libcall,
1962    {{256, rep_prefix_1_byte, true},
1963     {256, loop, false},
1964     {-1, libcall, false}}},
1965   {libcall,
1966    {{256, rep_prefix_1_byte, true},
1967     {256, loop, false},
1968     {-1, libcall, false}}}};
1969 
1970 static const
1971 struct processor_costs icelake_cost = {
1972   {
1973   /* Start of register allocator costs.  integer->integer move cost is 2. */
1974   6,				     /* cost for loading QImode using movzbl */
1975   {4, 4, 4},				/* cost of loading integer registers
1976 					   in QImode, HImode and SImode.
1977 					   Relative to reg-reg move (2).  */
1978   {6, 6, 6},				/* cost of storing integer registers */
1979   2,					/* cost of reg,reg fld/fst */
1980   {6, 6, 8},				/* cost of loading fp registers
1981 					   in SFmode, DFmode and XFmode */
1982   {6, 6, 10},				/* cost of storing fp registers
1983 					   in SFmode, DFmode and XFmode */
1984   2,					/* cost of moving MMX register */
1985   {6, 6},				/* cost of loading MMX registers
1986 					   in SImode and DImode */
1987   {6, 6},				/* cost of storing MMX registers
1988 					   in SImode and DImode */
1989   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1990   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1991 					   in 32,64,128,256 and 512-bit */
1992   {8, 8, 8, 12, 24},			/* cost of storing SSE registers
1993 					   in 32,64,128,256 and 512-bit */
1994   6, 6,				/* SSE->integer and integer->SSE moves */
1995   5, 5,				/* mask->integer and integer->mask moves */
1996   {8, 8, 8},				/* cost of loading mask register
1997 					   in QImode, HImode, SImode.  */
1998   {6, 6, 6},				/* cost if storing mask register
1999 					   in QImode, HImode, SImode.  */
2000   3,					/* cost of moving mask register.  */
2001   /* End of register allocator costs.  */
2002   },
2003 
2004   COSTS_N_INSNS (1),			/* cost of an add instruction */
2005   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
2006   COSTS_N_INSNS (1),			/* variable shift costs */
2007   COSTS_N_INSNS (1),			/* constant shift costs */
2008   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2009    COSTS_N_INSNS (4),			/*				 HI */
2010    COSTS_N_INSNS (3),			/*				 SI */
2011    COSTS_N_INSNS (3),			/*				 DI */
2012    COSTS_N_INSNS (3)},			/*			      other */
2013   0,					/* cost of multiply per each bit set */
2014   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2015      model is not realistic. We compensate by increasing the latencies a bit.  */
2016   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
2017    COSTS_N_INSNS (11),			/*			    HI */
2018    COSTS_N_INSNS (14),			/*			    SI */
2019    COSTS_N_INSNS (76),			/*			    DI */
2020    COSTS_N_INSNS (76)},			/*			    other */
2021   COSTS_N_INSNS (1),			/* cost of movsx */
2022   COSTS_N_INSNS (0),			/* cost of movzx */
2023   8,					/* "large" insn */
2024   17,					/* MOVE_RATIO */
2025   17,					/* CLEAR_RATIO */
2026   {4, 4, 4},				/* cost of loading integer registers
2027 					   in QImode, HImode and SImode.
2028 					   Relative to reg-reg move (2).  */
2029   {6, 6, 6},				/* cost of storing integer registers */
2030   {6, 6, 6, 10, 20},			/* cost of loading SSE register
2031 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2032   {8, 8, 8, 12, 24},			/* cost of storing SSE register
2033 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2034   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
2035   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
2036   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2037   6,					/* cost of moving SSE register to integer.  */
2038   20, 8,				/* Gather load static, per_elt.  */
2039   22, 10,				/* Gather store static, per_elt.  */
2040   64,					/* size of l1 cache.  */
2041   512,					/* size of l2 cache.  */
2042   64,					/* size of prefetch block */
2043   6,					/* number of parallel prefetches */
2044   3,					/* Branch cost */
2045   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2046   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
2047   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2048   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2049   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2050   COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
2051 
2052   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2053   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2054   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2055   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
2056   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
2057   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
2058   COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
2059   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
2060   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
2061   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2062   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2063   icelake_memcpy,
2064   icelake_memset,
2065   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2066   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2067   "16:11:8",				/* Loop alignment.  */
2068   "16:11:8",				/* Jump alignment.  */
2069   "0:0:8",				/* Label alignment.  */
2070   "16",					/* Func alignment.  */
2071 };
2072 
2073   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
2074      very small blocks it is better to use loop. For large blocks, libcall can
2075      do nontemporary accesses and beat inline considerably.  */
2076 static stringop_algs btver1_memcpy[2] = {
2077   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2078              {-1, rep_prefix_4_byte, false}}},
2079   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2080              {-1, libcall, false}}}};
2081 static stringop_algs btver1_memset[2] = {
2082   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2083              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2084   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2085              {-1, libcall, false}}}};
2086 const struct processor_costs btver1_cost = {
2087   {
2088   /* Start of register allocator costs.  integer->integer move cost is 2. */
2089   8,				     /* cost for loading QImode using movzbl */
2090   {6, 8, 6},				/* cost of loading integer registers
2091 					   in QImode, HImode and SImode.
2092 					   Relative to reg-reg move (2).  */
2093   {6, 8, 6},				/* cost of storing integer registers */
2094   4,					/* cost of reg,reg fld/fst */
2095   {12, 12, 28},				/* cost of loading fp registers
2096 					   in SFmode, DFmode and XFmode */
2097   {12, 12, 38},				/* cost of storing fp registers
2098 					   in SFmode, DFmode and XFmode */
2099   4,					/* cost of moving MMX register */
2100   {10, 10},				/* cost of loading MMX registers
2101 					   in SImode and DImode */
2102   {12, 12},				/* cost of storing MMX registers
2103 					   in SImode and DImode */
2104   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2105   {10, 10, 12, 48, 96},			/* cost of loading SSE registers
2106 					   in 32,64,128,256 and 512-bit */
2107   {10, 10, 12, 48, 96},			/* cost of storing SSE registers
2108 					   in 32,64,128,256 and 512-bit */
2109   14, 14,				/* SSE->integer and integer->SSE moves */
2110   14, 14,				/* mask->integer and integer->mask moves */
2111   {6, 8, 6},				/* cost of loading mask register
2112 					   in QImode, HImode, SImode.  */
2113   {6, 8, 6},				/* cost if storing mask register
2114 					   in QImode, HImode, SImode.  */
2115   2,					/* cost of moving mask register.  */
2116   /* End of register allocator costs.  */
2117   },
2118 
2119   COSTS_N_INSNS (1),			/* cost of an add instruction */
2120   COSTS_N_INSNS (2),			/* cost of a lea instruction */
2121   COSTS_N_INSNS (1),			/* variable shift costs */
2122   COSTS_N_INSNS (1),			/* constant shift costs */
2123   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2124    COSTS_N_INSNS (4),			/*				 HI */
2125    COSTS_N_INSNS (3),			/*				 SI */
2126    COSTS_N_INSNS (4),			/*				 DI */
2127    COSTS_N_INSNS (5)},			/*			      other */
2128   0,					/* cost of multiply per each bit set */
2129   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
2130    COSTS_N_INSNS (35),			/*			    HI */
2131    COSTS_N_INSNS (51),			/*			    SI */
2132    COSTS_N_INSNS (83),			/*			    DI */
2133    COSTS_N_INSNS (83)},			/*			    other */
2134   COSTS_N_INSNS (1),			/* cost of movsx */
2135   COSTS_N_INSNS (1),			/* cost of movzx */
2136   8,					/* "large" insn */
2137   9,					/* MOVE_RATIO */
2138   6,					/* CLEAR_RATIO */
2139   {6, 8, 6},				/* cost of loading integer registers
2140 					   in QImode, HImode and SImode.
2141 					   Relative to reg-reg move (2).  */
2142   {6, 8, 6},				/* cost of storing integer registers */
2143   {10, 10, 12, 48, 96},			/* cost of loading SSE register
2144 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2145   {10, 10, 12, 48, 96},			/* cost of storing SSE register
2146 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2147   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
2148   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
2149   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2150   14,					/* cost of moving SSE register to integer.  */
2151   10, 10,				/* Gather load static, per_elt.  */
2152   10, 10,				/* Gather store static, per_elt.  */
2153   32,					/* size of l1 cache.  */
2154   512,					/* size of l2 cache.  */
2155   64,					/* size of prefetch block */
2156   100,					/* number of parallel prefetches */
2157   2,					/* Branch cost */
2158   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
2159   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
2160   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
2161   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
2162   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
2163   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
2164 
2165   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2166   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2167   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
2168   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
2169   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2170   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2171   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2172   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
2173   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
2174   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
2175   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2176   btver1_memcpy,
2177   btver1_memset,
2178   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
2179   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2180   "16:11:8",				/* Loop alignment.  */
2181   "16:8:8",				/* Jump alignment.  */
2182   "0:0:8",				/* Label alignment.  */
2183   "11",					/* Func alignment.  */
2184 };
2185 
2186 static stringop_algs btver2_memcpy[2] = {
2187   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2188              {-1, rep_prefix_4_byte, false}}},
2189   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2190              {-1, libcall, false}}}};
2191 static stringop_algs btver2_memset[2] = {
2192   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2193              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2194   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2195              {-1, libcall, false}}}};
2196 const struct processor_costs btver2_cost = {
2197   {
2198   /* Start of register allocator costs.  integer->integer move cost is 2. */
2199   8,				     /* cost for loading QImode using movzbl */
2200   {8, 8, 6},				/* cost of loading integer registers
2201 					   in QImode, HImode and SImode.
2202 					   Relative to reg-reg move (2).  */
2203   {8, 8, 6},				/* cost of storing integer registers */
2204   4,					/* cost of reg,reg fld/fst */
2205   {12, 12, 28},				/* cost of loading fp registers
2206 					   in SFmode, DFmode and XFmode */
2207   {12, 12, 38},				/* cost of storing fp registers
2208 					   in SFmode, DFmode and XFmode */
2209   4,					/* cost of moving MMX register */
2210   {10, 10},				/* cost of loading MMX registers
2211 					   in SImode and DImode */
2212   {12, 12},				/* cost of storing MMX registers
2213 					   in SImode and DImode */
2214   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2215   {10, 10, 12, 48, 96},			/* cost of loading SSE registers
2216 					   in 32,64,128,256 and 512-bit */
2217   {10, 10, 12, 48, 96},			/* cost of storing SSE registers
2218 					   in 32,64,128,256 and 512-bit */
2219   14, 14,				/* SSE->integer and integer->SSE moves */
2220   14, 14,				/* mask->integer and integer->mask moves */
2221   {8, 8, 6},				/* cost of loading mask register
2222 					   in QImode, HImode, SImode.  */
2223   {8, 8, 6},				/* cost if storing mask register
2224 					   in QImode, HImode, SImode.  */
2225   2,					/* cost of moving mask register.  */
2226   /* End of register allocator costs.  */
2227   },
2228 
2229   COSTS_N_INSNS (1),			/* cost of an add instruction */
2230   COSTS_N_INSNS (2),			/* cost of a lea instruction */
2231   COSTS_N_INSNS (1),			/* variable shift costs */
2232   COSTS_N_INSNS (1),			/* constant shift costs */
2233   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2234    COSTS_N_INSNS (4),			/*				 HI */
2235    COSTS_N_INSNS (3),			/*				 SI */
2236    COSTS_N_INSNS (4),			/*				 DI */
2237    COSTS_N_INSNS (5)},			/*			      other */
2238   0,					/* cost of multiply per each bit set */
2239   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
2240    COSTS_N_INSNS (35),			/*			    HI */
2241    COSTS_N_INSNS (51),			/*			    SI */
2242    COSTS_N_INSNS (83),			/*			    DI */
2243    COSTS_N_INSNS (83)},			/*			    other */
2244   COSTS_N_INSNS (1),			/* cost of movsx */
2245   COSTS_N_INSNS (1),			/* cost of movzx */
2246   8,					/* "large" insn */
2247   9,					/* MOVE_RATIO */
2248   6,					/* CLEAR_RATIO */
2249   {8, 8, 6},				/* cost of loading integer registers
2250 					   in QImode, HImode and SImode.
2251 					   Relative to reg-reg move (2).  */
2252   {8, 8, 6},				/* cost of storing integer registers */
2253   {10, 10, 12, 48, 96},			/* cost of loading SSE register
2254 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2255   {10, 10, 12, 48, 96},			/* cost of storing SSE register
2256 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2257   {10, 10, 12, 48, 96},			/* cost of unaligned loads.  */
2258   {10, 10, 12, 48, 96},			/* cost of unaligned stores.  */
2259   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2260   14,					/* cost of moving SSE register to integer.  */
2261   10, 10,				/* Gather load static, per_elt.  */
2262   10, 10,				/* Gather store static, per_elt.  */
2263   32,					/* size of l1 cache.  */
2264   2048,					/* size of l2 cache.  */
2265   64,					/* size of prefetch block */
2266   100,					/* number of parallel prefetches */
2267   2,					/* Branch cost */
2268   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
2269   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
2270   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
2271   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
2272   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
2273   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
2274 
2275   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2276   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2277   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
2278   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
2279   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2280   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2281   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2282   COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
2283   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
2284   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
2285   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2286   btver2_memcpy,
2287   btver2_memset,
2288   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
2289   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2290   "16:11:8",				/* Loop alignment.  */
2291   "16:8:8",				/* Jump alignment.  */
2292   "0:0:8",				/* Label alignment.  */
2293   "11",					/* Func alignment.  */
2294 };
2295 
2296 static stringop_algs pentium4_memcpy[2] = {
2297   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2298   DUMMY_STRINGOP_ALGS};
2299 static stringop_algs pentium4_memset[2] = {
2300   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2301              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2302   DUMMY_STRINGOP_ALGS};
2303 
2304 static const
2305 struct processor_costs pentium4_cost = {
2306   {
2307   /* Start of register allocator costs.  integer->integer move cost is 2. */
2308   5,				     /* cost for loading QImode using movzbl */
2309   {4, 5, 4},				/* cost of loading integer registers
2310 					   in QImode, HImode and SImode.
2311 					   Relative to reg-reg move (2).  */
2312   {2, 3, 2},				/* cost of storing integer registers */
2313   12,					/* cost of reg,reg fld/fst */
2314   {14, 14, 14},				/* cost of loading fp registers
2315 					   in SFmode, DFmode and XFmode */
2316   {14, 14, 14},				/* cost of storing fp registers
2317 					   in SFmode, DFmode and XFmode */
2318   12,					/* cost of moving MMX register */
2319   {16, 16},				/* cost of loading MMX registers
2320 					   in SImode and DImode */
2321   {16, 16},				/* cost of storing MMX registers
2322 					   in SImode and DImode */
2323   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
2324   {16, 16, 16, 32, 64},			/* cost of loading SSE registers
2325 					   in 32,64,128,256 and 512-bit */
2326   {16, 16, 16, 32, 64},			/* cost of storing SSE registers
2327 					   in 32,64,128,256 and 512-bit */
2328   20, 12,				/* SSE->integer and integer->SSE moves */
2329   20, 12,				/* mask->integer and integer->mask moves */
2330   {4, 5, 4},				/* cost of loading mask register
2331 					   in QImode, HImode, SImode.  */
2332   {2, 3, 2},				/* cost if storing mask register
2333 					   in QImode, HImode, SImode.  */
2334   2,					/* cost of moving mask register.  */
2335   /* End of register allocator costs.  */
2336   },
2337 
2338   COSTS_N_INSNS (1),			/* cost of an add instruction */
2339   COSTS_N_INSNS (3),			/* cost of a lea instruction */
2340   COSTS_N_INSNS (4),			/* variable shift costs */
2341   COSTS_N_INSNS (4),			/* constant shift costs */
2342   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
2343    COSTS_N_INSNS (15),			/*				 HI */
2344    COSTS_N_INSNS (15),			/*				 SI */
2345    COSTS_N_INSNS (15),			/*				 DI */
2346    COSTS_N_INSNS (15)},			/*			      other */
2347   0,					/* cost of multiply per each bit set */
2348   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
2349    COSTS_N_INSNS (56),			/*			    HI */
2350    COSTS_N_INSNS (56),			/*			    SI */
2351    COSTS_N_INSNS (56),			/*			    DI */
2352    COSTS_N_INSNS (56)},			/*			    other */
2353   COSTS_N_INSNS (1),			/* cost of movsx */
2354   COSTS_N_INSNS (1),			/* cost of movzx */
2355   16,					/* "large" insn */
2356   6,					/* MOVE_RATIO */
2357   6,					/* CLEAR_RATIO */
2358   {4, 5, 4},				/* cost of loading integer registers
2359 					   in QImode, HImode and SImode.
2360 					   Relative to reg-reg move (2).  */
2361   {2, 3, 2},				/* cost of storing integer registers */
2362   {16, 16, 16, 32, 64},			/* cost of loading SSE register
2363 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2364   {16, 16, 16, 32, 64},			/* cost of storing SSE register
2365 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2366   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
2367   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
2368   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
2369   20,					/* cost of moving SSE register to integer.  */
2370   16, 16,				/* Gather load static, per_elt.  */
2371   16, 16,				/* Gather store static, per_elt.  */
2372   8,					/* size of l1 cache.  */
2373   256,					/* size of l2 cache.  */
2374   64,					/* size of prefetch block */
2375   6,					/* number of parallel prefetches */
2376   2,					/* Branch cost */
2377   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
2378   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
2379   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
2380   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
2381   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
2382   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
2383 
2384   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
2385   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2386   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
2387   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
2388   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2389   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2390   COSTS_N_INSNS (23),			/* cost of DIVSS instruction.  */
2391   COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
2392   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
2393   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
2394   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2395   pentium4_memcpy,
2396   pentium4_memset,
2397   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2398   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2399   NULL,					/* Loop alignment.  */
2400   NULL,					/* Jump alignment.  */
2401   NULL,					/* Label alignment.  */
2402   NULL,					/* Func alignment.  */
2403 };
2404 
2405 static stringop_algs nocona_memcpy[2] = {
2406   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2407   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2408              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2409 
2410 static stringop_algs nocona_memset[2] = {
2411   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2412              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2413   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2414              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2415 
2416 static const
2417 struct processor_costs nocona_cost = {
2418   {
2419   /* Start of register allocator costs.  integer->integer move cost is 2. */
2420   4,				     /* cost for loading QImode using movzbl */
2421   {4, 4, 4},				/* cost of loading integer registers
2422 					   in QImode, HImode and SImode.
2423 					   Relative to reg-reg move (2).  */
2424   {4, 4, 4},				/* cost of storing integer registers */
2425   12,					/* cost of reg,reg fld/fst */
2426   {14, 14, 14},				/* cost of loading fp registers
2427 					   in SFmode, DFmode and XFmode */
2428   {14, 14, 14},				/* cost of storing fp registers
2429 					   in SFmode, DFmode and XFmode */
2430   14,					/* cost of moving MMX register */
2431   {12, 12},				/* cost of loading MMX registers
2432 					   in SImode and DImode */
2433   {12, 12},				/* cost of storing MMX registers
2434 					   in SImode and DImode */
2435   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
2436   {12, 12, 12, 24, 48},			/* cost of loading SSE registers
2437 					   in 32,64,128,256 and 512-bit */
2438   {12, 12, 12, 24, 48},			/* cost of storing SSE registers
2439 					   in 32,64,128,256 and 512-bit */
2440   20, 12,				/* SSE->integer and integer->SSE moves */
2441   20, 12,				/* mask->integer and integer->mask moves */
2442   {4, 4, 4},				/* cost of loading mask register
2443 					   in QImode, HImode, SImode.  */
2444   {4, 4, 4},				/* cost if storing mask register
2445 					   in QImode, HImode, SImode.  */
2446   2,					/* cost of moving mask register.  */
2447   /* End of register allocator costs.  */
2448   },
2449 
2450   COSTS_N_INSNS (1),			/* cost of an add instruction */
2451   COSTS_N_INSNS (1),			/* cost of a lea instruction */
2452   COSTS_N_INSNS (1),			/* variable shift costs */
2453   COSTS_N_INSNS (1),			/* constant shift costs */
2454   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
2455    COSTS_N_INSNS (10),			/*				 HI */
2456    COSTS_N_INSNS (10),			/*				 SI */
2457    COSTS_N_INSNS (10),			/*				 DI */
2458    COSTS_N_INSNS (10)},			/*			      other */
2459   0,					/* cost of multiply per each bit set */
2460   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
2461    COSTS_N_INSNS (66),			/*			    HI */
2462    COSTS_N_INSNS (66),			/*			    SI */
2463    COSTS_N_INSNS (66),			/*			    DI */
2464    COSTS_N_INSNS (66)},			/*			    other */
2465   COSTS_N_INSNS (1),			/* cost of movsx */
2466   COSTS_N_INSNS (1),			/* cost of movzx */
2467   16,					/* "large" insn */
2468   17,					/* MOVE_RATIO */
2469   6,					/* CLEAR_RATIO */
2470   {4, 4, 4},				/* cost of loading integer registers
2471 					   in QImode, HImode and SImode.
2472 					   Relative to reg-reg move (2).  */
2473   {4, 4, 4},				/* cost of storing integer registers */
2474   {12, 12, 12, 24, 48},			/* cost of loading SSE register
2475 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2476   {12, 12, 12, 24, 48},			/* cost of storing SSE register
2477 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2478   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
2479   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
2480   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
2481   20,					/* cost of moving SSE register to integer.  */
2482   12, 12,				/* Gather load static, per_elt.  */
2483   12, 12,				/* Gather store static, per_elt.  */
2484   8,					/* size of l1 cache.  */
2485   1024,					/* size of l2 cache.  */
2486   64,					/* size of prefetch block */
2487   8,					/* number of parallel prefetches */
2488   1,					/* Branch cost */
2489   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
2490   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2491   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
2492   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
2493   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
2494   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
2495 
2496   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
2497   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2498   COSTS_N_INSNS (7),			/* cost of MULSS instruction.  */
2499   COSTS_N_INSNS (7),			/* cost of MULSD instruction.  */
2500   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
2501   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
2502   COSTS_N_INSNS (32),			/* cost of DIVSS instruction.  */
2503   COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
2504   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
2505   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
2506   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2507   nocona_memcpy,
2508   nocona_memset,
2509   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2510   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2511   NULL,					/* Loop alignment.  */
2512   NULL,					/* Jump alignment.  */
2513   NULL,					/* Label alignment.  */
2514   NULL,					/* Func alignment.  */
2515 };
2516 
2517 static stringop_algs atom_memcpy[2] = {
2518   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2519   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2520              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2521 static stringop_algs atom_memset[2] = {
2522   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2523              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2524   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2525              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2526 static const
2527 struct processor_costs atom_cost = {
2528   {
2529   /* Start of register allocator costs.  integer->integer move cost is 2. */
2530   6,					/* cost for loading QImode using movzbl */
2531   {6, 6, 6},				/* cost of loading integer registers
2532 					   in QImode, HImode and SImode.
2533 					   Relative to reg-reg move (2).  */
2534   {6, 6, 6},				/* cost of storing integer registers */
2535   4,					/* cost of reg,reg fld/fst */
2536   {6, 6, 18},				/* cost of loading fp registers
2537 					   in SFmode, DFmode and XFmode */
2538   {14, 14, 24},				/* cost of storing fp registers
2539 					   in SFmode, DFmode and XFmode */
2540   2,					/* cost of moving MMX register */
2541   {8, 8},				/* cost of loading MMX registers
2542 					   in SImode and DImode */
2543   {10, 10},				/* cost of storing MMX registers
2544 					   in SImode and DImode */
2545   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2546   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2547 					   in 32,64,128,256 and 512-bit */
2548   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2549 					   in 32,64,128,256 and 512-bit */
2550   8, 6,				/* SSE->integer and integer->SSE moves */
2551   8, 6,				/* mask->integer and integer->mask moves */
2552   {6, 6, 6},				/* cost of loading mask register
2553 					   in QImode, HImode, SImode.  */
2554   {6, 6, 6},			/* cost if storing mask register
2555 					   in QImode, HImode, SImode.  */
2556   2,					/* cost of moving mask register.  */
2557   /* End of register allocator costs.  */
2558   },
2559 
2560   COSTS_N_INSNS (1),			/* cost of an add instruction */
2561   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2562   COSTS_N_INSNS (1),			/* variable shift costs */
2563   COSTS_N_INSNS (1),			/* constant shift costs */
2564   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2565    COSTS_N_INSNS (4),			/*				 HI */
2566    COSTS_N_INSNS (3),			/*				 SI */
2567    COSTS_N_INSNS (4),			/*				 DI */
2568    COSTS_N_INSNS (2)},			/*			      other */
2569   0,					/* cost of multiply per each bit set */
2570   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2571    COSTS_N_INSNS (26),			/*			    HI */
2572    COSTS_N_INSNS (42),			/*			    SI */
2573    COSTS_N_INSNS (74),			/*			    DI */
2574    COSTS_N_INSNS (74)},			/*			    other */
2575   COSTS_N_INSNS (1),			/* cost of movsx */
2576   COSTS_N_INSNS (1),			/* cost of movzx */
2577   8,					/* "large" insn */
2578   17,					/* MOVE_RATIO */
2579   6,					/* CLEAR_RATIO */
2580   {6, 6, 6},				/* cost of loading integer registers
2581 					   in QImode, HImode and SImode.
2582 					   Relative to reg-reg move (2).  */
2583   {6, 6, 6},				/* cost of storing integer registers */
2584   {8, 8, 8, 16, 32},			/* cost of loading SSE register
2585 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2586   {8, 8, 8, 16, 32},			/* cost of storing SSE register
2587 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2588   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2589   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2590   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2591   8,					/* cost of moving SSE register to integer.  */
2592   8, 8,					/* Gather load static, per_elt.  */
2593   8, 8,					/* Gather store static, per_elt.  */
2594   32,					/* size of l1 cache.  */
2595   256,					/* size of l2 cache.  */
2596   64,					/* size of prefetch block */
2597   6,					/* number of parallel prefetches */
2598   3,					/* Branch cost */
2599   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2600   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2601   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2602   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2603   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2604   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2605 
2606   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2607   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2608   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2609   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2610   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2611   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2612   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
2613   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
2614   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
2615   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
2616   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2617   atom_memcpy,
2618   atom_memset,
2619   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2620   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2621   "16",					/* Loop alignment.  */
2622   "16:8:8",				/* Jump alignment.  */
2623   "0:0:8",				/* Label alignment.  */
2624   "16",					/* Func alignment.  */
2625 };
2626 
2627 static stringop_algs slm_memcpy[2] = {
2628   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2629   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2630              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2631 static stringop_algs slm_memset[2] = {
2632   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2633              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2634   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2635              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2636 static const
2637 struct processor_costs slm_cost = {
2638   {
2639   /* Start of register allocator costs.  integer->integer move cost is 2. */
2640   8,					/* cost for loading QImode using movzbl */
2641   {8, 8, 8},				/* cost of loading integer registers
2642 					   in QImode, HImode and SImode.
2643 					   Relative to reg-reg move (2).  */
2644   {6, 6, 6},				/* cost of storing integer registers */
2645   2,					/* cost of reg,reg fld/fst */
2646   {8, 8, 18},				/* cost of loading fp registers
2647 					   in SFmode, DFmode and XFmode */
2648   {6, 6, 18},				/* cost of storing fp registers
2649 					   in SFmode, DFmode and XFmode */
2650   2,					/* cost of moving MMX register */
2651   {8, 8},				/* cost of loading MMX registers
2652 					   in SImode and DImode */
2653   {6, 6},				/* cost of storing MMX registers
2654 					   in SImode and DImode */
2655   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2656   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2657 					   in 32,64,128,256 and 512-bit */
2658   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2659 					   in 32,64,128,256 and 512-bit */
2660   8, 6,				/* SSE->integer and integer->SSE moves */
2661   8, 6,				/* mask->integer and integer->mask moves */
2662   {8, 8, 8},			/* cost of loading mask register
2663 					   in QImode, HImode, SImode.  */
2664   {6, 6, 6},			/* cost if storing mask register
2665 					   in QImode, HImode, SImode.  */
2666   2,					/* cost of moving mask register.  */
2667   /* End of register allocator costs.  */
2668   },
2669 
2670   COSTS_N_INSNS (1),			/* cost of an add instruction */
2671   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2672   COSTS_N_INSNS (1),			/* variable shift costs */
2673   COSTS_N_INSNS (1),			/* constant shift costs */
2674   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2675    COSTS_N_INSNS (3),			/*				 HI */
2676    COSTS_N_INSNS (3),			/*				 SI */
2677    COSTS_N_INSNS (4),			/*				 DI */
2678    COSTS_N_INSNS (2)},			/*			      other */
2679   0,					/* cost of multiply per each bit set */
2680   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2681    COSTS_N_INSNS (26),			/*			    HI */
2682    COSTS_N_INSNS (42),			/*			    SI */
2683    COSTS_N_INSNS (74),			/*			    DI */
2684    COSTS_N_INSNS (74)},			/*			    other */
2685   COSTS_N_INSNS (1),			/* cost of movsx */
2686   COSTS_N_INSNS (1),			/* cost of movzx */
2687   8,					/* "large" insn */
2688   17,					/* MOVE_RATIO */
2689   6,					/* CLEAR_RATIO */
2690   {8, 8, 8},				/* cost of loading integer registers
2691 					   in QImode, HImode and SImode.
2692 					   Relative to reg-reg move (2).  */
2693   {6, 6, 6},				/* cost of storing integer registers */
2694   {8, 8, 8, 16, 32},			/* cost of loading SSE register
2695 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2696   {8, 8, 8, 16, 32},			/* cost of storing SSE register
2697 					   in SImode, DImode and TImode.  */
2698   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2699   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2700   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2701   8,					/* cost of moving SSE register to integer.  */
2702   8, 8,					/* Gather load static, per_elt.  */
2703   8, 8,					/* Gather store static, per_elt.  */
2704   32,					/* size of l1 cache.  */
2705   256,					/* size of l2 cache.  */
2706   64,					/* size of prefetch block */
2707   6,					/* number of parallel prefetches */
2708   3,					/* Branch cost */
2709   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2710   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2711   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2712   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2713   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2714   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2715 
2716   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2717   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2718   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2719   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2720   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2721   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2722   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
2723   COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
2724   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
2725   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
2726   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2727   slm_memcpy,
2728   slm_memset,
2729   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2730   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2731   "16",					/* Loop alignment.  */
2732   "16:8:8",				/* Jump alignment.  */
2733   "0:0:8",				/* Label alignment.  */
2734   "16",					/* Func alignment.  */
2735 };
2736 
2737 static stringop_algs intel_memcpy[2] = {
2738   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2739   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2740              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2741 static stringop_algs intel_memset[2] = {
2742   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2743              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2744   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2745              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2746 static const
2747 struct processor_costs intel_cost = {
2748   {
2749   /* Start of register allocator costs.  integer->integer move cost is 2. */
2750   6,				     /* cost for loading QImode using movzbl */
2751   {4, 4, 4},				/* cost of loading integer registers
2752 					   in QImode, HImode and SImode.
2753 					   Relative to reg-reg move (2).  */
2754   {6, 6, 6},				/* cost of storing integer registers */
2755   2,					/* cost of reg,reg fld/fst */
2756   {6, 6, 8},				/* cost of loading fp registers
2757 					   in SFmode, DFmode and XFmode */
2758   {6, 6, 10},				/* cost of storing fp registers
2759 					   in SFmode, DFmode and XFmode */
2760   2,					/* cost of moving MMX register */
2761   {6, 6},				/* cost of loading MMX registers
2762 					   in SImode and DImode */
2763   {6, 6},				/* cost of storing MMX registers
2764 					   in SImode and DImode */
2765   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
2766   {6, 6, 6, 6, 6},			/* cost of loading SSE registers
2767 					   in 32,64,128,256 and 512-bit */
2768   {6, 6, 6, 6, 6},			/* cost of storing SSE registers
2769 					   in 32,64,128,256 and 512-bit */
2770   4, 4,				/* SSE->integer and integer->SSE moves */
2771   4, 4,				/* mask->integer and integer->mask moves */
2772   {4, 4, 4},				/* cost of loading mask register
2773 					   in QImode, HImode, SImode.  */
2774   {6, 6, 6},				/* cost if storing mask register
2775 					   in QImode, HImode, SImode.  */
2776   2,					/* cost of moving mask register.  */
2777   /* End of register allocator costs.  */
2778   },
2779 
2780   COSTS_N_INSNS (1),			/* cost of an add instruction */
2781   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2782   COSTS_N_INSNS (1),			/* variable shift costs */
2783   COSTS_N_INSNS (1),			/* constant shift costs */
2784   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2785    COSTS_N_INSNS (3),			/*				 HI */
2786    COSTS_N_INSNS (3),			/*				 SI */
2787    COSTS_N_INSNS (4),			/*				 DI */
2788    COSTS_N_INSNS (2)},			/*			      other */
2789   0,					/* cost of multiply per each bit set */
2790   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2791    COSTS_N_INSNS (26),			/*			    HI */
2792    COSTS_N_INSNS (42),			/*			    SI */
2793    COSTS_N_INSNS (74),			/*			    DI */
2794    COSTS_N_INSNS (74)},			/*			    other */
2795   COSTS_N_INSNS (1),			/* cost of movsx */
2796   COSTS_N_INSNS (1),			/* cost of movzx */
2797   8,					/* "large" insn */
2798   17,					/* MOVE_RATIO */
2799   6,					/* CLEAR_RATIO */
2800   {4, 4, 4},				/* cost of loading integer registers
2801 					   in QImode, HImode and SImode.
2802 					   Relative to reg-reg move (2).  */
2803   {6, 6, 6},				/* cost of storing integer registers */
2804   {6, 6, 6, 6, 6},			/* cost of loading SSE register
2805 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2806   {6, 6, 6, 6, 6},			/* cost of storing SSE register
2807 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2808   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
2809   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
2810   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
2811   4,					/* cost of moving SSE register to integer.  */
2812   6, 6,					/* Gather load static, per_elt.  */
2813   6, 6,					/* Gather store static, per_elt.  */
2814   32,					/* size of l1 cache.  */
2815   256,					/* size of l2 cache.  */
2816   64,					/* size of prefetch block */
2817   6,					/* number of parallel prefetches */
2818   3,					/* Branch cost */
2819   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2820   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2821   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2822   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2823   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2824   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2825 
2826   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2827   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2828   COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
2829   COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
2830   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2831   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2832   COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
2833   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
2834   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
2835   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
2836   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2837   intel_memcpy,
2838   intel_memset,
2839   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2840   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2841   "16",					/* Loop alignment.  */
2842   "16:8:8",				/* Jump alignment.  */
2843   "0:0:8",				/* Label alignment.  */
2844   "16",					/* Func alignment.  */
2845 };
2846 
2847 /* Generic should produce code tuned for Core-i7 (and newer chips)
2848    and btver1 (and newer chips).  */
2849 
2850 static stringop_algs generic_memcpy[2] = {
2851   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2852              {-1, libcall, false}}},
2853   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2854              {-1, libcall, false}}}};
2855 static stringop_algs generic_memset[2] = {
2856   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2857              {-1, libcall, false}}},
2858   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2859              {-1, libcall, false}}}};
2860 static const
2861 struct processor_costs generic_cost = {
2862   {
2863   /* Start of register allocator costs.  integer->integer move cost is 2. */
2864   6,				     /* cost for loading QImode using movzbl */
2865   {6, 6, 6},				/* cost of loading integer registers
2866 					   in QImode, HImode and SImode.
2867 					   Relative to reg-reg move (2).  */
2868   {6, 6, 6},				/* cost of storing integer registers */
2869   4,					/* cost of reg,reg fld/fst */
2870   {6, 6, 12},				/* cost of loading fp registers
2871 					   in SFmode, DFmode and XFmode */
2872   {6, 6, 12},				/* cost of storing fp registers
2873 					   in SFmode, DFmode and XFmode */
2874   2,					/* cost of moving MMX register */
2875   {6, 6},				/* cost of loading MMX registers
2876 					   in SImode and DImode */
2877   {6, 6},				/* cost of storing MMX registers
2878 					   in SImode and DImode */
2879   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2880   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
2881 					   in 32,64,128,256 and 512-bit */
2882   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
2883 					   in 32,64,128,256 and 512-bit */
2884   6, 6,				/* SSE->integer and integer->SSE moves */
2885   6, 6,				/* mask->integer and integer->mask moves */
2886   {6, 6, 6},				/* cost of loading mask register
2887 					   in QImode, HImode, SImode.  */
2888   {6, 6, 6},			/* cost if storing mask register
2889 					   in QImode, HImode, SImode.  */
2890   2,					/* cost of moving mask register.  */
2891   /* End of register allocator costs.  */
2892   },
2893 
2894   COSTS_N_INSNS (1),			/* cost of an add instruction */
2895   /* Setting cost to 2 makes our current implementation of synth_mult result in
2896      use of unnecessary temporary registers causing regression on several
2897      SPECfp benchmarks.  */
2898   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2899   COSTS_N_INSNS (1),			/* variable shift costs */
2900   COSTS_N_INSNS (1),			/* constant shift costs */
2901   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2902    COSTS_N_INSNS (4),			/*				 HI */
2903    COSTS_N_INSNS (3),			/*				 SI */
2904    COSTS_N_INSNS (4),			/*				 DI */
2905    COSTS_N_INSNS (4)},			/*			      other */
2906   0,					/* cost of multiply per each bit set */
2907   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
2908    COSTS_N_INSNS (22),			/*			    HI */
2909    COSTS_N_INSNS (30),			/*			    SI */
2910    COSTS_N_INSNS (74),			/*			    DI */
2911    COSTS_N_INSNS (74)},			/*			    other */
2912   COSTS_N_INSNS (1),			/* cost of movsx */
2913   COSTS_N_INSNS (1),			/* cost of movzx */
2914   8,					/* "large" insn */
2915   17,					/* MOVE_RATIO */
2916   6,					/* CLEAR_RATIO */
2917   {6, 6, 6},				/* cost of loading integer registers
2918 					   in QImode, HImode and SImode.
2919 					   Relative to reg-reg move (2).  */
2920   {6, 6, 6},				/* cost of storing integer registers */
2921   {6, 6, 6, 10, 15},			/* cost of loading SSE register
2922 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2923   {6, 6, 6, 10, 15},			/* cost of storing SSE register
2924 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
2925   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
2926   {6, 6, 6, 10, 15},			/* cost of unaligned storess.  */
2927   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2928   6,					/* cost of moving SSE register to integer.  */
2929   18, 6,				/* Gather load static, per_elt.  */
2930   18, 6,				/* Gather store static, per_elt.  */
2931   32,					/* size of l1 cache.  */
2932   512,					/* size of l2 cache.  */
2933   64,					/* size of prefetch block */
2934   6,					/* number of parallel prefetches */
2935   /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
2936      value is increased to perhaps more appropriate value of 5.  */
2937   3,					/* Branch cost */
2938   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2939   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2940   COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
2941   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2942   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2943   COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
2944 
2945   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2946   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2947   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2948   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2949   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2950   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2951   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2952   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
2953   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
2954   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2955   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
2956   generic_memcpy,
2957   generic_memset,
2958   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
2959   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
2960   "16:11:8",				/* Loop alignment.  */
2961   "16:11:8",				/* Jump alignment.  */
2962   "0:0:8",				/* Label alignment.  */
2963   "16",					/* Func alignment.  */
2964 };
2965 
2966 /* core_cost should produce code tuned for Core familly of CPUs.  */
2967 static stringop_algs core_memcpy[2] = {
2968   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2969   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2970              {-1, libcall, false}}}};
2971 static stringop_algs core_memset[2] = {
2972   {libcall, {{6, loop_1_byte, true},
2973              {24, loop, true},
2974              {8192, rep_prefix_4_byte, true},
2975              {-1, libcall, false}}},
2976   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2977              {-1, libcall, false}}}};
2978 
2979 static const
2980 struct processor_costs core_cost = {
2981   {
2982   /* Start of register allocator costs.  integer->integer move cost is 2. */
2983   6,				     /* cost for loading QImode using movzbl */
2984   {4, 4, 4},				/* cost of loading integer registers
2985 					   in QImode, HImode and SImode.
2986 					   Relative to reg-reg move (2).  */
2987   {6, 6, 6},				/* cost of storing integer registers */
2988   2,					/* cost of reg,reg fld/fst */
2989   {6, 6, 8},				/* cost of loading fp registers
2990 					   in SFmode, DFmode and XFmode */
2991   {6, 6, 10},				/* cost of storing fp registers
2992 					   in SFmode, DFmode and XFmode */
2993   2,					/* cost of moving MMX register */
2994   {6, 6},				/* cost of loading MMX registers
2995 					   in SImode and DImode */
2996   {6, 6},				/* cost of storing MMX registers
2997 					   in SImode and DImode */
2998   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2999   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
3000 					   in 32,64,128,256 and 512-bit */
3001   {6, 6, 6, 6, 12},			/* cost of storing SSE registers
3002 					   in 32,64,128,256 and 512-bit */
3003   6, 6,				/* SSE->integer and integer->SSE moves */
3004   6, 6,				/* mask->integer and integer->mask moves */
3005   {4, 4, 4},				/* cost of loading mask register
3006 					   in QImode, HImode, SImode.  */
3007   {6, 6, 6},				/* cost if storing mask register
3008 					   in QImode, HImode, SImode.  */
3009   2,					/* cost of moving mask register.  */
3010   /* End of register allocator costs.  */
3011   },
3012 
3013   COSTS_N_INSNS (1),			/* cost of an add instruction */
3014   /* On all chips taken into consideration lea is 2 cycles and more.  With
3015      this cost however our current implementation of synth_mult results in
3016      use of unnecessary temporary registers causing regression on several
3017      SPECfp benchmarks.  */
3018   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
3019   COSTS_N_INSNS (1),			/* variable shift costs */
3020   COSTS_N_INSNS (1),			/* constant shift costs */
3021   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
3022    COSTS_N_INSNS (4),			/*				 HI */
3023    COSTS_N_INSNS (3),			/*				 SI */
3024    /* Here we tune for Sandybridge or newer.  */
3025    COSTS_N_INSNS (3),			/*				 DI */
3026    COSTS_N_INSNS (3)},			/*			      other */
3027   0,					/* cost of multiply per each bit set */
3028   /* Expanding div/mod currently doesn't consider parallelism. So the cost
3029      model is not realistic. We compensate by increasing the latencies a bit.  */
3030   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
3031    COSTS_N_INSNS (11),			/*			    HI */
3032    COSTS_N_INSNS (14),			/*			    SI */
3033    COSTS_N_INSNS (81),			/*			    DI */
3034    COSTS_N_INSNS (81)},			/*			    other */
3035   COSTS_N_INSNS (1),			/* cost of movsx */
3036   COSTS_N_INSNS (1),			/* cost of movzx */
3037   8,					/* "large" insn */
3038   17,					/* MOVE_RATIO */
3039   6,					/* CLEAR_RATIO */
3040   {4, 4, 4},				/* cost of loading integer registers
3041 					   in QImode, HImode and SImode.
3042 					   Relative to reg-reg move (2).  */
3043   {6, 6, 6},				/* cost of storing integer registers */
3044   {6, 6, 6, 6, 12},			/* cost of loading SSE register
3045 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3046   {6, 6, 6, 6, 12},			/* cost of storing SSE register
3047 					   in 32bit, 64bit, 128bit, 256bit and 512bit */
3048   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
3049   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
3050   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
3051   2,					/* cost of moving SSE register to integer.  */
3052   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
3053      rec. throughput 6.
3054      So 5 uops statically and one uops per load.  */
3055   10, 6,				/* Gather load static, per_elt.  */
3056   10, 6,				/* Gather store static, per_elt.  */
3057   64,					/* size of l1 cache.  */
3058   512,					/* size of l2 cache.  */
3059   64,					/* size of prefetch block */
3060   6,					/* number of parallel prefetches */
3061   /* FIXME perhaps more appropriate value is 5.  */
3062   3,					/* Branch cost */
3063   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
3064   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
3065   /* 10-24 */
3066   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
3067   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
3068   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
3069   COSTS_N_INSNS (23),			/* cost of FSQRT instruction.  */
3070 
3071   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
3072   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
3073   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
3074   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
3075   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
3076   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
3077   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
3078   COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
3079   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
3080   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
3081   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
3082   core_memcpy,
3083   core_memset,
3084   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
3085   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
3086   "16:11:8",				/* Loop alignment.  */
3087   "16:11:8",				/* Jump alignment.  */
3088   "0:0:8",				/* Label alignment.  */
3089   "16",					/* Func alignment.  */
3090 };
3091 
3092