1 /* Costs of operations of individual x86 CPUs.
2    Copyright (C) 1988-2018 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10 
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
19 
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
23 <http://www.gnu.org/licenses/>.  */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   COSTS_N_BYTES (N) states a cost as an encoded size of N bytes.  It
   expands to (N) * 2, so a 2-byte instruction costs 4, which equals
   COSTS_N_INSNS (1) under the assumption above.  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
27 
/* Placeholder stringop_algs initializer that names libcall both as the
   leading algorithm and in its single {-1, ...} row.  The tables below
   use it as the second array element for i386 through k6.
   NOTE(review): presumably that element is the 64-bit variant, which
   these CPUs never need -- confirm against the stringop_algs users.  */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29 
/* memcpy strategy table referenced by ix86_size_cost.  Both elements are
   identical: rep_prefix_1_byte is the leading algorithm and also the
   algorithm of the single {-1, ...} row.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants, and -1 presumably means "no size limit" -- confirm against
   the stringop_algs declaration, which is not visible here.  */
30 static stringop_algs ix86_size_memcpy[2] = {
31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy table referenced by ix86_size_cost.  Both elements
   select rep_prefix_1_byte, as the leading algorithm and in the single
   {-1, ...} row.
   NOTE(review): element meaning (presumably 32-bit / 64-bit) is not
   visible here -- confirm against the stringop_algs declaration.  */
33 static stringop_algs ix86_size_memset[2] = {
34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36 
/* Cost table for size tuning (see the "costs for tuning for size" tag
   below).  Instruction costs are written with COSTS_N_BYTES rather than
   COSTS_N_INSNS, the cache and prefetch parameters are all 0, and the
   string operations use ix86_size_memcpy / ix86_size_memset.  Unlike the
   other tables in this part of the file it is not declared static.
   The initializer is positional: each value is identified only by the
   comment on its line, so the order must stay in sync with
   struct processor_costs, which is declared outside this section.  */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39   COSTS_N_BYTES (2),			/* cost of an add instruction */
40   COSTS_N_BYTES (3),			/* cost of a lea instruction */
41   COSTS_N_BYTES (2),			/* variable shift costs */
42   COSTS_N_BYTES (3),			/* constant shift costs */
43   {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
44    COSTS_N_BYTES (3),			/*				 HI */
45    COSTS_N_BYTES (3),			/*				 SI */
46    COSTS_N_BYTES (3),			/*				 DI */
47    COSTS_N_BYTES (5)},			/*			      other */
48   0,					/* cost of multiply per each bit set */
49   {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
50    COSTS_N_BYTES (3),			/*			    HI */
51    COSTS_N_BYTES (3),			/*			    SI */
52    COSTS_N_BYTES (3),			/*			    DI */
53    COSTS_N_BYTES (5)},			/*			    other */
54   COSTS_N_BYTES (3),			/* cost of movsx */
55   COSTS_N_BYTES (3),			/* cost of movzx */
56   0,					/* "large" insn */
57   2,					/* MOVE_RATIO */
58 
59   /* All move costs are relative to integer->integer move times 2. */
60   2,				     /* cost for loading QImode using movzbl */
61   {2, 2, 2},				/* cost of loading integer registers
62 					   in QImode, HImode and SImode.
63 					   Relative to reg-reg move (2).  */
64   {2, 2, 2},				/* cost of storing integer registers */
65   2,					/* cost of reg,reg fld/fst */
66   {2, 2, 2},				/* cost of loading fp registers
67 					   in SFmode, DFmode and XFmode */
68   {2, 2, 2},				/* cost of storing fp registers
69 					   in SFmode, DFmode and XFmode */
70   3,					/* cost of moving MMX register */
71   {3, 3},				/* cost of loading MMX registers
72 					   in SImode and DImode */
73   {3, 3},				/* cost of storing MMX registers
74 					   in SImode and DImode */
75   3, 3, 3,				/* cost of moving XMM,YMM,ZMM register */
76   {3, 3, 3, 3, 3},			/* cost of loading SSE registers
77 					   in 32,64,128,256 and 512-bit */
78   {3, 3, 3, 3, 3},			/* cost of unaligned SSE load (5 entries;
79 					   presumably 32..512-bit as above) */
80   {3, 3, 3, 3, 3},			/* cost of storing SSE registers
81 					   in 32,64,128,256 and 512-bit */
82   {3, 3, 3, 3, 3},				/* cost of unaligned SSE store (5 entries;
83 					   presumably 32..512-bit as above) */
84   3, 3,					/* SSE->integer and integer->SSE moves */
85   5, 0,					/* Gather load static, per_elt.  */
86   5, 0,					/* Gather store static, per_elt.  */
87   0,					/* size of l1 cache  */
88   0,					/* size of l2 cache  */
89   0,					/* size of prefetch block */
90   0,					/* number of parallel prefetches */
91   2,					/* Branch cost */
92   COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
93   COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
94   COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
95   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
96   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
97   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
98 
99   COSTS_N_BYTES (2),			/* cost of cheap SSE instruction.  */
100   COSTS_N_BYTES (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
101   COSTS_N_BYTES (2),			/* cost of MULSS instruction.  */
102   COSTS_N_BYTES (2),			/* cost of MULSD instruction.  */
103   COSTS_N_BYTES (2),			/* cost of FMA SS instruction.  */
104   COSTS_N_BYTES (2),			/* cost of FMA SD instruction.  */
105   COSTS_N_BYTES (2),			/* cost of DIVSS instruction.  */
106   COSTS_N_BYTES (2),			/* cost of DIVSD instruction.  */
107   COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
108   COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
109   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
110   ix86_size_memcpy,
111   ix86_size_memset,
112   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
113   COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
114 };
115 
116 /* Processor costs (relative to an add) */
/* memcpy strategy table referenced by i386_cost.  The first element
   selects rep_prefix_1_byte (leading algorithm and the single {-1, ...}
   row); the second is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant
   -- confirm against the stringop_algs declaration.  */
117 static stringop_algs i386_memcpy[2] = {
118   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
119   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i386_cost.  The first element
   selects rep_prefix_1_byte (leading algorithm and the single {-1, ...}
   row); the second is the DUMMY_STRINGOP_ALGS placeholder.  */
120 static stringop_algs i386_memset[2] = {
121   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122   DUMMY_STRINGOP_ALGS};
123 
/* Cost table for the 386.  Instruction costs are written with
   COSTS_N_INSNS; the bare integers (move costs, ratios, cache and
   prefetch parameters, branch cost) are raw values.  All cache and
   prefetch parameters are 0.  String operations use i386_memcpy /
   i386_memset.
   The initializer is positional: each value is identified only by the
   comment on its line, so the order must stay in sync with
   struct processor_costs, which is declared outside this section.  */
124 static const
125 struct processor_costs i386_cost = {	/* 386 specific costs */
126   COSTS_N_INSNS (1),			/* cost of an add instruction */
127   COSTS_N_INSNS (1),			/* cost of a lea instruction */
128   COSTS_N_INSNS (3),			/* variable shift costs */
129   COSTS_N_INSNS (2),			/* constant shift costs */
130   {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
131    COSTS_N_INSNS (6),			/*				 HI */
132    COSTS_N_INSNS (6),			/*				 SI */
133    COSTS_N_INSNS (6),			/*				 DI */
134    COSTS_N_INSNS (6)},			/*			      other */
135   COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
136   {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
137    COSTS_N_INSNS (23),			/*			    HI */
138    COSTS_N_INSNS (23),			/*			    SI */
139    COSTS_N_INSNS (23),			/*			    DI */
140    COSTS_N_INSNS (23)},			/*			    other */
141   COSTS_N_INSNS (3),			/* cost of movsx */
142   COSTS_N_INSNS (2),			/* cost of movzx */
143   15,					/* "large" insn */
144   3,					/* MOVE_RATIO */
145 
146   /* All move costs are relative to integer->integer move times 2 and thus
147      they are latency*2. */
148   4,				     /* cost for loading QImode using movzbl */
149   {2, 4, 2},				/* cost of loading integer registers
150 					   in QImode, HImode and SImode.
151 					   Relative to reg-reg move (2).  */
152   {2, 4, 2},				/* cost of storing integer registers */
153   2,					/* cost of reg,reg fld/fst */
154   {8, 8, 8},				/* cost of loading fp registers
155 					   in SFmode, DFmode and XFmode */
156   {8, 8, 8},				/* cost of storing fp registers
157 					   in SFmode, DFmode and XFmode */
158   2,					/* cost of moving MMX register */
159   {4, 8},				/* cost of loading MMX registers
160 					   in SImode and DImode */
161   {4, 8},				/* cost of storing MMX registers
162 					   in SImode and DImode */
163   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
164   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
165 					   in 32,64,128,256 and 512-bit */
166   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
167   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
168 					   in 32,64,128,256 and 512-bit */
169   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
170   3, 3,					/* SSE->integer and integer->SSE moves */
171   4, 4,					/* Gather load static, per_elt.  */
172   4, 4,					/* Gather store static, per_elt.  */
173   0,					/* size of l1 cache  */
174   0,					/* size of l2 cache  */
175   0,					/* size of prefetch block */
176   0,					/* number of parallel prefetches */
177   1,					/* Branch cost */
178   COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
179   COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
180   COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
181   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
182   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
183   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
184 
185   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
186   COSTS_N_INSNS (23),			/* cost of ADDSS/SD SUBSS/SD insns.  */
187   COSTS_N_INSNS (27),			/* cost of MULSS instruction.  */
188   COSTS_N_INSNS (27),			/* cost of MULSD instruction.  */
189   COSTS_N_INSNS (27),			/* cost of FMA SS instruction.  */
190   COSTS_N_INSNS (27),			/* cost of FMA SD instruction.  */
191   COSTS_N_INSNS (88),			/* cost of DIVSS instruction.  */
192   COSTS_N_INSNS (88),			/* cost of DIVSD instruction.  */
193   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
194   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
195   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
196   i386_memcpy,
197   i386_memset,
198   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
199   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
200 };
201 
/* memcpy strategy table referenced by i486_cost.  The first element
   selects rep_prefix_4_byte (leading algorithm and the single {-1, ...}
   row); the second is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant
   -- confirm against the stringop_algs declaration.  */
202 static stringop_algs i486_memcpy[2] = {
203   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
204   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i486_cost.  The first element
   selects rep_prefix_4_byte (leading algorithm and the single {-1, ...}
   row); the second is the DUMMY_STRINGOP_ALGS placeholder.  */
205 static stringop_algs i486_memset[2] = {
206   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
207   DUMMY_STRINGOP_ALGS};
208 
/* Cost table for the 486.  Instruction costs are written with
   COSTS_N_INSNS; the bare integers are raw values.  String operations
   use i486_memcpy / i486_memset.
   The initializer is positional: each value is identified only by the
   comment on its line, so the order must stay in sync with
   struct processor_costs, which is declared outside this section.
   NOTE(review): "cost of multiply per each bit set" is a raw 1 here,
   whereas i386_cost writes COSTS_N_INSNS (1) for the same field --
   confirm which scale is intended.
   NOTE(review): DIVSD is COSTS_N_INSNS (74) while DIVSS and FDIV are
   COSTS_N_INSNS (73) -- confirm the difference is deliberate.  */
209 static const
210 struct processor_costs i486_cost = {	/* 486 specific costs */
211   COSTS_N_INSNS (1),			/* cost of an add instruction */
212   COSTS_N_INSNS (1),			/* cost of a lea instruction */
213   COSTS_N_INSNS (3),			/* variable shift costs */
214   COSTS_N_INSNS (2),			/* constant shift costs */
215   {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
216    COSTS_N_INSNS (12),			/*				 HI */
217    COSTS_N_INSNS (12),			/*				 SI */
218    COSTS_N_INSNS (12),			/*				 DI */
219    COSTS_N_INSNS (12)},			/*			      other */
220   1,					/* cost of multiply per each bit set */
221   {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
222    COSTS_N_INSNS (40),			/*			    HI */
223    COSTS_N_INSNS (40),			/*			    SI */
224    COSTS_N_INSNS (40),			/*			    DI */
225    COSTS_N_INSNS (40)},			/*			    other */
226   COSTS_N_INSNS (3),			/* cost of movsx */
227   COSTS_N_INSNS (2),			/* cost of movzx */
228   15,					/* "large" insn */
229   3,					/* MOVE_RATIO */
230 
231   /* All move costs are relative to integer->integer move times 2 and thus
232      they are latency*2. */
233   4,				     /* cost for loading QImode using movzbl */
234   {2, 4, 2},				/* cost of loading integer registers
235 					   in QImode, HImode and SImode.
236 					   Relative to reg-reg move (2).  */
237   {2, 4, 2},				/* cost of storing integer registers */
238   2,					/* cost of reg,reg fld/fst */
239   {8, 8, 8},				/* cost of loading fp registers
240 					   in SFmode, DFmode and XFmode */
241   {8, 8, 8},				/* cost of storing fp registers
242 					   in SFmode, DFmode and XFmode */
243   2,					/* cost of moving MMX register */
244   {4, 8},				/* cost of loading MMX registers
245 					   in SImode and DImode */
246   {4, 8},				/* cost of storing MMX registers
247 					   in SImode and DImode */
248   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
249   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
250 					   in 32,64,128,256 and 512-bit */
251   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
252   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
253 					   in 32,64,128,256 and 512-bit */
254   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
255   3, 3,					/* SSE->integer and integer->SSE moves */
256   4, 4,					/* Gather load static, per_elt.  */
257   4, 4,					/* Gather store static, per_elt.  */
258   4,					/* size of l1 cache.  486 has 8kB cache
259 					   shared for code and data, so 4kB is
260 					   not really precise.  */
261   4,					/* size of l2 cache  */
262   0,					/* size of prefetch block */
263   0,					/* number of parallel prefetches */
264   1,					/* Branch cost */
265   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
266   COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
267   COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
268   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
269   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
270   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
271 
272   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
273   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
274   COSTS_N_INSNS (16),			/* cost of MULSS instruction.  */
275   COSTS_N_INSNS (16),			/* cost of MULSD instruction.  */
276   COSTS_N_INSNS (16),			/* cost of FMA SS instruction.  */
277   COSTS_N_INSNS (16),			/* cost of FMA SD instruction.  */
278   COSTS_N_INSNS (73),			/* cost of DIVSS instruction.  */
279   COSTS_N_INSNS (74),			/* cost of DIVSD instruction.  */
280   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
281   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
282   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
283   i486_memcpy,
284   i486_memset,
285   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
286   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
287 };
288 
/* memcpy strategy table referenced by both pentium_cost and
   lakemont_cost.  The first element names libcall as the leading
   algorithm and lists two rows, {256, rep_prefix_4_byte} and
   {-1, libcall}; the second element is the DUMMY_STRINGOP_ALGS
   placeholder.
   NOTE(review): presumably the rows mean "rep_prefix_4_byte for blocks up
   to 256 bytes, libcall for anything larger" -- confirm against the
   stringop_algs declaration.  */
289 static stringop_algs pentium_memcpy[2] = {
290   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
291   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by both pentium_cost and
   lakemont_cost.  The first element names libcall as the leading
   algorithm and has a single {-1, rep_prefix_4_byte} row; the second
   element is the DUMMY_STRINGOP_ALGS placeholder.  */
292 static stringop_algs pentium_memset[2] = {
293   {libcall, {{-1, rep_prefix_4_byte, false}}},
294   DUMMY_STRINGOP_ALGS};
295 
/* Cost table for the Pentium.  Instruction costs are written with
   COSTS_N_INSNS; the bare integers are raw values.  String operations
   use pentium_memcpy / pentium_memset, which lakemont_cost also uses.
   The initializer is positional: each value is identified only by the
   comment on its line, so the order must stay in sync with
   struct processor_costs, which is declared outside this section.  */
296 static const
297 struct processor_costs pentium_cost = {
298   COSTS_N_INSNS (1),			/* cost of an add instruction */
299   COSTS_N_INSNS (1),			/* cost of a lea instruction */
300   COSTS_N_INSNS (4),			/* variable shift costs */
301   COSTS_N_INSNS (1),			/* constant shift costs */
302   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
303    COSTS_N_INSNS (11),			/*				 HI */
304    COSTS_N_INSNS (11),			/*				 SI */
305    COSTS_N_INSNS (11),			/*				 DI */
306    COSTS_N_INSNS (11)},			/*			      other */
307   0,					/* cost of multiply per each bit set */
308   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
309    COSTS_N_INSNS (25),			/*			    HI */
310    COSTS_N_INSNS (25),			/*			    SI */
311    COSTS_N_INSNS (25),			/*			    DI */
312    COSTS_N_INSNS (25)},			/*			    other */
313   COSTS_N_INSNS (3),			/* cost of movsx */
314   COSTS_N_INSNS (2),			/* cost of movzx */
315   8,					/* "large" insn */
316   6,					/* MOVE_RATIO */
317 
318   /* All move costs are relative to integer->integer move times 2 and thus
319      they are latency*2. */
320   6,				     /* cost for loading QImode using movzbl */
321   {2, 4, 2},				/* cost of loading integer registers
322 					   in QImode, HImode and SImode.
323 					   Relative to reg-reg move (2).  */
324   {2, 4, 2},				/* cost of storing integer registers */
325   2,					/* cost of reg,reg fld/fst */
326   {2, 2, 6},				/* cost of loading fp registers
327 					   in SFmode, DFmode and XFmode */
328   {4, 4, 6},				/* cost of storing fp registers
329 					   in SFmode, DFmode and XFmode */
330   8,					/* cost of moving MMX register */
331   {8, 8},				/* cost of loading MMX registers
332 					   in SImode and DImode */
333   {8, 8},				/* cost of storing MMX registers
334 					   in SImode and DImode */
335   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
336   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
337 					   in 32,64,128,256 and 512-bit */
338   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
339   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
340 					   in 32,64,128,256 and 512-bit */
341   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
342   3, 3,					/* SSE->integer and integer->SSE moves */
343   4, 4,					/* Gather load static, per_elt.  */
344   4, 4,					/* Gather store static, per_elt.  */
345   8,					/* size of l1 cache.  */
346   8,					/* size of l2 cache  */
347   0,					/* size of prefetch block */
348   0,					/* number of parallel prefetches */
349   2,					/* Branch cost */
350   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
351   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
352   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
353   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
354   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
355   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
356 
357   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
358   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
359   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
360   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
361   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
362   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
363   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
364   COSTS_N_INSNS (39),			/* cost of DIVSD instruction.  */
365   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
366   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
367   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
368   pentium_memcpy,
369   pentium_memset,
370   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
371   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
372 };
373 
/* Cost table for Lakemont.  It reuses pentium_memcpy / pentium_memset
   and matches pentium_cost in most fields.  The visible differences are:
   lea costs COSTS_N_INSNS (1) + 1 (slightly more than an add), variable
   shifts cost COSTS_N_INSNS (1), MOVE_RATIO is 17, and the scalar SSE
   add/mul/FMA/div/sqrt costs have their own values.
   The initializer is positional: each value is identified only by the
   comment on its line, so the order must stay in sync with
   struct processor_costs, which is declared outside this section.  */
374 static const
375 struct processor_costs lakemont_cost = {
376   COSTS_N_INSNS (1),			/* cost of an add instruction */
377   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
378   COSTS_N_INSNS (1),			/* variable shift costs */
379   COSTS_N_INSNS (1),			/* constant shift costs */
380   {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
381    COSTS_N_INSNS (11),			/*				 HI */
382    COSTS_N_INSNS (11),			/*				 SI */
383    COSTS_N_INSNS (11),			/*				 DI */
384    COSTS_N_INSNS (11)},			/*			      other */
385   0,					/* cost of multiply per each bit set */
386   {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
387    COSTS_N_INSNS (25),			/*			    HI */
388    COSTS_N_INSNS (25),			/*			    SI */
389    COSTS_N_INSNS (25),			/*			    DI */
390    COSTS_N_INSNS (25)},			/*			    other */
391   COSTS_N_INSNS (3),			/* cost of movsx */
392   COSTS_N_INSNS (2),			/* cost of movzx */
393   8,					/* "large" insn */
394   17,					/* MOVE_RATIO */
395 
396   /* All move costs are relative to integer->integer move times 2 and thus
397      they are latency*2. */
398   6,				     /* cost for loading QImode using movzbl */
399   {2, 4, 2},				/* cost of loading integer registers
400 					   in QImode, HImode and SImode.
401 					   Relative to reg-reg move (2).  */
402   {2, 4, 2},				/* cost of storing integer registers */
403   2,					/* cost of reg,reg fld/fst */
404   {2, 2, 6},				/* cost of loading fp registers
405 					   in SFmode, DFmode and XFmode */
406   {4, 4, 6},				/* cost of storing fp registers
407 					   in SFmode, DFmode and XFmode */
408   8,					/* cost of moving MMX register */
409   {8, 8},				/* cost of loading MMX registers
410 					   in SImode and DImode */
411   {8, 8},				/* cost of storing MMX registers
412 					   in SImode and DImode */
413   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
414   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
415 					   in 32,64,128,256 and 512-bit */
416   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
417   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
418 					   in 32,64,128,256 and 512-bit */
419   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
420   3, 3,					/* SSE->integer and integer->SSE moves */
421   4, 4,					/* Gather load static, per_elt.  */
422   4, 4,					/* Gather store static, per_elt.  */
423   8,					/* size of l1 cache.  */
424   8,					/* size of l2 cache  */
425   0,					/* size of prefetch block */
426   0,					/* number of parallel prefetches */
427   2,					/* Branch cost */
428   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
429   COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
430   COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
431   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
432   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
433   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
434 
435   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
436   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
437   COSTS_N_INSNS (5),			/* cost of MULSS instruction.  */
438   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
439   COSTS_N_INSNS (10),			/* cost of FMA SS instruction.  */
440   COSTS_N_INSNS (10),			/* cost of FMA SD instruction.  */
441   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
442   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
443   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
444   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
445   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
446   pentium_memcpy,
447   pentium_memset,
448   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
449   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
450 };
451 
452 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
453    (we ensure the alignment).  For small blocks an inline loop is still a
454    noticeable win; for bigger blocks either rep movsl or rep movsb is
455    the way to go.  Rep movsb apparently has a more expensive startup time
456    in the CPU, but after 4K the difference is down in the noise.

   In the first element below, rep_prefix_4_byte is the leading algorithm
   and the rows are {128, loop}, {1024, unrolled_loop},
   {8192, rep_prefix_4_byte} and {-1, rep_prefix_1_byte}.  The second
   element is the DUMMY_STRINGOP_ALGS placeholder.  Referenced by
   pentiumpro_cost.
   NOTE(review): the first number of each row is presumably the largest
   block size in bytes handled by that row, with -1 meaning "no limit" --
   confirm against the stringop_algs declaration.  */
457 static stringop_algs pentiumpro_memcpy[2] = {
458   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
459                        {8192, rep_prefix_4_byte, false},
460                        {-1, rep_prefix_1_byte, false}}},
461   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentiumpro_cost.  In the first
   element rep_prefix_4_byte is the leading algorithm and the rows are
   {1024, unrolled_loop}, {8192, rep_prefix_4_byte} and {-1, libcall}.
   The second element is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the first number of each row is presumably a block-size
   limit in bytes, with -1 meaning "no limit" -- confirm against the
   stringop_algs declaration.  */
462 static stringop_algs pentiumpro_memset[2] = {
463   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
464                        {8192, rep_prefix_4_byte, false},
465                        {-1, libcall, false}}},
466   DUMMY_STRINGOP_ALGS};
/* Cost table for the PentiumPro.  Instruction costs are written with
   COSTS_N_INSNS; the bare integers are raw values.  String operations
   use pentiumpro_memcpy / pentiumpro_memset.  Of the CPU tables before
   it in this file, this is the first with non-zero prefetch parameters
   (block size 32, 6 parallel prefetches).
   The initializer is positional: each value is identified only by the
   comment on its line, so the order must stay in sync with
   struct processor_costs, which is declared outside this section.  */
467 static const
468 struct processor_costs pentiumpro_cost = {
469   COSTS_N_INSNS (1),			/* cost of an add instruction */
470   COSTS_N_INSNS (1),			/* cost of a lea instruction */
471   COSTS_N_INSNS (1),			/* variable shift costs */
472   COSTS_N_INSNS (1),			/* constant shift costs */
473   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
474    COSTS_N_INSNS (4),			/*				 HI */
475    COSTS_N_INSNS (4),			/*				 SI */
476    COSTS_N_INSNS (4),			/*				 DI */
477    COSTS_N_INSNS (4)},			/*			      other */
478   0,					/* cost of multiply per each bit set */
479   {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
480    COSTS_N_INSNS (17),			/*			    HI */
481    COSTS_N_INSNS (17),			/*			    SI */
482    COSTS_N_INSNS (17),			/*			    DI */
483    COSTS_N_INSNS (17)},			/*			    other */
484   COSTS_N_INSNS (1),			/* cost of movsx */
485   COSTS_N_INSNS (1),			/* cost of movzx */
486   8,					/* "large" insn */
487   6,					/* MOVE_RATIO */
488 
489   /* All move costs are relative to integer->integer move times 2 and thus
490      they are latency*2. */
491   2,				     /* cost for loading QImode using movzbl */
492   {4, 4, 4},				/* cost of loading integer registers
493 					   in QImode, HImode and SImode.
494 					   Relative to reg-reg move (2).  */
495   {2, 2, 2},				/* cost of storing integer registers */
496   2,					/* cost of reg,reg fld/fst */
497   {2, 2, 6},				/* cost of loading fp registers
498 					   in SFmode, DFmode and XFmode */
499   {4, 4, 6},				/* cost of storing fp registers
500 					   in SFmode, DFmode and XFmode */
501   2,					/* cost of moving MMX register */
502   {2, 2},				/* cost of loading MMX registers
503 					   in SImode and DImode */
504   {2, 2},				/* cost of storing MMX registers
505 					   in SImode and DImode */
506   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
507   {4, 8, 16, 32, 64},			/* cost of loading SSE registers
508 					   in 32,64,128,256 and 512-bit */
509   {4, 8, 16, 32, 64},			/* cost of unaligned loads.  */
510   {4, 8, 16, 32, 64},			/* cost of storing SSE registers
511 					   in 32,64,128,256 and 512-bit */
512   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
513   3, 3,					/* SSE->integer and integer->SSE moves */
514   4, 4,					/* Gather load static, per_elt.  */
515   4, 4,					/* Gather store static, per_elt.  */
516   8,					/* size of l1 cache.  */
517   256,					/* size of l2 cache  */
518   32,					/* size of prefetch block */
519   6,					/* number of parallel prefetches */
520   2,					/* Branch cost */
521   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
522   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
523   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
524   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
525   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
526   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
527 
528   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
529   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
530   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
531   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
532   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
533   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
534   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
535   COSTS_N_INSNS (18),			/* cost of DIVSD instruction.  */
536   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
537   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
538   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
539   pentiumpro_memcpy,
540   pentiumpro_memset,
541   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
542   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
543 };
544 
/* memcpy strategy table referenced by geode_cost.  The first element
   names libcall as the leading algorithm and lists two rows,
   {256, rep_prefix_4_byte} and {-1, libcall}; the second element is the
   DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably "rep_prefix_4_byte up to 256 bytes, libcall
   beyond" -- confirm against the stringop_algs declaration.  */
545 static stringop_algs geode_memcpy[2] = {
546   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
547   DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by geode_cost.  Its initializer is
   the same as geode_memcpy's: libcall as the leading algorithm, rows
   {256, rep_prefix_4_byte} and {-1, libcall}, then the
   DUMMY_STRINGOP_ALGS placeholder.  */
548 static stringop_algs geode_memset[2] = {
549   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
550   DUMMY_STRINGOP_ALGS};
/* Cost table for Geode.  Instruction costs are written with
   COSTS_N_INSNS; the bare integers are raw values.  Unlike the CPU
   tables before it in this file, the multiply and divide costs differ
   per mode (QI < HI < SI = DI = other).  String operations use
   geode_memcpy / geode_memset.
   The initializer is positional: each value is identified only by the
   comment on its line, so the order must stay in sync with
   struct processor_costs, which is declared outside this section.  */
551 static const
552 struct processor_costs geode_cost = {
553   COSTS_N_INSNS (1),			/* cost of an add instruction */
554   COSTS_N_INSNS (1),			/* cost of a lea instruction */
555   COSTS_N_INSNS (2),			/* variable shift costs */
556   COSTS_N_INSNS (1),			/* constant shift costs */
557   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
558    COSTS_N_INSNS (4),			/*				 HI */
559    COSTS_N_INSNS (7),			/*				 SI */
560    COSTS_N_INSNS (7),			/*				 DI */
561    COSTS_N_INSNS (7)},			/*			      other */
562   0,					/* cost of multiply per each bit set */
563   {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
564    COSTS_N_INSNS (23),			/*			    HI */
565    COSTS_N_INSNS (39),			/*			    SI */
566    COSTS_N_INSNS (39),			/*			    DI */
567    COSTS_N_INSNS (39)},			/*			    other */
568   COSTS_N_INSNS (1),			/* cost of movsx */
569   COSTS_N_INSNS (1),			/* cost of movzx */
570   8,					/* "large" insn */
571   4,					/* MOVE_RATIO */
572 
573   /* All move costs are relative to integer->integer move times 2 and thus
574      they are latency*2. */
575   2,				     /* cost for loading QImode using movzbl */
576   {2, 2, 2},				/* cost of loading integer registers
577 					   in QImode, HImode and SImode.
578 					   Relative to reg-reg move (2).  */
579   {2, 2, 2},				/* cost of storing integer registers */
580   2,					/* cost of reg,reg fld/fst */
581   {2, 2, 2},				/* cost of loading fp registers
582 					   in SFmode, DFmode and XFmode */
583   {4, 6, 6},				/* cost of storing fp registers
584 					   in SFmode, DFmode and XFmode */
585 
586   2,					/* cost of moving MMX register */
587   {2, 2},				/* cost of loading MMX registers
588 					   in SImode and DImode */
589   {2, 2},				/* cost of storing MMX registers
590 					   in SImode and DImode */
591   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
592   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
593 					   in 32,64,128,256 and 512-bit */
594   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
595   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
596 					   in 32,64,128,256 and 512-bit */
597   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
598   6, 6,					/* SSE->integer and integer->SSE moves */
599   2, 2,					/* Gather load static, per_elt.  */
600   2, 2,					/* Gather store static, per_elt.  */
601   64,					/* size of l1 cache.  */
602   128,					/* size of l2 cache.  */
603   32,					/* size of prefetch block */
604   1,					/* number of parallel prefetches */
605   1,					/* Branch cost */
606   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
607   COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
608   COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
609   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
610   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
611   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
612 
613   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
614   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
615   COSTS_N_INSNS (11),			/* cost of MULSS instruction.  */
616   COSTS_N_INSNS (11),			/* cost of MULSD instruction.  */
617   COSTS_N_INSNS (17),			/* cost of FMA SS instruction.  */
618   COSTS_N_INSNS (17),			/* cost of FMA SD instruction.  */
619   COSTS_N_INSNS (47),			/* cost of DIVSS instruction.  */
620   COSTS_N_INSNS (47),			/* cost of DIVSD instruction.  */
621   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
622   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
623   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
624   geode_memcpy,
625   geode_memset,
626   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
627   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
628 };
629 
/* memcpy strategy table for the K6 (the k6_cost table that follows is
   the expected user).  The first element names libcall as the leading
   algorithm and lists two rows, {256, rep_prefix_4_byte} and
   {-1, libcall}; the second element is the DUMMY_STRINGOP_ALGS
   placeholder.
   NOTE(review): presumably "rep_prefix_4_byte up to 256 bytes, libcall
   beyond" -- confirm against the stringop_algs declaration.  */
630 static stringop_algs k6_memcpy[2] = {
631   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632   DUMMY_STRINGOP_ALGS};
/* memset strategy table for the K6 (the k6_cost table that follows is
   the expected user).  Its initializer is the same as k6_memcpy's:
   libcall as the leading algorithm, rows {256, rep_prefix_4_byte} and
   {-1, libcall}, then the DUMMY_STRINGOP_ALGS placeholder.  */
633 static stringop_algs k6_memset[2] = {
634   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635   DUMMY_STRINGOP_ALGS};
636 static const
637 struct processor_costs k6_cost = {	/* operation costs for K6 */
638   COSTS_N_INSNS (1),			/* cost of an add instruction */
639   COSTS_N_INSNS (2),			/* cost of a lea instruction */
640   COSTS_N_INSNS (1),			/* variable shift costs */
641   COSTS_N_INSNS (1),			/* constant shift costs */
642   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
643    COSTS_N_INSNS (3),			/*				 HI */
644    COSTS_N_INSNS (3),			/*				 SI */
645    COSTS_N_INSNS (3),			/*				 DI */
646    COSTS_N_INSNS (3)},			/*			      other */
647   0,					/* cost of multiply per each bit set */
648   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
649    COSTS_N_INSNS (18),			/*			    HI */
650    COSTS_N_INSNS (18),			/*			    SI */
651    COSTS_N_INSNS (18),			/*			    DI */
652    COSTS_N_INSNS (18)},			/*			    other */
653   COSTS_N_INSNS (2),			/* cost of movsx */
654   COSTS_N_INSNS (2),			/* cost of movzx */
655   8,					/* "large" insn */
656   4,					/* MOVE_RATIO */
657 
658   /* All move costs are relative to an integer->integer move times 2, and
659      thus they are latency*2.  */
660   3,				     /* cost for loading QImode using movzbl */
661   {4, 5, 4},				/* cost of loading integer registers
662 					   in QImode, HImode and SImode.
663 					   Relative to reg-reg move (2).  */
664   {2, 3, 2},				/* cost of storing integer registers */
665   4,					/* cost of reg,reg fld/fst */
666   {6, 6, 6},				/* cost of loading fp registers
667 					   in SFmode, DFmode and XFmode */
668   {4, 4, 4},				/* cost of storing fp registers
669 					   in SFmode, DFmode and XFmode */
670   2,					/* cost of moving MMX register */
671   {2, 2},				/* cost of loading MMX registers
672 					   in SImode and DImode */
673   {2, 2},				/* cost of storing MMX registers
674 					   in SImode and DImode */
675   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
676   {2, 2, 8, 16, 32},			/* cost of loading SSE registers
677 					   in 32,64,128,256 and 512-bit */
678   {2, 2, 8, 16, 32},			/* cost of unaligned loads.  */
679   {2, 2, 8, 16, 32},			/* cost of storing SSE registers
680 					   in 32,64,128,256 and 512-bit */
681   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
682   6, 6,					/* SSE->integer and integer->SSE moves */
683   2, 2,					/* Gather load static, per_elt.  */
684   2, 2,					/* Gather store static, per_elt.  */
685   32,					/* size of l1 cache.  */
686   32,					/* size of l2 cache.  Some models
687 					   have integrated l2 cache, but
688 					   optimizing for k6 is not important
689 					   enough to worry about that.  */
690   32,					/* size of prefetch block */
691   1,					/* number of parallel prefetches */
692   1,					/* Branch cost */
693   COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
694   COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
695   COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
696   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
697   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
698   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
699 
700   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
701   COSTS_N_INSNS (2),			/* cost of ADDSS/SD SUBSS/SD insns.  */
702   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
703   COSTS_N_INSNS (2),			/* cost of MULSD instruction.  */
704   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
705   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
706   COSTS_N_INSNS (56),			/* cost of DIVSS instruction.  */
707   COSTS_N_INSNS (56),			/* cost of DIVSD instruction.  */
708   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
709   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
710   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
711   k6_memcpy,				/* memcpy algorithm table defined above */
712   k6_memset,				/* memset algorithm table defined above */
713   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
714   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
715 };
716 
717 /* For some reason, Athlon deals better with the REP prefix (relative to
718    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
719    and 128 bytes for memset.  */
720 static stringop_algs athlon_memcpy[2] = {	/* memcpy algorithm table referenced by athlon_cost */
721   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
722   DUMMY_STRINGOP_ALGS};			/* [1] is the libcall-only dummy entry */
723 static stringop_algs athlon_memset[2] = {	/* memset algorithm table referenced by athlon_cost */
724   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
725   DUMMY_STRINGOP_ALGS};			/* [1] is the libcall-only dummy entry */
726 static const
727 struct processor_costs athlon_cost = {	/* operation costs for Athlon */
728   COSTS_N_INSNS (1),			/* cost of an add instruction */
729   COSTS_N_INSNS (2),			/* cost of a lea instruction */
730   COSTS_N_INSNS (1),			/* variable shift costs */
731   COSTS_N_INSNS (1),			/* constant shift costs */
732   {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
733    COSTS_N_INSNS (5),			/*				 HI */
734    COSTS_N_INSNS (5),			/*				 SI */
735    COSTS_N_INSNS (5),			/*				 DI */
736    COSTS_N_INSNS (5)},			/*			      other */
737   0,					/* cost of multiply per each bit set */
738   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
739    COSTS_N_INSNS (26),			/*			    HI */
740    COSTS_N_INSNS (42),			/*			    SI */
741    COSTS_N_INSNS (74),			/*			    DI */
742    COSTS_N_INSNS (74)},			/*			    other */
743   COSTS_N_INSNS (1),			/* cost of movsx */
744   COSTS_N_INSNS (1),			/* cost of movzx */
745   8,					/* "large" insn */
746   9,					/* MOVE_RATIO */
747 
748   /* All move costs are relative to an integer->integer move times 2, and
749      thus they are latency*2.  */
750   4,				     /* cost for loading QImode using movzbl */
751   {3, 4, 3},				/* cost of loading integer registers
752 					   in QImode, HImode and SImode.
753 					   Relative to reg-reg move (2).  */
754   {3, 4, 3},				/* cost of storing integer registers */
755   4,					/* cost of reg,reg fld/fst */
756   {4, 4, 12},				/* cost of loading fp registers
757 					   in SFmode, DFmode and XFmode */
758   {6, 6, 8},				/* cost of storing fp registers
759 					   in SFmode, DFmode and XFmode */
760   2,					/* cost of moving MMX register */
761   {4, 4},				/* cost of loading MMX registers
762 					   in SImode and DImode */
763   {4, 4},				/* cost of storing MMX registers
764 					   in SImode and DImode */
765   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
766   {4, 4, 6, 12, 24},			/* cost of loading SSE registers
767 					   in 32,64,128,256 and 512-bit */
768   {4, 4, 6, 12, 24},			/* cost of unaligned loads.  */
769   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
770 					   in 32,64,128,256 and 512-bit */
771   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
772   5, 5,					/* SSE->integer and integer->SSE moves */
773   4, 4,					/* Gather load static, per_elt.  */
774   4, 4,					/* Gather store static, per_elt.  */
775   64,					/* size of l1 cache.  */
776   256,					/* size of l2 cache.  */
777   64,					/* size of prefetch block */
778   6,					/* number of parallel prefetches */
779   5,					/* Branch cost */
780   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
781   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
782   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
783   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
784   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
785   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
786 
787   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
788   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
789   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
790   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
791   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
792   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
793   /* 11-16  */
794   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
795   COSTS_N_INSNS (24),			/* cost of DIVSD instruction.  */
796   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
797   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
798   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
799   athlon_memcpy,			/* memcpy algorithm table defined above */
800   athlon_memset,			/* memset algorithm table defined above */
801   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
802   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
803 };
804 
805 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
806    small blocks it is better to use a loop.  For large blocks, a libcall can
807    use non-temporal accesses and beat inline code considerably.  */
808 static stringop_algs k8_memcpy[2] = {	/* memcpy algorithm table referenced by k8_cost */
809   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
810              {-1, rep_prefix_4_byte, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
811   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
812              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
813 static stringop_algs k8_memset[2] = {	/* memset algorithm table referenced by k8_cost */
814   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
815              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
816   {libcall, {{48, unrolled_loop, false},
817              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
818 static const
819 struct processor_costs k8_cost = {	/* operation costs for K8 */
820   COSTS_N_INSNS (1),			/* cost of an add instruction */
821   COSTS_N_INSNS (2),			/* cost of a lea instruction */
822   COSTS_N_INSNS (1),			/* variable shift costs */
823   COSTS_N_INSNS (1),			/* constant shift costs */
824   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
825    COSTS_N_INSNS (4),			/*				 HI */
826    COSTS_N_INSNS (3),			/*				 SI */
827    COSTS_N_INSNS (4),			/*				 DI */
828    COSTS_N_INSNS (5)},			/*			      other */
829   0,					/* cost of multiply per each bit set */
830   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
831    COSTS_N_INSNS (26),			/*			    HI */
832    COSTS_N_INSNS (42),			/*			    SI */
833    COSTS_N_INSNS (74),			/*			    DI */
834    COSTS_N_INSNS (74)},			/*			    other */
835   COSTS_N_INSNS (1),			/* cost of movsx */
836   COSTS_N_INSNS (1),			/* cost of movzx */
837   8,					/* "large" insn */
838   9,					/* MOVE_RATIO */
839 
840   /* All move costs are relative to an integer->integer move times 2, and
841      thus they are latency*2.  */
842   4,				     /* cost for loading QImode using movzbl */
843   {3, 4, 3},				/* cost of loading integer registers
844 					   in QImode, HImode and SImode.
845 					   Relative to reg-reg move (2).  */
846   {3, 4, 3},				/* cost of storing integer registers */
847   4,					/* cost of reg,reg fld/fst */
848   {4, 4, 12},				/* cost of loading fp registers
849 					   in SFmode, DFmode and XFmode */
850   {6, 6, 8},				/* cost of storing fp registers
851 					   in SFmode, DFmode and XFmode */
852   2,					/* cost of moving MMX register */
853   {3, 3},				/* cost of loading MMX registers
854 					   in SImode and DImode */
855   {4, 4},				/* cost of storing MMX registers
856 					   in SImode and DImode */
857   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
858   {4, 3, 6, 12, 24},			/* cost of loading SSE registers
859 					   in 32,64,128,256 and 512-bit */
860   {4, 3, 6, 12, 24},			/* cost of unaligned loads.  */
861   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
862 					   in 32,64,128,256 and 512-bit */
863   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
864   5, 5,					/* SSE->integer and integer->SSE moves */
865   4, 4,					/* Gather load static, per_elt.  */
866   4, 4,					/* Gather store static, per_elt.  */
867   64,					/* size of l1 cache.  */
868   512,					/* size of l2 cache.  */
869   64,					/* size of prefetch block */
870   /* New AMD processors never drop prefetches; if they cannot be performed
871      immediately, they are queued.  We set the number of simultaneous
872      prefetches to a large constant to reflect this (leaving the number of
873      prefetches unlimited is probably not a good idea, as their execution
874      also takes some time).  */
875   100,					/* number of parallel prefetches */
876   3,					/* Branch cost */
877   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
878   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
879   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
880   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
881   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
882   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
883 
884   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
885   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
886   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
887   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
888   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
889   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
890   /* 11-16  */
891   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
892   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
893   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
894   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
895   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
896   k8_memcpy,				/* memcpy algorithm table defined above */
897   k8_memset,				/* memset algorithm table defined above */
898   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
899   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
900 };
901 
902 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
903    very small blocks it is better to use a loop.  For large blocks, a libcall
904    can use non-temporal accesses and beat inline code considerably.  */
905 static stringop_algs amdfam10_memcpy[2] = {	/* memcpy algorithm table referenced by amdfam10_cost */
906   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
907              {-1, rep_prefix_4_byte, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
908   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
910 static stringop_algs amdfam10_memset[2] = {	/* memset algorithm table referenced by amdfam10_cost */
911   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
912              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
913   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
914              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
915 struct processor_costs amdfam10_cost = {	/* operation costs for AMDFAM10.  NOTE(review): not static const, unlike k8_cost -- confirm intended */
916   COSTS_N_INSNS (1),			/* cost of an add instruction */
917   COSTS_N_INSNS (2),			/* cost of a lea instruction */
918   COSTS_N_INSNS (1),			/* variable shift costs */
919   COSTS_N_INSNS (1),			/* constant shift costs */
920   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
921    COSTS_N_INSNS (4),			/*				 HI */
922    COSTS_N_INSNS (3),			/*				 SI */
923    COSTS_N_INSNS (4),			/*				 DI */
924    COSTS_N_INSNS (5)},			/*			      other */
925   0,					/* cost of multiply per each bit set */
926   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
927    COSTS_N_INSNS (35),			/*			    HI */
928    COSTS_N_INSNS (51),			/*			    SI */
929    COSTS_N_INSNS (83),			/*			    DI */
930    COSTS_N_INSNS (83)},			/*			    other */
931   COSTS_N_INSNS (1),			/* cost of movsx */
932   COSTS_N_INSNS (1),			/* cost of movzx */
933   8,					/* "large" insn */
934   9,					/* MOVE_RATIO */
935 
936   /* All move costs are relative to an integer->integer move times 2, and
937      thus they are latency*2.  */
938   4,				     /* cost for loading QImode using movzbl */
939   {3, 4, 3},				/* cost of loading integer registers
940 					   in QImode, HImode and SImode.
941 					   Relative to reg-reg move (2).  */
942   {3, 4, 3},				/* cost of storing integer registers */
943   4,					/* cost of reg,reg fld/fst */
944   {4, 4, 12},				/* cost of loading fp registers
945 					   in SFmode, DFmode and XFmode */
946   {6, 6, 8},				/* cost of storing fp registers
947 					   in SFmode, DFmode and XFmode */
948   2,					/* cost of moving MMX register */
949   {3, 3},				/* cost of loading MMX registers
950 					   in SImode and DImode */
951   {4, 4},				/* cost of storing MMX registers
952 					   in SImode and DImode */
953   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
954   {4, 4, 3, 6, 12},			/* cost of loading SSE registers
955 					   in 32,64,128,256 and 512-bit */
956   {4, 4, 3, 7, 12},			/* cost of unaligned loads.  NOTE(review): 256-bit entry is 7 here vs. 6 for aligned loads -- confirm intended.  */
957   {4, 4, 5, 10, 20},			/* cost of storing SSE registers
958 					   in 32,64,128,256 and 512-bit */
959   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
960   3, 3,					/* SSE->integer and integer->SSE moves */
961   					/* On K8:
962   					    MOVD reg64, xmmreg Double FSTORE 4
963 					    MOVD reg32, xmmreg Double FSTORE 4
964 					   On AMDFAM10:
965 					    MOVD reg64, xmmreg Double FADD 3
966 							       1/1  1/1
967 					    MOVD reg32, xmmreg Double FADD 3
968 							       1/1  1/1 */
969   4, 4,					/* Gather load static, per_elt.  */
970   4, 4,					/* Gather store static, per_elt.  */
971   64,					/* size of l1 cache.  */
972   512,					/* size of l2 cache.  */
973   64,					/* size of prefetch block */
974   /* New AMD processors never drop prefetches; if they cannot be performed
975      immediately, they are queued.  We set the number of simultaneous
976      prefetches to a large constant to reflect this (leaving the number of
977      prefetches unlimited is probably not a good idea, as their execution
978      also takes some time).  */
979   100,					/* number of parallel prefetches */
980   2,					/* Branch cost */
981   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
982   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
983   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
984   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
985   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
986   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
987 
988   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
989   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
990   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
991   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
992   COSTS_N_INSNS (8),			/* cost of FMA SS instruction.  */
993   COSTS_N_INSNS (8),			/* cost of FMA SD instruction.  */
994   /* 11-16  */
995   COSTS_N_INSNS (16),			/* cost of DIVSS instruction.  */
996   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
997   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
998   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
999   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1000   amdfam10_memcpy,			/* memcpy algorithm table defined above */
1001   amdfam10_memset,			/* memset algorithm table defined above */
1002   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1003   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1004 };
1005 
1006 /*  BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1007     very small blocks it is better to use a loop.  For large blocks, a libcall
1008     can use non-temporal accesses and beat inline code considerably.  */
1009 static stringop_algs bdver1_memcpy[2] = {	/* memcpy algorithm table referenced by bdver1_cost */
1010   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1011              {-1, rep_prefix_4_byte, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
1012   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1013              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
1014 static stringop_algs bdver1_memset[2] = {	/* memset algorithm table referenced by bdver1_cost */
1015   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1016              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
1017   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1018              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
1019 
1020 const struct processor_costs bdver1_cost = {	/* operation costs for BDVER1.  NOTE(review): const but not static, unlike k8_cost -- confirm intended */
1021   COSTS_N_INSNS (1),			/* cost of an add instruction */
1022   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1023   COSTS_N_INSNS (1),			/* variable shift costs */
1024   COSTS_N_INSNS (1),			/* constant shift costs */
1025   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1026    COSTS_N_INSNS (4),			/*				 HI */
1027    COSTS_N_INSNS (4),			/*				 SI */
1028    COSTS_N_INSNS (6),			/*				 DI */
1029    COSTS_N_INSNS (6)},			/*			      other */
1030   0,					/* cost of multiply per each bit set */
1031   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1032    COSTS_N_INSNS (35),			/*			    HI */
1033    COSTS_N_INSNS (51),			/*			    SI */
1034    COSTS_N_INSNS (83),			/*			    DI */
1035    COSTS_N_INSNS (83)},			/*			    other */
1036   COSTS_N_INSNS (1),			/* cost of movsx */
1037   COSTS_N_INSNS (1),			/* cost of movzx */
1038   8,					/* "large" insn */
1039   9,					/* MOVE_RATIO */
1040 
1041   /* All move costs are relative to an integer->integer move times 2, and
1042      thus they are latency*2.  */
1043   8,				     /* cost for loading QImode using movzbl */
1044   {8, 8, 8},				/* cost of loading integer registers
1045 					   in QImode, HImode and SImode.
1046 					   Relative to reg-reg move (2).  */
1047   {8, 8, 8},				/* cost of storing integer registers */
1048   4,					/* cost of reg,reg fld/fst */
1049   {12, 12, 28},				/* cost of loading fp registers
1050 					   in SFmode, DFmode and XFmode */
1051   {10, 10, 18},				/* cost of storing fp registers
1052 					   in SFmode, DFmode and XFmode */
1053   4,					/* cost of moving MMX register */
1054   {12, 12},				/* cost of loading MMX registers
1055 					   in SImode and DImode */
1056   {10, 10},				/* cost of storing MMX registers
1057 					   in SImode and DImode */
1058   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1059   {12, 12, 10, 20, 30},			/* cost of loading SSE registers
1060 					   in 32,64,128,256 and 512-bit */
1061   {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
1062   {10, 10, 10, 20, 30},			/* cost of storing SSE registers
1063 					   in 32,64,128,256 and 512-bit */
1064   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
1065   16, 20,				/* SSE->integer and integer->SSE moves */
1066   12, 12,				/* Gather load static, per_elt.  */
1067   10, 10,				/* Gather store static, per_elt.  */
1068   16,					/* size of l1 cache.  */
1069   2048,					/* size of l2 cache.  */
1070   64,					/* size of prefetch block */
1071   /* New AMD processors never drop prefetches; if they cannot be performed
1072      immediately, they are queued.  We set the number of simultaneous
1073      prefetches to a large constant to reflect this (leaving the number of
1074      prefetches unlimited is probably not a good idea, as their execution
1075      also takes some time).  */
1076   100,					/* number of parallel prefetches */
1077   2,					/* Branch cost */
1078   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1079   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1080   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1081   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1082   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1083   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1084 
1085   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1086   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1087   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1088   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1089   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1090   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1091   /* 9-24  */
1092   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1093   /* 9-27  */
1094   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1095   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1096   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1097   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1098   bdver1_memcpy,			/* memcpy algorithm table defined above */
1099   bdver1_memset,			/* memset algorithm table defined above */
1100   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1101   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1102 };
1103 
1104 /*  BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1105     very small blocks it is better to use a loop.  For large blocks, a libcall
1106     can use non-temporal accesses and beat inline code considerably.  */
1107 
1108 static stringop_algs bdver2_memcpy[2] = {	/* memcpy algorithm table referenced by bdver2_cost */
1109   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1110              {-1, rep_prefix_4_byte, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
1111   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1112              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
1113 static stringop_algs bdver2_memset[2] = {	/* memset algorithm table referenced by bdver2_cost */
1114   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1115              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
1116   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1117              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
1118 
1119 const struct processor_costs bdver2_cost = {	/* operation costs for BDVER2.  NOTE(review): const but not static, unlike k8_cost -- confirm intended */
1120   COSTS_N_INSNS (1),			/* cost of an add instruction */
1121   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1122   COSTS_N_INSNS (1),			/* variable shift costs */
1123   COSTS_N_INSNS (1),			/* constant shift costs */
1124   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1125    COSTS_N_INSNS (4),			/*				 HI */
1126    COSTS_N_INSNS (4),			/*				 SI */
1127    COSTS_N_INSNS (6),			/*				 DI */
1128    COSTS_N_INSNS (6)},			/*			      other */
1129   0,					/* cost of multiply per each bit set */
1130   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1131    COSTS_N_INSNS (35),			/*			    HI */
1132    COSTS_N_INSNS (51),			/*			    SI */
1133    COSTS_N_INSNS (83),			/*			    DI */
1134    COSTS_N_INSNS (83)},			/*			    other */
1135   COSTS_N_INSNS (1),			/* cost of movsx */
1136   COSTS_N_INSNS (1),			/* cost of movzx */
1137   8,					/* "large" insn */
1138   9,					/* MOVE_RATIO */
1139 
1140   /* All move costs are relative to an integer->integer move times 2, and
1141      thus they are latency*2.  */
1142   8,				     /* cost for loading QImode using movzbl */
1143   {8, 8, 8},				/* cost of loading integer registers
1144 					   in QImode, HImode and SImode.
1145 					   Relative to reg-reg move (2).  */
1146   {8, 8, 8},				/* cost of storing integer registers */
1147   4,					/* cost of reg,reg fld/fst */
1148   {12, 12, 28},				/* cost of loading fp registers
1149 					   in SFmode, DFmode and XFmode */
1150   {10, 10, 18},				/* cost of storing fp registers
1151 					   in SFmode, DFmode and XFmode */
1152   4,					/* cost of moving MMX register */
1153   {12, 12},				/* cost of loading MMX registers
1154 					   in SImode and DImode */
1155   {10, 10},				/* cost of storing MMX registers
1156 					   in SImode and DImode */
1157   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1158   {12, 12, 10, 20, 30},			/* cost of loading SSE registers
1159 					   in 32,64,128,256 and 512-bit */
1160   {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
1161   {10, 10, 10, 20, 30},			/* cost of storing SSE registers
1162 					   in 32,64,128,256 and 512-bit */
1163   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
1164   16, 20,				/* SSE->integer and integer->SSE moves */
1165   12, 12,				/* Gather load static, per_elt.  */
1166   10, 10,				/* Gather store static, per_elt.  */
1167   16,					/* size of l1 cache.  */
1168   2048,					/* size of l2 cache.  */
1169   64,					/* size of prefetch block */
1170   /* New AMD processors never drop prefetches; if they cannot be performed
1171      immediately, they are queued.  We set the number of simultaneous
1172      prefetches to a large constant to reflect this (leaving the number of
1173      prefetches unlimited is probably not a good idea, as their execution
1174      also takes some time).  */
1175   100,					/* number of parallel prefetches */
1176   2,					/* Branch cost */
1177   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1178   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1179   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1180   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1181   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1182   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1183 
1184   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1185   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1186   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1187   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1188   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1189   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1190   /* 9-24  */
1191   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1192   /* 9-27  */
1193   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1194   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1195   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1196   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1197   bdver2_memcpy,			/* memcpy algorithm table defined above */
1198   bdver2_memset,			/* memset algorithm table defined above */
1199   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1200   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1201 };
1202 
1203 
1204 /*  BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1205     very small blocks it is better to use a loop.  For large blocks, a libcall
1206     can use non-temporal accesses and beat inline code considerably.  */
1207 static stringop_algs bdver3_memcpy[2] = {	/* memcpy algorithm table referenced by bdver3_cost */
1208   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1209              {-1, rep_prefix_4_byte, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
1210   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1211              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
1212 static stringop_algs bdver3_memset[2] = {	/* memset algorithm table referenced by bdver3_cost */
1213   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1214              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},	/* [0]: presumably the 32-bit entry -- confirm */
1215   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1216              {-1, libcall, false}}}};	/* [1]: presumably the 64-bit entry -- confirm */
1217 struct processor_costs bdver3_cost = {	/* operation costs for BDVER3.  NOTE(review): not static const, unlike k8_cost -- confirm intended */
1218   COSTS_N_INSNS (1),			/* cost of an add instruction */
1219   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1220   COSTS_N_INSNS (1),			/* variable shift costs */
1221   COSTS_N_INSNS (1),			/* constant shift costs */
1222   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1223    COSTS_N_INSNS (4),			/*				 HI */
1224    COSTS_N_INSNS (4),			/*				 SI */
1225    COSTS_N_INSNS (6),			/*				 DI */
1226    COSTS_N_INSNS (6)},			/*			      other */
1227   0,					/* cost of multiply per each bit set */
1228   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1229    COSTS_N_INSNS (35),			/*			    HI */
1230    COSTS_N_INSNS (51),			/*			    SI */
1231    COSTS_N_INSNS (83),			/*			    DI */
1232    COSTS_N_INSNS (83)},			/*			    other */
1233   COSTS_N_INSNS (1),			/* cost of movsx */
1234   COSTS_N_INSNS (1),			/* cost of movzx */
1235   8,					/* "large" insn */
1236   9,					/* MOVE_RATIO */
1237 
1238   /* All move costs are relative to an integer->integer move times 2, and
1239      thus they are latency*2.  */
1240   8,				     /* cost for loading QImode using movzbl */
1241   {8, 8, 8},				/* cost of loading integer registers
1242 					   in QImode, HImode and SImode.
1243 					   Relative to reg-reg move (2).  */
1244   {8, 8, 8},				/* cost of storing integer registers */
1245   4,					/* cost of reg,reg fld/fst */
1246   {12, 12, 28},				/* cost of loading fp registers
1247 					   in SFmode, DFmode and XFmode */
1248   {10, 10, 18},				/* cost of storing fp registers
1249 					   in SFmode, DFmode and XFmode */
1250   4,					/* cost of moving MMX register */
1251   {12, 12},				/* cost of loading MMX registers
1252 					   in SImode and DImode */
1253   {10, 10},				/* cost of storing MMX registers
1254 					   in SImode and DImode */
1255   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1256   {12, 12, 10, 20, 30},			/* cost of loading SSE registers
1257 					   in 32,64,128,256 and 512-bit */
1258   {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
1259   {10, 10, 10, 20, 30},			/* cost of storing SSE registers
1260 					   in 32,64,128,256 and 512-bit */
1261   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
1262   16, 20,				/* SSE->integer and integer->SSE moves */
1263   12, 12,				/* Gather load static, per_elt.  */
1264   10, 10,				/* Gather store static, per_elt.  */
1265   16,					/* size of l1 cache.  */
1266   2048,					/* size of l2 cache.  */
1267   64,					/* size of prefetch block */
1268   /* New AMD processors never drop prefetches; if they cannot be performed
1269      immediately, they are queued.  We set the number of simultaneous
1270      prefetches to a large constant to reflect this (leaving the number of
1271      prefetches unlimited is probably not a good idea, as their execution
1272      also takes some time).  */
1273   100,					/* number of parallel prefetches */
1274   2,					/* Branch cost */
1275   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1276   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1277   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1278   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1279   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1280   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1281 
1282   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1283   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1284   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1285   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1286   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1287   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1288   /* 9-24  */
1289   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1290   /* 9-27  */
1291   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1292   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1293   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1294   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1295   bdver3_memcpy,			/* memcpy algorithm table defined above */
1296   bdver3_memset,			/* memset algorithm table defined above */
1297   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1298   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1299 };
1300 
/*  BDVER4 has optimized REP instruction for medium sized blocks, but for
    very small blocks it is better to use loop.  For large blocks, libcall
    can do non-temporal accesses and beat inline considerably.  */
1304 static stringop_algs bdver4_memcpy[2] = {
1305   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1306              {-1, rep_prefix_4_byte, false}}},
1307   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1308              {-1, libcall, false}}}};
1309 static stringop_algs bdver4_memset[2] = {
1310   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1311              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1312   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1313              {-1, libcall, false}}}};
1314 struct processor_costs bdver4_cost = {
1315   COSTS_N_INSNS (1),			/* cost of an add instruction */
1316   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1317   COSTS_N_INSNS (1),			/* variable shift costs */
1318   COSTS_N_INSNS (1),			/* constant shift costs */
1319   {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
1320    COSTS_N_INSNS (4),			/*				 HI */
1321    COSTS_N_INSNS (4),			/*				 SI */
1322    COSTS_N_INSNS (6),			/*				 DI */
1323    COSTS_N_INSNS (6)},			/*			      other */
1324   0,					/* cost of multiply per each bit set */
1325   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1326    COSTS_N_INSNS (35),			/*			    HI */
1327    COSTS_N_INSNS (51),			/*			    SI */
1328    COSTS_N_INSNS (83),			/*			    DI */
1329    COSTS_N_INSNS (83)},			/*			    other */
1330   COSTS_N_INSNS (1),			/* cost of movsx */
1331   COSTS_N_INSNS (1),			/* cost of movzx */
1332   8,					/* "large" insn */
1333   9,					/* MOVE_RATIO */
1334 
1335   /* All move costs are relative to integer->integer move times 2 and thus
1336      they are latency*2. */
1337   8,				     /* cost for loading QImode using movzbl */
1338   {8, 8, 8},				/* cost of loading integer registers
1339 					   in QImode, HImode and SImode.
1340 					   Relative to reg-reg move (2).  */
1341   {8, 8, 8},				/* cost of storing integer registers */
1342   4,					/* cost of reg,reg fld/fst */
1343   {12, 12, 28},				/* cost of loading fp registers
1344 		   			   in SFmode, DFmode and XFmode */
1345   {10, 10, 18},				/* cost of storing fp registers
1346  		   			   in SFmode, DFmode and XFmode */
1347   4,					/* cost of moving MMX register */
1348   {12, 12},				/* cost of loading MMX registers
1349 					   in SImode and DImode */
1350   {10, 10},				/* cost of storing MMX registers
1351 					   in SImode and DImode */
1352   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1353   {12, 12, 10, 20, 30},			/* cost of loading SSE registers
1354 					   in 32,64,128,256 and 512-bit */
1355   {12, 12, 10, 20, 30},			/* cost of unaligned loads.  */
1356   {10, 10, 10, 20, 30},			/* cost of storing SSE registers
1357 					   in 32,64,128,256 and 512-bit */
1358   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
1359   16, 20,				/* SSE->integer and integer->SSE moves */
1360   12, 12,				/* Gather load static, per_elt.  */
1361   10, 10,				/* Gather store static, per_elt.  */
1362   16,					/* size of l1 cache.  */
1363   2048,					/* size of l2 cache.  */
1364   64,					/* size of prefetch block */
1365   /* New AMD processors never drop prefetches; if they cannot be performed
1366      immediately, they are queued.  We set number of simultaneous prefetches
1367      to a large constant to reflect this (it probably is not a good idea not
1368      to limit number of prefetches at all, as their execution also takes some
1369      time).  */
1370   100,					/* number of parallel prefetches */
1371   2,					/* Branch cost */
1372   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1373   COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
1374   COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
1375   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1376   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1377   COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
1378 
1379   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1380   COSTS_N_INSNS (6),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1381   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1382   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1383   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1384   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1385   /* 9-24  */
1386   COSTS_N_INSNS (24),			/* cost of DIVSS instruction.  */
1387   /* 9-27  */
1388   COSTS_N_INSNS (27),			/* cost of DIVSD instruction.  */
1389   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
1390   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
1391   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1392   bdver4_memcpy,
1393   bdver4_memset,
1394   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1395   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1396 };
1397 
1398 
/*  ZNVER1 has optimized REP instruction for medium sized blocks, but for
    very small blocks it is better to use loop.  For large blocks, libcall
    can do non-temporal accesses and beat inline considerably.  */
1402 static stringop_algs znver1_memcpy[2] = {
1403   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1404 	     {-1, rep_prefix_4_byte, false}}},
1405   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1406 	     {-1, libcall, false}}}};
1407 static stringop_algs znver1_memset[2] = {
1408   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1409 	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1410   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1411 	     {-1, libcall, false}}}};
1412 struct processor_costs znver1_cost = {
1413   COSTS_N_INSNS (1),			/* cost of an add instruction.  */
1414   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
1415   COSTS_N_INSNS (1),			/* variable shift costs.  */
1416   COSTS_N_INSNS (1),			/* constant shift costs.  */
1417   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
1418    COSTS_N_INSNS (3),			/*				 HI.  */
1419    COSTS_N_INSNS (3),			/*				 SI.  */
1420    COSTS_N_INSNS (3),			/*				 DI.  */
1421    COSTS_N_INSNS (3)},			/*			      other.  */
1422   0,					/* cost of multiply per each bit
1423 					    set.  */
1424    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1425       bound.  */
1426   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI.  */
1427    COSTS_N_INSNS (22),			/*			    HI.  */
1428    COSTS_N_INSNS (30),			/*			    SI.  */
1429    COSTS_N_INSNS (45),			/*			    DI.  */
1430    COSTS_N_INSNS (45)},			/*			    other.  */
1431   COSTS_N_INSNS (1),			/* cost of movsx.  */
1432   COSTS_N_INSNS (1),			/* cost of movzx.  */
1433   8,					/* "large" insn.  */
1434   9,					/* MOVE_RATIO.  */
1435 
1436   /* All move costs are relative to integer->integer move times 2 and thus
1437      they are latency*2. */
1438 
1439   /* reg-reg moves are done by renaming and thus they are even cheaper than
1440      1 cycle. Becuase reg-reg move cost is 2 and the following tables correspond
1441      to doubles of latencies, we do not model this correctly.  It does not
1442      seem to make practical difference to bump prices up even more.  */
1443   6,					/* cost for loading QImode using
1444 					   movzbl.  */
1445   {6, 6, 6},				/* cost of loading integer registers
1446 					   in QImode, HImode and SImode.
1447 					   Relative to reg-reg move (2).  */
1448   {8, 8, 8},				/* cost of storing integer
1449 					   registers.  */
1450   2,					/* cost of reg,reg fld/fst.  */
1451   {6, 6, 16},				/* cost of loading fp registers
1452 		   			   in SFmode, DFmode and XFmode.  */
1453   {8, 8, 16},				/* cost of storing fp registers
1454  		   			   in SFmode, DFmode and XFmode.  */
1455   2,					/* cost of moving MMX register.  */
1456   {6, 6},				/* cost of loading MMX registers
1457 					   in SImode and DImode.  */
1458   {8, 8},				/* cost of storing MMX registers
1459 					   in SImode and DImode.  */
1460   2, 3, 6,				/* cost of moving XMM,YMM,ZMM register.  */
1461   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1462 					   in 32,64,128,256 and 512-bit.  */
1463   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
1464   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
1465 					   in 32,64,128,256 and 512-bit.  */
1466   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1467   6, 6,					/* SSE->integer and integer->SSE moves.  */
1468   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1469      throughput 12.  Approx 9 uops do not depend on vector size and every load
1470      is 7 uops.  */
1471   18, 8,				/* Gather load static, per_elt.  */
1472   18, 10,				/* Gather store static, per_elt.  */
1473   32,					/* size of l1 cache.  */
1474   512,					/* size of l2 cache.  */
1475   64,					/* size of prefetch block.  */
1476   /* New AMD processors never drop prefetches; if they cannot be performed
1477      immediately, they are queued.  We set number of simultaneous prefetches
1478      to a large constant to reflect this (it probably is not a good idea not
1479      to limit number of prefetches at all, as their execution also takes some
1480      time).  */
1481   100,					/* number of parallel prefetches.  */
1482   3,					/* Branch cost.  */
1483   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1484   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
1485   /* Latency of fdiv is 8-15.  */
1486   COSTS_N_INSNS (15),			/* cost of FDIV instruction.  */
1487   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1488   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1489   /* Latency of fsqrt is 4-10.  */
1490   COSTS_N_INSNS (10),			/* cost of FSQRT instruction.  */
1491 
1492   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1493   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1494   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
1495   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1496   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1497   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1498   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
1499   /* 9-13  */
1500   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
1501   COSTS_N_INSNS (10),			/* cost of SQRTSS instruction.  */
1502   COSTS_N_INSNS (15),			/* cost of SQRTSD instruction.  */
1503   /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1504      and it can execute 2 integer additions and 2 multiplications thus
1505      reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
1506      that 4 works better than 6 probably due to register pressure.
1507 
1508      Integer vector operations are taken by FP unit and execute 3 vector
1509      plus/minus operations per cycle but only one multiply.  This is adjusted
1510      in ix86_reassociation_width.  */
1511   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
1512   znver1_memcpy,
1513   znver1_memset,
1514   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
1515   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
1516 };
1517 
/* skylake_cost should produce code tuned for Skylake family of CPUs.  */
1519 static stringop_algs skylake_memcpy[2] =   {
1520   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1521   {libcall, {{16, loop, false}, {512, rep_prefix_8_byte, false},
1522              {-1, libcall, false}}}};
1523 
1524 static stringop_algs skylake_memset[2] = {
1525   {libcall, {{6, loop_1_byte, true},
1526              {24, loop, true},
1527              {8192, rep_prefix_4_byte, true},
1528              {-1, libcall, false}}},
1529   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, false},
1530              {-1, libcall, false}}}};
1531 
1532 static const
1533 struct processor_costs skylake_cost = {
1534   COSTS_N_INSNS (1),			/* cost of an add instruction */
1535   COSTS_N_INSNS (1)+1,		/* cost of a lea instruction */
1536   COSTS_N_INSNS (1),			/* variable shift costs */
1537   COSTS_N_INSNS (1),			/* constant shift costs */
1538   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1539    COSTS_N_INSNS (4),			/*				 HI */
1540    COSTS_N_INSNS (3),			/*				 SI */
1541    COSTS_N_INSNS (3),			/*				 DI */
1542    COSTS_N_INSNS (3)},			/*			      other */
1543   0,					/* cost of multiply per each bit set */
1544   /* Expanding div/mod currently doesn't consider parallelism. So the cost
1545      model is not realistic. We compensate by increasing the latencies a bit.  */
1546   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
1547    COSTS_N_INSNS (11),			/*			    HI */
1548    COSTS_N_INSNS (14),			/*			    SI */
1549    COSTS_N_INSNS (76),			/*			    DI */
1550    COSTS_N_INSNS (76)},			/*			    other */
1551   COSTS_N_INSNS (1),			/* cost of movsx */
1552   COSTS_N_INSNS (0),			/* cost of movzx */
1553   8,					/* "large" insn */
1554   17,					/* MOVE_RATIO */
1555 
1556   6,				     /* cost for loading QImode using movzbl */
1557   {4, 4, 4},				/* cost of loading integer registers
1558 					   in QImode, HImode and SImode.
1559 					   Relative to reg-reg move (2).  */
1560   {6, 6, 3},				/* cost of storing integer registers */
1561   2,					/* cost of reg,reg fld/fst */
1562   {6, 6, 8},				/* cost of loading fp registers
1563 					   in SFmode, DFmode and XFmode */
1564   {6, 6, 10},				/* cost of storing fp registers
1565 					   in SFmode, DFmode and XFmode */
1566   2,					/* cost of moving MMX register */
1567   {6, 6},				/* cost of loading MMX registers
1568 					   in SImode and DImode */
1569   {6, 6},				/* cost of storing MMX registers
1570 					   in SImode and DImode */
1571   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
1572   {6, 6, 6, 10, 20},			/* cost of loading SSE registers
1573 					   in 32,64,128,256 and 512-bit */
1574   {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
1575   {8, 8, 8, 12, 24},			/* cost of storing SSE registers
1576 					   in 32,64,128,256 and 512-bit */
1577   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
1578   2, 2,					/* SSE->integer and integer->SSE moves */
1579   20, 8,				/* Gather load static, per_elt.  */
1580   22, 10,				/* Gather store static, per_elt.  */
1581   64,					/* size of l1 cache.  */
1582   512,					/* size of l2 cache.  */
1583   64,					/* size of prefetch block */
1584   6,					/* number of parallel prefetches */
1585   3,					/* Branch cost */
1586   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
1587   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1588   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
1589   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
1590   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
1591   COSTS_N_INSNS (20),			/* cost of FSQRT instruction.  */
1592 
1593   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1594   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1595   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
1596   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1597   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
1598   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
1599   COSTS_N_INSNS (11),			/* cost of DIVSS instruction.  */
1600   COSTS_N_INSNS (14),			/* cost of DIVSD instruction.  */
1601   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
1602   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
1603   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
1604   skylake_memcpy,
1605   skylake_memset,
1606   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1607   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1608 };
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall can
   do non-temporal accesses and beat inline considerably.  */
1612 static stringop_algs btver1_memcpy[2] = {
1613   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1614              {-1, rep_prefix_4_byte, false}}},
1615   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1616              {-1, libcall, false}}}};
1617 static stringop_algs btver1_memset[2] = {
1618   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1619              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1620   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1621              {-1, libcall, false}}}};
1622 const struct processor_costs btver1_cost = {
1623   COSTS_N_INSNS (1),			/* cost of an add instruction */
1624   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1625   COSTS_N_INSNS (1),			/* variable shift costs */
1626   COSTS_N_INSNS (1),			/* constant shift costs */
1627   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1628    COSTS_N_INSNS (4),			/*				 HI */
1629    COSTS_N_INSNS (3),			/*				 SI */
1630    COSTS_N_INSNS (4),			/*				 DI */
1631    COSTS_N_INSNS (5)},			/*			      other */
1632   0,					/* cost of multiply per each bit set */
1633   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1634    COSTS_N_INSNS (35),			/*			    HI */
1635    COSTS_N_INSNS (51),			/*			    SI */
1636    COSTS_N_INSNS (83),			/*			    DI */
1637    COSTS_N_INSNS (83)},			/*			    other */
1638   COSTS_N_INSNS (1),			/* cost of movsx */
1639   COSTS_N_INSNS (1),			/* cost of movzx */
1640   8,					/* "large" insn */
1641   9,					/* MOVE_RATIO */
1642 
1643   /* All move costs are relative to integer->integer move times 2 and thus
1644      they are latency*2. */
1645   8,				     /* cost for loading QImode using movzbl */
1646   {6, 8, 6},				/* cost of loading integer registers
1647 					   in QImode, HImode and SImode.
1648 					   Relative to reg-reg move (2).  */
1649   {6, 8, 6},				/* cost of storing integer registers */
1650   4,					/* cost of reg,reg fld/fst */
1651   {12, 12, 28},				/* cost of loading fp registers
1652 					   in SFmode, DFmode and XFmode */
1653   {12, 12, 38},				/* cost of storing fp registers
1654 					   in SFmode, DFmode and XFmode */
1655   4,					/* cost of moving MMX register */
1656   {10, 10},				/* cost of loading MMX registers
1657 					   in SImode and DImode */
1658   {12, 12},				/* cost of storing MMX registers
1659 					   in SImode and DImode */
1660   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1661   {10, 10, 12, 24, 48},			/* cost of loading SSE registers
1662 					   in 32,64,128,256 and 512-bit */
1663   {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
1664   {10, 10, 12, 24, 48},			/* cost of storing SSE registers
1665 					   in 32,64,128,256 and 512-bit */
1666   {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
1667   14, 14,				/* SSE->integer and integer->SSE moves */
1668   10, 10,				/* Gather load static, per_elt.  */
1669   10, 10,				/* Gather store static, per_elt.  */
1670   32,					/* size of l1 cache.  */
1671   512,					/* size of l2 cache.  */
1672   64,					/* size of prefetch block */
1673   100,					/* number of parallel prefetches */
1674   2,					/* Branch cost */
1675   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1676   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1677   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1678   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1679   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1680   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1681 
1682   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1683   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1684   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
1685   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1686   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1687   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1688   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
1689   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
1690   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
1691   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
1692   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1693   btver1_memcpy,
1694   btver1_memset,
1695   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1696   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1697 };
1698 
1699 static stringop_algs btver2_memcpy[2] = {
1700   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1701              {-1, rep_prefix_4_byte, false}}},
1702   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1703              {-1, libcall, false}}}};
1704 static stringop_algs btver2_memset[2] = {
1705   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1706              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1707   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1708              {-1, libcall, false}}}};
1709 const struct processor_costs btver2_cost = {
1710   COSTS_N_INSNS (1),			/* cost of an add instruction */
1711   COSTS_N_INSNS (2),			/* cost of a lea instruction */
1712   COSTS_N_INSNS (1),			/* variable shift costs */
1713   COSTS_N_INSNS (1),			/* constant shift costs */
1714   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1715    COSTS_N_INSNS (4),			/*				 HI */
1716    COSTS_N_INSNS (3),			/*				 SI */
1717    COSTS_N_INSNS (4),			/*				 DI */
1718    COSTS_N_INSNS (5)},			/*			      other */
1719   0,					/* cost of multiply per each bit set */
1720   {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
1721    COSTS_N_INSNS (35),			/*			    HI */
1722    COSTS_N_INSNS (51),			/*			    SI */
1723    COSTS_N_INSNS (83),			/*			    DI */
1724    COSTS_N_INSNS (83)},			/*			    other */
1725   COSTS_N_INSNS (1),			/* cost of movsx */
1726   COSTS_N_INSNS (1),			/* cost of movzx */
1727   8,					/* "large" insn */
1728   9,					/* MOVE_RATIO */
1729 
1730   /* All move costs are relative to integer->integer move times 2 and thus
1731      they are latency*2. */
1732   8,				     /* cost for loading QImode using movzbl */
1733   {8, 8, 6},				/* cost of loading integer registers
1734 					   in QImode, HImode and SImode.
1735 					   Relative to reg-reg move (2).  */
1736   {8, 8, 6},				/* cost of storing integer registers */
1737   4,					/* cost of reg,reg fld/fst */
1738   {12, 12, 28},				/* cost of loading fp registers
1739 					   in SFmode, DFmode and XFmode */
1740   {12, 12, 38},				/* cost of storing fp registers
1741 					   in SFmode, DFmode and XFmode */
1742   4,					/* cost of moving MMX register */
1743   {10, 10},				/* cost of loading MMX registers
1744 					   in SImode and DImode */
1745   {12, 12},				/* cost of storing MMX registers
1746 					   in SImode and DImode */
1747   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
1748   {10, 10, 12, 24, 48},			/* cost of loading SSE registers
1749 					   in 32,64,128,256 and 512-bit */
1750   {10, 10, 12, 24, 48},			/* cost of unaligned loads.  */
1751   {10, 10, 12, 24, 48},			/* cost of storing SSE registers
1752 					   in 32,64,128,256 and 512-bit */
1753   {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
1754   14, 14,				/* SSE->integer and integer->SSE moves */
1755   10, 10,				/* Gather load static, per_elt.  */
1756   10, 10,				/* Gather store static, per_elt.  */
1757   32,					/* size of l1 cache.  */
1758   2048,					/* size of l2 cache.  */
1759   64,					/* size of prefetch block */
1760   100,					/* number of parallel prefetches */
1761   2,					/* Branch cost */
1762   COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
1763   COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
1764   COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
1765   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1766   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1767   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
1768 
1769   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
1770   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1771   COSTS_N_INSNS (2),			/* cost of MULSS instruction.  */
1772   COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
1773   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
1774   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
1775   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
1776   COSTS_N_INSNS (19),			/* cost of DIVSD instruction.  */
1777   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
1778   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
1779   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1780   btver2_memcpy,
1781   btver2_memset,
1782   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
1783   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1784 };
1785 
1786 static stringop_algs pentium4_memcpy[2] = {
1787   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1788   DUMMY_STRINGOP_ALGS};
1789 static stringop_algs pentium4_memset[2] = {
1790   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1791              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1792   DUMMY_STRINGOP_ALGS};
1793 
1794 static const
1795 struct processor_costs pentium4_cost = {
1796   COSTS_N_INSNS (1),			/* cost of an add instruction */
1797   COSTS_N_INSNS (3),			/* cost of a lea instruction */
1798   COSTS_N_INSNS (4),			/* variable shift costs */
1799   COSTS_N_INSNS (4),			/* constant shift costs */
1800   {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
1801    COSTS_N_INSNS (15),			/*				 HI */
1802    COSTS_N_INSNS (15),			/*				 SI */
1803    COSTS_N_INSNS (15),			/*				 DI */
1804    COSTS_N_INSNS (15)},			/*			      other */
1805   0,					/* cost of multiply per each bit set */
1806   {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
1807    COSTS_N_INSNS (56),			/*			    HI */
1808    COSTS_N_INSNS (56),			/*			    SI */
1809    COSTS_N_INSNS (56),			/*			    DI */
1810    COSTS_N_INSNS (56)},			/*			    other */
1811   COSTS_N_INSNS (1),			/* cost of movsx */
1812   COSTS_N_INSNS (1),			/* cost of movzx */
1813   16,					/* "large" insn */
1814   6,					/* MOVE_RATIO */
1815 
1816   /* All move costs are relative to integer->integer move times 2 and thus
1817      they are latency*2. */
1818   5,				     /* cost for loading QImode using movzbl */
1819   {4, 5, 4},				/* cost of loading integer registers
1820 					   in QImode, HImode and SImode.
1821 					   Relative to reg-reg move (2).  */
1822   {2, 3, 2},				/* cost of storing integer registers */
1823   12,					/* cost of reg,reg fld/fst */
1824   {14, 14, 14},				/* cost of loading fp registers
1825 					   in SFmode, DFmode and XFmode */
1826   {14, 14, 14},				/* cost of storing fp registers
1827 					   in SFmode, DFmode and XFmode */
1828   12,					/* cost of moving MMX register */
1829   {16, 16},				/* cost of loading MMX registers
1830 					   in SImode and DImode */
1831   {16, 16},				/* cost of storing MMX registers
1832 					   in SImode and DImode */
1833   12, 24, 48,				/* cost of moving XMM,YMM,ZMM register */
1834   {16, 16, 16, 32, 64},			/* cost of loading SSE registers
1835 					   in 32,64,128,256 and 512-bit */
1836   {32, 32, 32, 64, 128},		/* cost of unaligned loads.  */
1837   {16, 16, 16, 32, 64},			/* cost of storing SSE registers
1838 					   in 32,64,128,256 and 512-bit */
1839   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
1840   20, 12,				/* SSE->integer and integer->SSE moves */
1841   16, 16,				/* Gather load static, per_elt.  */
1842   16, 16,				/* Gather store static, per_elt.  */
1843   8,					/* size of l1 cache.  */
1844   256,					/* size of l2 cache.  */
1845   64,					/* size of prefetch block */
1846   6,					/* number of parallel prefetches */
1847   2,					/* Branch cost */
1848   COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
1849   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
1850   COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
1851   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
1852   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
1853   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
1854 
1855   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1856   COSTS_N_INSNS (4),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1857   COSTS_N_INSNS (6),			/* cost of MULSS instruction.  */
1858   COSTS_N_INSNS (6),			/* cost of MULSD instruction.  */
1859   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
1860   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
1861   COSTS_N_INSNS (23),			/* cost of DIVSS instruction.  */
1862   COSTS_N_INSNS (38),			/* cost of DIVSD instruction.  */
1863   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
1864   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
1865   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1866   pentium4_memcpy,
1867   pentium4_memset,
1868   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1869   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1870 };
1871 
1872 static stringop_algs nocona_memcpy[2] = {
1873   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1874   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1875              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1876 
1877 static stringop_algs nocona_memset[2] = {
1878   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1879              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1880   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1881              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1882 
1883 static const
1884 struct processor_costs nocona_cost = {	/* costs for tuning for nocona */
1885   COSTS_N_INSNS (1),			/* cost of an add instruction */
1886   COSTS_N_INSNS (1),			/* cost of a lea instruction */
1887   COSTS_N_INSNS (1),			/* variable shift costs */
1888   COSTS_N_INSNS (1),			/* constant shift costs */
1889   {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
1890    COSTS_N_INSNS (10),			/*				 HI */
1891    COSTS_N_INSNS (10),			/*				 SI */
1892    COSTS_N_INSNS (10),			/*				 DI */
1893    COSTS_N_INSNS (10)},			/*			      other */
1894   0,					/* cost of multiply per each bit set */
1895   {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
1896    COSTS_N_INSNS (66),			/*			    HI */
1897    COSTS_N_INSNS (66),			/*			    SI */
1898    COSTS_N_INSNS (66),			/*			    DI */
1899    COSTS_N_INSNS (66)},			/*			    other */
1900   COSTS_N_INSNS (1),			/* cost of movsx */
1901   COSTS_N_INSNS (1),			/* cost of movzx */
1902   16,					/* "large" insn */
1903   17,					/* MOVE_RATIO */
1904 
1905   /* All move costs are relative to integer->integer move times 2 and thus
1906      they are latency*2. */
1907   4,				     /* cost for loading QImode using movzbl */
1908   {4, 4, 4},				/* cost of loading integer registers
1909 					   in QImode, HImode and SImode.
1910 					   Relative to reg-reg move (2).  */
1911   {4, 4, 4},				/* cost of storing integer registers */
1912   12,					/* cost of reg,reg fld/fst */
1913   {14, 14, 14},				/* cost of loading fp registers
1914 					   in SFmode, DFmode and XFmode */
1915   {14, 14, 14},				/* cost of storing fp registers
1916 					   in SFmode, DFmode and XFmode */
1917   14,					/* cost of moving MMX register */
1918   {12, 12},				/* cost of loading MMX registers
1919 					   in SImode and DImode */
1920   {12, 12},				/* cost of storing MMX registers
1921 					   in SImode and DImode */
1922   6, 12, 24,				/* cost of moving XMM,YMM,ZMM register */
1923   {12, 12, 12, 24, 48},			/* cost of loading SSE registers
1924 					   in 32,64,128,256 and 512-bit */
1925   {24, 24, 24, 48, 96},			/* cost of unaligned loads.  */
1926   {12, 12, 12, 24, 48},			/* cost of storing SSE registers
1927 					   in 32,64,128,256 and 512-bit */
1928   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
1929   20, 12,				/* SSE->integer and integer->SSE moves */
1930   12, 12,				/* Gather load static, per_elt.  */
1931   12, 12,				/* Gather store static, per_elt.  */
1932   8,					/* size of l1 cache.  */
1933   1024,					/* size of l2 cache.  */
1934   64,					/* size of prefetch block */
1935   8,					/* number of parallel prefetches */
1936   1,					/* Branch cost */
1937   COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
1938   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
1939   COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
1940   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
1941   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
1942   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
1943 
1944   COSTS_N_INSNS (2),			/* cost of cheap SSE instruction.  */
1945   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
1946   COSTS_N_INSNS (7),			/* cost of MULSS instruction.  */
1947   COSTS_N_INSNS (7),			/* cost of MULSD instruction.  */
1948   COSTS_N_INSNS (7),			/* cost of FMA SS instruction.  */
1949   COSTS_N_INSNS (7),			/* cost of FMA SD instruction.  */
1950   COSTS_N_INSNS (32),			/* cost of DIVSS instruction.  */
1951   COSTS_N_INSNS (40),			/* cost of DIVSD instruction.  */
1952   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
1953   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
1954   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
1955   nocona_memcpy,			/* memcpy algorithm table (defined above) */
1956   nocona_memset,			/* memset algorithm table (defined above) */
1957   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
1958   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
1959 };
1960 
1961 static stringop_algs atom_memcpy[2] = {	/* memcpy algorithm table for atom_cost */
1962   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1963   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1964              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1965 static stringop_algs atom_memset[2] = {	/* memset algorithm table for atom_cost */
1966   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1967              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1968   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1969              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1970 static const
1971 struct processor_costs atom_cost = {	/* costs for tuning for atom */
1972   COSTS_N_INSNS (1),			/* cost of an add instruction */
1973   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
1974   COSTS_N_INSNS (1),			/* variable shift costs */
1975   COSTS_N_INSNS (1),			/* constant shift costs */
1976   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
1977    COSTS_N_INSNS (4),			/*				 HI */
1978    COSTS_N_INSNS (3),			/*				 SI */
1979    COSTS_N_INSNS (4),			/*				 DI */
1980    COSTS_N_INSNS (2)},			/*			      other */
1981   0,					/* cost of multiply per each bit set */
1982   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
1983    COSTS_N_INSNS (26),			/*			    HI */
1984    COSTS_N_INSNS (42),			/*			    SI */
1985    COSTS_N_INSNS (74),			/*			    DI */
1986    COSTS_N_INSNS (74)},			/*			    other */
1987   COSTS_N_INSNS (1),			/* cost of movsx */
1988   COSTS_N_INSNS (1),			/* cost of movzx */
1989   8,					/* "large" insn */
1990   17,					/* MOVE_RATIO */
1991 
1992   /* All move costs are relative to integer->integer move times 2 and thus
1993      they are latency*2. */
1994   6,					/* cost for loading QImode using movzbl */
1995   {6, 6, 6},				/* cost of loading integer registers
1996 					   in QImode, HImode and SImode.
1997 					   Relative to reg-reg move (2).  */
1998   {6, 6, 6},				/* cost of storing integer registers */
1999   4,					/* cost of reg,reg fld/fst */
2000   {6, 6, 18},				/* cost of loading fp registers
2001 					   in SFmode, DFmode and XFmode */
2002   {14, 14, 24},				/* cost of storing fp registers
2003 					   in SFmode, DFmode and XFmode */
2004   2,					/* cost of moving MMX register */
2005   {8, 8},				/* cost of loading MMX registers
2006 					   in SImode and DImode */
2007   {10, 10},				/* cost of storing MMX registers
2008 					   in SImode and DImode */
2009   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2010   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2011 					   in 32,64,128,256 and 512-bit */
2012   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2013   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2014 					   in 32,64,128,256 and 512-bit */
2015   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2016   8, 6,					/* SSE->integer and integer->SSE moves */
2017   8, 8,					/* Gather load static, per_elt.  */
2018   8, 8,					/* Gather store static, per_elt.  */
2019   32,					/* size of l1 cache.  */
2020   256,					/* size of l2 cache.  */
2021   64,					/* size of prefetch block */
2022   6,					/* number of parallel prefetches */
2023   3,					/* Branch cost */
2024   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2025   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2026   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2027   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2028   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2029   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2030 
2031   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2032   COSTS_N_INSNS (5),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2033   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2034   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2035   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2036   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2037   COSTS_N_INSNS (31),			/* cost of DIVSS instruction.  */
2038   COSTS_N_INSNS (60),			/* cost of DIVSD instruction.  */
2039   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
2040   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
2041   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2042   atom_memcpy,				/* memcpy algorithm table (defined above) */
2043   atom_memset,				/* memset algorithm table (defined above) */
2044   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2045   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2046 };
2047 
2048 static stringop_algs slm_memcpy[2] = {	/* memcpy algorithm table for slm_cost */
2049   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2050   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2051              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2052 static stringop_algs slm_memset[2] = {	/* memset algorithm table for slm_cost */
2053   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2054              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2055   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2056              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2057 static const
2058 struct processor_costs slm_cost = {	/* costs for tuning for slm */
2059   COSTS_N_INSNS (1),			/* cost of an add instruction */
2060   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2061   COSTS_N_INSNS (1),			/* variable shift costs */
2062   COSTS_N_INSNS (1),			/* constant shift costs */
2063   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2064    COSTS_N_INSNS (3),			/*				 HI */
2065    COSTS_N_INSNS (3),			/*				 SI */
2066    COSTS_N_INSNS (4),			/*				 DI */
2067    COSTS_N_INSNS (2)},			/*			      other */
2068   0,					/* cost of multiply per each bit set */
2069   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2070    COSTS_N_INSNS (26),			/*			    HI */
2071    COSTS_N_INSNS (42),			/*			    SI */
2072    COSTS_N_INSNS (74),			/*			    DI */
2073    COSTS_N_INSNS (74)},			/*			    other */
2074   COSTS_N_INSNS (1),			/* cost of movsx */
2075   COSTS_N_INSNS (1),			/* cost of movzx */
2076   8,					/* "large" insn */
2077   17,					/* MOVE_RATIO */
2078 
2079   /* All move costs are relative to integer->integer move times 2 and thus
2080      they are latency*2. */
2081   8,					/* cost for loading QImode using movzbl */
2082   {8, 8, 8},				/* cost of loading integer registers
2083 					   in QImode, HImode and SImode.
2084 					   Relative to reg-reg move (2).  */
2085   {6, 6, 6},				/* cost of storing integer registers */
2086   2,					/* cost of reg,reg fld/fst */
2087   {8, 8, 18},				/* cost of loading fp registers
2088 					   in SFmode, DFmode and XFmode */
2089   {6, 6, 18},				/* cost of storing fp registers
2090 					   in SFmode, DFmode and XFmode */
2091   2,					/* cost of moving MMX register */
2092   {8, 8},				/* cost of loading MMX registers
2093 					   in SImode and DImode */
2094   {6, 6},				/* cost of storing MMX registers
2095 					   in SImode and DImode */
2096   2, 4, 8,				/* cost of moving XMM,YMM,ZMM register */
2097   {8, 8, 8, 16, 32},			/* cost of loading SSE registers
2098 					   in 32,64,128,256 and 512-bit */
2099   {16, 16, 16, 32, 64},			/* cost of unaligned loads.  */
2100   {8, 8, 8, 16, 32},			/* cost of storing SSE registers
2101 					   in 32,64,128,256 and 512-bit */
2102   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
2103   8, 6,					/* SSE->integer and integer->SSE moves */
2104   8, 8,					/* Gather load static, per_elt.  */
2105   8, 8,					/* Gather store static, per_elt.  */
2106   32,					/* size of l1 cache.  */
2107   256,					/* size of l2 cache.  */
2108   64,					/* size of prefetch block */
2109   6,					/* number of parallel prefetches */
2110   3,					/* Branch cost */
2111   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2112   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2113   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2114   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2115   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2116   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2117 
2118   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2119   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2120   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2121   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2122   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2123   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2124   COSTS_N_INSNS (39),			/* cost of DIVSS instruction.  */
2125   COSTS_N_INSNS (69),			/* cost of DIVSD instruction.  */
2126   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
2127   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
2128   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2129   slm_memcpy,				/* memcpy algorithm table (defined above) */
2130   slm_memset,				/* memset algorithm table (defined above) */
2131   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2132   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2133 };
2134 
2135 static stringop_algs intel_memcpy[2] = {	/* memcpy algorithm table for intel_cost */
2136   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2137   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2138              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2139 static stringop_algs intel_memset[2] = {	/* memset algorithm table for intel_cost */
2140   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2141              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2142   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2143              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2144 static const
2145 struct processor_costs intel_cost = {	/* costs for tuning for intel */
2146   COSTS_N_INSNS (1),			/* cost of an add instruction */
2147   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2148   COSTS_N_INSNS (1),			/* variable shift costs */
2149   COSTS_N_INSNS (1),			/* constant shift costs */
2150   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2151    COSTS_N_INSNS (3),			/*				 HI */
2152    COSTS_N_INSNS (3),			/*				 SI */
2153    COSTS_N_INSNS (4),			/*				 DI */
2154    COSTS_N_INSNS (2)},			/*			      other */
2155   0,					/* cost of multiply per each bit set */
2156   {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
2157    COSTS_N_INSNS (26),			/*			    HI */
2158    COSTS_N_INSNS (42),			/*			    SI */
2159    COSTS_N_INSNS (74),			/*			    DI */
2160    COSTS_N_INSNS (74)},			/*			    other */
2161   COSTS_N_INSNS (1),			/* cost of movsx */
2162   COSTS_N_INSNS (1),			/* cost of movzx */
2163   8,					/* "large" insn */
2164   17,					/* MOVE_RATIO */
2165 
2166   /* All move costs are relative to integer->integer move times 2 and thus
2167      they are latency*2. */
2168   6,				     /* cost for loading QImode using movzbl */
2169   {4, 4, 4},				/* cost of loading integer registers
2170 					   in QImode, HImode and SImode.
2171 					   Relative to reg-reg move (2).  */
2172   {6, 6, 6},				/* cost of storing integer registers */
2173   2,					/* cost of reg,reg fld/fst */
2174   {6, 6, 8},				/* cost of loading fp registers
2175 					   in SFmode, DFmode and XFmode */
2176   {6, 6, 10},				/* cost of storing fp registers
2177 					   in SFmode, DFmode and XFmode */
2178   2,					/* cost of moving MMX register */
2179   {6, 6},				/* cost of loading MMX registers
2180 					   in SImode and DImode */
2181   {6, 6},				/* cost of storing MMX registers
2182 					   in SImode and DImode */
2183   2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
2184   {6, 6, 6, 6, 6},			/* cost of loading SSE registers
2185 					   in 32,64,128,256 and 512-bit */
2186   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
2187   {6, 6, 6, 6, 6},			/* cost of storing SSE registers
2188 					   in 32,64,128,256 and 512-bit */
2189   {10, 10, 10, 10, 10},			/* cost of unaligned stores.  */
2190   4, 4,					/* SSE->integer and integer->SSE moves */
2191   6, 6,					/* Gather load static, per_elt.  */
2192   6, 6,					/* Gather store static, per_elt.  */
2193   32,					/* size of l1 cache.  */
2194   256,					/* size of l2 cache.  */
2195   64,					/* size of prefetch block */
2196   6,					/* number of parallel prefetches */
2197   3,					/* Branch cost */
2198   COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
2199   COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
2200   COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
2201   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
2202   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
2203   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
2204 
2205   COSTS_N_INSNS (8),			/* cost of cheap SSE instruction.  */
2206   COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2207   COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
2208   COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
2209   COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
2210   COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
2211   COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
2212   COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
2213   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
2214   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
2215   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
2216   intel_memcpy,				/* memcpy algorithm table (defined above) */
2217   intel_memset,				/* memset algorithm table (defined above) */
2218   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2219   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2220 };
2221 
2222 /* Generic should produce code tuned for Core-i7 (and newer chips)
2223    and btver1 (and newer chips).  */
2224 
2225 static stringop_algs generic_memcpy[2] = {	/* memcpy algorithm table for generic_cost */
2226   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2227              {-1, libcall, false}}},
2228   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2229              {-1, libcall, false}}}};
2230 static stringop_algs generic_memset[2] = {	/* memset algorithm table for generic_cost */
2231   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2232              {-1, libcall, false}}},
2233   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2234              {-1, libcall, false}}}};
2235 static const
2236 struct processor_costs generic_cost = {	/* costs for generic tuning */
2237   COSTS_N_INSNS (1),			/* cost of an add instruction */
2238   /* Setting cost to 2 makes our current implementation of synth_mult result in
2239      use of unnecessary temporary registers causing regression on several
2240      SPECfp benchmarks.  */
2241   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2242   COSTS_N_INSNS (1),			/* variable shift costs */
2243   COSTS_N_INSNS (1),			/* constant shift costs */
2244   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2245    COSTS_N_INSNS (4),			/*				 HI */
2246    COSTS_N_INSNS (3),			/*				 SI */
2247    COSTS_N_INSNS (4),			/*				 DI */
2248    COSTS_N_INSNS (4)},			/*			      other */
2249   0,					/* cost of multiply per each bit set */
2250   {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
2251    COSTS_N_INSNS (22),			/*			    HI */
2252    COSTS_N_INSNS (30),			/*			    SI */
2253    COSTS_N_INSNS (74),			/*			    DI */
2254    COSTS_N_INSNS (74)},			/*			    other */
2255   COSTS_N_INSNS (1),			/* cost of movsx */
2256   COSTS_N_INSNS (1),			/* cost of movzx */
2257   8,					/* "large" insn */
2258   17,					/* MOVE_RATIO */
2259 
2260   /* All move costs are relative to integer->integer move times 2 and thus
2261      they are latency*2. */
2262   6,				     /* cost for loading QImode using movzbl */
2263   {6, 6, 6},				/* cost of loading integer registers
2264 					   in QImode, HImode and SImode.
2265 					   Relative to reg-reg move (2).  */
2266   {6, 6, 6},				/* cost of storing integer registers */
2267   4,					/* cost of reg,reg fld/fst */
2268   {6, 6, 12},				/* cost of loading fp registers
2269 					   in SFmode, DFmode and XFmode */
2270   {6, 6, 12},				/* cost of storing fp registers
2271 					   in SFmode, DFmode and XFmode */
2272   2,					/* cost of moving MMX register */
2273   {6, 6},				/* cost of loading MMX registers
2274 					   in SImode and DImode */
2275   {6, 6},				/* cost of storing MMX registers
2276 					   in SImode and DImode */
2277   2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
2278   {6, 6, 6, 10, 15},			/* cost of loading SSE registers
2279 					   in 32,64,128,256 and 512-bit */
2280   {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
2281   {6, 6, 6, 10, 15},			/* cost of storing SSE registers
2282 					   in 32,64,128,256 and 512-bit */
2283   {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
2284   6, 6,					/* SSE->integer and integer->SSE moves */
2285   18, 6,				/* Gather load static, per_elt.  */
2286   18, 6,				/* Gather store static, per_elt.  */
2287   32,					/* size of l1 cache.  */
2288   512,					/* size of l2 cache.  */
2289   64,					/* size of prefetch block */
2290   6,					/* number of parallel prefetches */
2291   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2292      value is increased to perhaps more appropriate value of 5.  */
2293   3,					/* Branch cost */
2294   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2295   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2296   COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
2297   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2298   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2299   COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
2300 
2301   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2302   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2303   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2304   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2305   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2306   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2307   COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
2308   COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
2309   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
2310   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
2311   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
2312   generic_memcpy,			/* memcpy algorithm table (defined above) */
2313   generic_memset,			/* memset algorithm table (defined above) */
2314   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
2315   COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
2316 };
2317 
2318 /* core_cost should produce code tuned for the Core family of CPUs.  */
2319 static stringop_algs core_memcpy[2] = {	/* memcpy algorithm table for core_cost */
2320   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2321   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2322              {-1, libcall, false}}}};
2323 static stringop_algs core_memset[2] = {	/* memset algorithm table for core_cost */
2324   {libcall, {{6, loop_1_byte, true},
2325              {24, loop, true},
2326              {8192, rep_prefix_4_byte, true},
2327              {-1, libcall, false}}},
2328   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2329              {-1, libcall, false}}}};
2330 
2331 static const
2332 struct processor_costs core_cost = {	/* costs for tuning for core */
2333   COSTS_N_INSNS (1),			/* cost of an add instruction */
2334   /* On all chips taken into consideration lea is 2 cycles and more.  With
2335      this cost however our current implementation of synth_mult results in
2336      use of unnecessary temporary registers causing regression on several
2337      SPECfp benchmarks.  */
2338   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
2339   COSTS_N_INSNS (1),			/* variable shift costs */
2340   COSTS_N_INSNS (1),			/* constant shift costs */
2341   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
2342    COSTS_N_INSNS (4),			/*				 HI */
2343    COSTS_N_INSNS (3),			/*				 SI */
2344    /* Here we tune for Sandybridge or newer.  */
2345    COSTS_N_INSNS (3),			/*				 DI */
2346    COSTS_N_INSNS (3)},			/*			      other */
2347   0,					/* cost of multiply per each bit set */
2348   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2349      model is not realistic. We compensate by increasing the latencies a bit.  */
2350   {COSTS_N_INSNS (11),			/* cost of a divide/mod for QI */
2351    COSTS_N_INSNS (11),			/*			    HI */
2352    COSTS_N_INSNS (14),			/*			    SI */
2353    COSTS_N_INSNS (81),			/*			    DI */
2354    COSTS_N_INSNS (81)},			/*			    other */
2355   COSTS_N_INSNS (1),			/* cost of movsx */
2356   COSTS_N_INSNS (1),			/* cost of movzx */
2357   8,					/* "large" insn */
2358   17,					/* MOVE_RATIO */
2359 
2360   /* All move costs are relative to integer->integer move times 2 and thus
2361      they are latency*2. */
2362   6,				     /* cost for loading QImode using movzbl */
2363   {4, 4, 4},				/* cost of loading integer registers
2364 					   in QImode, HImode and SImode.
2365 					   Relative to reg-reg move (2).  */
2366   {6, 6, 6},				/* cost of storing integer registers */
2367   2,					/* cost of reg,reg fld/fst */
2368   {6, 6, 8},				/* cost of loading fp registers
2369 					   in SFmode, DFmode and XFmode */
2370   {6, 6, 10},				/* cost of storing fp registers
2371 					   in SFmode, DFmode and XFmode */
2372   2,					/* cost of moving MMX register */
2373   {6, 6},				/* cost of loading MMX registers
2374 					   in SImode and DImode */
2375   {6, 6},				/* cost of storing MMX registers
2376 					   in SImode and DImode */
2377   2, 2, 4,				/* cost of moving XMM,YMM,ZMM register */
2378   {6, 6, 6, 6, 12},			/* cost of loading SSE registers
2379 					   in 32,64,128,256 and 512-bit */
2380   {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
2381   {6, 6, 6, 6, 12},			/* cost of storing SSE registers
2382 					   in 32,64,128,256 and 512-bit */
2383   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
2384   2, 2,					/* SSE->integer and integer->SSE moves */
2385   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2386      rec. throughput 6.  NOTE(review): the same mnemonic is named twice;
2387      confirm the second.  So 5 uops statically and one uop per load.  */
2388   10, 6,				/* Gather load static, per_elt.  */
2389   10, 6,				/* Gather store static, per_elt.  */
2390   64,					/* size of l1 cache.  */
2391   512,					/* size of l2 cache.  */
2392   64,					/* size of prefetch block */
2393   6,					/* number of parallel prefetches */
2394   /* FIXME perhaps more appropriate value is 5.  */
2395   3,					/* Branch cost */
2396   COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
2397   COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
2398   /* 10-24 (presumably the FDIV latency range; the upper bound is used).  */
2399   COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
2400   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
2401   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
2402   COSTS_N_INSNS (23),			/* cost of FSQRT instruction.  */
2403 
2404   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
2405   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
2406   COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
2407   COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
2408   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
2409   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
2410   COSTS_N_INSNS (18),			/* cost of DIVSS instruction.  */
2411   COSTS_N_INSNS (32),			/* cost of DIVSD instruction.  */
2412   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
2413   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
2414   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
2415   core_memcpy,				/* memcpy algorithm table (defined above) */
2416   core_memset,				/* memset algorithm table (defined above) */
2417   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
2418   COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
2419 };
2420 
2421