1 /* Costs of operations of individual x86 CPUs. 2 Copyright (C) 1988-2018 Free Software Foundation, Inc. 3 4 This file is part of GCC. 5 6 GCC is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 3, or (at your option) 9 any later version. 10 11 GCC is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 Under Section 7 of GPL version 3, you are granted additional 17 permissions described in the GCC Runtime Library Exception, version 18 3.1, as published by the Free Software Foundation. 19 20 You should have received a copy of the GNU General Public License and 21 a copy of the GCC Runtime Library Exception along with this program; 22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23 <http://www.gnu.org/licenses/>. */ 24 /* Processor costs (relative to an add) */ 25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. 
*/ 26 #define COSTS_N_BYTES(N) ((N) * 2) 27 28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} 29 30 static stringop_algs ix86_size_memcpy[2] = { 31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 33 static stringop_algs ix86_size_memset[2] = { 34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; 36 37 const 38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */ 39 COSTS_N_BYTES (2), /* cost of an add instruction */ 40 COSTS_N_BYTES (3), /* cost of a lea instruction */ 41 COSTS_N_BYTES (2), /* variable shift costs */ 42 COSTS_N_BYTES (3), /* constant shift costs */ 43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ 44 COSTS_N_BYTES (3), /* HI */ 45 COSTS_N_BYTES (3), /* SI */ 46 COSTS_N_BYTES (3), /* DI */ 47 COSTS_N_BYTES (5)}, /* other */ 48 0, /* cost of multiply per each bit set */ 49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ 50 COSTS_N_BYTES (3), /* HI */ 51 COSTS_N_BYTES (3), /* SI */ 52 COSTS_N_BYTES (3), /* DI */ 53 COSTS_N_BYTES (5)}, /* other */ 54 COSTS_N_BYTES (3), /* cost of movsx */ 55 COSTS_N_BYTES (3), /* cost of movzx */ 56 0, /* "large" insn */ 57 2, /* MOVE_RATIO */ 58 59 /* All move costs are relative to integer->integer move times 2. */ 60 2, /* cost for loading QImode using movzbl */ 61 {2, 2, 2}, /* cost of loading integer registers 62 in QImode, HImode and SImode. 63 Relative to reg-reg move (2). 
*/ 64 {2, 2, 2}, /* cost of storing integer registers */ 65 2, /* cost of reg,reg fld/fst */ 66 {2, 2, 2}, /* cost of loading fp registers 67 in SFmode, DFmode and XFmode */ 68 {2, 2, 2}, /* cost of storing fp registers 69 in SFmode, DFmode and XFmode */ 70 3, /* cost of moving MMX register */ 71 {3, 3}, /* cost of loading MMX registers 72 in SImode and DImode */ 73 {3, 3}, /* cost of storing MMX registers 74 in SImode and DImode */ 75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ 76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers 77 in 32,64,128,256 and 512-bit */ 78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load 79 in 32,64,128,256 and 512-bit */ 80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers 81 in 32,64,128,256 and 512-bit */ 82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store 83 in 32,64,128,256 and 512-bit */ 84 3, 3, /* SSE->integer and integer->SSE moves */ 85 5, 0, /* Gather load static, per_elt. */ 86 5, 0, /* Gather store static, per_elt. */ 87 0, /* size of l1 cache */ 88 0, /* size of l2 cache */ 89 0, /* size of prefetch block */ 90 0, /* number of parallel prefetches */ 91 2, /* Branch cost */ 92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ 93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */ 94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */ 95 COSTS_N_BYTES (2), /* cost of FABS instruction. */ 96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */ 97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ 98 99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ 100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */ 102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */ 103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */ 104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */ 105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */ 106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */ 107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. 
*/ 108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */ 109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 110 ix86_size_memcpy, 111 ix86_size_memset, 112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ 113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */ 114 }; 115 116 /* Processor costs (relative to an add) */ 117 static stringop_algs i386_memcpy[2] = { 118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 119 DUMMY_STRINGOP_ALGS}; 120 static stringop_algs i386_memset[2] = { 121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, 122 DUMMY_STRINGOP_ALGS}; 123 124 static const 125 struct processor_costs i386_cost = { /* 386 specific costs */ 126 COSTS_N_INSNS (1), /* cost of an add instruction */ 127 COSTS_N_INSNS (1), /* cost of a lea instruction */ 128 COSTS_N_INSNS (3), /* variable shift costs */ 129 COSTS_N_INSNS (2), /* constant shift costs */ 130 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ 131 COSTS_N_INSNS (6), /* HI */ 132 COSTS_N_INSNS (6), /* SI */ 133 COSTS_N_INSNS (6), /* DI */ 134 COSTS_N_INSNS (6)}, /* other */ 135 COSTS_N_INSNS (1), /* cost of multiply per each bit set */ 136 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ 137 COSTS_N_INSNS (23), /* HI */ 138 COSTS_N_INSNS (23), /* SI */ 139 COSTS_N_INSNS (23), /* DI */ 140 COSTS_N_INSNS (23)}, /* other */ 141 COSTS_N_INSNS (3), /* cost of movsx */ 142 COSTS_N_INSNS (2), /* cost of movzx */ 143 15, /* "large" insn */ 144 3, /* MOVE_RATIO */ 145 146 /* All move costs are relative to integer->integer move times 2 and thus 147 they are latency*2. */ 148 4, /* cost for loading QImode using movzbl */ 149 {2, 4, 2}, /* cost of loading integer registers 150 in QImode, HImode and SImode. 151 Relative to reg-reg move (2). 
*/ 152 {2, 4, 2}, /* cost of storing integer registers */ 153 2, /* cost of reg,reg fld/fst */ 154 {8, 8, 8}, /* cost of loading fp registers 155 in SFmode, DFmode and XFmode */ 156 {8, 8, 8}, /* cost of storing fp registers 157 in SFmode, DFmode and XFmode */ 158 2, /* cost of moving MMX register */ 159 {4, 8}, /* cost of loading MMX registers 160 in SImode and DImode */ 161 {4, 8}, /* cost of storing MMX registers 162 in SImode and DImode */ 163 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 164 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 165 in 32,64,128,256 and 512-bit */ 166 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 167 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 168 in 32,64,128,256 and 512-bit */ 169 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 170 3, 3, /* SSE->integer and integer->SSE moves */ 171 4, 4, /* Gather load static, per_elt. */ 172 4, 4, /* Gather store static, per_elt. */ 173 0, /* size of l1 cache */ 174 0, /* size of l2 cache */ 175 0, /* size of prefetch block */ 176 0, /* number of parallel prefetches */ 177 1, /* Branch cost */ 178 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ 179 COSTS_N_INSNS (27), /* cost of FMUL instruction. */ 180 COSTS_N_INSNS (88), /* cost of FDIV instruction. */ 181 COSTS_N_INSNS (22), /* cost of FABS instruction. */ 182 COSTS_N_INSNS (24), /* cost of FCHS instruction. */ 183 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ 184 185 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 186 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ 187 COSTS_N_INSNS (27), /* cost of MULSS instruction. */ 188 COSTS_N_INSNS (27), /* cost of MULSD instruction. */ 189 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */ 190 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */ 191 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */ 192 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */ 193 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. 
*/ 194 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */ 195 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 196 i386_memcpy, 197 i386_memset, 198 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 199 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 200 }; 201 202 static stringop_algs i486_memcpy[2] = { 203 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 204 DUMMY_STRINGOP_ALGS}; 205 static stringop_algs i486_memset[2] = { 206 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, 207 DUMMY_STRINGOP_ALGS}; 208 209 static const 210 struct processor_costs i486_cost = { /* 486 specific costs */ 211 COSTS_N_INSNS (1), /* cost of an add instruction */ 212 COSTS_N_INSNS (1), /* cost of a lea instruction */ 213 COSTS_N_INSNS (3), /* variable shift costs */ 214 COSTS_N_INSNS (2), /* constant shift costs */ 215 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ 216 COSTS_N_INSNS (12), /* HI */ 217 COSTS_N_INSNS (12), /* SI */ 218 COSTS_N_INSNS (12), /* DI */ 219 COSTS_N_INSNS (12)}, /* other */ 220 1, /* cost of multiply per each bit set */ 221 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ 222 COSTS_N_INSNS (40), /* HI */ 223 COSTS_N_INSNS (40), /* SI */ 224 COSTS_N_INSNS (40), /* DI */ 225 COSTS_N_INSNS (40)}, /* other */ 226 COSTS_N_INSNS (3), /* cost of movsx */ 227 COSTS_N_INSNS (2), /* cost of movzx */ 228 15, /* "large" insn */ 229 3, /* MOVE_RATIO */ 230 231 /* All move costs are relative to integer->integer move times 2 and thus 232 they are latency*2. */ 233 4, /* cost for loading QImode using movzbl */ 234 {2, 4, 2}, /* cost of loading integer registers 235 in QImode, HImode and SImode. 236 Relative to reg-reg move (2). 
*/ 237 {2, 4, 2}, /* cost of storing integer registers */ 238 2, /* cost of reg,reg fld/fst */ 239 {8, 8, 8}, /* cost of loading fp registers 240 in SFmode, DFmode and XFmode */ 241 {8, 8, 8}, /* cost of storing fp registers 242 in SFmode, DFmode and XFmode */ 243 2, /* cost of moving MMX register */ 244 {4, 8}, /* cost of loading MMX registers 245 in SImode and DImode */ 246 {4, 8}, /* cost of storing MMX registers 247 in SImode and DImode */ 248 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 249 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 250 in 32,64,128,256 and 512-bit */ 251 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 252 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 253 in 32,64,128,256 and 512-bit */ 254 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 255 3, 3, /* SSE->integer and integer->SSE moves */ 256 4, 4, /* Gather load static, per_elt. */ 257 4, 4, /* Gather store static, per_elt. */ 258 4, /* size of l1 cache. 486 has 8kB cache 259 shared for code and data, so 4kB is 260 not really precise. */ 261 4, /* size of l2 cache */ 262 0, /* size of prefetch block */ 263 0, /* number of parallel prefetches */ 264 1, /* Branch cost */ 265 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 266 COSTS_N_INSNS (16), /* cost of FMUL instruction. */ 267 COSTS_N_INSNS (73), /* cost of FDIV instruction. */ 268 COSTS_N_INSNS (3), /* cost of FABS instruction. */ 269 COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 270 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ 271 272 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 273 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 274 COSTS_N_INSNS (16), /* cost of MULSS instruction. */ 275 COSTS_N_INSNS (16), /* cost of MULSD instruction. */ 276 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */ 277 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */ 278 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */ 279 COSTS_N_INSNS (74), /* cost of DIVSD instruction. 
*/ 280 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */ 281 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */ 282 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 283 i486_memcpy, 284 i486_memset, 285 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 286 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 287 }; 288 289 static stringop_algs pentium_memcpy[2] = { 290 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 291 DUMMY_STRINGOP_ALGS}; 292 static stringop_algs pentium_memset[2] = { 293 {libcall, {{-1, rep_prefix_4_byte, false}}}, 294 DUMMY_STRINGOP_ALGS}; 295 296 static const 297 struct processor_costs pentium_cost = { 298 COSTS_N_INSNS (1), /* cost of an add instruction */ 299 COSTS_N_INSNS (1), /* cost of a lea instruction */ 300 COSTS_N_INSNS (4), /* variable shift costs */ 301 COSTS_N_INSNS (1), /* constant shift costs */ 302 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 303 COSTS_N_INSNS (11), /* HI */ 304 COSTS_N_INSNS (11), /* SI */ 305 COSTS_N_INSNS (11), /* DI */ 306 COSTS_N_INSNS (11)}, /* other */ 307 0, /* cost of multiply per each bit set */ 308 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 309 COSTS_N_INSNS (25), /* HI */ 310 COSTS_N_INSNS (25), /* SI */ 311 COSTS_N_INSNS (25), /* DI */ 312 COSTS_N_INSNS (25)}, /* other */ 313 COSTS_N_INSNS (3), /* cost of movsx */ 314 COSTS_N_INSNS (2), /* cost of movzx */ 315 8, /* "large" insn */ 316 6, /* MOVE_RATIO */ 317 318 /* All move costs are relative to integer->integer move times 2 and thus 319 they are latency*2. */ 320 6, /* cost for loading QImode using movzbl */ 321 {2, 4, 2}, /* cost of loading integer registers 322 in QImode, HImode and SImode. 323 Relative to reg-reg move (2). 
*/ 324 {2, 4, 2}, /* cost of storing integer registers */ 325 2, /* cost of reg,reg fld/fst */ 326 {2, 2, 6}, /* cost of loading fp registers 327 in SFmode, DFmode and XFmode */ 328 {4, 4, 6}, /* cost of storing fp registers 329 in SFmode, DFmode and XFmode */ 330 8, /* cost of moving MMX register */ 331 {8, 8}, /* cost of loading MMX registers 332 in SImode and DImode */ 333 {8, 8}, /* cost of storing MMX registers 334 in SImode and DImode */ 335 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 336 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 337 in 32,64,128,256 and 512-bit */ 338 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 339 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 340 in 32,64,128,256 and 512-bit */ 341 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 342 3, 3, /* SSE->integer and integer->SSE moves */ 343 4, 4, /* Gather load static, per_elt. */ 344 4, 4, /* Gather store static, per_elt. */ 345 8, /* size of l1 cache. */ 346 8, /* size of l2 cache */ 347 0, /* size of prefetch block */ 348 0, /* number of parallel prefetches */ 349 2, /* Branch cost */ 350 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 351 COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 352 COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 353 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 354 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 355 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 356 357 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 358 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 359 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 360 COSTS_N_INSNS (3), /* cost of MULSD instruction. */ 361 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 362 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 363 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 364 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */ 365 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. 
*/ 366 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */ 367 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 368 pentium_memcpy, 369 pentium_memset, 370 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 371 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 372 }; 373 374 static const 375 struct processor_costs lakemont_cost = { 376 COSTS_N_INSNS (1), /* cost of an add instruction */ 377 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 378 COSTS_N_INSNS (1), /* variable shift costs */ 379 COSTS_N_INSNS (1), /* constant shift costs */ 380 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ 381 COSTS_N_INSNS (11), /* HI */ 382 COSTS_N_INSNS (11), /* SI */ 383 COSTS_N_INSNS (11), /* DI */ 384 COSTS_N_INSNS (11)}, /* other */ 385 0, /* cost of multiply per each bit set */ 386 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ 387 COSTS_N_INSNS (25), /* HI */ 388 COSTS_N_INSNS (25), /* SI */ 389 COSTS_N_INSNS (25), /* DI */ 390 COSTS_N_INSNS (25)}, /* other */ 391 COSTS_N_INSNS (3), /* cost of movsx */ 392 COSTS_N_INSNS (2), /* cost of movzx */ 393 8, /* "large" insn */ 394 17, /* MOVE_RATIO */ 395 396 /* All move costs are relative to integer->integer move times 2 and thus 397 they are latency*2. */ 398 6, /* cost for loading QImode using movzbl */ 399 {2, 4, 2}, /* cost of loading integer registers 400 in QImode, HImode and SImode. 401 Relative to reg-reg move (2). 
*/ 402 {2, 4, 2}, /* cost of storing integer registers */ 403 2, /* cost of reg,reg fld/fst */ 404 {2, 2, 6}, /* cost of loading fp registers 405 in SFmode, DFmode and XFmode */ 406 {4, 4, 6}, /* cost of storing fp registers 407 in SFmode, DFmode and XFmode */ 408 8, /* cost of moving MMX register */ 409 {8, 8}, /* cost of loading MMX registers 410 in SImode and DImode */ 411 {8, 8}, /* cost of storing MMX registers 412 in SImode and DImode */ 413 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 414 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 415 in 32,64,128,256 and 512-bit */ 416 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 417 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 418 in 32,64,128,256 and 512-bit */ 419 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 420 3, 3, /* SSE->integer and integer->SSE moves */ 421 4, 4, /* Gather load static, per_elt. */ 422 4, 4, /* Gather store static, per_elt. */ 423 8, /* size of l1 cache. */ 424 8, /* size of l2 cache */ 425 0, /* size of prefetch block */ 426 0, /* number of parallel prefetches */ 427 2, /* Branch cost */ 428 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 429 COSTS_N_INSNS (3), /* cost of FMUL instruction. */ 430 COSTS_N_INSNS (39), /* cost of FDIV instruction. */ 431 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 432 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 433 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ 434 435 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 436 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 437 COSTS_N_INSNS (5), /* cost of MULSS instruction. */ 438 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 439 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */ 440 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */ 441 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 442 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 443 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. 
*/ 444 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 445 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 446 pentium_memcpy, 447 pentium_memset, 448 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 449 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 450 }; 451 452 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes 453 (we ensure the alignment). For small blocks an inline loop is still a 454 noticeable win; for bigger blocks either rep movsl or rep movsb is 455 the way to go. Rep movsb apparently has a more expensive startup time in the CPU, 456 but after 4K the difference is down in the noise. */ 457 static stringop_algs pentiumpro_memcpy[2] = { 458 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, 459 {8192, rep_prefix_4_byte, false}, 460 {-1, rep_prefix_1_byte, false}}}, 461 DUMMY_STRINGOP_ALGS}; 462 static stringop_algs pentiumpro_memset[2] = { 463 {rep_prefix_4_byte, {{1024, unrolled_loop, false}, 464 {8192, rep_prefix_4_byte, false}, 465 {-1, libcall, false}}}, 466 DUMMY_STRINGOP_ALGS}; 467 static const 468 struct processor_costs pentiumpro_cost = { 469 COSTS_N_INSNS (1), /* cost of an add instruction */ 470 COSTS_N_INSNS (1), /* cost of a lea instruction */ 471 COSTS_N_INSNS (1), /* variable shift costs */ 472 COSTS_N_INSNS (1), /* constant shift costs */ 473 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 474 COSTS_N_INSNS (4), /* HI */ 475 COSTS_N_INSNS (4), /* SI */ 476 COSTS_N_INSNS (4), /* DI */ 477 COSTS_N_INSNS (4)}, /* other */ 478 0, /* cost of multiply per each bit set */ 479 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ 480 COSTS_N_INSNS (17), /* HI */ 481 COSTS_N_INSNS (17), /* SI */ 482 COSTS_N_INSNS (17), /* DI */ 483 COSTS_N_INSNS (17)}, /* other */ 484 COSTS_N_INSNS (1), /* cost of movsx */ 485 COSTS_N_INSNS (1), /* cost of movzx */ 486 8, /* "large" insn */ 487 6, /* MOVE_RATIO */ 488 489 /* All move costs are relative to integer->integer move times 2 and thus 490 they are 
latency*2. */ 491 2, /* cost for loading QImode using movzbl */ 492 {4, 4, 4}, /* cost of loading integer registers 493 in QImode, HImode and SImode. 494 Relative to reg-reg move (2). */ 495 {2, 2, 2}, /* cost of storing integer registers */ 496 2, /* cost of reg,reg fld/fst */ 497 {2, 2, 6}, /* cost of loading fp registers 498 in SFmode, DFmode and XFmode */ 499 {4, 4, 6}, /* cost of storing fp registers 500 in SFmode, DFmode and XFmode */ 501 2, /* cost of moving MMX register */ 502 {2, 2}, /* cost of loading MMX registers 503 in SImode and DImode */ 504 {2, 2}, /* cost of storing MMX registers 505 in SImode and DImode */ 506 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 507 {4, 8, 16, 32, 64}, /* cost of loading SSE registers 508 in 32,64,128,256 and 512-bit */ 509 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ 510 {4, 8, 16, 32, 64}, /* cost of storing SSE registers 511 in 32,64,128,256 and 512-bit */ 512 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 513 3, 3, /* SSE->integer and integer->SSE moves */ 514 4, 4, /* Gather load static, per_elt. */ 515 4, 4, /* Gather store static, per_elt. */ 516 8, /* size of l1 cache. */ 517 256, /* size of l2 cache */ 518 32, /* size of prefetch block */ 519 6, /* number of parallel prefetches */ 520 2, /* Branch cost */ 521 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 522 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 523 COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 524 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 525 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 526 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 527 528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 530 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 532 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 533 COSTS_N_INSNS (7), /* cost of FMA SD instruction. 
*/ 534 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 535 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */ 536 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 537 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */ 538 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 539 pentiumpro_memcpy, 540 pentiumpro_memset, 541 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 542 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 543 }; 544 545 static stringop_algs geode_memcpy[2] = { 546 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 547 DUMMY_STRINGOP_ALGS}; 548 static stringop_algs geode_memset[2] = { 549 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 550 DUMMY_STRINGOP_ALGS}; 551 static const 552 struct processor_costs geode_cost = { 553 COSTS_N_INSNS (1), /* cost of an add instruction */ 554 COSTS_N_INSNS (1), /* cost of a lea instruction */ 555 COSTS_N_INSNS (2), /* variable shift costs */ 556 COSTS_N_INSNS (1), /* constant shift costs */ 557 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 558 COSTS_N_INSNS (4), /* HI */ 559 COSTS_N_INSNS (7), /* SI */ 560 COSTS_N_INSNS (7), /* DI */ 561 COSTS_N_INSNS (7)}, /* other */ 562 0, /* cost of multiply per each bit set */ 563 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ 564 COSTS_N_INSNS (23), /* HI */ 565 COSTS_N_INSNS (39), /* SI */ 566 COSTS_N_INSNS (39), /* DI */ 567 COSTS_N_INSNS (39)}, /* other */ 568 COSTS_N_INSNS (1), /* cost of movsx */ 569 COSTS_N_INSNS (1), /* cost of movzx */ 570 8, /* "large" insn */ 571 4, /* MOVE_RATIO */ 572 573 /* All move costs are relative to integer->integer move times 2 and thus 574 they are latency*2. */ 575 2, /* cost for loading QImode using movzbl */ 576 {2, 2, 2}, /* cost of loading integer registers 577 in QImode, HImode and SImode. 578 Relative to reg-reg move (2). 
*/ 579 {2, 2, 2}, /* cost of storing integer registers */ 580 2, /* cost of reg,reg fld/fst */ 581 {2, 2, 2}, /* cost of loading fp registers 582 in SFmode, DFmode and XFmode */ 583 {4, 6, 6}, /* cost of storing fp registers 584 in SFmode, DFmode and XFmode */ 585 586 2, /* cost of moving MMX register */ 587 {2, 2}, /* cost of loading MMX registers 588 in SImode and DImode */ 589 {2, 2}, /* cost of storing MMX registers 590 in SImode and DImode */ 591 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 592 {2, 2, 8, 16, 32}, /* cost of loading SSE registers 593 in 32,64,128,256 and 512-bit */ 594 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ 595 {2, 2, 8, 16, 32}, /* cost of storing SSE registers 596 in 32,64,128,256 and 512-bit */ 597 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 598 6, 6, /* SSE->integer and integer->SSE moves */ 599 2, 2, /* Gather load static, per_elt. */ 600 2, 2, /* Gather store static, per_elt. */ 601 64, /* size of l1 cache. */ 602 128, /* size of l2 cache. */ 603 32, /* size of prefetch block */ 604 1, /* number of parallel prefetches */ 605 1, /* Branch cost */ 606 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 607 COSTS_N_INSNS (11), /* cost of FMUL instruction. */ 608 COSTS_N_INSNS (47), /* cost of FDIV instruction. */ 609 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 610 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 611 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ 612 613 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 614 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 615 COSTS_N_INSNS (11), /* cost of MULSS instruction. */ 616 COSTS_N_INSNS (11), /* cost of MULSD instruction. */ 617 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */ 618 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */ 619 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */ 620 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */ 621 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. 
*/ 622 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */ 623 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 624 geode_memcpy, 625 geode_memset, 626 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 627 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 628 }; 629 630 static stringop_algs k6_memcpy[2] = { 631 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 632 DUMMY_STRINGOP_ALGS}; 633 static stringop_algs k6_memset[2] = { 634 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 635 DUMMY_STRINGOP_ALGS}; 636 static const 637 struct processor_costs k6_cost = { 638 COSTS_N_INSNS (1), /* cost of an add instruction */ 639 COSTS_N_INSNS (2), /* cost of a lea instruction */ 640 COSTS_N_INSNS (1), /* variable shift costs */ 641 COSTS_N_INSNS (1), /* constant shift costs */ 642 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 643 COSTS_N_INSNS (3), /* HI */ 644 COSTS_N_INSNS (3), /* SI */ 645 COSTS_N_INSNS (3), /* DI */ 646 COSTS_N_INSNS (3)}, /* other */ 647 0, /* cost of multiply per each bit set */ 648 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 649 COSTS_N_INSNS (18), /* HI */ 650 COSTS_N_INSNS (18), /* SI */ 651 COSTS_N_INSNS (18), /* DI */ 652 COSTS_N_INSNS (18)}, /* other */ 653 COSTS_N_INSNS (2), /* cost of movsx */ 654 COSTS_N_INSNS (2), /* cost of movzx */ 655 8, /* "large" insn */ 656 4, /* MOVE_RATIO */ 657 658 /* All move costs are relative to integer->integer move times 2 and thus 659 they are latency*2. */ 660 3, /* cost for loading QImode using movzbl */ 661 {4, 5, 4}, /* cost of loading integer registers 662 in QImode, HImode and SImode. 663 Relative to reg-reg move (2). 
*/ 664 {2, 3, 2}, /* cost of storing integer registers */ 665 4, /* cost of reg,reg fld/fst */ 666 {6, 6, 6}, /* cost of loading fp registers 667 in SFmode, DFmode and XFmode */ 668 {4, 4, 4}, /* cost of storing fp registers 669 in SFmode, DFmode and XFmode */ 670 2, /* cost of moving MMX register */ 671 {2, 2}, /* cost of loading MMX registers 672 in SImode and DImode */ 673 {2, 2}, /* cost of storing MMX registers 674 in SImode and DImode */ 675 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 676 {2, 2, 8, 16, 32}, /* cost of loading SSE registers 677 in 32,64,128,256 and 512-bit */ 678 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ 679 {2, 2, 8, 16, 32}, /* cost of storing SSE registers 680 in 32,64,128,256 and 512-bit */ 681 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 682 6, 6, /* SSE->integer and integer->SSE moves */ 683 2, 2, /* Gather load static, per_elt. */ 684 2, 2, /* Gather store static, per_elt. */ 685 32, /* size of l1 cache. */ 686 32, /* size of l2 cache. Some models 687 have integrated l2 cache, but 688 optimizing for k6 is not important 689 enough to worry about that. */ 690 32, /* size of prefetch block */ 691 1, /* number of parallel prefetches */ 692 1, /* Branch cost */ 693 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ 694 COSTS_N_INSNS (2), /* cost of FMUL instruction. */ 695 COSTS_N_INSNS (56), /* cost of FDIV instruction. */ 696 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 697 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 698 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ 699 700 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 701 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ 702 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 703 COSTS_N_INSNS (2), /* cost of MULSD instruction. */ 704 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 705 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 706 COSTS_N_INSNS (56), /* cost of DIVSS instruction. 
*/ 707 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */ 708 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */ 709 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */ 710 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 711 k6_memcpy, 712 k6_memset, 713 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 714 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 715 }; 716 717 /* For some reason, Athlon deals better with REP prefix (relative to loops) 718 compared to K8. Alignment becomes important after 8 bytes for memcpy and 719 128 bytes for memset. */ 720 static stringop_algs athlon_memcpy[2] = { 721 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 722 DUMMY_STRINGOP_ALGS}; 723 static stringop_algs athlon_memset[2] = { 724 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 725 DUMMY_STRINGOP_ALGS}; 726 static const 727 struct processor_costs athlon_cost = { 728 COSTS_N_INSNS (1), /* cost of an add instruction */ 729 COSTS_N_INSNS (2), /* cost of a lea instruction */ 730 COSTS_N_INSNS (1), /* variable shift costs */ 731 COSTS_N_INSNS (1), /* constant shift costs */ 732 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ 733 COSTS_N_INSNS (5), /* HI */ 734 COSTS_N_INSNS (5), /* SI */ 735 COSTS_N_INSNS (5), /* DI */ 736 COSTS_N_INSNS (5)}, /* other */ 737 0, /* cost of multiply per each bit set */ 738 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 739 COSTS_N_INSNS (26), /* HI */ 740 COSTS_N_INSNS (42), /* SI */ 741 COSTS_N_INSNS (74), /* DI */ 742 COSTS_N_INSNS (74)}, /* other */ 743 COSTS_N_INSNS (1), /* cost of movsx */ 744 COSTS_N_INSNS (1), /* cost of movzx */ 745 8, /* "large" insn */ 746 9, /* MOVE_RATIO */ 747 748 /* All move costs are relative to integer->integer move times 2 and thus 749 they are latency*2. */ 750 4, /* cost for loading QImode using movzbl */ 751 {3, 4, 3}, /* cost of loading integer registers 752 in QImode, HImode and SImode. 753 Relative to reg-reg move (2). 
*/ 754 {3, 4, 3}, /* cost of storing integer registers */ 755 4, /* cost of reg,reg fld/fst */ 756 {4, 4, 12}, /* cost of loading fp registers 757 in SFmode, DFmode and XFmode */ 758 {6, 6, 8}, /* cost of storing fp registers 759 in SFmode, DFmode and XFmode */ 760 2, /* cost of moving MMX register */ 761 {4, 4}, /* cost of loading MMX registers 762 in SImode and DImode */ 763 {4, 4}, /* cost of storing MMX registers 764 in SImode and DImode */ 765 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 766 {4, 4, 6, 12, 24}, /* cost of loading SSE registers 767 in 32,64,128,256 and 512-bit */ 768 {4, 4, 6, 12, 24}, /* cost of unaligned loads. */ 769 {4, 4, 5, 10, 20}, /* cost of storing SSE registers 770 in 32,64,128,256 and 512-bit */ 771 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 772 5, 5, /* SSE->integer and integer->SSE moves */ 773 4, 4, /* Gather load static, per_elt. */ 774 4, 4, /* Gather store static, per_elt. */ 775 64, /* size of l1 cache. */ 776 256, /* size of l2 cache. */ 777 64, /* size of prefetch block */ 778 6, /* number of parallel prefetches */ 779 5, /* Branch cost */ 780 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 781 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 782 COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 783 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 784 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 785 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 786 787 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 788 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 789 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 790 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 791 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 792 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 793 /* 11-16 */ 794 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 795 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */ 796 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. 
*/ 797 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */ 798 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 799 athlon_memcpy, 800 athlon_memset, 801 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 802 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 803 }; 804 805 /* K8 has optimized REP instruction for medium sized blocks, but for very 806 small blocks it is better to use loop. For large blocks, libcall can 807 do nontemporary accesses and beat inline considerably. */ 808 static stringop_algs k8_memcpy[2] = { 809 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 810 {-1, rep_prefix_4_byte, false}}}, 811 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 812 {-1, libcall, false}}}}; 813 static stringop_algs k8_memset[2] = { 814 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 815 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 816 {libcall, {{48, unrolled_loop, false}, 817 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 818 static const 819 struct processor_costs k8_cost = { 820 COSTS_N_INSNS (1), /* cost of an add instruction */ 821 COSTS_N_INSNS (2), /* cost of a lea instruction */ 822 COSTS_N_INSNS (1), /* variable shift costs */ 823 COSTS_N_INSNS (1), /* constant shift costs */ 824 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 825 COSTS_N_INSNS (4), /* HI */ 826 COSTS_N_INSNS (3), /* SI */ 827 COSTS_N_INSNS (4), /* DI */ 828 COSTS_N_INSNS (5)}, /* other */ 829 0, /* cost of multiply per each bit set */ 830 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 831 COSTS_N_INSNS (26), /* HI */ 832 COSTS_N_INSNS (42), /* SI */ 833 COSTS_N_INSNS (74), /* DI */ 834 COSTS_N_INSNS (74)}, /* other */ 835 COSTS_N_INSNS (1), /* cost of movsx */ 836 COSTS_N_INSNS (1), /* cost of movzx */ 837 8, /* "large" insn */ 838 9, /* MOVE_RATIO */ 839 840 /* All move costs are relative to integer->integer move times 2 and thus 841 they are latency*2. 
*/ 842 4, /* cost for loading QImode using movzbl */ 843 {3, 4, 3}, /* cost of loading integer registers 844 in QImode, HImode and SImode. 845 Relative to reg-reg move (2). */ 846 {3, 4, 3}, /* cost of storing integer registers */ 847 4, /* cost of reg,reg fld/fst */ 848 {4, 4, 12}, /* cost of loading fp registers 849 in SFmode, DFmode and XFmode */ 850 {6, 6, 8}, /* cost of storing fp registers 851 in SFmode, DFmode and XFmode */ 852 2, /* cost of moving MMX register */ 853 {3, 3}, /* cost of loading MMX registers 854 in SImode and DImode */ 855 {4, 4}, /* cost of storing MMX registers 856 in SImode and DImode */ 857 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 858 {4, 3, 6, 12, 24}, /* cost of loading SSE registers 859 in 32,64,128,256 and 512-bit */ 860 {4, 3, 6, 12, 24}, /* cost of unaligned loads. */ 861 {4, 4, 5, 10, 20}, /* cost of storing SSE registers 862 in 32,64,128,256 and 512-bit */ 863 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 864 5, 5, /* SSE->integer and integer->SSE moves */ 865 4, 4, /* Gather load static, per_elt. */ 866 4, 4, /* Gather store static, per_elt. */ 867 64, /* size of l1 cache. */ 868 512, /* size of l2 cache. */ 869 64, /* size of prefetch block */ 870 /* New AMD processors never drop prefetches; if they cannot be performed 871 immediately, they are queued. We set number of simultaneous prefetches 872 to a large constant to reflect this (it probably is not a good idea not 873 to limit number of prefetches at all, as their execution also takes some 874 time). */ 875 100, /* number of parallel prefetches */ 876 3, /* Branch cost */ 877 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 878 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 879 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 880 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 881 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 882 COSTS_N_INSNS (35), /* cost of FSQRT instruction. 
*/ 883 884 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 885 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 886 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 887 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 888 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 889 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 890 /* 11-16 */ 891 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 892 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 893 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 894 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 895 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 896 k8_memcpy, 897 k8_memset, 898 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 899 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 900 }; 901 902 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for 903 very small blocks it is better to use loop. For large blocks, libcall can 904 do nontemporary accesses and beat inline considerably. 
*/ 905 /* NOTE(review): each of the two rows looks like a default algorithm followed by {size limit, algorithm, flag} entries, with -1 as the open-ended last limit -- confirm against the stringop_algs declaration. */ static stringop_algs amdfam10_memcpy[2] = { 906 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 907 {-1, rep_prefix_4_byte, false}}}, 908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 909 {-1, libcall, false}}}}; 910 static stringop_algs amdfam10_memset[2] = { 911 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 912 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 913 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 914 {-1, libcall, false}}}}; 915 /* Per-operation cost table for AMDFAM10; entries are positional and each is labelled by the comment that follows it. NOTE(review): unlike athlon_cost and k8_cost earlier in this file, this table is not declared static const -- confirm that is intentional. */ struct processor_costs amdfam10_cost = { 916 COSTS_N_INSNS (1), /* cost of an add instruction */ 917 COSTS_N_INSNS (2), /* cost of a lea instruction */ 918 COSTS_N_INSNS (1), /* variable shift costs */ 919 COSTS_N_INSNS (1), /* constant shift costs */ 920 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 921 COSTS_N_INSNS (4), /* HI */ 922 COSTS_N_INSNS (3), /* SI */ 923 COSTS_N_INSNS (4), /* DI */ 924 COSTS_N_INSNS (5)}, /* other */ 925 0, /* cost of multiply per each bit set */ 926 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 927 COSTS_N_INSNS (35), /* HI */ 928 COSTS_N_INSNS (51), /* SI */ 929 COSTS_N_INSNS (83), /* DI */ 930 COSTS_N_INSNS (83)}, /* other */ 931 COSTS_N_INSNS (1), /* cost of movsx */ 932 COSTS_N_INSNS (1), /* cost of movzx */ 933 8, /* "large" insn */ 934 9, /* MOVE_RATIO */ 935 936 /* All move costs are relative to integer->integer move times 2 and thus 937 they are latency*2. */ 938 4, /* cost for loading QImode using movzbl */ 939 {3, 4, 3}, /* cost of loading integer registers 940 in QImode, HImode and SImode. 941 Relative to reg-reg move (2).
*/ 942 {3, 4, 3}, /* cost of storing integer registers */ 943 4, /* cost of reg,reg fld/fst */ 944 {4, 4, 12}, /* cost of loading fp registers 945 in SFmode, DFmode and XFmode */ 946 {6, 6, 8}, /* cost of storing fp registers 947 in SFmode, DFmode and XFmode */ 948 2, /* cost of moving MMX register */ 949 {3, 3}, /* cost of loading MMX registers 950 in SImode and DImode */ 951 {4, 4}, /* cost of storing MMX registers 952 in SImode and DImode */ 953 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 954 {4, 4, 3, 6, 12}, /* cost of loading SSE registers 955 in 32,64,128,256 and 512-bit */ 956 {4, 4, 3, 7, 12}, /* cost of unaligned loads; only the 256-bit entry (7) differs from the aligned load costs above. */ 957 {4, 4, 5, 10, 20}, /* cost of storing SSE registers 958 in 32,64,128,256 and 512-bit */ 959 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 960 3, 3, /* SSE->integer and integer->SSE moves */ 961 /* On K8: 962 MOVD reg64, xmmreg Double FSTORE 4 963 MOVD reg32, xmmreg Double FSTORE 4 964 On AMDFAM10: 965 MOVD reg64, xmmreg Double FADD 3 966 1/1 1/1 967 MOVD reg32, xmmreg Double FADD 3 968 1/1 1/1 */ 969 4, 4, /* Gather load static, per_elt. */ 970 4, 4, /* Gather store static, per_elt. */ 971 64, /* size of l1 cache. */ 972 512, /* size of l2 cache. */ 973 64, /* size of prefetch block */ 974 /* New AMD processors never drop prefetches; if they cannot be performed 975 immediately, they are queued. We set number of simultaneous prefetches 976 to a large constant to reflect this (it probably is not a good idea not 977 to limit number of prefetches at all, as their execution also takes some 978 time). */ 979 100, /* number of parallel prefetches */ 980 2, /* Branch cost */ 981 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 982 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 983 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 984 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 985 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 986 COSTS_N_INSNS (35), /* cost of FSQRT instruction.
*/ 987 988 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 989 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 990 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 991 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 992 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ 993 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ 994 /* 11-16 (presumably the DIVSS latency range; the cost below uses the upper bound -- confirm) */ 995 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ 996 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 997 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ 998 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ 999 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1000 amdfam10_memcpy, 1001 amdfam10_memset, 1002 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1003 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1004 }; 1005 1006 /* BDVER1 has optimized REP instruction for medium sized blocks, but for 1007 very small blocks it is better to use loop. For large blocks, libcall 1008 can do nontemporary accesses and beat inline considerably.
*/ 1009 /* NOTE(review): each of the two rows looks like a default algorithm followed by {size limit, algorithm, flag} entries, with -1 as the open-ended last limit -- confirm against the stringop_algs declaration. */ static stringop_algs bdver1_memcpy[2] = { 1010 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1011 {-1, rep_prefix_4_byte, false}}}, 1012 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1013 {-1, libcall, false}}}}; 1014 static stringop_algs bdver1_memset[2] = { 1015 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1016 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1017 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1018 {-1, libcall, false}}}}; 1019 1020 /* Per-operation cost table for BDVER1; entries are positional and each is labelled by the comment that follows it. */ const struct processor_costs bdver1_cost = { 1021 COSTS_N_INSNS (1), /* cost of an add instruction */ 1022 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1023 COSTS_N_INSNS (1), /* variable shift costs */ 1024 COSTS_N_INSNS (1), /* constant shift costs */ 1025 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1026 COSTS_N_INSNS (4), /* HI */ 1027 COSTS_N_INSNS (4), /* SI */ 1028 COSTS_N_INSNS (6), /* DI */ 1029 COSTS_N_INSNS (6)}, /* other */ 1030 0, /* cost of multiply per each bit set */ 1031 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1032 COSTS_N_INSNS (35), /* HI */ 1033 COSTS_N_INSNS (51), /* SI */ 1034 COSTS_N_INSNS (83), /* DI */ 1035 COSTS_N_INSNS (83)}, /* other */ 1036 COSTS_N_INSNS (1), /* cost of movsx */ 1037 COSTS_N_INSNS (1), /* cost of movzx */ 1038 8, /* "large" insn */ 1039 9, /* MOVE_RATIO */ 1040 1041 /* All move costs are relative to integer->integer move times 2 and thus 1042 they are latency*2. */ 1043 8, /* cost for loading QImode using movzbl */ 1044 {8, 8, 8}, /* cost of loading integer registers 1045 in QImode, HImode and SImode. 1046 Relative to reg-reg move (2).
*/ 1047 {8, 8, 8}, /* cost of storing integer registers */ 1048 4, /* cost of reg,reg fld/fst */ 1049 {12, 12, 28}, /* cost of loading fp registers 1050 in SFmode, DFmode and XFmode */ 1051 {10, 10, 18}, /* cost of storing fp registers 1052 in SFmode, DFmode and XFmode */ 1053 4, /* cost of moving MMX register */ 1054 {12, 12}, /* cost of loading MMX registers 1055 in SImode and DImode */ 1056 {10, 10}, /* cost of storing MMX registers 1057 in SImode and DImode */ 1058 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1059 {12, 12, 10, 20, 30}, /* cost of loading SSE registers 1060 in 32,64,128,256 and 512-bit */ 1061 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ 1062 {10, 10, 10, 20, 30}, /* cost of storing SSE registers 1063 in 32,64,128,256 and 512-bit */ 1064 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ 1065 16, 20, /* SSE->integer and integer->SSE moves */ 1066 12, 12, /* Gather load static, per_elt. */ 1067 10, 10, /* Gather store static, per_elt. */ 1068 16, /* size of l1 cache. */ 1069 2048, /* size of l2 cache. */ 1070 64, /* size of prefetch block */ 1071 /* New AMD processors never drop prefetches; if they cannot be performed 1072 immediately, they are queued. We set number of simultaneous prefetches 1073 to a large constant to reflect this (it probably is not a good idea not 1074 to limit number of prefetches at all, as their execution also takes some 1075 time). */ 1076 100, /* number of parallel prefetches */ 1077 2, /* Branch cost */ 1078 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1079 COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1080 COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1081 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1082 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1083 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1084 1085 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1086 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns.
*/ 1087 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1088 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1089 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1090 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1091 /* 9-24 */ 1092 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1093 /* 9-27 */ 1094 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1095 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1096 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1097 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1098 bdver1_memcpy, 1099 bdver1_memset, 1100 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1101 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1102 }; 1103 1104 /* BDVER2 has optimized REP instruction for medium sized blocks, but for 1105 very small blocks it is better to use loop. For large blocks, libcall 1106 can do non-temporal accesses and beat inline considerably. */ 1107 1108 static stringop_algs bdver2_memcpy[2] = { 1109 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1110 {-1, rep_prefix_4_byte, false}}}, 1111 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1112 {-1, libcall, false}}}}; 1113 static stringop_algs bdver2_memset[2] = { 1114 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1115 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1116 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1117 {-1, libcall, false}}}}; 1118 1119 /* Per-operation cost table for BDVER2. NOTE(review): every entry appears to match bdver1_cost, and bdver2_memcpy/bdver2_memset match their bdver1 counterparts -- confirm before changing one table without the other. */ const struct processor_costs bdver2_cost = { 1120 COSTS_N_INSNS (1), /* cost of an add instruction */ 1121 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1122 COSTS_N_INSNS (1), /* variable shift costs */ 1123 COSTS_N_INSNS (1), /* constant shift costs */ 1124 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1125 COSTS_N_INSNS (4), /* HI */ 1126 COSTS_N_INSNS (4), /* SI */ 1127 COSTS_N_INSNS (6), /* DI */ 1128 COSTS_N_INSNS (6)}, /* other */ 1129 0, /* cost of multiply per each bit set */ 1130
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1131 COSTS_N_INSNS (35), /* HI */ 1132 COSTS_N_INSNS (51), /* SI */ 1133 COSTS_N_INSNS (83), /* DI */ 1134 COSTS_N_INSNS (83)}, /* other */ 1135 COSTS_N_INSNS (1), /* cost of movsx */ 1136 COSTS_N_INSNS (1), /* cost of movzx */ 1137 8, /* "large" insn */ 1138 9, /* MOVE_RATIO */ 1139 1140 /* All move costs are relative to integer->integer move times 2 and thus 1141 they are latency*2. */ 1142 8, /* cost for loading QImode using movzbl */ 1143 {8, 8, 8}, /* cost of loading integer registers 1144 in QImode, HImode and SImode. 1145 Relative to reg-reg move (2). */ 1146 {8, 8, 8}, /* cost of storing integer registers */ 1147 4, /* cost of reg,reg fld/fst */ 1148 {12, 12, 28}, /* cost of loading fp registers 1149 in SFmode, DFmode and XFmode */ 1150 {10, 10, 18}, /* cost of storing fp registers 1151 in SFmode, DFmode and XFmode */ 1152 4, /* cost of moving MMX register */ 1153 {12, 12}, /* cost of loading MMX registers 1154 in SImode and DImode */ 1155 {10, 10}, /* cost of storing MMX registers 1156 in SImode and DImode */ 1157 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1158 {12, 12, 10, 20, 30}, /* cost of loading SSE registers 1159 in 32,64,128,256 and 512-bit */ 1160 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ 1161 {10, 10, 10, 20, 30}, /* cost of storing SSE registers 1162 in 32,64,128,256 and 512-bit */ 1163 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ 1164 16, 20, /* SSE->integer and integer->SSE moves */ 1165 12, 12, /* Gather load static, per_elt. */ 1166 10, 10, /* Gather store static, per_elt. */ 1167 16, /* size of l1 cache. */ 1168 2048, /* size of l2 cache. */ 1169 64, /* size of prefetch block */ 1170 /* New AMD processors never drop prefetches; if they cannot be performed 1171 immediately, they are queued.
We set number of simultaneous prefetches 1172 to a large constant to reflect this (it probably is not a good idea not 1173 to limit number of prefetches at all, as their execution also takes some 1174 time). */ 1175 100, /* number of parallel prefetches */ 1176 2, /* Branch cost */ 1177 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1178 COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1179 COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1180 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1181 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1182 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1183 1184 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1185 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 1186 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1187 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1188 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1189 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1190 /* 9-24 */ 1191 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1192 /* 9-27 */ 1193 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1194 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1195 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1196 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1197 bdver2_memcpy, 1198 bdver2_memset, 1199 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1200 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1201 }; 1202 1203 1204 /* BDVER3 has optimized REP instruction for medium sized blocks, but for 1205 very small blocks it is better to use loop. For large blocks, libcall 1206 can do nontemporary accesses and beat inline considerably.
*/ 1207 /* NOTE(review): each of the two rows looks like a default algorithm followed by {size limit, algorithm, flag} entries, with -1 as the open-ended last limit -- confirm against the stringop_algs declaration. */ static stringop_algs bdver3_memcpy[2] = { 1208 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1209 {-1, rep_prefix_4_byte, false}}}, 1210 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1211 {-1, libcall, false}}}}; 1212 static stringop_algs bdver3_memset[2] = { 1213 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1214 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1215 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1216 {-1, libcall, false}}}}; 1217 /* Per-operation cost table for BDVER3; entries are positional and each is labelled by the comment that follows it. NOTE(review): declared without const, unlike bdver1_cost and bdver2_cost earlier in this file -- confirm that is intentional. */ struct processor_costs bdver3_cost = { 1218 COSTS_N_INSNS (1), /* cost of an add instruction */ 1219 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1220 COSTS_N_INSNS (1), /* variable shift costs */ 1221 COSTS_N_INSNS (1), /* constant shift costs */ 1222 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1223 COSTS_N_INSNS (4), /* HI */ 1224 COSTS_N_INSNS (4), /* SI */ 1225 COSTS_N_INSNS (6), /* DI */ 1226 COSTS_N_INSNS (6)}, /* other */ 1227 0, /* cost of multiply per each bit set */ 1228 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1229 COSTS_N_INSNS (35), /* HI */ 1230 COSTS_N_INSNS (51), /* SI */ 1231 COSTS_N_INSNS (83), /* DI */ 1232 COSTS_N_INSNS (83)}, /* other */ 1233 COSTS_N_INSNS (1), /* cost of movsx */ 1234 COSTS_N_INSNS (1), /* cost of movzx */ 1235 8, /* "large" insn */ 1236 9, /* MOVE_RATIO */ 1237 1238 /* All move costs are relative to integer->integer move times 2 and thus 1239 they are latency*2. */ 1240 8, /* cost for loading QImode using movzbl */ 1241 {8, 8, 8}, /* cost of loading integer registers 1242 in QImode, HImode and SImode. 1243 Relative to reg-reg move (2).
*/ 1244 {8, 8, 8}, /* cost of storing integer registers */ 1245 4, /* cost of reg,reg fld/fst */ 1246 {12, 12, 28}, /* cost of loading fp registers 1247 in SFmode, DFmode and XFmode */ 1248 {10, 10, 18}, /* cost of storing fp registers 1249 in SFmode, DFmode and XFmode */ 1250 4, /* cost of moving MMX register */ 1251 {12, 12}, /* cost of loading MMX registers 1252 in SImode and DImode */ 1253 {10, 10}, /* cost of storing MMX registers 1254 in SImode and DImode */ 1255 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1256 {12, 12, 10, 20, 30}, /* cost of loading SSE registers 1257 in 32,64,128,256 and 512-bit */ 1258 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ 1259 {10, 10, 10, 20, 30}, /* cost of storing SSE registers 1260 in 32,64,128,256 and 512-bit */ 1261 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ 1262 16, 20, /* SSE->integer and integer->SSE moves */ 1263 12, 12, /* Gather load static, per_elt. */ 1264 10, 10, /* Gather store static, per_elt. */ 1265 16, /* size of l1 cache. */ 1266 2048, /* size of l2 cache. */ 1267 64, /* size of prefetch block */ 1268 /* New AMD processors never drop prefetches; if they cannot be performed 1269 immediately, they are queued. We set number of simultaneous prefetches 1270 to a large constant to reflect this (it probably is not a good idea not 1271 to limit number of prefetches at all, as their execution also takes some 1272 time). */ 1273 100, /* number of parallel prefetches */ 1274 2, /* Branch cost */ 1275 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1276 COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1277 COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1278 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1279 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1280 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1281 1282 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1283 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns.
*/ 1284 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1285 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1286 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1287 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1288 /* 9-24 */ 1289 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1290 /* 9-27 */ 1291 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1292 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1293 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1294 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1295 bdver3_memcpy, 1296 bdver3_memset, 1297 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1298 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1299 }; 1300 1301 /* BDVER4 has optimized REP instruction for medium sized blocks, but for 1302 very small blocks it is better to use loop. For large blocks, libcall 1303 can do non-temporal accesses and beat inline considerably. */ 1304 static stringop_algs bdver4_memcpy[2] = { 1305 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1306 {-1, rep_prefix_4_byte, false}}}, 1307 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1308 {-1, libcall, false}}}}; 1309 static stringop_algs bdver4_memset[2] = { 1310 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1311 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1312 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1313 {-1, libcall, false}}}}; 1314 /* Per-operation cost table for BDVER4. NOTE(review): every entry appears to match bdver3_cost, and bdver4_memcpy/bdver4_memset match their bdver3 counterparts -- confirm before changing one table without the other. */ struct processor_costs bdver4_cost = { 1315 COSTS_N_INSNS (1), /* cost of an add instruction */ 1316 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1317 COSTS_N_INSNS (1), /* variable shift costs */ 1318 COSTS_N_INSNS (1), /* constant shift costs */ 1319 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ 1320 COSTS_N_INSNS (4), /* HI */ 1321 COSTS_N_INSNS (4), /* SI */ 1322 COSTS_N_INSNS (6), /* DI */ 1323 COSTS_N_INSNS (6)}, /* other */ 1324 0, /* cost of multiply per each bit set */ 1325 {COSTS_N_INSNS
(19), /* cost of a divide/mod for QI */ 1326 COSTS_N_INSNS (35), /* HI */ 1327 COSTS_N_INSNS (51), /* SI */ 1328 COSTS_N_INSNS (83), /* DI */ 1329 COSTS_N_INSNS (83)}, /* other */ 1330 COSTS_N_INSNS (1), /* cost of movsx */ 1331 COSTS_N_INSNS (1), /* cost of movzx */ 1332 8, /* "large" insn */ 1333 9, /* MOVE_RATIO */ 1334 1335 /* All move costs are relative to integer->integer move times 2 and thus 1336 they are latency*2. */ 1337 8, /* cost for loading QImode using movzbl */ 1338 {8, 8, 8}, /* cost of loading integer registers 1339 in QImode, HImode and SImode. 1340 Relative to reg-reg move (2). */ 1341 {8, 8, 8}, /* cost of storing integer registers */ 1342 4, /* cost of reg,reg fld/fst */ 1343 {12, 12, 28}, /* cost of loading fp registers 1344 in SFmode, DFmode and XFmode */ 1345 {10, 10, 18}, /* cost of storing fp registers 1346 in SFmode, DFmode and XFmode */ 1347 4, /* cost of moving MMX register */ 1348 {12, 12}, /* cost of loading MMX registers 1349 in SImode and DImode */ 1350 {10, 10}, /* cost of storing MMX registers 1351 in SImode and DImode */ 1352 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1353 {12, 12, 10, 20, 30}, /* cost of loading SSE registers 1354 in 32,64,128,256 and 512-bit */ 1355 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ 1356 {10, 10, 10, 20, 30}, /* cost of storing SSE registers 1357 in 32,64,128,256 and 512-bit */ 1358 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ 1359 16, 20, /* SSE->integer and integer->SSE moves */ 1360 12, 12, /* Gather load static, per_elt. */ 1361 10, 10, /* Gather store static, per_elt. */ 1362 16, /* size of l1 cache. */ 1363 2048, /* size of l2 cache. */ 1364 64, /* size of prefetch block */ 1365 /* New AMD processors never drop prefetches; if they cannot be performed 1366 immediately, they are queued.
We set number of simultaneous prefetches 1367 to a large constant to reflect this (it probably is not a good idea not 1368 to limit number of prefetches at all, as their execution also takes some 1369 time). */ 1370 100, /* number of parallel prefetches */ 1371 2, /* Branch cost */ 1372 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1373 COSTS_N_INSNS (6), /* cost of FMUL instruction. */ 1374 COSTS_N_INSNS (42), /* cost of FDIV instruction. */ 1375 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1376 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1377 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ 1378 1379 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1380 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ 1381 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1382 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1383 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 1384 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1385 /* 9-24 */ 1386 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ 1387 /* 9-27 */ 1388 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ 1389 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ 1390 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ 1391 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1392 bdver4_memcpy, 1393 bdver4_memset, 1394 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1395 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1396 }; 1397 1398 1399 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for 1400 very small blocks it is better to use loop. For large blocks, libcall 1401 can do nontemporary accesses and beat inline considerably.
*/ 1402 /* NOTE(review): each of the two rows looks like a default algorithm followed by {size limit, algorithm, flag} entries, with -1 as the open-ended last limit -- confirm against the stringop_algs declaration. */ static stringop_algs znver1_memcpy[2] = { 1403 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1404 {-1, rep_prefix_4_byte, false}}}, 1405 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1406 {-1, libcall, false}}}}; 1407 static stringop_algs znver1_memset[2] = { 1408 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1409 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1410 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1411 {-1, libcall, false}}}}; 1412 /* Per-operation cost table for ZNVER1; entries are positional and each is labelled by the comment that follows it. */ struct processor_costs znver1_cost = { 1413 COSTS_N_INSNS (1), /* cost of an add instruction. */ 1414 COSTS_N_INSNS (1), /* cost of a lea instruction. */ 1415 COSTS_N_INSNS (1), /* variable shift costs. */ 1416 COSTS_N_INSNS (1), /* constant shift costs. */ 1417 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ 1418 COSTS_N_INSNS (3), /* HI. */ 1419 COSTS_N_INSNS (3), /* SI. */ 1420 COSTS_N_INSNS (3), /* DI. */ 1421 COSTS_N_INSNS (3)}, /* other. */ 1422 0, /* cost of multiply per each bit 1423 set. */ 1424 /* Depending on parameters, idiv can get faster on ryzen. This is the upper 1425 bound. */ 1426 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ 1427 COSTS_N_INSNS (22), /* HI. */ 1428 COSTS_N_INSNS (30), /* SI. */ 1429 COSTS_N_INSNS (45), /* DI. */ 1430 COSTS_N_INSNS (45)}, /* other. */ 1431 COSTS_N_INSNS (1), /* cost of movsx. */ 1432 COSTS_N_INSNS (1), /* cost of movzx. */ 1433 8, /* "large" insn. */ 1434 9, /* MOVE_RATIO. */ 1435 1436 /* All move costs are relative to integer->integer move times 2 and thus 1437 they are latency*2. */ 1438 1439 /* reg-reg moves are done by renaming and thus they are even cheaper than 1440 1 cycle. Because reg-reg move cost is 2 and the following tables correspond 1441 to doubles of latencies, we do not model this correctly. It does not 1442 seem to make practical difference to bump prices up even more. */ 1443 6, /* cost for loading QImode using 1444 movzbl.
*/ 1445 {6, 6, 6}, /* cost of loading integer registers 1446 in QImode, HImode and SImode. 1447 Relative to reg-reg move (2). */ 1448 {8, 8, 8}, /* cost of storing integer 1449 registers. */ 1450 2, /* cost of reg,reg fld/fst. */ 1451 {6, 6, 16}, /* cost of loading fp registers 1452 in SFmode, DFmode and XFmode. */ 1453 {8, 8, 16}, /* cost of storing fp registers 1454 in SFmode, DFmode and XFmode. */ 1455 2, /* cost of moving MMX register. */ 1456 {6, 6}, /* cost of loading MMX registers 1457 in SImode and DImode. */ 1458 {8, 8}, /* cost of storing MMX registers 1459 in SImode and DImode. */ 1460 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 1461 {6, 6, 6, 10, 20}, /* cost of loading SSE registers 1462 in 32,64,128,256 and 512-bit. */ 1463 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ 1464 {8, 8, 8, 8, 16}, /* cost of storing SSE registers 1465 in 32,64,128,256 and 512-bit. */ 1466 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 1467 6, 6, /* SSE->integer and integer->SSE moves. */ 1468 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, 1469 throughput 12. Approx 9 uops do not depend on vector size and every load 1470 is 7 uops. NOTE(review): VGATHERDPD is named twice with different figures; the second mention is presumably another gather variant (e.g. VGATHERDPS) -- confirm. */ 1471 18, 8, /* Gather load static, per_elt. */ 1472 18, 10, /* Gather store static, per_elt. */ 1473 32, /* size of l1 cache. */ 1474 512, /* size of l2 cache. */ 1475 64, /* size of prefetch block. */ 1476 /* New AMD processors never drop prefetches; if they cannot be performed 1477 immediately, they are queued. We set number of simultaneous prefetches 1478 to a large constant to reflect this (it probably is not a good idea not 1479 to limit number of prefetches at all, as their execution also takes some 1480 time). */ 1481 100, /* number of parallel prefetches. */ 1482 3, /* Branch cost. */ 1483 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1484 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 1485 /* Latency of fdiv is 8-15. */ 1486 COSTS_N_INSNS (15), /* cost of FDIV instruction.
*/ 1487 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1488 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1489 /* Latency of fsqrt is 4-10. */ 1490 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ 1491 1492 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1493 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1494 COSTS_N_INSNS (3), /* cost of MULSS instruction. */ 1495 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1496 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1497 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1498 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ 1499 /* 9-13 */ 1500 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ 1501 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ 1502 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ 1503 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles 1504 and it can execute 2 integer additions and 2 multiplications thus 1505 reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest 1506 that 4 works better than 6 probably due to register pressure. 1507 1508 Integer vector operations are taken by FP unit and execute 3 vector 1509 plus/minus operations per cycle but only one multiply. This is adjusted 1510 in ix86_reassociation_width. */ 1511 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ 1512 znver1_memcpy, 1513 znver1_memset, 1514 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 1515 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 1516 }; 1517 1518 /* skylake_cost should produce code tuned for Skylake familly of CPUs.
*/ 1519 /* String-operation algorithm tables named in the skylake_cost initializer. NOTE(review): inner entries look like {max size, algorithm, flag} with -1 as the open-ended last entry, and the two outer elements presumably select 32-bit vs 64-bit code -- confirm against the stringop_algs declaration. */ static stringop_algs skylake_memcpy[2] = { 1520 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 1521 {libcall, {{16, loop, false}, {512, rep_prefix_8_byte, false}, 1522 {-1, libcall, false}}}}; 1523 1524 static stringop_algs skylake_memset[2] = { 1525 {libcall, {{6, loop_1_byte, true}, 1526 {24, loop, true}, 1527 {8192, rep_prefix_4_byte, true}, 1528 {-1, libcall, false}}}, 1529 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, false}, 1530 {-1, libcall, false}}}}; 1531 1532 static const 1533 struct processor_costs skylake_cost = { 1534 COSTS_N_INSNS (1), /* cost of an add instruction */ 1535 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ 1536 COSTS_N_INSNS (1), /* variable shift costs */ 1537 COSTS_N_INSNS (1), /* constant shift costs */ 1538 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1539 COSTS_N_INSNS (4), /* HI */ 1540 COSTS_N_INSNS (3), /* SI */ 1541 COSTS_N_INSNS (3), /* DI */ 1542 COSTS_N_INSNS (3)}, /* other */ 1543 0, /* cost of multiply per each bit set */ 1544 /* Expanding div/mod currently doesn't consider parallelism. So the cost 1545 model is not realistic. We compensate by increasing the latencies a bit. */ 1546 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 1547 COSTS_N_INSNS (11), /* HI */ 1548 COSTS_N_INSNS (14), /* SI */ 1549 COSTS_N_INSNS (76), /* DI */ 1550 COSTS_N_INSNS (76)}, /* other */ 1551 COSTS_N_INSNS (1), /* cost of movsx */ 1552 COSTS_N_INSNS (0), /* cost of movzx */ 1553 8, /* "large" insn */ 1554 17, /* MOVE_RATIO */ 1555 1556 6, /* cost for loading QImode using movzbl */ 1557 {4, 4, 4}, /* cost of loading integer registers 1558 in QImode, HImode and SImode. 1559 Relative to reg-reg move (2).
*/ 1560 {6, 6, 3}, /* cost of storing integer registers */ 1561 2, /* cost of reg,reg fld/fst */ 1562 {6, 6, 8}, /* cost of loading fp registers 1563 in SFmode, DFmode and XFmode */ 1564 {6, 6, 10}, /* cost of storing fp registers 1565 in SFmode, DFmode and XFmode */ 1566 2, /* cost of moving MMX register */ 1567 {6, 6}, /* cost of loading MMX registers 1568 in SImode and DImode */ 1569 {6, 6}, /* cost of storing MMX registers 1570 in SImode and DImode */ 1571 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 1572 {6, 6, 6, 10, 20}, /* cost of loading SSE registers 1573 in 32,64,128,256 and 512-bit */ 1574 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ 1575 {8, 8, 8, 12, 24}, /* cost of storing SSE registers 1576 in 32,64,128,256 and 512-bit */ 1577 {8, 8, 8, 8, 16}, /* cost of unaligned stores. NOTE(review): the 256/512-bit values here are lower than the aligned store costs just above -- confirm intended. */ 1578 2, 2, /* SSE->integer and integer->SSE moves */ 1579 20, 8, /* Gather load static, per_elt. */ 1580 22, 10, /* Gather store static, per_elt. */ 1581 64, /* size of l1 cache. */ 1582 512, /* size of l2 cache. */ 1583 64, /* size of prefetch block */ 1584 6, /* number of parallel prefetches */ 1585 3, /* Branch cost */ 1586 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 1587 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1588 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 1589 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 1590 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 1591 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ 1592 1593 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1594 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1595 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 1596 COSTS_N_INSNS (4), /* cost of MULSD instruction. */ 1597 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ 1598 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ 1599 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ 1600 COSTS_N_INSNS (14), /* cost of DIVSD instruction.
*/ 1601 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ 1602 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 1603 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 1604 skylake_memcpy, 1605 skylake_memset, 1606 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1607 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1608 }; 1609 /* BTVER1 has optimized REP instruction for medium sized blocks, but for 1610 very small blocks it is better to use loop. For large blocks, libcall can 1611 do non-temporal accesses and beat inline considerably. */ 1612 static stringop_algs btver1_memcpy[2] = { 1613 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1614 {-1, rep_prefix_4_byte, false}}}, 1615 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1616 {-1, libcall, false}}}}; 1617 static stringop_algs btver1_memset[2] = { 1618 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1619 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1620 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1621 {-1, libcall, false}}}}; 1622 const struct processor_costs btver1_cost = { 1623 COSTS_N_INSNS (1), /* cost of an add instruction */ 1624 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1625 COSTS_N_INSNS (1), /* variable shift costs */ 1626 COSTS_N_INSNS (1), /* constant shift costs */ 1627 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1628 COSTS_N_INSNS (4), /* HI */ 1629 COSTS_N_INSNS (3), /* SI */ 1630 COSTS_N_INSNS (4), /* DI */ 1631 COSTS_N_INSNS (5)}, /* other */ 1632 0, /* cost of multiply per each bit set */ 1633 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1634 COSTS_N_INSNS (35), /* HI */ 1635 COSTS_N_INSNS (51), /* SI */ 1636 COSTS_N_INSNS (83), /* DI */ 1637 COSTS_N_INSNS (83)}, /* other */ 1638 COSTS_N_INSNS (1), /* cost of movsx */ 1639 COSTS_N_INSNS (1), /* cost of movzx */ 1640 8, /* "large" insn */ 1641 9, /* MOVE_RATIO */ 1642 1643 /* All move costs are relative to
integer->integer move times 2 and thus 1644 they are latency*2. */ 1645 8, /* cost for loading QImode using movzbl */ 1646 {6, 8, 6}, /* cost of loading integer registers 1647 in QImode, HImode and SImode. 1648 Relative to reg-reg move (2). */ 1649 {6, 8, 6}, /* cost of storing integer registers */ 1650 4, /* cost of reg,reg fld/fst */ 1651 {12, 12, 28}, /* cost of loading fp registers 1652 in SFmode, DFmode and XFmode */ 1653 {12, 12, 38}, /* cost of storing fp registers 1654 in SFmode, DFmode and XFmode */ 1655 4, /* cost of moving MMX register */ 1656 {10, 10}, /* cost of loading MMX registers 1657 in SImode and DImode */ 1658 {12, 12}, /* cost of storing MMX registers 1659 in SImode and DImode */ 1660 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1661 {10, 10, 12, 24, 48}, /* cost of loading SSE registers 1662 in 32,64,128,256 and 512-bit */ 1663 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ 1664 {10, 10, 12, 24, 48}, /* cost of storing SSE registers 1665 in 32,64,128,256 and 512-bit */ 1666 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ 1667 14, 14, /* SSE->integer and integer->SSE moves */ 1668 10, 10, /* Gather load static, per_elt. */ 1669 10, 10, /* Gather store static, per_elt. */ 1670 32, /* size of l1 cache. */ 1671 512, /* size of l2 cache. */ 1672 64, /* size of prefetch block */ 1673 100, /* number of parallel prefetches */ 1674 2, /* Branch cost */ 1675 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1676 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1677 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1678 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1679 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1680 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1681 1682 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1683 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1684 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 1685 COSTS_N_INSNS (4), /* cost of MULSD instruction.
*/ 1686 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1687 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1688 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 1689 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 1690 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 1691 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */ 1692 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1693 btver1_memcpy, 1694 btver1_memset, 1695 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1696 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1697 }; 1698 1699 /* String-operation algorithm tables named in the btver2_cost initializer; their values match btver1_memcpy and btver1_memset. NOTE(review): inner entries look like {max size, algorithm, flag} with -1 as the open-ended last entry -- confirm against the stringop_algs declaration. */ static stringop_algs btver2_memcpy[2] = { 1700 {libcall, {{6, loop, false}, {14, unrolled_loop, false}, 1701 {-1, rep_prefix_4_byte, false}}}, 1702 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, 1703 {-1, libcall, false}}}}; 1704 static stringop_algs btver2_memset[2] = { 1705 {libcall, {{8, loop, false}, {24, unrolled_loop, false}, 1706 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1707 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, 1708 {-1, libcall, false}}}}; 1709 const struct processor_costs btver2_cost = { 1710 COSTS_N_INSNS (1), /* cost of an add instruction */ 1711 COSTS_N_INSNS (2), /* cost of a lea instruction */ 1712 COSTS_N_INSNS (1), /* variable shift costs */ 1713 COSTS_N_INSNS (1), /* constant shift costs */ 1714 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1715 COSTS_N_INSNS (4), /* HI */ 1716 COSTS_N_INSNS (3), /* SI */ 1717 COSTS_N_INSNS (4), /* DI */ 1718 COSTS_N_INSNS (5)}, /* other */ 1719 0, /* cost of multiply per each bit set */ 1720 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ 1721 COSTS_N_INSNS (35), /* HI */ 1722 COSTS_N_INSNS (51), /* SI */ 1723 COSTS_N_INSNS (83), /* DI */ 1724 COSTS_N_INSNS (83)}, /* other */ 1725 COSTS_N_INSNS (1), /* cost of movsx */ 1726 COSTS_N_INSNS (1), /* cost of movzx */ 1727 8, /* "large" insn */ 1728 9, /* MOVE_RATIO */ 1729 1730 /* All move costs are relative to
integer->integer move times 2 and thus 1731 they are latency*2. */ 1732 8, /* cost for loading QImode using movzbl */ 1733 {8, 8, 6}, /* cost of loading integer registers 1734 in QImode, HImode and SImode. 1735 Relative to reg-reg move (2). */ 1736 {8, 8, 6}, /* cost of storing integer registers */ 1737 4, /* cost of reg,reg fld/fst */ 1738 {12, 12, 28}, /* cost of loading fp registers 1739 in SFmode, DFmode and XFmode */ 1740 {12, 12, 38}, /* cost of storing fp registers 1741 in SFmode, DFmode and XFmode */ 1742 4, /* cost of moving MMX register */ 1743 {10, 10}, /* cost of loading MMX registers 1744 in SImode and DImode */ 1745 {12, 12}, /* cost of storing MMX registers 1746 in SImode and DImode */ 1747 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 1748 {10, 10, 12, 24, 48}, /* cost of loading SSE registers 1749 in 32,64,128,256 and 512-bit */ 1750 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ 1751 {10, 10, 12, 24, 48}, /* cost of storing SSE registers 1752 in 32,64,128,256 and 512-bit */ 1753 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ 1754 14, 14, /* SSE->integer and integer->SSE moves */ 1755 10, 10, /* Gather load static, per_elt. */ 1756 10, 10, /* Gather store static, per_elt. */ 1757 32, /* size of l1 cache. */ 1758 2048, /* size of l2 cache. */ 1759 64, /* size of prefetch block */ 1760 100, /* number of parallel prefetches */ 1761 2, /* Branch cost */ 1762 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ 1763 COSTS_N_INSNS (4), /* cost of FMUL instruction. */ 1764 COSTS_N_INSNS (19), /* cost of FDIV instruction. */ 1765 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1766 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1767 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ 1768 1769 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 1770 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 1771 COSTS_N_INSNS (2), /* cost of MULSS instruction. */ 1772 COSTS_N_INSNS (4), /* cost of MULSD instruction.
*/ 1773 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 1774 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 1775 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 1776 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */ 1777 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */ 1778 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ 1779 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1780 btver2_memcpy, 1781 btver2_memset, 1782 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ 1783 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1784 }; 1785 1786 /* String-operation algorithm tables named in the pentium4_cost initializer. The second element of each is DUMMY_STRINGOP_ALGS, which expands to a libcall-only entry. NOTE(review): inner entries look like {max size, algorithm, flag} with -1 as the open-ended last entry -- confirm against the stringop_algs declaration. */ static stringop_algs pentium4_memcpy[2] = { 1787 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 1788 DUMMY_STRINGOP_ALGS}; 1789 static stringop_algs pentium4_memset[2] = { 1790 {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 1791 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1792 DUMMY_STRINGOP_ALGS}; 1793 1794 static const 1795 struct processor_costs pentium4_cost = { 1796 COSTS_N_INSNS (1), /* cost of an add instruction */ 1797 COSTS_N_INSNS (3), /* cost of a lea instruction */ 1798 COSTS_N_INSNS (4), /* variable shift costs */ 1799 COSTS_N_INSNS (4), /* constant shift costs */ 1800 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ 1801 COSTS_N_INSNS (15), /* HI */ 1802 COSTS_N_INSNS (15), /* SI */ 1803 COSTS_N_INSNS (15), /* DI */ 1804 COSTS_N_INSNS (15)}, /* other */ 1805 0, /* cost of multiply per each bit set */ 1806 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ 1807 COSTS_N_INSNS (56), /* HI */ 1808 COSTS_N_INSNS (56), /* SI */ 1809 COSTS_N_INSNS (56), /* DI */ 1810 COSTS_N_INSNS (56)}, /* other */ 1811 COSTS_N_INSNS (1), /* cost of movsx */ 1812 COSTS_N_INSNS (1), /* cost of movzx */ 1813 16, /* "large" insn */ 1814 6, /* MOVE_RATIO */ 1815 1816 /* All move costs are relative to integer->integer move times 2 and thus 1817 they are latency*2.
*/ 1818 5, /* cost for loading QImode using movzbl */ 1819 {4, 5, 4}, /* cost of loading integer registers 1820 in QImode, HImode and SImode. 1821 Relative to reg-reg move (2). */ 1822 {2, 3, 2}, /* cost of storing integer registers */ 1823 12, /* cost of reg,reg fld/fst */ 1824 {14, 14, 14}, /* cost of loading fp registers 1825 in SFmode, DFmode and XFmode */ 1826 {14, 14, 14}, /* cost of storing fp registers 1827 in SFmode, DFmode and XFmode */ 1828 12, /* cost of moving MMX register */ 1829 {16, 16}, /* cost of loading MMX registers 1830 in SImode and DImode */ 1831 {16, 16}, /* cost of storing MMX registers 1832 in SImode and DImode */ 1833 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 1834 {16, 16, 16, 32, 64}, /* cost of loading SSE registers 1835 in 32,64,128,256 and 512-bit */ 1836 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */ 1837 {16, 16, 16, 32, 64}, /* cost of storing SSE registers 1838 in 32,64,128,256 and 512-bit */ 1839 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 1840 20, 12, /* SSE->integer and integer->SSE moves */ 1841 16, 16, /* Gather load static, per_elt. */ 1842 16, 16, /* Gather store static, per_elt. */ 1843 8, /* size of l1 cache. */ 1844 256, /* size of l2 cache. */ 1845 64, /* size of prefetch block */ 1846 6, /* number of parallel prefetches */ 1847 2, /* Branch cost */ 1848 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ 1849 COSTS_N_INSNS (7), /* cost of FMUL instruction. */ 1850 COSTS_N_INSNS (43), /* cost of FDIV instruction. */ 1851 COSTS_N_INSNS (2), /* cost of FABS instruction. */ 1852 COSTS_N_INSNS (2), /* cost of FCHS instruction. */ 1853 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ 1854 1855 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1856 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ 1857 COSTS_N_INSNS (6), /* cost of MULSS instruction. */ 1858 COSTS_N_INSNS (6), /* cost of MULSD instruction. */ 1859 COSTS_N_INSNS (6), /* cost of FMA SS instruction.
*/ 1860 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 1861 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */ 1862 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */ 1863 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */ 1864 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */ 1865 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1866 pentium4_memcpy, 1867 pentium4_memset, 1868 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1869 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1870 }; 1871 1872 /* String-operation algorithm tables named in the nocona_cost initializer. NOTE(review): inner entries look like {max size, algorithm, flag} with -1 as the open-ended last entry, and the two outer elements presumably select 32-bit vs 64-bit code -- confirm against the stringop_algs declaration. */ static stringop_algs nocona_memcpy[2] = { 1873 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, 1874 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, 1875 {100000, unrolled_loop, false}, {-1, libcall, false}}}}; 1876 1877 static stringop_algs nocona_memset[2] = { 1878 {libcall, {{6, loop_1_byte, false}, {48, loop, false}, 1879 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1880 {libcall, {{24, loop, false}, {64, unrolled_loop, false}, 1881 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1882 1883 static const 1884 struct processor_costs nocona_cost = { 1885 COSTS_N_INSNS (1), /* cost of an add instruction */ 1886 COSTS_N_INSNS (1), /* cost of a lea instruction */ 1887 COSTS_N_INSNS (1), /* variable shift costs */ 1888 COSTS_N_INSNS (1), /* constant shift costs */ 1889 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ 1890 COSTS_N_INSNS (10), /* HI */ 1891 COSTS_N_INSNS (10), /* SI */ 1892 COSTS_N_INSNS (10), /* DI */ 1893 COSTS_N_INSNS (10)}, /* other */ 1894 0, /* cost of multiply per each bit set */ 1895 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ 1896 COSTS_N_INSNS (66), /* HI */ 1897 COSTS_N_INSNS (66), /* SI */ 1898 COSTS_N_INSNS (66), /* DI */ 1899 COSTS_N_INSNS (66)}, /* other */ 1900 COSTS_N_INSNS (1), /* cost of movsx */ 1901 COSTS_N_INSNS (1), /* cost of movzx */ 1902 16, /* "large" insn */ 1903 17, /* MOVE_RATIO */ 1904 1905 /* All move costs are relative to
integer->integer move times 2 and thus 1906 they are latency*2. */ 1907 4, /* cost for loading QImode using movzbl */ 1908 {4, 4, 4}, /* cost of loading integer registers 1909 in QImode, HImode and SImode. 1910 Relative to reg-reg move (2). */ 1911 {4, 4, 4}, /* cost of storing integer registers */ 1912 12, /* cost of reg,reg fld/fst */ 1913 {14, 14, 14}, /* cost of loading fp registers 1914 in SFmode, DFmode and XFmode */ 1915 {14, 14, 14}, /* cost of storing fp registers 1916 in SFmode, DFmode and XFmode */ 1917 14, /* cost of moving MMX register */ 1918 {12, 12}, /* cost of loading MMX registers 1919 in SImode and DImode */ 1920 {12, 12}, /* cost of storing MMX registers 1921 in SImode and DImode */ 1922 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 1923 {12, 12, 12, 24, 48}, /* cost of loading SSE registers 1924 in 32,64,128,256 and 512-bit */ 1925 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ 1926 {12, 12, 12, 24, 48}, /* cost of storing SSE registers 1927 in 32,64,128,256 and 512-bit */ 1928 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 1929 20, 12, /* SSE->integer and integer->SSE moves */ 1930 12, 12, /* Gather load static, per_elt. */ 1931 12, 12, /* Gather store static, per_elt. */ 1932 8, /* size of l1 cache. */ 1933 1024, /* size of l2 cache. */ 1934 64, /* size of prefetch block */ 1935 8, /* number of parallel prefetches */ 1936 1, /* Branch cost */ 1937 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ 1938 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 1939 COSTS_N_INSNS (40), /* cost of FDIV instruction. */ 1940 COSTS_N_INSNS (3), /* cost of FABS instruction. */ 1941 COSTS_N_INSNS (3), /* cost of FCHS instruction. */ 1942 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ 1943 1944 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ 1945 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 1946 COSTS_N_INSNS (7), /* cost of MULSS instruction. */ 1947 COSTS_N_INSNS (7), /* cost of MULSD instruction.
*/ 1948 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ 1949 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ 1950 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */ 1951 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */ 1952 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ 1953 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */ 1954 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 1955 nocona_memcpy, 1956 nocona_memset, 1957 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 1958 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 1959 }; 1960 1961 /* String-operation algorithm tables named in the atom_cost initializer. NOTE(review): inner entries look like {max size, algorithm, flag} with -1 as the open-ended last entry, and the two outer elements presumably select 32-bit vs 64-bit code -- confirm against the stringop_algs declaration. */ static stringop_algs atom_memcpy[2] = { 1962 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 1963 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 1964 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1965 static stringop_algs atom_memset[2] = { 1966 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 1967 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 1968 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 1969 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 1970 static const 1971 struct processor_costs atom_cost = { 1972 COSTS_N_INSNS (1), /* cost of an add instruction */ 1973 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 1974 COSTS_N_INSNS (1), /* variable shift costs */ 1975 COSTS_N_INSNS (1), /* constant shift costs */ 1976 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 1977 COSTS_N_INSNS (4), /* HI */ 1978 COSTS_N_INSNS (3), /* SI */ 1979 COSTS_N_INSNS (4), /* DI */ 1980 COSTS_N_INSNS (2)}, /* other */ 1981 0, /* cost of multiply per each bit set */ 1982 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 1983 COSTS_N_INSNS (26), /* HI */ 1984 COSTS_N_INSNS (42), /* SI */ 1985 COSTS_N_INSNS (74), /* DI */ 1986 COSTS_N_INSNS (74)}, /* other */ 1987 COSTS_N_INSNS (1), /* cost of movsx */ 1988 COSTS_N_INSNS (1), /* cost of movzx */ 1989 8, /* "large" insn */ 1990 17, /* MOVE_RATIO */ 1991 1992 /*
All move costs are relative to integer->integer move times 2 and thus 1993 they are latency*2. */ 1994 6, /* cost for loading QImode using movzbl */ 1995 {6, 6, 6}, /* cost of loading integer registers 1996 in QImode, HImode and SImode. 1997 Relative to reg-reg move (2). */ 1998 {6, 6, 6}, /* cost of storing integer registers */ 1999 4, /* cost of reg,reg fld/fst */ 2000 {6, 6, 18}, /* cost of loading fp registers 2001 in SFmode, DFmode and XFmode */ 2002 {14, 14, 24}, /* cost of storing fp registers 2003 in SFmode, DFmode and XFmode */ 2004 2, /* cost of moving MMX register */ 2005 {8, 8}, /* cost of loading MMX registers 2006 in SImode and DImode */ 2007 {10, 10}, /* cost of storing MMX registers 2008 in SImode and DImode */ 2009 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2010 {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2011 in 32,64,128,256 and 512-bit */ 2012 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2013 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2014 in 32,64,128,256 and 512-bit */ 2015 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2016 8, 6, /* SSE->integer and integer->SSE moves */ 2017 8, 8, /* Gather load static, per_elt. */ 2018 8, 8, /* Gather store static, per_elt. */ 2019 32, /* size of l1 cache. */ 2020 256, /* size of l2 cache. */ 2021 64, /* size of prefetch block */ 2022 6, /* number of parallel prefetches */ 2023 3, /* Branch cost */ 2024 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2025 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2026 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2027 COSTS_N_INSNS (8), /* cost of FABS instruction. */ 2028 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2029 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2030 2031 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2032 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ 2033 COSTS_N_INSNS (4), /* cost of MULSS instruction.
*/ 2034 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2035 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2036 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2037 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ 2038 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ 2039 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ 2040 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ 2041 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2042 atom_memcpy, 2043 atom_memset, 2044 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2045 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2046 }; 2047 2048 /* String-operation algorithm tables named in the slm_cost initializer; their values match atom_memcpy and atom_memset. NOTE(review): inner entries look like {max size, algorithm, flag} with -1 as the open-ended last entry -- confirm against the stringop_algs declaration. */ static stringop_algs slm_memcpy[2] = { 2049 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2050 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2051 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2052 static stringop_algs slm_memset[2] = { 2053 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2054 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2055 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2056 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2057 static const 2058 struct processor_costs slm_cost = { 2059 COSTS_N_INSNS (1), /* cost of an add instruction */ 2060 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2061 COSTS_N_INSNS (1), /* variable shift costs */ 2062 COSTS_N_INSNS (1), /* constant shift costs */ 2063 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2064 COSTS_N_INSNS (3), /* HI */ 2065 COSTS_N_INSNS (3), /* SI */ 2066 COSTS_N_INSNS (4), /* DI */ 2067 COSTS_N_INSNS (2)}, /* other */ 2068 0, /* cost of multiply per each bit set */ 2069 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2070 COSTS_N_INSNS (26), /* HI */ 2071 COSTS_N_INSNS (42), /* SI */ 2072 COSTS_N_INSNS (74), /* DI */ 2073 COSTS_N_INSNS (74)}, /* other */ 2074 COSTS_N_INSNS (1), /* cost of movsx */ 2075 COSTS_N_INSNS (1), /* cost of movzx */ 2076 8, /*
"large" insn */ 2077 17, /* MOVE_RATIO */ 2078 2079 /* All move costs are relative to integer->integer move times 2 and thus 2080 they are latency*2. */ 2081 8, /* cost for loading QImode using movzbl */ 2082 {8, 8, 8}, /* cost of loading integer registers 2083 in QImode, HImode and SImode. 2084 Relative to reg-reg move (2). */ 2085 {6, 6, 6}, /* cost of storing integer registers */ 2086 2, /* cost of reg,reg fld/fst */ 2087 {8, 8, 18}, /* cost of loading fp registers 2088 in SFmode, DFmode and XFmode */ 2089 {6, 6, 18}, /* cost of storing fp registers 2090 in SFmode, DFmode and XFmode */ 2091 2, /* cost of moving MMX register */ 2092 {8, 8}, /* cost of loading MMX registers 2093 in SImode and DImode */ 2094 {6, 6}, /* cost of storing MMX registers 2095 in SImode and DImode */ 2096 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 2097 {8, 8, 8, 16, 32}, /* cost of loading SSE registers 2098 in 32,64,128,256 and 512-bit */ 2099 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ 2100 {8, 8, 8, 16, 32}, /* cost of storing SSE registers 2101 in 32,64,128,256 and 512-bit */ 2102 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2103 8, 6, /* SSE->integer and integer->SSE moves */ 2104 8, 8, /* Gather load static, per_elt. */ 2105 8, 8, /* Gather store static, per_elt. */ 2106 32, /* size of l1 cache. */ 2107 256, /* size of l2 cache. */ 2108 64, /* size of prefetch block */ 2109 6, /* number of parallel prefetches */ 2110 3, /* Branch cost */ 2111 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2112 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2113 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2114 COSTS_N_INSNS (8), /* cost of FABS instruction. */ 2115 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2116 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2117 2118 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2119 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2120 COSTS_N_INSNS (4), /* cost of MULSS instruction.
*/ 2121 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2122 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2123 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2124 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ 2125 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */ 2126 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */ 2127 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */ 2128 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2129 slm_memcpy, 2130 slm_memset, 2131 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2132 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2133 }; 2134 2135 /* String-operation algorithm tables named in the intel_cost initializer; their values match the atom and slm tables. NOTE(review): inner entries look like {max size, algorithm, flag} with -1 as the open-ended last entry -- confirm against the stringop_algs declaration. */ static stringop_algs intel_memcpy[2] = { 2136 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, 2137 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, 2138 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2139 static stringop_algs intel_memset[2] = { 2140 {libcall, {{8, loop, false}, {15, unrolled_loop, false}, 2141 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, 2142 {libcall, {{24, loop, false}, {32, unrolled_loop, false}, 2143 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; 2144 static const 2145 struct processor_costs intel_cost = { 2146 COSTS_N_INSNS (1), /* cost of an add instruction */ 2147 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2148 COSTS_N_INSNS (1), /* variable shift costs */ 2149 COSTS_N_INSNS (1), /* constant shift costs */ 2150 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2151 COSTS_N_INSNS (3), /* HI */ 2152 COSTS_N_INSNS (3), /* SI */ 2153 COSTS_N_INSNS (4), /* DI */ 2154 COSTS_N_INSNS (2)}, /* other */ 2155 0, /* cost of multiply per each bit set */ 2156 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ 2157 COSTS_N_INSNS (26), /* HI */ 2158 COSTS_N_INSNS (42), /* SI */ 2159 COSTS_N_INSNS (74), /* DI */ 2160 COSTS_N_INSNS (74)}, /* other */ 2161 COSTS_N_INSNS (1), /* cost of movsx */ 2162 COSTS_N_INSNS (1), /* cost of movzx */ 2163 8, /*
"large" insn */ 2164 17, /* MOVE_RATIO */ 2165 2166 /* All move costs are relative to integer->integer move times 2 and thus 2167 they are latency*2. */ 2168 6, /* cost for loading QImode using movzbl */ 2169 {4, 4, 4}, /* cost of loading integer registers 2170 in QImode, HImode and SImode. 2171 Relative to reg-reg move (2). */ 2172 {6, 6, 6}, /* cost of storing integer registers */ 2173 2, /* cost of reg,reg fld/fst */ 2174 {6, 6, 8}, /* cost of loading fp registers 2175 in SFmode, DFmode and XFmode */ 2176 {6, 6, 10}, /* cost of storing fp registers 2177 in SFmode, DFmode and XFmode */ 2178 2, /* cost of moving MMX register */ 2179 {6, 6}, /* cost of loading MMX registers 2180 in SImode and DImode */ 2181 {6, 6}, /* cost of storing MMX registers 2182 in SImode and DImode */ 2183 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 2184 {6, 6, 6, 6, 6}, /* cost of loading SSE registers 2185 in 32,64,128,256 and 512-bit */ 2186 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 2187 {6, 6, 6, 6, 6}, /* cost of storing SSE registers 2188 in 32,64,128,256 and 512-bit */ 2189 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ 2190 4, 4, /* SSE->integer and integer->SSE moves */ 2191 6, 6, /* Gather load static, per_elt. */ 2192 6, 6, /* Gather store static, per_elt. */ 2193 32, /* size of l1 cache. */ 2194 256, /* size of l2 cache. */ 2195 64, /* size of prefetch block */ 2196 6, /* number of parallel prefetches */ 2197 3, /* Branch cost */ 2198 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ 2199 COSTS_N_INSNS (8), /* cost of FMUL instruction. */ 2200 COSTS_N_INSNS (20), /* cost of FDIV instruction. */ 2201 COSTS_N_INSNS (8), /* cost of FABS instruction. */ 2202 COSTS_N_INSNS (8), /* cost of FCHS instruction. */ 2203 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ 2204 2205 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */ 2206 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ 2207 COSTS_N_INSNS (8), /* cost of MULSS instruction.
*/ 2208 COSTS_N_INSNS (8), /* cost of MULSD instruction. */ 2209 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ 2210 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ 2211 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */ 2212 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ 2213 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */ 2214 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */ 2215 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ 2216 intel_memcpy, 2217 intel_memset, 2218 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2219 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2220 }; 2221 2222 /* Generic should produce code tuned for Core-i7 (and newer chips) 2223 and btver1 (and newer chips). */ 2224 2225 static stringop_algs generic_memcpy[2] = { 2226 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 2227 {-1, libcall, false}}}, 2228 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 2229 {-1, libcall, false}}}}; 2230 static stringop_algs generic_memset[2] = { 2231 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, 2232 {-1, libcall, false}}}, 2233 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, 2234 {-1, libcall, false}}}}; 2235 static const 2236 struct processor_costs generic_cost = { 2237 COSTS_N_INSNS (1), /* cost of an add instruction */ 2238 /* Setting cost to 2 makes our current implementation of synth_mult result in 2239 use of unnecessary temporary registers causing regression on several 2240 SPECfp benchmarks. 
*/ 2241 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2242 COSTS_N_INSNS (1), /* variable shift costs */ 2243 COSTS_N_INSNS (1), /* constant shift costs */ 2244 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2245 COSTS_N_INSNS (4), /* HI */ 2246 COSTS_N_INSNS (3), /* SI */ 2247 COSTS_N_INSNS (4), /* DI */ 2248 COSTS_N_INSNS (4)}, /* other */ 2249 0, /* cost of multiply per each bit set */ 2250 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ 2251 COSTS_N_INSNS (22), /* HI */ 2252 COSTS_N_INSNS (30), /* SI */ 2253 COSTS_N_INSNS (74), /* DI */ 2254 COSTS_N_INSNS (74)}, /* other */ 2255 COSTS_N_INSNS (1), /* cost of movsx */ 2256 COSTS_N_INSNS (1), /* cost of movzx */ 2257 8, /* "large" insn */ 2258 17, /* MOVE_RATIO */ 2259 2260 /* All move costs are relative to integer->integer move times 2 and thus 2261 they are latency*2. */ 2262 6, /* cost for loading QImode using movzbl */ 2263 {6, 6, 6}, /* cost of loading integer registers 2264 in QImode, HImode and SImode. 2265 Relative to reg-reg move (2). */ 2266 {6, 6, 6}, /* cost of storing integer registers */ 2267 4, /* cost of reg,reg fld/fst */ 2268 {6, 6, 12}, /* cost of loading fp registers 2269 in SFmode, DFmode and XFmode */ 2270 {6, 6, 12}, /* cost of storing fp registers 2271 in SFmode, DFmode and XFmode */ 2272 2, /* cost of moving MMX register */ 2273 {6, 6}, /* cost of loading MMX registers 2274 in SImode and DImode */ 2275 {6, 6}, /* cost of storing MMX registers 2276 in SImode and DImode */ 2277 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 2278 {6, 6, 6, 10, 15}, /* cost of loading SSE registers 2279 in 32,64,128,256 and 512-bit */ 2280 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ 2281 {6, 6, 6, 10, 15}, /* cost of storing SSE registers 2282 in 32,64,128,256 and 512-bit */ 2283 {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ 2284 6, 6, /* SSE->integer and integer->SSE moves */ 2285 18, 6, /* Gather load static, per_elt. 
*/ 2286 18, 6, /* Gather store static, per_elt. */ 2287 32, /* size of l1 cache. */ 2288 512, /* size of l2 cache. */ 2289 64, /* size of prefetch block */ 2290 6, /* number of parallel prefetches */ 2291 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this 2292 value is increased to perhaps more appropriate value of 5. */ 2293 3, /* Branch cost */ 2294 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2295 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2296 COSTS_N_INSNS (17), /* cost of FDIV instruction. */ 2297 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2298 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2299 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ 2300 2301 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ 2302 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2303 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2304 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2305 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2306 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2307 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ 2308 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ 2309 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ 2310 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ 2311 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ 2312 generic_memcpy, 2313 generic_memset, 2314 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ 2315 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ 2316 }; 2317 2318 /* core_cost should produce code tuned for Core familly of CPUs. 
*/ 2319 static stringop_algs core_memcpy[2] = { 2320 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, 2321 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, 2322 {-1, libcall, false}}}}; 2323 static stringop_algs core_memset[2] = { 2324 {libcall, {{6, loop_1_byte, true}, 2325 {24, loop, true}, 2326 {8192, rep_prefix_4_byte, true}, 2327 {-1, libcall, false}}}, 2328 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, 2329 {-1, libcall, false}}}}; 2330 2331 static const 2332 struct processor_costs core_cost = { 2333 COSTS_N_INSNS (1), /* cost of an add instruction */ 2334 /* On all chips taken into consideration lea is 2 cycles and more. With 2335 this cost however our current implementation of synth_mult results in 2336 use of unnecessary temporary registers causing regression on several 2337 SPECfp benchmarks. */ 2338 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ 2339 COSTS_N_INSNS (1), /* variable shift costs */ 2340 COSTS_N_INSNS (1), /* constant shift costs */ 2341 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ 2342 COSTS_N_INSNS (4), /* HI */ 2343 COSTS_N_INSNS (3), /* SI */ 2344 /* Here we tune for Sandybridge or newer. */ 2345 COSTS_N_INSNS (3), /* DI */ 2346 COSTS_N_INSNS (3)}, /* other */ 2347 0, /* cost of multiply per each bit set */ 2348 /* Expanding div/mod currently doesn't consider parallelism. So the cost 2349 model is not realistic. We compensate by increasing the latencies a bit. */ 2350 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ 2351 COSTS_N_INSNS (11), /* HI */ 2352 COSTS_N_INSNS (14), /* SI */ 2353 COSTS_N_INSNS (81), /* DI */ 2354 COSTS_N_INSNS (81)}, /* other */ 2355 COSTS_N_INSNS (1), /* cost of movsx */ 2356 COSTS_N_INSNS (1), /* cost of movzx */ 2357 8, /* "large" insn */ 2358 17, /* MOVE_RATIO */ 2359 2360 /* All move costs are relative to integer->integer move times 2 and thus 2361 they are latency*2. 
*/ 2362 6, /* cost for loading QImode using movzbl */ 2363 {4, 4, 4}, /* cost of loading integer registers 2364 in QImode, HImode and SImode. 2365 Relative to reg-reg move (2). */ 2366 {6, 6, 6}, /* cost of storing integer registers */ 2367 2, /* cost of reg,reg fld/fst */ 2368 {6, 6, 8}, /* cost of loading fp registers 2369 in SFmode, DFmode and XFmode */ 2370 {6, 6, 10}, /* cost of storing fp registers 2371 in SFmode, DFmode and XFmode */ 2372 2, /* cost of moving MMX register */ 2373 {6, 6}, /* cost of loading MMX registers 2374 in SImode and DImode */ 2375 {6, 6}, /* cost of storing MMX registers 2376 in SImode and DImode */ 2377 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2378 {6, 6, 6, 6, 12}, /* cost of loading SSE registers 2379 in 32,64,128,256 and 512-bit */ 2380 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ 2381 {6, 6, 6, 6, 12}, /* cost of storing SSE registers 2382 in 32,64,128,256 and 512-bit */ 2383 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ 2384 2, 2, /* SSE->integer and integer->SSE moves */ 2385 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, 2386 rec. throughput 6. 2387 So 5 uops statically and one uops per load. */ 2388 10, 6, /* Gather load static, per_elt. */ 2389 10, 6, /* Gather store static, per_elt. */ 2390 64, /* size of l1 cache. */ 2391 512, /* size of l2 cache. */ 2392 64, /* size of prefetch block */ 2393 6, /* number of parallel prefetches */ 2394 /* FIXME perhaps more appropriate value is 5. */ 2395 3, /* Branch cost */ 2396 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ 2397 COSTS_N_INSNS (5), /* cost of FMUL instruction. */ 2398 /* 10-24 */ 2399 COSTS_N_INSNS (24), /* cost of FDIV instruction. */ 2400 COSTS_N_INSNS (1), /* cost of FABS instruction. */ 2401 COSTS_N_INSNS (1), /* cost of FCHS instruction. */ 2402 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ 2403 2404 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. 
*/ 2405 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ 2406 COSTS_N_INSNS (4), /* cost of MULSS instruction. */ 2407 COSTS_N_INSNS (5), /* cost of MULSD instruction. */ 2408 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ 2409 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ 2410 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ 2411 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ 2412 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ 2413 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ 2414 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ 2415 core_memcpy, 2416 core_memset, 2417 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ 2418 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ 2419 }; 2420 2421