1 /* Copyright (C) 1988-2020 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-builtins.h"
93 #include "i386-features.h"
94
/* Base names for the out-of-line save/restore stubs; get_stub_name
   composes the full symbol from one of these plus the ISA flavor
   (sse/avx) and the managed-register count.  Order must match
   enum xlogue_stub.  */
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};
103
/* Order in which the stubs save/restore registers; hard frame pointer
   (BP_REG) is skipped entirely when a frame pointer is in use.  */
const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:   0		1		2		3
    Offsets:	realigned or	aligned + 8	aligned w/HFP	realigned w/HFP
    Register	   aligned	aligned + 8	aligned w/HFP	w/HFP */
    XMM15_REG,	/* 0x10		0x18		0x10		0x18 */
    XMM14_REG,	/* 0x20		0x28		0x20		0x28 */
    XMM13_REG,	/* 0x30		0x38		0x30		0x38 */
    XMM12_REG,	/* 0x40		0x48		0x40		0x48 */
    XMM11_REG,	/* 0x50		0x58		0x50		0x58 */
    XMM10_REG,	/* 0x60		0x68		0x60		0x68 */
    XMM9_REG,	/* 0x70		0x78		0x70		0x78 */
    XMM8_REG,	/* 0x80		0x88		0x80		0x88 */
    XMM7_REG,	/* 0x90		0x98		0x90		0x98 */
    XMM6_REG,	/* 0xa0		0xa8		0xa0		0xa8 */
    SI_REG,	/* 0xa8		0xb0		0xa8		0xb0 */
    DI_REG,	/* 0xb0		0xb8		0xb0		0xb8 */
    BX_REG,	/* 0xb8		0xc0		0xb8		0xc0 */
    BP_REG,	/* 0xc0		0xc8		N/A		N/A  */
    R12_REG,	/* 0xc8		0xd0		0xc0		0xc8 */
    R13_REG,	/* 0xd0		0xd8		0xc8		0xd0 */
    R14_REG,	/* 0xd8		0xe0		0xd0		0xd8 */
    R15_REG,	/* 0xe0		0xe8		0xd8		0xe0 */
};
131
/* Instantiate static const values.  Out-of-class definitions are required
   so the in-class static const members can be odr-used (e.g. have their
   address taken) prior to C++17 inline variables.  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;

/* Initialize xlogue_layout::s_stub_names to zero.  Static storage
   guarantees zero-initialization, which get_stub_name relies upon to
   detect not-yet-formatted entries.  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				  [STUB_NAME_MAX_LEN];
143
/* Instantiates all xlogue_layout instances, one per combination of
   incoming stack offset (0 or 8) and hard-frame-pointer use; indexed
   by enum xlogue_stub_sets.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};
151
152 /* Return an appropriate const instance of xlogue_layout based upon values
153 in cfun->machine and crtl. */
154 const class xlogue_layout &
get_instance()155 xlogue_layout::get_instance ()
156 {
157 enum xlogue_stub_sets stub_set;
158 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
159
160 if (stack_realign_fp)
161 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
162 else if (frame_pointer_needed)
163 stub_set = aligned_plus_8
164 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
165 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
166 else
167 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
168
169 return s_instances[stub_set];
170 }
171
172 /* Determine how many clobbered registers can be saved by the stub.
173 Returns the count of registers the stub will save and restore. */
174 unsigned
count_stub_managed_regs()175 xlogue_layout::count_stub_managed_regs ()
176 {
177 bool hfp = frame_pointer_needed || stack_realign_fp;
178 unsigned i, count;
179 unsigned regno;
180
181 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
182 {
183 regno = REG_ORDER[i];
184 if (regno == BP_REG && hfp)
185 continue;
186 if (!ix86_save_reg (regno, false, false))
187 break;
188 ++count;
189 }
190 return count;
191 }
192
/* Determine if register REGNO is a stub managed register given the
   total COUNT of stub managed registers.  */
bool
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i;

  for (i = 0; i < count; ++i)
    {
      gcc_assert (i < MAX_REGS);
      /* When a hard frame pointer is used, BP_REG is not managed by the
	 stub and thus does not consume one of the COUNT slots; widen the
	 scan window by one to compensate for the skipped entry.  */
      if (REG_ORDER[i] == BP_REG && hfp)
	++count;
      else if (REG_ORDER[i] == regno)
	return true;
    }
  return false;
}
211
/* Constructor for xlogue_layout.  STACK_ALIGN_OFF_IN is the incoming
   stack-alignment offset (0 or 8) and HFP indicates whether a hard frame
   pointer is used; with HFP, BP_REG is excluded so only 17 registers are
   managed instead of 18.  Precomputes each managed register's offset
   relative to the stub's base pointer.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      /* The frame pointer is saved by regular prologue code, not by
	 the stub, when HFP is in effect.  */
      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	offset += 8;

      /* Offsets are stored rebased to STUB_INDEX_OFFSET, matching how the
	 stubs address their save area.  */
      m_regs[j].regno = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}
240
241 const char *
get_stub_name(enum xlogue_stub stub,unsigned n_extra_regs)242 xlogue_layout::get_stub_name (enum xlogue_stub stub,
243 unsigned n_extra_regs)
244 {
245 const int have_avx = TARGET_AVX;
246 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
247
248 /* Lazy init */
249 if (!*name)
250 {
251 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
252 (have_avx ? "avx" : "sse"),
253 STUB_BASE_NAMES[stub],
254 MIN_REGS + n_extra_regs);
255 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
256 }
257
258 return name;
259 }
260
261 /* Return rtx of a symbol ref for the entry point (based upon
262 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
263 rtx
get_stub_rtx(enum xlogue_stub stub)264 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
265 {
266 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
267 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
268 gcc_assert (stub < XLOGUE_STUB_COUNT);
269 gcc_assert (crtl->stack_realign_finalized);
270
271 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
272 }
273
/* Monotonically increasing counter used to hand out unique chain ids.  */
unsigned scalar_chain::max_id = 0;
275
276 namespace {
277
/* Initialize new chain.  SMODE_ is the scalar mode being converted and
   VMODE_ the vector mode it is converted to.  Allocates the bitmaps that
   track member insns, their defs, and defs needing dual-mode copies.  */

scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
{
  smode = smode_;
  vmode = vmode_;

  /* Unique id, used only for dump-file identification.  */
  chain_id = ++max_id;

  if (dump_file)
    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);

  bitmap_obstack_initialize (NULL);
  insns = BITMAP_ALLOC (NULL);
  defs = BITMAP_ALLOC (NULL);
  defs_conv = BITMAP_ALLOC (NULL);
  /* The worklist is only live inside build ().  */
  queue = NULL;
}
296
/* Free chain's data.  Releases the bitmaps allocated by the constructor;
   QUEUE is freed at the end of build () and need not be released here.  */

scalar_chain::~scalar_chain ()
{
  BITMAP_FREE (insns);
  BITMAP_FREE (defs);
  BITMAP_FREE (defs_conv);
  bitmap_obstack_release (NULL);
}
306
307 /* Add instruction into chains' queue. */
308
309 void
add_to_queue(unsigned insn_uid)310 scalar_chain::add_to_queue (unsigned insn_uid)
311 {
312 if (bitmap_bit_p (insns, insn_uid)
313 || bitmap_bit_p (queue, insn_uid))
314 return;
315
316 if (dump_file)
317 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
318 insn_uid, chain_id);
319 bitmap_set_bit (queue, insn_uid);
320 }
321
/* Initialize a general (SI/DImode) chain.  Beyond the base class state,
   allocates the bitmap of out-of-chain insns whose defs need conversion
   and zeroes the inter-unit move counters used by the gain computation.  */
general_scalar_chain::general_scalar_chain (enum machine_mode smode_,
					    enum machine_mode vmode_)
     : scalar_chain (smode_, vmode_)
{
  insns_conv = BITMAP_ALLOC (NULL);
  n_sse_to_integer = 0;
  n_integer_to_sse = 0;
}
330
/* Free the additional bitmap owned by a general chain; base-class
   bitmaps are released by ~scalar_chain.  */
general_scalar_chain::~general_scalar_chain ()
{
  BITMAP_FREE (insns_conv);
}
335
/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  */

void
general_scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.
     bitmap_set_bit returns true only when the bit was not already set,
     which lets us count each register / def-insn pair exactly once.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      /* Def is outside the chain: an integer-to-SSE copy will be needed
	 to feed the converted uses inside the chain.  */
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;
      n_integer_to_sse++;
    }
  else
    {
      /* Def is inside the chain: an SSE-to-integer copy will be needed
	 for the scalar uses outside the chain.  */
      if (!reg_new)
	return;
      n_sse_to_integer++;
    }

  if (dump_file)
    fprintf (dump_file,
	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}
366
/* For TImode conversion, it is unused: TImode chains never need dual-mode
   registers, so reaching here indicates a bug.  */

void
timode_scalar_chain::mark_dual_mode_def (df_ref)
{
  gcc_unreachable ();
}
374
/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  CANDIDATES is the bitmap of
   insn uids the caller considers convertible.  */

void
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
{
  df_link *chain;

  /* REF's own insn must already be part of the chain or a candidate.  */
  gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
	      || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
  add_to_queue (DF_REF_INSN_UID (ref));

  /* Walk the def-use / use-def chain of REF.  */
  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
    {
      unsigned uid = DF_REF_INSN_UID (chain->ref);

      /* Debug insns never force conversions nor join the chain.  */
      if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
	continue;

      if (!DF_REF_REG_MEM_P (chain->ref))
	{
	  if (bitmap_bit_p (insns, uid))
	    continue;

	  if (bitmap_bit_p (candidates, uid))
	    {
	      /* Convertible but not yet collected — enqueue it.  */
	      add_to_queue (uid);
	      continue;
	    }
	}

      /* The linked ref is not convertible: the register will be live in
	 both scalar and vector modes, so mark the defining side.  */
      if (DF_REF_REG_DEF_P (chain->ref))
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (chain->ref);
	}
      else
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (ref);
	}
    }
}
422
/* Add instruction into a chain.  Records its pseudo-register def (if any)
   and analyzes all its register refs to pull dependent insns into the
   queue and flag dual-mode registers.  */

void
scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
{
  if (bitmap_bit_p (insns, insn_uid))
    return;

  if (dump_file)
    fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);

  bitmap_set_bit (insns, insn_uid);

  /* Track pseudo defs of the chain; hard-register dests are never
     rewritten.  */
  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
  rtx def_set = single_set (insn);
  if (def_set && REG_P (SET_DEST (def_set))
      && !HARD_REGISTER_P (SET_DEST (def_set)))
    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));

  /* ??? The following is quadratic since analyze_register_chain
     iterates over all refs to look for dual-mode regs.  Instead this
     should be done separately for all regs mentioned in the chain once.  */
  df_ref ref;
  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!HARD_REGISTER_P (DF_REF_REG (ref)))
      analyze_register_chain (candidates, ref);
  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      analyze_register_chain (candidates, ref);
}
453
454 /* Build new chain starting from insn INSN_UID recursively
455 adding all dependent uses and definitions. */
456
457 void
build(bitmap candidates,unsigned insn_uid)458 scalar_chain::build (bitmap candidates, unsigned insn_uid)
459 {
460 queue = BITMAP_ALLOC (NULL);
461 bitmap_set_bit (queue, insn_uid);
462
463 if (dump_file)
464 fprintf (dump_file, "Building chain #%d...\n", chain_id);
465
466 while (!bitmap_empty_p (queue))
467 {
468 insn_uid = bitmap_first_set_bit (queue);
469 bitmap_clear_bit (queue, insn_uid);
470 bitmap_clear_bit (candidates, insn_uid);
471 add_insn (candidates, insn_uid);
472 }
473
474 if (dump_file)
475 {
476 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
477 fprintf (dump_file, " insns: ");
478 dump_bitmap (dump_file, insns);
479 if (!bitmap_empty_p (defs_conv))
480 {
481 bitmap_iterator bi;
482 unsigned id;
483 const char *comma = "";
484 fprintf (dump_file, " defs to convert: ");
485 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
486 {
487 fprintf (dump_file, "%sr%d", comma, id);
488 comma = ", ";
489 }
490 fprintf (dump_file, "\n");
491 }
492 }
493
494 BITMAP_FREE (queue);
495 }
496
497 /* Return a cost of building a vector costant
498 instead of using a scalar one. */
499
500 int
vector_const_cost(rtx exp)501 general_scalar_chain::vector_const_cost (rtx exp)
502 {
503 gcc_assert (CONST_INT_P (exp));
504
505 if (standard_sse_constant_p (exp, vmode))
506 return ix86_cost->sse_op;
507 /* We have separate costs for SImode and DImode, use SImode costs
508 for smaller modes. */
509 return ix86_cost->sse_load[smode == DImode ? 1 : 0];
510 }
511
/* Compute a gain for chain conversion.  Sums the per-insn benefit of
   replacing each scalar operation with its vector form, then subtracts
   the cost of the inter-unit register copies the conversion requires.
   A positive return value means conversion is profitable.  */

int
general_scalar_chain::compute_convert_gain ()
{
  bitmap_iterator bi;
  unsigned insn_uid;
  int gain = 0;
  int cost = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

  /* SSE costs distinguish between SImode and DImode loads/stores, for
     int costs factor in the number of GPRs involved.  When supporting
     smaller modes than SImode the int load/store costs need to be
     adjusted as well.  */
  unsigned sse_cost_idx = smode == DImode ? 1 : 0;
  /* M is the number of GPRs a scalar op touches: 2 for DImode on
     32-bit targets, otherwise 1.  */
  unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
    {
      /* Every chain member is a single_set; see add_insn.  */
      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
      rtx def_set = single_set (insn);
      rtx src = SET_SRC (def_set);
      rtx dst = SET_DEST (def_set);
      int igain = 0;

      if (REG_P (src) && REG_P (dst))
	igain += 2 * m - ix86_cost->xmm_move;
      else if (REG_P (src) && MEM_P (dst))
	igain
	  += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
      else if (MEM_P (src) && REG_P (dst))
	igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
      else if (GET_CODE (src) == ASHIFT
	       || GET_CODE (src) == ASHIFTRT
	       || GET_CODE (src) == LSHIFTRT)
	{
	  /* On 32-bit a DImode shift is a multi-insn sequence; shifts of
	     32 or more degenerate to a move plus clear/extend (costed as
	     an add), smaller ones to a double-word shift.  */
	  if (m == 2)
	    {
	      if (INTVAL (XEXP (src, 1)) >= 32)
		igain += ix86_cost->add;
	      else
		igain += ix86_cost->shift_const;
	    }

	  igain += ix86_cost->shift_const - ix86_cost->sse_op;

	  if (CONST_INT_P (XEXP (src, 0)))
	    igain -= vector_const_cost (XEXP (src, 0));
	}
      else if (GET_CODE (src) == PLUS
	       || GET_CODE (src) == MINUS
	       || GET_CODE (src) == IOR
	       || GET_CODE (src) == XOR
	       || GET_CODE (src) == AND)
	{
	  igain += m * ix86_cost->add - ix86_cost->sse_op;
	  /* Additional gain for andnot for targets without BMI.  */
	  if (GET_CODE (XEXP (src, 0)) == NOT
	      && !TARGET_BMI)
	    igain += m * ix86_cost->add;

	  if (CONST_INT_P (XEXP (src, 0)))
	    igain -= vector_const_cost (XEXP (src, 0));
	  if (CONST_INT_P (XEXP (src, 1)))
	    igain -= vector_const_cost (XEXP (src, 1));
	}
      else if (GET_CODE (src) == NEG
	       || GET_CODE (src) == NOT)
	/* The vector form needs an extra constant (zero / all-ones),
	   hence the COSTS_N_INSNS (1) adjustment.  */
	igain += m * ix86_cost->add - ix86_cost->sse_op - COSTS_N_INSNS (1);
      else if (GET_CODE (src) == SMAX
	       || GET_CODE (src) == SMIN
	       || GET_CODE (src) == UMAX
	       || GET_CODE (src) == UMIN)
	{
	  /* We do not have any conditional move cost, estimate it as a
	     reg-reg move.  Comparisons are costed as adds.  */
	  igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
	  /* Integer SSE ops are all costed the same.  */
	  igain -= ix86_cost->sse_op;
	}
      else if (GET_CODE (src) == COMPARE)
	{
	  /* Assume comparison cost is the same.  */
	}
      else if (CONST_INT_P (src))
	{
	  if (REG_P (dst))
	    /* DImode can be immediate for TARGET_64BIT and SImode always.  */
	    igain += m * COSTS_N_INSNS (1);
	  else if (MEM_P (dst))
	    igain += (m * ix86_cost->int_store[2]
		      - ix86_cost->sse_store[sse_cost_idx]);
	  igain -= vector_const_cost (src);
	}
      else
	gcc_unreachable ();

      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, "  Instruction gain %d for ", igain);
	  dump_insn_slim (dump_file, insn);
	}
      gain += igain;
    }

  if (dump_file)
    fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);

  /* Cost the integer to sse and sse to integer moves.  */
  cost += n_sse_to_integer * ix86_cost->sse_to_integer;
  /* ??? integer_to_sse but we only have that in the RA cost table.
     Assume sse_to_integer/integer_to_sse are the same which they
     are at the moment.  */
  cost += n_integer_to_sse * ix86_cost->sse_to_integer;

  if (dump_file)
    fprintf (dump_file, "  Registers conversion cost: %d\n", cost);

  gain -= cost;

  if (dump_file)
    fprintf (dump_file, "  Total gain: %d\n", gain);

  return gain;
}
640
641 /* Insert generated conversion instruction sequence INSNS
642 after instruction AFTER. New BB may be required in case
643 instruction has EH region attached. */
644
645 void
emit_conversion_insns(rtx insns,rtx_insn * after)646 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
647 {
648 if (!control_flow_insn_p (after))
649 {
650 emit_insn_after (insns, after);
651 return;
652 }
653
654 basic_block bb = BLOCK_FOR_INSN (after);
655 edge e = find_fallthru_edge (bb->succs);
656 gcc_assert (e);
657
658 basic_block new_bb = split_edge (e);
659 emit_insn_after (insns, BB_HEAD (new_bb));
660 }
661
662 } // anon namespace
663
664 /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
665 zeroing the upper parts. */
666
667 static rtx
gen_gpr_to_xmm_move_src(enum machine_mode vmode,rtx gpr)668 gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
669 {
670 switch (GET_MODE_NUNITS (vmode))
671 {
672 case 1:
673 /* We are not using this case currently. */
674 gcc_unreachable ();
675 case 2:
676 return gen_rtx_VEC_CONCAT (vmode, gpr,
677 CONST0_RTX (GET_MODE_INNER (vmode)));
678 default:
679 return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
680 CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
681 }
682 }
683
/* Make vector copies for all register REG definitions
   and replace its uses in a chain.  INSN is the out-of-chain defining
   insn after which the scalar-to-vector copy is emitted; the vector
   counterpart was pre-allocated in defs_map.  */

void
general_scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
{
  rtx vreg = *defs_map.get (reg);

  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    {
      /* No direct GPR->XMM moves: bounce through a stack slot.  */
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      if (smode == DImode && !TARGET_64BIT)
	{
	  /* 32-bit DImode: store the two halves separately.  */
	  emit_move_insn (adjust_address (tmp, SImode, 0),
			  gen_rtx_SUBREG (SImode, reg, 0));
	  emit_move_insn (adjust_address (tmp, SImode, 4),
			  gen_rtx_SUBREG (SImode, reg, 4));
	}
      else
	emit_move_insn (copy_rtx (tmp), reg);
      emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			      gen_gpr_to_xmm_move_src (vmode, tmp)));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  /* Load the low half, then pinsrd the high half into lane 1.  */
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (SImode, reg, 4),
					GEN_INT (2)));
	}
      else
	{
	  /* Without SSE4.1: load both halves into separate vectors and
	     interleave their low elements.  */
	  rtx tmp = gen_reg_rtx (DImode);
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 4)));
	  emit_insn (gen_vec_interleave_lowv4si
		     (gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, tmp, 0)));
	}
    }
  else
    /* Direct inter-unit move, zero-extending into the vector.  */
    emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			    gen_gpr_to_xmm_move_src (vmode, reg)));
  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a vector register r%d for insn %d\n",
	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
}
747
/* Copy the definition SRC of INSN inside the chain to DST for
   scalar uses outside of the chain.  The copy sequence is emitted
   after INSN via emit_conversion_insns.  */

void
general_scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
{
  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
    {
      /* No direct XMM->GPR moves: bounce through a stack slot.  */
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      emit_move_insn (tmp, src);
      if (!TARGET_64BIT && smode == DImode)
	{
	  /* 32-bit DImode: reload the two halves separately.  */
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  adjust_address (tmp, SImode, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  adjust_address (tmp, SImode, 4));
	}
      else
	emit_move_insn (dst, copy_rtx (tmp));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  /* pextrd-style extraction of elements 0 and 1.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode,
				      gen_rtvec (1, const0_rtx));
	  emit_insn
	    (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 0),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));

	  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
	  emit_insn
	    (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 4),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));
	}
      else
	{
	  /* Without SSE4.1: move the low word out, shift the vector
	     right by 32 bits, then move the (new) low word out.  */
	  rtx vcopy = gen_reg_rtx (V2DImode);
	  emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	  emit_move_insn (vcopy,
			  gen_rtx_LSHIFTRT (V2DImode,
					    vcopy, GEN_INT (32)));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	}
    }
  else
    emit_move_insn (dst, src);

  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a scalar register r%d for insn %d\n",
	     REGNO (src), REGNO (dst), INSN_UID (insn));
}
815
/* Convert operand OP in INSN.  We should handle
   memory operands and uninitialized registers.
   All other register uses are converted during
   registers conversion.  */

void
general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
{
  *op = copy_rtx_if_shared (*op);

  if (GET_CODE (*op) == NOT)
    {
      /* NOT wraps an operand (andnot pattern); convert the inner operand
	 and retag the NOT itself with the vector mode.  */
      convert_op (&XEXP (*op, 0), insn);
      PUT_MODE (*op, vmode);
    }
  else if (MEM_P (*op))
    {
      /* Preload the memory operand into a fresh vector register.  */
      rtx tmp = gen_reg_rtx (GET_MODE (*op));

      /* Handle movabs.  */
      if (!memory_operand (*op, GET_MODE (*op)))
	{
	  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));

	  emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
	  *op = tmp2;
	}

      emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
				     gen_gpr_to_xmm_move_src (vmode, *op)),
			insn);
      *op = gen_rtx_SUBREG (vmode, tmp, 0);

      if (dump_file)
	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
		 INSN_UID (insn), REGNO (tmp));
    }
  else if (REG_P (*op))
    {
      /* The register will be (or was already) retyped during register
	 conversion; just wrap the use in a vector-mode subreg.  */
      *op = gen_rtx_SUBREG (vmode, *op, 0);
    }
  else if (CONST_INT_P (*op))
    {
      rtx vec_cst;
      rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);

      /* Prefer all ones vector in case of -1.  */
      if (constm1_operand (*op, GET_MODE (*op)))
	vec_cst = CONSTM1_RTX (vmode);
      else
	{
	  /* Build { op, 0, ..., 0 }: only element 0 is used.  */
	  unsigned n = GET_MODE_NUNITS (vmode);
	  rtx *v = XALLOCAVEC (rtx, n);
	  v[0] = *op;
	  for (unsigned i = 1; i < n; ++i)
	    v[i] = const0_rtx;
	  vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
	}

      /* Non-standard constants must come from the constant pool.  */
      if (!standard_sse_constant_p (vec_cst, vmode))
	{
	  start_sequence ();
	  vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
	  rtx_insn *seq = get_insns ();
	  end_sequence ();
	  emit_insn_before (seq, insn);
	}

      emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
      *op = tmp;
    }
  else
    {
      /* Anything else must already be a vector-mode subreg produced by
	 an earlier conversion step.  */
      gcc_assert (SUBREG_P (*op));
      gcc_assert (GET_MODE (*op) == vmode);
    }
}
893
/* Convert INSN to vector mode.  Emits scalar copies for out-of-chain
   uses of its defs, rewrites in-chain register uses to the vector
   pseudos in defs_map, then retags the single_set's SRC/DEST with the
   vector mode and re-recognizes the insn.  */

void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
  /* Generate copies for out-of-chain uses of defs and adjust debug uses.  */
  for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
      {
	df_link *use;
	for (use = DF_REF_CHAIN (ref); use; use = use->next)
	  if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
	      && (DF_REF_REG_MEM_P (use->ref)
		  || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
	    break;
	if (use)
	  /* Some non-debug use is outside the chain (or a memory use):
	     emit a vector-to-scalar copy after INSN.  */
	  convert_reg (insn, DF_REF_REG (ref),
		       *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
	else if (MAY_HAVE_DEBUG_BIND_INSNS)
	  {
	    /* If we generated a scalar copy we can leave debug-insns
	       as-is, if not, we have to adjust them.  */
	    auto_vec<rtx_insn *, 5> to_reset_debug_insns;
	    for (use = DF_REF_CHAIN (ref); use; use = use->next)
	      if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
		{
		  rtx_insn *debug_insn = DF_REF_INSN (use->ref);
		  /* If there's a reaching definition outside of the
		     chain we have to reset.  */
		  df_link *def;
		  for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
		    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
		      break;
		  if (def)
		    to_reset_debug_insns.safe_push (debug_insn);
		  else
		    {
		      /* All reaching defs are in-chain: retarget the
			 debug use at the vector pseudo.  */
		      *DF_REF_REAL_LOC (use->ref)
			= *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
		      df_insn_rescan (debug_insn);
		    }
		}
	    /* Have to do the reset outside of the DF_CHAIN walk to not
	       disrupt it.  */
	    while (!to_reset_debug_insns.is_empty ())
	      {
		rtx_insn *debug_insn = to_reset_debug_insns.pop ();
		INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
		df_insn_rescan_debug_internal (debug_insn);
	      }
	  }
      }

  /* Replace uses in this insn with the defs we use in the chain.  */
  for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
	{
	  /* Also update a corresponding REG_DEAD note.  */
	  rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
	  if (note)
	    XEXP (note, 0) = *vreg;
	  *DF_REF_REAL_LOC (ref) = *vreg;
	}

  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx subreg;

  if (MEM_P (dst) && !REG_P (src))
    {
      /* There are no scalar integer instructions and therefore
	 temporary register usage is required.  */
      rtx tmp = gen_reg_rtx (smode);
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
      dst = gen_rtx_SUBREG (vmode, tmp, 0);
    }
  else if (REG_P (dst))
    {
      /* Replace the definition with a SUBREG to the definition we
	 use inside the chain.  */
      rtx *vdef = defs_map.get (dst);
      if (vdef)
	dst = *vdef;
      dst = gen_rtx_SUBREG (vmode, dst, 0);
      /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
	 is a non-REG_P.  So kill those off.  */
      rtx note = find_reg_equal_equiv_note (insn);
      if (note)
	remove_note (insn, note);
    }

  switch (GET_CODE (src))
    {
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      /* Shift count operand stays scalar; only the shifted value is
	 converted.  */
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
      break;

    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      convert_op (&XEXP (src, 0), insn);
      convert_op (&XEXP (src, 1), insn);
      PUT_MODE (src, vmode);
      break;

    case NEG:
      /* Vector negation is expressed as subtraction from zero.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
      src = gen_rtx_MINUS (vmode, subreg, src);
      break;

    case NOT:
      /* Vector bitwise-not is expressed as XOR with all-ones.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
      src = gen_rtx_XOR (vmode, src, subreg);
      break;

    case MEM:
      if (!REG_P (dst))
	convert_op (&src, insn);
      break;

    case REG:
      if (!MEM_P (dst))
	convert_op (&src, insn);
      break;

    case SUBREG:
      gcc_assert (GET_MODE (src) == vmode);
      break;

    case COMPARE:
      /* Comparisons against zero become a PTEST of the value
	 interleaved with itself.  */
      src = SUBREG_REG (XEXP (XEXP (src, 0), 0));

      gcc_assert (REG_P (src) && GET_MODE (src) == DImode);
      subreg = gen_rtx_SUBREG (V2DImode, src, 0);
      emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
						    copy_rtx_if_shared (subreg),
						    copy_rtx_if_shared (subreg)),
			insn);
      dst = gen_rtx_REG (CCmode, FLAGS_REG);
      src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg),
					       copy_rtx_if_shared (subreg)),
			    UNSPEC_PTEST);
      break;

    case CONST_INT:
      convert_op (&src, insn);
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  /* Force re-recognition against the rewritten pattern.  */
  INSN_CODE (insn) = -1;
  int patt = recog_memoized (insn);
  if  (patt == -1)
    fatal_insn_not_found (insn);
  df_insn_rescan (insn);
}
1075
/* Fix uses of converted REG in debug insns.  After REG has been retyped
   to V1TImode, debug uses must be rewrapped as TImode subregs so var
   tracking keeps seeing the original scalar mode.  */

void
timode_scalar_chain::fix_debug_reg_uses (rtx reg)
{
  if (!flag_var_tracking)
    return;

  df_ref ref, next;
  for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
    {
      rtx_insn *insn = DF_REF_INSN (ref);
      /* Make sure the next ref is for a different instruction,
	 so that we're not affected by the rescan.  */
      next = DF_REF_NEXT_REG (ref);
      while (next && DF_REF_INSN (next) == insn)
	next = DF_REF_NEXT_REG (next);

      if (DEBUG_INSN_P (insn))
	{
	  /* It may be a debug insn with a TImode variable in
	     register.  */
	  bool changed = false;
	  for (; ref != next; ref = DF_REF_NEXT_REG (ref))
	    {
	      rtx *loc = DF_REF_LOC (ref);
	      if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
		{
		  *loc = gen_rtx_SUBREG (TImode, *loc, 0);
		  changed = true;
		}
	    }
	  /* Rescan only once per insn, after all its refs are patched.  */
	  if (changed)
	    df_insn_rescan (insn);
	}
    }
}
1113
/* Convert INSN from TImode to V1TImode.  The insn must be a single set
   whose destination is a REG or a MEM (see
   timode_scalar_to_vector_candidate_p).  */

void
timode_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  switch (GET_CODE (dst))
    {
    case REG:
      {
	/* Retarget any REG_EQUAL/REG_EQUIV note to the vector mode as
	   well, so it stays consistent with the converted set.  */
	rtx tmp = find_reg_equal_equiv_note (insn);
	if (tmp)
	  PUT_MODE (XEXP (tmp, 0), V1TImode);
	PUT_MODE (dst, V1TImode);
	fix_debug_reg_uses (dst);
      }
      break;
    case MEM:
      PUT_MODE (dst, V1TImode);
      break;

    default:
      gcc_unreachable ();
    }

  switch (GET_CODE (src))
    {
    case REG:
      PUT_MODE (src, V1TImode);
      /* Call fix_debug_reg_uses only if SRC is never defined.  */
      if (!DF_REG_DEF_CHAIN (REGNO (src)))
	fix_debug_reg_uses (src);
      break;

    case MEM:
      PUT_MODE (src, V1TImode);
      break;

    case CONST_WIDE_INT:
      if (NONDEBUG_INSN_P (insn))
	{
	  /* Since there are no instructions to store 128-bit constant,
	     temporary register usage is required.  */
	  rtx tmp = gen_reg_rtx (V1TImode);
	  start_sequence ();
	  /* Force the constant into the constant pool and load it from
	     there; the original insn becomes that pool load into TMP,
	     followed by a TMP -> DST store.  */
	  src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
	  src = validize_mem (force_const_mem (V1TImode, src));
	  rtx_insn *seq = get_insns ();
	  end_sequence ();
	  if (seq)
	    emit_insn_before (seq, insn);
	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
	  dst = tmp;
	}
      break;

    case CONST_INT:
      /* Only the standard SSE constants (all-zeros, all-ones) are
	 accepted as candidates; map them to the vector equivalent.  */
      switch (standard_sse_constant_p (src, TImode))
	{
	case 1:
	  src = CONST0_RTX (GET_MODE (dst));
	  break;
	case 2:
	  src = CONSTM1_RTX (GET_MODE (dst));
	  break;
	default:
	  gcc_unreachable ();
	}
      if (NONDEBUG_INSN_P (insn))
	{
	  rtx tmp = gen_reg_rtx (V1TImode);
	  /* Since there are no instructions to store standard SSE
	     constant, temporary register usage is required.  */
	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
	  dst = tmp;
	}
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  recog_memoized (insn);
  df_insn_rescan (insn);
}
1209
/* Generate copies from defs used by the chain but not defined therein.
   Also populates defs_map which is used later by convert_insn.  */

void
general_scalar_chain::convert_registers ()
{
  bitmap_iterator bi;
  unsigned id;
  /* Allocate a fresh scalar-mode pseudo for each register that needs
     conversion and record the mapping for convert_insn.  */
  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
    {
      rtx chain_reg = gen_reg_rtx (smode);
      defs_map.put (regno_reg_rtx[id], chain_reg);
    }
  /* For every out-of-chain insn that defines a register the chain
     needs converted, emit the copies via make_vector_copies.  */
  EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
    for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
      if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
	make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
}
1228
1229 /* Convert whole chain creating required register
1230 conversions and copies. */
1231
1232 int
convert()1233 scalar_chain::convert ()
1234 {
1235 bitmap_iterator bi;
1236 unsigned id;
1237 int converted_insns = 0;
1238
1239 if (!dbg_cnt (stv_conversion))
1240 return 0;
1241
1242 if (dump_file)
1243 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
1244
1245 convert_registers ();
1246
1247 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
1248 {
1249 convert_insn (DF_INSN_UID_GET (id)->insn);
1250 converted_insns++;
1251 }
1252
1253 return converted_insns;
1254 }
1255
1256 /* Return 1 if INSN uses or defines a hard register.
1257 Hard register uses in a memory address are ignored.
1258 Clobbers and flags definitions are ignored. */
1259
1260 static bool
has_non_address_hard_reg(rtx_insn * insn)1261 has_non_address_hard_reg (rtx_insn *insn)
1262 {
1263 df_ref ref;
1264 FOR_EACH_INSN_DEF (ref, insn)
1265 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
1266 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
1267 && DF_REF_REGNO (ref) != FLAGS_REG)
1268 return true;
1269
1270 FOR_EACH_INSN_USE (ref, insn)
1271 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
1272 return true;
1273
1274 return false;
1275 }
1276
/* Check if comparison INSN may be transformed
   into vector comparison.  Currently we transform
   zero checks only which look like:

   (set (reg:CCZ 17 flags)
	(compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
			     (subreg:SI (reg:DI x) 0))
		     (const_int 0 [0])))  */

static bool
convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
{
  /* ??? Currently convertible for double-word DImode chain only.  */
  if (TARGET_64BIT || mode != DImode)
    return false;

  /* The conversion relies on ptest, which is an SSE4.1 insn.  */
  if (!TARGET_SSE4_1)
    return false;

  rtx def_set = single_set (insn);

  gcc_assert (def_set);

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  gcc_assert (GET_CODE (src) == COMPARE);

  /* Must be a zero-flag comparison setting the flags register.  */
  if (GET_CODE (dst) != REG
      || REGNO (dst) != FLAGS_REG
      || GET_MODE (dst) != CCZmode)
    return false;

  rtx op1 = XEXP (src, 0);
  rtx op2 = XEXP (src, 1);

  /* Only comparisons against zero are handled.  */
  if (op2 != CONST0_RTX (GET_MODE (op2)))
    return false;

  if (GET_CODE (op1) != IOR)
    return false;

  op2 = XEXP (op1, 1);
  op1 = XEXP (op1, 0);

  /* The two IOR operands must be SImode subregs covering the low and
     the high half of the same DImode value, in either order.  */
  if (!SUBREG_P (op1)
      || !SUBREG_P (op2)
      || GET_MODE (op1) != SImode
      || GET_MODE (op2) != SImode
      || ((SUBREG_BYTE (op1) != 0
	   || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
	  && (SUBREG_BYTE (op2) != 0
	      || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
    return false;

  op1 = SUBREG_REG (op1);
  op2 = SUBREG_REG (op2);

  /* Both halves must come from one and the same DImode register.  */
  if (op1 != op2
      || !REG_P (op1)
      || GET_MODE (op1) != DImode)
    return false;

  return true;
}
1342
/* The general version of scalar_to_vector_candidate_p.  Return true if
   INSN is a single-set instruction that the STV pass may convert from
   scalar MODE to the corresponding vector mode.  */

static bool
general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
{
  rtx def_set = single_set (insn);

  if (!def_set)
    return false;

  /* Hard registers (other than flags/clobbers) cannot be remapped.  */
  if (has_non_address_hard_reg (insn))
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  if (GET_CODE (src) == COMPARE)
    return convertible_comparison_p (insn, mode);

  /* We are interested in "mode" only.  */
  if ((GET_MODE (src) != mode
       && !CONST_INT_P (src))
      || GET_MODE (dst) != mode)
    return false;

  if (!REG_P (dst) && !MEM_P (dst))
    return false;

  switch (GET_CODE (src))
    {
    case ASHIFTRT:
      /* Vector arithmetic right shift of this element width needs
	 AVX512VL.  */
      if (!TARGET_AVX512VL)
	return false;
      /* FALLTHRU */

    case ASHIFT:
    case LSHIFTRT:
      /* Only constant, in-range shift counts are convertible.  */
      if (!CONST_INT_P (XEXP (src, 1))
	  || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
	return false;
      break;

    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      /* Vector min/max needs SSE4.1 for SImode elements and AVX512VL
	 for DImode elements.  */
      if ((mode == DImode && !TARGET_AVX512VL)
	  || (mode == SImode && !TARGET_SSE4_1))
	return false;
      /* Fallthru.  */

    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
      /* Second operand must be a register, memory or CONST_INT
	 of the right mode.  */
      if (!REG_P (XEXP (src, 1))
	  && !MEM_P (XEXP (src, 1))
	  && !CONST_INT_P (XEXP (src, 1)))
	return false;

      if (GET_MODE (XEXP (src, 1)) != mode
	  && !CONST_INT_P (XEXP (src, 1)))
	return false;
      break;

    case NEG:
    case NOT:
      break;

    case REG:
      return true;

    case MEM:
    case CONST_INT:
      /* Plain loads and constant moves only into a register.  */
      return REG_P (dst);

    default:
      return false;
    }

  /* First operand must be a register, memory, CONST_INT, or the NOT
     of a register forming an andnot combination.  */
  if (!REG_P (XEXP (src, 0))
      && !MEM_P (XEXP (src, 0))
      && !CONST_INT_P (XEXP (src, 0))
      /* Check for andnot case.  */
      && (GET_CODE (src) != AND
	  || GET_CODE (XEXP (src, 0)) != NOT
	  || !REG_P (XEXP (XEXP (src, 0), 0))))
    return false;

  if (GET_MODE (XEXP (src, 0)) != mode
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  return true;
}
1439
1440 /* The TImode version of scalar_to_vector_candidate_p. */
1441
1442 static bool
timode_scalar_to_vector_candidate_p(rtx_insn * insn)1443 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1444 {
1445 rtx def_set = single_set (insn);
1446
1447 if (!def_set)
1448 return false;
1449
1450 if (has_non_address_hard_reg (insn))
1451 return false;
1452
1453 rtx src = SET_SRC (def_set);
1454 rtx dst = SET_DEST (def_set);
1455
1456 /* Only TImode load and store are allowed. */
1457 if (GET_MODE (dst) != TImode)
1458 return false;
1459
1460 if (MEM_P (dst))
1461 {
1462 /* Check for store. Memory must be aligned or unaligned store
1463 is optimal. Only support store from register, standard SSE
1464 constant or CONST_WIDE_INT generated from piecewise store.
1465
1466 ??? Verify performance impact before enabling CONST_INT for
1467 __int128 store. */
1468 if (misaligned_operand (dst, TImode)
1469 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1470 return false;
1471
1472 switch (GET_CODE (src))
1473 {
1474 default:
1475 return false;
1476
1477 case REG:
1478 case CONST_WIDE_INT:
1479 return true;
1480
1481 case CONST_INT:
1482 return standard_sse_constant_p (src, TImode);
1483 }
1484 }
1485 else if (MEM_P (src))
1486 {
1487 /* Check for load. Memory must be aligned or unaligned load is
1488 optimal. */
1489 return (REG_P (dst)
1490 && (!misaligned_operand (src, TImode)
1491 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1492 }
1493
1494 return false;
1495 }
1496
/* For a register REGNO, scan instructions for its defs and uses.
   Put REGNO in REGS if a def or use isn't in CANDIDATES.  */

static void
timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
				   unsigned int regno)
{
  /* A def in a non-candidate insn makes the register non-convertible:
     the value would be produced in scalar mode.  */
  for (df_ref def = DF_REG_DEF_CHAIN (regno);
       def;
       def = DF_REF_NEXT_REG (def))
    {
      if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "r%d has non convertible def in insn %d\n",
		     regno, DF_REF_INSN_UID (def));

	  bitmap_set_bit (regs, regno);
	  break;
	}
    }

  /* Likewise for uses in non-candidate insns.  */
  for (df_ref ref = DF_REG_USE_CHAIN (regno);
       ref;
       ref = DF_REF_NEXT_REG (ref))
    {
      /* Debug instructions are skipped.  */
      if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
	  && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "r%d has non convertible use in insn %d\n",
		     regno, DF_REF_INSN_UID (ref));

	  bitmap_set_bit (regs, regno);
	  break;
	}
    }
}
1538
/* The TImode version of remove_non_convertible_regs.  Remove from
   CANDIDATES every insn touching a register that also has a def or
   use outside the candidate set.  */

static void
timode_remove_non_convertible_regs (bitmap candidates)
{
  bitmap_iterator bi;
  unsigned id;
  /* Registers found to be non-convertible.  */
  bitmap regs = BITMAP_ALLOC (NULL);

  EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
    {
      rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
      rtx dest = SET_DEST (def_set);
      rtx src = SET_SRC (def_set);

      /* Skip the insn when neither side is a pseudo that still needs
	 checking (not a reg, already flagged, or a hard register).  */
      if ((!REG_P (dest)
	   || bitmap_bit_p (regs, REGNO (dest))
	   || HARD_REGISTER_P (dest))
	  && (!REG_P (src)
	      || bitmap_bit_p (regs, REGNO (src))
	      || HARD_REGISTER_P (src)))
	continue;

      if (REG_P (dest))
	timode_check_non_convertible_regs (candidates, regs,
					   REGNO (dest));

      if (REG_P (src))
	timode_check_non_convertible_regs (candidates, regs,
					   REGNO (src));
    }

  /* Drop every candidate insn that defines or uses one of the
     non-convertible registers collected above.  */
  EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
    {
      for (df_ref def = DF_REG_DEF_CHAIN (id);
	   def;
	   def = DF_REF_NEXT_REG (def))
	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
	  {
	    if (dump_file)
	      fprintf (dump_file, "Removing insn %d from candidates list\n",
		       DF_REF_INSN_UID (def));

	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
	  }

      for (df_ref ref = DF_REG_USE_CHAIN (id);
	   ref;
	   ref = DF_REF_NEXT_REG (ref))
	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
	  {
	    if (dump_file)
	      fprintf (dump_file, "Removing insn %d from candidates list\n",
		       DF_REF_INSN_UID (ref));

	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
	  }
    }

  BITMAP_FREE (regs);
}
1600
/* Main STV pass function.  Find and convert scalar
   instructions into vector mode when profitable.  TIMODE_P selects
   between the TImode variant and the {SI,DI}mode variant of the
   pass.  Returns 0 (no TODO flags).  */

static unsigned int
convert_scalars_to_vector (bool timode_p)
{
  basic_block bb;
  int converted_insns = 0;

  bitmap_obstack_initialize (NULL);
  const machine_mode cand_mode[3] = { SImode, DImode, TImode };
  const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
  bitmap_head candidates[3];  /* { SImode, DImode, TImode } */
  for (unsigned i = 0; i < 3; ++i)
    bitmap_initialize (&candidates[i], &bitmap_default_obstack);

  calculate_dominance_info (CDI_DOMINATORS);
  df_set_flags (DF_DEFER_INSN_RESCAN);
  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
  df_analyze ();

  /* Find all instructions we want to convert into vector mode.  */
  if (dump_file)
    fprintf (dump_file, "Searching for mode conversion candidates...\n");

  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *insn;
      FOR_BB_INSNS (bb, insn)
	if (timode_p
	    && timode_scalar_to_vector_candidate_p (insn))
	  {
	    if (dump_file)
	      fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
		       INSN_UID (insn));

	    bitmap_set_bit (&candidates[2], INSN_UID (insn));
	  }
	else if (!timode_p)
	  {
	    /* Check {SI,DI}mode.  An insn can only be a candidate for
	       one of the two modes.  */
	    for (unsigned i = 0; i <= 1; ++i)
	      if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
		{
		  if (dump_file)
		    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
			     INSN_UID (insn), i == 0 ? "SImode" : "DImode");

		  bitmap_set_bit (&candidates[i], INSN_UID (insn));
		  break;
		}
	  }
    }

  if (timode_p)
    timode_remove_non_convertible_regs (&candidates[2]);

  /* Purely diagnostic: report when nothing at all was found.  */
  for (unsigned i = 0; i <= 2; ++i)
    if (!bitmap_empty_p (&candidates[i]))
      break;
    else if (i == 2 && dump_file)
      fprintf (dump_file, "There are no candidates for optimization.\n");

  /* Greedily pull chains out of the candidate sets and convert each
     one when its estimated gain is positive.  chain->build removes
     the chain's insns from the candidate bitmap.  */
  for (unsigned i = 0; i <= 2; ++i)
    while (!bitmap_empty_p (&candidates[i]))
      {
	unsigned uid = bitmap_first_set_bit (&candidates[i]);
	scalar_chain *chain;

	if (cand_mode[i] == TImode)
	  chain = new timode_scalar_chain;
	else
	  chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);

	/* Find instructions chain we want to convert to vector mode.
	   Check all uses and definitions to estimate all required
	   conversions.  */
	chain->build (&candidates[i], uid);

	if (chain->compute_convert_gain () > 0)
	  converted_insns += chain->convert ();
	else
	  if (dump_file)
	    fprintf (dump_file, "Chain #%d conversion is not profitable\n",
		     chain->chain_id);

	delete chain;
      }

  if (dump_file)
    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);

  for (unsigned i = 0; i <= 2; ++i)
    bitmap_release (&candidates[i]);
  bitmap_obstack_release (NULL);
  df_process_deferred_rescans ();

  /* Conversion means we may have 128bit register spills/fills
     which require aligned stack.  */
  if (converted_insns)
    {
      if (crtl->stack_alignment_needed < 128)
	crtl->stack_alignment_needed = 128;
      if (crtl->stack_alignment_estimated < 128)
	crtl->stack_alignment_estimated = 128;

      crtl->stack_realign_needed
	= INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
      crtl->stack_realign_tried = crtl->stack_realign_needed;

      crtl->stack_realign_processed = true;

      if (!crtl->drap_reg)
	{
	  rtx drap_rtx = targetm.calls.get_drap_rtx ();

	  /* stack_realign_drap and drap_rtx must match.  */
	  gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));

	  /* Do nothing if NULL is returned,
	     which means DRAP is not needed.  */
	  if (drap_rtx != NULL)
	    {
	      crtl->args.internal_arg_pointer = drap_rtx;

	      /* Call fixup_tail_calls to clean up
		 REG_EQUIV note if DRAP is needed.  */
	      fixup_tail_calls ();
	    }
	}

      /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  TImode
	 argument registers may have been switched to V1TImode; wrap
	 them back in TImode subregs for debug/ABI consistency.  */
      if (TARGET_64BIT)
	for (tree parm = DECL_ARGUMENTS (current_function_decl);
	     parm; parm = DECL_CHAIN (parm))
	  {
	    if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
	      continue;
	    if (DECL_RTL_SET_P (parm)
		&& GET_MODE (DECL_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_RTL (parm);
		if (REG_P (r))
		  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
	      }
	    if (DECL_INCOMING_RTL (parm)
		&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_INCOMING_RTL (parm);
		if (REG_P (r))
		  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
	      }
	  }
    }

  return 0;
}
1758
/* Modify the vzeroupper pattern in INSN so that it describes the effect
   that the instruction has on the SSE registers.  LIVE_REGS are the set
   of registers that are live across the instruction.

   For a live register R we use:

     (set (reg:V2DI R) (reg:V2DI R))

   which preserves the low 128 bits but clobbers the upper bits.
   (NOTE(review): the comment previously said V2DF; the code below
   builds V2DImode registers.)  */

static void
ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs)
{
  rtx pattern = PATTERN (insn);
  unsigned int nregs = TARGET_64BIT ? 16 : 8;
  /* Count the live SSE registers so the new PARALLEL has exactly one
     self-set per live register.  */
  unsigned int npats = nregs;
  for (unsigned int i = 0; i < nregs; ++i)
    {
      unsigned int regno = GET_SSE_REGNO (i);
      if (!bitmap_bit_p (live_regs, regno))
	npats--;
    }
  /* Nothing live across the insn: leave the pattern alone.  */
  if (npats == 0)
    return;
  /* Element 0 keeps the original vzeroupper body; self-sets for the
     live registers follow.  */
  rtvec vec = rtvec_alloc (npats + 1);
  RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0);
  for (unsigned int i = 0, j = 0; i < nregs; ++i)
    {
      unsigned int regno = GET_SSE_REGNO (i);
      if (!bitmap_bit_p (live_regs, regno))
	continue;
      rtx reg = gen_rtx_REG (V2DImode, regno);
      ++j;
      RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg);
    }
  XVEC (pattern, 0) = vec;
  INSN_CODE (insn) = -1;
  df_insn_rescan (insn);
}
1798
/* Walk the vzeroupper instructions in the function and annotate them
   with the effect that they have on the SSE registers.  */

static void
ix86_add_reg_usage_to_vzerouppers (void)
{
  basic_block bb;
  rtx_insn *insn;
  auto_bitmap live_regs;

  df_analyze ();
  FOR_EACH_BB_FN (bb, cfun)
    {
      /* Simulate liveness backwards from the block's live-out set so
	 LIVE_REGS is accurate at each vzeroupper encountered.  */
      bitmap_copy (live_regs, df_get_live_out (bb));
      df_simulate_initialize_backwards (bb, live_regs);
      FOR_BB_INSNS_REVERSE (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;
	  /* Annotate before simulating, so LIVE_REGS describes what is
	     live across (i.e. after) this insn.  */
	  if (vzeroupper_pattern (PATTERN (insn), VOIDmode))
	    ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
	  df_simulate_one_insn_backwards (bb, insn, live_regs);
	}
    }
}
1824
1825 static unsigned int
rest_of_handle_insert_vzeroupper(void)1826 rest_of_handle_insert_vzeroupper (void)
1827 {
1828 int i;
1829
1830 /* vzeroupper instructions are inserted immediately after reload to
1831 account for possible spills from 256bit or 512bit registers. The pass
1832 reuses mode switching infrastructure by re-running mode insertion
1833 pass, so disable entities that have already been processed. */
1834 for (i = 0; i < MAX_386_ENTITIES; i++)
1835 ix86_optimize_mode_switching[i] = 0;
1836
1837 ix86_optimize_mode_switching[AVX_U128] = 1;
1838
1839 /* Call optimize_mode_switching. */
1840 g->get_passes ()->execute_pass_mode_switching ();
1841 ix86_add_reg_usage_to_vzerouppers ();
1842 return 0;
1843 }
1844
namespace {

/* Pass descriptor for the vzeroupper insertion pass.  */
const pass_data pass_data_insert_vzeroupper =
{
  RTL_PASS, /* type */
  "vzeroupper", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

/* RTL pass that inserts vzeroupper instructions where profitable.  */
class pass_insert_vzeroupper : public rtl_opt_pass
{
public:
  pass_insert_vzeroupper(gcc::context *ctxt)
    : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      /* Only run for AVX targets where vzeroupper insertion is
	 enabled and worth the compile time.  */
      return TARGET_AVX
	     && TARGET_VZEROUPPER && flag_expensive_optimizations
	     && !optimize_size;
    }

  virtual unsigned int execute (function *)
    {
      return rest_of_handle_insert_vzeroupper ();
    }

}; // class pass_insert_vzeroupper

/* Pass descriptor for the scalar-to-vector (STV) pass.  */
const pass_data pass_data_stv =
{
  RTL_PASS, /* type */
  "stv", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

/* RTL pass that converts profitable scalar chains to vector mode.
   Instantiated twice: once for {SI,DI}mode, once for TImode.  */
class pass_stv : public rtl_opt_pass
{
public:
  pass_stv (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_stv, ctxt),
      timode_p (false)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      /* The TImode variant is only meaningful on 64-bit targets.  */
      return ((!timode_p || TARGET_64BIT)
	      && TARGET_STV && TARGET_SSE2 && optimize > 1);
    }

  virtual unsigned int execute (function *)
    {
      return convert_scalars_to_vector (timode_p);
    }

  opt_pass *clone ()
    {
      return new pass_stv (m_ctxt);
    }

  /* The pass manager uses this to select the TImode variant.  */
  void set_pass_param (unsigned int n, bool param)
    {
      gcc_assert (n == 0);
      timode_p = param;
    }

private:
  /* True for the TImode instantiation of the pass.  */
  bool timode_p;
}; // class pass_stv

} // anon namespace
1931
/* Factory entry point for the vzeroupper insertion pass.  */
rtl_opt_pass *
make_pass_insert_vzeroupper (gcc::context *ctxt)
{
  return new pass_insert_vzeroupper (ctxt);
}
1937
/* Factory entry point for the STV pass.  */
rtl_opt_pass *
make_pass_stv (gcc::context *ctxt)
{
  return new pass_stv (ctxt);
}
1943
/* Inserting ENDBRANCH instructions.  Emit ENDBR at the function entry
   and at every location that an indirect branch or a second return
   from a setjmp-like call could land on.  */

static unsigned int
rest_of_insert_endbranch (void)
{
  timevar_push (TV_MACH_DEP);

  rtx cet_eb;
  rtx_insn *insn;
  basic_block bb;

  /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is
     absent among function attributes.  Later an optimization will be
     introduced to make analysis if an address of a static function is
     taken.  A static function whose address is not taken will get a
     nocf_check attribute.  This will allow to reduce the number of EB.  */

  if (!lookup_attribute ("nocf_check",
			 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
      && (!flag_manual_endbr
	  || lookup_attribute ("cf_check",
			       DECL_ATTRIBUTES (cfun->decl)))
      && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
	  || ix86_cmodel == CM_LARGE
	  || ix86_cmodel == CM_LARGE_PIC
	  || flag_force_indirect_call
	  || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
	      && DECL_DLLIMPORT_P (cfun->decl))))
    {
      /* Queue ENDBR insertion to x86_function_profiler.  */
      if (crtl->profile && flag_fentry)
	cfun->machine->endbr_queued_at_entrance = true;
      else
	{
	  /* Emit ENDBR at the very start of the function.  */
	  cet_eb = gen_nop_endbr ();

	  bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	  insn = BB_HEAD (bb);
	  emit_insn_before (cet_eb, insn);
	}
    }

  bb = 0;
  FOR_EACH_BB_FN (bb, cfun)
    {
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
	   insn = NEXT_INSN (insn))
	{
	  if (CALL_P (insn))
	    {
	      /* setjmp-like calls may return a second time right after
		 the call, so that point needs an ENDBR.  */
	      bool need_endbr;
	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
	      if (!need_endbr && !SIBLING_CALL_P (insn))
		{
		  rtx call = get_call_rtx_from (insn);
		  rtx fnaddr = XEXP (call, 0);
		  tree fndecl = NULL_TREE;

		  /* Also generate ENDBRANCH for non-tail call which
		     may return via indirect branch.  */
		  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
		  if (fndecl == NULL_TREE)
		    fndecl = MEM_EXPR (fnaddr);
		  if (fndecl
		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
		    fndecl = NULL_TREE;
		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
		    {
		      tree fntype = TREE_TYPE (fndecl);
		      if (lookup_attribute ("indirect_return",
					    TYPE_ATTRIBUTES (fntype)))
			need_endbr = true;
		    }
		}
	      if (!need_endbr)
		continue;
	      /* Generate ENDBRANCH after CALL, which can return more than
		 twice, setjmp-like functions.  */

	      cet_eb = gen_nop_endbr ();
	      emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn));
	      continue;
	    }

	  if (JUMP_P (insn) && flag_cet_switch)
	    {
	      rtx target = JUMP_LABEL (insn);
	      if (target == NULL_RTX || ANY_RETURN_P (target))
		continue;

	      /* Check the jump is a switch table.  */
	      rtx_insn *label = as_a<rtx_insn *> (target);
	      rtx_insn *table = next_insn (label);
	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
		continue;

	      /* For the indirect jump find out all places it jumps and insert
		 ENDBRANCH there.  It should be done under a special flag to
		 control ENDBRANCH generation for switch stmts.  */
	      edge_iterator ei;
	      edge e;
	      basic_block dest_blk;

	      FOR_EACH_EDGE (e, ei, bb->succs)
		{
		  rtx_insn *insn;

		  dest_blk = e->dest;
		  insn = BB_HEAD (dest_blk);
		  gcc_assert (LABEL_P (insn));
		  cet_eb = gen_nop_endbr ();
		  emit_insn_after (cet_eb, insn);
		}
	      continue;
	    }

	  /* Labels whose address is taken can be reached indirectly;
	     they need an ENDBR too.  */
	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
	    {
	      cet_eb = gen_nop_endbr ();
	      emit_insn_after (cet_eb, insn);
	      continue;
	    }
	}
    }

  timevar_pop (TV_MACH_DEP);
  return 0;
}
2074
namespace {

/* Pass descriptor for the ENDBRANCH insertion (CET) pass.  */
const pass_data pass_data_insert_endbranch =
{
  RTL_PASS, /* type.  */
  "cet", /* name.  */
  OPTGROUP_NONE, /* optinfo_flags.  */
  TV_MACH_DEP, /* tv_id.  */
  0, /* properties_required.  */
  0, /* properties_provided.  */
  0, /* properties_destroyed.  */
  0, /* todo_flags_start.  */
  0, /* todo_flags_finish.  */
};

/* RTL pass inserting ENDBR instructions for CET branch tracking.  */
class pass_insert_endbranch : public rtl_opt_pass
{
public:
  pass_insert_endbranch (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      /* Only run when branch-level control-flow protection is on.  */
      return ((flag_cf_protection & CF_BRANCH));
    }

  virtual unsigned int execute (function *)
    {
      return rest_of_insert_endbranch ();
    }

}; // class pass_insert_endbranch

} // anon namespace
2111
/* Factory entry point for the ENDBRANCH insertion pass.  */
rtl_opt_pass *
make_pass_insert_endbranch (gcc::context *ctxt)
{
  return new pass_insert_endbranch (ctxt);
}
2117
/* At entry of the nearest common dominator for basic blocks with
   conversions, generate a single
	vxorps %xmmN, %xmmN, %xmmN
   for all
	vcvtss2sd  op, %xmmN, %xmmX
	vcvtsd2ss  op, %xmmN, %xmmX
	vcvtsi2ss  op, %xmmN, %xmmX
	vcvtsi2sd  op, %xmmN, %xmmX

   NB: We want to generate only a single vxorps to cover the whole
   function.  The LCM algorithm isn't appropriate here since it may
   place a vxorps inside the loop.  */

static unsigned int
remove_partial_avx_dependency (void)
{
  timevar_push (TV_MACH_DEP);

  bitmap_obstack_initialize (NULL);
  /* Blocks containing at least one rewritten conversion; used later
     to place the single vxorps.  */
  bitmap convert_bbs = BITMAP_ALLOC (NULL);

  basic_block bb;
  rtx_insn *insn, *set_insn;
  rtx set;
  /* Shared all-zero V4SF register; allocated lazily on the first
     conversion found.  */
  rtx v4sf_const0 = NULL_RTX;

  /* New insns that may throw (inherited REG_EH_REGION); their blocks
     must be split afterwards.  */
  auto_vec<rtx_insn *> control_flow_insns;

  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  set = single_set (insn);
	  if (!set)
	    continue;

	  if (get_attr_avx_partial_xmm_update (insn)
	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
	    continue;

	  if (!v4sf_const0)
	    {
	      /* First conversion found: set up the df problems we
		 need, and create the shared zero register.  */
	      calculate_dominance_info (CDI_DOMINATORS);
	      df_set_flags (DF_DEFER_INSN_RESCAN);
	      df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
	      df_md_add_problem ();
	      df_analyze ();
	      v4sf_const0 = gen_reg_rtx (V4SFmode);
	    }

	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and
	     vec_merge with subreg.  */
	  rtx src = SET_SRC (set);
	  rtx dest = SET_DEST (set);
	  machine_mode dest_mode = GET_MODE (dest);

	  rtx zero;
	  machine_mode dest_vecmode;
	  if (dest_mode == E_SFmode)
	    {
	      dest_vecmode = V4SFmode;
	      zero = v4sf_const0;
	    }
	  else
	    {
	      dest_vecmode = V2DFmode;
	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
	    }

	  /* Change source to vector mode.  */
	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
				   GEN_INT (HOST_WIDE_INT_1U));
	  /* Change destination to vector mode.  */
	  rtx vec = gen_reg_rtx (dest_vecmode);
	  /* Generate an XMM vector SET.  */
	  set = gen_rtx_SET (vec, src);
	  set_insn = emit_insn_before (set, insn);
	  df_insn_rescan (set_insn);

	  if (cfun->can_throw_non_call_exceptions)
	    {
	      /* Handle REG_EH_REGION note.  */
	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	      if (note)
		{
		  control_flow_insns.safe_push (set_insn);
		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
		}
	    }

	  /* The original insn becomes a scalar-subreg extract of the
	     freshly computed vector.  */
	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
	  set = gen_rtx_SET (dest, src);

	  /* Drop possible dead definitions.  */
	  PATTERN (insn) = set;

	  INSN_CODE (insn) = -1;
	  recog_memoized (insn);
	  df_insn_rescan (insn);
	  bitmap_set_bit (convert_bbs, bb->index);
	}
    }

  if (v4sf_const0)
    {
      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below.  */
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      /* Generate a vxorps at entry of the nearest dominator for basic
	 blocks with conversions, which is in the fake loop that
	 contains the whole function, so that there is only a single
	 vxorps in the whole function.  */
      bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
					     convert_bbs);
      while (bb->loop_father->latch
	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
	bb = get_immediate_dominator (CDI_DOMINATORS,
				      bb->loop_father->header);

      set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));

      /* Find the first non-debug insn of the block, if any, and place
	 the zeroing set before it (or at the block end otherwise).  */
      insn = BB_HEAD (bb);
      while (insn && !NONDEBUG_INSN_P (insn))
	{
	  if (insn == BB_END (bb))
	    {
	      insn = NULL;
	      break;
	    }
	  insn = NEXT_INSN (insn);
	}
      if (insn == BB_HEAD (bb))
	set_insn = emit_insn_before (set, insn);
      else
	set_insn = emit_insn_after (set,
				    insn ? PREV_INSN (insn) : BB_END (bb));
      df_insn_rescan (set_insn);
      df_process_deferred_rescans ();
      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  bitmap_obstack_release (NULL);
  BITMAP_FREE (convert_bbs);

  timevar_pop (TV_MACH_DEP);
  return 0;
}
2288
namespace {

/* Pass descriptor for the "rpad" (remove partial AVX dependency) RTL
   pass.  Runs under the machine-dependent timevar and triggers a DF
   finish at the end.  */
const pass_data pass_data_remove_partial_avx_dependency =
{
  RTL_PASS, /* type */
  "rpad", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

/* RTL pass wrapper around remove_partial_avx_dependency.  */
class pass_remove_partial_avx_dependency : public rtl_opt_pass
{
public:
  pass_remove_partial_avx_dependency (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only for AVX targets that suffer partial SSE register
     dependencies, when using SSE math and optimizing the function
     for speed.  */
  virtual bool gate (function *)
    {
      return (TARGET_AVX
	      && TARGET_SSE_PARTIAL_REG_DEPENDENCY
	      && TARGET_SSE_MATH
	      && optimize
	      && optimize_function_for_speed_p (cfun));
    }

  virtual unsigned int execute (function *)
    {
      return remove_partial_avx_dependency ();
    }
}; // class pass_remove_partial_avx_dependency

} // anon namespace
2328
2329 rtl_opt_pass *
make_pass_remove_partial_avx_dependency(gcc::context * ctxt)2330 make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
2331 {
2332 return new pass_remove_partial_avx_dependency (ctxt);
2333 }
2334
2335 /* This compares the priority of target features in function DECL1
2336 and DECL2. It returns positive value if DECL1 is higher priority,
2337 negative value if DECL2 is higher priority and 0 if they are the
2338 same. */
2339
2340 int
ix86_compare_version_priority(tree decl1,tree decl2)2341 ix86_compare_version_priority (tree decl1, tree decl2)
2342 {
2343 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
2344 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
2345
2346 return (int)priority1 - (int)priority2;
2347 }
2348
2349 /* V1 and V2 point to function versions with different priorities
2350 based on the target ISA. This function compares their priorities. */
2351
2352 static int
feature_compare(const void * v1,const void * v2)2353 feature_compare (const void *v1, const void *v2)
2354 {
2355 typedef struct _function_version_info
2356 {
2357 tree version_decl;
2358 tree predicate_chain;
2359 unsigned int dispatch_priority;
2360 } function_version_info;
2361
2362 const function_version_info c1 = *(const function_version_info *)v1;
2363 const function_version_info c2 = *(const function_version_info *)v2;
2364 return (c2.dispatch_priority - c1.dispatch_priority);
2365 }
2366
/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
   to return a pointer to VERSION_DECL if the outcome of the expression
   formed by PREDICATE_CHAIN is true.  This function will be called during
   version dispatch to decide which function version to execute.  It returns
   the basic block at the end, to which more conditions can be added.  */

static basic_block
add_condition_to_bb (tree function_decl, tree version_decl,
		     tree predicate_chain, basic_block new_bb)
{
  gimple *return_stmt;
  tree convert_expr, result_var;
  gimple *convert_stmt;
  gimple *call_cond_stmt;
  gimple *if_else_stmt;

  basic_block bb1, bb2, bb3;
  edge e12, e23;

  tree cond_var, and_expr_var = NULL_TREE;
  gimple_seq gseq;

  tree predicate_decl, predicate_arg;

  /* All statements are built in FUNCTION_DECL's (the dispatcher's)
     context.  */
  push_cfun (DECL_STRUCT_FUNCTION (function_decl));

  gcc_assert (new_bb != NULL);
  gseq = bb_seq (new_bb);


  /* Build "result_var = (void *) &version_decl; return result_var;" --
     the statements emitted when this version is selected.  */
  convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
			 build_fold_addr_expr (version_decl));
  result_var = create_tmp_var (ptr_type_node);
  convert_stmt = gimple_build_assign (result_var, convert_expr);
  return_stmt = gimple_build_return (result_var);

  /* No predicates: this version is returned unconditionally (the
     default version).  No CFG splitting needed.  */
  if (predicate_chain == NULL_TREE)
    {
      gimple_seq_add_stmt (&gseq, convert_stmt);
      gimple_seq_add_stmt (&gseq, return_stmt);
      set_bb_seq (new_bb, gseq);
      gimple_set_bb (convert_stmt, new_bb);
      gimple_set_bb (return_stmt, new_bb);
      pop_cfun ();
      return new_bb;
    }

  /* Emit one call per predicate and AND the results together.  Each
     chain entry is a (predicate-decl . predicate-arg) pair.  */
  while (predicate_chain != NULL)
    {
      cond_var = create_tmp_var (integer_type_node);
      predicate_decl = TREE_PURPOSE (predicate_chain);
      predicate_arg = TREE_VALUE (predicate_chain);
      call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
      gimple_call_set_lhs (call_cond_stmt, cond_var);

      gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
      gimple_set_bb (call_cond_stmt, new_bb);
      gimple_seq_add_stmt (&gseq, call_cond_stmt);

      predicate_chain = TREE_CHAIN (predicate_chain);

      if (and_expr_var == NULL)
	and_expr_var = cond_var;
      else
	{
	  gimple *assign_stmt;
	  /* Use MIN_EXPR to check if any integer is zero?.
	     and_expr_var = min_expr <cond_var, and_expr_var>  */
	  assign_stmt = gimple_build_assign (and_expr_var,
					     build2 (MIN_EXPR, integer_type_node,
						     cond_var, and_expr_var));

	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
	  gimple_set_bb (assign_stmt, new_bb);
	  gimple_seq_add_stmt (&gseq, assign_stmt);
	}
    }

  /* if (and_expr_var > 0) -- all predicates returned a positive
     value, so this version can be dispatched.  */
  if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
				    integer_zero_node,
				    NULL_TREE, NULL_TREE);
  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
  gimple_set_bb (if_else_stmt, new_bb);
  gimple_seq_add_stmt (&gseq, if_else_stmt);

  gimple_seq_add_stmt (&gseq, convert_stmt);
  gimple_seq_add_stmt (&gseq, return_stmt);
  set_bb_seq (new_bb, gseq);

  /* Split bb1 after the condition; bb2 holds the convert/return
     statements, reached on the true edge.  */
  bb1 = new_bb;
  e12 = split_block (bb1, if_else_stmt);
  bb2 = e12->dest;
  e12->flags &= ~EDGE_FALLTHRU;
  e12->flags |= EDGE_TRUE_VALUE;

  /* Split again after the return; bb3 is where further conditions for
     lower-priority versions will be appended.  */
  e23 = split_block (bb2, return_stmt);

  gimple_set_bb (convert_stmt, bb2);
  gimple_set_bb (return_stmt, bb2);

  bb3 = e23->dest;
  /* Failing the predicate check falls through to bb3.  */
  make_edge (bb1, bb3, EDGE_FALSE_VALUE);

  /* bb2 returns, so it goes to the exit block instead of bb3.  */
  remove_edge (e23);
  make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);

  pop_cfun ();

  return bb3;
}
2477
/* This function generates the dispatch function for
   multi-versioned functions.  DISPATCH_DECL is the function which will
   contain the dispatch logic.  FNDECLS are the function choices for
   dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
   in DISPATCH_DECL in which the dispatch code is generated.  */

static int
dispatch_function_versions (tree dispatch_decl,
			    void *fndecls_p,
			    basic_block *empty_bb)
{
  tree default_decl;
  gimple *ifunc_cpu_init_stmt;
  gimple_seq gseq;
  int ix;
  tree ele;
  vec<tree> *fndecls;
  unsigned int num_versions = 0;
  unsigned int actual_versions = 0;
  unsigned int i;

  /* One entry per non-default version; the layout must match the one
     assumed by feature_compare.  */
  struct _function_version_info
    {
      tree version_decl;
      tree predicate_chain;
      unsigned int dispatch_priority;
    }*function_version_info;

  gcc_assert (dispatch_decl != NULL
	      && fndecls_p != NULL
	      && empty_bb != NULL);

  /*fndecls_p is actually a vector.  */
  fndecls = static_cast<vec<tree> *> (fndecls_p);

  /* At least one more version other than the default.  */
  num_versions = fndecls->length ();
  gcc_assert (num_versions >= 2);

  function_version_info = (struct _function_version_info *)
    XNEWVEC (struct _function_version_info, (num_versions - 1));

  /* The first version in the vector is the default decl.  */
  default_decl = (*fndecls)[0];

  push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));

  gseq = bb_seq (*empty_bb);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicitly call __builtin_cpu_init here.  */
  ifunc_cpu_init_stmt
    = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
  set_bb_seq (*empty_bb, gseq);

  pop_cfun ();


  /* Collect the predicate chain and priority of every non-default
     version (index 0 is the default, so start at 1).  */
  for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
    {
      tree version_decl = ele;
      tree predicate_chain = NULL_TREE;
      unsigned int priority;
      /* Get attribute string, parse it and find the right predicate decl.
	 The predicate function could be a lengthy combination of many
	 features, like arch-type and various isa-variants.  */
      priority = get_builtin_code_for_version (version_decl,
					       &predicate_chain);

      /* A version without predicates needs no conditional dispatch.  */
      if (predicate_chain == NULL_TREE)
	continue;

      function_version_info [actual_versions].version_decl = version_decl;
      function_version_info [actual_versions].predicate_chain
	= predicate_chain;
      function_version_info [actual_versions].dispatch_priority = priority;
      actual_versions++;
    }

  /* Sort the versions according to descending order of dispatch priority.  The
     priority is based on the ISA.  This is not a perfect solution.  There
     could still be ambiguity.  If more than one function version is suitable
     to execute, which one should be dispatched?  In future, allow the user
     to specify a dispatch priority next to the version.  */
  qsort (function_version_info, actual_versions,
	 sizeof (struct _function_version_info), feature_compare);

  /* Emit one predicate-guarded return per version, highest priority
     first; each call appends to the dispatcher and returns the next
     insertion block.  */
  for  (i = 0; i < actual_versions; ++i)
    *empty_bb = add_condition_to_bb (dispatch_decl,
				     function_version_info[i].version_decl,
				     function_version_info[i].predicate_chain,
				     *empty_bb);

  /* dispatch default version at the end.  */
  *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
				   NULL, *empty_bb);

  free (function_version_info);
  return 0;
}
2579
2580 /* This function changes the assembler name for functions that are
2581 versions. If DECL is a function version and has a "target"
2582 attribute, it appends the attribute string to its assembler name. */
2583
2584 static tree
ix86_mangle_function_version_assembler_name(tree decl,tree id)2585 ix86_mangle_function_version_assembler_name (tree decl, tree id)
2586 {
2587 tree version_attr;
2588 const char *orig_name, *version_string;
2589 char *attr_str, *assembler_name;
2590
2591 if (DECL_DECLARED_INLINE_P (decl)
2592 && lookup_attribute ("gnu_inline",
2593 DECL_ATTRIBUTES (decl)))
2594 error_at (DECL_SOURCE_LOCATION (decl),
2595 "function versions cannot be marked as %<gnu_inline%>,"
2596 " bodies have to be generated");
2597
2598 if (DECL_VIRTUAL_P (decl)
2599 || DECL_VINDEX (decl))
2600 sorry ("virtual function multiversioning not supported");
2601
2602 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
2603
2604 /* target attribute string cannot be NULL. */
2605 gcc_assert (version_attr != NULL_TREE);
2606
2607 orig_name = IDENTIFIER_POINTER (id);
2608 version_string
2609 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
2610
2611 if (strcmp (version_string, "default") == 0)
2612 return id;
2613
2614 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
2615 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
2616
2617 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
2618
2619 /* Allow assembler name to be modified if already set. */
2620 if (DECL_ASSEMBLER_NAME_SET_P (decl))
2621 SET_DECL_RTL (decl, NULL);
2622
2623 tree ret = get_identifier (assembler_name);
2624 XDELETEVEC (attr_str);
2625 XDELETEVEC (assembler_name);
2626 return ret;
2627 }
2628
/* Mangle the assembler name of DECL, currently ID.  For function
   versions the target suffix is appended; a subtarget may apply
   further mangling.  Returns the (possibly new) identifier.  */

tree
ix86_mangle_decl_assembler_name (tree decl, tree id)
{
  /* For function version, add the target suffix to the assembler name.  */
  if (TREE_CODE (decl) == FUNCTION_DECL
      && DECL_FUNCTION_VERSIONED (decl))
    id = ix86_mangle_function_version_assembler_name (decl, id);
#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
  id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
#endif

  return id;
}
2642
/* Make a dispatcher declaration for the multi-versioned function DECL.
   Calls to DECL function will be replaced with calls to the dispatcher
   by the front-end.  Returns the decl of the dispatcher function.  */

tree
ix86_get_function_versions_dispatcher (void *decl)
{
  tree fn = (tree) decl;
  struct cgraph_node *node = NULL;
  struct cgraph_node *default_node = NULL;
  struct cgraph_function_version_info *node_v = NULL;
  struct cgraph_function_version_info *first_v = NULL;

  tree dispatch_decl = NULL;

  struct cgraph_function_version_info *default_version_info = NULL;

  gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));

  node = cgraph_node::get (fn);
  gcc_assert (node != NULL);

  node_v = node->function_version ();
  gcc_assert (node_v != NULL);

  /* The dispatcher is created once and cached on the version info.  */
  if (node_v->dispatcher_resolver != NULL)
    return node_v->dispatcher_resolver;

  /* Find the default version and make it the first node.  */
  first_v = node_v;
  /* Go to the beginning of the chain.  */
  while (first_v->prev != NULL)
    first_v = first_v->prev;
  default_version_info = first_v;
  /* Walk forward until the version marked "default" is found.  */
  while (default_version_info != NULL)
    {
      if (is_function_default_version
	   (default_version_info->this_node->decl))
	break;
      default_version_info = default_version_info->next;
    }

  /* If there is no default node, just return NULL.  */
  if (default_version_info == NULL)
    return NULL;

  /* Make default info the first node.  */
  if (first_v != default_version_info)
    {
      /* Unlink the default version and splice it in at the head of
	 the doubly-linked chain.  */
      default_version_info->prev->next = default_version_info->next;
      if (default_version_info->next)
	default_version_info->next->prev = default_version_info->prev;
      first_v->prev = default_version_info;
      default_version_info->next = first_v;
      default_version_info->prev = NULL;
    }

  default_node = default_version_info->this_node;

#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
  if (targetm.has_ifunc_p ())
    {
      struct cgraph_function_version_info *it_v = NULL;
      struct cgraph_node *dispatcher_node = NULL;
      struct cgraph_function_version_info *dispatcher_version_info = NULL;

      /* Right now, the dispatching is done via ifunc.  */
      dispatch_decl = make_dispatcher_decl (default_node->decl);

      dispatcher_node = cgraph_node::get_create (dispatch_decl);
      gcc_assert (dispatcher_node != NULL);
      dispatcher_node->dispatcher_function = 1;
      dispatcher_version_info
	= dispatcher_node->insert_new_function_version ();
      dispatcher_version_info->next = default_version_info;
      dispatcher_node->definition = 1;

      /* Set the dispatcher for all the versions.  */
      it_v = default_version_info;
      while (it_v != NULL)
	{
	  it_v->dispatcher_resolver = dispatch_decl;
	  it_v = it_v->next;
	}
    }
  else
#endif
    {
      error_at (DECL_SOURCE_LOCATION (default_node->decl),
		"multiversioning needs %<ifunc%> which is not supported "
		"on this target");
    }

  return dispatch_decl;
}
2738
/* Make the resolver function decl to dispatch the versions of
   a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is
   ifunc alias that will point to the created resolver.  Create an
   empty basic block in the resolver and store the pointer in
   EMPTY_BB.  Return the decl of the resolver function.  */

static tree
make_resolver_func (const tree default_decl,
		    const tree ifunc_alias_decl,
		    basic_block *empty_bb)
{
  tree decl, type, t;

  /* Create resolver function name based on default_decl.  */
  tree decl_name = clone_function_name (default_decl, "resolver");
  const char *resolver_name = IDENTIFIER_POINTER (decl_name);

  /* The resolver function should return a (void *).  */
  type = build_function_type_list (ptr_type_node, NULL_TREE);

  decl = build_fn_decl (resolver_name, type);
  SET_DECL_ASSEMBLER_NAME (decl, decl_name);

  /* The resolver is compiler-generated, local, and must not be
     inlined into callers of the dispatcher.  */
  DECL_NAME (decl) = decl_name;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 1;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;

  /* Resolver is not external, body is generated.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_EXTERNAL (ifunc_alias_decl) = 0;

  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  DECL_STATIC_CONSTRUCTOR (decl) = 0;

  if (DECL_COMDAT_GROUP (default_decl)
      || TREE_PUBLIC (default_decl))
    {
      /* In this case, each translation unit with a call to this
	 versioned function will put out a resolver.  Ensure it
	 is comdat to keep just one copy.  */
      DECL_COMDAT (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }
  else
    TREE_PUBLIC (ifunc_alias_decl) = 0;

  /* Build result decl and add to function_decl.  */
  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
  DECL_CONTEXT (t) = decl;
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  /* Lower the new function into GIMPLE and create its initial empty
     body, then register it with the callgraph.  */
  gimplify_function_tree (decl);
  push_cfun (DECL_STRUCT_FUNCTION (decl));
  *empty_bb = init_lowered_empty_function (decl, false,
					   profile_count::uninitialized ());

  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));

  pop_cfun ();

  gcc_assert (ifunc_alias_decl != NULL);
  /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
  DECL_ATTRIBUTES (ifunc_alias_decl)
    = make_attribute ("ifunc", resolver_name,
		      DECL_ATTRIBUTES (ifunc_alias_decl));

  /* Create the alias for dispatch to resolver here.  */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
  return decl;
}
2816
/* Generate the dispatching code body to dispatch multi-versioned function
   DECL.  The target hook is called to process the "target" attributes and
   provide the code to dispatch the right function at run-time.  NODE points
   to the dispatcher decl whose body will be created.  */

tree
ix86_generate_version_dispatcher_body (void *node_p)
{
  tree resolver_decl;
  basic_block empty_bb;
  tree default_ver_decl;
  struct cgraph_node *versn;
  struct cgraph_node *node;

  struct cgraph_function_version_info *node_version_info = NULL;
  struct cgraph_function_version_info *versn_info = NULL;

  node = (cgraph_node *)node_p;

  node_version_info = node->function_version ();
  gcc_assert (node->dispatcher_function
	      && node_version_info != NULL);

  /* The resolver body is generated only once; reuse it if present.  */
  if (node_version_info->dispatcher_resolver)
    return node_version_info->dispatcher_resolver;

  /* The first version in the chain corresponds to the default version.  */
  default_ver_decl = node_version_info->next->this_node->decl;

  /* node is going to be an alias, so remove the finalized bit.  */
  node->definition = false;

  resolver_decl = make_resolver_func (default_ver_decl,
				      node->decl, &empty_bb);

  node_version_info->dispatcher_resolver = resolver_decl;

  push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));

  /* Collect the decls of all versions to hand to
     dispatch_function_versions.  */
  auto_vec<tree, 2> fn_ver_vec;

  for (versn_info = node_version_info->next; versn_info;
       versn_info = versn_info->next)
    {
      versn = versn_info->this_node;
      /* Check for virtual functions here again, as by this time it should
	 have been determined if this function needs a vtable index or
	 not.  This happens for methods in derived classes that override
	 virtual methods in base classes but are not explicitly marked as
	 virtual.  */
      if (DECL_VINDEX (versn->decl))
	sorry ("virtual function multiversioning not supported");

      fn_ver_vec.safe_push (versn->decl);
    }

  dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
  /* The generated body contains new calls; rebuild the cgraph edges.  */
  cgraph_edge::rebuild_edges ();
  pop_cfun ();
  return resolver_decl;
}
2878
2879
2880