/* Copyright (C) 1988-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
#include "i386-builtins.h"
#include "i386-features.h"

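/* Base names for the ms2sysv save/restore stubs; get_stub_name below
   composes the full stub name from one of these, an "avx"/"sse" prefix
   and the managed register count.  */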
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};

const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:   0		1		2		3
    Offset:					realigned or	aligned + 8
    Register	   aligned	aligned + 8	aligned w/HFP	w/HFP	*/
    XMM15_REG,	/* 0x10		0x18		0x10		0x18	*/
    XMM14_REG,	/* 0x20		0x28		0x20		0x28	*/
    XMM13_REG,	/* 0x30		0x38		0x30		0x38	*/
    XMM12_REG,	/* 0x40		0x48		0x40		0x48	*/
    XMM11_REG,	/* 0x50		0x58		0x50		0x58	*/
    XMM10_REG,	/* 0x60		0x68		0x60		0x68	*/
    XMM9_REG,	/* 0x70		0x78		0x70		0x78	*/
    XMM8_REG,	/* 0x80		0x88		0x80		0x88	*/
    XMM7_REG,	/* 0x90		0x98		0x90		0x98	*/
    XMM6_REG,	/* 0xa0		0xa8		0xa0		0xa8	*/
    SI_REG,	/* 0xa8		0xb0		0xa8		0xb0	*/
    DI_REG,	/* 0xb0		0xb8		0xb0		0xb8	*/
    BX_REG,	/* 0xb8		0xc0		0xb8		0xc0	*/
    BP_REG,	/* 0xc0		0xc8		N/A		N/A	*/
    R12_REG,	/* 0xc8		0xd0		0xc0		0xc8	*/
    R13_REG,	/* 0xd0		0xd8		0xc8		0xd0	*/
    R14_REG,	/* 0xd8		0xe0		0xd0		0xd8	*/
    R15_REG,	/* 0xe0		0xe8		0xd8		0xe0	*/
};

/* Instantiate static const values.  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;

/* Initialize xlogue_layout::s_stub_names to zero.  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				[STUB_NAME_MAX_LEN];

/* Instantiate all xlogue_layout instances.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};

/* Return an appropriate const instance of xlogue_layout based upon values
   in cfun->machine and crtl.  */
const class xlogue_layout &
xlogue_layout::get_instance ()
{
  enum xlogue_stub_sets stub_set;
  bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;

  if (stack_realign_fp)
    stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else if (frame_pointer_needed)
    stub_set = aligned_plus_8
	      ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
	      : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else
    stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;

  return s_instances[stub_set];
}

/* Determine how many clobbered registers can be saved by the stub.
   Returns the count of registers the stub will save and restore.  */
unsigned
xlogue_layout::count_stub_managed_regs ()
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i, count;
  unsigned regno;

  for (count = i = MIN_REGS; i < MAX_REGS; ++i)
    {
      regno = REG_ORDER[i];
      if (regno == BP_REG && hfp)
	continue;
      if (!ix86_save_reg (regno, false, false))
	break;
      ++count;
    }
  return count;
}

/* Determine if register REGNO is a stub managed register given the
   total COUNT of stub managed registers.  */
bool
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i;

  for (i = 0; i < count; ++i)
    {
      gcc_assert (i < MAX_REGS);
      if (REG_ORDER[i] == BP_REG && hfp)
	++count;
      else if (REG_ORDER[i] == regno)
	return true;
    }
  return false;
}

/* Constructor for xlogue_layout.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp), m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	offset += 8;

      m_regs[j].regno    = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}

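/* Return the name of stub STUB for MIN_REGS + N_EXTRA_REGS registers,
   composing and caching it in s_stub_names on first use.  */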
const char *
xlogue_layout::get_stub_name (enum xlogue_stub stub,
			      unsigned n_extra_regs)
{
  const int have_avx = TARGET_AVX;
  char *name = s_stub_names[!!have_avx][stub][n_extra_regs];

  /* Lazy init.  */
  if (!*name)
    {
      int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
			  (have_avx ? "avx" : "sse"),
			  STUB_BASE_NAMES[stub],
			  MIN_REGS + n_extra_regs);
      gcc_checking_assert (res < (int) STUB_NAME_MAX_LEN);
    }

  return name;
}

/* Return rtx of a symbol ref for the entry point (based upon
   cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
rtx
xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
{
  const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
  gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
  gcc_assert (stub < XLOGUE_STUB_COUNT);
  gcc_assert (crtl->stack_realign_finalized);

  return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
}

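/* Running counter from which each new chain gets a unique chain_id,
   used in dump output.  */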
unsigned scalar_chain::max_id = 0;

namespace {

/* Initialize new chain.  */

scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
{
  smode = smode_;
  vmode = vmode_;

  chain_id = ++max_id;

  if (dump_file)
    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);

  bitmap_obstack_initialize (NULL);
  insns = BITMAP_ALLOC (NULL);
  defs = BITMAP_ALLOC (NULL);
  defs_conv = BITMAP_ALLOC (NULL);
  queue = NULL;
}

/* Free chain's data.  */

scalar_chain::~scalar_chain ()
{
  BITMAP_FREE (insns);
  BITMAP_FREE (defs);
  BITMAP_FREE (defs_conv);
  bitmap_obstack_release (NULL);
}

/* Add instruction INSN_UID into the chain's queue.  */

void
scalar_chain::add_to_queue (unsigned insn_uid)
{
  if (bitmap_bit_p (insns, insn_uid)
      || bitmap_bit_p (queue, insn_uid))
    return;

  if (dump_file)
    fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
	     insn_uid, chain_id);
  bitmap_set_bit (queue, insn_uid);
}

general_scalar_chain::general_scalar_chain (enum machine_mode smode_,
					    enum machine_mode vmode_)
     : scalar_chain (smode_, vmode_)
{
  insns_conv = BITMAP_ALLOC (NULL);
  n_sse_to_integer = 0;
  n_integer_to_sse = 0;
}

general_scalar_chain::~general_scalar_chain ()
{
  BITMAP_FREE (insns_conv);
}

/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  */

void
general_scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;
      n_integer_to_sse++;
    }
  else
    {
      if (!reg_new)
	return;
      n_sse_to_integer++;
    }

  if (dump_file)
    fprintf (dump_file,
	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}

/* For TImode conversion this is unused; timode_remove_non_convertible_regs
   ensures no dual-mode defs arise in a TImode chain.  */

void
timode_scalar_chain::mark_dual_mode_def (df_ref)
{
  gcc_unreachable ();
}

/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  */

void
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
{
  df_link *chain;

  gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
	      || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
  add_to_queue (DF_REF_INSN_UID (ref));

  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
    {
      unsigned uid = DF_REF_INSN_UID (chain->ref);

      if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
	continue;

      if (!DF_REF_REG_MEM_P (chain->ref))
	{
	  if (bitmap_bit_p (insns, uid))
	    continue;

	  if (bitmap_bit_p (candidates, uid))
	    {
	      add_to_queue (uid);
	      continue;
	    }
	}

      if (DF_REF_REG_DEF_P (chain->ref))
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (chain->ref);
	}
      else
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (ref);
	}
    }
}

/* Add instruction into a chain.  */

void
scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
{
  if (bitmap_bit_p (insns, insn_uid))
    return;

  if (dump_file)
    fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);

  bitmap_set_bit (insns, insn_uid);

  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
  rtx def_set = single_set (insn);
  if (def_set && REG_P (SET_DEST (def_set))
      && !HARD_REGISTER_P (SET_DEST (def_set)))
    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));

  /* ???  The following is quadratic since analyze_register_chain
     iterates over all refs to look for dual-mode regs.  Instead this
     should be done separately for all regs mentioned in the chain once.  */
  df_ref ref;
  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!HARD_REGISTER_P (DF_REF_REG (ref)))
      analyze_register_chain (candidates, ref);
  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      analyze_register_chain (candidates, ref);
}

/* Build new chain starting from insn INSN_UID recursively
   adding all dependent uses and definitions.  */
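/* A simple worklist algorithm: pop an insn UID off QUEUE, pull it out
   of CANDIDATES, and let add_insn queue any connected insns until
   QUEUE is empty.  */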

void
scalar_chain::build (bitmap candidates, unsigned insn_uid)
{
  queue = BITMAP_ALLOC (NULL);
  bitmap_set_bit (queue, insn_uid);

  if (dump_file)
    fprintf (dump_file, "Building chain #%d...\n", chain_id);

  while (!bitmap_empty_p (queue))
    {
      insn_uid = bitmap_first_set_bit (queue);
      bitmap_clear_bit (queue, insn_uid);
      bitmap_clear_bit (candidates, insn_uid);
      add_insn (candidates, insn_uid);
    }

  if (dump_file)
    {
      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
      fprintf (dump_file, "  insns: ");
      dump_bitmap (dump_file, insns);
      if (!bitmap_empty_p (defs_conv))
	{
	  bitmap_iterator bi;
	  unsigned id;
	  const char *comma = "";
	  fprintf (dump_file, "  defs to convert: ");
	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
	    {
	      fprintf (dump_file, "%sr%d", comma, id);
	      comma = ", ";
	    }
	  fprintf (dump_file, "\n");
	}
    }

  BITMAP_FREE (queue);
}

/* Return the cost of building a vector constant
   instead of using a scalar one.  */

int
general_scalar_chain::vector_const_cost (rtx exp)
{
  gcc_assert (CONST_INT_P (exp));

  if (standard_sse_constant_p (exp, vmode))
    return ix86_cost->sse_op;
  /* We have separate costs for SImode and DImode; use SImode costs
     for smaller modes.  */
  return ix86_cost->sse_load[smode == DImode ? 1 : 0];
}

/* Compute a gain for chain conversion.  */

int
general_scalar_chain::compute_convert_gain ()
{
  bitmap_iterator bi;
  unsigned insn_uid;
  int gain = 0;
  int cost = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

  /* SSE costs distinguish between SImode and DImode loads/stores; for
     the integer costs we factor in the number of GPRs involved.  When
     supporting smaller modes than SImode the int load/store costs need
     to be adjusted as well.  */
  unsigned sse_cost_idx = smode == DImode ? 1 : 0;
  unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
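  /* For example, a DImode chain on !TARGET_64BIT has m == 2: the
     reg-reg move case below is then costed as 2 * m cost units for
     the two GPR moves against a single xmm_move.  */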

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
      rtx def_set = single_set (insn);
      rtx src = SET_SRC (def_set);
      rtx dst = SET_DEST (def_set);
      int igain = 0;

      if (REG_P (src) && REG_P (dst))
	igain += 2 * m - ix86_cost->xmm_move;
      else if (REG_P (src) && MEM_P (dst))
	igain
	  += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
      else if (MEM_P (src) && REG_P (dst))
	igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
      else if (GET_CODE (src) == ASHIFT
	       || GET_CODE (src) == ASHIFTRT
	       || GET_CODE (src) == LSHIFTRT)
	{
	  if (m == 2)
	    {
	      if (INTVAL (XEXP (src, 1)) >= 32)
		igain += ix86_cost->add;
	      else
		igain += ix86_cost->shift_const;
	    }

	  igain += ix86_cost->shift_const - ix86_cost->sse_op;

	  if (CONST_INT_P (XEXP (src, 0)))
	    igain -= vector_const_cost (XEXP (src, 0));
	}
      else if (GET_CODE (src) == PLUS
	       || GET_CODE (src) == MINUS
	       || GET_CODE (src) == IOR
	       || GET_CODE (src) == XOR
	       || GET_CODE (src) == AND)
	{
	  igain += m * ix86_cost->add - ix86_cost->sse_op;
	  /* Additional gain for andnot for targets without BMI.  */
	  if (GET_CODE (XEXP (src, 0)) == NOT
	      && !TARGET_BMI)
	    igain += m * ix86_cost->add;

	  if (CONST_INT_P (XEXP (src, 0)))
	    igain -= vector_const_cost (XEXP (src, 0));
	  if (CONST_INT_P (XEXP (src, 1)))
	    igain -= vector_const_cost (XEXP (src, 1));
	}
      else if (GET_CODE (src) == NEG
	       || GET_CODE (src) == NOT)
	igain += m * ix86_cost->add - ix86_cost->sse_op - COSTS_N_INSNS (1);
      else if (GET_CODE (src) == SMAX
	       || GET_CODE (src) == SMIN
	       || GET_CODE (src) == UMAX
	       || GET_CODE (src) == UMIN)
	{
	  /* We do not have any conditional move cost, estimate it as a
	     reg-reg move.  Comparisons are costed as adds.  */
	  igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
	  /* Integer SSE ops are all costed the same.  */
	  igain -= ix86_cost->sse_op;
	}
      else if (GET_CODE (src) == COMPARE)
	{
	  /* Assume comparison cost is the same.  */
	}
      else if (CONST_INT_P (src))
	{
	  if (REG_P (dst))
	    /* An immediate is valid for SImode always, and for DImode
	       only when TARGET_64BIT.  */
	    igain += m * COSTS_N_INSNS (1);
	  else if (MEM_P (dst))
	    igain += (m * ix86_cost->int_store[2]
		     - ix86_cost->sse_store[sse_cost_idx]);
	  igain -= vector_const_cost (src);
	}
      else
	gcc_unreachable ();

      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, "  Instruction gain %d for ", igain);
	  dump_insn_slim (dump_file, insn);
	}
      gain += igain;
    }

  if (dump_file)
    fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);

  /* Cost the integer to sse and sse to integer moves.  */
  cost += n_sse_to_integer * ix86_cost->sse_to_integer;
  /* ???  integer_to_sse but we only have that in the RA cost table.
     Assume sse_to_integer/integer_to_sse are the same which they
     are at the moment.  */
  cost += n_integer_to_sse * ix86_cost->sse_to_integer;

  if (dump_file)
    fprintf (dump_file, "  Registers conversion cost: %d\n", cost);

  gain -= cost;

  if (dump_file)
    fprintf (dump_file, "  Total gain: %d\n", gain);

  return gain;
}

/* Insert generated conversion instruction sequence INSNS
   after instruction AFTER.  A new BB may be required when the
   instruction has an EH region attached.  */

void
scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
{
  if (!control_flow_insn_p (after))
    {
      emit_insn_after (insns, after);
      return;
    }

  basic_block bb = BLOCK_FOR_INSN (after);
  edge e = find_fallthru_edge (bb->succs);
  gcc_assert (e);

  basic_block new_bb = split_edge (e);
  emit_insn_after (insns, BB_HEAD (new_bb));
}

} // anon namespace

/* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
   zeroing the upper parts.  */
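/* For instance, with VMODE == V2DImode this returns
     (vec_concat:V2DI gpr (const_int 0))
   and for wider vectors a vec_merge of (vec_duplicate gpr) with the
   zero vector that keeps only element zero.  */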

static rtx
gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
{
  switch (GET_MODE_NUNITS (vmode))
    {
    case 1:
      /* We are not using this case currently.  */
      gcc_unreachable ();
    case 2:
      return gen_rtx_VEC_CONCAT (vmode, gpr,
				 CONST0_RTX (GET_MODE_INNER (vmode)));
    default:
      return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
				CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
    }
}

/* Make a vector copy of scalar register REG at INSN, storing it in
   the vector register that replaces REG inside the chain.  */

void
general_scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
{
  rtx vreg = *defs_map.get (reg);

  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    {
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      if (smode == DImode && !TARGET_64BIT)
	{
	  emit_move_insn (adjust_address (tmp, SImode, 0),
			  gen_rtx_SUBREG (SImode, reg, 0));
	  emit_move_insn (adjust_address (tmp, SImode, 4),
			  gen_rtx_SUBREG (SImode, reg, 4));
	}
      else
	emit_move_insn (copy_rtx (tmp), reg);
      emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			      gen_gpr_to_xmm_move_src (vmode, tmp)));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (SImode, reg, 4),
					GEN_INT (2)));
	}
      else
	{
	  rtx tmp = gen_reg_rtx (DImode);
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 4)));
	  emit_insn (gen_vec_interleave_lowv4si
		     (gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, tmp, 0)));
	}
    }
  else
    emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			    gen_gpr_to_xmm_move_src (vmode, reg)));
  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a vector register r%d for insn %d\n",
	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
}

/* Copy the definition SRC of INSN inside the chain to DST for
   scalar uses outside of the chain.  */

void
general_scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
{
  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
    {
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      emit_move_insn (tmp, src);
      if (!TARGET_64BIT && smode == DImode)
	{
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  adjust_address (tmp, SImode, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  adjust_address (tmp, SImode, 4));
	}
      else
	emit_move_insn (dst, copy_rtx (tmp));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode,
				      gen_rtvec (1, const0_rtx));
	  emit_insn
	      (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 0),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));

	  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
	  emit_insn
	      (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 4),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));
	}
      else
	{
	  rtx vcopy = gen_reg_rtx (V2DImode);
	  emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	  emit_move_insn (vcopy,
			  gen_rtx_LSHIFTRT (V2DImode,
					    vcopy, GEN_INT (32)));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	}
    }
  else
    emit_move_insn (dst, src);

  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a scalar register r%d for insn %d\n",
	     REGNO (src), REGNO (dst), INSN_UID (insn));
}

/* Convert operand OP in INSN.  We should handle
   memory operands and uninitialized registers.
   All other register uses are converted during
   register conversion.  */

void
general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
{
  *op = copy_rtx_if_shared (*op);

  if (GET_CODE (*op) == NOT)
    {
      convert_op (&XEXP (*op, 0), insn);
      PUT_MODE (*op, vmode);
    }
  else if (MEM_P (*op))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (*op));

      /* Handle movabs.  */
      if (!memory_operand (*op, GET_MODE (*op)))
	{
	  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));

	  emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
	  *op = tmp2;
	}

      emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
				     gen_gpr_to_xmm_move_src (vmode, *op)),
			insn);
      *op = gen_rtx_SUBREG (vmode, tmp, 0);

      if (dump_file)
	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
		 INSN_UID (insn), REGNO (tmp));
    }
  else if (REG_P (*op))
    {
      *op = gen_rtx_SUBREG (vmode, *op, 0);
    }
  else if (CONST_INT_P (*op))
    {
      rtx vec_cst;
      rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);

      /* Prefer all ones vector in case of -1.  */
      if (constm1_operand (*op, GET_MODE (*op)))
	vec_cst = CONSTM1_RTX (vmode);
      else
	{
	  unsigned n = GET_MODE_NUNITS (vmode);
	  rtx *v = XALLOCAVEC (rtx, n);
	  v[0] = *op;
	  for (unsigned i = 1; i < n; ++i)
	    v[i] = const0_rtx;
	  vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
	}

      if (!standard_sse_constant_p (vec_cst, vmode))
	{
	  start_sequence ();
	  vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
	  rtx_insn *seq = get_insns ();
	  end_sequence ();
	  emit_insn_before (seq, insn);
	}

      emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
      *op = tmp;
    }
  else
    {
      gcc_assert (SUBREG_P (*op));
      gcc_assert (GET_MODE (*op) == vmode);
    }
}

/* Convert INSN to vector mode.  */

void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
  /* Generate copies for out-of-chain uses of defs and adjust debug uses.  */
  for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
      {
	df_link *use;
	for (use = DF_REF_CHAIN (ref); use; use = use->next)
	  if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
	      && (DF_REF_REG_MEM_P (use->ref)
		  || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
	    break;
	if (use)
	  convert_reg (insn, DF_REF_REG (ref),
		       *defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]));
	else if (MAY_HAVE_DEBUG_BIND_INSNS)
	  {
	    /* If we generated a scalar copy we can leave debug-insns
	       as-is, if not, we have to adjust them.  */
	    auto_vec<rtx_insn *, 5> to_reset_debug_insns;
	    for (use = DF_REF_CHAIN (ref); use; use = use->next)
	      if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
		{
		  rtx_insn *debug_insn = DF_REF_INSN (use->ref);
		  /* If there's a reaching definition outside of the
		     chain we have to reset.  */
		  df_link *def;
		  for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
		    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
		      break;
		  if (def)
		    to_reset_debug_insns.safe_push (debug_insn);
		  else
		    {
		      *DF_REF_REAL_LOC (use->ref)
			= *defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]);
		      df_insn_rescan (debug_insn);
		    }
		}
	    /* Have to do the reset outside of the DF_CHAIN walk to not
	       disrupt it.  */
	    while (!to_reset_debug_insns.is_empty ())
	      {
		rtx_insn *debug_insn = to_reset_debug_insns.pop ();
		INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
		df_insn_rescan_debug_internal (debug_insn);
	      }
	  }
      }

  /* Replace uses in this insn with the defs we use in the chain.  */
  for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
	{
	  /* Also update a corresponding REG_DEAD note.  */
	  rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
	  if (note)
	    XEXP (note, 0) = *vreg;
	  *DF_REF_REAL_LOC (ref) = *vreg;
	}

  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx subreg;

  if (MEM_P (dst) && !REG_P (src))
    {
      /* There are no scalar integer instructions and therefore
	 temporary register usage is required.  */
      rtx tmp = gen_reg_rtx (smode);
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
      dst = gen_rtx_SUBREG (vmode, tmp, 0);
    }
  else if (REG_P (dst))
    {
      /* Replace the definition with a SUBREG to the definition we
         use inside the chain.  */
      rtx *vdef = defs_map.get (dst);
      if (vdef)
	dst = *vdef;
      dst = gen_rtx_SUBREG (vmode, dst, 0);
      /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
         is a non-REG_P.  So kill those off.  */
      rtx note = find_reg_equal_equiv_note (insn);
      if (note)
	remove_note (insn, note);
    }

  switch (GET_CODE (src))
    {
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
      break;

    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      convert_op (&XEXP (src, 0), insn);
      convert_op (&XEXP (src, 1), insn);
      PUT_MODE (src, vmode);
      break;

    case NEG:
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
      src = gen_rtx_MINUS (vmode, subreg, src);
      break;

    case NOT:
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
      src = gen_rtx_XOR (vmode, src, subreg);
      break;

    case MEM:
      if (!REG_P (dst))
	convert_op (&src, insn);
      break;

    case REG:
      if (!MEM_P (dst))
	convert_op (&src, insn);
      break;

    case SUBREG:
      gcc_assert (GET_MODE (src) == vmode);
      break;

    case COMPARE:
      src = SUBREG_REG (XEXP (XEXP (src, 0), 0));

      gcc_assert (REG_P (src) && GET_MODE (src) == DImode);
      subreg = gen_rtx_SUBREG (V2DImode, src, 0);
      emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
						    copy_rtx_if_shared (subreg),
						    copy_rtx_if_shared (subreg)),
			insn);
      dst = gen_rtx_REG (CCmode, FLAGS_REG);
      src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg),
					       copy_rtx_if_shared (subreg)),
			    UNSPEC_PTEST);
      break;

    case CONST_INT:
      convert_op (&src, insn);
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  int patt = recog_memoized (insn);
  if (patt == -1)
    fatal_insn_not_found (insn);
  df_insn_rescan (insn);
}

/* Fix uses of converted REG in debug insns.  */

void
timode_scalar_chain::fix_debug_reg_uses (rtx reg)
{
  if (!flag_var_tracking)
    return;

  df_ref ref, next;
  for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
    {
      rtx_insn *insn = DF_REF_INSN (ref);
      /* Make sure the next ref is for a different instruction,
         so that we're not affected by the rescan.  */
      next = DF_REF_NEXT_REG (ref);
      while (next && DF_REF_INSN (next) == insn)
	next = DF_REF_NEXT_REG (next);

      if (DEBUG_INSN_P (insn))
	{
	  /* It may be a debug insn with a TImode variable in
	     a register.  */
	  bool changed = false;
	  for (; ref != next; ref = DF_REF_NEXT_REG (ref))
	    {
	      rtx *loc = DF_REF_LOC (ref);
	      if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
		{
		  *loc = gen_rtx_SUBREG (TImode, *loc, 0);
		  changed = true;
		}
	    }
	  if (changed)
	    df_insn_rescan (insn);
	}
    }
}

/* Convert INSN from TImode to V1TImode.  */

void
timode_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  switch (GET_CODE (dst))
    {
    case REG:
      {
	rtx tmp = find_reg_equal_equiv_note (insn);
	if (tmp)
	  PUT_MODE (XEXP (tmp, 0), V1TImode);
	PUT_MODE (dst, V1TImode);
	fix_debug_reg_uses (dst);
      }
      break;
    case MEM:
      PUT_MODE (dst, V1TImode);
      break;

    default:
      gcc_unreachable ();
    }

  switch (GET_CODE (src))
    {
    case REG:
      PUT_MODE (src, V1TImode);
      /* Call fix_debug_reg_uses only if SRC is never defined.  */
      if (!DF_REG_DEF_CHAIN (REGNO (src)))
	fix_debug_reg_uses (src);
      break;

    case MEM:
      PUT_MODE (src, V1TImode);
      break;

    case CONST_WIDE_INT:
      if (NONDEBUG_INSN_P (insn))
	{
	  /* Since there are no instructions to store a 128-bit constant,
	     temporary register usage is required.  */
	  rtx tmp = gen_reg_rtx (V1TImode);
	  start_sequence ();
	  src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
	  src = validize_mem (force_const_mem (V1TImode, src));
	  rtx_insn *seq = get_insns ();
	  end_sequence ();
	  if (seq)
	    emit_insn_before (seq, insn);
	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
	  dst = tmp;
	}
      break;

    case CONST_INT:
      switch (standard_sse_constant_p (src, TImode))
	{
	case 1:
	  src = CONST0_RTX (GET_MODE (dst));
	  break;
	case 2:
	  src = CONSTM1_RTX (GET_MODE (dst));
	  break;
	default:
	  gcc_unreachable ();
	}
      if (NONDEBUG_INSN_P (insn))
	{
	  rtx tmp = gen_reg_rtx (V1TImode);
	  /* Since there are no instructions to store a standard SSE
	     constant, temporary register usage is required.  */
	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
	  dst = tmp;
	}
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  recog_memoized (insn);
  df_insn_rescan (insn);
}

/* Generate copies from defs used by the chain but not defined therein.
   Also populates defs_map which is used later by convert_insn.  */

void
general_scalar_chain::convert_registers ()
{
  bitmap_iterator bi;
  unsigned id;
  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
    {
      rtx chain_reg = gen_reg_rtx (smode);
      defs_map.put (regno_reg_rtx[id], chain_reg);
    }
  EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
    for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
      if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
	make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
}

/* Convert whole chain creating required register
   conversions and copies.  */

int
scalar_chain::convert ()
{
  bitmap_iterator bi;
  unsigned id;
  int converted_insns = 0;

  if (!dbg_cnt (stv_conversion))
    return 0;

  if (dump_file)
    fprintf (dump_file, "Converting chain #%d...\n", chain_id);

  convert_registers ();

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
    {
      convert_insn (DF_INSN_UID_GET (id)->insn);
      converted_insns++;
    }

  return converted_insns;
}

/* Return true if INSN uses or defines a hard register.
   Hard register uses in a memory address are ignored.
   Clobbers and flags definitions are ignored.  */

static bool
has_non_address_hard_reg (rtx_insn *insn)
{
  df_ref ref;
  FOR_EACH_INSN_DEF (ref, insn)
    if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
	&& DF_REF_REGNO (ref) != FLAGS_REG)
      return true;

  FOR_EACH_INSN_USE (ref, insn)
    if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
      return true;

  return false;
}

/* Check if comparison INSN may be transformed into a vector comparison.
   Currently we transform only zero checks that look like:

   (set (reg:CCZ 17 flags)
        (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
                             (subreg:SI (reg:DI x) 0))
		     (const_int 0 [0])))  */

static bool
convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
{
  /* ??? Currently convertible for double-word DImode chain only.  */
  if (TARGET_64BIT || mode != DImode)
    return false;

  if (!TARGET_SSE4_1)
    return false;

  rtx def_set = single_set (insn);

  gcc_assert (def_set);

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  gcc_assert (GET_CODE (src) == COMPARE);

  if (GET_CODE (dst) != REG
      || REGNO (dst) != FLAGS_REG
      || GET_MODE (dst) != CCZmode)
    return false;

  rtx op1 = XEXP (src, 0);
  rtx op2 = XEXP (src, 1);

  if (op2 != CONST0_RTX (GET_MODE (op2)))
    return false;

  if (GET_CODE (op1) != IOR)
    return false;

  op2 = XEXP (op1, 1);
  op1 = XEXP (op1, 0);

  if (!SUBREG_P (op1)
      || !SUBREG_P (op2)
      || GET_MODE (op1) != SImode
      || GET_MODE (op2) != SImode
      || ((SUBREG_BYTE (op1) != 0
	   || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
	  && (SUBREG_BYTE (op2) != 0
	      || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
    return false;

  op1 = SUBREG_REG (op1);
  op2 = SUBREG_REG (op2);

  if (op1 != op2
      || !REG_P (op1)
      || GET_MODE (op1) != DImode)
    return false;

  return true;
}

/* The general version of scalar_to_vector_candidate_p.  */

static bool
general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
{
  rtx def_set = single_set (insn);

  if (!def_set)
    return false;

  if (has_non_address_hard_reg (insn))
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  if (GET_CODE (src) == COMPARE)
    return convertible_comparison_p (insn, mode);

  /* We are interested in "mode" only.  */
  if ((GET_MODE (src) != mode
       && !CONST_INT_P (src))
      || GET_MODE (dst) != mode)
    return false;

  if (!REG_P (dst) && !MEM_P (dst))
    return false;

  switch (GET_CODE (src))
    {
    case ASHIFTRT:
      if (!TARGET_AVX512VL)
	return false;
      /* FALLTHRU */

    case ASHIFT:
    case LSHIFTRT:
      if (!CONST_INT_P (XEXP (src, 1))
	  || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
	return false;
      break;

    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      if ((mode == DImode && !TARGET_AVX512VL)
	  || (mode == SImode && !TARGET_SSE4_1))
	return false;
      /* Fallthru.  */

    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
      if (!REG_P (XEXP (src, 1))
	  && !MEM_P (XEXP (src, 1))
	  && !CONST_INT_P (XEXP (src, 1)))
	return false;

      if (GET_MODE (XEXP (src, 1)) != mode
	  && !CONST_INT_P (XEXP (src, 1)))
	return false;
      break;

    case NEG:
    case NOT:
      break;

    case REG:
      return true;

    case MEM:
    case CONST_INT:
      return REG_P (dst);

    default:
      return false;
    }

  if (!REG_P (XEXP (src, 0))
      && !MEM_P (XEXP (src, 0))
      && !CONST_INT_P (XEXP (src, 0))
      /* Check for andnot case.  */
      && (GET_CODE (src) != AND
	  || GET_CODE (XEXP (src, 0)) != NOT
	  || !REG_P (XEXP (XEXP (src, 0), 0))))
    return false;

  if (GET_MODE (XEXP (src, 0)) != mode
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  return true;
}

/* The TImode version of scalar_to_vector_candidate_p.  */

static bool
timode_scalar_to_vector_candidate_p (rtx_insn *insn)
{
  rtx def_set = single_set (insn);

  if (!def_set)
    return false;

  if (has_non_address_hard_reg (insn))
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  /* Only TImode load and store are allowed.  */
  if (GET_MODE (dst) != TImode)
    return false;

  if (MEM_P (dst))
    {
      /* Check for a store.  The memory must be aligned, or the unaligned
	 store must be optimal.  Only stores from a register, a standard
	 SSE constant or a CONST_WIDE_INT generated from a piecewise
	 store are supported.

	 ??? Verify performance impact before enabling CONST_INT for
	 __int128 store.  */
      if (misaligned_operand (dst, TImode)
	  && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
	return false;

      switch (GET_CODE (src))
	{
	default:
	  return false;

	case REG:
	case CONST_WIDE_INT:
	  return true;

	case CONST_INT:
	  return standard_sse_constant_p (src, TImode);
	}
    }
  else if (MEM_P (src))
    {
      /* Check for a load.  The memory must be aligned, or the unaligned
	 load must be optimal.  */
      return (REG_P (dst)
	      && (!misaligned_operand (src, TImode)
		  || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
    }

  return false;
}

/* For a register REGNO, scan instructions for its defs and uses.
   Put REGNO in REGS if a def or use isn't in CANDIDATES.  */

static void
timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
				   unsigned int regno)
{
  for (df_ref def = DF_REG_DEF_CHAIN (regno);
       def;
       def = DF_REF_NEXT_REG (def))
    {
      if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "r%d has non convertible def in insn %d\n",
		     regno, DF_REF_INSN_UID (def));

	  bitmap_set_bit (regs, regno);
	  break;
	}
    }

  for (df_ref ref = DF_REG_USE_CHAIN (regno);
       ref;
       ref = DF_REF_NEXT_REG (ref))
    {
      /* Debug instructions are skipped.  */
      if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
	  && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "r%d has non convertible use in insn %d\n",
		     regno, DF_REF_INSN_UID (ref));

	  bitmap_set_bit (regs, regno);
	  break;
	}
    }
}

/* The TImode version of remove_non_convertible_regs.  */

static void
timode_remove_non_convertible_regs (bitmap candidates)
{
  bitmap_iterator bi;
  unsigned id;
  bitmap regs = BITMAP_ALLOC (NULL);

  EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
    {
      rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
      rtx dest = SET_DEST (def_set);
      rtx src = SET_SRC (def_set);

      if ((!REG_P (dest)
	   || bitmap_bit_p (regs, REGNO (dest))
	   || HARD_REGISTER_P (dest))
	  && (!REG_P (src)
	      || bitmap_bit_p (regs, REGNO (src))
	      || HARD_REGISTER_P (src)))
	continue;

      if (REG_P (dest))
	timode_check_non_convertible_regs (candidates, regs,
					   REGNO (dest));

      if (REG_P (src))
	timode_check_non_convertible_regs (candidates, regs,
					   REGNO (src));
    }

  EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
    {
      for (df_ref def = DF_REG_DEF_CHAIN (id);
	   def;
	   def = DF_REF_NEXT_REG (def))
	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
	  {
	    if (dump_file)
	      fprintf (dump_file, "Removing insn %d from candidates list\n",
		       DF_REF_INSN_UID (def));

	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
	  }

      for (df_ref ref = DF_REG_USE_CHAIN (id);
	   ref;
	   ref = DF_REF_NEXT_REG (ref))
	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
	  {
	    if (dump_file)
	      fprintf (dump_file, "Removing insn %d from candidates list\n",
		       DF_REF_INSN_UID (ref));

	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
	  }
    }

  BITMAP_FREE (regs);
}

/* Main STV pass function.  Find and convert scalar
   instructions into vector mode when profitable.  */

static unsigned int
convert_scalars_to_vector (bool timode_p)
{
  basic_block bb;
  int converted_insns = 0;

  bitmap_obstack_initialize (NULL);
  const machine_mode cand_mode[3] = { SImode, DImode, TImode };
  const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
  bitmap_head candidates[3];  /* { SImode, DImode, TImode } */
  for (unsigned i = 0; i < 3; ++i)
    bitmap_initialize (&candidates[i], &bitmap_default_obstack);

  calculate_dominance_info (CDI_DOMINATORS);
  df_set_flags (DF_DEFER_INSN_RESCAN);
  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
  df_analyze ();

  /* Find all instructions we want to convert into vector mode.  */
  if (dump_file)
    fprintf (dump_file, "Searching for mode conversion candidates...\n");

  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *insn;
      FOR_BB_INSNS (bb, insn)
	if (timode_p
	    && timode_scalar_to_vector_candidate_p (insn))
	  {
	    if (dump_file)
	      fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
		       INSN_UID (insn));

	    bitmap_set_bit (&candidates[2], INSN_UID (insn));
	  }
	else if (!timode_p)
	  {
	    /* Check {SI,DI}mode.  */
	    for (unsigned i = 0; i <= 1; ++i)
	      if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
		{
		  if (dump_file)
		    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
			     INSN_UID (insn), i == 0 ? "SImode" : "DImode");

		  bitmap_set_bit (&candidates[i], INSN_UID (insn));
		  break;
		}
	  }
    }

  if (timode_p)
    timode_remove_non_convertible_regs (&candidates[2]);

  for (unsigned i = 0; i <= 2; ++i)
    if (!bitmap_empty_p (&candidates[i]))
      break;
    else if (i == 2 && dump_file)
      fprintf (dump_file, "There are no candidates for optimization.\n");

  for (unsigned i = 0; i <= 2; ++i)
    while (!bitmap_empty_p (&candidates[i]))
      {
	unsigned uid = bitmap_first_set_bit (&candidates[i]);
	scalar_chain *chain;

	if (cand_mode[i] == TImode)
	  chain = new timode_scalar_chain;
	else
	  chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);

	/* Find the instruction chain we want to convert to vector mode.
	   Check all uses and definitions to estimate all required
	   conversions.  */
	chain->build (&candidates[i], uid);

	if (chain->compute_convert_gain () > 0)
	  converted_insns += chain->convert ();
	else
	  if (dump_file)
	    fprintf (dump_file, "Chain #%d conversion is not profitable\n",
		     chain->chain_id);

	delete chain;
      }

  if (dump_file)
    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);

  for (unsigned i = 0; i <= 2; ++i)
    bitmap_release (&candidates[i]);
  bitmap_obstack_release (NULL);
  df_process_deferred_rescans ();

  /* Conversion means we may have 128-bit register spills/fills
     which require an aligned stack.  */
  if (converted_insns)
    {
      if (crtl->stack_alignment_needed < 128)
	crtl->stack_alignment_needed = 128;
      if (crtl->stack_alignment_estimated < 128)
	crtl->stack_alignment_estimated = 128;

      crtl->stack_realign_needed
	= INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
      crtl->stack_realign_tried = crtl->stack_realign_needed;

      crtl->stack_realign_processed = true;

      if (!crtl->drap_reg)
	{
	  rtx drap_rtx = targetm.calls.get_drap_rtx ();

	  /* stack_realign_drap and drap_rtx must match.  */
	  gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));

	  /* Do nothing if NULL is returned,
	     which means DRAP is not needed.  */
	  if (drap_rtx != NULL)
	    {
	      crtl->args.internal_arg_pointer = drap_rtx;

	      /* Call fixup_tail_calls to clean up
		 REG_EQUIV note if DRAP is needed.  */
	      fixup_tail_calls ();
	    }
	}

      /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  */
      if (TARGET_64BIT)
	for (tree parm = DECL_ARGUMENTS (current_function_decl);
	     parm; parm = DECL_CHAIN (parm))
	  {
	    if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
	      continue;
	    if (DECL_RTL_SET_P (parm)
		&& GET_MODE (DECL_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_RTL (parm);
		if (REG_P (r))
		  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
	      }
	    if (DECL_INCOMING_RTL (parm)
		&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_INCOMING_RTL (parm);
		if (REG_P (r))
		  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
	      }
	  }
    }

  return 0;
}

/* Modify the vzeroupper pattern in INSN so that it describes the effect
   that the instruction has on the SSE registers.  LIVE_REGS are the set
   of registers that are live across the instruction.

   For a live register R we use:

     (set (reg:V2DI R) (reg:V2DI R))

   which preserves the low 128 bits but clobbers the upper bits.  */
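/* For instance, with only xmm0 live the annotated pattern becomes
   roughly:

     (parallel [(unspec_volatile ... UNSPECV_VZEROUPPER)
		(set (reg:V2DI xmm0) (reg:V2DI xmm0))])  */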
1768 
1769 static void
ix86_add_reg_usage_to_vzeroupper(rtx_insn * insn,bitmap live_regs)1770 ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs)
1771 {
1772   rtx pattern = PATTERN (insn);
1773   unsigned int nregs = TARGET_64BIT ? 16 : 8;
1774   unsigned int npats = nregs;
1775   for (unsigned int i = 0; i < nregs; ++i)
1776     {
1777       unsigned int regno = GET_SSE_REGNO (i);
1778       if (!bitmap_bit_p (live_regs, regno))
1779 	npats--;
1780     }
1781   if (npats == 0)
1782     return;
1783   rtvec vec = rtvec_alloc (npats + 1);
1784   RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0);
1785   for (unsigned int i = 0, j = 0; i < nregs; ++i)
1786     {
1787       unsigned int regno = GET_SSE_REGNO (i);
1788       if (!bitmap_bit_p (live_regs, regno))
1789 	continue;
1790       rtx reg = gen_rtx_REG (V2DImode, regno);
1791       ++j;
1792       RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg);
1793     }
1794   XVEC (pattern, 0) = vec;
1795   INSN_CODE (insn) = -1;
1796   df_insn_rescan (insn);
1797 }

/* Walk the vzeroupper instructions in the function and annotate them
   with the effect that they have on the SSE registers.  */

static void
ix86_add_reg_usage_to_vzerouppers (void)
{
  basic_block bb;
  rtx_insn *insn;
  auto_bitmap live_regs;

  df_analyze ();
  FOR_EACH_BB_FN (bb, cfun)
    {
      bitmap_copy (live_regs, df_get_live_out (bb));
      df_simulate_initialize_backwards (bb, live_regs);
      FOR_BB_INSNS_REVERSE (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;
	  if (vzeroupper_pattern (PATTERN (insn), VOIDmode))
	    ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
	  df_simulate_one_insn_backwards (bb, insn, live_regs);
	}
    }
}

static unsigned int
rest_of_handle_insert_vzeroupper (void)
{
  int i;

  /* vzeroupper instructions are inserted immediately after reload to
     account for possible spills from 256bit or 512bit registers.  The pass
     reuses mode switching infrastructure by re-running the mode insertion
     pass, so disable entities that have already been processed.  */
  for (i = 0; i < MAX_386_ENTITIES; i++)
    ix86_optimize_mode_switching[i] = 0;

  ix86_optimize_mode_switching[AVX_U128] = 1;

  /* Call optimize_mode_switching.  */
  g->get_passes ()->execute_pass_mode_switching ();
  ix86_add_reg_usage_to_vzerouppers ();
  return 0;
}

namespace {

const pass_data pass_data_insert_vzeroupper =
{
  RTL_PASS, /* type */
  "vzeroupper", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

class pass_insert_vzeroupper : public rtl_opt_pass
{
public:
  pass_insert_vzeroupper (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_insert_vzeroupper, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return TARGET_AVX
	     && TARGET_VZEROUPPER && flag_expensive_optimizations
	     && !optimize_size;
    }

  virtual unsigned int execute (function *)
    {
      return rest_of_handle_insert_vzeroupper ();
    }

}; // class pass_insert_vzeroupper

const pass_data pass_data_stv =
{
  RTL_PASS, /* type */
  "stv", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

class pass_stv : public rtl_opt_pass
{
public:
  pass_stv (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_stv, ctxt),
      timode_p (false)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return ((!timode_p || TARGET_64BIT)
	      && TARGET_STV && TARGET_SSE2 && optimize > 1);
    }

  virtual unsigned int execute (function *)
    {
      return convert_scalars_to_vector (timode_p);
    }

  opt_pass *clone ()
    {
      return new pass_stv (m_ctxt);
    }

  void set_pass_param (unsigned int n, bool param)
    {
      gcc_assert (n == 0);
      timode_p = param;
    }

private:
  bool timode_p;
}; // class pass_stv

} // anon namespace

rtl_opt_pass *
make_pass_insert_vzeroupper (gcc::context *ctxt)
{
  return new pass_insert_vzeroupper (ctxt);
}

rtl_opt_pass *
make_pass_stv (gcc::context *ctxt)
{
  return new pass_stv (ctxt);
}

/* Inserting ENDBRANCH instructions.  */

static unsigned int
rest_of_insert_endbranch (void)
{
  timevar_push (TV_MACH_DEP);

  rtx cet_eb;
  rtx_insn *insn;
  basic_block bb;

  /* Currently we emit ENDBR for every CF-tracked function, i.e. whenever
     the 'nocf_check' attribute is absent from the function's attributes.
     Later an optimization will be introduced that analyzes whether the
     address of a static function is taken; a static function whose address
     is never taken will get a nocf_check attribute, reducing the number of
     ENDBR instructions.  */
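
  /* For example, with -fcf-protection=branch an instrumented function
     entry looks roughly like:

	foo:
	  endbr64
	  ...

     (endbr32 on 32-bit targets).  An indirect call or jump that lands on
     anything other than an ENDBR faults once CET enforcement is enabled
     by the runtime.  */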

  if (!lookup_attribute ("nocf_check",
			 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
      && (!flag_manual_endbr
	  || lookup_attribute ("cf_check",
			       DECL_ATTRIBUTES (cfun->decl)))
      && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
	  || ix86_cmodel == CM_LARGE
	  || ix86_cmodel == CM_LARGE_PIC
	  || flag_force_indirect_call
	  || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
	      && DECL_DLLIMPORT_P (cfun->decl))))
    {
      /* Queue ENDBR insertion to x86_function_profiler.  */
      if (crtl->profile && flag_fentry)
	cfun->machine->endbr_queued_at_entrance = true;
      else
	{
	  cet_eb = gen_nop_endbr ();

	  bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	  insn = BB_HEAD (bb);
	  emit_insn_before (cet_eb, insn);
	}
    }

  bb = 0;
  FOR_EACH_BB_FN (bb, cfun)
    {
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
	   insn = NEXT_INSN (insn))
	{
	  if (CALL_P (insn))
	    {
	      bool need_endbr;
	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
	      if (!need_endbr && !SIBLING_CALL_P (insn))
		{
		  rtx call = get_call_rtx_from (insn);
		  rtx fnaddr = XEXP (call, 0);
		  tree fndecl = NULL_TREE;

		  /* Also generate ENDBRANCH for a non-tail call which
		     may return via indirect branch.  */
		  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
		  if (fndecl == NULL_TREE)
		    fndecl = MEM_EXPR (fnaddr);
		  if (fndecl
		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
		    fndecl = NULL_TREE;
		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
		    {
		      tree fntype = TREE_TYPE (fndecl);
		      if (lookup_attribute ("indirect_return",
					    TYPE_ATTRIBUTES (fntype)))
			need_endbr = true;
		    }
		}
	      if (!need_endbr)
		continue;
	      /* Generate ENDBRANCH after a CALL that can return more than
		 once (setjmp-like functions).  */

	      cet_eb = gen_nop_endbr ();
	      emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn));
	      continue;
	    }

	  if (JUMP_P (insn) && flag_cet_switch)
	    {
	      rtx target = JUMP_LABEL (insn);
	      if (target == NULL_RTX || ANY_RETURN_P (target))
		continue;

	      /* Check whether the jump goes through a switch table.  */
	      rtx_insn *label = as_a<rtx_insn *> (target);
	      rtx_insn *table = next_insn (label);
	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
		continue;

	      /* For the indirect jump, find all the places it jumps to and
		 insert ENDBRANCH there.  This is done under a special flag
		 that controls ENDBRANCH generation for switch stmts.  */
	      edge_iterator ei;
	      edge e;
	      basic_block dest_blk;

	      FOR_EACH_EDGE (e, ei, bb->succs)
		{
		  rtx_insn *insn;

		  dest_blk = e->dest;
		  insn = BB_HEAD (dest_blk);
		  gcc_assert (LABEL_P (insn));
		  cet_eb = gen_nop_endbr ();
		  emit_insn_after (cet_eb, insn);
		}
	      continue;
	    }

	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
	    {
	      cet_eb = gen_nop_endbr ();
	      emit_insn_after (cet_eb, insn);
	      continue;
	    }
	}
    }

  timevar_pop (TV_MACH_DEP);
  return 0;
}

namespace {

const pass_data pass_data_insert_endbranch =
{
  RTL_PASS, /* type.  */
  "cet", /* name.  */
  OPTGROUP_NONE, /* optinfo_flags.  */
  TV_MACH_DEP, /* tv_id.  */
  0, /* properties_required.  */
  0, /* properties_provided.  */
  0, /* properties_destroyed.  */
  0, /* todo_flags_start.  */
  0, /* todo_flags_finish.  */
};

class pass_insert_endbranch : public rtl_opt_pass
{
public:
  pass_insert_endbranch (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return (flag_cf_protection & CF_BRANCH);
    }

  virtual unsigned int execute (function *)
    {
      return rest_of_insert_endbranch ();
    }

}; // class pass_insert_endbranch

} // anon namespace

rtl_opt_pass *
make_pass_insert_endbranch (gcc::context *ctxt)
{
  return new pass_insert_endbranch (ctxt);
}

/* At entry of the nearest common dominator for basic blocks with
   conversions, generate a single
	vxorps %xmmN, %xmmN, %xmmN
   for all
	vcvtss2sd  op, %xmmN, %xmmX
	vcvtsd2ss  op, %xmmN, %xmmX
	vcvtsi2ss  op, %xmmN, %xmmX
	vcvtsi2sd  op, %xmmN, %xmmX

   NB: We want to generate only a single vxorps to cover the whole
   function.  The LCM algorithm isn't appropriate here since it may
   place a vxorps inside the loop.  */
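
/* As an illustration, rather than clearing the destination next to every
   conversion (or not at all, leaving a false dependency on the upper bits
   of the destination), the pass aims for output roughly like:

	foo:
	  vxorps     %xmm2, %xmm2, %xmm2	# one clearing insn
	  ...
	  vcvtsi2ss  %edi, %xmm2, %xmm0		# merges from zeroed reg
	  ...
	  vcvtsi2sd  %esi, %xmm2, %xmm1

   (a sketch; register allocation picks the actual %xmmN).  */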

static unsigned int
remove_partial_avx_dependency (void)
{
  timevar_push (TV_MACH_DEP);

  bitmap_obstack_initialize (NULL);
  bitmap convert_bbs = BITMAP_ALLOC (NULL);

  basic_block bb;
  rtx_insn *insn, *set_insn;
  rtx set;
  rtx v4sf_const0 = NULL_RTX;

  auto_vec<rtx_insn *> control_flow_insns;

  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  set = single_set (insn);
	  if (!set)
	    continue;

	  if (get_attr_avx_partial_xmm_update (insn)
	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
	    continue;

	  if (!v4sf_const0)
	    {
	      calculate_dominance_info (CDI_DOMINATORS);
	      df_set_flags (DF_DEFER_INSN_RESCAN);
	      df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
	      df_md_add_problem ();
	      df_analyze ();
	      v4sf_const0 = gen_reg_rtx (V4SFmode);
	    }

	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and
	     vec_merge with subreg.  */
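	  /* E.g. a scalar conversion such as

	       (set (reg:SF x) (float:SF (reg:SI d)))

	     is rewritten below into roughly (a sketch):

	       (set (reg:V4SF tmp)
		    (vec_merge:V4SF
		      (vec_duplicate:V4SF (float:SF (reg:SI d)))
		      (reg:V4SF v4sf_const0)
		      (const_int 1)))
	       (set (reg:SF x) (subreg:SF (reg:V4SF tmp) 0))

	     so the whole destination vector is written and no partial
	     update of an XMM register remains.  */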
	  rtx src = SET_SRC (set);
	  rtx dest = SET_DEST (set);
	  machine_mode dest_mode = GET_MODE (dest);

	  rtx zero;
	  machine_mode dest_vecmode;
	  if (dest_mode == E_SFmode)
	    {
	      dest_vecmode = V4SFmode;
	      zero = v4sf_const0;
	    }
	  else
	    {
	      dest_vecmode = V2DFmode;
	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
	    }

	  /* Change source to vector mode.  */
	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
				   GEN_INT (HOST_WIDE_INT_1U));
	  /* Change destination to vector mode.  */
	  rtx vec = gen_reg_rtx (dest_vecmode);
	  /* Generate an XMM vector SET.  */
	  set = gen_rtx_SET (vec, src);
	  set_insn = emit_insn_before (set, insn);
	  df_insn_rescan (set_insn);

	  if (cfun->can_throw_non_call_exceptions)
	    {
	      /* Handle REG_EH_REGION note.  */
	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	      if (note)
		{
		  control_flow_insns.safe_push (set_insn);
		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
		}
	    }

	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
	  set = gen_rtx_SET (dest, src);

	  /* Drop possible dead definitions.  */
	  PATTERN (insn) = set;

	  INSN_CODE (insn) = -1;
	  recog_memoized (insn);
	  df_insn_rescan (insn);
	  bitmap_set_bit (convert_bbs, bb->index);
	}
    }

  if (v4sf_const0)
    {
      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below.  */
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      /* Generate a vxorps at entry of the nearest common dominator for
	 the basic blocks with conversions, walking up into the fake loop
	 that contains the whole function, so that there is only a single
	 vxorps in the whole function.  */
      bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
					     convert_bbs);
      while (bb->loop_father->latch
	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
	bb = get_immediate_dominator (CDI_DOMINATORS,
				      bb->loop_father->header);

      set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));

      insn = BB_HEAD (bb);
      while (insn && !NONDEBUG_INSN_P (insn))
	{
	  if (insn == BB_END (bb))
	    {
	      insn = NULL;
	      break;
	    }
	  insn = NEXT_INSN (insn);
	}
      if (insn == BB_HEAD (bb))
	set_insn = emit_insn_before (set, insn);
      else
	set_insn = emit_insn_after (set,
				    insn ? PREV_INSN (insn) : BB_END (bb));
      df_insn_rescan (set_insn);
      df_process_deferred_rescans ();
      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  bitmap_obstack_release (NULL);
  BITMAP_FREE (convert_bbs);

  timevar_pop (TV_MACH_DEP);
  return 0;
}

namespace {

const pass_data pass_data_remove_partial_avx_dependency =
{
  RTL_PASS, /* type */
  "rpad", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

class pass_remove_partial_avx_dependency : public rtl_opt_pass
{
public:
  pass_remove_partial_avx_dependency (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return (TARGET_AVX
	      && TARGET_SSE_PARTIAL_REG_DEPENDENCY
	      && TARGET_SSE_MATH
	      && optimize
	      && optimize_function_for_speed_p (cfun));
    }

  virtual unsigned int execute (function *)
    {
      return remove_partial_avx_dependency ();
    }
}; // class pass_remove_partial_avx_dependency

} // anon namespace

rtl_opt_pass *
make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
{
  return new pass_remove_partial_avx_dependency (ctxt);
}

/* This compares the priority of target features in function DECL1
   and DECL2.  It returns a positive value if DECL1 has higher priority,
   a negative value if DECL2 has higher priority, and 0 if they are the
   same.  */

int
ix86_compare_version_priority (tree decl1, tree decl2)
{
  unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
  unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);

  return (int)priority1 - (int)priority2;
}

/* V1 and V2 point to function versions with different priorities
   based on the target ISA.  This function compares their priorities.  */

static int
feature_compare (const void *v1, const void *v2)
{
  typedef struct _function_version_info
    {
      tree version_decl;
      tree predicate_chain;
      unsigned int dispatch_priority;
    } function_version_info;

  const function_version_info c1 = *(const function_version_info *)v1;
  const function_version_info c2 = *(const function_version_info *)v2;
  return (c2.dispatch_priority - c1.dispatch_priority);
}

/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
   to return a pointer to VERSION_DECL if the outcome of the expression
   formed by PREDICATE_CHAIN is true.  This function will be called during
   version dispatch to decide which function version to execute.  It returns
   the basic block at the end, to which more conditions can be added.  */
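
/* For one version, the code appended to NEW_BB behaves roughly like
   (illustrative pseudo-GIMPLE; predicate_1 etc. stand for the decls
   taken from PREDICATE_CHAIN):

     cond_1 = predicate_1 (arg_1);
     cond_2 = predicate_2 (arg_2);
     and_expr = MIN_EXPR <cond_2, cond_1>;	// one per extra predicate
     if (and_expr > 0)
       return (void *) &version_decl;
     // otherwise control falls through to the returned block  */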

static basic_block
add_condition_to_bb (tree function_decl, tree version_decl,
		     tree predicate_chain, basic_block new_bb)
{
  gimple *return_stmt;
  tree convert_expr, result_var;
  gimple *convert_stmt;
  gimple *call_cond_stmt;
  gimple *if_else_stmt;

  basic_block bb1, bb2, bb3;
  edge e12, e23;

  tree cond_var, and_expr_var = NULL_TREE;
  gimple_seq gseq;

  tree predicate_decl, predicate_arg;

  push_cfun (DECL_STRUCT_FUNCTION (function_decl));

  gcc_assert (new_bb != NULL);
  gseq = bb_seq (new_bb);

  convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
			 build_fold_addr_expr (version_decl));
  result_var = create_tmp_var (ptr_type_node);
  convert_stmt = gimple_build_assign (result_var, convert_expr);
  return_stmt = gimple_build_return (result_var);

  if (predicate_chain == NULL_TREE)
    {
      gimple_seq_add_stmt (&gseq, convert_stmt);
      gimple_seq_add_stmt (&gseq, return_stmt);
      set_bb_seq (new_bb, gseq);
      gimple_set_bb (convert_stmt, new_bb);
      gimple_set_bb (return_stmt, new_bb);
      pop_cfun ();
      return new_bb;
    }

  while (predicate_chain != NULL)
    {
      cond_var = create_tmp_var (integer_type_node);
      predicate_decl = TREE_PURPOSE (predicate_chain);
      predicate_arg = TREE_VALUE (predicate_chain);
      call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
      gimple_call_set_lhs (call_cond_stmt, cond_var);

      gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
      gimple_set_bb (call_cond_stmt, new_bb);
      gimple_seq_add_stmt (&gseq, call_cond_stmt);

      predicate_chain = TREE_CHAIN (predicate_chain);

      if (and_expr_var == NULL)
	and_expr_var = cond_var;
      else
	{
	  gimple *assign_stmt;
	  /* Use MIN_EXPR to fold the conditions together; the result is
	     zero iff any of the predicate values is zero:
	     and_expr_var = MIN_EXPR <cond_var, and_expr_var>.  */
	  assign_stmt = gimple_build_assign (and_expr_var,
			  build2 (MIN_EXPR, integer_type_node,
				  cond_var, and_expr_var));

	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
	  gimple_set_bb (assign_stmt, new_bb);
	  gimple_seq_add_stmt (&gseq, assign_stmt);
	}
    }

  if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
				    integer_zero_node,
				    NULL_TREE, NULL_TREE);
  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
  gimple_set_bb (if_else_stmt, new_bb);
  gimple_seq_add_stmt (&gseq, if_else_stmt);

  gimple_seq_add_stmt (&gseq, convert_stmt);
  gimple_seq_add_stmt (&gseq, return_stmt);
  set_bb_seq (new_bb, gseq);

  bb1 = new_bb;
  e12 = split_block (bb1, if_else_stmt);
  bb2 = e12->dest;
  e12->flags &= ~EDGE_FALLTHRU;
  e12->flags |= EDGE_TRUE_VALUE;

  e23 = split_block (bb2, return_stmt);

  gimple_set_bb (convert_stmt, bb2);
  gimple_set_bb (return_stmt, bb2);

  bb3 = e23->dest;
  make_edge (bb1, bb3, EDGE_FALSE_VALUE);

  remove_edge (e23);
  make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);

  pop_cfun ();

  return bb3;
}

/* This function generates the dispatch function for
   multi-versioned functions.  DISPATCH_DECL is the function which will
   contain the dispatch logic.  FNDECLS_P points to a vector of the
   candidate function versions.  EMPTY_BB is the basic block pointer
   in DISPATCH_DECL in which the dispatch code is generated.  */
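
/* The resolver body built here behaves roughly like (an illustrative
   sketch; the actual version suffixes come from the target strings):

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (<predicates of the highest-priority version hold>)
	 return (void *) &foo.<highest-priority-suffix>;
       ...
       return (void *) &foo;	// the default version, tried last
     }  */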

static int
dispatch_function_versions (tree dispatch_decl,
			    void *fndecls_p,
			    basic_block *empty_bb)
{
  tree default_decl;
  gimple *ifunc_cpu_init_stmt;
  gimple_seq gseq;
  int ix;
  tree ele;
  vec<tree> *fndecls;
  unsigned int num_versions = 0;
  unsigned int actual_versions = 0;
  unsigned int i;

  struct _function_version_info
    {
      tree version_decl;
      tree predicate_chain;
      unsigned int dispatch_priority;
    }*function_version_info;

  gcc_assert (dispatch_decl != NULL
	      && fndecls_p != NULL
	      && empty_bb != NULL);

  /* fndecls_p is actually a vector.  */
  fndecls = static_cast<vec<tree> *> (fndecls_p);

  /* At least one more version other than the default.  */
  num_versions = fndecls->length ();
  gcc_assert (num_versions >= 2);

  function_version_info = (struct _function_version_info *)
    XNEWVEC (struct _function_version_info, (num_versions - 1));

  /* The first version in the vector is the default decl.  */
  default_decl = (*fndecls)[0];

  push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));

  gseq = bb_seq (*empty_bb);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicitly call __builtin_cpu_init here.  */
  ifunc_cpu_init_stmt
    = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
  set_bb_seq (*empty_bb, gseq);

  pop_cfun ();

  for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
    {
      tree version_decl = ele;
      tree predicate_chain = NULL_TREE;
      unsigned int priority;
      /* Get the attribute string, parse it and find the right predicate
	 decl.  The predicate function could be a lengthy combination of
	 many features, like arch-type and various isa-variants.  */
      priority = get_builtin_code_for_version (version_decl,
					       &predicate_chain);

      if (predicate_chain == NULL_TREE)
	continue;

      function_version_info [actual_versions].version_decl = version_decl;
      function_version_info [actual_versions].predicate_chain
	 = predicate_chain;
      function_version_info [actual_versions].dispatch_priority = priority;
      actual_versions++;
    }

  /* Sort the versions according to descending order of dispatch priority.
     The priority is based on the ISA.  This is not a perfect solution.
     There could still be ambiguity.  If more than one function version is
     suitable to execute, which one should be dispatched?  In the future,
     allow the user to specify a dispatch priority next to the version.  */
  qsort (function_version_info, actual_versions,
	 sizeof (struct _function_version_info), feature_compare);

  for (i = 0; i < actual_versions; ++i)
    *empty_bb = add_condition_to_bb (dispatch_decl,
				     function_version_info[i].version_decl,
				     function_version_info[i].predicate_chain,
				     *empty_bb);

  /* Dispatch the default version at the end.  */
  *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
				   NULL, *empty_bb);

  free (function_version_info);
  return 0;
}

/* This function changes the assembler name for functions that are
   versions.  If DECL is a function version and has a "target"
   attribute, it appends the attribute string to its assembler name.  */
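
/* For example (a sketch; the exact suffix is produced by
   sorted_attr_string), a version declared as

     int foo (void) __attribute__ ((target ("avx2")));

   would be assembled under a name like "foo.avx2", while the version
   whose target string is "default" keeps the original name.  */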

static tree
ix86_mangle_function_version_assembler_name (tree decl, tree id)
{
  tree version_attr;
  const char *orig_name, *version_string;
  char *attr_str, *assembler_name;

  if (DECL_DECLARED_INLINE_P (decl)
      && lookup_attribute ("gnu_inline",
			   DECL_ATTRIBUTES (decl)))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "function versions cannot be marked as %<gnu_inline%>,"
	      " bodies have to be generated");

  if (DECL_VIRTUAL_P (decl)
      || DECL_VINDEX (decl))
    sorry ("virtual function multiversioning not supported");

  version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));

  /* The target attribute string cannot be NULL.  */
  gcc_assert (version_attr != NULL_TREE);

  orig_name = IDENTIFIER_POINTER (id);
  version_string
    = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));

  if (strcmp (version_string, "default") == 0)
    return id;

  attr_str = sorted_attr_string (TREE_VALUE (version_attr));
  assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);

  sprintf (assembler_name, "%s.%s", orig_name, attr_str);

  /* Allow the assembler name to be modified if it has already been set.  */
  if (DECL_ASSEMBLER_NAME_SET_P (decl))
    SET_DECL_RTL (decl, NULL);

  tree ret = get_identifier (assembler_name);
  XDELETEVEC (attr_str);
  XDELETEVEC (assembler_name);
  return ret;
}

tree
ix86_mangle_decl_assembler_name (tree decl, tree id)
{
  /* For function versions, add the target suffix to the assembler name.  */
  if (TREE_CODE (decl) == FUNCTION_DECL
      && DECL_FUNCTION_VERSIONED (decl))
    id = ix86_mangle_function_version_assembler_name (decl, id);
#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
  id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
#endif

  return id;
}

/* Make a dispatcher declaration for the multi-versioned function DECL.
   Calls to DECL will be replaced with calls to the dispatcher by the
   front end.  Returns the decl of the dispatcher function.  */

tree
ix86_get_function_versions_dispatcher (void *decl)
{
  tree fn = (tree) decl;
  struct cgraph_node *node = NULL;
  struct cgraph_node *default_node = NULL;
  struct cgraph_function_version_info *node_v = NULL;
  struct cgraph_function_version_info *first_v = NULL;

  tree dispatch_decl = NULL;

  struct cgraph_function_version_info *default_version_info = NULL;

  gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));

  node = cgraph_node::get (fn);
  gcc_assert (node != NULL);

  node_v = node->function_version ();
  gcc_assert (node_v != NULL);

  if (node_v->dispatcher_resolver != NULL)
    return node_v->dispatcher_resolver;

  /* Find the default version and make it the first node.  */
  first_v = node_v;
  /* Go to the beginning of the chain.  */
  while (first_v->prev != NULL)
    first_v = first_v->prev;
  default_version_info = first_v;
  while (default_version_info != NULL)
    {
      if (is_function_default_version
	    (default_version_info->this_node->decl))
	break;
      default_version_info = default_version_info->next;
    }

  /* If there is no default node, just return NULL.  */
  if (default_version_info == NULL)
    return NULL;

  /* Make default info the first node.  */
  if (first_v != default_version_info)
    {
      default_version_info->prev->next = default_version_info->next;
      if (default_version_info->next)
	default_version_info->next->prev = default_version_info->prev;
      first_v->prev = default_version_info;
      default_version_info->next = first_v;
      default_version_info->prev = NULL;
    }

  default_node = default_version_info->this_node;

#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
  if (targetm.has_ifunc_p ())
    {
      struct cgraph_function_version_info *it_v = NULL;
      struct cgraph_node *dispatcher_node = NULL;
      struct cgraph_function_version_info *dispatcher_version_info = NULL;

      /* Right now, the dispatching is done via ifunc.  */
      dispatch_decl = make_dispatcher_decl (default_node->decl);

      dispatcher_node = cgraph_node::get_create (dispatch_decl);
      gcc_assert (dispatcher_node != NULL);
      dispatcher_node->dispatcher_function = 1;
      dispatcher_version_info
	= dispatcher_node->insert_new_function_version ();
      dispatcher_version_info->next = default_version_info;
      dispatcher_node->definition = 1;

      /* Set the dispatcher for all the versions.  */
      it_v = default_version_info;
      while (it_v != NULL)
	{
	  it_v->dispatcher_resolver = dispatch_decl;
	  it_v = it_v->next;
	}
    }
  else
#endif
    {
      error_at (DECL_SOURCE_LOCATION (default_node->decl),
		"multiversioning needs %<ifunc%> which is not supported "
		"on this target");
    }

  return dispatch_decl;
}

/* Make the resolver function decl to dispatch the versions of
   a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is
   the ifunc alias that will point to the created resolver.  Create an
   empty basic block in the resolver and store the pointer in
   EMPTY_BB.  Return the decl of the resolver function.  */
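
/* The net effect is roughly as if the user had declared (a sketch; the
   real resolver symbol is produced by clone_function_name and is not a
   valid C identifier):

     void *foo_resolver (void);	 // emitted under a name like "foo.resolver"
     int foo (void) __attribute__ ((ifunc ("foo_resolver")));  */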

static tree
make_resolver_func (const tree default_decl,
		    const tree ifunc_alias_decl,
		    basic_block *empty_bb)
{
  tree decl, type, t;

  /* Create the resolver function name based on default_decl.  */
  tree decl_name = clone_function_name (default_decl, "resolver");
  const char *resolver_name = IDENTIFIER_POINTER (decl_name);

  /* The resolver function should return a (void *).  */
  type = build_function_type_list (ptr_type_node, NULL_TREE);

  decl = build_fn_decl (resolver_name, type);
  SET_DECL_ASSEMBLER_NAME (decl, decl_name);

  DECL_NAME (decl) = decl_name;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 1;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;

  /* Resolver is not external, body is generated.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_EXTERNAL (ifunc_alias_decl) = 0;

  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  DECL_STATIC_CONSTRUCTOR (decl) = 0;

  if (DECL_COMDAT_GROUP (default_decl)
      || TREE_PUBLIC (default_decl))
    {
      /* In this case, each translation unit with a call to this
	 versioned function will put out a resolver.  Ensure it
	 is comdat to keep just one copy.  */
      DECL_COMDAT (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }
  else
    TREE_PUBLIC (ifunc_alias_decl) = 0;

  /* Build result decl and add to function_decl.  */
  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
  DECL_CONTEXT (t) = decl;
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  gimplify_function_tree (decl);
  push_cfun (DECL_STRUCT_FUNCTION (decl));
  *empty_bb = init_lowered_empty_function (decl, false,
					   profile_count::uninitialized ());

  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));

  pop_cfun ();

  gcc_assert (ifunc_alias_decl != NULL);
  /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
  DECL_ATTRIBUTES (ifunc_alias_decl)
    = make_attribute ("ifunc", resolver_name,
		      DECL_ATTRIBUTES (ifunc_alias_decl));

  /* Create the alias for dispatch to resolver here.  */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
  return decl;
}


/* Generate the dispatching code body to dispatch multi-versioned function
   DECL.  The target hook is called to process the "target" attributes and
   provide the code to dispatch the right function at run-time.  NODE points
   to the dispatcher decl whose body will be created.  */

tree
ix86_generate_version_dispatcher_body (void *node_p)
{
  tree resolver_decl;
  basic_block empty_bb;
  tree default_ver_decl;
  struct cgraph_node *versn;
  struct cgraph_node *node;

  struct cgraph_function_version_info *node_version_info = NULL;
  struct cgraph_function_version_info *versn_info = NULL;

  node = (cgraph_node *)node_p;

  node_version_info = node->function_version ();
  gcc_assert (node->dispatcher_function
	      && node_version_info != NULL);

  if (node_version_info->dispatcher_resolver)
    return node_version_info->dispatcher_resolver;

  /* The first version in the chain corresponds to the default version.  */
  default_ver_decl = node_version_info->next->this_node->decl;

  /* node is going to be an alias, so remove the finalized bit.  */
  node->definition = false;

  resolver_decl = make_resolver_func (default_ver_decl,
				      node->decl, &empty_bb);

  node_version_info->dispatcher_resolver = resolver_decl;

  push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));

  auto_vec<tree, 2> fn_ver_vec;

  for (versn_info = node_version_info->next; versn_info;
       versn_info = versn_info->next)
    {
      versn = versn_info->this_node;
      /* Check for virtual functions here again, as by this time it should
	 have been determined if this function needs a vtable index or
	 not.  This happens for methods in derived classes that override
	 virtual methods in base classes but are not explicitly marked as
	 virtual.  */
      if (DECL_VINDEX (versn->decl))
	sorry ("virtual function multiversioning not supported");

      fn_ver_vec.safe_push (versn->decl);
    }

  dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
  cgraph_edge::rebuild_edges ();
  pop_cfun ();
  return resolver_decl;
}