/* Copyright (C) 1988-2021 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "cfgbuild.h"
#include "alias.h"
#include "fold-const.h"
#include "attribs.h"
#include "calls.h"
#include "stor-layout.h"
#include "varasm.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "explow.h"
#include "expr.h"
#include "cfgrtl.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
#include "tree-pass.h"
#include "context.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "shrink-wrap.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tree-iterator.h"
#include "dbgcnt.h"
#include "case-cfn-macros.h"
#include "dojump.h"
#include "fold-const-call.h"
#include "tree-vrp.h"
#include "tree-ssanames.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "intl.h"
#include "ifcvt.h"
#include "symbol-summary.h"
#include "ipa-prop.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
#include "i386-builtins.h"
#include "i386-features.h"

const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};

const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:   0           1              2               3
    Offset:                                   realigned or    aligned + 8
    Register       aligned     aligned + 8    aligned w/HFP   w/HFP   */
    XMM15_REG,  /* 0x10        0x18           0x10            0x18    */
    XMM14_REG,  /* 0x20        0x28           0x20            0x28    */
    XMM13_REG,  /* 0x30        0x38           0x30            0x38    */
    XMM12_REG,  /* 0x40        0x48           0x40            0x48    */
    XMM11_REG,  /* 0x50        0x58           0x50            0x58    */
    XMM10_REG,  /* 0x60        0x68           0x60            0x68    */
    XMM9_REG,   /* 0x70        0x78           0x70            0x78    */
    XMM8_REG,   /* 0x80        0x88           0x80            0x88    */
    XMM7_REG,   /* 0x90        0x98           0x90            0x98    */
    XMM6_REG,   /* 0xa0        0xa8           0xa0            0xa8    */
    SI_REG,     /* 0xa8        0xb0           0xa8            0xb0    */
    DI_REG,     /* 0xb0        0xb8           0xb0            0xb8    */
    BX_REG,     /* 0xb8        0xc0           0xb8            0xc0    */
    BP_REG,     /* 0xc0        0xc8           N/A             N/A     */
    R12_REG,    /* 0xc8        0xd0           0xc0            0xc8    */
    R13_REG,    /* 0xd0        0xd8           0xc8            0xd0    */
    R14_REG,    /* 0xd8        0xe0           0xd0            0xd8    */
    R15_REG,    /* 0xe0        0xe8           0xd8            0xe0    */
};

/* Instantiate static const values.  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;

/* Initialize xlogue_layout::s_stub_names to zero.  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				[STUB_NAME_MAX_LEN];

/* Instantiates all xlogue_layout instances.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};

/* Return an appropriate const instance of xlogue_layout based upon values
   in cfun->machine and crtl.  */
const class xlogue_layout &
xlogue_layout::get_instance ()
{
  enum xlogue_stub_sets stub_set;
  bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;

  if (stack_realign_fp)
    stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else if (frame_pointer_needed)
    stub_set = aligned_plus_8
	      ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
	      : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else
    stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;

  return s_instances[stub_set];
}

/* Determine how many clobbered registers can be saved by the stub.
   Returns the count of registers the stub will save and restore.  */
unsigned
xlogue_layout::count_stub_managed_regs ()
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i, count;
  unsigned regno;

  for (count = i = MIN_REGS; i < MAX_REGS; ++i)
    {
      regno = REG_ORDER[i];
      if (regno == BP_REG && hfp)
	continue;
      if (!ix86_save_reg (regno, false, false))
	break;
      ++count;
    }
  return count;
}

/* Determine if register REGNO is a stub managed register given the
   total COUNT of stub managed registers.  */
bool
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i;

  for (i = 0; i < count; ++i)
    {
      gcc_assert (i < MAX_REGS);
      if (REG_ORDER[i] == BP_REG && hfp)
	++count;
      else if (REG_ORDER[i] == regno)
	return true;
    }
  return false;
}

/* Constructor for xlogue_layout.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp), m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	offset += 8;

      m_regs[j].regno    = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}

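/* Return the name of stub STUB for MIN_REGS + N_EXTRA_REGS registers,
   lazily constructing it in s_stub_names on first use.  */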
const char *
xlogue_layout::get_stub_name (enum xlogue_stub stub,
			      unsigned n_extra_regs)
{
  const int have_avx = TARGET_AVX;
  char *name = s_stub_names[!!have_avx][stub][n_extra_regs];

  /* Lazy init */
  if (!*name)
    {
      int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
			  (have_avx ? "avx" : "sse"),
			  STUB_BASE_NAMES[stub],
			  MIN_REGS + n_extra_regs);
      gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
    }

  return name;
}

/* Return rtx of a symbol ref for the entry point (based upon
   cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
rtx
xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
{
  const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
  gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
  gcc_assert (stub < XLOGUE_STUB_COUNT);
  gcc_assert (crtl->stack_realign_finalized);

  return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
}

unsigned scalar_chain::max_id = 0;

namespace {

/* Initialize new chain.  */

scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
{
  smode = smode_;
  vmode = vmode_;

  chain_id = ++max_id;

  if (dump_file)
    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);

  bitmap_obstack_initialize (NULL);
  insns = BITMAP_ALLOC (NULL);
  defs = BITMAP_ALLOC (NULL);
  defs_conv = BITMAP_ALLOC (NULL);
  queue = NULL;
}

/* Free chain's data.  */

scalar_chain::~scalar_chain ()
{
  BITMAP_FREE (insns);
  BITMAP_FREE (defs);
  BITMAP_FREE (defs_conv);
  bitmap_obstack_release (NULL);
}

/* Add an instruction into the chain's queue.  */

void
scalar_chain::add_to_queue (unsigned insn_uid)
{
  if (bitmap_bit_p (insns, insn_uid)
      || bitmap_bit_p (queue, insn_uid))
    return;

  if (dump_file)
    fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
	     insn_uid, chain_id);
  bitmap_set_bit (queue, insn_uid);
}

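/* Initialize a new chain that converts scalar mode SMODE_ operations
   to vector mode VMODE_.  */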
general_scalar_chain::general_scalar_chain (enum machine_mode smode_,
					    enum machine_mode vmode_)
     : scalar_chain (smode_, vmode_)
{
  insns_conv = BITMAP_ALLOC (NULL);
  n_sse_to_integer = 0;
  n_integer_to_sse = 0;
}

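/* Free data allocated for the chain.  */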
general_scalar_chain::~general_scalar_chain ()
{
  BITMAP_FREE (insns_conv);
}

/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  */

void
general_scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;
      n_integer_to_sse++;
    }
  else
    {
      if (!reg_new)
	return;
      n_sse_to_integer++;
    }

  if (dump_file)
    fprintf (dump_file,
	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}

/* For TImode conversion, it is unused.  */

void
timode_scalar_chain::mark_dual_mode_def (df_ref)
{
  gcc_unreachable ();
}

/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  */

void
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
{
  df_link *chain;

  gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
	      || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
  add_to_queue (DF_REF_INSN_UID (ref));

  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
    {
      unsigned uid = DF_REF_INSN_UID (chain->ref);

      if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
	continue;

      if (!DF_REF_REG_MEM_P (chain->ref))
	{
	  if (bitmap_bit_p (insns, uid))
	    continue;

	  if (bitmap_bit_p (candidates, uid))
	    {
	      add_to_queue (uid);
	      continue;
	    }
	}

      if (DF_REF_REG_DEF_P (chain->ref))
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (chain->ref);
	}
      else
	{
	  if (dump_file)
	    fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (ref);
	}
    }
}

/* Add instruction into a chain.  */

void
scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
{
  if (bitmap_bit_p (insns, insn_uid))
    return;

  if (dump_file)
    fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);

  bitmap_set_bit (insns, insn_uid);

  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
  rtx def_set = single_set (insn);
  if (def_set && REG_P (SET_DEST (def_set))
      && !HARD_REGISTER_P (SET_DEST (def_set)))
    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));

  /* ???  The following is quadratic since analyze_register_chain
     iterates over all refs to look for dual-mode regs.  Instead this
     should be done separately for all regs mentioned in the chain once.  */
  df_ref ref;
  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!HARD_REGISTER_P (DF_REF_REG (ref)))
      analyze_register_chain (candidates, ref);
  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      analyze_register_chain (candidates, ref);
}

/* Build new chain starting from insn INSN_UID recursively
   adding all dependent uses and definitions.  */

void
scalar_chain::build (bitmap candidates, unsigned insn_uid)
{
  queue = BITMAP_ALLOC (NULL);
  bitmap_set_bit (queue, insn_uid);

  if (dump_file)
    fprintf (dump_file, "Building chain #%d...\n", chain_id);

  while (!bitmap_empty_p (queue))
    {
      insn_uid = bitmap_first_set_bit (queue);
      bitmap_clear_bit (queue, insn_uid);
      bitmap_clear_bit (candidates, insn_uid);
      add_insn (candidates, insn_uid);
    }

  if (dump_file)
    {
      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
      fprintf (dump_file, "  insns: ");
      dump_bitmap (dump_file, insns);
      if (!bitmap_empty_p (defs_conv))
	{
	  bitmap_iterator bi;
	  unsigned id;
	  const char *comma = "";
	  fprintf (dump_file, "  defs to convert: ");
	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
	    {
	      fprintf (dump_file, "%sr%d", comma, id);
	      comma = ", ";
	    }
	  fprintf (dump_file, "\n");
	}
    }

  BITMAP_FREE (queue);
}

/* Return a cost of building a vector constant
   instead of using a scalar one.  */

int
general_scalar_chain::vector_const_cost (rtx exp)
{
  gcc_assert (CONST_INT_P (exp));

  if (standard_sse_constant_p (exp, vmode))
    return ix86_cost->sse_op;
  /* We have separate costs for SImode and DImode, use SImode costs
     for smaller modes.  */
  return ix86_cost->sse_load[smode == DImode ? 1 : 0];
}

/* Compute a gain for chain conversion.  */

int
general_scalar_chain::compute_convert_gain ()
{
  bitmap_iterator bi;
  unsigned insn_uid;
  int gain = 0;
  int cost = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

  /* SSE costs distinguish between SImode and DImode loads/stores, for
     int costs factor in the number of GPRs involved.  When supporting
     smaller modes than SImode the int load/store costs need to be
     adjusted as well.  */
  unsigned sse_cost_idx = smode == DImode ? 1 : 0;
  unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
      rtx def_set = single_set (insn);
      rtx src = SET_SRC (def_set);
      rtx dst = SET_DEST (def_set);
      int igain = 0;

      if (REG_P (src) && REG_P (dst))
	igain += 2 * m - ix86_cost->xmm_move;
      else if (REG_P (src) && MEM_P (dst))
	igain
	  += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
      else if (MEM_P (src) && REG_P (dst))
	igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
      else
	switch (GET_CODE (src))
	  {
	  case ASHIFT:
	  case ASHIFTRT:
	  case LSHIFTRT:
	    if (m == 2)
	      {
		if (INTVAL (XEXP (src, 1)) >= 32)
		  igain += ix86_cost->add;
		else
		  igain += ix86_cost->shift_const;
	      }

	    igain += ix86_cost->shift_const - ix86_cost->sse_op;

	    if (CONST_INT_P (XEXP (src, 0)))
	      igain -= vector_const_cost (XEXP (src, 0));
	    break;

	  case AND:
	  case IOR:
	  case XOR:
	  case PLUS:
	  case MINUS:
	    igain += m * ix86_cost->add - ix86_cost->sse_op;
	    /* Additional gain for andnot for targets without BMI.  */
	    if (GET_CODE (XEXP (src, 0)) == NOT
		&& !TARGET_BMI)
	      igain += m * ix86_cost->add;

	    if (CONST_INT_P (XEXP (src, 0)))
	      igain -= vector_const_cost (XEXP (src, 0));
	    if (CONST_INT_P (XEXP (src, 1)))
	      igain -= vector_const_cost (XEXP (src, 1));
	    break;

	  case NEG:
	  case NOT:
	    igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);

	    if (GET_CODE (XEXP (src, 0)) != ABS)
	      {
		igain += m * ix86_cost->add;
		break;
	      }
	    /* FALLTHRU */

	  case ABS:
	  case SMAX:
	  case SMIN:
	  case UMAX:
	  case UMIN:
	    /* We do not have any conditional move cost, estimate it as a
	       reg-reg move.  Comparisons are costed as adds.  */
	    igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
	    /* Integer SSE ops are all costed the same.  */
	    igain -= ix86_cost->sse_op;
	    break;

	  case COMPARE:
	    /* Assume comparison cost is the same.  */
	    break;

	  case CONST_INT:
	    if (REG_P (dst))
	      {
		if (optimize_insn_for_size_p ())
		  {
		    /* xor (2 bytes) vs. xorps (3 bytes).  */
		    if (src == const0_rtx)
		      igain -= COSTS_N_BYTES (1);
		    /* movdi_internal vs. movv2di_internal.  */
		    /* => mov (5 bytes) vs. movaps (7 bytes).  */
		    else if (x86_64_immediate_operand (src, SImode))
		      igain -= COSTS_N_BYTES (2);
		    else
		      /* ??? Larger immediate constants are placed in the
			 constant pool, where the size benefit/impact of
			 STV conversion is affected by whether and how
			 often each constant pool entry is shared/reused.
			 The value below is empirically derived from the
			 CSiBE benchmark (and the optimal value may drift
			 over time).  */
		      igain += COSTS_N_BYTES (0);
		  }
		else
		  {
		    /* DImode can be immediate for TARGET_64BIT
		       and SImode always.  */
		    igain += m * COSTS_N_INSNS (1);
		    igain -= vector_const_cost (src);
		  }
	      }
	    else if (MEM_P (dst))
	      {
		igain += (m * ix86_cost->int_store[2]
			  - ix86_cost->sse_store[sse_cost_idx]);
		igain -= vector_const_cost (src);
	      }
	    break;

	  default:
	    gcc_unreachable ();
	  }

      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, "  Instruction gain %d for ", igain);
	  dump_insn_slim (dump_file, insn);
	}
      gain += igain;
    }

  if (dump_file)
    fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);

  /* Cost the integer to sse and sse to integer moves.  */
  cost += n_sse_to_integer * ix86_cost->sse_to_integer;
  /* ???  integer_to_sse but we only have that in the RA cost table.
     Assume sse_to_integer/integer_to_sse are the same which they
     are at the moment.  */
  cost += n_integer_to_sse * ix86_cost->sse_to_integer;

  if (dump_file)
    fprintf (dump_file, "  Registers conversion cost: %d\n", cost);

  gain -= cost;

  if (dump_file)
    fprintf (dump_file, "  Total gain: %d\n", gain);

  return gain;
}

/* Insert generated conversion instruction sequence INSNS
   after instruction AFTER.  New BB may be required in case
   instruction has EH region attached.  */

void
scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
{
  if (!control_flow_insn_p (after))
    {
      emit_insn_after (insns, after);
      return;
    }

  basic_block bb = BLOCK_FOR_INSN (after);
  edge e = find_fallthru_edge (bb->succs);
  gcc_assert (e);

  basic_block new_bb = split_edge (e);
  emit_insn_after (insns, BB_HEAD (new_bb));
}

} // anon namespace

/* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
   zeroing the upper parts.  */

static rtx
gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
{
  switch (GET_MODE_NUNITS (vmode))
    {
    case 1:
      /* We are not using this case currently.  */
      gcc_unreachable ();
    case 2:
      return gen_rtx_VEC_CONCAT (vmode, gpr,
				 CONST0_RTX (GET_MODE_INNER (vmode)));
    default:
      return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
				CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
    }
}

/* Make vector copies for all register REGNO definitions
   and replace its uses in a chain.  */

void
general_scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
{
  rtx vreg = *defs_map.get (reg);

  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    {
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      if (smode == DImode && !TARGET_64BIT)
	{
	  emit_move_insn (adjust_address (tmp, SImode, 0),
			  gen_rtx_SUBREG (SImode, reg, 0));
	  emit_move_insn (adjust_address (tmp, SImode, 4),
			  gen_rtx_SUBREG (SImode, reg, 4));
	}
      else
	emit_move_insn (copy_rtx (tmp), reg);
      emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			      gen_gpr_to_xmm_move_src (vmode, tmp)));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (SImode, reg, 4),
					GEN_INT (2)));
	}
      else
	{
	  rtx tmp = gen_reg_rtx (DImode);
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 4)));
	  emit_insn (gen_vec_interleave_lowv4si
		     (gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, tmp, 0)));
	}
    }
  else
    emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			    gen_gpr_to_xmm_move_src (vmode, reg)));
  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a vector register r%d for insn %d\n",
	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
}

/* Copy the definition SRC of INSN inside the chain to DST for
   scalar uses outside of the chain.  */

void
general_scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
{
  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
    {
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      emit_move_insn (tmp, src);
      if (!TARGET_64BIT && smode == DImode)
	{
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  adjust_address (tmp, SImode, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  adjust_address (tmp, SImode, 4));
	}
      else
	emit_move_insn (dst, copy_rtx (tmp));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode,
				      gen_rtvec (1, const0_rtx));
	  emit_insn
	      (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 0),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));

	  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
	  emit_insn
	      (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 4),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));
	}
      else
	{
	  rtx vcopy = gen_reg_rtx (V2DImode);
	  emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	  emit_move_insn (vcopy,
			  gen_rtx_LSHIFTRT (V2DImode,
					    vcopy, GEN_INT (32)));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	}
    }
  else
    emit_move_insn (dst, src);

  rtx_insn *seq = get_insns ();
  end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a scalar register r%d for insn %d\n",
	     REGNO (src), REGNO (dst), INSN_UID (insn));
}

/* Convert operand OP in INSN.  We should handle
   memory operands and uninitialized registers.
   All other register uses are converted during
   registers conversion.  */

void
general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
{
  *op = copy_rtx_if_shared (*op);

  if (GET_CODE (*op) == NOT)
    {
      convert_op (&XEXP (*op, 0), insn);
      PUT_MODE (*op, vmode);
    }
  else if (MEM_P (*op))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (*op));

      /* Handle movabs.  */
      if (!memory_operand (*op, GET_MODE (*op)))
	{
	  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));

	  emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
	  *op = tmp2;
	}

      emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
				     gen_gpr_to_xmm_move_src (vmode, *op)),
			insn);
      *op = gen_rtx_SUBREG (vmode, tmp, 0);

      if (dump_file)
	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
		 INSN_UID (insn), REGNO (tmp));
    }
  else if (REG_P (*op))
    {
      *op = gen_rtx_SUBREG (vmode, *op, 0);
    }
  else if (CONST_INT_P (*op))
    {
      rtx vec_cst;
      rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);

      /* Prefer all ones vector in case of -1.  */
      if (constm1_operand (*op, GET_MODE (*op)))
	vec_cst = CONSTM1_RTX (vmode);
      else
	{
	  unsigned n = GET_MODE_NUNITS (vmode);
	  rtx *v = XALLOCAVEC (rtx, n);
	  v[0] = *op;
	  for (unsigned i = 1; i < n; ++i)
	    v[i] = const0_rtx;
	  vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
	}

      if (!standard_sse_constant_p (vec_cst, vmode))
	{
	  start_sequence ();
	  vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
	  rtx_insn *seq = get_insns ();
	  end_sequence ();
	  emit_insn_before (seq, insn);
	}

      emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
      *op = tmp;
    }
  else
    {
      gcc_assert (SUBREG_P (*op));
      gcc_assert (GET_MODE (*op) == vmode);
    }
}

/* Convert INSN to vector mode.  */

void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
  /* Generate copies for out-of-chain uses of defs and adjust debug uses.  */
  for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
      {
	df_link *use;
	for (use = DF_REF_CHAIN (ref); use; use = use->next)
	  if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
	      && (DF_REF_REG_MEM_P (use->ref)
		  || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
	    break;
	if (use)
	  convert_reg (insn, DF_REF_REG (ref),
		       *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
	else if (MAY_HAVE_DEBUG_BIND_INSNS)
	  {
	    /* If we generated a scalar copy we can leave debug-insns
	       as-is, if not, we have to adjust them.  */
	    auto_vec<rtx_insn *, 5> to_reset_debug_insns;
	    for (use = DF_REF_CHAIN (ref); use; use = use->next)
	      if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
		{
		  rtx_insn *debug_insn = DF_REF_INSN (use->ref);
		  /* If there's a reaching definition outside of the
		     chain we have to reset.  */
		  df_link *def;
		  for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
		    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
		      break;
		  if (def)
		    to_reset_debug_insns.safe_push (debug_insn);
		  else
		    {
		      *DF_REF_REAL_LOC (use->ref)
			= *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
		      df_insn_rescan (debug_insn);
		    }
		}
	    /* Have to do the reset outside of the DF_CHAIN walk to not
	       disrupt it.  */
	    while (!to_reset_debug_insns.is_empty ())
	      {
		rtx_insn *debug_insn = to_reset_debug_insns.pop ();
		INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
		df_insn_rescan_debug_internal (debug_insn);
	      }
	  }
      }

  /* Replace uses in this insn with the defs we use in the chain.  */
  for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
	{
	  /* Also update a corresponding REG_DEAD note.  */
	  rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
	  if (note)
	    XEXP (note, 0) = *vreg;
	  *DF_REF_REAL_LOC (ref) = *vreg;
	}

  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx subreg;

  if (MEM_P (dst) && !REG_P (src))
    {
      /* There are no scalar integer instructions and therefore
	 temporary register usage is required.  */
      rtx tmp = gen_reg_rtx (smode);
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
      dst = gen_rtx_SUBREG (vmode, tmp, 0);
    }
  else if (REG_P (dst))
    {
      /* Replace the definition with a SUBREG to the definition we
         use inside the chain.  */
      rtx *vdef = defs_map.get (dst);
      if (vdef)
	dst = *vdef;
      dst = gen_rtx_SUBREG (vmode, dst, 0);
      /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
         is a non-REG_P.  So kill those off.  */
      rtx note = find_reg_equal_equiv_note (insn);
      if (note)
	remove_note (insn, note);
    }

  switch (GET_CODE (src))
    {
    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      convert_op (&XEXP (src, 1), insn);
      /* FALLTHRU */

    case ABS:
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
      break;

    case NEG:
      src = XEXP (src, 0);

      if (GET_CODE (src) == ABS)
	{
	  src = XEXP (src, 0);
	  convert_op (&src, insn);
	  subreg = gen_reg_rtx (vmode);
	  emit_insn_before (gen_rtx_SET (subreg,
					 gen_rtx_ABS (vmode, src)), insn);
	  src = subreg;
	}
      else
	convert_op (&src, insn);

      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
      src = gen_rtx_MINUS (vmode, subreg, src);
      break;

    case NOT:
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
      src = gen_rtx_XOR (vmode, src, subreg);
      break;

    case MEM:
      if (!REG_P (dst))
	convert_op (&src, insn);
      break;

    case REG:
      if (!MEM_P (dst))
	convert_op (&src, insn);
      break;

    case SUBREG:
      gcc_assert (GET_MODE (src) == vmode);
      break;

    case COMPARE:
      src = SUBREG_REG (XEXP (XEXP (src, 0), 0));

      gcc_assert (REG_P (src) && GET_MODE (src) == DImode);
      subreg = gen_rtx_SUBREG (V2DImode, src, 0);
      emit_insn_before (gen_vec_interleave_lowv2di
			(copy_rtx_if_shared (subreg),
			 copy_rtx_if_shared (subreg),
			 copy_rtx_if_shared (subreg)),
			insn);
      dst = gen_rtx_REG (CCmode, FLAGS_REG);
      src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg),
					       copy_rtx_if_shared (subreg)),
			    UNSPEC_PTEST);
      break;

    case CONST_INT:
      convert_op (&src, insn);
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  int patt = recog_memoized (insn);
  if (patt == -1)
    fatal_insn_not_found (insn);
  df_insn_rescan (insn);
}

/* Fix uses of converted REG in debug insns.  */

void
timode_scalar_chain::fix_debug_reg_uses (rtx reg)
{
  if (!flag_var_tracking)
    return;

  df_ref ref, next;
  for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
    {
      rtx_insn *insn = DF_REF_INSN (ref);
      /* Make sure the next ref is for a different instruction,
         so that we're not affected by the rescan.  */
      next = DF_REF_NEXT_REG (ref);
      while (next && DF_REF_INSN (next) == insn)
	next = DF_REF_NEXT_REG (next);

      if (DEBUG_INSN_P (insn))
	{
	  /* It may be a debug insn with a TImode variable in
	     register.  */
	  bool changed = false;
	  for (; ref != next; ref = DF_REF_NEXT_REG (ref))
	    {
	      rtx *loc = DF_REF_LOC (ref);
	      if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
		{
		  *loc = gen_rtx_SUBREG (TImode, *loc, 0);
		  changed = true;
		}
	    }
	  if (changed)
	    df_insn_rescan (insn);
	}
    }
}

/* Convert INSN from TImode to V1TImode.  */

void
timode_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  switch (GET_CODE (dst))
    {
    case REG:
      {
	rtx tmp = find_reg_equal_equiv_note (insn);
	if (tmp)
	  PUT_MODE (XEXP (tmp, 0), V1TImode);
	PUT_MODE (dst, V1TImode);
	fix_debug_reg_uses (dst);
      }
      break;
    case MEM:
      PUT_MODE (dst, V1TImode);
      break;

    default:
      gcc_unreachable ();
    }

  switch (GET_CODE (src))
    {
    case REG:
      PUT_MODE (src, V1TImode);
      /* Call fix_debug_reg_uses only if SRC is never defined.  */
      if (!DF_REG_DEF_CHAIN (REGNO (src)))
	fix_debug_reg_uses (src);
      break;

    case MEM:
      PUT_MODE (src, V1TImode);
      break;

    case CONST_WIDE_INT:
      if (NONDEBUG_INSN_P (insn))
	{
	  /* Since there are no instructions to store 128-bit constant,
	     temporary register usage is required.  */
	  rtx tmp = gen_reg_rtx (V1TImode);
	  start_sequence ();
	  src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
	  src = validize_mem (force_const_mem (V1TImode, src));
	  rtx_insn *seq = get_insns ();
	  end_sequence ();
	  if (seq)
	    emit_insn_before (seq, insn);
	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
	  dst = tmp;
	}
      break;

    case CONST_INT:
      switch (standard_sse_constant_p (src, TImode))
	{
	case 1:
	  src = CONST0_RTX (GET_MODE (dst));
	  break;
	case 2:
	  src = CONSTM1_RTX (GET_MODE (dst));
	  break;
	default:
	  gcc_unreachable ();
	}
      if (NONDEBUG_INSN_P (insn))
	{
	  rtx tmp = gen_reg_rtx (V1TImode);
	  /* Since there are no instructions to store standard SSE
	     constant, temporary register usage is required.  */
	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
	  dst = tmp;
	}
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  recog_memoized (insn);
  df_insn_rescan (insn);
}

/* Generate copies from defs used by the chain but not defined therein.
   Also populates defs_map which is used later by convert_insn.  */

void
general_scalar_chain::convert_registers ()
{
  bitmap_iterator bi;
  unsigned id;
  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
    {
      rtx chain_reg = gen_reg_rtx (smode);
      defs_map.put (regno_reg_rtx[id], chain_reg);
    }
  EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
    for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
      if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
	make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
}

/* Convert whole chain creating required register
   conversions and copies.  */

int
scalar_chain::convert ()
{
  bitmap_iterator bi;
  unsigned id;
  int converted_insns = 0;

  if (!dbg_cnt (stv_conversion))
    return 0;

  if (dump_file)
    fprintf (dump_file, "Converting chain #%d...\n", chain_id);

  convert_registers ();

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
    {
      convert_insn (DF_INSN_UID_GET (id)->insn);
      converted_insns++;
    }

  return converted_insns;
}

/* Return the SET expression if INSN doesn't reference hard register.
   Return NULL if INSN uses or defines a hard register, excluding
   pseudo register pushes, hard register uses in a memory address,
   clobbers and flags definitions.  */

static rtx
pseudo_reg_set (rtx_insn *insn)
{
  rtx set = single_set (insn);
  if (!set)
    return NULL;

  /* Check pseudo register push first. */
  machine_mode mode = TARGET_64BIT ? TImode : DImode;
  if (REG_P (SET_SRC (set))
      && !HARD_REGISTER_P (SET_SRC (set))
      && push_operand (SET_DEST (set), mode))
    return set;

  df_ref ref;
  FOR_EACH_INSN_DEF (ref, insn)
    if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
	&& DF_REF_REGNO (ref) != FLAGS_REG)
      return NULL;

  FOR_EACH_INSN_USE (ref, insn)
    if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
      return NULL;

  return set;
}

/* Check if comparison INSN may be transformed
   into vector comparison.  Currently we transform
   zero checks only which look like:

   (set (reg:CCZ 17 flags)
        (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
                             (subreg:SI (reg:DI x) 0))
		     (const_int 0 [0])))  */

static bool
convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
{
  /* ??? Currently convertible for double-word DImode chain only.  */
  if (TARGET_64BIT || mode != DImode)
    return false;

  if (!TARGET_SSE4_1)
    return false;

  rtx def_set = single_set (insn);

  gcc_assert (def_set);

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  gcc_assert (GET_CODE (src) == COMPARE);

  if (GET_CODE (dst) != REG
      || REGNO (dst) != FLAGS_REG
      || GET_MODE (dst) != CCZmode)
    return false;

  rtx op1 = XEXP (src, 0);
  rtx op2 = XEXP (src, 1);

  if (op2 != CONST0_RTX (GET_MODE (op2)))
    return false;

  if (GET_CODE (op1) != IOR)
    return false;

  op2 = XEXP (op1, 1);
  op1 = XEXP (op1, 0);

  if (!SUBREG_P (op1)
      || !SUBREG_P (op2)
      || GET_MODE (op1) != SImode
      || GET_MODE (op2) != SImode
      || ((SUBREG_BYTE (op1) != 0
	   || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
	  && (SUBREG_BYTE (op2) != 0
	      || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
    return false;

  op1 = SUBREG_REG (op1);
  op2 = SUBREG_REG (op2);

  if (op1 != op2
      || !REG_P (op1)
      || GET_MODE (op1) != DImode)
    return false;

  return true;
}

/* The general version of scalar_to_vector_candidate_p.  */

static bool
general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
{
  rtx def_set = pseudo_reg_set (insn);

  if (!def_set)
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  if (GET_CODE (src) == COMPARE)
    return convertible_comparison_p (insn, mode);

  /* We are interested in "mode" only.  */
  if ((GET_MODE (src) != mode
       && !CONST_INT_P (src))
      || GET_MODE (dst) != mode)
    return false;

  if (!REG_P (dst) && !MEM_P (dst))
    return false;

  switch (GET_CODE (src))
    {
    case ASHIFTRT:
      if (!TARGET_AVX512VL)
	return false;
      /* FALLTHRU */

    case ASHIFT:
    case LSHIFTRT:
      if (!CONST_INT_P (XEXP (src, 1))
	  || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
	return false;
      break;

    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      if ((mode == DImode && !TARGET_AVX512VL)
	  || (mode == SImode && !TARGET_SSE4_1))
	return false;
      /* Fallthru.  */

    case AND:
    case IOR:
    case XOR:
    case PLUS:
    case MINUS:
      if (!REG_P (XEXP (src, 1))
	  && !MEM_P (XEXP (src, 1))
	  && !CONST_INT_P (XEXP (src, 1)))
	return false;

      if (GET_MODE (XEXP (src, 1)) != mode
	  && !CONST_INT_P (XEXP (src, 1)))
	return false;

      /* Check for andnot case.  */
      if (GET_CODE (src) != AND
	  || GET_CODE (XEXP (src, 0)) != NOT)
	break;

      src = XEXP (src, 0);
      /* FALLTHRU */

    case NOT:
      break;

    case NEG:
      /* Check for nabs case.  */
      if (GET_CODE (XEXP (src, 0)) != ABS)
	break;

      src = XEXP (src, 0);
      /* FALLTHRU */

    case ABS:
      if ((mode == DImode && !TARGET_AVX512VL)
	  || (mode == SImode && !TARGET_SSSE3))
	return false;
      break;

    case REG:
      return true;

    case MEM:
    case CONST_INT:
      return REG_P (dst);

    default:
      return false;
    }

  if (!REG_P (XEXP (src, 0))
      && !MEM_P (XEXP (src, 0))
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  if (GET_MODE (XEXP (src, 0)) != mode
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  return true;
}

/* The TImode version of scalar_to_vector_candidate_p.  */

static bool
timode_scalar_to_vector_candidate_p (rtx_insn *insn)
{
  rtx def_set = pseudo_reg_set (insn);

  if (!def_set)
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  /* Only TImode load and store are allowed.  */
  if (GET_MODE (dst) != TImode)
    return false;

  if (MEM_P (dst))
    {
      /* Check for store.  Memory must be aligned or unaligned store
	 is optimal.  Only support store from register, standard SSE
	 constant or CONST_WIDE_INT generated from piecewise store.

	 ??? Verify performance impact before enabling CONST_INT for
	 __int128 store.  */
      if (misaligned_operand (dst, TImode)
	  && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
	return false;

      switch (GET_CODE (src))
	{
	default:
	  return false;

	case REG:
	case CONST_WIDE_INT:
	  return true;

	case CONST_INT:
	  return standard_sse_constant_p (src, TImode);
	}
    }
  else if (MEM_P (src))
    {
      /* Check for load.  Memory must be aligned or unaligned load is
	 optimal.  */
      return (REG_P (dst)
	      && (!misaligned_operand (src, TImode)
		  || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
    }

  return false;
}

/* For a register REGNO, scan instructions for its defs and uses.
   Put REGNO in REGS if a def or use isn't in CANDIDATES.  */

static void
timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
				   unsigned int regno)
{
  for (df_ref def = DF_REG_DEF_CHAIN (regno);
       def;
       def = DF_REF_NEXT_REG (def))
    {
      if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "r%d has non convertible def in insn %d\n",
		     regno, DF_REF_INSN_UID (def));

	  bitmap_set_bit (regs, regno);
	  break;
	}
    }

  for (df_ref ref = DF_REG_USE_CHAIN (regno);
       ref;
       ref = DF_REF_NEXT_REG (ref))
    {
      /* Debug instructions are skipped.  */
      if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
	  && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "r%d has non convertible use in insn %d\n",
		     regno, DF_REF_INSN_UID (ref));

	  bitmap_set_bit (regs, regno);
	  break;
	}
    }
}

/* The TImode version of remove_non_convertible_regs.  */

static void
timode_remove_non_convertible_regs (bitmap candidates)
{
  bitmap_iterator bi;
  unsigned id;
  bitmap regs = BITMAP_ALLOC (NULL);

  EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
    {
      rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
      rtx dest = SET_DEST (def_set);
      rtx src = SET_SRC (def_set);

      if ((!REG_P (dest)
	   || bitmap_bit_p (regs, REGNO (dest))
	   || HARD_REGISTER_P (dest))
	  && (!REG_P (src)
	      || bitmap_bit_p (regs, REGNO (src))
	      || HARD_REGISTER_P (src)))
	continue;

      if (REG_P (dest))
	timode_check_non_convertible_regs (candidates, regs,
					   REGNO (dest));

      if (REG_P (src))
	timode_check_non_convertible_regs (candidates, regs,
					   REGNO (src));
    }

  EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
    {
      for (df_ref def = DF_REG_DEF_CHAIN (id);
	   def;
	   def = DF_REF_NEXT_REG (def))
	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
	  {
	    if (dump_file)
	      fprintf (dump_file, "Removing insn %d from candidates list\n",
		       DF_REF_INSN_UID (def));

	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
	  }

      for (df_ref ref = DF_REG_USE_CHAIN (id);
	   ref;
	   ref = DF_REF_NEXT_REG (ref))
	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
	  {
	    if (dump_file)
	      fprintf (dump_file, "Removing insn %d from candidates list\n",
		       DF_REF_INSN_UID (ref));

	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
	  }
    }

  BITMAP_FREE (regs);
}
1675 
1676 /* Main STV pass function.  Find and convert scalar
1677    instructions into vector mode when profitable.  */
1678 
1679 static unsigned int
convert_scalars_to_vector(bool timode_p)1680 convert_scalars_to_vector (bool timode_p)
1681 {
1682   basic_block bb;
1683   int converted_insns = 0;
1684 
1685   bitmap_obstack_initialize (NULL);
1686   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
1687   const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
1688   bitmap_head candidates[3];  /* { SImode, DImode, TImode } */
1689   for (unsigned i = 0; i < 3; ++i)
1690     bitmap_initialize (&candidates[i], &bitmap_default_obstack);
1691 
1692   calculate_dominance_info (CDI_DOMINATORS);
1693   df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
1694   df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
1695   df_analyze ();
1696 
1697   /* Find all instructions we want to convert into vector mode.  */
1698   if (dump_file)
1699     fprintf (dump_file, "Searching for mode conversion candidates...\n");
1700 
1701   FOR_EACH_BB_FN (bb, cfun)
1702     {
1703       rtx_insn *insn;
1704       FOR_BB_INSNS (bb, insn)
1705 	if (timode_p
1706 	    && timode_scalar_to_vector_candidate_p (insn))
1707 	  {
1708 	    if (dump_file)
1709 	      fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
1710 		       INSN_UID (insn));
1711 
1712 	    bitmap_set_bit (&candidates[2], INSN_UID (insn));
1713 	  }
1714 	else if (!timode_p)
1715 	  {
1716 	    /* Check {SI,DI}mode.  */
1717 	    for (unsigned i = 0; i <= 1; ++i)
1718 	      if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
1719 		{
1720 		  if (dump_file)
1721 		    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
1722 			     INSN_UID (insn), i == 0 ? "SImode" : "DImode");
1723 
1724 		  bitmap_set_bit (&candidates[i], INSN_UID (insn));
1725 		  break;
1726 		}
1727 	  }
1728     }
1729 
1730   if (timode_p)
1731     timode_remove_non_convertible_regs (&candidates[2]);
1732 
1733   for (unsigned i = 0; i <= 2; ++i)
1734     if (!bitmap_empty_p (&candidates[i]))
1735       break;
1736     else if (i == 2 && dump_file)
1737       fprintf (dump_file, "There are no candidates for optimization.\n");
1738 
1739   for (unsigned i = 0; i <= 2; ++i)
1740     while (!bitmap_empty_p (&candidates[i]))
1741       {
1742 	unsigned uid = bitmap_first_set_bit (&candidates[i]);
1743 	scalar_chain *chain;
1744 
1745 	if (cand_mode[i] == TImode)
1746 	  chain = new timode_scalar_chain;
1747 	else
1748 	  chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
1749 
1750 	/* Find instructions chain we want to convert to vector mode.
1751 	   Check all uses and definitions to estimate all required
1752 	   conversions.  */
1753 	chain->build (&candidates[i], uid);
1754 
1755 	if (chain->compute_convert_gain () > 0)
1756 	  converted_insns += chain->convert ();
1757 	else
1758 	  if (dump_file)
1759 	    fprintf (dump_file, "Chain #%d conversion is not profitable\n",
1760 		     chain->chain_id);
1761 
1762 	delete chain;
1763       }
1764 
1765   if (dump_file)
1766     fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
1767 
1768   for (unsigned i = 0; i <= 2; ++i)
1769     bitmap_release (&candidates[i]);
1770   bitmap_obstack_release (NULL);
1771   df_process_deferred_rescans ();
1772 
1773   /* Conversion means we may have 128bit register spills/fills
1774      which require aligned stack.  */
1775   if (converted_insns)
1776     {
1777       if (crtl->stack_alignment_needed < 128)
1778 	crtl->stack_alignment_needed = 128;
1779       if (crtl->stack_alignment_estimated < 128)
1780 	crtl->stack_alignment_estimated = 128;
1781 
1782       crtl->stack_realign_needed
1783 	= INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
1784       crtl->stack_realign_tried = crtl->stack_realign_needed;
1785 
1786       crtl->stack_realign_processed = true;
1787 
1788       if (!crtl->drap_reg)
1789 	{
1790 	  rtx drap_rtx = targetm.calls.get_drap_rtx ();
1791 
1792 	  /* stack_realign_drap and drap_rtx must match.  */
1793 	  gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
1794 
1795 	  /* Do nothing if NULL is returned,
1796 	     which means DRAP is not needed.  */
1797 	  if (drap_rtx != NULL)
1798 	    {
1799 	      crtl->args.internal_arg_pointer = drap_rtx;
1800 
1801 	      /* Call fixup_tail_calls to clean up
1802 		 REG_EQUIV note if DRAP is needed. */
1803 	      fixup_tail_calls ();
1804 	    }
1805 	}
1806 
1807       /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  */
1808       if (TARGET_64BIT)
1809 	for (tree parm = DECL_ARGUMENTS (current_function_decl);
1810 	     parm; parm = DECL_CHAIN (parm))
1811 	  {
1812 	    if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
1813 	      continue;
1814 	    if (DECL_RTL_SET_P (parm)
1815 		&& GET_MODE (DECL_RTL (parm)) == V1TImode)
1816 	      {
1817 		rtx r = DECL_RTL (parm);
1818 		if (REG_P (r))
1819 		  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
1820 	      }
1821 	    if (DECL_INCOMING_RTL (parm)
1822 		&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
1823 	      {
1824 		rtx r = DECL_INCOMING_RTL (parm);
1825 		if (REG_P (r))
1826 		  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
1827 	      }
1828 	  }
1829     }
1830 
1831   return 0;
1832 }
1833 
1834 static unsigned int
1835 rest_of_handle_insert_vzeroupper (void)
1836 {
1837   /* vzeroupper instructions are inserted immediately after reload to
1838      account for possible spills from 256bit or 512bit registers.  The pass
1839      reuses mode switching infrastructure by re-running mode insertion
1840      pass, so disable entities that have already been processed.  */
1841   for (int i = 0; i < MAX_386_ENTITIES; i++)
1842     ix86_optimize_mode_switching[i] = 0;
1843 
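  /* Re-enable only the AVX upper-state entity; the mode-switching pass run
     below then inserts the required vzeroupper instructions.  */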
1844   ix86_optimize_mode_switching[AVX_U128] = 1;
1845 
1846   /* Call optimize_mode_switching.  */
1847   g->get_passes ()->execute_pass_mode_switching ();
1848 
1849   df_analyze ();
1850   return 0;
1851 }
1852 
1853 namespace {
1854 
1855 const pass_data pass_data_insert_vzeroupper =
1856 {
1857   RTL_PASS, /* type */
1858   "vzeroupper", /* name */
1859   OPTGROUP_NONE, /* optinfo_flags */
1860   TV_MACH_DEP, /* tv_id */
1861   0, /* properties_required */
1862   0, /* properties_provided */
1863   0, /* properties_destroyed */
1864   0, /* todo_flags_start */
1865   TODO_df_finish, /* todo_flags_finish */
1866 };
1867 
1868 class pass_insert_vzeroupper : public rtl_opt_pass
1869 {
1870 public:
1871   pass_insert_vzeroupper(gcc::context *ctxt)
1872     : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
1873   {}
1874 
1875   /* opt_pass methods: */
1876   virtual bool gate (function *)
1877     {
1878       return TARGET_AVX && TARGET_VZEROUPPER
1879 	&& flag_expensive_optimizations && !optimize_size;
1880     }
1881 
1882   virtual unsigned int execute (function *)
1883     {
1884       return rest_of_handle_insert_vzeroupper ();
1885     }
1886 
1887 }; // class pass_insert_vzeroupper
1888 
1889 const pass_data pass_data_stv =
1890 {
1891   RTL_PASS, /* type */
1892   "stv", /* name */
1893   OPTGROUP_NONE, /* optinfo_flags */
1894   TV_MACH_DEP, /* tv_id */
1895   0, /* properties_required */
1896   0, /* properties_provided */
1897   0, /* properties_destroyed */
1898   0, /* todo_flags_start */
1899   TODO_df_finish, /* todo_flags_finish */
1900 };
1901 
1902 class pass_stv : public rtl_opt_pass
1903 {
1904 public:
1905   pass_stv (gcc::context *ctxt)
1906     : rtl_opt_pass (pass_data_stv, ctxt),
1907       timode_p (false)
1908   {}
1909 
1910   /* opt_pass methods: */
1911   virtual bool gate (function *)
1912     {
1913       return ((!timode_p || TARGET_64BIT)
1914 	      && TARGET_STV && TARGET_SSE2 && optimize > 1);
1915     }
1916 
1917   virtual unsigned int execute (function *)
1918     {
1919       return convert_scalars_to_vector (timode_p);
1920     }
1921 
1922   opt_pass *clone ()
1923     {
1924       return new pass_stv (m_ctxt);
1925     }
1926 
1927   void set_pass_param (unsigned int n, bool param)
1928     {
1929       gcc_assert (n == 0);
1930       timode_p = param;
1931     }
1932 
1933 private:
1934   bool timode_p;
1935 }; // class pass_stv
1936 
1937 } // anon namespace
1938 
1939 rtl_opt_pass *
1940 make_pass_insert_vzeroupper (gcc::context *ctxt)
1941 {
1942   return new pass_insert_vzeroupper (ctxt);
1943 }
1944 
1945 rtl_opt_pass *
1946 make_pass_stv (gcc::context *ctxt)
1947 {
1948   return new pass_stv (ctxt);
1949 }
1950 
1951 /* Inserting ENDBR and pseudo patchable-area instructions.  */
1952 
1953 static void
1954 rest_of_insert_endbr_and_patchable_area (bool need_endbr,
1955 					 unsigned int patchable_area_size)
1956 {
1957   rtx endbr;
1958   rtx_insn *insn;
1959   rtx_insn *endbr_insn = NULL;
1960   basic_block bb;
1961 
1962   if (need_endbr)
1963     {
1964       /* Currently emit ENDBR if the function is tracked, i.e. the
1965 	 'nocf_check' attribute is absent from the function attributes.
1966 	 Later an optimization will be introduced to analyze whether the
1967 	 address of a static function is taken.  A static function whose
1968 	 address is not taken will get a nocf_check attribute.  This will
1969 	 allow the number of ENDBR instructions to be reduced.  */
1970       if (!lookup_attribute ("nocf_check",
1971 			     TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
1972 	  && (!flag_manual_endbr
1973 	      || lookup_attribute ("cf_check",
1974 				   DECL_ATTRIBUTES (cfun->decl)))
1975 	  && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
1976 	      || ix86_cmodel == CM_LARGE
1977 	      || ix86_cmodel == CM_LARGE_PIC
1978 	      || flag_force_indirect_call
1979 	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
1980 		  && DECL_DLLIMPORT_P (cfun->decl))))
1981 	{
1982 	  if (crtl->profile && flag_fentry)
1983 	    {
1984 	      /* Queue ENDBR insertion to x86_function_profiler.
1985 		 NB: Any patchable-area insn will be inserted after
1986 		 ENDBR.  */
1987 	      cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
1988 	    }
1989 	  else
1990 	    {
1991 	      endbr = gen_nop_endbr ();
1992 	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
1993 	      rtx_insn *insn = BB_HEAD (bb);
1994 	      endbr_insn = emit_insn_before (endbr, insn);
1995 	    }
1996 	}
1997     }
1998 
1999   if (patchable_area_size)
2000     {
2001       if (crtl->profile && flag_fentry)
2002 	{
2003 	  /* Queue patchable-area insertion to x86_function_profiler.
2004 	     NB: If there is a queued ENDBR, x86_function_profiler
2005 	     will also handle patchable-area.  */
2006 	  if (!cfun->machine->insn_queued_at_entrance)
2007 	    cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
2008 	}
2009       else
2010 	{
2011 	  rtx patchable_area
2012 	    = gen_patchable_area (GEN_INT (patchable_area_size),
2013 				  GEN_INT (crtl->patch_area_entry == 0));
2014 	  if (endbr_insn)
2015 	    emit_insn_after (patchable_area, endbr_insn);
2016 	  else
2017 	    {
2018 	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2019 	      insn = BB_HEAD (bb);
2020 	      emit_insn_before (patchable_area, insn);
2021 	    }
2022 	}
2023     }
2024 
2025   if (!need_endbr)
2026     return;
2027 
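  /* Scan all insns and add ENDBR where control may arrive via an indirect
     transfer: after setjmp-like calls and calls that may return via an
     indirect branch, at the targets of switch-table jumps (when
     flag_cet_switch is set), and after labels whose address is preserved.  */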
2028   bb = 0;
2029   FOR_EACH_BB_FN (bb, cfun)
2030     {
2031       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2032 	   insn = NEXT_INSN (insn))
2033 	{
2034 	  if (CALL_P (insn))
2035 	    {
2036 	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
2037 	      if (!need_endbr && !SIBLING_CALL_P (insn))
2038 		{
2039 		  rtx call = get_call_rtx_from (insn);
2040 		  rtx fnaddr = XEXP (call, 0);
2041 		  tree fndecl = NULL_TREE;
2042 
2043 		  /* Also generate ENDBRANCH for a non-tail call which
2044 		     may return via an indirect branch.  */
2045 		  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
2046 		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
2047 		  if (fndecl == NULL_TREE)
2048 		    fndecl = MEM_EXPR (fnaddr);
2049 		  if (fndecl
2050 		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
2051 		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
2052 		    fndecl = NULL_TREE;
2053 		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
2054 		    {
2055 		      tree fntype = TREE_TYPE (fndecl);
2056 		      if (lookup_attribute ("indirect_return",
2057 					    TYPE_ATTRIBUTES (fntype)))
2058 			need_endbr = true;
2059 		    }
2060 		}
2061 	      if (!need_endbr)
2062 		continue;
2063 	      /* Generate ENDBRANCH after a CALL which may return more
2064 		 than once, such as setjmp-like functions.  */
2065 
2066 	      endbr = gen_nop_endbr ();
2067 	      emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
2068 	      continue;
2069 	    }
2070 
2071 	  if (JUMP_P (insn) && flag_cet_switch)
2072 	    {
2073 	      rtx target = JUMP_LABEL (insn);
2074 	      if (target == NULL_RTX || ANY_RETURN_P (target))
2075 		continue;
2076 
2077 	      /* Check the jump is a switch table.  */
2078 	      rtx_insn *label = as_a<rtx_insn *> (target);
2079 	      rtx_insn *table = next_insn (label);
2080 	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2081 		continue;
2082 
2083 	      /* For the indirect jump find out all places it jumps and insert
2084 		 ENDBRANCH there.  It should be done under a special flag to
2085 		 control ENDBRANCH generation for switch stmts.  */
2086 	      edge_iterator ei;
2087 	      edge e;
2088 	      basic_block dest_blk;
2089 
2090 	      FOR_EACH_EDGE (e, ei, bb->succs)
2091 		{
2092 		  rtx_insn *insn;
2093 
2094 		  dest_blk = e->dest;
2095 		  insn = BB_HEAD (dest_blk);
2096 		  gcc_assert (LABEL_P (insn));
2097 		  endbr = gen_nop_endbr ();
2098 		  emit_insn_after (endbr, insn);
2099 		}
2100 	      continue;
2101 	    }
2102 
2103 	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2104 	    {
2105 	      endbr = gen_nop_endbr ();
2106 	      emit_insn_after (endbr, insn);
2107 	      continue;
2108 	    }
2109 	}
2110     }
2111 
2112   return;
2113 }
2114 
2115 namespace {
2116 
2117 const pass_data pass_data_insert_endbr_and_patchable_area =
2118 {
2119   RTL_PASS, /* type.  */
2120   "endbr_and_patchable_area", /* name.  */
2121   OPTGROUP_NONE, /* optinfo_flags.  */
2122   TV_MACH_DEP, /* tv_id.  */
2123   0, /* properties_required.  */
2124   0, /* properties_provided.  */
2125   0, /* properties_destroyed.  */
2126   0, /* todo_flags_start.  */
2127   0, /* todo_flags_finish.  */
2128 };
2129 
2130 class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
2131 {
2132 public:
2133   pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2134     : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
2135   {}
2136 
2137   /* opt_pass methods: */
2138   virtual bool gate (function *)
2139     {
2140       need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
2141       patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
2142       return need_endbr || patchable_area_size;
2143     }
2144 
2145   virtual unsigned int execute (function *)
2146     {
2147       timevar_push (TV_MACH_DEP);
2148       rest_of_insert_endbr_and_patchable_area (need_endbr,
2149 					       patchable_area_size);
2150       timevar_pop (TV_MACH_DEP);
2151       return 0;
2152     }
2153 
2154 private:
2155   bool need_endbr;
2156   unsigned int patchable_area_size;
2157 }; // class pass_insert_endbr_and_patchable_area
2158 
2159 } // anon namespace
2160 
2161 rtl_opt_pass *
2162 make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2163 {
2164   return new pass_insert_endbr_and_patchable_area (ctxt);
2165 }
2166 
2167 /* At entry of the nearest common dominator for basic blocks with
2168    conversions/rcp/sqrt/rsqrt/round, generate a single
2169 	vxorps %xmmN, %xmmN, %xmmN
2170    for all
2171 	vcvtss2sd  op, %xmmN, %xmmX
2172 	vcvtsd2ss  op, %xmmN, %xmmX
2173 	vcvtsi2ss  op, %xmmN, %xmmX
2174 	vcvtsi2sd  op, %xmmN, %xmmX
2175 
2176    NB: We want to generate only a single vxorps to cover the whole
2177    function.  The LCM algorithm isn't appropriate here since it may
2178    place a vxorps inside the loop.  */
2179 
2180 static unsigned int
2181 remove_partial_avx_dependency (void)
2182 {
2183   timevar_push (TV_MACH_DEP);
2184 
2185   bitmap_obstack_initialize (NULL);
2186   bitmap convert_bbs = BITMAP_ALLOC (NULL);
2187 
2188   basic_block bb;
2189   rtx_insn *insn, *set_insn;
2190   rtx set;
2191   rtx v4sf_const0 = NULL_RTX;
2192 
2193   auto_vec<rtx_insn *> control_flow_insns;
2194 
2195   /* We create invalid RTL initially so defer rescans.  */
2196   df_set_flags (DF_DEFER_INSN_RESCAN);
2197 
2198   FOR_EACH_BB_FN (bb, cfun)
2199     {
2200       FOR_BB_INSNS (bb, insn)
2201 	{
2202 	  if (!NONDEBUG_INSN_P (insn))
2203 	    continue;
2204 
2205 	  set = single_set (insn);
2206 	  if (!set)
2207 	    continue;
2208 
2209 	  if (get_attr_avx_partial_xmm_update (insn)
2210 	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
2211 	    continue;
2212 
2213 	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
2214 	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
2215 	     round, to vec_dup and vec_merge with subreg.  */
2216 	  rtx src = SET_SRC (set);
2217 	  rtx dest = SET_DEST (set);
2218 	  machine_mode dest_mode = GET_MODE (dest);
2219 	  bool convert_p = false;
2220 	  switch (GET_CODE (src))
2221 	    {
2222 	    case FLOAT:
2223 	    case FLOAT_EXTEND:
2224 	    case FLOAT_TRUNCATE:
2225 	    case UNSIGNED_FLOAT:
2226 	      convert_p = true;
2227 	      break;
2228 	    default:
2229 	      break;
2230 	    }
2231 
2232 	  /* Only handle conversions here.  */
2233 	  machine_mode src_mode
2234 	    = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
2235 	  switch (src_mode)
2236 	    {
2237 	    case E_SFmode:
2238 	    case E_DFmode:
2239 	      if (TARGET_USE_VECTOR_FP_CONVERTS
2240 		  || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
2241 		continue;
2242 	      break;
2243 	    case E_SImode:
2244 	    case E_DImode:
2245 	      if (TARGET_USE_VECTOR_CONVERTS
2246 		  || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
2247 		continue;
2248 	      break;
2249 	    case E_VOIDmode:
2250 	      gcc_assert (!convert_p);
2251 	      break;
2252 	    default:
2253 	      gcc_unreachable ();
2254 	    }
2255 
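	  /* Allocate the shared all-zero V4SF register lazily; the single
	     vxorps that initializes it is emitted later at the nearest
	     common dominator of all converted blocks.  */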
2256 	  if (!v4sf_const0)
2257 	    v4sf_const0 = gen_reg_rtx (V4SFmode);
2258 
2259 	  rtx zero;
2260 	  machine_mode dest_vecmode;
2261 	  switch (dest_mode)
2262 	    {
2263 	    case E_HFmode:
2264 	      dest_vecmode = V8HFmode;
2265 	      zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
2266 	      break;
2267 	    case E_SFmode:
2268 	      dest_vecmode = V4SFmode;
2269 	      zero = v4sf_const0;
2270 	      break;
2271 	    case E_DFmode:
2272 	      dest_vecmode = V2DFmode;
2273 	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
2274 	      break;
2275 	    default:
2276 	      gcc_unreachable ();
2277 	    }
2278 
2279 	  /* Change source to vector mode.  */
2280 	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
2281 	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
2282 				   GEN_INT (HOST_WIDE_INT_1U));
2283 	  /* Change destination to vector mode.  */
2284 	  rtx vec = gen_reg_rtx (dest_vecmode);
2285 	  /* Generate an XMM vector SET.  */
2286 	  set = gen_rtx_SET (vec, src);
2287 	  set_insn = emit_insn_before (set, insn);
2288 	  df_insn_rescan (set_insn);
2289 
2290 	  if (cfun->can_throw_non_call_exceptions)
2291 	    {
2292 	      /* Handle REG_EH_REGION note.  */
2293 	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
2294 	      if (note)
2295 		{
2296 		  control_flow_insns.safe_push (set_insn);
2297 		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
2298 		}
2299 	    }
2300 
2301 	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
2302 	  set = gen_rtx_SET (dest, src);
2303 
2304 	  /* Drop possible dead definitions.  */
2305 	  PATTERN (insn) = set;
2306 
2307 	  INSN_CODE (insn) = -1;
2308 	  recog_memoized (insn);
2309 	  df_insn_rescan (insn);
2310 	  bitmap_set_bit (convert_bbs, bb->index);
2311 	}
2312     }
2313 
2314   if (v4sf_const0)
2315     {
2316       /* (Re-)discover loops so that bb->loop_father can be used in the
2317 	 analysis below.  */
2318       calculate_dominance_info (CDI_DOMINATORS);
2319       loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
2320 
2321       /* Generate a vxorps at entry of the nearest dominator for basic
2322 	 blocks with conversions, which is in the fake loop that
2323 	 contains the whole function, so that there is only a single
2324 	 vxorps in the whole function.   */
2325       bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
2326 					     convert_bbs);
2327       while (bb->loop_father->latch
2328 	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
2329 	bb = get_immediate_dominator (CDI_DOMINATORS,
2330 				      bb->loop_father->header);
2331 
2332       set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
2333 
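      /* Place the vxorps before the first non-debug insn of BB, or at the
	 end of BB if it contains no non-debug insns.  */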
2334       insn = BB_HEAD (bb);
2335       while (insn && !NONDEBUG_INSN_P (insn))
2336 	{
2337 	  if (insn == BB_END (bb))
2338 	    {
2339 	      insn = NULL;
2340 	      break;
2341 	    }
2342 	  insn = NEXT_INSN (insn);
2343 	}
2344       if (insn == BB_HEAD (bb))
2345         set_insn = emit_insn_before (set, insn);
2346       else
2347 	set_insn = emit_insn_after (set,
2348 				    insn ? PREV_INSN (insn) : BB_END (bb));
2349       df_insn_rescan (set_insn);
2350       loop_optimizer_finalize ();
2351 
2352       if (!control_flow_insns.is_empty ())
2353 	{
2354 	  free_dominance_info (CDI_DOMINATORS);
2355 
2356 	  unsigned int i;
2357 	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2358 	    if (control_flow_insn_p (insn))
2359 	      {
2360 		/* Split the block after insn.  There will be a fallthru
2361 		   edge, which is OK so we keep it.  We have to create
2362 		   the exception edges ourselves.  */
2363 		bb = BLOCK_FOR_INSN (insn);
2364 		split_block (bb, insn);
2365 		rtl_make_eh_edge (NULL, bb, BB_END (bb));
2366 	      }
2367 	}
2368     }
2369 
2370   df_process_deferred_rescans ();
2371   df_clear_flags (DF_DEFER_INSN_RESCAN);
2372   bitmap_obstack_release (NULL);
2373   BITMAP_FREE (convert_bbs);
2374 
2375   timevar_pop (TV_MACH_DEP);
2376   return 0;
2377 }
2378 
2379 namespace {
2380 
2381 const pass_data pass_data_remove_partial_avx_dependency =
2382 {
2383   RTL_PASS, /* type */
2384   "rpad", /* name */
2385   OPTGROUP_NONE, /* optinfo_flags */
2386   TV_MACH_DEP, /* tv_id */
2387   0, /* properties_required */
2388   0, /* properties_provided */
2389   0, /* properties_destroyed */
2390   0, /* todo_flags_start */
2391   0, /* todo_flags_finish */
2392 };
2393 
2394 class pass_remove_partial_avx_dependency : public rtl_opt_pass
2395 {
2396 public:
2397   pass_remove_partial_avx_dependency (gcc::context *ctxt)
2398     : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
2399   {}
2400 
2401   /* opt_pass methods: */
2402   virtual bool gate (function *)
2403     {
2404       return (TARGET_AVX
2405 	      && TARGET_SSE_PARTIAL_REG_DEPENDENCY
2406 	      && TARGET_SSE_MATH
2407 	      && optimize
2408 	      && optimize_function_for_speed_p (cfun));
2409     }
2410 
2411   virtual unsigned int execute (function *)
2412     {
2413       return remove_partial_avx_dependency ();
2414     }
2415 }; // class pass_rpad
2416 
2417 } // anon namespace
2418 
2419 rtl_opt_pass *
2420 make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
2421 {
2422   return new pass_remove_partial_avx_dependency (ctxt);
2423 }
2424 
2425 /* This compares the priority of target features in function DECL1
2426    and DECL2.  It returns positive value if DECL1 is higher priority,
2427    negative value if DECL2 is higher priority and 0 if they are the
2428    same.  */
2429 
2430 int
2431 ix86_compare_version_priority (tree decl1, tree decl2)
2432 {
2433   unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
2434   unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
2435 
2436   return (int)priority1 - (int)priority2;
2437 }
2438 
2439 /* V1 and V2 point to function versions with different priorities
2440    based on the target ISA.  This function compares their priorities.  */
2441 
2442 static int
2443 feature_compare (const void *v1, const void *v2)
2444 {
2445   typedef struct _function_version_info
2446     {
2447       tree version_decl;
2448       tree predicate_chain;
2449       unsigned int dispatch_priority;
2450     } function_version_info;
2451 
2452   const function_version_info c1 = *(const function_version_info *)v1;
2453   const function_version_info c2 = *(const function_version_info *)v2;
2454   return (c2.dispatch_priority - c1.dispatch_priority);
2455 }
2456 
2457 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
2458    to return a pointer to VERSION_DECL if the outcome of the expression
2459    formed by PREDICATE_CHAIN is true.  This function will be called during
2460    version dispatch to decide which function version to execute.  It returns
2461    the basic block at the end, to which more conditions can be added.  */
2462 
2463 static basic_block
2464 add_condition_to_bb (tree function_decl, tree version_decl,
2465 		     tree predicate_chain, basic_block new_bb)
2466 {
2467   gimple *return_stmt;
2468   tree convert_expr, result_var;
2469   gimple *convert_stmt;
2470   gimple *call_cond_stmt;
2471   gimple *if_else_stmt;
2472 
2473   basic_block bb1, bb2, bb3;
2474   edge e12, e23;
2475 
2476   tree cond_var, and_expr_var = NULL_TREE;
2477   gimple_seq gseq;
2478 
2479   tree predicate_decl, predicate_arg;
2480 
2481   push_cfun (DECL_STRUCT_FUNCTION (function_decl));
2482 
2483   gcc_assert (new_bb != NULL);
2484   gseq = bb_seq (new_bb);
2485 
2486 
2487   convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
2488 	     		 build_fold_addr_expr (version_decl));
2489   result_var = create_tmp_var (ptr_type_node);
2490   convert_stmt = gimple_build_assign (result_var, convert_expr);
2491   return_stmt = gimple_build_return (result_var);
2492 
2493   if (predicate_chain == NULL_TREE)
2494     {
2495       gimple_seq_add_stmt (&gseq, convert_stmt);
2496       gimple_seq_add_stmt (&gseq, return_stmt);
2497       set_bb_seq (new_bb, gseq);
2498       gimple_set_bb (convert_stmt, new_bb);
2499       gimple_set_bb (return_stmt, new_bb);
2500       pop_cfun ();
2501       return new_bb;
2502     }
2503 
2504   while (predicate_chain != NULL)
2505     {
2506       cond_var = create_tmp_var (integer_type_node);
2507       predicate_decl = TREE_PURPOSE (predicate_chain);
2508       predicate_arg = TREE_VALUE (predicate_chain);
2509       call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
2510       gimple_call_set_lhs (call_cond_stmt, cond_var);
2511 
2512       gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
2513       gimple_set_bb (call_cond_stmt, new_bb);
2514       gimple_seq_add_stmt (&gseq, call_cond_stmt);
2515 
2516       predicate_chain = TREE_CHAIN (predicate_chain);
2517 
2518       if (and_expr_var == NULL)
2519         and_expr_var = cond_var;
2520       else
2521 	{
2522 	  gimple *assign_stmt;
2523 	  /* Use MIN_EXPR to check if any integer is zero:
2524 	     and_expr_var = min_expr <cond_var, and_expr_var>  */
2525 	  assign_stmt = gimple_build_assign (and_expr_var,
2526 			  build2 (MIN_EXPR, integer_type_node,
2527 				  cond_var, and_expr_var));
2528 
2529 	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
2530 	  gimple_set_bb (assign_stmt, new_bb);
2531 	  gimple_seq_add_stmt (&gseq, assign_stmt);
2532 	}
2533     }
2534 
2535   if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
2536 	  		            integer_zero_node,
2537 				    NULL_TREE, NULL_TREE);
2538   gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
2539   gimple_set_bb (if_else_stmt, new_bb);
2540   gimple_seq_add_stmt (&gseq, if_else_stmt);
2541 
2542   gimple_seq_add_stmt (&gseq, convert_stmt);
2543   gimple_seq_add_stmt (&gseq, return_stmt);
2544   set_bb_seq (new_bb, gseq);
2545 
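  /* Split NEW_BB so that the predicate test branches to the block returning
     VERSION_DECL on the true edge, and continues to BB3 (returned to the
     caller for further conditions) on the false edge.  */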
2546   bb1 = new_bb;
2547   e12 = split_block (bb1, if_else_stmt);
2548   bb2 = e12->dest;
2549   e12->flags &= ~EDGE_FALLTHRU;
2550   e12->flags |= EDGE_TRUE_VALUE;
2551 
2552   e23 = split_block (bb2, return_stmt);
2553 
2554   gimple_set_bb (convert_stmt, bb2);
2555   gimple_set_bb (return_stmt, bb2);
2556 
2557   bb3 = e23->dest;
2558   make_edge (bb1, bb3, EDGE_FALSE_VALUE);
2559 
2560   remove_edge (e23);
2561   make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
2562 
2563   pop_cfun ();
2564 
2565   return bb3;
2566 }
2567 
2568 /* This function generates the dispatch function for
2569    multi-versioned functions.  DISPATCH_DECL is the function which will
2570    contain the dispatch logic.  FNDECLS are the function choices for
2571    dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
2572    in DISPATCH_DECL in which the dispatch code is generated.  */
2573 
2574 static int
2575 dispatch_function_versions (tree dispatch_decl,
2576 			    void *fndecls_p,
2577 			    basic_block *empty_bb)
2578 {
2579   tree default_decl;
2580   gimple *ifunc_cpu_init_stmt;
2581   gimple_seq gseq;
2582   int ix;
2583   tree ele;
2584   vec<tree> *fndecls;
2585   unsigned int num_versions = 0;
2586   unsigned int actual_versions = 0;
2587   unsigned int i;
2588 
2589   struct _function_version_info
2590     {
2591       tree version_decl;
2592       tree predicate_chain;
2593       unsigned int dispatch_priority;
2594     }*function_version_info;
2595 
2596   gcc_assert (dispatch_decl != NULL
2597 	      && fndecls_p != NULL
2598 	      && empty_bb != NULL);
2599 
2600   /* fndecls_p is actually a vector.  */
2601   fndecls = static_cast<vec<tree> *> (fndecls_p);
2602 
2603   /* At least one more version other than the default.  */
2604   num_versions = fndecls->length ();
2605   gcc_assert (num_versions >= 2);
2606 
2607   function_version_info = (struct _function_version_info *)
2608     XNEWVEC (struct _function_version_info, (num_versions - 1));
2609 
2610   /* The first version in the vector is the default decl.  */
2611   default_decl = (*fndecls)[0];
2612 
2613   push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
2614 
2615   gseq = bb_seq (*empty_bb);
2616   /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
2617      constructors, so explicitly call __builtin_cpu_init here.  */
2618   ifunc_cpu_init_stmt
2619     = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
2620   gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
2621   gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
2622   set_bb_seq (*empty_bb, gseq);
2623 
2624   pop_cfun ();
2625 
2626 
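  /* Collect each non-default version together with its predicate chain and
     dispatch priority; versions without a predicate chain are skipped.  */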
2627   for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
2628     {
2629       tree version_decl = ele;
2630       tree predicate_chain = NULL_TREE;
2631       unsigned int priority;
2632       /* Get attribute string, parse it and find the right predicate decl.
2633          The predicate function could be a lengthy combination of many
2634 	 features, like arch-type and various isa-variants.  */
2635       priority = get_builtin_code_for_version (version_decl,
2636 	 			               &predicate_chain);
2637 
2638       if (predicate_chain == NULL_TREE)
2639 	continue;
2640 
2641       function_version_info [actual_versions].version_decl = version_decl;
2642       function_version_info [actual_versions].predicate_chain
2643 	 = predicate_chain;
2644       function_version_info [actual_versions].dispatch_priority = priority;
2645       actual_versions++;
2646     }
2647 
2648   /* Sort the versions according to descending order of dispatch priority.
2649      The priority is based on the ISA.  This is not a perfect solution.
2650      There could still be ambiguity.  If more than one function version is
2651      suitable to execute, which one should be dispatched?  In the future,
2652      allow the user to specify a dispatch priority next to the version.  */
2653   qsort (function_version_info, actual_versions,
2654          sizeof (struct _function_version_info), feature_compare);
2655 
2656   for (i = 0; i < actual_versions; ++i)
2657     *empty_bb = add_condition_to_bb (dispatch_decl,
2658 				     function_version_info[i].version_decl,
2659 				     function_version_info[i].predicate_chain,
2660 				     *empty_bb);
2661 
2662   /* Dispatch the default version at the end.  */
2663   *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
2664 				   NULL, *empty_bb);
2665 
2666   free (function_version_info);
2667   return 0;
2668 }
2669 
2670 /* This function changes the assembler name for functions that are
2671    versions.  If DECL is a function version and has a "target"
2672    attribute, it appends the attribute string to its assembler name.  */
2673 
2674 static tree
2675 ix86_mangle_function_version_assembler_name (tree decl, tree id)
2676 {
2677   tree version_attr;
2678   const char *orig_name, *version_string;
2679   char *attr_str, *assembler_name;
2680 
2681   if (DECL_DECLARED_INLINE_P (decl)
2682       && lookup_attribute ("gnu_inline",
2683 			   DECL_ATTRIBUTES (decl)))
2684     error_at (DECL_SOURCE_LOCATION (decl),
2685 	      "function versions cannot be marked as %<gnu_inline%>,"
2686 	      " bodies have to be generated");
2687 
2688   if (DECL_VIRTUAL_P (decl)
2689       || DECL_VINDEX (decl))
2690     sorry ("virtual function multiversioning not supported");
2691 
2692   version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
2693 
2694   /* target attribute string cannot be NULL.  */
2695   gcc_assert (version_attr != NULL_TREE);
2696 
2697   orig_name = IDENTIFIER_POINTER (id);
2698   version_string
2699     = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
2700 
2701   if (strcmp (version_string, "default") == 0)
2702     return id;
2703 
2704   attr_str = sorted_attr_string (TREE_VALUE (version_attr));
2705   assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
2706 
2707   sprintf (assembler_name, "%s.%s", orig_name, attr_str);
2708 
2709   /* Allow assembler name to be modified if already set.  */
2710   if (DECL_ASSEMBLER_NAME_SET_P (decl))
2711     SET_DECL_RTL (decl, NULL);
2712 
2713   tree ret = get_identifier (assembler_name);
2714   XDELETEVEC (attr_str);
2715   XDELETEVEC (assembler_name);
2716   return ret;
2717 }
2718 
2719 tree
2720 ix86_mangle_decl_assembler_name (tree decl, tree id)
2721 {
2722   /* For function version, add the target suffix to the assembler name.  */
2723   if (TREE_CODE (decl) == FUNCTION_DECL
2724       && DECL_FUNCTION_VERSIONED (decl))
2725     id = ix86_mangle_function_version_assembler_name (decl, id);
2726 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
2727   id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
2728 #endif
2729 
2730   return id;
2731 }
2732 
2733 /* Make a dispatcher declaration for the multi-versioned function DECL.
2734    Calls to DECL function will be replaced with calls to the dispatcher
2735    by the front-end.  Returns the decl of the dispatcher function.  */
2736 
2737 tree
2738 ix86_get_function_versions_dispatcher (void *decl)
2739 {
2740   tree fn = (tree) decl;
2741   struct cgraph_node *node = NULL;
2742   struct cgraph_node *default_node = NULL;
2743   struct cgraph_function_version_info *node_v = NULL;
2744   struct cgraph_function_version_info *first_v = NULL;
2745 
2746   tree dispatch_decl = NULL;
2747 
2748   struct cgraph_function_version_info *default_version_info = NULL;
2749 
2750   gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
2751 
2752   node = cgraph_node::get (fn);
2753   gcc_assert (node != NULL);
2754 
2755   node_v = node->function_version ();
2756   gcc_assert (node_v != NULL);
2757 
2758   if (node_v->dispatcher_resolver != NULL)
2759     return node_v->dispatcher_resolver;
2760 
2761   /* Find the default version and make it the first node.  */
2762   first_v = node_v;
2763   /* Go to the beginning of the chain.  */
2764   while (first_v->prev != NULL)
2765     first_v = first_v->prev;
2766   default_version_info = first_v;
2767   while (default_version_info != NULL)
2768     {
2769       if (is_function_default_version
2770 	    (default_version_info->this_node->decl))
2771         break;
2772       default_version_info = default_version_info->next;
2773     }
2774 
2775   /* If there is no default node, just return NULL.  */
2776   if (default_version_info == NULL)
2777     return NULL;
2778 
2779   /* Make default info the first node.  */
2780   if (first_v != default_version_info)
2781     {
2782       default_version_info->prev->next = default_version_info->next;
2783       if (default_version_info->next)
2784         default_version_info->next->prev = default_version_info->prev;
2785       first_v->prev = default_version_info;
2786       default_version_info->next = first_v;
2787       default_version_info->prev = NULL;
2788     }
2789 
2790   default_node = default_version_info->this_node;
2791 
2792 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
2793   if (targetm.has_ifunc_p ())
2794     {
2795       struct cgraph_function_version_info *it_v = NULL;
2796       struct cgraph_node *dispatcher_node = NULL;
2797       struct cgraph_function_version_info *dispatcher_version_info = NULL;
2798 
2799       /* Right now, the dispatching is done via ifunc.  */
2800       dispatch_decl = make_dispatcher_decl (default_node->decl);
2801 
2802       dispatcher_node = cgraph_node::get_create (dispatch_decl);
2803       gcc_assert (dispatcher_node != NULL);
2804       dispatcher_node->dispatcher_function = 1;
2805       dispatcher_version_info
2806 	= dispatcher_node->insert_new_function_version ();
2807       dispatcher_version_info->next = default_version_info;
2808       dispatcher_node->definition = 1;
2809 
2810       /* Set the dispatcher for all the versions.  */
2811       it_v = default_version_info;
2812       while (it_v != NULL)
2813 	{
2814 	  it_v->dispatcher_resolver = dispatch_decl;
2815 	  it_v = it_v->next;
2816 	}
2817     }
2818   else
2819 #endif
2820     {
2821       error_at (DECL_SOURCE_LOCATION (default_node->decl),
2822 		"multiversioning needs %<ifunc%> which is not supported "
2823 		"on this target");
2824     }
2825 
2826   return dispatch_decl;
2827 }
2828 
2829 /* Make the resolver function decl to dispatch the versions of
2830    a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is the
2831    ifunc alias that will point to the created resolver.  Create an
2832    empty basic block in the resolver and store the pointer in
2833    EMPTY_BB.  Return the decl of the resolver function.  */
2834 
2835 static tree
2836 make_resolver_func (const tree default_decl,
2837 		    const tree ifunc_alias_decl,
2838 		    basic_block *empty_bb)
2839 {
2840   tree decl, type, t;
2841 
2842   /* Create resolver function name based on default_decl.  */
2843   tree decl_name = clone_function_name (default_decl, "resolver");
2844   const char *resolver_name = IDENTIFIER_POINTER (decl_name);
2845 
2846   /* The resolver function should return a (void *). */
2847   type = build_function_type_list (ptr_type_node, NULL_TREE);
2848 
2849   decl = build_fn_decl (resolver_name, type);
2850   SET_DECL_ASSEMBLER_NAME (decl, decl_name);
2851 
2852   DECL_NAME (decl) = decl_name;
2853   TREE_USED (decl) = 1;
2854   DECL_ARTIFICIAL (decl) = 1;
2855   DECL_IGNORED_P (decl) = 1;
2856   TREE_PUBLIC (decl) = 0;
2857   DECL_UNINLINABLE (decl) = 1;
2858 
2859   /* Resolver is not external, body is generated.  */
2860   DECL_EXTERNAL (decl) = 0;
2861   DECL_EXTERNAL (ifunc_alias_decl) = 0;
2862 
2863   DECL_CONTEXT (decl) = NULL_TREE;
2864   DECL_INITIAL (decl) = make_node (BLOCK);
2865   DECL_STATIC_CONSTRUCTOR (decl) = 0;
2866 
2867   if (DECL_COMDAT_GROUP (default_decl)
2868       || TREE_PUBLIC (default_decl))
2869     {
2870       /* In this case, each translation unit with a call to this
2871 	 versioned function will put out a resolver.  Ensure it
2872 	 is comdat to keep just one copy.  */
2873       DECL_COMDAT (decl) = 1;
2874       make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
2875     }
2876   else
2877     TREE_PUBLIC (ifunc_alias_decl) = 0;
2878 
2879   /* Build result decl and add to function_decl. */
2880   t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
2881   DECL_CONTEXT (t) = decl;
2882   DECL_ARTIFICIAL (t) = 1;
2883   DECL_IGNORED_P (t) = 1;
2884   DECL_RESULT (decl) = t;
2885 
2886   gimplify_function_tree (decl);
2887   push_cfun (DECL_STRUCT_FUNCTION (decl));
2888   *empty_bb = init_lowered_empty_function (decl, false,
2889 					   profile_count::uninitialized ());
2890 
2891   cgraph_node::add_new_function (decl, true);
2892   symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
2893 
2894   pop_cfun ();
2895 
2896   gcc_assert (ifunc_alias_decl != NULL);
2897   /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
2898   DECL_ATTRIBUTES (ifunc_alias_decl)
2899     = make_attribute ("ifunc", resolver_name,
2900 		      DECL_ATTRIBUTES (ifunc_alias_decl));
2901 
2902   /* Create the alias for dispatch to resolver here.  */
2903   cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
2904   return decl;
2905 }
2906 
2907 /* Generate the dispatching code body to dispatch multi-versioned function
2908    DECL.  The target hook is called to process the "target" attributes and
2909    provide the code to dispatch the right function at run-time.  NODE points
2910    to the dispatcher decl whose body will be created.  */
2911 
2912 tree
2913 ix86_generate_version_dispatcher_body (void *node_p)
2914 {
2915   tree resolver_decl;
2916   basic_block empty_bb;
2917   tree default_ver_decl;
2918   struct cgraph_node *versn;
2919   struct cgraph_node *node;
2920 
2921   struct cgraph_function_version_info *node_version_info = NULL;
2922   struct cgraph_function_version_info *versn_info = NULL;
2923 
2924   node = (cgraph_node *)node_p;
2925 
2926   node_version_info = node->function_version ();
2927   gcc_assert (node->dispatcher_function
2928 	      && node_version_info != NULL);
2929 
2930   if (node_version_info->dispatcher_resolver)
2931     return node_version_info->dispatcher_resolver;
2932 
2933   /* The first version in the chain corresponds to the default version.  */
2934   default_ver_decl = node_version_info->next->this_node->decl;
2935 
2936   /* node is going to be an alias, so remove the finalized bit.  */
2937   node->definition = false;
2938 
2939   resolver_decl = make_resolver_func (default_ver_decl,
2940 				      node->decl, &empty_bb);
2941 
2942   node_version_info->dispatcher_resolver = resolver_decl;
2943 
2944   push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
2945 
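  /* Collect the decls of all function versions (the chain starts after the
     dispatcher node itself) to be passed to dispatch_function_versions.  */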
2946   auto_vec<tree, 2> fn_ver_vec;
2947 
2948   for (versn_info = node_version_info->next; versn_info;
2949        versn_info = versn_info->next)
2950     {
2951       versn = versn_info->this_node;
2952       /* Check for virtual functions here again, as by this time it should
2953 	 have been determined if this function needs a vtable index or
2954 	 not.  This happens for methods in derived classes that override
2955 	 virtual methods in base classes but are not explicitly marked as
2956 	 virtual.  */
2957       if (DECL_VINDEX (versn->decl))
2958 	sorry ("virtual function multiversioning not supported");
2959 
2960       fn_ver_vec.safe_push (versn->decl);
2961     }
2962 
2963   dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
2964   cgraph_edge::rebuild_edges ();
2965   pop_cfun ();
2966   return resolver_decl;
2967 }
2968 
2969 
2970