/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* Currently, the only mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, experiments showed that the transformation is not
   always useful if there are only two divisions by the same divisor.
   This is probably because modern processors can pipeline the
   divisions; on older, in-order processors it should still be
   effective to optimize two divisions by the same number.  We make
   this a param, and it will be called N in the remainder of this
   comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, we can instead insert
   divisions elsewhere, in blocks that are common dominators of those
   that contain the division.)

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has fewer than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and their common dominators, and
   walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "alloc-pool.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "gimple-pretty-print.h"
#include "alias.h"
#include "fold-const.h"
#include "gimple-fold.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "builtins.h"
#include "internal-fn.h"
#include "case-cfn-macros.h"
#include "optabs-libfuncs.h"
#include "tree-eh.h"
#include "targhooks.h"
#include "domwalk.h"

/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb;

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def;

  /* If non-NULL, the SSA_NAME holding the definition for a squared
     reciprocal inserted in BB.  */
  tree square_recip_def;

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple *recip_def_stmt;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children;

  /* Pointer to the next "struct occurrence" in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division;
};

static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;

  /* Number of divmod calls inserted.  */
  int divmod_calls_inserted;
} widen_mul_stats;

/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static object_allocator<occurrence> *occ_pool;



/* Allocate and return a new struct occurrence for basic block BB,
   whose children list is headed by CHILDREN.  */
static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *occ;

  bb->aux = occ = occ_pool->allocate ();
  memset (occ, 0, sizeof (struct occurrence));

  occ->bb = bb;
  occ->children = children;
  return occ;
}


/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
   list of "struct occurrence"s, one per basic block, having IDOM as
   their common dominator.

   We try to insert NEW_OCC as deep as possible in the tree, and we also
   insert any other block that is a common dominator for BB and one
   block already in the tree.  */
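
/* For example, if divisions occur in blocks B4 and B5 whose nearest
   common dominator is B2, insert_bb records an occurrence for B2 with
   B4 and B5 as children, even if B2 itself contains no division.
   (An illustration; the block numbers are arbitrary.)  */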
219 
static void
insert_bb (struct occurrence *new_occ, basic_block idom,
	   struct occurrence **p_head)
{
  struct occurrence *occ, **p_occ;

  for (p_occ = p_head; (occ = *p_occ) != NULL; )
    {
      basic_block bb = new_occ->bb, occ_bb = occ->bb;
      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
      if (dom == bb)
	{
	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
	     from its list.  */
	  *p_occ = occ->next;
	  occ->next = new_occ->children;
	  new_occ->children = occ;

	  /* Try the next block (it may also be dominated by BB).  */
	}

      else if (dom == occ_bb)
	{
	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
	  insert_bb (new_occ, dom, &occ->children);
	  return;
	}

      else if (dom != idom)
	{
	  gcc_assert (!dom->aux);

	  /* There is a dominator between IDOM and BB, add it and make
	     two children out of NEW_OCC and OCC.  First, remove OCC from
	     its list.  */
	  *p_occ = occ->next;
	  new_occ->next = occ;
	  occ->next = NULL;

	  /* None of the previous blocks has DOM as a dominator: if we tail
	     recursed, we would reexamine them uselessly.  Just switch BB with
	     DOM, and go on looking for blocks dominated by DOM.  */
	  new_occ = occ_new (dom, new_occ);
	}

      else
	{
	  /* Nothing special, go on with the next element.  */
	  p_occ = &occ->next;
	}
    }

  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
  new_occ->next = *p_head;
  *p_head = new_occ;
}

/* Register that we found a division in BB.
   IMPORTANCE is a measure of how much weighting to give
   that division.  Use IMPORTANCE = 2 to register a single
   division.  If the division is going to be found multiple
   times use 1 (as it is with squares).  */

static inline void
register_division_in (basic_block bb, int importance)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = occ_new (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions += importance;
}


/* Compute the number of divisions that postdominate each block in OCC and
   its children.  */

static void
compute_merit (struct occurrence *occ)
{
  struct occurrence *occ_child;
  basic_block dom = occ->bb;

  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    {
      basic_block bb;
      if (occ_child->children)
	compute_merit (occ_child);

      if (flag_exceptions)
	bb = single_noncomplex_succ (dom);
      else
	bb = dom;

      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
	occ->num_divisions += occ_child->num_divisions;
    }
}


/* Return whether USE_STMT is a floating-point division by DEF.  */
static inline bool
is_division_by (gimple *use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as valid division, as we are getting
	    confused later by replacing all immediate uses x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def
	 && !stmt_can_throw_internal (cfun, use_stmt);
}

/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
static inline bool
is_mult_by (gimple *use_stmt, tree def, tree a)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
    {
      tree op0 = gimple_assign_rhs1 (use_stmt);
      tree op1 = gimple_assign_rhs2 (use_stmt);

      return (op0 == def && op1 == a)
	      || (op0 == a && op1 == def);
    }
  return false;
}

/* Return whether USE_STMT is DEF * DEF.  */
static inline bool
is_square_of (gimple *use_stmt, tree def)
{
  return is_mult_by (use_stmt, def, def);
}

/* Return whether USE_STMT is a floating-point division by
   DEF * DEF.  */
static inline bool
is_division_by_square (gimple *use_stmt, tree def)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
      && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
      && !stmt_can_throw_internal (cfun, use_stmt))
    {
      tree denominator = gimple_assign_rhs2 (use_stmt);
      if (TREE_CODE (denominator) == SSA_NAME)
	return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
    }
  return false;
}

/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_GSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.

   If SHOULD_INSERT_SQUARE_RECIP is set, then this also inserts
   the square of the reciprocal immediately after the definition
   of the reciprocal.  */

static void
insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
		    tree def, tree recip_def, tree square_recip_def,
		    int should_insert_square_recip, int threshold)
{
  tree type;
  gassign *new_stmt, *new_square_stmt;
  gimple_stmt_iterator gsi;
  struct occurrence *occ_child;

  if (!recip_def
      && (occ->bb_has_division || !flag_trapping_math)
      /* Divide by two as all divisions are counted twice in
	 the costing loop.  */
      && occ->num_divisions / 2 >= threshold)
    {
      /* Make a variable with the replacement and substitute it.  */
      type = TREE_TYPE (def);
      recip_def = create_tmp_reg (type, "reciptmp");
      new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
				      build_one_cst (type), def);

      if (should_insert_square_recip)
	{
	  square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
	  new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
						 recip_def, recip_def);
	}

      if (occ->bb_has_division)
	{
	  /* Case 1: insert before an existing division.  */
	  gsi = gsi_after_labels (occ->bb);
	  while (!gsi_end_p (gsi)
		 && (!is_division_by (gsi_stmt (gsi), def))
		 && (!is_division_by_square (gsi_stmt (gsi), def)))
	    gsi_next (&gsi);

	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}
      else if (def_gsi && occ->bb == def_gsi->bb)
	{
	  /* Case 2: insert right after the definition.  Note that this will
	     never happen if the definition statement can throw, because in
	     that case the sole successor of the statement's basic block will
	     dominate all the uses as well.  */
	  gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
	}
      else
	{
	  /* Case 3: insert in a basic block not containing defs/uses.  */
	  gsi = gsi_after_labels (occ->bb);
	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}

      reciprocal_stats.rdivs_inserted++;

      occ->recip_def_stmt = new_stmt;
    }

  occ->recip_def = recip_def;
  occ->square_recip_def = square_recip_def;
  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    insert_reciprocals (def_gsi, occ_child, def, recip_def,
			square_recip_def, should_insert_square_recip,
			threshold);
}

/* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
   USE_P is the use of (x * x) in such a division.  */
static inline void
replace_reciprocal_squares (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
      && occ->recip_def)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
      SET_USE (use_p, occ->square_recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}


/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */

static inline void
replace_reciprocal (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb)
      && occ->recip_def && use_stmt != occ->recip_def_stmt)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      SET_USE (use_p, occ->recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}


/* Free OCC and return one more "struct occurrence" to be freed.  */

static struct occurrence *
free_bb (struct occurrence *occ)
{
  struct occurrence *child, *next;

  /* First get the two pointers hanging off OCC.  */
  next = occ->next;
  child = occ->children;
  occ->bb->aux = NULL;
  occ_pool->remove (occ);

  /* Now ensure that we don't recurse unless it is necessary.  */
  if (!child)
    return next;
  else
    {
      while (next)
	next = free_bb (next);

      return child;
    }
}

/* Transform sequences like
   t = sqrt (a)
   x = 1.0 / t;
   r1 = x * x;
   r2 = a * x;
   into:
   t = sqrt (a)
   r1 = 1.0 / a;
   r2 = t;
   x = r1 * r2;
   depending on the uses of x, r1, r2.  This removes one multiplication and
   allows the sqrt and division operations to execute in parallel.
   DEF_GSI is the gsi of the initial division by sqrt that defines
   DEF (x in the example above).  */

static void
optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
{
  gimple *use_stmt;
  imm_use_iterator use_iter;
  gimple *stmt = gsi_stmt (*def_gsi);
  tree x = def;
  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
  tree div_rhs1 = gimple_assign_rhs1 (stmt);

  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
      || TREE_CODE (div_rhs1) != REAL_CST
      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
    return;

  gcall *sqrt_stmt
    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));

  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
    return;

  switch (gimple_call_combined_fn (sqrt_stmt))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      break;

    default:
      return;
    }
  tree a = gimple_call_arg (sqrt_stmt, 0);

  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */

  /* Statements that use x in x * x.  */
  auto_vec<gimple *> sqr_stmts;
  /* Statements that use x in a * x.  */
  auto_vec<gimple *> mult_stmts;
  bool has_other_use = false;
  bool mult_on_main_path = false;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
    {
      if (is_gimple_debug (use_stmt))
	continue;
      if (is_square_of (use_stmt, x))
	{
	  sqr_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else if (is_mult_by (use_stmt, x, a))
	{
	  mult_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else
	has_other_use = true;
    }

  /* In the x * x and a * x cases we just rewire stmt operands or
     remove multiplications.  In the has_other_use case we introduce
     a multiplication so make sure we don't introduce a multiplication
     on a path where there was none.  */
  if (has_other_use && !mult_on_main_path)
    return;

  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
    return;

  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
     to be able to compose it from the sqr and mult cases.  */
  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
    return;

  if (dump_file)
    {
      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
      fprintf (dump_file, "\n");
    }

  bool delete_div = !has_other_use;
  tree sqr_ssa_name = NULL_TREE;
  if (!sqr_stmts.is_empty ())
    {
      /* r1 = x * x.  Transform the original
	 x = 1.0 / t
	 into
	 tmp1 = 1.0 / a
	 r1 = tmp1.  */

      sqr_ssa_name
	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");

      if (dump_file)
	{
	  fprintf (dump_file, "Replacing original division\n");
	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
	  fprintf (dump_file, "with new division\n");
	}
      stmt
	= gimple_build_assign (sqr_ssa_name, gimple_assign_rhs_code (stmt),
			       gimple_assign_rhs1 (stmt), a);
      gsi_insert_before (def_gsi, stmt, GSI_SAME_STMT);
      gsi_remove (def_gsi, true);
      *def_gsi = gsi_for_stmt (stmt);
      fold_stmt_inplace (def_gsi);
      update_stmt (stmt);

      if (dump_file)
	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);

      delete_div = false;
      gimple *sqr_stmt;
      unsigned int i;
      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
	  update_stmt (sqr_stmt);
	}
    }
  if (!mult_stmts.is_empty ())
    {
      /* r2 = a * x.  Transform this into:
	 r2 = t (the original sqrt (a)).  */
      unsigned int i;
      gimple *mult_stmt = NULL;
      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);

	  if (dump_file)
	    {
	      fprintf (dump_file, "Replacing multiplication\n");
	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	      fprintf (dump_file, "with assignment\n");
	    }
	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
	  fold_stmt_inplace (&gsi2);
	  update_stmt (mult_stmt);
	  if (dump_file)
	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	}
    }

  if (has_other_use)
    {
      /* Using the two temporaries tmp1, tmp2 from above
	 the original x is now:
	 x = tmp1 * tmp2.  */
      gcc_assert (orig_sqrt_ssa_name);
      gcc_assert (sqr_ssa_name);

      gimple *new_stmt
	= gimple_build_assign (x, MULT_EXPR,
			       orig_sqrt_ssa_name, sqr_ssa_name);
      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
      update_stmt (stmt);
    }
  else if (delete_div)
    {
      /* Remove the original division.  */
      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
      gsi_remove (&gsi2, true);
      release_defs (stmt);
    }
  else
    release_ssa_name (x);
}

/* Look for floating-point divisions among DEF's uses, and try to
   replace them by multiplications with the reciprocal.  Add
   as many statements computing the reciprocal as needed.

   DEF must be a GIMPLE register of a floating-point type.  */

static void
execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
{
  use_operand_p use_p, square_use_p;
  imm_use_iterator use_iter, square_use_iter;
  tree square_def;
  struct occurrence *occ;
  int count = 0;
  int threshold;
  int square_recip_count = 0;
  int sqrt_recip_count = 0;

  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));

  /* If DEF is a square (x * x), count the number of divisions by x.
     If there are more divisions by x than by (DEF * DEF), prefer to optimize
     the reciprocal of x instead of DEF.  This improves cases like:
       def = x * x
       t0 = a / def
       t1 = b / def
       t2 = c / x
     Reciprocal optimization of x results in 1 division rather than 2 or 3.  */
  gimple *def_stmt = SSA_NAME_DEF_STMT (def);

  if (is_gimple_assign (def_stmt)
      && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
      && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
      && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
    {
      tree op0 = gimple_assign_rhs1 (def_stmt);

      FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_division_by (use_stmt, op0))
	    sqrt_recip_count++;
	}
    }

  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_division_by (use_stmt, def))
	{
	  register_division_in (gimple_bb (use_stmt), 2);
	  count++;
	}

      if (is_square_of (use_stmt, def))
	{
	  square_def = gimple_assign_lhs (use_stmt);
	  FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
	    {
	      gimple *square_use_stmt = USE_STMT (square_use_p);
	      if (is_division_by (square_use_stmt, square_def))
		{
		  /* This is executed twice for each division by a square.  */
		  register_division_in (gimple_bb (square_use_stmt), 1);
		  square_recip_count++;
		}
	    }
	}
    }

  /* Square reciprocals were counted twice above.  */
  square_recip_count /= 2;

  /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x).  */
  if (sqrt_recip_count > square_recip_count)
    goto out;

  /* Do the expensive part only if we can hope to optimize something.  */
  if (count + square_recip_count >= threshold && count >= 1)
    {
      gimple *use_stmt;
      for (occ = occ_head; occ; occ = occ->next)
	{
	  compute_merit (occ);
	  insert_reciprocals (def_gsi, occ, def, NULL, NULL,
			      square_recip_count, threshold);
	}

      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
	{
	  if (is_division_by (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		replace_reciprocal (use_p);
	    }
	  else if (square_recip_count > 0 && is_square_of (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		{
		  /* Find all uses of the square that are divisions and
		     replace them by multiplications with the inverse.  */
		  imm_use_iterator square_iterator;
		  gimple *powmult_use_stmt = USE_STMT (use_p);
		  tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);

		  FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
					 square_iterator, powmult_def_name)
		    FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
		      {
			gimple *powmult_use_stmt = USE_STMT (square_use_p);
			if (is_division_by (powmult_use_stmt, powmult_def_name))
			  replace_reciprocal_squares (square_use_p);
		      }
		}
	    }
	}
    }

out:
  for (occ = occ_head; occ; )
    occ = free_bb (occ);

  occ_head = NULL;
}

/* Return an internal function that implements the reciprocal of CALL,
   or IFN_LAST if there is no such function that the target supports.  */

internal_fn
internal_fn_reciprocal (gcall *call)
{
  internal_fn ifn;

  switch (gimple_call_combined_fn (call))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      ifn = IFN_RSQRT;
      break;

    default:
      return IFN_LAST;
    }

  tree_pair types = direct_internal_fn_types (ifn, call);
  if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
    return IFN_LAST;

  return ifn;
}

/* Go through all the floating-point SSA_NAMEs, and call
   execute_cse_reciprocals_1 on each of them.  */
namespace {

const pass_data pass_data_cse_reciprocals =
{
  GIMPLE_PASS, /* type */
  "recip", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_RECIP, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_cse_reciprocals : public gimple_opt_pass
{
public:
  pass_cse_reciprocals (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
  virtual unsigned int execute (function *);

}; // class pass_cse_reciprocals

unsigned int
pass_cse_reciprocals::execute (function *fun)
{
  basic_block bb;
  tree arg;

  occ_pool = new object_allocator<occurrence> ("dominators for recip");

  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_POST_DOMINATORS);

  if (flag_checking)
    FOR_EACH_BB_FN (bb, fun)
      gcc_assert (!bb->aux);

  for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
    if (FLOAT_TYPE_P (TREE_TYPE (arg))
	&& is_gimple_reg (arg))
      {
	tree name = ssa_default_def (fun, arg);
	if (name)
	  execute_cse_reciprocals_1 (NULL, name);
      }

  FOR_EACH_BB_FN (bb, fun)
    {
      tree def;

      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  def = PHI_RESULT (phi);
	  if (! virtual_operand_p (def)
	      && FLOAT_TYPE_P (TREE_TYPE (def)))
	    execute_cse_reciprocals_1 (NULL, def);
	}

      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_has_lhs (stmt)
	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
	      && FLOAT_TYPE_P (TREE_TYPE (def))
	      && TREE_CODE (def) == SSA_NAME)
	    {
	      execute_cse_reciprocals_1 (&gsi, def);
	      stmt = gsi_stmt (gsi);
	      if (flag_unsafe_math_optimizations
		  && is_gimple_assign (stmt)
		  && gimple_assign_lhs (stmt) == def
		  && !stmt_can_throw_internal (cfun, stmt)
		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
		optimize_recip_sqrt (&gsi, def);
	    }
	}

      if (optimize_bb_for_size_p (bb))
	continue;

      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
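      /* For example, on a target that supports IFN_RSQRT this rewrites
	   t_1 = sqrt (b_2);
	   x_3 = a_4 / t_1;
	 into
	   t_1 = .RSQRT (b_2);
	   x_3 = a_4 * t_1;
	 (a sketch: SSA names are illustrative, and the replacement may be
	 a target-specific builtin rather than an internal function).  */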
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (is_gimple_assign (stmt)
	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
	    {
	      tree arg1 = gimple_assign_rhs2 (stmt);
	      gimple *stmt1;

	      if (TREE_CODE (arg1) != SSA_NAME)
		continue;

	      stmt1 = SSA_NAME_DEF_STMT (arg1);

	      if (is_gimple_call (stmt1)
		  && gimple_call_lhs (stmt1))
		{
		  bool fail;
		  imm_use_iterator ui;
		  use_operand_p use_p;
		  tree fndecl = NULL_TREE;

		  gcall *call = as_a <gcall *> (stmt1);
		  internal_fn ifn = internal_fn_reciprocal (call);
		  if (ifn == IFN_LAST)
		    {
		      fndecl = gimple_call_fndecl (call);
		      if (!fndecl
			  || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
			continue;
		      fndecl = targetm.builtin_reciprocal (fndecl);
		      if (!fndecl)
			continue;
		    }

		  /* Check that all uses of the SSA name are divisions,
		     otherwise replacing the defining statement will do
		     the wrong thing.  */
		  fail = false;
		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
		    {
		      gimple *stmt2 = USE_STMT (use_p);
		      if (is_gimple_debug (stmt2))
			continue;
		      if (!is_gimple_assign (stmt2)
			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
			  || gimple_assign_rhs1 (stmt2) == arg1
			  || gimple_assign_rhs2 (stmt2) != arg1)
			{
			  fail = true;
			  break;
			}
		    }
		  if (fail)
		    continue;

		  gimple_replace_ssa_lhs (call, arg1);
		  if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
		    {
		      auto_vec<tree, 4> args;
		      for (unsigned int i = 0;
			   i < gimple_call_num_args (call); i++)
			args.safe_push (gimple_call_arg (call, i));
		      gcall *stmt2;
		      if (ifn == IFN_LAST)
			stmt2 = gimple_build_call_vec (fndecl, args);
		      else
			stmt2 = gimple_build_call_internal_vec (ifn, args);
		      gimple_call_set_lhs (stmt2, arg1);
		      gimple_move_vops (stmt2, call);
		      gimple_call_set_nothrow (stmt2,
					       gimple_call_nothrow_p (call));
		      gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
		      gsi_replace (&gsi2, stmt2, true);
		    }
		  else
		    {
		      if (ifn == IFN_LAST)
			gimple_call_set_fndecl (call, fndecl);
		      else
			gimple_call_set_internal_fn (call, ifn);
		      update_stmt (call);
		    }
		  reciprocal_stats.rfuncs_inserted++;

		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
		    {
		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
		      fold_stmt_inplace (&gsi);
		      update_stmt (stmt);
		    }
		}
	    }
	}
    }

  statistics_counter_event (fun, "reciprocal divs inserted",
			    reciprocal_stats.rdivs_inserted);
  statistics_counter_event (fun, "reciprocal functions inserted",
			    reciprocal_stats.rfuncs_inserted);

  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);
  delete occ_pool;
  return 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_cse_reciprocals (gcc::context *ctxt)
{
  return new pass_cse_reciprocals (ctxt);
}

/* Records an occurrence at statement USE_STMT in the vector of statements
   STMTS if it is dominated by *TOP_BB or dominates it or if *TOP_BB is
   not yet initialized.  Returns true if the occurrence was pushed on
   the vector.  Adjusts *TOP_BB to be the basic block dominating all
   statements in the vector.  */

static bool
maybe_record_sincos (vec<gimple *> *stmts,
		     basic_block *top_bb, gimple *use_stmt)
{
  basic_block use_bb = gimple_bb (use_stmt);
  if (*top_bb
      && (*top_bb == use_bb
	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
    stmts->safe_push (use_stmt);
  else if (!*top_bb
	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
    {
      stmts->safe_push (use_stmt);
      *top_bb = use_bb;
    }
  else
    return false;

  return true;
}

/* Look for sin, cos and cexpi calls with the same argument NAME and
   create a single call to cexpi CSEing the result in this case.
   We first walk over all immediate uses of the argument collecting
   statements that we can CSE in a vector and in a second pass replace
   the statement rhs with a REALPART or IMAGPART expression on the
   result of the cexpi call we insert before the use statement that
   dominates all other candidates.  */
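
/* For example (a sketch; SSA names are illustrative):

	s_1 = sin (x_2);
	c_3 = cos (x_2);

   becomes

	sincostmp_4 = cexpi (x_2);
	s_1 = IMAGPART_EXPR <sincostmp_4>;
	c_3 = REALPART_EXPR <sincostmp_4>;  */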

static bool
execute_cse_sincos_1 (tree name)
{
  gimple_stmt_iterator gsi;
  imm_use_iterator use_iter;
  tree fndecl, res, type;
  gimple *def_stmt, *use_stmt, *stmt;
  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
  auto_vec<gimple *> stmts;
  basic_block top_bb = NULL;
  int i;
  bool cfg_changed = false;

  type = TREE_TYPE (name);
  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
    {
      if (gimple_code (use_stmt) != GIMPLE_CALL
	  || !gimple_call_lhs (use_stmt))
	continue;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_SIN:
	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_CEXPI:
	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	default:;
	}
    }

  if (seen_cos + seen_sin + seen_cexpi <= 1)
    return false;

  /* Simply insert cexpi at the beginning of top_bb but not earlier than
     the name def statement.  */
  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
  if (!fndecl)
    return false;
  stmt = gimple_build_call (fndecl, 1, name);
  res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
  gimple_call_set_lhs (stmt, res);

  def_stmt = SSA_NAME_DEF_STMT (name);
  if (!SSA_NAME_IS_DEFAULT_DEF (name)
      && gimple_code (def_stmt) != GIMPLE_PHI
      && gimple_bb (def_stmt) == top_bb)
    {
      gsi = gsi_for_stmt (def_stmt);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
    }
  else
    {
      gsi = gsi_after_labels (top_bb);
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
    }
  sincos_stats.inserted++;

  /* And adjust the recorded old call sites.  */
  for (i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree rhs = NULL;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  rhs = fold_build1 (REALPART_EXPR, type, res);
	  break;

	CASE_CFN_SIN:
	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
	  break;

	CASE_CFN_CEXPI:
	  rhs = res;
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Replace call with a copy.  */
      stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);

      gsi = gsi_for_stmt (use_stmt);
      gsi_replace (&gsi, stmt, true);
      if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
	cfg_changed = true;
    }

  return cfg_changed;
}

/* To evaluate powi(x,n), the floating point value x raised to the
   constant integer exponent n, we use a hybrid algorithm that
   combines the "window method" with look-up tables.  For an
   introduction to exponentiation algorithms and "addition chains",
   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */

/* Provide a default value for POWI_MAX_MULTS, the maximum number of
   multiplications to inline before calling the system library's pow
   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
   so this default never requires calling pow, powf or powl.  */

#ifndef POWI_MAX_MULTS
#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
#endif

/* The size of the "optimal power tree" lookup table.  All
   exponents less than this value are simply looked up in the
   powi_table below.  This threshold is also used to size the
   cache of pseudo registers that hold intermediate results.  */
#define POWI_TABLE_SIZE 256

/* The size, in bits, of the window used in the "window method"
   exponentiation algorithm.  This is equivalent to a radix of
   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
#define POWI_WINDOW_SIZE 3

/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".  */

static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
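
/* For example, powi_table[5] == 3 records that an optimal way to compute
   pow (x, 5) is pow (x, 3) * pow (x, 2); expanding recursively gives
     t1 = x * x
     t2 = t1 * x
     t3 = t2 * t1
   i.e. three multiplications, which is also what powi_cost (5) returns.  */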


/* Return the number of multiplications required to calculate
   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
   subroutine of powi_cost.  CACHE is an array indicating
   which exponents have already been calculated.  */

static int
powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
{
  /* If we've already calculated this exponent, then this evaluation
     doesn't require any additional multiplications.  */
  if (cache[n])
    return 0;

  cache[n] = true;
  return powi_lookup_cost (n - powi_table[n], cache)
	 + powi_lookup_cost (powi_table[n], cache) + 1;
}

/* Return the number of multiplications required to calculate
   powi(x,n) for an arbitrary x, given the exponent N.  This
   function needs to be kept in sync with powi_as_mults below.  */

static int
powi_cost (HOST_WIDE_INT n)
{
  bool cache[POWI_TABLE_SIZE];
  unsigned HOST_WIDE_INT digit;
  unsigned HOST_WIDE_INT val;
  int result;

  if (n == 0)
    return 0;

  /* Ignore the reciprocal when calculating the cost.  */
  val = (n < 0) ? -n : n;

  /* Initialize the exponent cache.  */
  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
  cache[1] = true;

  result = 0;

  while (val >= POWI_TABLE_SIZE)
    {
      if (val & 1)
	{
	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
	  result += powi_lookup_cost (digit, cache)
		    + POWI_WINDOW_SIZE + 1;
	  val >>= POWI_WINDOW_SIZE;
	}
      else
	{
	  val >>= 1;
	  result++;
	}
    }

  return result + powi_lookup_cost (val, cache);
}

/* Recursive subroutine of powi_as_mults.  This function takes the
   array, CACHE, of already calculated exponents and an exponent N and
   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */

static tree
powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
		 HOST_WIDE_INT n, tree *cache)
{
  tree op0, op1, ssa_target;
  unsigned HOST_WIDE_INT digit;
  gassign *mult_stmt;

  if (n < POWI_TABLE_SIZE && cache[n])
    return cache[n];

  ssa_target = make_temp_ssa_name (type, NULL, "powmult");

  if (n < POWI_TABLE_SIZE)
    {
      cache[n] = ssa_target;
      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
    }
  else if (n & 1)
    {
      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
    }
  else
    {
      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
      op1 = op0;
    }

  mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
  gimple_set_location (mult_stmt, loc);
  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
   This function needs to be kept in sync with powi_cost above.  */

static tree
powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
	       tree arg0, HOST_WIDE_INT n)
{
  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
  gassign *div_stmt;
  tree target;

  if (n == 0)
    return build_real (type, dconst1);

  memset (cache, 0, sizeof (cache));
  cache[1] = arg0;

  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
  if (n >= 0)
    return result;

  /* If the original exponent was negative, reciprocate the result.  */
  target = make_temp_ssa_name (type, NULL, "powmult");
  div_stmt = gimple_build_assign (target, RDIV_EXPR,
				  build_real (type, dconst1), result);
  gimple_set_location (div_stmt, loc);
  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);

  return target;
}
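
/* For example, powi_as_mults (gsi, loc, x, -2) emits (modulo SSA
   numbering):
     powmult_1 = x * x
     powmult_2 = 1.0 / powmult_1
   and returns powmult_2.  */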

/* ARG0 and N are the two arguments to a powi builtin in GSI with
   location info LOC.  If the arguments are appropriate, create an
   equivalent sequence of statements prior to GSI using an optimal
   number of multiplications, and return an expression holding the
   result.  */

static tree
gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
			    tree arg0, HOST_WIDE_INT n)
{
  /* Avoid largest negative number.  */
  if (n != -n
      && ((n >= -1 && n <= 2)
	  || (optimize_function_for_speed_p (cfun)
	      && powi_cost (n) <= POWI_MAX_MULTS)))
    return powi_as_mults (gsi, loc, arg0, n);

  return NULL_TREE;
}

/* Build a gimple call statement that calls FN with argument ARG.
   Set the lhs of the call statement to a fresh SSA name.  Insert the
   statement prior to GSI's current position, and return the fresh
   SSA name.  */

static tree
build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
		       tree fn, tree arg)
{
  gcall *call_stmt;
  tree ssa_target;

  call_stmt = gimple_build_call (fn, 1, arg);
  ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
  gimple_set_lhs (call_stmt, ssa_target);
  gimple_set_location (call_stmt, loc);
  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Build a gimple binary operation with the given CODE and arguments
   ARG0, ARG1, assigning the result to a new SSA name based on NAME.
   Insert the statement prior to GSI's current position, and
   return the fresh SSA name.  */

static tree
build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
			const char *name, enum tree_code code,
			tree arg0, tree arg1)
{
  tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
  gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple reference operation with the given CODE and argument
   ARG, assigning the result to a new SSA name of TYPE with NAME.
   Insert the statement prior to GSI's current position, and return
   the fresh SSA name.  */

static inline tree
build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
		      const char *name, enum tree_code code, tree arg0)
{
  tree result = make_temp_ssa_name (type, NULL, name);
  gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
   prior to GSI's current position, and return the fresh SSA name.  */

static tree
build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
		       tree type, tree val)
{
  tree result = make_ssa_name (type);
  gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

struct pow_synth_sqrt_info
{
  bool *factors;
  unsigned int deepest;
  unsigned int num_mults;
};

/* Return true iff the real value C can be represented as a
   sum of powers of 0.5 up to N.  That is:
   C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
   Record in INFO the various parameters of the synthesis algorithm such
   as the factors a[i], the maximum 0.5 power and the number of
   multiplications that will be required.  */
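
/* For example, with N >= 3, C = 0.625 = 0.5 + 0.125 is representable:
   factors = {1, 0, 1}, deepest = 3 and num_mults = 1, corresponding to
   pow (x, 0.625) == sqrt (x) * sqrt (sqrt (sqrt (x))).  */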
1532 
1533 bool
representable_as_half_series_p(REAL_VALUE_TYPE c,unsigned n,struct pow_synth_sqrt_info * info)1534 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1535 				 struct pow_synth_sqrt_info *info)
1536 {
1537   REAL_VALUE_TYPE factor = dconsthalf;
1538   REAL_VALUE_TYPE remainder = c;
1539 
1540   info->deepest = 0;
1541   info->num_mults = 0;
1542   memset (info->factors, 0, n * sizeof (bool));
1543 
1544   for (unsigned i = 0; i < n; i++)
1545     {
1546       REAL_VALUE_TYPE res;
1547 
1548       /* If something inexact happened bail out now.  */
1549       if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1550 	return false;
1551 
1552       /* We have hit zero.  The number is representable as a sum
1553          of powers of 0.5.  */
1554       if (real_equal (&res, &dconst0))
1555 	{
1556 	  info->factors[i] = true;
1557 	  info->deepest = i + 1;
1558 	  return true;
1559 	}
1560       else if (!REAL_VALUE_NEGATIVE (res))
1561 	{
1562 	  remainder = res;
1563 	  info->factors[i] = true;
1564 	  info->num_mults++;
1565 	}
1566       else
1567 	info->factors[i] = false;
1568 
1569       real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1570     }
1571   return false;
1572 }
1573 
1574 /* Return the tree corresponding to FN being applied
1575    to ARG N times at GSI and LOC.
1576    Look up previous results from CACHE if need be.
1577    cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */
1578 
1579 static tree
get_fn_chain(tree arg,unsigned int n,gimple_stmt_iterator * gsi,tree fn,location_t loc,tree * cache)1580 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1581 	      tree fn, location_t loc, tree *cache)
1582 {
1583   tree res = cache[n];
1584   if (!res)
1585     {
1586       tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1587       res = build_and_insert_call (gsi, loc, fn, prev);
1588       cache[n] = res;
1589     }
1590 
1591   return res;
1592 }
1593 
1594 /* Print to STREAM the repeated application of function FNAME to ARG
1595    N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1596    "foo (foo (x))".  */
1597 
1598 static void
1599 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1600 		 unsigned int n)
1601 {
1602   if (n == 0)
1603     fprintf (stream, "%s", arg);
1604   else
1605     {
1606       fprintf (stream, "%s (", fname);
1607       print_nested_fn (stream, fname, arg, n - 1);
1608       fprintf (stream, ")");
1609     }
1610 }
1611 
1612 /* Print to STREAM the fractional sequence of sqrt chains
1613    applied to ARG, described by INFO.  Used for the dump file.  */
1614 
1615 static void
1616 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1617 			        struct pow_synth_sqrt_info *info)
1618 {
1619   for (unsigned int i = 0; i < info->deepest; i++)
1620     {
1621       bool is_set = info->factors[i];
1622       if (is_set)
1623 	{
1624 	  print_nested_fn (stream, "sqrt", arg, i + 1);
1625 	  if (i != info->deepest - 1)
1626 	    fprintf (stream, " * ");
1627 	}
1628     }
1629 }
1630 
1631 /* Print to STREAM a representation of raising ARG to an integer
1632    power N.  Used for the dump file.  */
1633 
1634 static void
1635 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1636 {
1637   if (n > 1)
1638     fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1639   else if (n == 1)
1640     fprintf (stream, "%s", arg);
1641 }
1642 
1643 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1644    square roots.  Place at GSI and LOC.  Limit the maximum depth
1645    of the sqrt chains to MAX_DEPTH.  Return the tree holding the
1646    result of the expanded sequence or NULL_TREE if the expansion failed.
1647 
1648    This routine assumes that ARG1 is a real number with a fractional part
1649    (the integer exponent case will have been handled earlier in
1650    gimple_expand_builtin_pow).
1651 
1652    For ARG1 > 0.0:
1653    * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1654      FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1655                     FRAC_PART == ARG1 - WHOLE_PART:
1656      Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1657      POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1658      if it can be expressed as such, that is if FRAC_PART satisfies:
1659      FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1660      where integer a[i] is either 0 or 1.
1661 
1662      Example:
1663      POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1664        --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1665 
1666    For ARG1 < 0.0 there are two approaches:
1667    * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1668          is calculated as above.
1669 
1670      Example:
1671      POW (x, -5.625) == 1.0 / POW (x, 5.625)
1672        -->  1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1673 
1674    * (B) : WHOLE_PART := - ceil (abs (ARG1))
1675            FRAC_PART  := ARG1 - WHOLE_PART
1676      and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1677      Example:
1678      POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1679        --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1680 
1681    For ARG1 < 0.0 we choose between (A) and (B) depending on
1682    how many multiplications we'd have to do.
1683    So, for the example in (B): POW (x, -5.875), if we were to
1684    follow algorithm (A) we would produce:
1685    1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1686    which contains more multiplications than approach (B).
1687 
1688    Hopefully, this approach will eliminate potentially expensive POW library
1689    calls when unsafe floating point math is enabled and allow the compiler to
1690    further optimise the multiplies, square roots and divides produced by this
1691    function.  */
1692 
1693 static tree
1694 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1695 		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1696 {
1697   tree type = TREE_TYPE (arg0);
1698   machine_mode mode = TYPE_MODE (type);
1699   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1700   bool one_over = true;
1701 
1702   if (!sqrtfn)
1703     return NULL_TREE;
1704 
1705   if (TREE_CODE (arg1) != REAL_CST)
1706     return NULL_TREE;
1707 
1708   REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1709 
1710   gcc_assert (max_depth > 0);
1711   tree *cache = XALLOCAVEC (tree, max_depth + 1);
1712 
1713   struct pow_synth_sqrt_info synth_info;
1714   synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1715   synth_info.deepest = 0;
1716   synth_info.num_mults = 0;
1717 
1718   bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1719   REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1720 
1721   /* The whole and fractional parts of exp.  */
1722   REAL_VALUE_TYPE whole_part;
1723   REAL_VALUE_TYPE frac_part;
1724 
1725   real_floor (&whole_part, mode, &exp);
1726   real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1727 
1728 
1729   REAL_VALUE_TYPE ceil_whole = dconst0;
1730   REAL_VALUE_TYPE ceil_fract = dconst0;
1731 
1732   if (neg_exp)
1733     {
1734       real_ceil (&ceil_whole, mode, &exp);
1735       real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1736     }
1737 
1738   if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1739     return NULL_TREE;
1740 
1741   /* Check whether it's more profitable to not use 1.0 / ...  */
1742   if (neg_exp)
1743     {
1744       struct pow_synth_sqrt_info alt_synth_info;
1745       alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1746       alt_synth_info.deepest = 0;
1747       alt_synth_info.num_mults = 0;
1748 
1749       if (representable_as_half_series_p (ceil_fract, max_depth,
1750 					   &alt_synth_info)
1751 	  && alt_synth_info.deepest <= synth_info.deepest
1752 	  && alt_synth_info.num_mults < synth_info.num_mults)
1753 	{
1754 	  whole_part = ceil_whole;
1755 	  frac_part = ceil_fract;
1756 	  synth_info.deepest = alt_synth_info.deepest;
1757 	  synth_info.num_mults = alt_synth_info.num_mults;
1758 	  memcpy (synth_info.factors, alt_synth_info.factors,
1759 		  (max_depth + 1) * sizeof (bool));
1760 	  one_over = false;
1761 	}
1762     }
1763 
1764   HOST_WIDE_INT n = real_to_integer (&whole_part);
1765   REAL_VALUE_TYPE cint;
1766   real_from_integer (&cint, VOIDmode, n, SIGNED);
1767 
1768   if (!real_identical (&whole_part, &cint))
1769     return NULL_TREE;
1770 
1771   if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1772     return NULL_TREE;
1773 
1774   memset (cache, 0, (max_depth + 1) * sizeof (tree));
1775 
1776   tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1777 
1778   /* Calculate the integer part of the exponent.  */
1779   if (n > 1)
1780     {
1781       integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1782       if (!integer_res)
1783 	return NULL_TREE;
1784     }
1785 
1786   if (dump_file)
1787     {
1788       char string[64];
1789 
1790       real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1791       fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1792 
1793       if (neg_exp)
1794 	{
1795 	  if (one_over)
1796 	    {
1797 	      fprintf (dump_file, "1.0 / (");
1798 	      dump_integer_part (dump_file, "x", n);
1799 	      if (n > 0)
1800 	        fprintf (dump_file, " * ");
1801 	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1802 	      fprintf (dump_file, ")");
1803 	    }
1804 	  else
1805 	    {
1806 	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1807 	      fprintf (dump_file, " / (");
1808 	      dump_integer_part (dump_file, "x", n);
1809 	      fprintf (dump_file, ")");
1810 	    }
1811 	}
1812       else
1813 	{
1814 	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1815 	  if (n > 0)
1816 	    fprintf (dump_file, " * ");
1817 	  dump_integer_part (dump_file, "x", n);
1818 	}
1819 
1820       fprintf (dump_file, "\ndeepest sqrt chain: %u\n", synth_info.deepest);
1821     }
1822 
1823 
1824   tree fract_res = NULL_TREE;
1825   cache[0] = arg0;
1826 
1827   /* Calculate the fractional part of the exponent.  */
1828   for (unsigned i = 0; i < synth_info.deepest; i++)
1829     {
1830       if (synth_info.factors[i])
1831 	{
1832 	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1833 
1834 	  if (!fract_res)
1835 	      fract_res = sqrt_chain;
1836 
1837 	  else
1838 	    fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1839 					   fract_res, sqrt_chain);
1840 	}
1841     }
1842 
1843   tree res = NULL_TREE;
1844 
1845   if (neg_exp)
1846     {
1847       if (one_over)
1848 	{
1849 	  if (n > 0)
1850 	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1851 					   fract_res, integer_res);
1852 	  else
1853 	    res = fract_res;
1854 
1855 	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1856 					  build_real (type, dconst1), res);
1857 	}
1858       else
1859 	{
1860 	  res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1861 					 fract_res, integer_res);
1862 	}
1863     }
1864   else
1865     res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1866 				   fract_res, integer_res);
1867   return res;
1868 }
1869 
1870 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1871    with location info LOC.  If possible, create an equivalent and
1872    less expensive sequence of statements prior to GSI, and return an
1873    expression holding the result.  */
1874 
1875 static tree
1876 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1877 			   tree arg0, tree arg1)
1878 {
1879   REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1880   REAL_VALUE_TYPE c2, dconst3;
1881   HOST_WIDE_INT n;
1882   tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1883   machine_mode mode;
1884   bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1885   bool hw_sqrt_exists, c_is_int, c2_is_int;
1886 
1887   dconst1_4 = dconst1;
1888   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1889 
1890   /* If the exponent isn't a constant, there's nothing of interest
1891      to be done.  */
1892   if (TREE_CODE (arg1) != REAL_CST)
1893     return NULL_TREE;
1894 
1895   /* Don't perform the operation if flag_signaling_nans is on
1896      and the operand is a signaling NaN.  */
1897   if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1898       && ((TREE_CODE (arg0) == REAL_CST
1899 	   && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1900 	  || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1901     return NULL_TREE;
1902 
1903   /* If the exponent is equivalent to an integer, expand to an optimal
1904      multiplication sequence when profitable.  */
1905   c = TREE_REAL_CST (arg1);
1906   n = real_to_integer (&c);
1907   real_from_integer (&cint, VOIDmode, n, SIGNED);
1908   c_is_int = real_identical (&c, &cint);
1909 
1910   if (c_is_int
1911       && ((n >= -1 && n <= 2)
1912 	  || (flag_unsafe_math_optimizations
1913 	      && speed_p
1914 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1915     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1916 
1917   /* Attempt various optimizations using sqrt and cbrt.  */
1918   type = TREE_TYPE (arg0);
1919   mode = TYPE_MODE (type);
1920   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1921 
1922   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1923      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1924      sqrt(-0) = -0.  */
1925   if (sqrtfn
1926       && real_equal (&c, &dconsthalf)
1927       && !HONOR_SIGNED_ZEROS (mode))
1928     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1929 
1930   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1931 
1932   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1933      optimizations since 1./3. is not exactly representable.  If x
1934      is negative and finite, the correct value of pow(x,1./3.) is
1935      a NaN with the "invalid" exception raised, because the value
1936      of 1./3. actually has an even denominator.  The correct value
1937      of cbrt(x) is a negative real value.  */
1938   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1939   dconst1_3 = real_value_truncate (mode, dconst_third ());
1940 
1941   if (flag_unsafe_math_optimizations
1942       && cbrtfn
1943       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1944       && real_equal (&c, &dconst1_3))
1945     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1946 
1947   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1948      if we don't have a hardware sqrt insn.  */
1949   dconst1_6 = dconst1_3;
1950   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1951 
1952   if (flag_unsafe_math_optimizations
1953       && sqrtfn
1954       && cbrtfn
1955       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1956       && speed_p
1957       && hw_sqrt_exists
1958       && real_equal (&c, &dconst1_6))
1959     {
1960       /* sqrt(x)  */
1961       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1962 
1963       /* cbrt(sqrt(x))  */
1964       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1965     }
1966 
1967 
1968   /* Attempt to expand the POW as a product of square root chains.
1969      Expand the 0.25 case even when optimising for size.  */
1970   if (flag_unsafe_math_optimizations
1971       && sqrtfn
1972       && hw_sqrt_exists
1973       && (speed_p || real_equal (&c, &dconst1_4))
1974       && !HONOR_SIGNED_ZEROS (mode))
1975     {
1976       unsigned int max_depth = speed_p
1977 				? param_max_pow_sqrt_depth
1978 				: 2;
1979 
1980       tree expand_with_sqrts
1981 	= expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1982 
1983       if (expand_with_sqrts)
1984 	return expand_with_sqrts;
1985     }
1986 
1987   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1988   n = real_to_integer (&c2);
1989   real_from_integer (&cint, VOIDmode, n, SIGNED);
1990   c2_is_int = real_identical (&c2, &cint);
1991 
1992   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1993 
1994      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1995      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1996 
1997      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1998      different from pow(x, 1./3.) due to rounding and behavior with
1999      negative x, we need to constrain this transformation to unsafe
2000      math and positive x or finite math.  */
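  /* Worked example (illustrative, not from the original source): for
     c = 5./3. we get n = 5 below, so the expansion is
     powi (x, 5/3) * powi (cbrt (x), 5%3), i.e. x * cbrt (x) * cbrt (x).  */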
2001   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
2002   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
2003   real_round (&c2, mode, &c2);
2004   n = real_to_integer (&c2);
2005   real_from_integer (&cint, VOIDmode, n, SIGNED);
2006   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
2007   real_convert (&c2, mode, &c2);
2008 
2009   if (flag_unsafe_math_optimizations
2010       && cbrtfn
2011       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
2012       && real_identical (&c2, &c)
2013       && !c2_is_int
2014       && optimize_function_for_speed_p (cfun)
2015       && powi_cost (n / 3) <= POWI_MAX_MULTS)
2016     {
2017       tree powi_x_ndiv3 = NULL_TREE;
2018 
2019       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
2020          possible or profitable, give up.  Skip the degenerate case when
2021          abs(n) < 3, where the result is always 1.  */
2022       if (absu_hwi (n) >= 3)
2023 	{
2024 	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
2025 						     abs_hwi (n / 3));
2026 	  if (!powi_x_ndiv3)
2027 	    return NULL_TREE;
2028 	}
2029 
2030       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
2031          as that creates an unnecessary variable.  Instead, just produce
2032          either cbrt(x) or cbrt(x) * cbrt(x).  */
2033       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
2034 
2035       if (absu_hwi (n) % 3 == 1)
2036 	powi_cbrt_x = cbrt_x;
2037       else
2038 	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2039 					      cbrt_x, cbrt_x);
2040 
2041       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
2042       if (absu_hwi (n) < 3)
2043 	result = powi_cbrt_x;
2044       else
2045 	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2046 					 powi_x_ndiv3, powi_cbrt_x);
2047 
2048       /* If n is negative, reciprocate the result.  */
2049       if (n < 0)
2050 	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
2051 					 build_real (type, dconst1), result);
2052 
2053       return result;
2054     }
2055 
2056   /* No optimizations succeeded.  */
2057   return NULL_TREE;
2058 }
2059 
2060 /* ARG is the argument to a cabs builtin call in GSI with location info
2061    LOC.  Create a sequence of statements prior to GSI that calculates
2062    sqrt(R*R + I*I), where R and I are the real and imaginary components
2063    of ARG, respectively.  Return an expression holding the result.  */
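/* E.g. (illustrative), assuming unsafe math and a hardware sqrt, a call
   r = cabs (z) becomes the sequence
     t1 = REALPART_EXPR <z>;  t2 = t1 * t1;
     t3 = IMAGPART_EXPR <z>;  t4 = t3 * t3;
     t5 = t2 + t4;  r = sqrt (t5);  */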
2064 
2065 static tree
2066 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
2067 {
2068   tree real_part, imag_part, addend1, addend2, sum, result;
2069   tree type = TREE_TYPE (TREE_TYPE (arg));
2070   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
2071   machine_mode mode = TYPE_MODE (type);
2072 
2073   if (!flag_unsafe_math_optimizations
2074       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
2075       || !sqrtfn
2076       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
2077     return NULL_TREE;
2078 
2079   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
2080 				    REALPART_EXPR, arg);
2081   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2082 				    real_part, real_part);
2083   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
2084 				    IMAGPART_EXPR, arg);
2085   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2086 				    imag_part, imag_part);
2087   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
2088   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
2089 
2090   return result;
2091 }
2092 
2093 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
2094    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
2095    an optimal number of multiplies, when n is a constant.  */
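/* For example (illustrative): the pair
     s_1 = sin (x_2);  c_3 = cos (x_2);
   is CSEd into
     t_4 = __builtin_cexpi (x_2);
     s_1 = IMAGPART_EXPR <t_4>;  c_3 = REALPART_EXPR <t_4>;
   and the cexpi call can later be lowered to a single sincos call.  */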
2096 
2097 namespace {
2098 
2099 const pass_data pass_data_cse_sincos =
2100 {
2101   GIMPLE_PASS, /* type */
2102   "sincos", /* name */
2103   OPTGROUP_NONE, /* optinfo_flags */
2104   TV_TREE_SINCOS, /* tv_id */
2105   PROP_ssa, /* properties_required */
2106   PROP_gimple_opt_math, /* properties_provided */
2107   0, /* properties_destroyed */
2108   0, /* todo_flags_start */
2109   TODO_update_ssa, /* todo_flags_finish */
2110 };
2111 
2112 class pass_cse_sincos : public gimple_opt_pass
2113 {
2114 public:
2115   pass_cse_sincos (gcc::context *ctxt)
2116     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
2117   {}
2118 
2119   /* opt_pass methods: */
2120   virtual bool gate (function *)
2121     {
2122       /* We no longer require either sincos or cexp, since powi expansion
2123 	 piggybacks on this pass.  */
2124       return optimize;
2125     }
2126 
2127   virtual unsigned int execute (function *);
2128 
2129 }; // class pass_cse_sincos
2130 
2131 unsigned int
2132 pass_cse_sincos::execute (function *fun)
2133 {
2134   basic_block bb;
2135   bool cfg_changed = false;
2136 
2137   calculate_dominance_info (CDI_DOMINATORS);
2138   memset (&sincos_stats, 0, sizeof (sincos_stats));
2139 
2140   FOR_EACH_BB_FN (bb, fun)
2141     {
2142       gimple_stmt_iterator gsi;
2143       bool cleanup_eh = false;
2144 
2145       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2146         {
2147 	  gimple *stmt = gsi_stmt (gsi);
2148 
2149 	  /* Only the last stmt in a bb could throw, no need to call
2150 	     gimple_purge_dead_eh_edges if we change something in the middle
2151 	     of a basic block.  */
2152 	  cleanup_eh = false;
2153 
2154 	  if (is_gimple_call (stmt)
2155 	      && gimple_call_lhs (stmt))
2156 	    {
2157 	      tree arg, arg0, arg1, result;
2158 	      HOST_WIDE_INT n;
2159 	      location_t loc;
2160 
2161 	      switch (gimple_call_combined_fn (stmt))
2162 		{
2163 		CASE_CFN_COS:
2164 		CASE_CFN_SIN:
2165 		CASE_CFN_CEXPI:
2166 		  /* Make sure we have either sincos or cexp.  */
2167 		  if (!targetm.libc_has_function (function_c99_math_complex)
2168 		      && !targetm.libc_has_function (function_sincos))
2169 		    break;
2170 
2171 		  arg = gimple_call_arg (stmt, 0);
2172 		  if (TREE_CODE (arg) == SSA_NAME)
2173 		    cfg_changed |= execute_cse_sincos_1 (arg);
2174 		  break;
2175 
2176 		CASE_CFN_POW:
2177 		  arg0 = gimple_call_arg (stmt, 0);
2178 		  arg1 = gimple_call_arg (stmt, 1);
2179 
2180 		  loc = gimple_location (stmt);
2181 		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
2182 
2183 		  if (result)
2184 		    {
2185 		      tree lhs = gimple_get_lhs (stmt);
2186 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2187 		      gimple_set_location (new_stmt, loc);
2188 		      unlink_stmt_vdef (stmt);
2189 		      gsi_replace (&gsi, new_stmt, true);
2190 		      cleanup_eh = true;
2191 		      if (gimple_vdef (stmt))
2192 			release_ssa_name (gimple_vdef (stmt));
2193 		    }
2194 		  break;
2195 
2196 		CASE_CFN_POWI:
2197 		  arg0 = gimple_call_arg (stmt, 0);
2198 		  arg1 = gimple_call_arg (stmt, 1);
2199 		  loc = gimple_location (stmt);
2200 
2201 		  if (real_minus_onep (arg0))
2202 		    {
2203                       tree t0, t1, cond, one, minus_one;
2204 		      gassign *stmt;
2205 
2206 		      t0 = TREE_TYPE (arg0);
2207 		      t1 = TREE_TYPE (arg1);
2208 		      one = build_real (t0, dconst1);
2209 		      minus_one = build_real (t0, dconstm1);
2210 
2211 		      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2212 		      stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2213 						  arg1, build_int_cst (t1, 1));
2214 		      gimple_set_location (stmt, loc);
2215 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2216 
2217 		      result = make_temp_ssa_name (t0, NULL, "powi");
2218 		      stmt = gimple_build_assign (result, COND_EXPR, cond,
2219 						  minus_one, one);
2220 		      gimple_set_location (stmt, loc);
2221 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2222 		    }
2223 		  else
2224 		    {
2225 		      if (!tree_fits_shwi_p (arg1))
2226 			break;
2227 
2228 		      n = tree_to_shwi (arg1);
2229 		      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2230 		    }
2231 
2232 		  if (result)
2233 		    {
2234 		      tree lhs = gimple_get_lhs (stmt);
2235 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2236 		      gimple_set_location (new_stmt, loc);
2237 		      unlink_stmt_vdef (stmt);
2238 		      gsi_replace (&gsi, new_stmt, true);
2239 		      cleanup_eh = true;
2240 		      if (gimple_vdef (stmt))
2241 			release_ssa_name (gimple_vdef (stmt));
2242 		    }
2243 		  break;
2244 
2245 		CASE_CFN_CABS:
2246 		  arg0 = gimple_call_arg (stmt, 0);
2247 		  loc = gimple_location (stmt);
2248 		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2249 
2250 		  if (result)
2251 		    {
2252 		      tree lhs = gimple_get_lhs (stmt);
2253 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2254 		      gimple_set_location (new_stmt, loc);
2255 		      unlink_stmt_vdef (stmt);
2256 		      gsi_replace (&gsi, new_stmt, true);
2257 		      cleanup_eh = true;
2258 		      if (gimple_vdef (stmt))
2259 			release_ssa_name (gimple_vdef (stmt));
2260 		    }
2261 		  break;
2262 
2263 		default:;
2264 		}
2265 	    }
2266 	}
2267       if (cleanup_eh)
2268 	cfg_changed |= gimple_purge_dead_eh_edges (bb);
2269     }
2270 
2271   statistics_counter_event (fun, "sincos statements inserted",
2272 			    sincos_stats.inserted);
2273 
2274   return cfg_changed ? TODO_cleanup_cfg : 0;
2275 }
2276 
2277 } // anon namespace
2278 
2279 gimple_opt_pass *
2280 make_pass_cse_sincos (gcc::context *ctxt)
2281 {
2282   return new pass_cse_sincos (ctxt);
2283 }
2284 
2285 /* Return true if stmt is a type conversion operation that can be stripped
2286    when used in a widening multiply operation.  */
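/* E.g. (illustrative): with RESULT_TYPE of 64 bits, a conversion
   _1 = (int32_t) c_16 (a signed widening of a signed 16-bit value) is
   strippable, while a narrowing or signedness-changing conversion
   generally is not.  */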
2287 static bool
2288 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2289 {
2290   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2291 
2292   if (TREE_CODE (result_type) == INTEGER_TYPE)
2293     {
2294       tree op_type;
2295       tree inner_op_type;
2296 
2297       if (!CONVERT_EXPR_CODE_P (rhs_code))
2298 	return false;
2299 
2300       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2301 
2302       /* If the type of OP has the same precision as the result, then
2303 	 we can strip this conversion.  The multiply operation will be
2304 	 selected to create the correct extension as a by-product.  */
2305       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2306 	return true;
2307 
2308       /* We can also strip a conversion if it preserves the signed-ness of
2309 	 the operation and doesn't narrow the range.  */
2310       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2311 
2312       /* If the inner-most type is unsigned, then we can strip any
2313 	 intermediate widening operation.  If it's signed, then the
2314 	 intermediate widening operation must also be signed.  */
2315       if ((TYPE_UNSIGNED (inner_op_type)
2316 	   || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2317 	  && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2318 	return true;
2319 
2320       return false;
2321     }
2322 
2323   return rhs_code == FIXED_CONVERT_EXPR;
2324 }
2325 
2326 /* Return true if RHS is a suitable operand for a widening multiplication,
2327    assuming a target type of TYPE.
2328    There are two cases:
2329 
2330      - RHS makes some value at least twice as wide.  Store that value
2331        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2332 
2333      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2334        but leave *TYPE_OUT untouched.  */
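/* E.g. (illustrative): for a 64-bit TYPE, an RHS _1 defined by
   _1 = (int64_t) a_32 yields *NEW_RHS_OUT = a_32 and *TYPE_OUT set to
   the 32-bit type, since the result is at least twice as wide; an RHS
   that is the constant 100 is stored with *TYPE_OUT left untouched.  */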
2335 
2336 static bool
2337 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2338 			tree *new_rhs_out)
2339 {
2340   gimple *stmt;
2341   tree type1, rhs1;
2342 
2343   if (TREE_CODE (rhs) == SSA_NAME)
2344     {
2345       stmt = SSA_NAME_DEF_STMT (rhs);
2346       if (is_gimple_assign (stmt))
2347 	{
2348 	  if (! widening_mult_conversion_strippable_p (type, stmt))
2349 	    rhs1 = rhs;
2350 	  else
2351 	    {
2352 	      rhs1 = gimple_assign_rhs1 (stmt);
2353 
2354 	      if (TREE_CODE (rhs1) == INTEGER_CST)
2355 		{
2356 		  *new_rhs_out = rhs1;
2357 		  *type_out = NULL;
2358 		  return true;
2359 		}
2360 	    }
2361 	}
2362       else
2363 	rhs1 = rhs;
2364 
2365       type1 = TREE_TYPE (rhs1);
2366 
2367       if (TREE_CODE (type1) != TREE_CODE (type)
2368 	  || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2369 	return false;
2370 
2371       *new_rhs_out = rhs1;
2372       *type_out = type1;
2373       return true;
2374     }
2375 
2376   if (TREE_CODE (rhs) == INTEGER_CST)
2377     {
2378       *new_rhs_out = rhs;
2379       *type_out = NULL;
2380       return true;
2381     }
2382 
2383   return false;
2384 }
2385 
2386 /* Return true if STMT performs a widening multiplication, assuming the
2387    output type is TYPE.  If so, store the unwidened types of the operands
2388    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2389    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2390    and *TYPE2_OUT would give the operands of the multiplication.  */
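/* E.g. (illustrative): for
     r_1 = (uint64_t) a_32 * (uint64_t) b_32;
   with a 64-bit lhs this returns true with *TYPE1_OUT and *TYPE2_OUT set
   to the unsigned 32-bit type and *RHS1_OUT = a_32, *RHS2_OUT = b_32.  */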
2391 
2392 static bool
2393 is_widening_mult_p (gimple *stmt,
2394 		    tree *type1_out, tree *rhs1_out,
2395 		    tree *type2_out, tree *rhs2_out)
2396 {
2397   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2398 
2399   if (TREE_CODE (type) == INTEGER_TYPE)
2400     {
2401       if (TYPE_OVERFLOW_TRAPS (type))
2402 	return false;
2403     }
2404   else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2405     return false;
2406 
2407   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2408 			       rhs1_out))
2409     return false;
2410 
2411   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2412 			       rhs2_out))
2413     return false;
2414 
2415   if (*type1_out == NULL)
2416     {
2417       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2418 	return false;
2419       *type1_out = *type2_out;
2420     }
2421 
2422   if (*type2_out == NULL)
2423     {
2424       if (!int_fits_type_p (*rhs2_out, *type1_out))
2425 	return false;
2426       *type2_out = *type1_out;
2427     }
2428 
2429   /* Ensure that the larger of the two operands comes first.  */
2430   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2431     {
2432       std::swap (*type1_out, *type2_out);
2433       std::swap (*rhs1_out, *rhs2_out);
2434     }
2435 
2436   return true;
2437 }
2438 
2439 /* Check to see if the CALL statement is an invocation of copysign
2440    with 1.0 as its first argument.  */
2441 static bool
2442 is_copysign_call_with_1 (gimple *call)
2443 {
2444   gcall *c = dyn_cast <gcall *> (call);
2445   if (! c)
2446     return false;
2447 
2448   enum combined_fn code = gimple_call_combined_fn (c);
2449 
2450   if (code == CFN_LAST)
2451     return false;
2452 
2453   if (builtin_fn_p (code))
2454     {
2455       switch (as_builtin_fn (code))
2456 	{
2457 	CASE_FLT_FN (BUILT_IN_COPYSIGN):
2458 	CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2459 	  return real_onep (gimple_call_arg (c, 0));
2460 	default:
2461 	  return false;
2462 	}
2463     }
2464 
2465   if (internal_fn_p (code))
2466     {
2467       switch (as_internal_fn (code))
2468 	{
2469 	case IFN_COPYSIGN:
2470 	  return real_onep (gimple_call_arg (c, 0));
2471 	default:
2472 	  return false;
2473 	}
2474     }
2475 
2476    return false;
2477 }
2478 
2479 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2480    This only happens when the xorsign optab is defined.  If the
2481    pattern is not an xorsign pattern or if expansion fails, FALSE is
2482    returned; otherwise TRUE is returned.  */
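/* Illustrative GIMPLE (not from the original source), assuming a target
   that provides xorsign:
     _1 = __builtin_copysignf (1.0, y_2);
     z_3 = x_4 * _1;
   is replaced by
     z_3 = .XORSIGN (x_4, y_2);  */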
2483 static bool
2484 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2485 {
2486   tree treeop0, treeop1, lhs, type;
2487   location_t loc = gimple_location (stmt);
2488   lhs = gimple_assign_lhs (stmt);
2489   treeop0 = gimple_assign_rhs1 (stmt);
2490   treeop1 = gimple_assign_rhs2 (stmt);
2491   type = TREE_TYPE (lhs);
2492   machine_mode mode = TYPE_MODE (type);
2493 
2494   if (HONOR_SNANS (type))
2495     return false;
2496 
2497   if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2498     {
2499       gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2500       if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2501 	{
2502 	  call0 = SSA_NAME_DEF_STMT (treeop1);
2503 	  if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2504 	    return false;
2505 
2506 	  treeop1 = treeop0;
2507 	}
2508       if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2509 	return false;
2510 
2511       gcall *c = as_a<gcall*> (call0);
2512       treeop0 = gimple_call_arg (c, 1);
2513 
2514       gcall *call_stmt
2515 	= gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2516       gimple_set_lhs (call_stmt, lhs);
2517       gimple_set_location (call_stmt, loc);
2518       gsi_replace (gsi, call_stmt, true);
2519       return true;
2520     }
2521 
2522   return false;
2523 }
2524 
2525 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2526    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2527    value is true iff we converted the statement.  */
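/* E.g. (illustrative): on a target with a 32x32->64 widening multiply,
     _1 = (long long) a_32;  _2 = (long long) b_32;  r_3 = _1 * _2;
   has the final statement rewritten to r_3 = a_32 w* b_32, i.e. a
   WIDEN_MULT_EXPR on the narrow operands.  */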
2528 
2529 static bool
2530 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2531 {
2532   tree lhs, rhs1, rhs2, type, type1, type2;
2533   enum insn_code handler;
2534   scalar_int_mode to_mode, from_mode, actual_mode;
2535   optab op;
2536   int actual_precision;
2537   location_t loc = gimple_location (stmt);
2538   bool from_unsigned1, from_unsigned2;
2539 
2540   lhs = gimple_assign_lhs (stmt);
2541   type = TREE_TYPE (lhs);
2542   if (TREE_CODE (type) != INTEGER_TYPE)
2543     return false;
2544 
2545   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2546     return false;
2547 
2548   to_mode = SCALAR_INT_TYPE_MODE (type);
2549   from_mode = SCALAR_INT_TYPE_MODE (type1);
2550   if (to_mode == from_mode)
2551     return false;
2552 
2553   from_unsigned1 = TYPE_UNSIGNED (type1);
2554   from_unsigned2 = TYPE_UNSIGNED (type2);
2555 
2556   if (from_unsigned1 && from_unsigned2)
2557     op = umul_widen_optab;
2558   else if (!from_unsigned1 && !from_unsigned2)
2559     op = smul_widen_optab;
2560   else
2561     op = usmul_widen_optab;
2562 
2563   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2564 						  &actual_mode);
2565 
2566   if (handler == CODE_FOR_nothing)
2567     {
2568       if (op != smul_widen_optab)
2569 	{
2570 	  /* We can use a signed multiply with unsigned types as long as
2571 	     there is a wider mode to use, or it is the smaller of the two
2572 	     types that is unsigned.  Note that type1 >= type2, always.  */
2573 	  if ((TYPE_UNSIGNED (type1)
2574 	       && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2575 	      || (TYPE_UNSIGNED (type2)
2576 		  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2577 	    {
2578 	      if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2579 		  || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2580 		return false;
2581 	    }
2582 
2583 	  op = smul_widen_optab;
2584 	  handler = find_widening_optab_handler_and_mode (op, to_mode,
2585 							  from_mode,
2586 							  &actual_mode);
2587 
2588 	  if (handler == CODE_FOR_nothing)
2589 	    return false;
2590 
2591 	  from_unsigned1 = from_unsigned2 = false;
2592 	}
2593       else
2594 	return false;
2595     }
2596 
2597   /* Ensure that the inputs to the handler are in the correct precision
2598      for the opcode.  This will be the full mode size.  */
2599   actual_precision = GET_MODE_PRECISION (actual_mode);
2600   if (2 * actual_precision > TYPE_PRECISION (type))
2601     return false;
2602   if (actual_precision != TYPE_PRECISION (type1)
2603       || from_unsigned1 != TYPE_UNSIGNED (type1))
2604     rhs1 = build_and_insert_cast (gsi, loc,
2605 				  build_nonstandard_integer_type
2606 				    (actual_precision, from_unsigned1), rhs1);
2607   if (actual_precision != TYPE_PRECISION (type2)
2608       || from_unsigned2 != TYPE_UNSIGNED (type2))
2609     rhs2 = build_and_insert_cast (gsi, loc,
2610 				  build_nonstandard_integer_type
2611 				    (actual_precision, from_unsigned2), rhs2);
2612 
2613   /* Handle constants.  */
2614   if (TREE_CODE (rhs1) == INTEGER_CST)
2615     rhs1 = fold_convert (type1, rhs1);
2616   if (TREE_CODE (rhs2) == INTEGER_CST)
2617     rhs2 = fold_convert (type2, rhs2);
2618 
2619   gimple_assign_set_rhs1 (stmt, rhs1);
2620   gimple_assign_set_rhs2 (stmt, rhs2);
2621   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2622   update_stmt (stmt);
2623   widen_mul_stats.widen_mults_inserted++;
2624   return true;
2625 }
2626 
2627 /* Process a single gimple statement STMT, which is found at the
2628    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2629    rhs (given by CODE), and try to convert it into a
2630    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2631    is true iff we converted the statement.  */
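/* E.g. (illustrative): given _1 = a_32 w* b_32 followed by
   acc_2 = _1 + acc_3, the addition can be rewritten as
   acc_2 = WIDEN_MULT_PLUS_EXPR <a_32, b_32, acc_3> when the target
   provides a widening multiply-accumulate pattern.  */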
2632 
2633 static bool
2634 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2635 			    enum tree_code code)
2636 {
2637   gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2638   gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2639   tree type, type1, type2, optype;
2640   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2641   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2642   optab this_optab;
2643   enum tree_code wmult_code;
2644   enum insn_code handler;
2645   scalar_mode to_mode, from_mode, actual_mode;
2646   location_t loc = gimple_location (stmt);
2647   int actual_precision;
2648   bool from_unsigned1, from_unsigned2;
2649 
2650   lhs = gimple_assign_lhs (stmt);
2651   type = TREE_TYPE (lhs);
2652   if (TREE_CODE (type) != INTEGER_TYPE
2653       && TREE_CODE (type) != FIXED_POINT_TYPE)
2654     return false;
2655 
2656   if (code == MINUS_EXPR)
2657     wmult_code = WIDEN_MULT_MINUS_EXPR;
2658   else
2659     wmult_code = WIDEN_MULT_PLUS_EXPR;
2660 
2661   rhs1 = gimple_assign_rhs1 (stmt);
2662   rhs2 = gimple_assign_rhs2 (stmt);
2663 
2664   if (TREE_CODE (rhs1) == SSA_NAME)
2665     {
2666       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2667       if (is_gimple_assign (rhs1_stmt))
2668 	rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2669     }
2670 
2671   if (TREE_CODE (rhs2) == SSA_NAME)
2672     {
2673       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2674       if (is_gimple_assign (rhs2_stmt))
2675 	rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2676     }
2677 
2678   /* Allow for one conversion statement between the multiply
2679      and addition/subtraction statement.  If there is more than
2680      one conversion then we assume it would invalidate this
2681      transformation.  If that's not the case then they should have
2682      been folded before now.  */
2683   if (CONVERT_EXPR_CODE_P (rhs1_code))
2684     {
2685       conv1_stmt = rhs1_stmt;
2686       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2687       if (TREE_CODE (rhs1) == SSA_NAME)
2688 	{
2689 	  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2690 	  if (is_gimple_assign (rhs1_stmt))
2691 	    rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2692 	}
2693       else
2694 	return false;
2695     }
2696   if (CONVERT_EXPR_CODE_P (rhs2_code))
2697     {
2698       conv2_stmt = rhs2_stmt;
2699       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2700       if (TREE_CODE (rhs2) == SSA_NAME)
2701 	{
2702 	  rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2703 	  if (is_gimple_assign (rhs2_stmt))
2704 	    rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2705 	}
2706       else
2707 	return false;
2708     }
2709 
2710   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2711      is_widening_mult_p, but we still need the rhs operands it returns.
2712 
2713      It might also appear that it would be sufficient to use the existing
2714      operands of the widening multiply, but that would limit the choice of
2715      multiply-and-accumulate instructions.
2716 
2717      If the widened-multiplication result has more than one use, it is
2718      probably wiser not to do the conversion.  Also restrict this operation
2719      to a single basic block to avoid moving the multiply to a different block
2720      with a higher execution frequency.  */
2721   if (code == PLUS_EXPR
2722       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2723     {
2724       if (!has_single_use (rhs1)
2725 	  || gimple_bb (rhs1_stmt) != gimple_bb (stmt)
2726 	  || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2727 				  &type2, &mult_rhs2))
2728 	return false;
2729       add_rhs = rhs2;
2730       conv_stmt = conv1_stmt;
2731     }
2732   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2733     {
2734       if (!has_single_use (rhs2)
2735 	  || gimple_bb (rhs2_stmt) != gimple_bb (stmt)
2736 	  || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2737 				  &type2, &mult_rhs2))
2738 	return false;
2739       add_rhs = rhs1;
2740       conv_stmt = conv2_stmt;
2741     }
2742   else
2743     return false;
2744 
2745   to_mode = SCALAR_TYPE_MODE (type);
2746   from_mode = SCALAR_TYPE_MODE (type1);
2747   if (to_mode == from_mode)
2748     return false;
2749 
2750   from_unsigned1 = TYPE_UNSIGNED (type1);
2751   from_unsigned2 = TYPE_UNSIGNED (type2);
2752   optype = type1;
2753 
2754   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2755   if (from_unsigned1 != from_unsigned2)
2756     {
2757       if (!INTEGRAL_TYPE_P (type))
2758 	return false;
2759       /* We can use a signed multiply with unsigned types as long as
2760 	 there is a wider mode to use, or it is the smaller of the two
2761 	 types that is unsigned.  Note that type1 >= type2, always.  */
2762       if ((from_unsigned1
2763 	   && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2764 	  || (from_unsigned2
2765 	      && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2766 	{
2767 	  if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2768 	      || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2769 	    return false;
2770 	}
2771 
2772       from_unsigned1 = from_unsigned2 = false;
2773       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2774 					       false);
2775     }
2776 
2777   /* If there was a conversion between the multiply and addition
2778      then we need to make sure it fits a multiply-and-accumulate.
2779      There should be a single mode change which does not change the
2780      value.  */
2781   if (conv_stmt)
2782     {
2783       /* We use the original, unmodified data types for this.  */
2784       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2785       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2786       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2787       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2788 
2789       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2790 	{
2791 	  /* Conversion is a truncate.  */
2792 	  if (TYPE_PRECISION (to_type) < data_size)
2793 	    return false;
2794 	}
2795       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2796 	{
2797 	  /* Conversion is an extend.  Check it's the right sort.  */
2798 	  if (TYPE_UNSIGNED (from_type) != is_unsigned
2799 	      && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2800 	    return false;
2801 	}
2802       /* else convert is a no-op for our purposes.  */
2803     }
2804 
2805   /* Verify that the machine can perform a widening multiply
2806      accumulate in this mode/signedness combination, otherwise
2807      this transformation is likely to pessimize code.  */
2808   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2809   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2810 						  from_mode, &actual_mode);
2811 
2812   if (handler == CODE_FOR_nothing)
2813     return false;
2814 
2815   /* Ensure that the inputs to the handler are in the correct precision
2816      for the opcode.  This will be the full mode size.  */
2817   actual_precision = GET_MODE_PRECISION (actual_mode);
2818   if (actual_precision != TYPE_PRECISION (type1)
2819       || from_unsigned1 != TYPE_UNSIGNED (type1))
2820     mult_rhs1 = build_and_insert_cast (gsi, loc,
2821 				       build_nonstandard_integer_type
2822 				         (actual_precision, from_unsigned1),
2823 				       mult_rhs1);
2824   if (actual_precision != TYPE_PRECISION (type2)
2825       || from_unsigned2 != TYPE_UNSIGNED (type2))
2826     mult_rhs2 = build_and_insert_cast (gsi, loc,
2827 				       build_nonstandard_integer_type
2828 					 (actual_precision, from_unsigned2),
2829 				       mult_rhs2);
2830 
2831   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2832     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2833 
2834   /* Handle constants.  */
2835   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2836     mult_rhs1 = fold_convert (type1, mult_rhs1);
2837   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2838     mult_rhs2 = fold_convert (type2, mult_rhs2);
2839 
2840   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2841 				  add_rhs);
2842   update_stmt (gsi_stmt (*gsi));
2843   widen_mul_stats.maccs_inserted++;
2844   return true;
2845 }
2846 
2847 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2848    OP2 and which we know is used in statements that can be, together with the
2849    multiplication, converted to FMAs, perform the transformation.  */
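/* E.g. (illustrative): for MUL_RESULT _1 = a_2 * b_3 with uses
     c_4 = _1 + d_5;  e_6 = d_5 - _1;
   the uses become
     c_4 = .FMA (a_2, b_3, d_5);  e_6 = .FNMA (a_2, b_3, d_5);
   the second form arising from the follow-up fold of the negated
   multiplication operand.  */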
2850 
2851 static void
2852 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2853 {
2854   tree type = TREE_TYPE (mul_result);
2855   gimple *use_stmt;
2856   imm_use_iterator imm_iter;
2857   gcall *fma_stmt;
2858 
2859   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2860     {
2861       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2862       tree addop, mulop1 = op1, result = mul_result;
2863       bool negate_p = false;
2864       gimple_seq seq = NULL;
2865 
2866       if (is_gimple_debug (use_stmt))
2867 	continue;
2868 
2869       if (is_gimple_assign (use_stmt)
2870 	  && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2871 	{
2872 	  result = gimple_assign_lhs (use_stmt);
2873 	  use_operand_p use_p;
2874 	  gimple *neguse_stmt;
2875 	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2876 	  gsi_remove (&gsi, true);
2877 	  release_defs (use_stmt);
2878 
2879 	  use_stmt = neguse_stmt;
2880 	  gsi = gsi_for_stmt (use_stmt);
2881 	  negate_p = true;
2882 	}
2883 
2884       tree cond, else_value, ops[3];
2885       tree_code code;
2886       if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2887 					      ops, &else_value))
2888 	gcc_unreachable ();
2889       addop = ops[0] == result ? ops[1] : ops[0];
2890 
2891       if (code == MINUS_EXPR)
2892 	{
2893 	  if (ops[0] == result)
2894 	    /* a * b - c -> a * b + (-c)  */
2895 	    addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2896 	  else
2897 	    /* a - b * c -> (-b) * c + a */
2898 	    negate_p = !negate_p;
2899 	}
2900 
2901       if (negate_p)
2902 	mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2903 
2904       if (seq)
2905 	gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2906 
2907       if (cond)
2908 	fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2909 					       op2, addop, else_value);
2910       else
2911 	fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2912       gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2913       gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2914 								   use_stmt));
2915       gsi_replace (&gsi, fma_stmt, true);
2916       /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2917 	 regardless of where the negation occurs.  */
2918       gimple *orig_stmt = gsi_stmt (gsi);
2919       if (fold_stmt (&gsi, follow_all_ssa_edges))
2920 	{
2921 	  if (maybe_clean_or_replace_eh_stmt (orig_stmt, gsi_stmt (gsi)))
2922 	    gcc_unreachable ();
2923 	  update_stmt (gsi_stmt (gsi));
2924 	}
2925 
2926       if (dump_file && (dump_flags & TDF_DETAILS))
2927 	{
2928 	  fprintf (dump_file, "Generated FMA ");
2929 	  print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2930 	  fprintf (dump_file, "\n");
2931 	}
2932 
2933       widen_mul_stats.fmas_inserted++;
2934     }
2935 }
2936 
2937 /* Data necessary to perform the actual transformation from a multiplication
2938    and an addition to an FMA once the decision is taken that it should be
2939    done, and to then delete the multiplication statement from the function IL.  */
2940 
2941 struct fma_transformation_info
2942 {
2943   gimple *mul_stmt;
2944   tree mul_result;
2945   tree op1;
2946   tree op2;
2947 };
2948 
2949 /* Structure containing the current state of FMA deferring, i.e. whether we are
2950    deferring, whether to continue deferring, and all data necessary to come
2951    back and perform all deferred transformations.  */
2952 
2953 class fma_deferring_state
2954 {
2955 public:
2956   /* Class constructor.  Pass true as PERFORM_DEFERRING in order to actually
2957      do any deferring.  */
2958 
2959   fma_deferring_state (bool perform_deferring)
2960     : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
2961       m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
2962 
2963   /* List of FMA candidates for which the transformation has been determined
2964      possible but which, at this point in the BB analysis, we do not consider
2965      beneficial.  */
2966   auto_vec<fma_transformation_info, 8> m_candidates;
2967 
2968   /* Set of multiplication results that are part of already deferred FMA
2969      candidates.  */
2970   hash_set<tree> m_mul_result_set;
2971 
2972   /* The PHI that supposedly feeds back the result of one FMA to another
2973      across a loop boundary.  */
2974   gphi *m_initial_phi;
2975 
2976   /* Result of the last produced FMA candidate or NULL if there has not been
2977      one.  */
2978   tree m_last_result;
2979 
2980   /* If true, deferring might still be profitable.  If false, transform all
2981      candidates and no longer defer.  */
2982   bool m_deferring_p;
2983 };
2984 
2985 /* Transform all deferred FMA candidates and mark STATE as no longer
2986    deferring.  */
2987 
2988 static void
2989 cancel_fma_deferring (fma_deferring_state *state)
2990 {
2991   if (!state->m_deferring_p)
2992     return;
2993 
2994   for (unsigned i = 0; i < state->m_candidates.length (); i++)
2995     {
2996       if (dump_file && (dump_flags & TDF_DETAILS))
2997 	fprintf (dump_file, "Generating deferred FMA\n");
2998 
2999       const fma_transformation_info &fti = state->m_candidates[i];
3000       convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
3001 
3002       gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
3003       gsi_remove (&gsi, true);
3004       release_defs (fti.mul_stmt);
3005     }
3006   state->m_deferring_p = false;
3007 }
3008 
3009 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
3010    Otherwise return NULL.  */
3011 
3012 static gphi *
3013 result_of_phi (tree op)
3014 {
3015   if (TREE_CODE (op) != SSA_NAME)
3016     return NULL;
3017 
3018   return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3019 }
3020 
3021 /* After processing statements of a BB and recording STATE, return true if the
3022    initial phi is fed by the last FMA candidate result or one such result from
3023    previously processed BBs marked in LAST_RESULT_SET.  */
3024 
3025 static bool
3026 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3027 				      hash_set<tree> *last_result_set)
3028 {
3029   ssa_op_iter iter;
3030   use_operand_p use;
3031   FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3032     {
3033       tree t = USE_FROM_PTR (use);
3034       if (t == state->m_last_result
3035 	  || last_result_set->contains (t))
3036 	return true;
3037     }
3038 
3039   return false;
3040 }
3041 
3042 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3043    with uses in additions and subtractions to form fused multiply-add
3044    operations.  Returns true if successful and MUL_STMT should be removed.
3045    If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional
3046    on MUL_COND, otherwise it is unconditional.
3047 
3048    If STATE indicates that we are deferring FMA transformation, that means
3049    that we do not produce FMAs for basic blocks which look like:
3050 
3051     <bb 6>
3052     # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3053     _65 = _14 * _16;
3054     accumulator_66 = _65 + accumulator_111;
3055 
3056   or its unrolled version, i.e. with several FMA candidates that feed the result
3057   of one into the addend of another.  Instead, we add them to a list in STATE
3058   and if we later discover an FMA candidate that is not part of such a chain,
3059   we go back and perform all deferred past candidates.  */
3060 
3061 static bool
3062 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3063 		     fma_deferring_state *state, tree mul_cond = NULL_TREE)
3064 {
3065   tree mul_result = gimple_get_lhs (mul_stmt);
3066   tree type = TREE_TYPE (mul_result);
3067   gimple *use_stmt, *neguse_stmt;
3068   use_operand_p use_p;
3069   imm_use_iterator imm_iter;
3070 
3071   if (FLOAT_TYPE_P (type)
3072       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3073     return false;
3074 
3075   /* We don't want to do bitfield reduction ops.  */
3076   if (INTEGRAL_TYPE_P (type)
3077       && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3078     return false;
3079 
3080   /* If the target doesn't support it, don't generate it.  We assume that
3081      if fma isn't available then fms, fnma or fnms are not either.  */
3082   optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3083   if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3084     return false;
3085 
3086   /* If the multiplication has zero uses, it is kept around probably because
3087      of -fnon-call-exceptions.  Don't optimize it away in that case,
3088      it is DCE's job.  */
3089   if (has_zero_uses (mul_result))
3090     return false;
3091 
3092   bool check_defer
3093     = (state->m_deferring_p
3094        && (tree_to_shwi (TYPE_SIZE (type))
3095 	   <= param_avoid_fma_max_bits));
3096   bool defer = check_defer;
3097   bool seen_negate_p = false;
3098   /* Make sure that the multiplication statement becomes dead after
3099      the transformation, thus that all uses are transformed to FMAs.
3100      This means we assume that an FMA operation has the same cost
3101      as an addition.  */
3102   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3103     {
3104       tree result = mul_result;
3105       bool negate_p = false;
3106 
3107       use_stmt = USE_STMT (use_p);
3108 
3109       if (is_gimple_debug (use_stmt))
3110 	continue;
3111 
3112       /* For now, restrict this operation to single basic blocks.  In theory
3113 	 we would want to support sinking the multiplication in
3114 	 m = a*b;
3115 	 if ()
3116 	   ma = m + c;
3117 	 else
3118 	   d = m;
3119 	 to form a fma in the then block and sink the multiplication to the
3120 	 else block.  */
3121       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3122 	return false;
3123 
3124       /* A negate on the multiplication leads to FNMA.  */
3125       if (is_gimple_assign (use_stmt)
3126 	  && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
3127 	{
3128 	  ssa_op_iter iter;
3129 	  use_operand_p usep;
3130 
3131 	  /* If (due to earlier missed optimizations) we have two
3132 	     negates of the same value, treat them as equivalent
3133 	     to a single negate with multiple uses.  */
3134 	  if (seen_negate_p)
3135 	    return false;
3136 
3137 	  result = gimple_assign_lhs (use_stmt);
3138 
3139 	  /* Make sure the negate statement becomes dead with this
3140 	     single transformation.  */
3141 	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
3142 			       &use_p, &neguse_stmt))
3143 	    return false;
3144 
3145 	  /* Make sure the multiplication isn't also used on that stmt.  */
3146 	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3147 	    if (USE_FROM_PTR (usep) == mul_result)
3148 	      return false;
3149 
3150 	  /* Re-validate.  */
3151 	  use_stmt = neguse_stmt;
3152 	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3153 	    return false;
3154 
3155 	  negate_p = seen_negate_p = true;
3156 	}
3157 
3158       tree cond, else_value, ops[3];
3159       tree_code code;
3160       if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
3161 					      &else_value))
3162 	return false;
3163 
3164       switch (code)
3165 	{
3166 	case MINUS_EXPR:
3167 	  if (ops[1] == result)
3168 	    negate_p = !negate_p;
3169 	  break;
3170 	case PLUS_EXPR:
3171 	  break;
3172 	default:
3173 	  /* FMA can only be formed from PLUS and MINUS.  */
3174 	  return false;
3175 	}
3176 
3177       if (mul_cond && cond != mul_cond)
3178 	return false;
3179 
3180       if (cond)
3181 	{
3182 	  if (cond == result || else_value == result)
3183 	    return false;
3184 	  if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
3185 	    return false;
3186 	}
3187 
3188       /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
3189 	 we'll visit later, we might be able to get a more profitable
3190 	 match with fnma.
3191 	 OTOH, if we don't, a negate / fma pair likely has lower latency
3192 	 than a mult / subtract pair.  */
3193       if (code == MINUS_EXPR
3194 	  && !negate_p
3195 	  && ops[0] == result
3196 	  && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
3197 	  && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
3198 	  && TREE_CODE (ops[1]) == SSA_NAME
3199 	  && has_single_use (ops[1]))
3200 	{
3201 	  gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
3202 	  if (is_gimple_assign (stmt2)
3203 	      && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3204 	    return false;
3205 	}
3206 
3207       /* We can't handle a * b + a * b.  */
3208       if (ops[0] == ops[1])
3209 	return false;
3210       /* If deferring, make sure we are not looking at an instruction that
3211 	 wouldn't have existed if we were not deferring.  */
3212       if (state->m_deferring_p
3213 	  && (state->m_mul_result_set.contains (ops[0])
3214 	      || state->m_mul_result_set.contains (ops[1])))
3215 	return false;
3216 
3217       if (check_defer)
3218 	{
3219 	  tree use_lhs = gimple_get_lhs (use_stmt);
3220 	  if (state->m_last_result)
3221 	    {
3222 	      if (ops[1] == state->m_last_result
3223 		  || ops[0] == state->m_last_result)
3224 		defer = true;
3225 	      else
3226 		defer = false;
3227 	    }
3228 	  else
3229 	    {
3230 	      gcc_checking_assert (!state->m_initial_phi);
3231 	      gphi *phi;
3232 	      if (ops[0] == result)
3233 		phi = result_of_phi (ops[1]);
3234 	      else
3235 		{
3236 		  gcc_assert (ops[1] == result);
3237 		  phi = result_of_phi (ops[0]);
3238 		}
3239 
3240 	      if (phi)
3241 		{
3242 		  state->m_initial_phi = phi;
3243 		  defer = true;
3244 		}
3245 	      else
3246 		defer = false;
3247 	    }
3248 
3249 	  state->m_last_result = use_lhs;
3250 	  check_defer = false;
3251 	}
3252       else
3253 	defer = false;
3254 
3255       /* While it is possible to validate whether or not the exact form that
3256 	 we've recognized is available in the backend, the assumption is that
3257 	 if the deferring logic above did not trigger, the transformation is
3258 	 never a loss.  For instance, suppose the target only has the plain FMA
3259 	 pattern available.  Consider a*b-c -> fma(a,b,-c): we've exchanged
3260 	 MUL+SUB for FMA+NEG, which is still two operations.  Consider
3261          -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3262 	 form the two NEGs are independent and could be run in parallel.  */
3263     }
3264 
3265   if (defer)
3266     {
3267       fma_transformation_info fti;
3268       fti.mul_stmt = mul_stmt;
3269       fti.mul_result = mul_result;
3270       fti.op1 = op1;
3271       fti.op2 = op2;
3272       state->m_candidates.safe_push (fti);
3273       state->m_mul_result_set.add (mul_result);
3274 
3275       if (dump_file && (dump_flags & TDF_DETAILS))
3276 	{
3277 	  fprintf (dump_file, "Deferred generating FMA for multiplication ");
3278 	  print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
3279 	  fprintf (dump_file, "\n");
3280 	}
3281 
3282       return false;
3283     }
3284   else
3285     {
3286       if (state->m_deferring_p)
3287 	cancel_fma_deferring (state);
3288       convert_mult_to_fma_1 (mul_result, op1, op2);
3289       return true;
3290     }
3291 }
3292 
3293 
3294 /* Helper function of match_uaddsub_overflow.  Return 1
3295    if USE_STMT is an unsigned overflow check of the form ovf != 0 for
3296    STMT, -1 if USE_STMT is an unsigned overflow check of the form
3297    ovf == 0, and 0 otherwise.  */
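
/* For instance (an illustrative sketch, not from any testcase), given

	unsigned r = a + b;
	if (a > r)	// overflow happened: this function returns 1
	  ...
	if (a <= r)	// no overflow: this function returns -1
	  ...

   the comparisons match the PLUS_EXPR forms enumerated in the switch
   below.  */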
3298 
3299 static int
3300 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3301 {
3302   enum tree_code ccode = ERROR_MARK;
3303   tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3304   if (gimple_code (use_stmt) == GIMPLE_COND)
3305     {
3306       ccode = gimple_cond_code (use_stmt);
3307       crhs1 = gimple_cond_lhs (use_stmt);
3308       crhs2 = gimple_cond_rhs (use_stmt);
3309     }
3310   else if (is_gimple_assign (use_stmt))
3311     {
3312       if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3313 	{
3314 	  ccode = gimple_assign_rhs_code (use_stmt);
3315 	  crhs1 = gimple_assign_rhs1 (use_stmt);
3316 	  crhs2 = gimple_assign_rhs2 (use_stmt);
3317 	}
3318       else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3319 	{
3320 	  tree cond = gimple_assign_rhs1 (use_stmt);
3321 	  if (COMPARISON_CLASS_P (cond))
3322 	    {
3323 	      ccode = TREE_CODE (cond);
3324 	      crhs1 = TREE_OPERAND (cond, 0);
3325 	      crhs2 = TREE_OPERAND (cond, 1);
3326 	    }
3327 	  else
3328 	    return 0;
3329 	}
3330       else
3331 	return 0;
3332     }
3333   else
3334     return 0;
3335 
3336   if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3337     return 0;
3338 
3339   enum tree_code code = gimple_assign_rhs_code (stmt);
3340   tree lhs = gimple_assign_lhs (stmt);
3341   tree rhs1 = gimple_assign_rhs1 (stmt);
3342   tree rhs2 = gimple_assign_rhs2 (stmt);
3343 
3344   switch (ccode)
3345     {
3346     case GT_EXPR:
3347     case LE_EXPR:
3348       /* r = a - b; r > a or r <= a
3349 	 r = a + b; a > r or a <= r or b > r or b <= r.  */
3350       if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3351 	  || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3352 	      && crhs2 == lhs))
3353 	return ccode == GT_EXPR ? 1 : -1;
3354       break;
3355     case LT_EXPR:
3356     case GE_EXPR:
3357       /* r = a - b; a < r or a >= r
3358 	 r = a + b; r < a or r >= a or r < b or r >= b.  */
3359       if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3360 	  || (code == PLUS_EXPR && crhs1 == lhs
3361 	      && (crhs2 == rhs1 || crhs2 == rhs2)))
3362 	return ccode == LT_EXPR ? 1 : -1;
3363       break;
3364     default:
3365       break;
3366     }
3367   return 0;
3368 }
3369 
3370 /* Recognize for unsigned x
3371    x = y - z;
3372    if (x > y)
3373    where there are other uses of x and replace it with
3374    _7 = SUB_OVERFLOW (y, z);
3375    x = REALPART_EXPR <_7>;
3376    _8 = IMAGPART_EXPR <_7>;
3377    if (_8)
3378    and similarly for addition.  */
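
/* At the source level this corresponds to code such as the following
   hypothetical example:

	unsigned
	sub_check (unsigned y, unsigned z, unsigned *res)
	{
	  unsigned x = y - z;
	  *res = x;		// the "other use" of x
	  return x > y;		// the overflow check
	}

   which the transform rewrites to a single IFN_SUB_OVERFLOW call whose
   real part is the difference and whose imaginary part is the overflow
   flag.  */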
3379 
3380 static bool
3381 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3382 			enum tree_code code)
3383 {
3384   tree lhs = gimple_assign_lhs (stmt);
3385   tree type = TREE_TYPE (lhs);
3386   use_operand_p use_p;
3387   imm_use_iterator iter;
3388   bool use_seen = false;
3389   bool ovf_use_seen = false;
3390   gimple *use_stmt;
3391 
3392   gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3393   if (!INTEGRAL_TYPE_P (type)
3394       || !TYPE_UNSIGNED (type)
3395       || has_zero_uses (lhs)
3396       || has_single_use (lhs)
3397       || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3398 			TYPE_MODE (type)) == CODE_FOR_nothing)
3399     return false;
3400 
3401   FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3402     {
3403       use_stmt = USE_STMT (use_p);
3404       if (is_gimple_debug (use_stmt))
3405 	continue;
3406 
3407       if (uaddsub_overflow_check_p (stmt, use_stmt))
3408 	ovf_use_seen = true;
3409       else
3410 	use_seen = true;
3411       if (ovf_use_seen && use_seen)
3412 	break;
3413     }
3414 
3415   if (!ovf_use_seen || !use_seen)
3416     return false;
3417 
3418   tree ctype = build_complex_type (type);
3419   tree rhs1 = gimple_assign_rhs1 (stmt);
3420   tree rhs2 = gimple_assign_rhs2 (stmt);
3421   gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3422 					 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3423 					 2, rhs1, rhs2);
3424   tree ctmp = make_ssa_name (ctype);
3425   gimple_call_set_lhs (g, ctmp);
3426   gsi_insert_before (gsi, g, GSI_SAME_STMT);
3427   gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3428 				     build1 (REALPART_EXPR, type, ctmp));
3429   gsi_replace (gsi, g2, true);
3430   tree ovf = make_ssa_name (type);
3431   g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3432 			    build1 (IMAGPART_EXPR, type, ctmp));
3433   gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3434 
3435   FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3436     {
3437       if (is_gimple_debug (use_stmt))
3438 	continue;
3439 
3440       int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3441       if (ovf_use == 0)
3442 	continue;
3443       if (gimple_code (use_stmt) == GIMPLE_COND)
3444 	{
3445 	  gcond *cond_stmt = as_a <gcond *> (use_stmt);
3446 	  gimple_cond_set_lhs (cond_stmt, ovf);
3447 	  gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3448 	  gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3449 	}
3450       else
3451 	{
3452 	  gcc_checking_assert (is_gimple_assign (use_stmt));
3453 	  if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3454 	    {
3455 	      gimple_assign_set_rhs1 (use_stmt, ovf);
3456 	      gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3457 	      gimple_assign_set_rhs_code (use_stmt,
3458 					  ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3459 	    }
3460 	  else
3461 	    {
3462 	      gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3463 				   == COND_EXPR);
3464 	      tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3465 				  boolean_type_node, ovf,
3466 				  build_int_cst (type, 0));
3467 	      gimple_assign_set_rhs1 (use_stmt, cond);
3468 	    }
3469 	}
3470       update_stmt (use_stmt);
3471     }
3472   return true;
3473 }
3474 
3475 /* Return true if the target has support for divmod.  */
3476 
3477 static bool
3478 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3479 {
3480   /* If target supports hardware divmod insn, use it for divmod.  */
3481   if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3482     return true;
3483 
3484   /* Check if libfunc for divmod is available.  */
3485   rtx libfunc = optab_libfunc (divmod_optab, mode);
3486   if (libfunc != NULL_RTX)
3487     {
3488       /* If optab_handler exists for div_optab, perhaps in a wider mode,
3489 	 we don't want to use the libfunc even if it exists for the given mode.  */
3490       machine_mode div_mode;
3491       FOR_EACH_MODE_FROM (div_mode, mode)
3492 	if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3493 	  return false;
3494 
3495       return targetm.expand_divmod_libfunc != NULL;
3496     }
3497 
3498   return false;
3499 }
3500 
3501 /* Check if stmt is candidate for divmod transform.  */
3502 
3503 static bool
3504 divmod_candidate_p (gassign *stmt)
3505 {
3506   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3507   machine_mode mode = TYPE_MODE (type);
3508   optab divmod_optab, div_optab;
3509 
3510   if (TYPE_UNSIGNED (type))
3511     {
3512       divmod_optab = udivmod_optab;
3513       div_optab = udiv_optab;
3514     }
3515   else
3516     {
3517       divmod_optab = sdivmod_optab;
3518       div_optab = sdiv_optab;
3519     }
3520 
3521   tree op1 = gimple_assign_rhs1 (stmt);
3522   tree op2 = gimple_assign_rhs2 (stmt);
3523 
3524   /* Disable the transform if either operand is a constant, since
3525      division-by-constant may have specialized expansion.  */
3526   if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3527     return false;
3528 
3529   /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3530      expand using the [su]divv optabs.  */
3531   if (TYPE_OVERFLOW_TRAPS (type))
3532     return false;
3533 
3534   if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3535     return false;
3536 
3537   return true;
3538 }
3539 
3540 /* This function looks for:
3541    t1 = a TRUNC_DIV_EXPR b;
3542    t2 = a TRUNC_MOD_EXPR b;
3543    and transforms it to the following sequence:
3544    complex_tmp = DIVMOD (a, b);
3545    t1 = REALPART_EXPR (complex_tmp);
3546    t2 = IMAGPART_EXPR (complex_tmp);
3547    For conditions enabling the transform see divmod_candidate_p().
3548 
3549    The pass has three parts:
3550    1) Find top_stmt, a trunc_div or trunc_mod stmt that dominates all the
3551       other trunc_div_expr and trunc_mod_expr stmts.
3552    2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
3553       to stmts vector.
3554    3) Insert a DIVMOD call just before top_stmt and update entries in
3555       stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
3556       IMAGPART_EXPR for mod).  */
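
/* For instance (an illustrative sketch, not from any testcase):

	void
	split (int a, int b, int *q, int *r)
	{
	  *q = a / b;
	  *r = a % b;
	}

   computes the quotient and the remainder of the same operands, so both
   statements can be rewritten to read from a single DIVMOD result.  */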
3557 
3558 static bool
3559 convert_to_divmod (gassign *stmt)
3560 {
3561   if (stmt_can_throw_internal (cfun, stmt)
3562       || !divmod_candidate_p (stmt))
3563     return false;
3564 
3565   tree op1 = gimple_assign_rhs1 (stmt);
3566   tree op2 = gimple_assign_rhs2 (stmt);
3567 
3568   imm_use_iterator use_iter;
3569   gimple *use_stmt;
3570   auto_vec<gimple *> stmts;
3571 
3572   gimple *top_stmt = stmt;
3573   basic_block top_bb = gimple_bb (stmt);
3574 
3575   /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
3576      at least stmt and possibly other trunc_div/trunc_mod stmts
3577      having the same operands as stmt.  */
3578 
3579   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
3580     {
3581       if (is_gimple_assign (use_stmt)
3582 	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3583 	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3584 	  && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
3585 	  && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
3586 	{
3587 	  if (stmt_can_throw_internal (cfun, use_stmt))
3588 	    continue;
3589 
3590 	  basic_block bb = gimple_bb (use_stmt);
3591 
3592 	  if (bb == top_bb)
3593 	    {
3594 	      if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
3595 		top_stmt = use_stmt;
3596 	    }
3597 	  else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
3598 	    {
3599 	      top_bb = bb;
3600 	      top_stmt = use_stmt;
3601 	    }
3602 	}
3603     }
3604 
3605   tree top_op1 = gimple_assign_rhs1 (top_stmt);
3606   tree top_op2 = gimple_assign_rhs2 (top_stmt);
3607 
3608   stmts.safe_push (top_stmt);
3609   bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
3610 
3611   /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
3612      to stmts vector. The 2nd loop will always add stmt to stmts vector, since
3613      gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
3614      2nd loop ends up adding at least one trunc_mod_expr stmt.  */
3615 
3616   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
3617     {
3618       if (is_gimple_assign (use_stmt)
3619 	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3620 	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3621 	  && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
3622 	  && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
3623 	{
3624 	  if (use_stmt == top_stmt
3625 	      || stmt_can_throw_internal (cfun, use_stmt)
3626 	      || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
3627 	    continue;
3628 
3629 	  stmts.safe_push (use_stmt);
3630 	  if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
3631 	    div_seen = true;
3632 	}
3633     }
3634 
3635   if (!div_seen)
3636     return false;
3637 
3638   /* Part 3: Create a call to the internal fn DIVMOD:
3639      divmod_tmp = DIVMOD (op1, op2).  */
3640 
3641   gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
3642   tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
3643 				 call_stmt, "divmod_tmp");
3644   gimple_call_set_lhs (call_stmt, res);
3645   /* We rejected throwing statements above.  */
3646   gimple_call_set_nothrow (call_stmt, true);
3647 
3648   /* Insert the call before top_stmt.  */
3649   gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
3650   gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
3651 
3652   widen_mul_stats.divmod_calls_inserted++;
3653 
3654   /* Update all statements in stmts vector:
3655      lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
3656      lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>.  */
3657 
3658   for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
3659     {
3660       tree new_rhs;
3661 
3662       switch (gimple_assign_rhs_code (use_stmt))
3663 	{
3664 	  case TRUNC_DIV_EXPR:
3665 	    new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
3666 	    break;
3667 
3668 	  case TRUNC_MOD_EXPR:
3669 	    new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
3670 	    break;
3671 
3672 	  default:
3673 	    gcc_unreachable ();
3674 	}
3675 
3676       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3677       gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
3678       update_stmt (use_stmt);
3679     }
3680 
3681   return true;
3682 }
3683 
3684 /* Find integer multiplications where the operands are extended from
3685    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3686    where appropriate.  */
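
/* A typical source-level trigger (hypothetical example):

	long long
	wmul (int a, int b)
	{
	  return (long long) a * b;
	}

   Here both operands are sign-extended before the MULT_EXPR, so the whole
   computation can be carried out by one widening multiply instruction.  */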
3687 
3688 namespace {
3689 
3690 const pass_data pass_data_optimize_widening_mul =
3691 {
3692   GIMPLE_PASS, /* type */
3693   "widening_mul", /* name */
3694   OPTGROUP_NONE, /* optinfo_flags */
3695   TV_TREE_WIDEN_MUL, /* tv_id */
3696   PROP_ssa, /* properties_required */
3697   0, /* properties_provided */
3698   0, /* properties_destroyed */
3699   0, /* todo_flags_start */
3700   TODO_update_ssa, /* todo_flags_finish */
3701 };
3702 
3703 class pass_optimize_widening_mul : public gimple_opt_pass
3704 {
3705 public:
3706   pass_optimize_widening_mul (gcc::context *ctxt)
3707     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3708   {}
3709 
3710   /* opt_pass methods: */
3711   virtual bool gate (function *)
3712     {
3713       return flag_expensive_optimizations && optimize;
3714     }
3715 
3716   virtual unsigned int execute (function *);
3717 
3718 }; // class pass_optimize_widening_mul
3719 
3720 /* Walker class to perform the transformation in reverse dominance order. */
3721 
3722 class math_opts_dom_walker : public dom_walker
3723 {
3724 public:
3725   /* Constructor.  CFG_CHANGED_P is a pointer to a boolean flag that will be
3726      set if walking modifies the CFG.  */
3727 
3728   math_opts_dom_walker (bool *cfg_changed_p)
3729     : dom_walker (CDI_DOMINATORS), m_last_result_set (),
3730       m_cfg_changed_p (cfg_changed_p) {}
3731 
3732   /* The actual actions performed in the walk.  */
3733 
3734   virtual void after_dom_children (basic_block);
3735 
3736   /* Set of results of chains of multiply and add statement combinations that
3737      were not transformed into FMAs because of active deferring.  */
3738   hash_set<tree> m_last_result_set;
3739 
3740   /* Pointer to a flag of the user that needs to be set if CFG has been
3741      modified.  */
3742   bool *m_cfg_changed_p;
3743 };
3744 
3745 void
3746 math_opts_dom_walker::after_dom_children (basic_block bb)
3747 {
3748   gimple_stmt_iterator gsi;
3749 
3750   fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);
3751 
3752   for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3753     {
3754       gimple *stmt = gsi_stmt (gsi);
3755       enum tree_code code;
3756 
3757       if (is_gimple_assign (stmt))
3758 	{
3759 	  code = gimple_assign_rhs_code (stmt);
3760 	  switch (code)
3761 	    {
3762 	    case MULT_EXPR:
3763 	      if (!convert_mult_to_widen (stmt, &gsi)
3764 		  && !convert_expand_mult_copysign (stmt, &gsi)
3765 		  && convert_mult_to_fma (stmt,
3766 					  gimple_assign_rhs1 (stmt),
3767 					  gimple_assign_rhs2 (stmt),
3768 					  &fma_state))
3769 		{
3770 		  gsi_remove (&gsi, true);
3771 		  release_defs (stmt);
3772 		  continue;
3773 		}
3774 	      break;
3775 
3776 	    case PLUS_EXPR:
3777 	    case MINUS_EXPR:
3778 	      if (!convert_plusminus_to_widen (&gsi, stmt, code))
3779 		match_uaddsub_overflow (&gsi, stmt, code);
3780 	      break;
3781 
3782 	    case TRUNC_MOD_EXPR:
3783 	      convert_to_divmod (as_a<gassign *> (stmt));
3784 	      break;
3785 
3786 	    default:;
3787 	    }
3788 	}
3789       else if (is_gimple_call (stmt))
3790 	{
3791 	  switch (gimple_call_combined_fn (stmt))
3792 	    {
3793 	    CASE_CFN_POW:
3794 	      if (gimple_call_lhs (stmt)
3795 		  && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3796 		  && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3797 				 &dconst2)
3798 		  && convert_mult_to_fma (stmt,
3799 					  gimple_call_arg (stmt, 0),
3800 					  gimple_call_arg (stmt, 0),
3801 					  &fma_state))
3802 		{
3803 		  unlink_stmt_vdef (stmt);
3804 		  if (gsi_remove (&gsi, true)
3805 		      && gimple_purge_dead_eh_edges (bb))
3806 		    *m_cfg_changed_p = true;
3807 		  release_defs (stmt);
3808 		  continue;
3809 		}
3810 	      break;
3811 
3812 	    case CFN_COND_MUL:
3813 	      if (convert_mult_to_fma (stmt,
3814 				       gimple_call_arg (stmt, 1),
3815 				       gimple_call_arg (stmt, 2),
3816 				       &fma_state,
3817 				       gimple_call_arg (stmt, 0)))
3818 
3819 		{
3820 		  gsi_remove (&gsi, true);
3821 		  release_defs (stmt);
3822 		  continue;
3823 		}
3824 	      break;
3825 
3826 	    case CFN_LAST:
3827 	      cancel_fma_deferring (&fma_state);
3828 	      break;
3829 
3830 	    default:
3831 	      break;
3832 	    }
3833 	}
3834       gsi_next (&gsi);
3835     }
3836   if (fma_state.m_deferring_p
3837       && fma_state.m_initial_phi)
3838     {
3839       gcc_checking_assert (fma_state.m_last_result);
3840       if (!last_fma_candidate_feeds_initial_phi (&fma_state,
3841 						 &m_last_result_set))
3842 	cancel_fma_deferring (&fma_state);
3843       else
3844 	m_last_result_set.add (fma_state.m_last_result);
3845     }
3846 }
3847 
3848 
3849 unsigned int
3850 pass_optimize_widening_mul::execute (function *fun)
3851 {
3852   bool cfg_changed = false;
3853 
3854   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3855   calculate_dominance_info (CDI_DOMINATORS);
3856   renumber_gimple_stmt_uids (cfun);
3857 
3858   math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3859 
3860   statistics_counter_event (fun, "widening multiplications inserted",
3861 			    widen_mul_stats.widen_mults_inserted);
3862   statistics_counter_event (fun, "widening maccs inserted",
3863 			    widen_mul_stats.maccs_inserted);
3864   statistics_counter_event (fun, "fused multiply-adds inserted",
3865 			    widen_mul_stats.fmas_inserted);
3866   statistics_counter_event (fun, "divmod calls inserted",
3867 			    widen_mul_stats.divmod_calls_inserted);
3868 
3869   return cfg_changed ? TODO_cleanup_cfg : 0;
3870 }
3871 
3872 } // anon namespace
3873 
3874 gimple_opt_pass *
3875 make_pass_optimize_widening_mul (gcc::context *ctxt)
3876 {
3877   return new pass_optimize_widening_mul (ctxt);
3878 }
3879