1 /* Global, SSA-based optimizations using mathematical identities.
2    Copyright (C) 2005-2018 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 /* One of the mini-passes in this file tries to CSE reciprocal
21    operations.  These are common in sequences such as this one:
22 
23 	modulus = sqrt(x*x + y*y + z*z);
24 	x = x / modulus;
25 	y = y / modulus;
26 	z = z / modulus;
27 
28    that can be optimized to
29 
30 	modulus = sqrt(x*x + y*y + z*z);
31 	rmodulus = 1.0 / modulus;
32 	x = x * rmodulus;
33 	y = y * rmodulus;
34 	z = z * rmodulus;
35 
36    We do this for loop invariant divisors, and with this pass whenever
37    we notice that a division has the same divisor multiple times.
38 
39    Of course, like in PRE, we don't insert a division if a dominator
40    already has one.  However, this cannot be done as an extension of
41    PRE for several reasons.
42 
43    First of all, experiments have shown that the transformation is not
44    always useful if there are only two divisions by the same divisor.
45    This is probably because modern processors can pipeline the
46    divisions; on older, in-order processors it should still be
47    effective to optimize two divisions by the same number.
48    We make this a param, and it shall be called N in the remainder of
49    this comment.
50 
51    Second, if trapping math is active, we have less freedom on where
52    to insert divisions: we can only do so in basic blocks that already
53    contain one.  (If divisions don't trap, however, we can also insert
54    divisions elsewhere, namely in blocks that are common dominators of
55    those that contain a division.)
56 
57    We really don't want to compute the reciprocal unless a division will
58    be found.  To do this, we won't insert the reciprocal in a basic
59    block that has fewer than N divisions *post-dominating* it.
60 
61    The algorithm constructs a subset of the dominator tree, holding the
62    blocks containing the divisions and their common dominators, and
63    walks it twice.  The first walk is in post-order, and it annotates
64    each block with the number of divisions that post-dominate it: this
65    gives information on where divisions can be inserted profitably.
66    The second walk is in pre-order, and it inserts divisions as explained
67    above, and replaces divisions by multiplications.
68 
69    In the best case, the cost of the pass is O(n_statements).  In the
70    worst-case, the cost is due to creating the dominator tree subset,
71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
72    for n_statements / n_basic_blocks statements.  So, the amortized cost
73    of creating the dominator tree subset is O(n_basic_blocks) and the
74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
75 
76    More practically, the cost will be small because there are few
77    divisions, and they tend to be in the same basic block, so insert_bb
78    is called very few times.
79 
80    If we did this using domwalk.c, an efficient implementation would have
81    to work on all the variables in a single pass, because we could not
82    work on just a subset of the dominator tree, as we do now, and the
83    cost would also be something like O(n_statements * n_basic_blocks).
84    The data structures would be more complex in order to work on all the
85    variables in a single pass.  */
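
/* As a concrete illustration (assuming a hypothetical target whose
   min_divisions_for_recip_mul threshold N is 2, and non-trapping math):

	x = a / d;
	y = b / d;

   in a single basic block becomes

	recip = 1.0 / d;
	x = a * recip;
	y = b * recip;

   while a lone division by d is left alone, since converting it would
   still cost one division (the reciprocal) plus an extra multiply.  */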
86 
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "params.h"
113 #include "internal-fn.h"
114 #include "case-cfn-macros.h"
115 #include "optabs-libfuncs.h"
116 #include "tree-eh.h"
117 #include "targhooks.h"
118 #include "domwalk.h"
119 
120 /* This structure represents one basic block that either computes a
121    division, or is a common dominator for basic blocks that compute a
122    division.  */
123 struct occurrence {
124   /* The basic block represented by this structure.  */
125   basic_block bb;
126 
127   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
128      inserted in BB.  */
129   tree recip_def;
130 
131   /* If non-NULL, the SSA_NAME holding the definition for a squared
132      reciprocal inserted in BB.  */
133   tree square_recip_def;
134 
135   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
136      was inserted in BB.  */
137   gimple *recip_def_stmt;
138 
139   /* Pointer to a list of "struct occurrence"s for blocks dominated
140      by BB.  */
141   struct occurrence *children;
142 
143   /* Pointer to the next "struct occurrence" in the list of blocks
144      sharing a common dominator.  */
145   struct occurrence *next;
146 
147   /* The number of divisions that are in BB before compute_merit.  The
148      number of divisions that are in BB or post-dominate it after
149      compute_merit.  */
150   int num_divisions;
151 
152   /* True if the basic block has a division, false if it is a common
153      dominator for basic blocks that do.  If it is false and trapping
154      math is active, BB is not a candidate for inserting a reciprocal.  */
155   bool bb_has_division;
156 };
157 
158 static struct
159 {
160   /* Number of 1.0/X ops inserted.  */
161   int rdivs_inserted;
162 
163   /* Number of 1.0/FUNC ops inserted.  */
164   int rfuncs_inserted;
165 } reciprocal_stats;
166 
167 static struct
168 {
169   /* Number of cexpi calls inserted.  */
170   int inserted;
171 } sincos_stats;
172 
173 static struct
174 {
175   /* Number of widening multiplication ops inserted.  */
176   int widen_mults_inserted;
177 
178   /* Number of integer multiply-and-accumulate ops inserted.  */
179   int maccs_inserted;
180 
181   /* Number of fp fused multiply-add ops inserted.  */
182   int fmas_inserted;
183 
184   /* Number of divmod calls inserted.  */
185   int divmod_calls_inserted;
186 } widen_mul_stats;
187 
188 /* The instance of "struct occurrence" representing the highest
189    interesting block in the dominator tree.  */
190 static struct occurrence *occ_head;
191 
192 /* Allocation pool for getting instances of "struct occurrence".  */
193 static object_allocator<occurrence> *occ_pool;
194 
195 
196 
197 /* Allocate and return a new struct occurrence for basic block BB,
198    whose children list is headed by CHILDREN.  */
199 static struct occurrence *
200 occ_new (basic_block bb, struct occurrence *children)
201 {
202   struct occurrence *occ;
203 
204   bb->aux = occ = occ_pool->allocate ();
205   memset (occ, 0, sizeof (struct occurrence));
206 
207   occ->bb = bb;
208   occ->children = children;
209   return occ;
210 }
211 
212 
213 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
214    list of "struct occurrence"s, one per basic block, having IDOM as
215    their common dominator.
216 
217    We try to insert NEW_OCC as deep as possible in the tree, and we also
218    insert any other block that is a common dominator for BB and one
219    block already in the tree.  */
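
/* For illustration (hypothetical block numbers): if the list under IDOM
   already holds nodes for B4 and B7 and NEW_OCC represents B5, then:
   if B4 dominates B5 we tail recurse into B4's children; if B5
   dominates B4, B4 is moved onto NEW_OCC's children list; and if B4 and
   B5 have a nearest common dominator B3 strictly below IDOM, a fresh
   node for B3 is created with both of them as its children.  */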
220 
221 static void
222 insert_bb (struct occurrence *new_occ, basic_block idom,
223 	   struct occurrence **p_head)
224 {
225   struct occurrence *occ, **p_occ;
226 
227   for (p_occ = p_head; (occ = *p_occ) != NULL; )
228     {
229       basic_block bb = new_occ->bb, occ_bb = occ->bb;
230       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
231       if (dom == bb)
232 	{
233 	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
234 	     from its list.  */
235 	  *p_occ = occ->next;
236 	  occ->next = new_occ->children;
237 	  new_occ->children = occ;
238 
239 	  /* Try the next block (it may as well be dominated by BB).  */
240 	}
241 
242       else if (dom == occ_bb)
243 	{
244 	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
245 	  insert_bb (new_occ, dom, &occ->children);
246 	  return;
247 	}
248 
249       else if (dom != idom)
250 	{
251 	  gcc_assert (!dom->aux);
252 
253 	  /* There is a dominator between IDOM and BB, add it and make
254 	     two children out of NEW_OCC and OCC.  First, remove OCC from
255 	     its list.	*/
256 	  *p_occ = occ->next;
257 	  new_occ->next = occ;
258 	  occ->next = NULL;
259 
260 	  /* None of the previous blocks has DOM as a dominator: if we tail
261 	     recursed, we would reexamine them uselessly. Just switch BB with
262 	     DOM, and go on looking for blocks dominated by DOM.  */
263           new_occ = occ_new (dom, new_occ);
264 	}
265 
266       else
267 	{
268 	  /* Nothing special, go on with the next element.  */
269 	  p_occ = &occ->next;
270 	}
271     }
272 
273   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
274   new_occ->next = *p_head;
275   *p_head = new_occ;
276 }
277 
278 /* Register that we found a division in BB.
279    IMPORTANCE is a measure of how much weighting to give
280    that division.  Use IMPORTANCE = 2 to register a single
281    division.  If the division is going to be found multiple
282    times use 1 (as it is with squares).  */
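
/* For example (illustrative): for  a = b / x  the caller registers the
   division once with IMPORTANCE 2, while for  t = x * x;  a = b / t;
   the immediate-use walk in execute_cse_reciprocals_1 reaches the
   division once through each operand of the square, registering it
   twice with IMPORTANCE 1.  Either way NUM_DIVISIONS grows by 2.  */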
283 
284 static inline void
285 register_division_in (basic_block bb, int importance)
286 {
287   struct occurrence *occ;
288 
289   occ = (struct occurrence *) bb->aux;
290   if (!occ)
291     {
292       occ = occ_new (bb, NULL);
293       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
294     }
295 
296   occ->bb_has_division = true;
297   occ->num_divisions += importance;
298 }
299 
300 
301 /* Compute the number of divisions that postdominate each block in OCC and
302    its children.  */
303 
304 static void
305 compute_merit (struct occurrence *occ)
306 {
307   struct occurrence *occ_child;
308   basic_block dom = occ->bb;
309 
310   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
311     {
312       basic_block bb;
313       if (occ_child->children)
314         compute_merit (occ_child);
315 
316       if (flag_exceptions)
317 	bb = single_noncomplex_succ (dom);
318       else
319 	bb = dom;
320 
321       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
322         occ->num_divisions += occ_child->num_divisions;
323     }
324 }
325 
326 
327 /* Return whether USE_STMT is a floating-point division by DEF.  */
328 static inline bool
329 is_division_by (gimple *use_stmt, tree def)
330 {
331   return is_gimple_assign (use_stmt)
332 	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
333 	 && gimple_assign_rhs2 (use_stmt) == def
334 	 /* Do not recognize x / x as a valid division, as we would get
335 	    confused later by replacing all immediate uses of x in such
336 	    a stmt.  */
337 	 && gimple_assign_rhs1 (use_stmt) != def
338 	 && !stmt_can_throw_internal (use_stmt);
339 }
340 
341 /* Return whether USE_STMT is DEF * DEF.  */
342 static inline bool
343 is_square_of (gimple *use_stmt, tree def)
344 {
345   if (gimple_code (use_stmt) == GIMPLE_ASSIGN
346       && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
347     {
348       tree op0 = gimple_assign_rhs1 (use_stmt);
349       tree op1 = gimple_assign_rhs2 (use_stmt);
350 
351       return op0 == op1 && op0 == def;
352     }
353   return false;
354 }
355 
356 /* Return whether USE_STMT is a floating-point division by
357    DEF * DEF.  */
358 static inline bool
359 is_division_by_square (gimple *use_stmt, tree def)
360 {
361   if (gimple_code (use_stmt) == GIMPLE_ASSIGN
362       && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
363       && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
364       && !stmt_can_throw_internal (use_stmt))
365     {
366       tree denominator = gimple_assign_rhs2 (use_stmt);
367       if (TREE_CODE (denominator) == SSA_NAME)
368 	return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
369     }
370   return false;
371 }
372 
373 /* Walk the subset of the dominator tree rooted at OCC, setting the
374    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
375    the given basic block.  The field may be left NULL, of course,
376    if it is not possible or profitable to do the optimization.
377 
378    DEF_BSI is an iterator pointing at the statement defining DEF.
379    If RECIP_DEF is set, a dominator already has a computation that can
380    be used.
381 
382    If SHOULD_INSERT_SQUARE_RECIP is set, then this also inserts
383    the square of the reciprocal immediately after the definition
384    of the reciprocal.  */
385 
386 static void
387 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
388 		    tree def, tree recip_def, tree square_recip_def,
389 		    int should_insert_square_recip, int threshold)
390 {
391   tree type;
392   gassign *new_stmt, *new_square_stmt;
393   gimple_stmt_iterator gsi;
394   struct occurrence *occ_child;
395 
396   if (!recip_def
397       && (occ->bb_has_division || !flag_trapping_math)
398       /* Divide by two as all divisions are counted twice in
399 	 the costing loop.  */
400       && occ->num_divisions / 2 >= threshold)
401     {
402       /* Make a variable with the replacement and substitute it.  */
403       type = TREE_TYPE (def);
404       recip_def = create_tmp_reg (type, "reciptmp");
405       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
406 				      build_one_cst (type), def);
407 
408       if (should_insert_square_recip)
409 	{
410 	  square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
411 	  new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
412 						 recip_def, recip_def);
413 	}
414 
415       if (occ->bb_has_division)
416 	{
417 	  /* Case 1: insert before an existing division.  */
418 	  gsi = gsi_after_labels (occ->bb);
419 	  while (!gsi_end_p (gsi)
420 		 && (!is_division_by (gsi_stmt (gsi), def))
421 		 && (!is_division_by_square (gsi_stmt (gsi), def)))
422 	    gsi_next (&gsi);
423 
424 	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
425 	  if (should_insert_square_recip)
426 	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
427 	}
428       else if (def_gsi && occ->bb == def_gsi->bb)
429 	{
430 	  /* Case 2: insert right after the definition.  Note that this will
431 	     never happen if the definition statement can throw, because in
432 	     that case the sole successor of the statement's basic block will
433 	     dominate all the uses as well.  */
434 	  gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
435 	  if (should_insert_square_recip)
436 	    gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
437 	}
438       else
439 	{
440 	  /* Case 3: insert in a basic block not containing defs/uses.  */
441 	  gsi = gsi_after_labels (occ->bb);
442 	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
443 	  if (should_insert_square_recip)
444 	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
445 	}
446 
447       reciprocal_stats.rdivs_inserted++;
448 
449       occ->recip_def_stmt = new_stmt;
450     }
451 
452   occ->recip_def = recip_def;
453   occ->square_recip_def = square_recip_def;
454   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
455     insert_reciprocals (def_gsi, occ_child, def, recip_def,
456 			square_recip_def, should_insert_square_recip,
457 			threshold);
458 }
459 
460 /* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
461    Take as argument the use for (x * x).  */
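
/* For example (illustrative GIMPLE; the SSA version numbers are made
   up):

	t_1 = x_2 * x_2;
	a_3 = b_4 / t_1;

   becomes, once insert_reciprocals has materialized 1/x and its square,

	reciptmp_5 = 1.0 / x_2;
	powmult_reciptmp_6 = reciptmp_5 * reciptmp_5;
	a_3 = b_4 * powmult_reciptmp_6;  */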
462 static inline void
463 replace_reciprocal_squares (use_operand_p use_p)
464 {
465   gimple *use_stmt = USE_STMT (use_p);
466   basic_block bb = gimple_bb (use_stmt);
467   struct occurrence *occ = (struct occurrence *) bb->aux;
468 
469   if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
470       && occ->recip_def)
471     {
472       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
473       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
474       gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
475       SET_USE (use_p, occ->square_recip_def);
476       fold_stmt_inplace (&gsi);
477       update_stmt (use_stmt);
478     }
479 }
480 
481 
482 /* Replace the division at USE_P with a multiplication by the reciprocal, if
483    possible.  */
484 
485 static inline void
486 replace_reciprocal (use_operand_p use_p)
487 {
488   gimple *use_stmt = USE_STMT (use_p);
489   basic_block bb = gimple_bb (use_stmt);
490   struct occurrence *occ = (struct occurrence *) bb->aux;
491 
492   if (optimize_bb_for_speed_p (bb)
493       && occ->recip_def && use_stmt != occ->recip_def_stmt)
494     {
495       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
496       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
497       SET_USE (use_p, occ->recip_def);
498       fold_stmt_inplace (&gsi);
499       update_stmt (use_stmt);
500     }
501 }
502 
503 
504 /* Free OCC and return one more "struct occurrence" to be freed.  */
505 
506 static struct occurrence *
507 free_bb (struct occurrence *occ)
508 {
509   struct occurrence *child, *next;
510 
511   /* First get the two pointers hanging off OCC.  */
512   next = occ->next;
513   child = occ->children;
514   occ->bb->aux = NULL;
515   occ_pool->remove (occ);
516 
517   /* Now ensure that we don't recurse unless it is necessary.  */
518   if (!child)
519     return next;
520   else
521     {
522       while (next)
523 	next = free_bb (next);
524 
525       return child;
526     }
527 }
528 
529 
530 /* Look for floating-point divisions among DEF's uses, and try to
531    replace them by multiplications with the reciprocal.  Add
532    as many statements computing the reciprocal as needed.
533 
534    DEF must be a GIMPLE register of a floating-point type.  */
535 
536 static void
537 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
538 {
539   use_operand_p use_p, square_use_p;
540   imm_use_iterator use_iter, square_use_iter;
541   tree square_def;
542   struct occurrence *occ;
543   int count = 0;
544   int threshold;
545   int square_recip_count = 0;
546   int sqrt_recip_count = 0;
547 
548   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
549   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
550 
551   /* If DEF is a square (x * x), count the number of divisions by x.
552      If there are more divisions by x than by (DEF * DEF), prefer to optimize
553      the reciprocal of x instead of DEF.  This improves cases like:
554        def = x * x
555        t0 = a / def
556        t1 = b / def
557        t2 = c / x
558      Reciprocal optimization of x results in 1 division rather than 2 or 3.  */
559   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
560 
561   if (is_gimple_assign (def_stmt)
562       && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
563       && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
564       && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
565     {
566       tree op0 = gimple_assign_rhs1 (def_stmt);
567 
568       FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
569 	{
570 	  gimple *use_stmt = USE_STMT (use_p);
571 	  if (is_division_by (use_stmt, op0))
572 	    sqrt_recip_count++;
573 	}
574     }
575 
576   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
577     {
578       gimple *use_stmt = USE_STMT (use_p);
579       if (is_division_by (use_stmt, def))
580 	{
581 	  register_division_in (gimple_bb (use_stmt), 2);
582 	  count++;
583 	}
584 
585       if (is_square_of (use_stmt, def))
586 	{
587 	  square_def = gimple_assign_lhs (use_stmt);
588 	  FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
589 	    {
590 	      gimple *square_use_stmt = USE_STMT (square_use_p);
591 	      if (is_division_by (square_use_stmt, square_def))
592 		{
593 		  /* This is executed twice for each division by a square.  */
594 		  register_division_in (gimple_bb (square_use_stmt), 1);
595 		  square_recip_count++;
596 		}
597 	    }
598 	}
599     }
600 
601   /* Square reciprocals were counted twice above.  */
602   square_recip_count /= 2;
603 
604   /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x).  */
605   if (sqrt_recip_count > square_recip_count)
606     goto out;
607 
608   /* Do the expensive part only if we can hope to optimize something.  */
609   if (count + square_recip_count >= threshold && count >= 1)
610     {
611       gimple *use_stmt;
612       for (occ = occ_head; occ; occ = occ->next)
613 	{
614 	  compute_merit (occ);
615 	  insert_reciprocals (def_gsi, occ, def, NULL, NULL,
616 			      square_recip_count, threshold);
617 	}
618 
619       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
620 	{
621 	  if (is_division_by (use_stmt, def))
622 	    {
623 	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
624 		replace_reciprocal (use_p);
625 	    }
626 	  else if (square_recip_count > 0 && is_square_of (use_stmt, def))
627 	    {
628 	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
629 		{
630 		  /* Find all uses of the square that are divisions and
631 		     replace them by multiplications with the inverse.  */
632 		  imm_use_iterator square_iterator;
633 		  gimple *powmult_use_stmt = USE_STMT (use_p);
634 		  tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);
635 
636 		  FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
637 					 square_iterator, powmult_def_name)
638 		    FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
639 		      {
640 			gimple *powmult_use_stmt = USE_STMT (square_use_p);
641 			if (is_division_by (powmult_use_stmt, powmult_def_name))
642 			  replace_reciprocal_squares (square_use_p);
643 		      }
644 		}
645 	    }
646 	}
647     }
648 
649 out:
650   for (occ = occ_head; occ; )
651     occ = free_bb (occ);
652 
653   occ_head = NULL;
654 }
655 
656 /* Return an internal function that implements the reciprocal of CALL,
657    or IFN_LAST if there is no such function that the target supports.  */
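
/* For instance (illustrative GIMPLE), given

	t_1 = sqrtf (x_2);
	y_3 = a_4 / t_1;

   returning IFN_RSQRT lets the caller rewrite this to

	t_1 = .RSQRT (x_2);
	y_3 = a_4 * t_1;

   when the target supports the corresponding optab for the mode.  */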
658 
659 internal_fn
660 internal_fn_reciprocal (gcall *call)
661 {
662   internal_fn ifn;
663 
664   switch (gimple_call_combined_fn (call))
665     {
666     CASE_CFN_SQRT:
667     CASE_CFN_SQRT_FN:
668       ifn = IFN_RSQRT;
669       break;
670 
671     default:
672       return IFN_LAST;
673     }
674 
675   tree_pair types = direct_internal_fn_types (ifn, call);
676   if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
677     return IFN_LAST;
678 
679   return ifn;
680 }
681 
682 /* Go through all the floating-point SSA_NAMEs, and call
683    execute_cse_reciprocals_1 on each of them.  */
684 namespace {
685 
686 const pass_data pass_data_cse_reciprocals =
687 {
688   GIMPLE_PASS, /* type */
689   "recip", /* name */
690   OPTGROUP_NONE, /* optinfo_flags */
691   TV_TREE_RECIP, /* tv_id */
692   PROP_ssa, /* properties_required */
693   0, /* properties_provided */
694   0, /* properties_destroyed */
695   0, /* todo_flags_start */
696   TODO_update_ssa, /* todo_flags_finish */
697 };
698 
699 class pass_cse_reciprocals : public gimple_opt_pass
700 {
701 public:
702   pass_cse_reciprocals (gcc::context *ctxt)
703     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
704   {}
705 
706   /* opt_pass methods: */
707   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
708   virtual unsigned int execute (function *);
709 
710 }; // class pass_cse_reciprocals
711 
712 unsigned int
713 pass_cse_reciprocals::execute (function *fun)
714 {
715   basic_block bb;
716   tree arg;
717 
718   occ_pool = new object_allocator<occurrence> ("dominators for recip");
719 
720   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
721   calculate_dominance_info (CDI_DOMINATORS);
722   calculate_dominance_info (CDI_POST_DOMINATORS);
723 
724   if (flag_checking)
725     FOR_EACH_BB_FN (bb, fun)
726       gcc_assert (!bb->aux);
727 
728   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
729     if (FLOAT_TYPE_P (TREE_TYPE (arg))
730 	&& is_gimple_reg (arg))
731       {
732 	tree name = ssa_default_def (fun, arg);
733 	if (name)
734 	  execute_cse_reciprocals_1 (NULL, name);
735       }
736 
737   FOR_EACH_BB_FN (bb, fun)
738     {
739       tree def;
740 
741       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
742 	   gsi_next (&gsi))
743 	{
744 	  gphi *phi = gsi.phi ();
745 	  def = PHI_RESULT (phi);
746 	  if (! virtual_operand_p (def)
747 	      && FLOAT_TYPE_P (TREE_TYPE (def)))
748 	    execute_cse_reciprocals_1 (NULL, def);
749 	}
750 
751       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
752 	   gsi_next (&gsi))
753         {
754 	  gimple *stmt = gsi_stmt (gsi);
755 
756 	  if (gimple_has_lhs (stmt)
757 	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
758 	      && FLOAT_TYPE_P (TREE_TYPE (def))
759 	      && TREE_CODE (def) == SSA_NAME)
760 	    execute_cse_reciprocals_1 (&gsi, def);
761 	}
762 
763       if (optimize_bb_for_size_p (bb))
764         continue;
765 
766       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
767       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
768 	   gsi_next (&gsi))
769         {
770 	  gimple *stmt = gsi_stmt (gsi);
771 
772 	  if (is_gimple_assign (stmt)
773 	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
774 	    {
775 	      tree arg1 = gimple_assign_rhs2 (stmt);
776 	      gimple *stmt1;
777 
778 	      if (TREE_CODE (arg1) != SSA_NAME)
779 		continue;
780 
781 	      stmt1 = SSA_NAME_DEF_STMT (arg1);
782 
783 	      if (is_gimple_call (stmt1)
784 		  && gimple_call_lhs (stmt1))
785 		{
786 		  bool fail;
787 		  imm_use_iterator ui;
788 		  use_operand_p use_p;
789 		  tree fndecl = NULL_TREE;
790 
791 		  gcall *call = as_a <gcall *> (stmt1);
792 		  internal_fn ifn = internal_fn_reciprocal (call);
793 		  if (ifn == IFN_LAST)
794 		    {
795 		      fndecl = gimple_call_fndecl (call);
796 		      if (!fndecl
797 			  || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD)
798 			continue;
799 		      fndecl = targetm.builtin_reciprocal (fndecl);
800 		      if (!fndecl)
801 			continue;
802 		    }
803 
804 		  /* Check that all uses of the SSA name are divisions,
805 		     otherwise replacing the defining statement will do
806 		     the wrong thing.  */
807 		  fail = false;
808 		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
809 		    {
810 		      gimple *stmt2 = USE_STMT (use_p);
811 		      if (is_gimple_debug (stmt2))
812 			continue;
813 		      if (!is_gimple_assign (stmt2)
814 			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
815 			  || gimple_assign_rhs1 (stmt2) == arg1
816 			  || gimple_assign_rhs2 (stmt2) != arg1)
817 			{
818 			  fail = true;
819 			  break;
820 			}
821 		    }
822 		  if (fail)
823 		    continue;
824 
825 		  gimple_replace_ssa_lhs (call, arg1);
826 		  if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
827 		    {
828 		      auto_vec<tree, 4> args;
829 		      for (unsigned int i = 0;
830 			   i < gimple_call_num_args (call); i++)
831 			args.safe_push (gimple_call_arg (call, i));
832 		      gcall *stmt2;
833 		      if (ifn == IFN_LAST)
834 			stmt2 = gimple_build_call_vec (fndecl, args);
835 		      else
836 			stmt2 = gimple_build_call_internal_vec (ifn, args);
837 		      gimple_call_set_lhs (stmt2, arg1);
838 		      if (gimple_vdef (call))
839 			{
840 			  gimple_set_vdef (stmt2, gimple_vdef (call));
841 			  SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
842 			}
843 		      gimple_call_set_nothrow (stmt2,
844 					       gimple_call_nothrow_p (call));
845 		      gimple_set_vuse (stmt2, gimple_vuse (call));
846 		      gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
847 		      gsi_replace (&gsi2, stmt2, true);
848 		    }
849 		  else
850 		    {
851 		      if (ifn == IFN_LAST)
852 			gimple_call_set_fndecl (call, fndecl);
853 		      else
854 			gimple_call_set_internal_fn (call, ifn);
855 		      update_stmt (call);
856 		    }
857 		  reciprocal_stats.rfuncs_inserted++;
858 
859 		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
860 		    {
861 		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
862 		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
863 		      fold_stmt_inplace (&gsi);
864 		      update_stmt (stmt);
865 		    }
866 		}
867 	    }
868 	}
869     }
870 
871   statistics_counter_event (fun, "reciprocal divs inserted",
872 			    reciprocal_stats.rdivs_inserted);
873   statistics_counter_event (fun, "reciprocal functions inserted",
874 			    reciprocal_stats.rfuncs_inserted);
875 
876   free_dominance_info (CDI_DOMINATORS);
877   free_dominance_info (CDI_POST_DOMINATORS);
878   delete occ_pool;
879   return 0;
880 }
881 
882 } // anon namespace
883 
884 gimple_opt_pass *
885 make_pass_cse_reciprocals (gcc::context *ctxt)
886 {
887   return new pass_cse_reciprocals (ctxt);
888 }
889 
890 /* Records an occurrence at statement USE_STMT in the vector of trees
891    STMTS if it is dominated by *TOP_BB, dominates it, or *TOP_BB is
892    not yet initialized.  Returns true if the occurrence was pushed on
893    the vector.  Adjusts *TOP_BB to be the basic block dominating all
894    statements in the vector.  */
895 
896 static bool
897 maybe_record_sincos (vec<gimple *> *stmts,
898 		     basic_block *top_bb, gimple *use_stmt)
899 {
900   basic_block use_bb = gimple_bb (use_stmt);
901   if (*top_bb
902       && (*top_bb == use_bb
903 	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
904     stmts->safe_push (use_stmt);
905   else if (!*top_bb
906 	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
907     {
908       stmts->safe_push (use_stmt);
909       *top_bb = use_bb;
910     }
911   else
912     return false;
913 
914   return true;
915 }
916 
917 /* Look for sin, cos and cexpi calls with the same argument NAME and
918    create a single call to cexpi, CSEing the result in this case.
919    We first walk over all immediate uses of the argument, collecting in
920    a vector the statements that we can CSE; in a second pass we replace
921    each statement's rhs with a REALPART or IMAGPART expression on the
922    result of the cexpi call, which we insert before the use statement
923    that dominates all other candidates.  */
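
/* For example (illustrative GIMPLE, float operands assumed):

	s_1 = sinf (x_2);
	c_3 = cosf (x_2);

   becomes

	sincostmp_4 = __builtin_cexpif (x_2);
	s_1 = IMAGPART_EXPR <sincostmp_4>;
	c_3 = REALPART_EXPR <sincostmp_4>;  */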
924 
925 static bool
926 execute_cse_sincos_1 (tree name)
927 {
928   gimple_stmt_iterator gsi;
929   imm_use_iterator use_iter;
930   tree fndecl, res, type;
931   gimple *def_stmt, *use_stmt, *stmt;
932   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
933   auto_vec<gimple *> stmts;
934   basic_block top_bb = NULL;
935   int i;
936   bool cfg_changed = false;
937 
938   type = TREE_TYPE (name);
939   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
940     {
941       if (gimple_code (use_stmt) != GIMPLE_CALL
942 	  || !gimple_call_lhs (use_stmt))
943 	continue;
944 
945       switch (gimple_call_combined_fn (use_stmt))
946 	{
947 	CASE_CFN_COS:
948 	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
949 	  break;
950 
951 	CASE_CFN_SIN:
952 	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
953 	  break;
954 
955 	CASE_CFN_CEXPI:
956 	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
957 	  break;
958 
959 	default:;
960 	}
961     }
962 
963   if (seen_cos + seen_sin + seen_cexpi <= 1)
964     return false;
965 
966   /* Simply insert cexpi at the beginning of top_bb but not earlier than
967      the name def statement.  */
968   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
969   if (!fndecl)
970     return false;
971   stmt = gimple_build_call (fndecl, 1, name);
972   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
973   gimple_call_set_lhs (stmt, res);
974 
975   def_stmt = SSA_NAME_DEF_STMT (name);
976   if (!SSA_NAME_IS_DEFAULT_DEF (name)
977       && gimple_code (def_stmt) != GIMPLE_PHI
978       && gimple_bb (def_stmt) == top_bb)
979     {
980       gsi = gsi_for_stmt (def_stmt);
981       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
982     }
983   else
984     {
985       gsi = gsi_after_labels (top_bb);
986       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
987     }
988   sincos_stats.inserted++;
989 
990   /* And adjust the recorded old call sites.  */
991   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
992     {
993       tree rhs = NULL;
994 
995       switch (gimple_call_combined_fn (use_stmt))
996 	{
997 	CASE_CFN_COS:
998 	  rhs = fold_build1 (REALPART_EXPR, type, res);
999 	  break;
1000 
1001 	CASE_CFN_SIN:
1002 	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
1003 	  break;
1004 
1005 	CASE_CFN_CEXPI:
1006 	  rhs = res;
1007 	  break;
1008 
1009 	default:
1010 	  gcc_unreachable ();
1011 	}
1012 
1013       /* Replace call with a copy.  */
1014       stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
1015 
1016       gsi = gsi_for_stmt (use_stmt);
1017       gsi_replace (&gsi, stmt, true);
1018       if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
1019 	cfg_changed = true;
1020     }
1021 
1022   return cfg_changed;
1023 }
1024 
1025 /* To evaluate powi(x,n), the floating point value x raised to the
1026    constant integer exponent n, we use a hybrid algorithm that
1027    combines the "window method" with look-up tables.  For an
1028    introduction to exponentiation algorithms and "addition chains",
1029    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
1030    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
1031    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
1032    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
1033 
1034 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
1035    multiplications to inline before calling the system library's pow
1036    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
1037    so this default never requires calling pow, powf or powl.  */
1038 
1039 #ifndef POWI_MAX_MULTS
1040 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
1041 #endif
1042 
1043 /* The size of the "optimal power tree" lookup table.  All
1044    exponents less than this value are simply looked up in the
1045    powi_table below.  This threshold is also used to size the
1046    cache of pseudo registers that hold intermediate results.  */
1047 #define POWI_TABLE_SIZE 256
1048 
1049 /* The size, in bits, of the window used in the "window method"
1050    exponentiation algorithm.  This is equivalent to a radix of
1051    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
1052 #define POWI_WINDOW_SIZE 3
1053 
1054 /* The following table is an efficient representation of an
1055    "optimal power tree".  For each value, i, the corresponding
1056    value, j, in the table states that an optimal evaluation
1057    sequence for calculating pow(x,i) can be found by evaluating
1058    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
1059    100 integers is given in Knuth's "Seminumerical algorithms".  */
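
/* For example (reading the table below): powi_table[15] == 9, so an
   optimal sequence for x**15 is x**9 * x**6.  Expanding recursively:

	x2 = x * x;   x3 = x2 * x;   x6 = x3 * x3;
	x9 = x6 * x3; x15 = x9 * x6;

   five multiplications in total, versus six for plain binary
   exponentiation.  */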
1060 
1061 static const unsigned char powi_table[POWI_TABLE_SIZE] =
1062   {
1063       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
1064       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
1065       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
1066      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
1067      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
1068      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
1069      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
1070      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
1071      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
1072      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
1073      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
1074      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
1075      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
1076      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
1077      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
1078      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
1079      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
1080      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
1081      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
1082      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
1083      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
1084      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
1085      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
1086      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
1087      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
1088     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
1089     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
1090     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
1091     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
1092     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
1093     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
1094     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
1095   };
1096 
1097 
1098 /* Return the number of multiplications required to calculate
1099    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
1100    subroutine of powi_cost.  CACHE is an array indicating
1101    which exponents have already been calculated.  */
1102 
1103 static int
1104 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
1105 {
1106   /* If we've already calculated this exponent, then this evaluation
1107      doesn't require any additional multiplications.  */
1108   if (cache[n])
1109     return 0;
1110 
1111   cache[n] = true;
1112   return powi_lookup_cost (n - powi_table[n], cache)
1113 	 + powi_lookup_cost (powi_table[n], cache) + 1;
1114 }
1115 
1116 /* Return the number of multiplications required to calculate
1117    powi(x,n) for an arbitrary x, given the exponent N.  This
1118    function needs to be kept in sync with powi_as_mults below.  */
1119 
1120 static int
1121 powi_cost (HOST_WIDE_INT n)
1122 {
1123   bool cache[POWI_TABLE_SIZE];
1124   unsigned HOST_WIDE_INT digit;
1125   unsigned HOST_WIDE_INT val;
1126   int result;
1127 
1128   if (n == 0)
1129     return 0;
1130 
1131   /* Ignore the reciprocal when calculating the cost.  */
1132   val = (n < 0) ? -n : n;
1133 
1134   /* Initialize the exponent cache.  */
1135   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
1136   cache[1] = true;
1137 
1138   result = 0;
1139 
1140   while (val >= POWI_TABLE_SIZE)
1141     {
1142       if (val & 1)
1143 	{
1144 	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
1145 	  result += powi_lookup_cost (digit, cache)
1146 		    + POWI_WINDOW_SIZE + 1;
1147 	  val >>= POWI_WINDOW_SIZE;
1148 	}
1149       else
1150 	{
1151 	  val >>= 1;
1152 	  result++;
1153 	}
1154     }
1155 
1156   return result + powi_lookup_cost (val, cache);
1157 }
1158 
1159 /* Recursive subroutine of powi_as_mults.  This function takes the
1160    array, CACHE, of already calculated exponents and an exponent N and
1161    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
1162 
1163 static tree
1164 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1165 		 HOST_WIDE_INT n, tree *cache)
1166 {
1167   tree op0, op1, ssa_target;
1168   unsigned HOST_WIDE_INT digit;
1169   gassign *mult_stmt;
1170 
1171   if (n < POWI_TABLE_SIZE && cache[n])
1172     return cache[n];
1173 
1174   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1175 
1176   if (n < POWI_TABLE_SIZE)
1177     {
1178       cache[n] = ssa_target;
1179       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1180       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1181     }
1182   else if (n & 1)
1183     {
1184       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1185       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1186       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1187     }
1188   else
1189     {
1190       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1191       op1 = op0;
1192     }
1193 
1194   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1195   gimple_set_location (mult_stmt, loc);
1196   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1197 
1198   return ssa_target;
1199 }
1200 
1201 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1202    This function needs to be kept in sync with powi_cost above.  */
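
/* For a negative exponent, e.g. N == -2 (illustrative), this emits the
   positive power first and then a single reciprocal:

	powmult_1 = arg0 * arg0;
	powmult_2 = 1.0 / powmult_1;  */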
1203 
1204 static tree
1205 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1206 	       tree arg0, HOST_WIDE_INT n)
1207 {
1208   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1209   gassign *div_stmt;
1210   tree target;
1211 
1212   if (n == 0)
1213     return build_real (type, dconst1);
1214 
1215   memset (cache, 0,  sizeof (cache));
1216   cache[1] = arg0;
1217 
1218   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1219   if (n >= 0)
1220     return result;
1221 
1222   /* If the original exponent was negative, reciprocate the result.  */
1223   target = make_temp_ssa_name (type, NULL, "powmult");
1224   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1225 				  build_real (type, dconst1), result);
1226   gimple_set_location (div_stmt, loc);
1227   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1228 
1229   return target;
1230 }
1231 
1232 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1233    location info LOC.  If the arguments are appropriate, create an
1234    equivalent sequence of statements prior to GSI using an optimal
1235    number of multiplications, and return an expression holding the
1236    result.  */
1237 
1238 static tree
1239 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1240 			    tree arg0, HOST_WIDE_INT n)
1241 {
1242   /* Avoid largest negative number.  */
1243   if (n != -n
1244       && ((n >= -1 && n <= 2)
1245 	  || (optimize_function_for_speed_p (cfun)
1246 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1247     return powi_as_mults (gsi, loc, arg0, n);
1248 
1249   return NULL_TREE;
1250 }
1251 
1252 /* Build a gimple call statement that calls FN with argument ARG.
1253    Set the lhs of the call statement to a fresh SSA name.  Insert the
1254    statement prior to GSI's current position, and return the fresh
1255    SSA name.  */
1256 
1257 static tree
1258 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1259 		       tree fn, tree arg)
1260 {
1261   gcall *call_stmt;
1262   tree ssa_target;
1263 
1264   call_stmt = gimple_build_call (fn, 1, arg);
1265   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1266   gimple_set_lhs (call_stmt, ssa_target);
1267   gimple_set_location (call_stmt, loc);
1268   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1269 
1270   return ssa_target;
1271 }
1272 
1273 /* Build a gimple binary operation with the given CODE and arguments
1274    ARG0, ARG1, assigning the result to a new SSA name for variable
1275    TARGET.  Insert the statement prior to GSI's current position, and
1276    return the fresh SSA name.  */
1277 
1278 static tree
1279 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1280 			const char *name, enum tree_code code,
1281 			tree arg0, tree arg1)
1282 {
1283   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1284   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1285   gimple_set_location (stmt, loc);
1286   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1287   return result;
1288 }
1289 
1290 /* Build a gimple reference operation with the given CODE and argument
1291    ARG, assigning the result to a new SSA name of TYPE with NAME.
1292    Insert the statement prior to GSI's current position, and return
1293    the fresh SSA name.  */
1294 
1295 static inline tree
1296 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1297 		      const char *name, enum tree_code code, tree arg0)
1298 {
1299   tree result = make_temp_ssa_name (type, NULL, name);
1300   gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1301   gimple_set_location (stmt, loc);
1302   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1303   return result;
1304 }
1305 
1306 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1307    prior to GSI's current position, and return the fresh SSA name.  */
1308 
1309 static tree
1310 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1311 		       tree type, tree val)
1312 {
1313   tree result = make_ssa_name (type);
1314   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1315   gimple_set_location (stmt, loc);
1316   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1317   return result;
1318 }
1319 
1320 struct pow_synth_sqrt_info
1321 {
1322   bool *factors;
1323   unsigned int deepest;
1324   unsigned int num_mults;
1325 };
1326 
1327 /* Return true iff the real value C can be represented as a
1328    sum of powers of 0.5 up to N.  That is:
1329    C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1330    Record in INFO the various parameters of the synthesis algorithm such
1331    as the factors a[i], the maximum 0.5 power and the number of
1332    multiplications that will be required.  */
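
/* Worked example (illustrative): for C == 0.625 and N >= 3 the loop
   below finds 0.625 == 0.5 + 0.125, i.e. factors == { 1, 0, 1 },
   deepest == 3 and num_mults == 1, which expand_pow_as_sqrts later
   turns into sqrt (x) * sqrt (sqrt (sqrt (x))).  */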
1333 
1334 bool
1335 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1336 				 struct pow_synth_sqrt_info *info)
1337 {
1338   REAL_VALUE_TYPE factor = dconsthalf;
1339   REAL_VALUE_TYPE remainder = c;
1340 
1341   info->deepest = 0;
1342   info->num_mults = 0;
1343   memset (info->factors, 0, n * sizeof (bool));
1344 
1345   for (unsigned i = 0; i < n; i++)
1346     {
1347       REAL_VALUE_TYPE res;
1348 
1349       /* If something inexact happened bail out now.  */
1350       if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1351 	return false;
1352 
1353       /* We have hit zero.  The number is representable as a sum
1354          of powers of 0.5.  */
1355       if (real_equal (&res, &dconst0))
1356 	{
1357 	  info->factors[i] = true;
1358 	  info->deepest = i + 1;
1359 	  return true;
1360 	}
1361       else if (!REAL_VALUE_NEGATIVE (res))
1362 	{
1363 	  remainder = res;
1364 	  info->factors[i] = true;
1365 	  info->num_mults++;
1366 	}
1367       else
1368 	info->factors[i] = false;
1369 
1370       real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1371     }
1372   return false;
1373 }
1374 
1375 /* Return the tree corresponding to FN being applied
1376    to ARG N times at GSI and LOC.
1377    Look up previous results from CACHE if need be.
1378    CACHE[0] should contain just plain ARG, i.e. FN applied to ARG 0 times.  */
1379 
1380 static tree
1381 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1382 	      tree fn, location_t loc, tree *cache)
1383 {
1384   tree res = cache[n];
1385   if (!res)
1386     {
1387       tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1388       res = build_and_insert_call (gsi, loc, fn, prev);
1389       cache[n] = res;
1390     }
1391 
1392   return res;
1393 }
1394 
1395 /* Print to STREAM the repeated application of function FNAME to ARG
1396    N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1397    "foo (foo (x))".  */
1398 
1399 static void
1400 print_nested_fn (FILE *stream, const char *fname, const char *arg,
1401 		 unsigned int n)
1402 {
1403   if (n == 0)
1404     fprintf (stream, "%s", arg);
1405   else
1406     {
1407       fprintf (stream, "%s (", fname);
1408       print_nested_fn (stream, fname, arg, n - 1);
1409       fprintf (stream, ")");
1410     }
1411 }
1412 
1413 /* Print to STREAM the fractional sequence of sqrt chains
1414    applied to ARG, described by INFO.  Used for the dump file.  */
1415 
1416 static void
1417 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1418 			        struct pow_synth_sqrt_info *info)
1419 {
1420   for (unsigned int i = 0; i < info->deepest; i++)
1421     {
1422       bool is_set = info->factors[i];
1423       if (is_set)
1424 	{
1425 	  print_nested_fn (stream, "sqrt", arg, i + 1);
1426 	  if (i != info->deepest - 1)
1427 	    fprintf (stream, " * ");
1428 	}
1429     }
1430 }
1431 
1432 /* Print to STREAM a representation of raising ARG to an integer
1433    power N.  Used for the dump file.  */
1434 
1435 static void
1436 dump_integer_part (FILE *stream, const char *arg, HOST_WIDE_INT n)
1437 {
1438   if (n > 1)
1439     fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1440   else if (n == 1)
1441     fprintf (stream, "%s", arg);
1442 }
1443 
1444 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1445    square roots.  Place at GSI and LOC.  Limit the maximum depth
1446    of the sqrt chains to MAX_DEPTH.  Return the tree holding the
1447    result of the expanded sequence or NULL_TREE if the expansion failed.
1448 
1449    This routine assumes that ARG1 is a real number with a fractional part
1450    (the integer exponent case will have been handled earlier in
1451    gimple_expand_builtin_pow).
1452 
1453    For ARG1 > 0.0:
1454    * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1455      FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1456                     FRAC_PART == ARG1 - WHOLE_PART:
1457      Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1458      POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1459      if it can be expressed as such, that is if FRAC_PART satisfies:
1460      FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1461      where integer a[i] is either 0 or 1.
1462 
1463      Example:
1464      POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1465        --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1466 
1467    For ARG1 < 0.0 there are two approaches:
1468    * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1469          is calculated as above.
1470 
1471      Example:
1472      POW (x, -5.625) == 1.0 / POW (x, 5.625)
1473        -->  1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1474 
1475    * (B) : WHOLE_PART := - ceil (abs (ARG1))
1476            FRAC_PART  := ARG1 - WHOLE_PART
1477      and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1478      Example:
1479      POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1480        --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1481 
1482    For ARG1 < 0.0 we choose between (A) and (B) depending on
1483    how many multiplications we'd have to do.
1484    So, for the example in (B): POW (x, -5.875), if we were to
1485    follow algorithm (A) we would produce:
1486    1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1487    which contains more multiplications than approach (B).
1488 
1489    Hopefully, this approach will eliminate potentially expensive POW library
1490    calls when unsafe floating point math is enabled and allow the compiler to
1491    further optimise the multiplies, square roots and divides produced by this
1492    function.  */
1493 
1494 static tree
1495 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1496 		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1497 {
1498   tree type = TREE_TYPE (arg0);
1499   machine_mode mode = TYPE_MODE (type);
1500   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1501   bool one_over = true;
1502 
1503   if (!sqrtfn)
1504     return NULL_TREE;
1505 
1506   if (TREE_CODE (arg1) != REAL_CST)
1507     return NULL_TREE;
1508 
1509   REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1510 
1511   gcc_assert (max_depth > 0);
1512   tree *cache = XALLOCAVEC (tree, max_depth + 1);
1513 
1514   struct pow_synth_sqrt_info synth_info;
1515   synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1516   synth_info.deepest = 0;
1517   synth_info.num_mults = 0;
1518 
1519   bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1520   REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1521 
1522   /* The whole and fractional parts of exp.  */
1523   REAL_VALUE_TYPE whole_part;
1524   REAL_VALUE_TYPE frac_part;
1525 
1526   real_floor (&whole_part, mode, &exp);
1527   real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1528 
1529 
1530   REAL_VALUE_TYPE ceil_whole = dconst0;
1531   REAL_VALUE_TYPE ceil_fract = dconst0;
1532 
1533   if (neg_exp)
1534     {
1535       real_ceil (&ceil_whole, mode, &exp);
1536       real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1537     }
1538 
1539   if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1540     return NULL_TREE;
1541 
1542   /* Check whether it's more profitable to not use 1.0 / ...  */
1543   if (neg_exp)
1544     {
1545       struct pow_synth_sqrt_info alt_synth_info;
1546       alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1547       alt_synth_info.deepest = 0;
1548       alt_synth_info.num_mults = 0;
1549 
1550       if (representable_as_half_series_p (ceil_fract, max_depth,
1551 					   &alt_synth_info)
1552 	  && alt_synth_info.deepest <= synth_info.deepest
1553 	  && alt_synth_info.num_mults < synth_info.num_mults)
1554 	{
1555 	  whole_part = ceil_whole;
1556 	  frac_part = ceil_fract;
1557 	  synth_info.deepest = alt_synth_info.deepest;
1558 	  synth_info.num_mults = alt_synth_info.num_mults;
1559 	  memcpy (synth_info.factors, alt_synth_info.factors,
1560 		  (max_depth + 1) * sizeof (bool));
1561 	  one_over = false;
1562 	}
1563     }
1564 
1565   HOST_WIDE_INT n = real_to_integer (&whole_part);
1566   REAL_VALUE_TYPE cint;
1567   real_from_integer (&cint, VOIDmode, n, SIGNED);
1568 
1569   if (!real_identical (&whole_part, &cint))
1570     return NULL_TREE;
1571 
1572   if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1573     return NULL_TREE;
1574 
1575   memset (cache, 0, (max_depth + 1) * sizeof (tree));
1576 
1577   tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1578 
1579   /* Calculate the integer part of the exponent.  */
1580   if (n > 1)
1581     {
1582       integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1583       if (!integer_res)
1584 	return NULL_TREE;
1585     }
1586 
1587   if (dump_file)
1588     {
1589       char string[64];
1590 
1591       real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1592       fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1593 
1594       if (neg_exp)
1595 	{
1596 	  if (one_over)
1597 	    {
1598 	      fprintf (dump_file, "1.0 / (");
1599 	      dump_integer_part (dump_file, "x", n);
1600 	      if (n > 0)
1601 	        fprintf (dump_file, " * ");
1602 	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1603 	      fprintf (dump_file, ")");
1604 	    }
1605 	  else
1606 	    {
1607 	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1608 	      fprintf (dump_file, " / (");
1609 	      dump_integer_part (dump_file, "x", n);
1610 	      fprintf (dump_file, ")");
1611 	    }
1612 	}
1613       else
1614 	{
1615 	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1616 	  if (n > 0)
1617 	    fprintf (dump_file, " * ");
1618 	  dump_integer_part (dump_file, "x", n);
1619 	}
1620 
1621       fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1622     }
1623 
1624 
1625   tree fract_res = NULL_TREE;
1626   cache[0] = arg0;
1627 
1628   /* Calculate the fractional part of the exponent.  */
1629   for (unsigned i = 0; i < synth_info.deepest; i++)
1630     {
1631       if (synth_info.factors[i])
1632 	{
1633 	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1634 
1635 	  if (!fract_res)
1636 	    fract_res = sqrt_chain;
1638 	  else
1639 	    fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1640 					   fract_res, sqrt_chain);
1641 	}
1642     }
1643 
1644   tree res = NULL_TREE;
1645 
1646   if (neg_exp)
1647     {
1648       if (one_over)
1649 	{
1650 	  if (n > 0)
1651 	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1652 					   fract_res, integer_res);
1653 	  else
1654 	    res = fract_res;
1655 
1656 	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1657 					  build_real (type, dconst1), res);
1658 	}
1659       else
1660 	{
1661 	  res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1662 					 fract_res, integer_res);
1663 	}
1664     }
1665   else
1666     res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1667 				   fract_res, integer_res);
1668   return res;
1669 }
1670 
1671 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1672    with location info LOC.  If possible, create an equivalent and
1673    less expensive sequence of statements prior to GSI, and return an
1674    expression holding the result.  */
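/* An illustrative summary of the shapes handled below, each guarded by the
   flags and target checks that the corresponding case tests for:

	pow (x, 0.5)	-->  sqrt (x)
	pow (x, 1./3.)	-->  cbrt (x)
	pow (x, 1./6.)	-->  cbrt (sqrt (x))
	pow (x, 3.0)	-->  powi (x, 3)  */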
1675 
1676 static tree
1677 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1678 			   tree arg0, tree arg1)
1679 {
1680   REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1681   REAL_VALUE_TYPE c2, dconst3;
1682   HOST_WIDE_INT n;
1683   tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1684   machine_mode mode;
1685   bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1686   bool hw_sqrt_exists, c_is_int, c2_is_int;
1687 
1688   dconst1_4 = dconst1;
1689   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1690 
1691   /* If the exponent isn't a constant, there's nothing of interest
1692      to be done.  */
1693   if (TREE_CODE (arg1) != REAL_CST)
1694     return NULL_TREE;
1695 
1696   /* Don't perform the operation if flag_signaling_nans is on
1697      and the operand is a signaling NaN.  */
1698   if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1699       && ((TREE_CODE (arg0) == REAL_CST
1700 	   && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1701 	  || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1702     return NULL_TREE;
1703 
1704   /* If the exponent is equivalent to an integer, expand to an optimal
1705      multiplication sequence when profitable.  */
1706   c = TREE_REAL_CST (arg1);
1707   n = real_to_integer (&c);
1708   real_from_integer (&cint, VOIDmode, n, SIGNED);
1709   c_is_int = real_identical (&c, &cint);
1710 
1711   if (c_is_int
1712       && ((n >= -1 && n <= 2)
1713 	  || (flag_unsafe_math_optimizations
1714 	      && speed_p
1715 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1716     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1717 
1718   /* Attempt various optimizations using sqrt and cbrt.  */
1719   type = TREE_TYPE (arg0);
1720   mode = TYPE_MODE (type);
1721   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1722 
1723   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1724      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1725      sqrt(-0) = -0.  */
1726   if (sqrtfn
1727       && real_equal (&c, &dconsthalf)
1728       && !HONOR_SIGNED_ZEROS (mode))
1729     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1730 
1731   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1732 
1733   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1734      optimizations since 1./3. is not exactly representable.  If x
1735      is negative and finite, the correct value of pow(x,1./3.) is
1736      a NaN with the "invalid" exception raised, because the value
1737      of 1./3. actually has an even denominator.  The correct value
1738      of cbrt(x) is a negative real value.  */
1739   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1740   dconst1_3 = real_value_truncate (mode, dconst_third ());
1741 
1742   if (flag_unsafe_math_optimizations
1743       && cbrtfn
1744       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1745       && real_equal (&c, &dconst1_3))
1746     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1747 
1748   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1749      if we don't have a hardware sqrt insn.  */
1750   dconst1_6 = dconst1_3;
1751   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1752 
1753   if (flag_unsafe_math_optimizations
1754       && sqrtfn
1755       && cbrtfn
1756       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1757       && speed_p
1758       && hw_sqrt_exists
1759       && real_equal (&c, &dconst1_6))
1760     {
1761       /* sqrt(x)  */
1762       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1763 
1764       /* cbrt(sqrt(x))  */
1765       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1766     }
1767 
1768 
1769   /* Attempt to expand the POW as a product of square root chains.
1770      Expand the 0.25 case even when optimising for size.  */
1771   if (flag_unsafe_math_optimizations
1772       && sqrtfn
1773       && hw_sqrt_exists
1774       && (speed_p || real_equal (&c, &dconst1_4))
1775       && !HONOR_SIGNED_ZEROS (mode))
1776     {
1777       unsigned int max_depth = speed_p
1778 				? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1779 				: 2;
1780 
1781       tree expand_with_sqrts
1782 	= expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1783 
1784       if (expand_with_sqrts)
1785 	return expand_with_sqrts;
1786     }
1787 
1788   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1789   n = real_to_integer (&c2);
1790   real_from_integer (&cint, VOIDmode, n, SIGNED);
1791   c2_is_int = real_identical (&c2, &cint);
1792 
1793   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1794 
1795      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1796      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1797 
1798      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1799      different from pow(x, 1./3.) due to rounding and behavior with
1800      negative x, we need to constrain this transformation to unsafe
1801      math and positive x or finite math.  */
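  /* Worked instance (illustrative): for c = 5./3. we get n = 5, so
     n/3 = 1 and n%3 = 2, and pow (x, 5./3.) becomes
     powi (x, 1) * powi (cbrt (x), 2), i.e. x * cbrt (x) * cbrt (x).  */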
1802   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1803   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1804   real_round (&c2, mode, &c2);
1805   n = real_to_integer (&c2);
1806   real_from_integer (&cint, VOIDmode, n, SIGNED);
1807   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1808   real_convert (&c2, mode, &c2);
1809 
1810   if (flag_unsafe_math_optimizations
1811       && cbrtfn
1812       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1813       && real_identical (&c2, &c)
1814       && !c2_is_int
1815       && optimize_function_for_speed_p (cfun)
1816       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1817     {
1818       tree powi_x_ndiv3 = NULL_TREE;
1819 
1820       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1821          possible or profitable, give up.  Skip the degenerate case when
1822          abs(n) < 3, where the result is always 1.  */
1823       if (absu_hwi (n) >= 3)
1824 	{
1825 	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1826 						     abs_hwi (n / 3));
1827 	  if (!powi_x_ndiv3)
1828 	    return NULL_TREE;
1829 	}
1830 
1831       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1832          as that creates an unnecessary variable.  Instead, just produce
1833          either cbrt(x) or cbrt(x) * cbrt(x).  */
1834       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1835 
1836       if (absu_hwi (n) % 3 == 1)
1837 	powi_cbrt_x = cbrt_x;
1838       else
1839 	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1840 					      cbrt_x, cbrt_x);
1841 
1842       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1843       if (absu_hwi (n) < 3)
1844 	result = powi_cbrt_x;
1845       else
1846 	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1847 					 powi_x_ndiv3, powi_cbrt_x);
1848 
1849       /* If n is negative, reciprocate the result.  */
1850       if (n < 0)
1851 	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1852 					 build_real (type, dconst1), result);
1853 
1854       return result;
1855     }
1856 
1857   /* No optimizations succeeded.  */
1858   return NULL_TREE;
1859 }
1860 
1861 /* ARG is the argument to a cabs builtin call in GSI with location info
1862    LOC.  Create a sequence of statements prior to GSI that calculates
1863    sqrt(R*R + I*I), where R and I are the real and imaginary components
1864    of ARG, respectively.  Return an expression holding the result.  */
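/* For instance (illustrative), for ARG = 3.0 + 4.0i the sequence computes
   the equivalent of sqrt (3.0*3.0 + 4.0*4.0), i.e. 5.0.  */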
1865 
1866 static tree
1867 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1868 {
1869   tree real_part, imag_part, addend1, addend2, sum, result;
1870   tree type = TREE_TYPE (TREE_TYPE (arg));
1871   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1872   machine_mode mode = TYPE_MODE (type);
1873 
1874   if (!flag_unsafe_math_optimizations
1875       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1876       || !sqrtfn
1877       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1878     return NULL_TREE;
1879 
1880   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1881 				    REALPART_EXPR, arg);
1882   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1883 				    real_part, real_part);
1884   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1885 				    IMAGPART_EXPR, arg);
1886   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1887 				    imag_part, imag_part);
1888   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1889   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1890 
1891   return result;
1892 }
1893 
1894 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1895    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1896    an optimal number of multiplies, when n is a constant.  */
1897 
1898 namespace {
1899 
1900 const pass_data pass_data_cse_sincos =
1901 {
1902   GIMPLE_PASS, /* type */
1903   "sincos", /* name */
1904   OPTGROUP_NONE, /* optinfo_flags */
1905   TV_TREE_SINCOS, /* tv_id */
1906   PROP_ssa, /* properties_required */
1907   PROP_gimple_opt_math, /* properties_provided */
1908   0, /* properties_destroyed */
1909   0, /* todo_flags_start */
1910   TODO_update_ssa, /* todo_flags_finish */
1911 };
1912 
1913 class pass_cse_sincos : public gimple_opt_pass
1914 {
1915 public:
1916   pass_cse_sincos (gcc::context *ctxt)
1917     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1918   {}
1919 
1920   /* opt_pass methods: */
1921   virtual bool gate (function *)
1922     {
1923       /* We no longer require either sincos or cexp, since powi expansion
1924 	 piggybacks on this pass.  */
1925       return optimize;
1926     }
1927 
1928   virtual unsigned int execute (function *);
1929 
1930 }; // class pass_cse_sincos
1931 
1932 unsigned int
1933 pass_cse_sincos::execute (function *fun)
1934 {
1935   basic_block bb;
1936   bool cfg_changed = false;
1937 
1938   calculate_dominance_info (CDI_DOMINATORS);
1939   memset (&sincos_stats, 0, sizeof (sincos_stats));
1940 
1941   FOR_EACH_BB_FN (bb, fun)
1942     {
1943       gimple_stmt_iterator gsi;
1944       bool cleanup_eh = false;
1945 
1946       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1947         {
1948 	  gimple *stmt = gsi_stmt (gsi);
1949 
1950 	  /* Only the last stmt in a bb could throw, no need to call
1951 	     gimple_purge_dead_eh_edges if we change something in the middle
1952 	     of a basic block.  */
1953 	  cleanup_eh = false;
1954 
1955 	  if (is_gimple_call (stmt)
1956 	      && gimple_call_lhs (stmt))
1957 	    {
1958 	      tree arg, arg0, arg1, result;
1959 	      HOST_WIDE_INT n;
1960 	      location_t loc;
1961 
1962 	      switch (gimple_call_combined_fn (stmt))
1963 		{
1964 		CASE_CFN_COS:
1965 		CASE_CFN_SIN:
1966 		CASE_CFN_CEXPI:
1967 		  /* Make sure we have either sincos or cexp.  */
1968 		  if (!targetm.libc_has_function (function_c99_math_complex)
1969 		      && !targetm.libc_has_function (function_sincos))
1970 		    break;
1971 
1972 		  arg = gimple_call_arg (stmt, 0);
1973 		  if (TREE_CODE (arg) == SSA_NAME)
1974 		    cfg_changed |= execute_cse_sincos_1 (arg);
1975 		  break;
1976 
1977 		CASE_CFN_POW:
1978 		  arg0 = gimple_call_arg (stmt, 0);
1979 		  arg1 = gimple_call_arg (stmt, 1);
1980 
1981 		  loc = gimple_location (stmt);
1982 		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1983 
1984 		  if (result)
1985 		    {
1986 		      tree lhs = gimple_get_lhs (stmt);
1987 		      gassign *new_stmt = gimple_build_assign (lhs, result);
1988 		      gimple_set_location (new_stmt, loc);
1989 		      unlink_stmt_vdef (stmt);
1990 		      gsi_replace (&gsi, new_stmt, true);
1991 		      cleanup_eh = true;
1992 		      if (gimple_vdef (stmt))
1993 			release_ssa_name (gimple_vdef (stmt));
1994 		    }
1995 		  break;
1996 
1997 		CASE_CFN_POWI:
1998 		  arg0 = gimple_call_arg (stmt, 0);
1999 		  arg1 = gimple_call_arg (stmt, 1);
2000 		  loc = gimple_location (stmt);
2001 
2002 		  if (real_minus_onep (arg0))
2003 		    {
2004 		      tree t0, t1, cond, one, minus_one;
2005 		      gassign *stmt;
2006 
2007 		      t0 = TREE_TYPE (arg0);
2008 		      t1 = TREE_TYPE (arg1);
2009 		      one = build_real (t0, dconst1);
2010 		      minus_one = build_real (t0, dconstm1);
2011 
2012 		      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2013 		      stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2014 						  arg1, build_int_cst (t1, 1));
2015 		      gimple_set_location (stmt, loc);
2016 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2017 
2018 		      result = make_temp_ssa_name (t0, NULL, "powi");
2019 		      stmt = gimple_build_assign (result, COND_EXPR, cond,
2020 						  minus_one, one);
2021 		      gimple_set_location (stmt, loc);
2022 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2023 		    }
2024 		  else
2025 		    {
2026 		      if (!tree_fits_shwi_p (arg1))
2027 			break;
2028 
2029 		      n = tree_to_shwi (arg1);
2030 		      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2031 		    }
2032 
2033 		  if (result)
2034 		    {
2035 		      tree lhs = gimple_get_lhs (stmt);
2036 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2037 		      gimple_set_location (new_stmt, loc);
2038 		      unlink_stmt_vdef (stmt);
2039 		      gsi_replace (&gsi, new_stmt, true);
2040 		      cleanup_eh = true;
2041 		      if (gimple_vdef (stmt))
2042 			release_ssa_name (gimple_vdef (stmt));
2043 		    }
2044 		  break;
2045 
2046 		CASE_CFN_CABS:
2047 		  arg0 = gimple_call_arg (stmt, 0);
2048 		  loc = gimple_location (stmt);
2049 		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2050 
2051 		  if (result)
2052 		    {
2053 		      tree lhs = gimple_get_lhs (stmt);
2054 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2055 		      gimple_set_location (new_stmt, loc);
2056 		      unlink_stmt_vdef (stmt);
2057 		      gsi_replace (&gsi, new_stmt, true);
2058 		      cleanup_eh = true;
2059 		      if (gimple_vdef (stmt))
2060 			release_ssa_name (gimple_vdef (stmt));
2061 		    }
2062 		  break;
2063 
2064 		default:;
2065 		}
2066 	    }
2067 	}
2068       if (cleanup_eh)
2069 	cfg_changed |= gimple_purge_dead_eh_edges (bb);
2070     }
2071 
2072   statistics_counter_event (fun, "sincos statements inserted",
2073 			    sincos_stats.inserted);
2074 
2075   return cfg_changed ? TODO_cleanup_cfg : 0;
2076 }
2077 
2078 } // anon namespace
2079 
2080 gimple_opt_pass *
2081 make_pass_cse_sincos (gcc::context *ctxt)
2082 {
2083   return new pass_cse_sincos (ctxt);
2084 }
2085 
2086 /* Return true if STMT is a type conversion operation that can be stripped
2087    when used in a widening multiply operation.  */
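/* Illustrative sketch, assuming 16-bit short and 32-bit int: for a multiply
   whose RESULT_TYPE is int, a conversion statement t = (int) s with s of
   type unsigned short can be stripped, because the widening multiply that
   is selected recreates the extension as a by-product.  */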
2088 static bool
2089 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2090 {
2091   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2092 
2093   if (TREE_CODE (result_type) == INTEGER_TYPE)
2094     {
2095       tree op_type;
2096       tree inner_op_type;
2097 
2098       if (!CONVERT_EXPR_CODE_P (rhs_code))
2099 	return false;
2100 
2101       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2102 
2103       /* If the type of OP has the same precision as the result, then
2104 	 we can strip this conversion.  The multiply operation will be
2105 	 selected to create the correct extension as a by-product.  */
2106       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2107 	return true;
2108 
2109       /* We can also strip a conversion if it preserves the signed-ness of
2110 	 the operation and doesn't narrow the range.  */
2111       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2112 
2113       /* If the inner-most type is unsigned, then we can strip any
2114 	 intermediate widening operation.  If it's signed, then the
2115 	 intermediate widening operation must also be signed.  */
2116       if ((TYPE_UNSIGNED (inner_op_type)
2117 	   || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2118 	  && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2119 	return true;
2120 
2121       return false;
2122     }
2123 
2124   return rhs_code == FIXED_CONVERT_EXPR;
2125 }
2126 
2127 /* Return true if RHS is a suitable operand for a widening multiplication,
2128    assuming a target type of TYPE.
2129    There are two cases:
2130 
2131      - RHS makes some value at least twice as wide.  Store that value
2132        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2133 
2134      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2135        and store NULL in *TYPE_OUT.  */
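/* For instance (illustrative, with 32-bit int and 64-bit long long): if
   TYPE is long long and RHS is defined by t = (long long) i with i of type
   int, we return true with *NEW_RHS_OUT = i and *TYPE_OUT = int; for an
   RHS of 7 we return true with *NEW_RHS_OUT = 7 and *TYPE_OUT = NULL.  */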
2136 
2137 static bool
2138 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2139 			tree *new_rhs_out)
2140 {
2141   gimple *stmt;
2142   tree type1, rhs1;
2143 
2144   if (TREE_CODE (rhs) == SSA_NAME)
2145     {
2146       stmt = SSA_NAME_DEF_STMT (rhs);
2147       if (is_gimple_assign (stmt))
2148 	{
2149 	  if (! widening_mult_conversion_strippable_p (type, stmt))
2150 	    rhs1 = rhs;
2151 	  else
2152 	    {
2153 	      rhs1 = gimple_assign_rhs1 (stmt);
2154 
2155 	      if (TREE_CODE (rhs1) == INTEGER_CST)
2156 		{
2157 		  *new_rhs_out = rhs1;
2158 		  *type_out = NULL;
2159 		  return true;
2160 		}
2161 	    }
2162 	}
2163       else
2164 	rhs1 = rhs;
2165 
2166       type1 = TREE_TYPE (rhs1);
2167 
2168       if (TREE_CODE (type1) != TREE_CODE (type)
2169 	  || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2170 	return false;
2171 
2172       *new_rhs_out = rhs1;
2173       *type_out = type1;
2174       return true;
2175     }
2176 
2177   if (TREE_CODE (rhs) == INTEGER_CST)
2178     {
2179       *new_rhs_out = rhs;
2180       *type_out = NULL;
2181       return true;
2182     }
2183 
2184   return false;
2185 }
2186 
2187 /* Return true if STMT performs a widening multiplication, assuming the
2188    output type is TYPE.  If so, store the unwidened types of the operands
2189    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2190    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2191    and *TYPE2_OUT would give the operands of the multiplication.  */
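/* E.g. (illustrative): for r = (long long) a * (long long) b with a and b
   of type int, this stores int in both *TYPE1_OUT and *TYPE2_OUT and the
   unwidened a and b in *RHS1_OUT and *RHS2_OUT.  */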
2192 
2193 static bool
2194 is_widening_mult_p (gimple *stmt,
2195 		    tree *type1_out, tree *rhs1_out,
2196 		    tree *type2_out, tree *rhs2_out)
2197 {
2198   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2199 
2200   if (TREE_CODE (type) == INTEGER_TYPE)
2201     {
2202       if (TYPE_OVERFLOW_TRAPS (type))
2203 	return false;
2204     }
2205   else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2206     return false;
2207 
2208   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2209 			       rhs1_out))
2210     return false;
2211 
2212   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2213 			       rhs2_out))
2214     return false;
2215 
2216   if (*type1_out == NULL)
2217     {
2218       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2219 	return false;
2220       *type1_out = *type2_out;
2221     }
2222 
2223   if (*type2_out == NULL)
2224     {
2225       if (!int_fits_type_p (*rhs2_out, *type1_out))
2226 	return false;
2227       *type2_out = *type1_out;
2228     }
2229 
2230   /* Ensure that the larger of the two operands comes first. */
2231   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2232     {
2233       std::swap (*type1_out, *type2_out);
2234       std::swap (*rhs1_out, *rhs2_out);
2235     }
2236 
2237   return true;
2238 }
2239 
2240 /* Check to see if the CALL statement is an invocation of copysign
2241    with 1. being the first argument.  */
2242 static bool
2243 is_copysign_call_with_1 (gimple *call)
2244 {
2245   gcall *c = dyn_cast <gcall *> (call);
2246   if (! c)
2247     return false;
2248 
2249   enum combined_fn code = gimple_call_combined_fn (c);
2250 
2251   if (code == CFN_LAST)
2252     return false;
2253 
2254   if (builtin_fn_p (code))
2255     {
2256       switch (as_builtin_fn (code))
2257 	{
2258 	CASE_FLT_FN (BUILT_IN_COPYSIGN):
2259 	CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2260 	  return real_onep (gimple_call_arg (c, 0));
2261 	default:
2262 	  return false;
2263 	}
2264     }
2265 
2266   if (internal_fn_p (code))
2267     {
2268       switch (as_internal_fn (code))
2269 	{
2270 	case IFN_COPYSIGN:
2271 	  return real_onep (gimple_call_arg (c, 0));
2272 	default:
2273 	  return false;
2274 	}
2275     }
2276 
2277    return false;
2278 }
2279 
2280 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2281    This only happens when the xorsign optab is defined.  If the pattern
2282    is not a xorsign pattern or if expansion fails, FALSE is returned;
2283    otherwise TRUE is returned.  */
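/* A minimal sketch of the rewrite, assuming the optab exists for the mode:

	t = copysign (1.0, y);
	z = x * t;

   becomes the internal-function call z = XORSIGN (x, y), i.e. x with its
   sign bit XORed with the sign bit of y.  */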
2284 static bool
2285 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2286 {
2287   tree treeop0, treeop1, lhs, type;
2288   location_t loc = gimple_location (stmt);
2289   lhs = gimple_assign_lhs (stmt);
2290   treeop0 = gimple_assign_rhs1 (stmt);
2291   treeop1 = gimple_assign_rhs2 (stmt);
2292   type = TREE_TYPE (lhs);
2293   machine_mode mode = TYPE_MODE (type);
2294 
2295   if (HONOR_SNANS (type))
2296     return false;
2297 
2298   if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2299     {
2300       gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2301       if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2302 	{
2303 	  call0 = SSA_NAME_DEF_STMT (treeop1);
2304 	  if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2305 	    return false;
2306 
2307 	  treeop1 = treeop0;
2308 	}
2309       if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2310 	return false;
2311 
2312       gcall *c = as_a <gcall *> (call0);
2313       treeop0 = gimple_call_arg (c, 1);
2314 
2315       gcall *call_stmt
2316 	= gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2317       gimple_set_lhs (call_stmt, lhs);
2318       gimple_set_location (call_stmt, loc);
2319       gsi_replace (gsi, call_stmt, true);
2320       return true;
2321     }
2322 
2323   return false;
2324 }
2325 
2326 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2327    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2328    value is true iff we converted the statement.  */
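/* E.g. (illustrative, with 32-bit int and 64-bit long long):

	r = (long long) a * (long long) b;

   with a and b of type int becomes a single WIDEN_MULT_EXPR of a and b,
   provided the target implements a suitable widening multiply optab.  */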
2329 
2330 static bool
2331 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2332 {
2333   tree lhs, rhs1, rhs2, type, type1, type2;
2334   enum insn_code handler;
2335   scalar_int_mode to_mode, from_mode, actual_mode;
2336   optab op;
2337   int actual_precision;
2338   location_t loc = gimple_location (stmt);
2339   bool from_unsigned1, from_unsigned2;
2340 
2341   lhs = gimple_assign_lhs (stmt);
2342   type = TREE_TYPE (lhs);
2343   if (TREE_CODE (type) != INTEGER_TYPE)
2344     return false;
2345 
2346   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2347     return false;
2348 
2349   to_mode = SCALAR_INT_TYPE_MODE (type);
2350   from_mode = SCALAR_INT_TYPE_MODE (type1);
2351   if (to_mode == from_mode)
2352     return false;
2353 
2354   from_unsigned1 = TYPE_UNSIGNED (type1);
2355   from_unsigned2 = TYPE_UNSIGNED (type2);
2356 
2357   if (from_unsigned1 && from_unsigned2)
2358     op = umul_widen_optab;
2359   else if (!from_unsigned1 && !from_unsigned2)
2360     op = smul_widen_optab;
2361   else
2362     op = usmul_widen_optab;
2363 
2364   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2365 						  &actual_mode);
2366 
2367   if (handler == CODE_FOR_nothing)
2368     {
2369       if (op != smul_widen_optab)
2370 	{
2371 	  /* We can use a signed multiply with unsigned types as long as
2372 	     there is a wider mode to use, or it is the smaller of the two
2373 	     types that is unsigned.  Note that type1 >= type2, always.  */
2374 	  if ((TYPE_UNSIGNED (type1)
2375 	       && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2376 	      || (TYPE_UNSIGNED (type2)
2377 		  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2378 	    {
2379 	      if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2380 		  || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2381 		return false;
2382 	    }
2383 
2384 	  op = smul_widen_optab;
2385 	  handler = find_widening_optab_handler_and_mode (op, to_mode,
2386 							  from_mode,
2387 							  &actual_mode);
2388 
2389 	  if (handler == CODE_FOR_nothing)
2390 	    return false;
2391 
2392 	  from_unsigned1 = from_unsigned2 = false;
2393 	}
2394       else
2395 	return false;
2396     }
2397 
2398   /* Ensure that the inputs to the handler are in the correct precision
2399      for the opcode.  This will be the full mode size.  */
2400   actual_precision = GET_MODE_PRECISION (actual_mode);
2401   if (2 * actual_precision > TYPE_PRECISION (type))
2402     return false;
2403   if (actual_precision != TYPE_PRECISION (type1)
2404       || from_unsigned1 != TYPE_UNSIGNED (type1))
2405     rhs1 = build_and_insert_cast (gsi, loc,
2406 				  build_nonstandard_integer_type
2407 				    (actual_precision, from_unsigned1), rhs1);
2408   if (actual_precision != TYPE_PRECISION (type2)
2409       || from_unsigned2 != TYPE_UNSIGNED (type2))
2410     rhs2 = build_and_insert_cast (gsi, loc,
2411 				  build_nonstandard_integer_type
2412 				    (actual_precision, from_unsigned2), rhs2);
2413 
2414   /* Handle constants.  */
2415   if (TREE_CODE (rhs1) == INTEGER_CST)
2416     rhs1 = fold_convert (type1, rhs1);
2417   if (TREE_CODE (rhs2) == INTEGER_CST)
2418     rhs2 = fold_convert (type2, rhs2);
2419 
2420   gimple_assign_set_rhs1 (stmt, rhs1);
2421   gimple_assign_set_rhs2 (stmt, rhs2);
2422   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2423   update_stmt (stmt);
2424   widen_mul_stats.widen_mults_inserted++;
2425   return true;
2426 }
2427 
2428 /* Process a single gimple statement STMT, which is found at the
2429    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2430    rhs (given by CODE), and try to convert it into a
2431    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2432    is true iff we converted the statement.  */
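/* E.g. (illustrative):

	acc = acc + (long long) a * (long long) b;

   with a and b of type int can become a single WIDEN_MULT_PLUS_EXPR,
   i.e. a multiply-and-accumulate, when the target provides one.  */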
2433 
2434 static bool
2435 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2436 			    enum tree_code code)
2437 {
2438   gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2439   gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2440   tree type, type1, type2, optype;
2441   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2442   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2443   optab this_optab;
2444   enum tree_code wmult_code;
2445   enum insn_code handler;
2446   scalar_mode to_mode, from_mode, actual_mode;
2447   location_t loc = gimple_location (stmt);
2448   int actual_precision;
2449   bool from_unsigned1, from_unsigned2;
2450 
2451   lhs = gimple_assign_lhs (stmt);
2452   type = TREE_TYPE (lhs);
2453   if (TREE_CODE (type) != INTEGER_TYPE
2454       && TREE_CODE (type) != FIXED_POINT_TYPE)
2455     return false;
2456 
2457   if (code == MINUS_EXPR)
2458     wmult_code = WIDEN_MULT_MINUS_EXPR;
2459   else
2460     wmult_code = WIDEN_MULT_PLUS_EXPR;
2461 
2462   rhs1 = gimple_assign_rhs1 (stmt);
2463   rhs2 = gimple_assign_rhs2 (stmt);
2464 
2465   if (TREE_CODE (rhs1) == SSA_NAME)
2466     {
2467       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2468       if (is_gimple_assign (rhs1_stmt))
2469 	rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2470     }
2471 
2472   if (TREE_CODE (rhs2) == SSA_NAME)
2473     {
2474       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2475       if (is_gimple_assign (rhs2_stmt))
2476 	rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2477     }
2478 
2479   /* Allow for one conversion statement between the multiply
2480      and addition/subtraction statement.  If there is more than
2481      one conversion then we assume they would invalidate this
2482      transformation.  If that's not the case then they should have
2483      been folded before now.  */
2484   if (CONVERT_EXPR_CODE_P (rhs1_code))
2485     {
2486       conv1_stmt = rhs1_stmt;
2487       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2488       if (TREE_CODE (rhs1) == SSA_NAME)
2489 	{
2490 	  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2491 	  if (is_gimple_assign (rhs1_stmt))
2492 	    rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2493 	}
2494       else
2495 	return false;
2496     }
2497   if (CONVERT_EXPR_CODE_P (rhs2_code))
2498     {
2499       conv2_stmt = rhs2_stmt;
2500       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2501       if (TREE_CODE (rhs2) == SSA_NAME)
2502 	{
2503 	  rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2504 	  if (is_gimple_assign (rhs2_stmt))
2505 	    rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2506 	}
2507       else
2508 	return false;
2509     }
2510 
2511   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2512      is_widening_mult_p, but we still need the returned rhs values.
2513 
2514      It might also appear that it would be sufficient to use the existing
2515      operands of the widening multiply, but that would limit the choice of
2516      multiply-and-accumulate instructions.
2517 
2518      If the widened-multiplication result has more than one use, it is
2519      probably wiser not to do the conversion.  */
2520   if (code == PLUS_EXPR
2521       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2522     {
2523       if (!has_single_use (rhs1)
2524 	  || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2525 				  &type2, &mult_rhs2))
2526 	return false;
2527       add_rhs = rhs2;
2528       conv_stmt = conv1_stmt;
2529     }
2530   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2531     {
2532       if (!has_single_use (rhs2)
2533 	  || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2534 				  &type2, &mult_rhs2))
2535 	return false;
2536       add_rhs = rhs1;
2537       conv_stmt = conv2_stmt;
2538     }
2539   else
2540     return false;
2541 
2542   to_mode = SCALAR_TYPE_MODE (type);
2543   from_mode = SCALAR_TYPE_MODE (type1);
2544   if (to_mode == from_mode)
2545     return false;
2546 
2547   from_unsigned1 = TYPE_UNSIGNED (type1);
2548   from_unsigned2 = TYPE_UNSIGNED (type2);
2549   optype = type1;
2550 
2551   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2552   if (from_unsigned1 != from_unsigned2)
2553     {
2554       if (!INTEGRAL_TYPE_P (type))
2555 	return false;
2556       /* We can use a signed multiply with unsigned types as long as
2557 	 there is a wider mode to use, or it is the smaller of the two
2558 	 types that is unsigned.  Note that type1 >= type2, always.  */
2559       if ((from_unsigned1
2560 	   && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2561 	  || (from_unsigned2
2562 	      && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2563 	{
2564 	  if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2565 	      || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2566 	    return false;
2567 	}
2568 
2569       from_unsigned1 = from_unsigned2 = false;
2570       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2571 					       false);
2572     }
2573 
2574   /* If there was a conversion between the multiply and addition
2575      then we need to make sure it fits a multiply-and-accumulate.
2576      There should be a single mode change which does not change the
2577      value.  */
2578   if (conv_stmt)
2579     {
2580       /* We use the original, unmodified data types for this.  */
2581       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2582       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2583       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2584       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2585 
2586       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2587 	{
2588 	  /* Conversion is a truncate.  */
2589 	  if (TYPE_PRECISION (to_type) < data_size)
2590 	    return false;
2591 	}
2592       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2593 	{
2594 	  /* Conversion is an extend.  Check it's the right sort.  */
2595 	  if (TYPE_UNSIGNED (from_type) != is_unsigned
2596 	      && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2597 	    return false;
2598 	}
2599       /* else convert is a no-op for our purposes.  */
2600     }
2601 
2602   /* Verify that the machine can perform a widening multiply
2603      accumulate in this mode/signedness combination, otherwise
2604      this transformation is likely to pessimize code.  */
2605   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2606   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2607 						  from_mode, &actual_mode);
2608 
2609   if (handler == CODE_FOR_nothing)
2610     return false;
2611 
2612   /* Ensure that the inputs to the handler are in the correct precision
2613      for the opcode.  This will be the full mode size.  */
2614   actual_precision = GET_MODE_PRECISION (actual_mode);
2615   if (actual_precision != TYPE_PRECISION (type1)
2616       || from_unsigned1 != TYPE_UNSIGNED (type1))
2617     mult_rhs1 = build_and_insert_cast (gsi, loc,
2618 				       build_nonstandard_integer_type
2619 				         (actual_precision, from_unsigned1),
2620 				       mult_rhs1);
2621   if (actual_precision != TYPE_PRECISION (type2)
2622       || from_unsigned2 != TYPE_UNSIGNED (type2))
2623     mult_rhs2 = build_and_insert_cast (gsi, loc,
2624 				       build_nonstandard_integer_type
2625 					 (actual_precision, from_unsigned2),
2626 				       mult_rhs2);
2627 
2628   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2629     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2630 
2631   /* Handle constants.  */
2632   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2633     mult_rhs1 = fold_convert (type1, mult_rhs1);
2634   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2635     mult_rhs2 = fold_convert (type2, mult_rhs2);
2636 
2637   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2638 				  add_rhs);
2639   update_stmt (gsi_stmt (*gsi));
2640   widen_mul_stats.maccs_inserted++;
2641   return true;
2642 }
2643 
2644 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2645    OP2 and which we know is used in statements that can be, together with the
2646    multiplication, converted to FMAs, perform the transformation.  */
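/* For instance (illustrative): given

	tmp = a * b;
	r1 = tmp + c;
	r2 = d - tmp;

   the uses become r1 = FMA <a, b, c> and r2 = FMA <-a, b, d>, leaving the
   multiplication statement dead for the caller to remove.  */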
2647 
2648 static void
2649 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2650 {
2651   tree type = TREE_TYPE (mul_result);
2652   gimple *use_stmt;
2653   imm_use_iterator imm_iter;
2654   gassign *fma_stmt;
2655 
2656   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2657     {
2658       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2659       enum tree_code use_code;
2660       tree addop, mulop1 = op1, result = mul_result;
2661       bool negate_p = false;
2662 
2663       if (is_gimple_debug (use_stmt))
2664 	continue;
2665 
2666       use_code = gimple_assign_rhs_code (use_stmt);
2667       if (use_code == NEGATE_EXPR)
2668 	{
2669 	  result = gimple_assign_lhs (use_stmt);
2670 	  use_operand_p use_p;
2671 	  gimple *neguse_stmt;
2672 	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2673 	  gsi_remove (&gsi, true);
2674 	  release_defs (use_stmt);
2675 
2676 	  use_stmt = neguse_stmt;
2677 	  gsi = gsi_for_stmt (use_stmt);
2678 	  use_code = gimple_assign_rhs_code (use_stmt);
2679 	  negate_p = true;
2680 	}
2681 
2682       if (gimple_assign_rhs1 (use_stmt) == result)
2683 	{
2684 	  addop = gimple_assign_rhs2 (use_stmt);
2685 	  /* a * b - c -> a * b + (-c)  */
2686 	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2687 	    addop = force_gimple_operand_gsi (&gsi,
2688 					      build1 (NEGATE_EXPR,
2689 						      type, addop),
2690 					      true, NULL_TREE, true,
2691 					      GSI_SAME_STMT);
2692 	}
2693       else
2694 	{
2695 	  addop = gimple_assign_rhs1 (use_stmt);
2696 	  /* a - b * c -> (-b) * c + a */
2697 	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2698 	    negate_p = !negate_p;
2699 	}
2700 
2701       if (negate_p)
2702 	mulop1 = force_gimple_operand_gsi (&gsi,
2703 					   build1 (NEGATE_EXPR,
2704 						   type, mulop1),
2705 					   true, NULL_TREE, true,
2706 					   GSI_SAME_STMT);
2707 
2708       fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
2709 				      FMA_EXPR, mulop1, op2, addop);
2710 
2711       if (dump_file && (dump_flags & TDF_DETAILS))
2712 	{
2713 	  fprintf (dump_file, "Generated FMA ");
2714 	  print_gimple_stmt (dump_file, fma_stmt, 0, 0);
2715 	  fprintf (dump_file, "\n");
2716 	}
2717 
2718       gsi_replace (&gsi, fma_stmt, true);
2719       widen_mul_stats.fmas_inserted++;
2720     }
2721 }
2722 
2723 /* Data necessary to perform the actual transformation from a multiplication
2724    and an addition to an FMA once the decision to do so has been taken, and
2725    to then delete the multiplication statement from the function IL.  */
2726 
2727 struct fma_transformation_info
2728 {
2729   gimple *mul_stmt;
2730   tree mul_result;
2731   tree op1;
2732   tree op2;
2733 };
2734 
2735 /* Structure containing the current state of FMA deferring, i.e. whether we are
2736    deferring, whether to continue deferring, and all data necessary to come
2737    back and perform all deferred transformations.  */
2738 
2739 class fma_deferring_state
2740 {
2741 public:
2742   /* Class constructor.  Pass true as PERFORM_DEFERRING in order to actually
2743      do any deferring.  */
2744 
2745   fma_deferring_state (bool perform_deferring)
2746     : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
2747       m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
2748 
2749   /* List of FMA candidates for which the transformation has been determined
2750      to be possible but which, at this point in the BB analysis, we do not
2751      yet consider beneficial.  */
2752   auto_vec<fma_transformation_info, 8> m_candidates;
2753 
2754   /* Set of multiplication results that are part of already deferred FMA
2755      candidates.  */
2756   hash_set<tree> m_mul_result_set;
2757 
2758   /* The PHI that supposedly feeds back the result of one FMA to another
2759      across a loop boundary.  */
2760   gphi *m_initial_phi;
2761 
2762   /* Result of the last produced FMA candidate or NULL if there has not been
2763      one.  */
2764   tree m_last_result;
2765 
2766   /* If true, deferring might still be profitable.  If false, transform all
2767      candidates and no longer defer.  */
2768   bool m_deferring_p;
2769 };
2770 
2771 /* Transform all deferred FMA candidates and mark STATE as no longer
2772    deferring.  */
2773 
2774 static void
2775 cancel_fma_deferring (fma_deferring_state *state)
2776 {
2777   if (!state->m_deferring_p)
2778     return;
2779 
2780   for (unsigned i = 0; i < state->m_candidates.length (); i++)
2781     {
2782       if (dump_file && (dump_flags & TDF_DETAILS))
2783 	fprintf (dump_file, "Generating deferred FMA\n");
2784 
2785       const fma_transformation_info &fti = state->m_candidates[i];
2786       convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
2787 
2788       gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
2789       gsi_remove (&gsi, true);
2790       release_defs (fti.mul_stmt);
2791     }
2792   state->m_deferring_p = false;
2793 }
2794 
2795 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
2796    Otherwise return NULL.  */
2797 
2798 static gphi *
2799 result_of_phi (tree op)
2800 {
2801   if (TREE_CODE (op) != SSA_NAME)
2802     return NULL;
2803 
2804   return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
2805 }
2806 
2807 /* After processing statements of a BB and recording STATE, return true if the
2808    initial phi is fed by the last FMA candidate result or one such result from
2809    previously processed BBs marked in LAST_RESULT_SET.  */
2810 
2811 static bool
2812 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
2813 				      hash_set<tree> *last_result_set)
2814 {
2815   ssa_op_iter iter;
2816   use_operand_p use;
2817   FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
2818     {
2819       tree t = USE_FROM_PTR (use);
2820       if (t == state->m_last_result
2821 	  || last_result_set->contains (t))
2822 	return true;
2823     }
2824 
2825   return false;
2826 }
2827 
2828 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2829    with uses in additions and subtractions to form fused multiply-add
2830    operations.  Returns true if successful and MUL_STMT should be removed.
2831 
2832    If STATE indicates that we are deferring FMA transformation, that means
2833    that we do not produce FMAs for basic blocks which look like:
2834 
2835     <bb 6>
2836     # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
2837     _65 = _14 * _16;
2838     accumulator_66 = _65 + accumulator_111;
2839 
2840   or its unrolled version, i.e. with several FMA candidates that feed the result
2841   of one into the addend of another.  Instead, we add them to a list in STATE
2842   and if we later discover an FMA candidate that is not part of such a chain,
2843   we go back and perform all deferred past candidates.  */
2844 
2845 static bool
2846 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
2847 		     fma_deferring_state *state)
2848 {
2849   tree mul_result = gimple_get_lhs (mul_stmt);
2850   tree type = TREE_TYPE (mul_result);
2851   gimple *use_stmt, *neguse_stmt;
2852   use_operand_p use_p;
2853   imm_use_iterator imm_iter;
2854 
2855   if (FLOAT_TYPE_P (type)
2856       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2857     return false;
2858 
2859   /* We don't want to do bitfield reduction ops.  */
2860   if (INTEGRAL_TYPE_P (type)
2861       && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
2862     return false;
2863 
2864   /* If the target doesn't support it, don't generate it.  We assume that
2865      if fma isn't available then fms, fnma or fnms are not either.  */
2866   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2867     return false;
2868 
2869   /* If the multiplication has zero uses, it is kept around probably because
2870      of -fnon-call-exceptions.  Don't optimize it away in that case,
2871      it is DCE's job.  */
2872   if (has_zero_uses (mul_result))
2873     return false;
2874 
2875   bool check_defer
2876     = (state->m_deferring_p
2877        && (tree_to_shwi (TYPE_SIZE (type))
2878 	   <= PARAM_VALUE (PARAM_AVOID_FMA_MAX_BITS)));
2879   bool defer = check_defer;
2880   /* Make sure that the multiplication statement becomes dead after
2881      the transformation, i.e. that all uses are transformed to FMAs.
2882      This means we assume that an FMA operation has the same cost
2883      as an addition.  */
2884   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2885     {
2886       enum tree_code use_code;
2887       tree result = mul_result;
2888       bool negate_p = false;
2889 
2890       use_stmt = USE_STMT (use_p);
2891 
2892       if (is_gimple_debug (use_stmt))
2893 	continue;
2894 
2895       /* For now restrict this operation to single basic blocks.  In theory
2896 	 we would want to support sinking the multiplication in
2897 	 m = a*b;
2898 	 if ()
2899 	   ma = m + c;
2900 	 else
2901 	   d = m;
2902 	 to form a fma in the then block and sink the multiplication to the
2903 	 else block.  */
2904       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2905 	return false;
2906 
2907       if (!is_gimple_assign (use_stmt))
2908 	return false;
2909 
2910       use_code = gimple_assign_rhs_code (use_stmt);
2911 
2912       /* A negate on the multiplication leads to FNMA.  */
2913       if (use_code == NEGATE_EXPR)
2914 	{
2915 	  ssa_op_iter iter;
2916 	  use_operand_p usep;
2917 
2918 	  result = gimple_assign_lhs (use_stmt);
2919 
2920 	  /* Make sure the negate statement becomes dead with this
2921 	     single transformation.  */
2922 	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
2923 			       &use_p, &neguse_stmt))
2924 	    return false;
2925 
2926 	  /* Make sure the multiplication isn't also used on that stmt.  */
2927 	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2928 	    if (USE_FROM_PTR (usep) == mul_result)
2929 	      return false;
2930 
2931 	  /* Re-validate.  */
2932 	  use_stmt = neguse_stmt;
2933 	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2934 	    return false;
2935 	  if (!is_gimple_assign (use_stmt))
2936 	    return false;
2937 
2938 	  use_code = gimple_assign_rhs_code (use_stmt);
2939 	  negate_p = true;
2940 	}
2941 
2942       switch (use_code)
2943 	{
2944 	case MINUS_EXPR:
2945 	  if (gimple_assign_rhs2 (use_stmt) == result)
2946 	    negate_p = !negate_p;
2947 	  break;
2948 	case PLUS_EXPR:
2949 	  break;
2950 	default:
2951 	  /* FMA can only be formed from PLUS and MINUS.  */
2952 	  return false;
2953 	}
2954 
2955       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
2956 	 by a MULT_EXPR that we'll visit later, we might be able to
2957 	 get a more profitable match with fnma.
2958 	 OTOH, if we don't, a negate / fma pair has likely lower latency
2959 	 than a mult / subtract pair.  */
2960       if (use_code == MINUS_EXPR && !negate_p
2961 	  && gimple_assign_rhs1 (use_stmt) == result
2962 	  && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
2963 	  && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
2964 	{
2965 	  tree rhs2 = gimple_assign_rhs2 (use_stmt);
2966 
2967 	  if (TREE_CODE (rhs2) == SSA_NAME)
2968 	    {
2969 	      gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
2970 	      if (has_single_use (rhs2)
2971 		  && is_gimple_assign (stmt2)
2972 		  && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
2973 		return false;
2974 	    }
2975 	}
2976 
2977       tree use_rhs1 = gimple_assign_rhs1 (use_stmt);
2978       tree use_rhs2 = gimple_assign_rhs2 (use_stmt);
2979       /* We can't handle a * b + a * b.  */
2980       if (use_rhs1 == use_rhs2)
2981 	return false;
2982       /* If deferring, make sure we are not looking at an instruction that
2983 	 wouldn't have existed if we were not.  */
2984       if (state->m_deferring_p
2985 	  && (state->m_mul_result_set.contains (use_rhs1)
2986 	      || state->m_mul_result_set.contains (use_rhs2)))
2987 	return false;
2988 
2989       if (check_defer)
2990 	{
2991 	  tree use_lhs = gimple_assign_lhs (use_stmt);
2992 	  if (state->m_last_result)
2993 	    {
2994 	      if (use_rhs2 == state->m_last_result
2995 		  || use_rhs1 == state->m_last_result)
2996 		defer = true;
2997 	      else
2998 		defer = false;
2999 	    }
3000 	  else
3001 	    {
3002 	      gcc_checking_assert (!state->m_initial_phi);
3003 	      gphi *phi;
3004 	      if (use_rhs1 == result)
3005 		phi = result_of_phi (use_rhs2);
3006 	      else
3007 		{
3008 		  gcc_assert (use_rhs2 == result);
3009 		  phi = result_of_phi (use_rhs1);
3010 		}
3011 
3012 	      if (phi)
3013 		{
3014 		  state->m_initial_phi = phi;
3015 		  defer = true;
3016 		}
3017 	      else
3018 		defer = false;
3019 	    }
3020 
3021 	  state->m_last_result = use_lhs;
3022 	  check_defer = false;
3023 	}
3024       else
3025 	defer = false;
3026 
3027       /* While it is possible to validate whether or not the exact form that
3028 	 we've recognized is available in the backend, the assumption is that
3029 	 if the deferring logic above did not trigger, the transformation is
3030 	 never a loss.  For instance, suppose the target only has the plain FMA
3031 	 pattern available.  Consider a*b-c -> fma(a,b,-c): we've exchanged
3032 	 MUL+SUB for FMA+NEG, which is still two operations.  Consider
3033          -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3034 	 form the two NEGs are independent and could be run in parallel.  */
3035     }
3036 
3037   if (defer)
3038     {
3039       fma_transformation_info fti;
3040       fti.mul_stmt = mul_stmt;
3041       fti.mul_result = mul_result;
3042       fti.op1 = op1;
3043       fti.op2 = op2;
3044       state->m_candidates.safe_push (fti);
3045       state->m_mul_result_set.add (mul_result);
3046 
3047       if (dump_file && (dump_flags & TDF_DETAILS))
3048 	{
3049 	  fprintf (dump_file, "Deferred generating FMA for multiplication ");
3050 	  print_gimple_stmt (dump_file, mul_stmt, 0, 0);
3051 	  fprintf (dump_file, "\n");
3052 	}
3053 
3054       return false;
3055     }
3056   else
3057     {
3058       if (state->m_deferring_p)
3059 	cancel_fma_deferring (state);
3060       convert_mult_to_fma_1 (mul_result, op1, op2);
3061       return true;
3062     }
3063 }
3064 
3065 
3066 /* Helper function of match_uaddsub_overflow.  Return 1
3067    if USE_STMT is an unsigned overflow check ovf != 0 for
3068    STMT, -1 if USE_STMT is an unsigned overflow check ovf == 0
3069    and 0 otherwise.  */
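/* E.g. (illustrative): for STMT r = a + b with unsigned operands, a use of
   the form if (a > r) is an overflow check and yields 1, if (a <= r)
   yields -1, and any other comparison yields 0.  */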
3070 
3071 static int
3072 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3073 {
3074   enum tree_code ccode = ERROR_MARK;
3075   tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3076   if (gimple_code (use_stmt) == GIMPLE_COND)
3077     {
3078       ccode = gimple_cond_code (use_stmt);
3079       crhs1 = gimple_cond_lhs (use_stmt);
3080       crhs2 = gimple_cond_rhs (use_stmt);
3081     }
3082   else if (is_gimple_assign (use_stmt))
3083     {
3084       if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3085 	{
3086 	  ccode = gimple_assign_rhs_code (use_stmt);
3087 	  crhs1 = gimple_assign_rhs1 (use_stmt);
3088 	  crhs2 = gimple_assign_rhs2 (use_stmt);
3089 	}
3090       else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3091 	{
3092 	  tree cond = gimple_assign_rhs1 (use_stmt);
3093 	  if (COMPARISON_CLASS_P (cond))
3094 	    {
3095 	      ccode = TREE_CODE (cond);
3096 	      crhs1 = TREE_OPERAND (cond, 0);
3097 	      crhs2 = TREE_OPERAND (cond, 1);
3098 	    }
3099 	  else
3100 	    return 0;
3101 	}
3102       else
3103 	return 0;
3104     }
3105   else
3106     return 0;
3107 
3108   if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3109     return 0;
3110 
3111   enum tree_code code = gimple_assign_rhs_code (stmt);
3112   tree lhs = gimple_assign_lhs (stmt);
3113   tree rhs1 = gimple_assign_rhs1 (stmt);
3114   tree rhs2 = gimple_assign_rhs2 (stmt);
3115 
3116   switch (ccode)
3117     {
3118     case GT_EXPR:
3119     case LE_EXPR:
3120       /* r = a - b; r > a or r <= a
3121 	 r = a + b; a > r or a <= r or b > r or b <= r.  */
3122       if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3123 	  || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3124 	      && crhs2 == lhs))
3125 	return ccode == GT_EXPR ? 1 : -1;
3126       break;
3127     case LT_EXPR:
3128     case GE_EXPR:
3129       /* r = a - b; a < r or a >= r
3130 	 r = a + b; r < a or r >= a or r < b or r >= b.  */
3131       if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3132 	  || (code == PLUS_EXPR && crhs1 == lhs
3133 	      && (crhs2 == rhs1 || crhs2 == rhs2)))
3134 	return ccode == LT_EXPR ? 1 : -1;
3135       break;
3136     default:
3137       break;
3138     }
3139   return 0;
3140 }
3141 
3142 /* Recognize for unsigned x
3143    x = y - z;
3144    if (x > y)
3145    where there are other uses of x and replace it with
3146    _7 = SUB_OVERFLOW (y, z);
3147    x = REALPART_EXPR <_7>;
3148    _8 = IMAGPART_EXPR <_7>;
3149    if (_8)
3150    and similarly for addition.  */
3151 
3152 static bool
3153 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3154 			enum tree_code code)
3155 {
3156   tree lhs = gimple_assign_lhs (stmt);
3157   tree type = TREE_TYPE (lhs);
3158   use_operand_p use_p;
3159   imm_use_iterator iter;
3160   bool use_seen = false;
3161   bool ovf_use_seen = false;
3162   gimple *use_stmt;
3163 
3164   gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3165   if (!INTEGRAL_TYPE_P (type)
3166       || !TYPE_UNSIGNED (type)
3167       || has_zero_uses (lhs)
3168       || has_single_use (lhs)
3169       || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3170 			TYPE_MODE (type)) == CODE_FOR_nothing)
3171     return false;
3172 
3173   FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3174     {
3175       use_stmt = USE_STMT (use_p);
3176       if (is_gimple_debug (use_stmt))
3177 	continue;
3178 
3179       if (uaddsub_overflow_check_p (stmt, use_stmt))
3180 	ovf_use_seen = true;
3181       else
3182 	use_seen = true;
3183       if (ovf_use_seen && use_seen)
3184 	break;
3185     }
3186 
3187   if (!ovf_use_seen || !use_seen)
3188     return false;
3189 
3190   tree ctype = build_complex_type (type);
3191   tree rhs1 = gimple_assign_rhs1 (stmt);
3192   tree rhs2 = gimple_assign_rhs2 (stmt);
3193   gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3194 					 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3195 					 2, rhs1, rhs2);
3196   tree ctmp = make_ssa_name (ctype);
3197   gimple_call_set_lhs (g, ctmp);
3198   gsi_insert_before (gsi, g, GSI_SAME_STMT);
3199   gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3200 				     build1 (REALPART_EXPR, type, ctmp));
3201   gsi_replace (gsi, g2, true);
3202   tree ovf = make_ssa_name (type);
3203   g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3204 			    build1 (IMAGPART_EXPR, type, ctmp));
3205   gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3206 
3207   FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3208     {
3209       if (is_gimple_debug (use_stmt))
3210 	continue;
3211 
3212       int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3213       if (ovf_use == 0)
3214 	continue;
3215       if (gimple_code (use_stmt) == GIMPLE_COND)
3216 	{
3217 	  gcond *cond_stmt = as_a <gcond *> (use_stmt);
3218 	  gimple_cond_set_lhs (cond_stmt, ovf);
3219 	  gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3220 	  gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3221 	}
3222       else
3223 	{
3224 	  gcc_checking_assert (is_gimple_assign (use_stmt));
3225 	  if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3226 	    {
3227 	      gimple_assign_set_rhs1 (use_stmt, ovf);
3228 	      gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3229 	      gimple_assign_set_rhs_code (use_stmt,
3230 					  ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3231 	    }
3232 	  else
3233 	    {
3234 	      gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3235 				   == COND_EXPR);
3236 	      tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3237 				  boolean_type_node, ovf,
3238 				  build_int_cst (type, 0));
3239 	      gimple_assign_set_rhs1 (use_stmt, cond);
3240 	    }
3241 	}
3242       update_stmt (use_stmt);
3243     }
3244   return true;
3245 }
3246 
3247 /* Return true if target has support for divmod.  */
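
/* For example (an illustrative scenario), if MODE is SImode and the
   target provides a DImode division insn but only an SImode divmod
   libfunc, we return false: the division can then be expanded inline
   in the wider mode instead of through the libcall.  */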

static bool
target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
{
  /* If target supports hardware divmod insn, use it for divmod.  */
  if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
    return true;

  /* Check if libfunc for divmod is available.  */
  rtx libfunc = optab_libfunc (divmod_optab, mode);
  if (libfunc != NULL_RTX)
    {
      /* If optab_handler exists for div_optab, perhaps in a wider mode,
	 we don't want to use the libfunc even if it exists for given mode.  */
      machine_mode div_mode;
      FOR_EACH_MODE_FROM (div_mode, mode)
	if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
	  return false;

      return targetm.expand_divmod_libfunc != NULL;
    }

  return false;
}

/* Check if stmt is a candidate for the divmod transform.  */
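
/* For instance (illustrative), in

     q = x / y;
     r = x % y;

   both statements are candidates provided neither x nor y is a
   constant; x / 16, by contrast, is rejected because divisions by
   constants may have cheaper specialized expansions.  */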

static bool
divmod_candidate_p (gassign *stmt)
{
  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
  machine_mode mode = TYPE_MODE (type);
  optab divmod_optab, div_optab;

  if (TYPE_UNSIGNED (type))
    {
      divmod_optab = udivmod_optab;
      div_optab = udiv_optab;
    }
  else
    {
      divmod_optab = sdivmod_optab;
      div_optab = sdiv_optab;
    }

  tree op1 = gimple_assign_rhs1 (stmt);
  tree op2 = gimple_assign_rhs2 (stmt);

  /* Disable the transform if either is a constant, since division-by-constant
     may have specialized expansion.  */
  if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
    return false;

  /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
     expand using the [su]divv optabs.  */
  if (TYPE_OVERFLOW_TRAPS (type))
    return false;

  if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
    return false;

  return true;
}

/* This function looks for:
   t1 = a TRUNC_DIV_EXPR b;
   t2 = a TRUNC_MOD_EXPR b;
   and transforms it to the following sequence:
   complex_tmp = DIVMOD (a, b);
   t1 = REALPART_EXPR (complex_tmp);
   t2 = IMAGPART_EXPR (complex_tmp);
   For conditions enabling the transform see divmod_candidate_p ().

   The pass has three parts:
   1) Find top_stmt, which is a trunc_div or trunc_mod stmt that dominates all
      other trunc_div_expr and trunc_mod_expr stmts.
   2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
      to the stmts vector.
   3) Insert a DIVMOD call just before top_stmt and update entries in the
      stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
      IMAGPART_EXPR for mod).  */
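
/* For example (an illustrative CFG), given

     t1 = a / b;			<-- top_stmt
     if (cond)
       t2 = a % b;			<-- dominated by top_stmt's block

   the DIVMOD call is inserted right before t1's statement, and both
   t1 and t2 are then rewritten to project the real and imaginary
   parts of its complex result.  */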

static bool
convert_to_divmod (gassign *stmt)
{
  if (stmt_can_throw_internal (stmt)
      || !divmod_candidate_p (stmt))
    return false;

  tree op1 = gimple_assign_rhs1 (stmt);
  tree op2 = gimple_assign_rhs2 (stmt);

  imm_use_iterator use_iter;
  gimple *use_stmt;
  auto_vec<gimple *> stmts;

  gimple *top_stmt = stmt;
  basic_block top_bb = gimple_bb (stmt);

  /* Part 1: Try to set top_stmt to the "topmost" stmt that dominates
     at least stmt and possibly other trunc_div/trunc_mod stmts
     having the same operands as stmt.  */

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
    {
      if (is_gimple_assign (use_stmt)
	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
	  && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
	  && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
	{
	  if (stmt_can_throw_internal (use_stmt))
	    continue;

	  basic_block bb = gimple_bb (use_stmt);

	  if (bb == top_bb)
	    {
	      if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
		top_stmt = use_stmt;
	    }
	  else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
	    {
	      top_bb = bb;
	      top_stmt = use_stmt;
	    }
	}
    }

  tree top_op1 = gimple_assign_rhs1 (top_stmt);
  tree top_op2 = gimple_assign_rhs2 (top_stmt);

  stmts.safe_push (top_stmt);
  bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);

  /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
     to the stmts vector.  This loop will always add stmt to the vector,
     since gimple_bb (top_stmt) dominates gimple_bb (stmt), so it ends up
     adding at least a single trunc_mod_expr stmt.  */

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
    {
      if (is_gimple_assign (use_stmt)
	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
	  && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
	  && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
	{
	  if (use_stmt == top_stmt
	      || stmt_can_throw_internal (use_stmt)
	      || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
	    continue;

	  stmts.safe_push (use_stmt);
	  if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
	    div_seen = true;
	}
    }

  if (!div_seen)
    return false;

  /* Part 3: Create a call to the internal fn DIVMOD:
     divmod_tmp = DIVMOD (op1, op2).  */

  gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
  tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
				 call_stmt, "divmod_tmp");
  gimple_call_set_lhs (call_stmt, res);
  /* We rejected throwing statements above.  */
  gimple_call_set_nothrow (call_stmt, true);

  /* Insert the call before top_stmt.  */
  gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
  gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);

  widen_mul_stats.divmod_calls_inserted++;

  /* Update all statements in stmts vector:
     lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
     lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>.  */

  for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree new_rhs;

      switch (gimple_assign_rhs_code (use_stmt))
	{
	  case TRUNC_DIV_EXPR:
	    new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
	    break;

	  case TRUNC_MOD_EXPR:
	    new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
	    break;

	  default:
	    gcc_unreachable ();
	}

      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
      update_stmt (use_stmt);
    }

  return true;
}

/* Find integer multiplications where the operands are extended from
   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
   where appropriate.  */
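
/* A typical (illustrative) source pattern:

     int a, b;
     long long p = (long long) a * b;

   Here both multiplication operands are sign-extended from int, so on
   targets with a widening multiply instruction the MULT_EXPR becomes
   a single WIDEN_MULT_EXPR.  */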

namespace {

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_WIDEN_MUL, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul

/* Walker class to perform the transformation in reverse dominance order.  */

class math_opts_dom_walker : public dom_walker
{
public:
  /* Constructor.  CFG_CHANGED_P is a pointer to a boolean flag that will be
     set if walking modifies the CFG.  */

  math_opts_dom_walker (bool *cfg_changed_p)
    : dom_walker (CDI_DOMINATORS), m_last_result_set (),
      m_cfg_changed_p (cfg_changed_p) {}

  /* The actual actions performed in the walk.  */

  virtual void after_dom_children (basic_block);

  /* Set of results of chains of multiply and add statement combinations that
     were not transformed into FMAs because of active deferring.  */
  hash_set<tree> m_last_result_set;

  /* Pointer to a flag of the user that needs to be set if CFG has been
     modified.  */
  bool *m_cfg_changed_p;
};

void
math_opts_dom_walker::after_dom_children (basic_block bb)
{
  gimple_stmt_iterator gsi;

  fma_deferring_state fma_state (PARAM_VALUE (PARAM_AVOID_FMA_MAX_BITS) > 0);

  for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
    {
      gimple *stmt = gsi_stmt (gsi);
      enum tree_code code;

      if (is_gimple_assign (stmt))
	{
	  code = gimple_assign_rhs_code (stmt);
	  switch (code)
	    {
	    case MULT_EXPR:
	      if (!convert_mult_to_widen (stmt, &gsi)
		  && !convert_expand_mult_copysign (stmt, &gsi)
		  && convert_mult_to_fma (stmt,
					  gimple_assign_rhs1 (stmt),
					  gimple_assign_rhs2 (stmt),
					  &fma_state))
		{
		  gsi_remove (&gsi, true);
		  release_defs (stmt);
		  continue;
		}
	      break;

	    case PLUS_EXPR:
	    case MINUS_EXPR:
	      if (!convert_plusminus_to_widen (&gsi, stmt, code))
		match_uaddsub_overflow (&gsi, stmt, code);
	      break;

	    case TRUNC_MOD_EXPR:
	      convert_to_divmod (as_a<gassign *> (stmt));
	      break;

	    default:;
	    }
	}
      else if (is_gimple_call (stmt))
	{
	  tree fndecl = gimple_call_fndecl (stmt);
	  if (fndecl && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
	    {
	      switch (DECL_FUNCTION_CODE (fndecl))
		{
		case BUILT_IN_POWF:
		case BUILT_IN_POW:
		case BUILT_IN_POWL:
		  if (gimple_call_lhs (stmt)
		      && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
		      && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
				     &dconst2)
		      && convert_mult_to_fma (stmt,
					      gimple_call_arg (stmt, 0),
					      gimple_call_arg (stmt, 0),
					      &fma_state))
		    {
		      unlink_stmt_vdef (stmt);
		      if (gsi_remove (&gsi, true)
			  && gimple_purge_dead_eh_edges (bb))
			*m_cfg_changed_p = true;
		      release_defs (stmt);
		      continue;
		    }
		  break;

		default:;
		}
	    }
	  else
	    cancel_fma_deferring (&fma_state);
	}
      gsi_next (&gsi);
    }
  if (fma_state.m_deferring_p
      && fma_state.m_initial_phi)
    {
      gcc_checking_assert (fma_state.m_last_result);
      if (!last_fma_candidate_feeds_initial_phi (&fma_state,
						 &m_last_result_set))
	cancel_fma_deferring (&fma_state);
      else
	m_last_result_set.add (fma_state.m_last_result);
    }
}


unsigned int
pass_optimize_widening_mul::execute (function *fun)
{
  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  renumber_gimple_stmt_uids ();

  math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));

  statistics_counter_event (fun, "widening multiplications inserted",
			    widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
			    widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
			    widen_mul_stats.fmas_inserted);
  statistics_counter_event (fun, "divmod calls inserted",
			    widen_mul_stats.divmod_calls_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}