/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* Currently, the only mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, experiments found that the transformation is not
   always useful if there are only two divisions by the same divisor.
   This is probably because modern processors can pipeline the
   divisions; on older, in-order processors it should still be
   effective to optimize two divisions by the same number.  We make
   this a param, and it shall be called N in the remainder of this
   comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, instead, we can insert
   divisions elsewhere, which will be in blocks that are common dominators
   of those that have the division).

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has fewer than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and the common dominators to them,
   and walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */
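
/* As an illustrative sketch of the transformation (assuming N == 2;
   all names here are invented for the example, not taken from any
   testcase), with bb1 dominating bb2:

	bb1:  x = a / d;		bb1:  t = 1.0 / d;
	      ...		=>	      x = a * t;
	bb2:  y = b / d;		      ...
					bb2:  y = b * t;

   Both divisions post-dominate bb1 and bb1 itself contains a
   division, so even with trapping math the reciprocal T can be
   inserted in bb1, and both divisions become multiplications.  */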

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "alloc-pool.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "gimple-pretty-print.h"
#include "alias.h"
#include "fold-const.h"
#include "gimple-fold.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "builtins.h"
#include "params.h"
#include "internal-fn.h"
#include "case-cfn-macros.h"
#include "optabs-libfuncs.h"
#include "tree-eh.h"
#include "targhooks.h"
#include "domwalk.h"

/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb;

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def;

  /* If non-NULL, the SSA_NAME holding the definition for a squared
     reciprocal inserted in BB.  */
  tree square_recip_def;

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple *recip_def_stmt;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children;

  /* Pointer to the next "struct occurrence" in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division;
};
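
/* For instance (an illustrative sketch), if divisions by D appear in
   blocks BB2 and BB3, whose nearest common dominator is BB1, the
   subset of the dominator tree built from these nodes is:

	occ (bb1)
	  children: occ (bb2) --next--> occ (bb3)

   where only the occurrences for BB2 and BB3 have bb_has_division
   set.  */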

static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;

  /* Number of divmod calls inserted.  */
  int divmod_calls_inserted;
} widen_mul_stats;

/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static object_allocator<occurrence> *occ_pool;



/* Allocate and return a new struct occurrence for basic block BB, whose
   children list is headed by CHILDREN.  */
static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *occ;

  bb->aux = occ = occ_pool->allocate ();
  memset (occ, 0, sizeof (struct occurrence));

  occ->bb = bb;
  occ->children = children;
  return occ;
}


/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
   list of "struct occurrence"s, one per basic block, having IDOM as
   their common dominator.

   We try to insert NEW_OCC as deep as possible in the tree, and we also
   insert any other block that is a common dominator for BB and one
   block already in the tree.  */

static void
insert_bb (struct occurrence *new_occ, basic_block idom,
	   struct occurrence **p_head)
{
  struct occurrence *occ, **p_occ;

  for (p_occ = p_head; (occ = *p_occ) != NULL; )
    {
      basic_block bb = new_occ->bb, occ_bb = occ->bb;
      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
      if (dom == bb)
	{
	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
	     from its list.  */
	  *p_occ = occ->next;
	  occ->next = new_occ->children;
	  new_occ->children = occ;

	  /* Try the next block (it may as well be dominated by BB).  */
	}

      else if (dom == occ_bb)
	{
	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
	  insert_bb (new_occ, dom, &occ->children);
	  return;
	}

      else if (dom != idom)
	{
	  gcc_assert (!dom->aux);

	  /* There is a dominator between IDOM and BB, add it and make
	     two children out of NEW_OCC and OCC.  First, remove OCC from
	     its list.  */
	  *p_occ = occ->next;
	  new_occ->next = occ;
	  occ->next = NULL;

	  /* None of the previous blocks has DOM as a dominator: if we tail
	     recursed, we would reexamine them uselessly.  Just switch BB with
	     DOM, and go on looking for blocks dominated by DOM.  */
	  new_occ = occ_new (dom, new_occ);
	}

      else
	{
	  /* Nothing special, go on with the next element.  */
	  p_occ = &occ->next;
	}
    }

  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
  new_occ->next = *p_head;
  *p_head = new_occ;
}

/* Register that we found a division in BB.
   IMPORTANCE is a measure of how much weighting to give
   that division.  Use IMPORTANCE = 2 to register a single
   division.  If the division is going to be found multiple
   times use 1 (as it is with squares).  */

static inline void
register_division_in (basic_block bb, int importance)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = occ_new (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions += importance;
}


/* Compute the number of divisions that post-dominate each block in OCC and
   its children.  */

static void
compute_merit (struct occurrence *occ)
{
  struct occurrence *occ_child;
  basic_block dom = occ->bb;

  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    {
      basic_block bb;
      if (occ_child->children)
	compute_merit (occ_child);

      if (flag_exceptions)
	bb = single_noncomplex_succ (dom);
      else
	bb = dom;

      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
	occ->num_divisions += occ_child->num_divisions;
    }
}


/* Return whether USE_STMT is a floating-point division by DEF.  */
static inline bool
is_division_by (gimple *use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as valid division, as we are getting
	    confused later by replacing all immediate uses x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def
	 && !stmt_can_throw_internal (cfun, use_stmt);
}

/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
static inline bool
is_mult_by (gimple *use_stmt, tree def, tree a)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
    {
      tree op0 = gimple_assign_rhs1 (use_stmt);
      tree op1 = gimple_assign_rhs2 (use_stmt);

      return (op0 == def && op1 == a)
	      || (op0 == a && op1 == def);
    }
  return false;
}

/* Return whether USE_STMT is DEF * DEF.  */
static inline bool
is_square_of (gimple *use_stmt, tree def)
{
  return is_mult_by (use_stmt, def, def);
}

/* Return whether USE_STMT is a floating-point division by
   DEF * DEF.  */
static inline bool
is_division_by_square (gimple *use_stmt, tree def)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
      && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
      && !stmt_can_throw_internal (cfun, use_stmt))
    {
      tree denominator = gimple_assign_rhs2 (use_stmt);
      if (TREE_CODE (denominator) == SSA_NAME)
	return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
    }
  return false;
}

/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_GSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.

   If SHOULD_INSERT_SQUARE_RECIP is set, then this also inserts
   the square of the reciprocal immediately after the definition
   of the reciprocal.  */

static void
insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
		    tree def, tree recip_def, tree square_recip_def,
		    int should_insert_square_recip, int threshold)
{
  tree type;
  gassign *new_stmt, *new_square_stmt;
  gimple_stmt_iterator gsi;
  struct occurrence *occ_child;

  if (!recip_def
      && (occ->bb_has_division || !flag_trapping_math)
      /* Divide by two as all divisions are counted twice in
	 the costing loop.  */
      && occ->num_divisions / 2 >= threshold)
    {
      /* Make a variable with the replacement and substitute it.  */
      type = TREE_TYPE (def);
      recip_def = create_tmp_reg (type, "reciptmp");
      new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
				      build_one_cst (type), def);

      if (should_insert_square_recip)
	{
	  square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
	  new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
						 recip_def, recip_def);
	}

      if (occ->bb_has_division)
	{
	  /* Case 1: insert before an existing division.  */
	  gsi = gsi_after_labels (occ->bb);
	  while (!gsi_end_p (gsi)
		 && (!is_division_by (gsi_stmt (gsi), def))
		 && (!is_division_by_square (gsi_stmt (gsi), def)))
	    gsi_next (&gsi);

	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}
      else if (def_gsi && occ->bb == def_gsi->bb)
	{
	  /* Case 2: insert right after the definition.  Note that this will
	     never happen if the definition statement can throw, because in
	     that case the sole successor of the statement's basic block will
	     dominate all the uses as well.  */
	  gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
	}
      else
	{
	  /* Case 3: insert in a basic block not containing defs/uses.  */
	  gsi = gsi_after_labels (occ->bb);
	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}

      reciprocal_stats.rdivs_inserted++;

      occ->recip_def_stmt = new_stmt;
    }

  occ->recip_def = recip_def;
  occ->square_recip_def = square_recip_def;
  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    insert_reciprocals (def_gsi, occ_child, def, recip_def,
			square_recip_def, should_insert_square_recip,
			threshold);
}

/* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
   Take as argument the use for (x * x).  */
static inline void
replace_reciprocal_squares (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
      && occ->recip_def)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
      SET_USE (use_p, occ->square_recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}


/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */

static inline void
replace_reciprocal (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb)
      && occ->recip_def && use_stmt != occ->recip_def_stmt)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      SET_USE (use_p, occ->recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}


/* Free OCC and return one more "struct occurrence" to be freed.  */

static struct occurrence *
free_bb (struct occurrence *occ)
{
  struct occurrence *child, *next;

  /* First get the two pointers hanging off OCC.  */
  next = occ->next;
  child = occ->children;
  occ->bb->aux = NULL;
  occ_pool->remove (occ);

  /* Now ensure that we don't recurse unless it is necessary.  */
  if (!child)
    return next;
  else
    {
      while (next)
	next = free_bb (next);

      return child;
    }
}

/* Transform sequences like
   t = sqrt (a);
   x = 1.0 / t;
   r1 = x * x;
   r2 = a * x;
   into:
   t = sqrt (a);
   r1 = 1.0 / a;
   r2 = t;
   x = r1 * r2;
   depending on the uses of x, r1, r2.  This removes one multiplication and
   allows the sqrt and division operations to execute in parallel.
   DEF_GSI is the gsi of the initial division by sqrt that defines
   DEF (x in the example above).  */

static void
optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
{
  gimple *use_stmt;
  imm_use_iterator use_iter;
  gimple *stmt = gsi_stmt (*def_gsi);
  tree x = def;
  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
  tree div_rhs1 = gimple_assign_rhs1 (stmt);

  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
      || TREE_CODE (div_rhs1) != REAL_CST
      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
    return;

  gcall *sqrt_stmt
    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));

  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
    return;

  switch (gimple_call_combined_fn (sqrt_stmt))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      break;

    default:
      return;
    }
  tree a = gimple_call_arg (sqrt_stmt, 0);

  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */

  /* Statements that use x in x * x.  */
  auto_vec<gimple *> sqr_stmts;
  /* Statements that use x in a * x.  */
  auto_vec<gimple *> mult_stmts;
  bool has_other_use = false;
  bool mult_on_main_path = false;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
    {
      if (is_gimple_debug (use_stmt))
	continue;
      if (is_square_of (use_stmt, x))
	{
	  sqr_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else if (is_mult_by (use_stmt, x, a))
	{
	  mult_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else
	has_other_use = true;
    }

  /* In the x * x and a * x cases we just rewire stmt operands or
     remove multiplications.  In the has_other_use case we introduce
     a multiplication so make sure we don't introduce a multiplication
     on a path where there was none.  */
  if (has_other_use && !mult_on_main_path)
    return;

  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
    return;

  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
     to be able to compose it from the sqr and mult cases.  */
  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
    return;

  if (dump_file)
    {
      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
      fprintf (dump_file, "\n");
    }

  bool delete_div = !has_other_use;
  tree sqr_ssa_name = NULL_TREE;
  if (!sqr_stmts.is_empty ())
    {
      /* r1 = x * x.  Transform the original
	 x = 1.0 / t
	 into
	 tmp1 = 1.0 / a
	 r1 = tmp1.  */

      sqr_ssa_name
	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");

      if (dump_file)
	{
	  fprintf (dump_file, "Replacing original division\n");
	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
	  fprintf (dump_file, "with new division\n");
	}
      stmt
	= gimple_build_assign (sqr_ssa_name, gimple_assign_rhs_code (stmt),
			       gimple_assign_rhs1 (stmt), a);
      gsi_insert_before (def_gsi, stmt, GSI_SAME_STMT);
      gsi_remove (def_gsi, true);
      *def_gsi = gsi_for_stmt (stmt);
      fold_stmt_inplace (def_gsi);
      update_stmt (stmt);

      if (dump_file)
	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);

      delete_div = false;
      gimple *sqr_stmt;
      unsigned int i;
      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
	  update_stmt (sqr_stmt);
	}
    }
  if (!mult_stmts.is_empty ())
    {
      /* r2 = a * x.  Transform this into:
	 r2 = t (The original sqrt (a)).  */
      unsigned int i;
      gimple *mult_stmt = NULL;
      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);

	  if (dump_file)
	    {
	      fprintf (dump_file, "Replacing squaring multiplication\n");
	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	      fprintf (dump_file, "with assignment\n");
	    }
	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
	  fold_stmt_inplace (&gsi2);
	  update_stmt (mult_stmt);
	  if (dump_file)
	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	}
    }

  if (has_other_use)
    {
      /* Using the two temporaries tmp1, tmp2 from above
	 the original x is now:
	 x = tmp1 * tmp2.  */
      gcc_assert (orig_sqrt_ssa_name);
      gcc_assert (sqr_ssa_name);

      gimple *new_stmt
	= gimple_build_assign (x, MULT_EXPR,
			       orig_sqrt_ssa_name, sqr_ssa_name);
      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
      update_stmt (stmt);
    }
  else if (delete_div)
    {
      /* Remove the original division.  */
      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
      gsi_remove (&gsi2, true);
      release_defs (stmt);
    }
  else
    release_ssa_name (x);
}

/* Look for floating-point divisions among DEF's uses, and try to
   replace them by multiplications with the reciprocal.  Add
   as many statements computing the reciprocal as needed.

   DEF must be a GIMPLE register of a floating-point type.  */

static void
execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
{
  use_operand_p use_p, square_use_p;
  imm_use_iterator use_iter, square_use_iter;
  tree square_def;
  struct occurrence *occ;
  int count = 0;
  int threshold;
  int square_recip_count = 0;
  int sqrt_recip_count = 0;

  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));

  /* If DEF is a square (x * x), count the number of divisions by x.
     If there are more divisions by x than by (DEF * DEF), prefer to optimize
     the reciprocal of x instead of DEF.  This improves cases like:
       def = x * x
       t0 = a / def
       t1 = b / def
       t2 = c / x
     Reciprocal optimization of x results in 1 division rather than 2 or 3.  */
  gimple *def_stmt = SSA_NAME_DEF_STMT (def);

  if (is_gimple_assign (def_stmt)
      && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
      && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
      && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
    {
      tree op0 = gimple_assign_rhs1 (def_stmt);

      FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_division_by (use_stmt, op0))
	    sqrt_recip_count++;
	}
    }

  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_division_by (use_stmt, def))
	{
	  register_division_in (gimple_bb (use_stmt), 2);
	  count++;
	}

      if (is_square_of (use_stmt, def))
	{
	  square_def = gimple_assign_lhs (use_stmt);
	  FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
	    {
	      gimple *square_use_stmt = USE_STMT (square_use_p);
	      if (is_division_by (square_use_stmt, square_def))
		{
		  /* This is executed twice for each division by a square.  */
		  register_division_in (gimple_bb (square_use_stmt), 1);
		  square_recip_count++;
		}
	    }
	}
    }

  /* Square reciprocals were counted twice above.  */
  square_recip_count /= 2;

  /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x).  */
  if (sqrt_recip_count > square_recip_count)
    goto out;

  /* Do the expensive part only if we can hope to optimize something.  */
  if (count + square_recip_count >= threshold && count >= 1)
    {
      gimple *use_stmt;
      for (occ = occ_head; occ; occ = occ->next)
	{
	  compute_merit (occ);
	  insert_reciprocals (def_gsi, occ, def, NULL, NULL,
			      square_recip_count, threshold);
	}

      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
	{
	  if (is_division_by (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		replace_reciprocal (use_p);
	    }
	  else if (square_recip_count > 0 && is_square_of (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		{
		  /* Find all uses of the square that are divisions and
		     replace them by multiplications with the inverse.  */
		  imm_use_iterator square_iterator;
		  gimple *powmult_use_stmt = USE_STMT (use_p);
		  tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);

		  FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
					 square_iterator, powmult_def_name)
		    FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
		      {
			gimple *powmult_use_stmt = USE_STMT (square_use_p);
			if (is_division_by (powmult_use_stmt, powmult_def_name))
			  replace_reciprocal_squares (square_use_p);
		      }
		}
	    }
	}
    }

out:
  for (occ = occ_head; occ; )
    occ = free_bb (occ);

  occ_head = NULL;
}

/* Return an internal function that implements the reciprocal of CALL,
   or IFN_LAST if there is no such function that the target supports.  */

internal_fn
internal_fn_reciprocal (gcall *call)
{
  internal_fn ifn;

  switch (gimple_call_combined_fn (call))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      ifn = IFN_RSQRT;
      break;

    default:
      return IFN_LAST;
    }

  tree_pair types = direct_internal_fn_types (ifn, call);
  if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
    return IFN_LAST;

  return ifn;
}

/* Go through all the floating-point SSA_NAMEs, and call
   execute_cse_reciprocals_1 on each of them.  */
namespace {

const pass_data pass_data_cse_reciprocals =
{
  GIMPLE_PASS, /* type */
  "recip", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_RECIP, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_cse_reciprocals : public gimple_opt_pass
{
public:
  pass_cse_reciprocals (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
  virtual unsigned int execute (function *);

}; // class pass_cse_reciprocals

unsigned int
pass_cse_reciprocals::execute (function *fun)
{
  basic_block bb;
  tree arg;

  occ_pool = new object_allocator<occurrence> ("dominators for recip");

  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_POST_DOMINATORS);

  if (flag_checking)
    FOR_EACH_BB_FN (bb, fun)
      gcc_assert (!bb->aux);

  for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
    if (FLOAT_TYPE_P (TREE_TYPE (arg))
	&& is_gimple_reg (arg))
      {
	tree name = ssa_default_def (fun, arg);
	if (name)
	  execute_cse_reciprocals_1 (NULL, name);
      }

  FOR_EACH_BB_FN (bb, fun)
    {
      tree def;

      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  def = PHI_RESULT (phi);
	  if (! virtual_operand_p (def)
	      && FLOAT_TYPE_P (TREE_TYPE (def)))
	    execute_cse_reciprocals_1 (NULL, def);
	}

      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_has_lhs (stmt)
	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
	      && FLOAT_TYPE_P (TREE_TYPE (def))
	      && TREE_CODE (def) == SSA_NAME)
	    {
	      execute_cse_reciprocals_1 (&gsi, def);
	      stmt = gsi_stmt (gsi);
	      if (flag_unsafe_math_optimizations
		  && is_gimple_assign (stmt)
		  && gimple_assign_lhs (stmt) == def
		  && !stmt_can_throw_internal (cfun, stmt)
		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
		optimize_recip_sqrt (&gsi, def);
	    }
	}

      if (optimize_bb_for_size_p (bb))
	continue;

      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
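      /* For example (an illustrative sketch, assuming the target has
	 a reciprocal square root):

	     t = sqrt (b);			t = rsqrt (b);
	     x = a / t;		 becomes	x = a * t;

	 This is only done when every use of T is such a division;
	 the checks below enforce that.  */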
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (is_gimple_assign (stmt)
	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
	    {
	      tree arg1 = gimple_assign_rhs2 (stmt);
	      gimple *stmt1;

	      if (TREE_CODE (arg1) != SSA_NAME)
		continue;

	      stmt1 = SSA_NAME_DEF_STMT (arg1);

	      if (is_gimple_call (stmt1)
		  && gimple_call_lhs (stmt1))
		{
		  bool fail;
		  imm_use_iterator ui;
		  use_operand_p use_p;
		  tree fndecl = NULL_TREE;

		  gcall *call = as_a <gcall *> (stmt1);
		  internal_fn ifn = internal_fn_reciprocal (call);
		  if (ifn == IFN_LAST)
		    {
		      fndecl = gimple_call_fndecl (call);
		      if (!fndecl
			  || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
			continue;
		      fndecl = targetm.builtin_reciprocal (fndecl);
		      if (!fndecl)
			continue;
		    }

		  /* Check that all uses of the SSA name are divisions,
		     otherwise replacing the defining statement will do
		     the wrong thing.  */
		  fail = false;
		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
		    {
		      gimple *stmt2 = USE_STMT (use_p);
		      if (is_gimple_debug (stmt2))
			continue;
		      if (!is_gimple_assign (stmt2)
			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
			  || gimple_assign_rhs1 (stmt2) == arg1
			  || gimple_assign_rhs2 (stmt2) != arg1)
			{
			  fail = true;
			  break;
			}
		    }
		  if (fail)
		    continue;

		  gimple_replace_ssa_lhs (call, arg1);
		  if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
		    {
		      auto_vec<tree, 4> args;
		      for (unsigned int i = 0;
			   i < gimple_call_num_args (call); i++)
			args.safe_push (gimple_call_arg (call, i));
		      gcall *stmt2;
		      if (ifn == IFN_LAST)
			stmt2 = gimple_build_call_vec (fndecl, args);
		      else
			stmt2 = gimple_build_call_internal_vec (ifn, args);
		      gimple_call_set_lhs (stmt2, arg1);
		      if (gimple_vdef (call))
			{
			  gimple_set_vdef (stmt2, gimple_vdef (call));
			  SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
			}
		      gimple_call_set_nothrow (stmt2,
					       gimple_call_nothrow_p (call));
		      gimple_set_vuse (stmt2, gimple_vuse (call));
		      gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
		      gsi_replace (&gsi2, stmt2, true);
		    }
		  else
		    {
		      if (ifn == IFN_LAST)
			gimple_call_set_fndecl (call, fndecl);
		      else
			gimple_call_set_internal_fn (call, ifn);
		      update_stmt (call);
		    }
		  reciprocal_stats.rfuncs_inserted++;

		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
		    {
		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
		      fold_stmt_inplace (&gsi);
		      update_stmt (stmt);
		    }
		}
	    }
	}
    }

  statistics_counter_event (fun, "reciprocal divs inserted",
			    reciprocal_stats.rdivs_inserted);
  statistics_counter_event (fun, "reciprocal functions inserted",
			    reciprocal_stats.rfuncs_inserted);

  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);
  delete occ_pool;
  return 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_cse_reciprocals (gcc::context *ctxt)
{
  return new pass_cse_reciprocals (ctxt);
}

/* Records an occurrence at statement USE_STMT in the vector STMTS if
   it is dominated by *TOP_BB or dominates it, or if *TOP_BB has not
   been set yet.  Returns true if the occurrence was pushed on
   the vector.  Adjusts *TOP_BB to be the basic block dominating all
   statements in the vector.  */

static bool
maybe_record_sincos (vec<gimple *> *stmts,
		     basic_block *top_bb, gimple *use_stmt)
{
  basic_block use_bb = gimple_bb (use_stmt);
  if (*top_bb
      && (*top_bb == use_bb
	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
    stmts->safe_push (use_stmt);
  else if (!*top_bb
	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
    {
      stmts->safe_push (use_stmt);
      *top_bb = use_bb;
    }
  else
    return false;

  return true;
}

/* Look for sin, cos and cexpi calls with the same argument NAME and
   create a single call to cexpi CSEing the result in this case.
   We first walk over all immediate uses of the argument collecting
   statements that we can CSE in a vector and in a second pass replace
   the statement rhs with a REALPART or IMAGPART expression on the
   result of the cexpi call we insert before the use statement that
   dominates all other candidates.  */
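
/* For example (an illustrative sketch):

       c = cos (a);			t = cexpi (a);
       s = sin (a);	 becomes	c = REALPART_EXPR <t>;
					s = IMAGPART_EXPR <t>;  */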

static bool
execute_cse_sincos_1 (tree name)
{
  gimple_stmt_iterator gsi;
  imm_use_iterator use_iter;
  tree fndecl, res, type;
  gimple *def_stmt, *use_stmt, *stmt;
  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
  auto_vec<gimple *> stmts;
  basic_block top_bb = NULL;
  int i;
  bool cfg_changed = false;

  type = TREE_TYPE (name);
  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
    {
      if (gimple_code (use_stmt) != GIMPLE_CALL
	  || !gimple_call_lhs (use_stmt))
	continue;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_SIN:
	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_CEXPI:
	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	default:;
	}
    }

  if (seen_cos + seen_sin + seen_cexpi <= 1)
    return false;

  /* Simply insert cexpi at the beginning of top_bb but not earlier than
     the name def statement.  */
  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
  if (!fndecl)
    return false;
  stmt = gimple_build_call (fndecl, 1, name);
  res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
  gimple_call_set_lhs (stmt, res);

  def_stmt = SSA_NAME_DEF_STMT (name);
  if (!SSA_NAME_IS_DEFAULT_DEF (name)
      && gimple_code (def_stmt) != GIMPLE_PHI
      && gimple_bb (def_stmt) == top_bb)
    {
      gsi = gsi_for_stmt (def_stmt);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
    }
  else
    {
      gsi = gsi_after_labels (top_bb);
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
    }
  sincos_stats.inserted++;

  /* And adjust the recorded old call sites.  */
  for (i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree rhs = NULL;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  rhs = fold_build1 (REALPART_EXPR, type, res);
	  break;

	CASE_CFN_SIN:
	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
	  break;

	CASE_CFN_CEXPI:
	  rhs = res;
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Replace call with a copy.  */
      stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);

      gsi = gsi_for_stmt (use_stmt);
      gsi_replace (&gsi, stmt, true);
      if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
	cfg_changed = true;
    }

  return cfg_changed;
}

/* To evaluate powi(x,n), the floating point value x raised to the
   constant integer exponent n, we use a hybrid algorithm that
   combines the "window method" with look-up tables.  For an
   introduction to exponentiation algorithms and "addition chains",
   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */

/* Provide a default value for POWI_MAX_MULTS, the maximum number of
   multiplications to inline before calling the system library's pow
   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
   so this default never requires calling pow, powf or powl.  */

#ifndef POWI_MAX_MULTS
#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
#endif

/* The size of the "optimal power tree" lookup table.  All
   exponents less than this value are simply looked up in the
   powi_table below.  This threshold is also used to size the
   cache of pseudo registers that hold intermediate results.  */
#define POWI_TABLE_SIZE 256

/* The size, in bits, of the window used in the "window method"
   exponentiation algorithm.  This is equivalent to a radix of
   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
#define POWI_WINDOW_SIZE 3

/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".  */

static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
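
/* As a worked example (derived by hand from the table above, so an
   illustration rather than generated compiler output), powi (x, 15)
   uses powi_table[15] == 9 and decomposes as pow (x, 9) * pow (x, 6):

       t2 = x * x;	  (x**2)
       t3 = x * t2;	  (x**3)
       t6 = t3 * t3;	  (x**6)
       t9 = t3 * t6;	  (x**9)
       t15 = t6 * t9;	  (x**15)

   i.e. five multiplications, matching powi_cost (15) == 5, whereas
   the plain binary method would need six.  */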


/* Return the number of multiplications required to calculate
   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
   subroutine of powi_cost.  CACHE is an array indicating
   which exponents have already been calculated.  */

static int
powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
{
  /* If we've already calculated this exponent, then this evaluation
     doesn't require any additional multiplications.  */
  if (cache[n])
    return 0;

  cache[n] = true;
  return powi_lookup_cost (n - powi_table[n], cache)
	 + powi_lookup_cost (powi_table[n], cache) + 1;
}

/* Return the number of multiplications required to calculate
   powi(x,n) for an arbitrary x, given the exponent N.  This
   function needs to be kept in sync with powi_as_mults below.  */

static int
powi_cost (HOST_WIDE_INT n)
{
  bool cache[POWI_TABLE_SIZE];
  unsigned HOST_WIDE_INT digit;
  unsigned HOST_WIDE_INT val;
  int result;

  if (n == 0)
    return 0;

  /* Ignore the reciprocal when calculating the cost.  */
  val = (n < 0) ? -n : n;

  /* Initialize the exponent cache.  */
  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
  cache[1] = true;

  result = 0;

  while (val >= POWI_TABLE_SIZE)
    {
      if (val & 1)
	{
	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
	  result += powi_lookup_cost (digit, cache)
		    + POWI_WINDOW_SIZE + 1;
	  val >>= POWI_WINDOW_SIZE;
	}
      else
	{
	  val >>= 1;
	  result++;
	}
    }

  return result + powi_lookup_cost (val, cache);
}

/* Recursive subroutine of powi_as_mults.  This function takes the
   array, CACHE, of already calculated exponents and an exponent N and
   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */

static tree
powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
		 HOST_WIDE_INT n, tree *cache)
{
  tree op0, op1, ssa_target;
  unsigned HOST_WIDE_INT digit;
  gassign *mult_stmt;

  if (n < POWI_TABLE_SIZE && cache[n])
    return cache[n];

  ssa_target = make_temp_ssa_name (type, NULL, "powmult");

  if (n < POWI_TABLE_SIZE)
    {
      cache[n] = ssa_target;
      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
    }
  else if (n & 1)
    {
      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
    }
  else
    {
      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
      op1 = op0;
    }

  mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
  gimple_set_location (mult_stmt, loc);
  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
   This function needs to be kept in sync with powi_cost above.  */

static tree
powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
	       tree arg0, HOST_WIDE_INT n)
{
  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
  gassign *div_stmt;
  tree target;

  if (n == 0)
    return build_real (type, dconst1);

  memset (cache, 0, sizeof (cache));
  cache[1] = arg0;

  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
  if (n >= 0)
    return result;

  /* If the original exponent was negative, reciprocate the result.  */
  target = make_temp_ssa_name (type, NULL, "powmult");
  div_stmt = gimple_build_assign (target, RDIV_EXPR,
				  build_real (type, dconst1), result);
  gimple_set_location (div_stmt, loc);
  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);

  return target;
}
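
/* E.g. for a negative exponent (an illustrative sketch; the "powmult"
   temporaries are the names created above), powi (x, -2) expands to:

       powmult_1 = x * x;
       powmult_2 = 1.0 / powmult_1;

   i.e. the positive power is built first and then reciprocated.  */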

/* ARG0 and N are the two arguments to a powi builtin in GSI with
   location info LOC.  If the arguments are appropriate, create an
   equivalent sequence of statements prior to GSI using an optimal
   number of multiplications, and return an expression holding the
   result.  */

static tree
gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
			    tree arg0, HOST_WIDE_INT n)
{
  /* Avoid largest negative number.  */
  if (n != -n
      && ((n >= -1 && n <= 2)
	  || (optimize_function_for_speed_p (cfun)
	      && powi_cost (n) <= POWI_MAX_MULTS)))
    return powi_as_mults (gsi, loc, arg0, n);

  return NULL_TREE;
}

/* Build a gimple call statement that calls FN with argument ARG.
   Set the lhs of the call statement to a fresh SSA name.  Insert the
   statement prior to GSI's current position, and return the fresh
   SSA name.  */

static tree
build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
		       tree fn, tree arg)
{
  gcall *call_stmt;
  tree ssa_target;

  call_stmt = gimple_build_call (fn, 1, arg);
  ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
  gimple_set_lhs (call_stmt, ssa_target);
  gimple_set_location (call_stmt, loc);
  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Build a gimple binary operation with the given CODE and arguments
   ARG0, ARG1, assigning the result to a new SSA name built from NAME.
   Insert the statement prior to GSI's current position, and return
   the fresh SSA name.  */

static tree
build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
			const char *name, enum tree_code code,
			tree arg0, tree arg1)
{
  tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
  gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple reference operation with the given CODE and argument
   ARG, assigning the result to a new SSA name of TYPE with NAME.
   Insert the statement prior to GSI's current position, and return
   the fresh SSA name.  */

static inline tree
build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
		      const char *name, enum tree_code code, tree arg0)
{
  tree result = make_temp_ssa_name (type, NULL, name);
  gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
   prior to GSI's current position, and return the fresh SSA name.  */

static tree
build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
		       tree type, tree val)
{
  tree result = make_ssa_name (type);
  gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

struct pow_synth_sqrt_info
{
  bool *factors;
  unsigned int deepest;
  unsigned int num_mults;
};

/* Return true iff the real value C can be represented as a
   sum of powers of 0.5 up to N.  That is:
   C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
   Record in INFO the various parameters of the synthesis algorithm such
   as the factors a[i], the maximum 0.5 power and the number of
   multiplications that will be required.  */

bool
representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
				struct pow_synth_sqrt_info *info)
{
  REAL_VALUE_TYPE factor = dconsthalf;
  REAL_VALUE_TYPE remainder = c;

  info->deepest = 0;
  info->num_mults = 0;
  memset (info->factors, 0, n * sizeof (bool));

  for (unsigned i = 0; i < n; i++)
    {
      REAL_VALUE_TYPE res;

      /* If something inexact happened bail out now.  */
      if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
	return false;

      /* We have hit zero.  The number is representable as a sum
	 of powers of 0.5.  */
      if (real_equal (&res, &dconst0))
	{
	  info->factors[i] = true;
	  info->deepest = i + 1;
	  return true;
	}
      else if (!REAL_VALUE_NEGATIVE (res))
	{
	  remainder = res;
	  info->factors[i] = true;
	  info->num_mults++;
	}
      else
	info->factors[i] = false;

      real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
    }
  return false;
}
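
/* For example (worked by hand, illustrative only), C == 0.625 with
   N >= 3 is representable: 0.625 == 0.5 + 0.125, so the factors are
   {1, 0, 1}, DEEPEST is 3 and NUM_MULTS is 1, corresponding to
   sqrt (x) * sqrt (sqrt (sqrt (x))).  */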
1579 
1580 /* Return the tree corresponding to FN being applied
1581    to ARG N times at GSI and LOC.
1582    Look up previous results from CACHE if need be.
1583    cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */
1584 
1585 static tree
get_fn_chain(tree arg,unsigned int n,gimple_stmt_iterator * gsi,tree fn,location_t loc,tree * cache)1586 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1587 	      tree fn, location_t loc, tree *cache)
1588 {
1589   tree res = cache[n];
1590   if (!res)
1591     {
1592       tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1593       res = build_and_insert_call (gsi, loc, fn, prev);
1594       cache[n] = res;
1595     }
1596 
1597   return res;
1598 }
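
/* For example, get_fn_chain (x, 3, ...) with an empty cache emits
   sqrt (x), sqrt (sqrt (x)) and sqrt (sqrt (sqrt (x))) through the
   recursion, memoizing each intermediate SSA name in CACHE; a later
   request for N == 2 is then answered from CACHE without emitting
   any further calls.  */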
1599 
1600 /* Print to STREAM the repeated application of function FNAME to ARG
1601    N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1602    "foo (foo (x))".  */
1603 
1604 static void
1605 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1606 		 unsigned int n)
1607 {
1608   if (n == 0)
1609     fprintf (stream, "%s", arg);
1610   else
1611     {
1612       fprintf (stream, "%s (", fname);
1613       print_nested_fn (stream, fname, arg, n - 1);
1614       fprintf (stream, ")");
1615     }
1616 }
1617 
1618 /* Print to STREAM the fractional sequence of sqrt chains
1619    applied to ARG, described by INFO.  Used for the dump file.  */
1620 
1621 static void
1622 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1623 			        struct pow_synth_sqrt_info *info)
1624 {
1625   for (unsigned int i = 0; i < info->deepest; i++)
1626     {
1627       bool is_set = info->factors[i];
1628       if (is_set)
1629 	{
1630 	  print_nested_fn (stream, "sqrt", arg, i + 1);
1631 	  if (i != info->deepest - 1)
1632 	    fprintf (stream, " * ");
1633 	}
1634     }
1635 }
1636 
1637 /* Print to STREAM a representation of raising ARG to an integer
1638    power N.  Used for the dump file.  */
1639 
1640 static void
1641 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1642 {
1643   if (n > 1)
1644     fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1645   else if (n == 1)
1646     fprintf (stream, "%s", arg);
1647 }
1648 
1649 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1650    square roots.  Place at GSI and LOC.  Limit the maximum depth
1651    of the sqrt chains to MAX_DEPTH.  Return the tree holding the
1652    result of the expanded sequence or NULL_TREE if the expansion failed.
1653 
1654    This routine assumes that ARG1 is a real number with a fractional part
1655    (the integer exponent case will have been handled earlier in
1656    gimple_expand_builtin_pow).
1657 
1658    For ARG1 > 0.0:
1659    * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1660      FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1661                     FRAC_PART == ARG1 - WHOLE_PART:
1662      Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1663      POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1664      if it can be expressed as such, that is if FRAC_PART satisfies:
1665      FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1666      where integer a[i] is either 0 or 1.
1667 
1668      Example:
1669      POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1670        --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1671 
1672    For ARG1 < 0.0 there are two approaches:
1673    * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1674          is calculated as above.
1675 
1676      Example:
1677      POW (x, -5.625) == 1.0 / POW (x, 5.625)
1678        -->  1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1679 
1680    * (B) : WHOLE_PART := - ceil (abs (ARG1))
1681            FRAC_PART  := ARG1 - WHOLE_PART
1682      and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1683      Example:
1684      POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1685        --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1686 
1687    For ARG1 < 0.0 we choose between (A) and (B) depending on
1688    how many multiplications we'd have to do.
1689    So, for the example in (B): POW (x, -5.875), if we were to
1690    follow algorithm (A) we would produce:
1691    1.0 / (POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X))))
1692    which contains more multiplications than approach (B).
1693 
1694    Hopefully, this approach will eliminate potentially expensive POW library
1695    calls when unsafe floating point math is enabled and allow the compiler to
1696    further optimise the multiplies, square roots and divides produced by this
1697    function.  */
1698 
1699 static tree
1700 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1701 		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1702 {
1703   tree type = TREE_TYPE (arg0);
1704   machine_mode mode = TYPE_MODE (type);
1705   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1706   bool one_over = true;
1707 
1708   if (!sqrtfn)
1709     return NULL_TREE;
1710 
1711   if (TREE_CODE (arg1) != REAL_CST)
1712     return NULL_TREE;
1713 
1714   REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1715 
1716   gcc_assert (max_depth > 0);
1717   tree *cache = XALLOCAVEC (tree, max_depth + 1);
1718 
1719   struct pow_synth_sqrt_info synth_info;
1720   synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1721   synth_info.deepest = 0;
1722   synth_info.num_mults = 0;
1723 
1724   bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1725   REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1726 
1727   /* The whole and fractional parts of exp.  */
1728   REAL_VALUE_TYPE whole_part;
1729   REAL_VALUE_TYPE frac_part;
1730 
1731   real_floor (&whole_part, mode, &exp);
1732   real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1733 
1734 
1735   REAL_VALUE_TYPE ceil_whole = dconst0;
1736   REAL_VALUE_TYPE ceil_fract = dconst0;
1737 
1738   if (neg_exp)
1739     {
1740       real_ceil (&ceil_whole, mode, &exp);
1741       real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1742     }
1743 
1744   if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1745     return NULL_TREE;
1746 
1747   /* Check whether it's more profitable to not use 1.0 / ...  */
1748   if (neg_exp)
1749     {
1750       struct pow_synth_sqrt_info alt_synth_info;
1751       alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1752       alt_synth_info.deepest = 0;
1753       alt_synth_info.num_mults = 0;
1754 
1755       if (representable_as_half_series_p (ceil_fract, max_depth,
1756 					   &alt_synth_info)
1757 	  && alt_synth_info.deepest <= synth_info.deepest
1758 	  && alt_synth_info.num_mults < synth_info.num_mults)
1759 	{
1760 	  whole_part = ceil_whole;
1761 	  frac_part = ceil_fract;
1762 	  synth_info.deepest = alt_synth_info.deepest;
1763 	  synth_info.num_mults = alt_synth_info.num_mults;
1764 	  memcpy (synth_info.factors, alt_synth_info.factors,
1765 		  (max_depth + 1) * sizeof (bool));
1766 	  one_over = false;
1767 	}
1768     }
1769 
1770   HOST_WIDE_INT n = real_to_integer (&whole_part);
1771   REAL_VALUE_TYPE cint;
1772   real_from_integer (&cint, VOIDmode, n, SIGNED);
1773 
1774   if (!real_identical (&whole_part, &cint))
1775     return NULL_TREE;
1776 
1777   if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1778     return NULL_TREE;
1779 
1780   memset (cache, 0, (max_depth + 1) * sizeof (tree));
1781 
1782   tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1783 
1784   /* Calculate the integer part of the exponent.  */
1785   if (n > 1)
1786     {
1787       integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1788       if (!integer_res)
1789 	return NULL_TREE;
1790     }
1791 
1792   if (dump_file)
1793     {
1794       char string[64];
1795 
1796       real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1797       fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1798 
1799       if (neg_exp)
1800 	{
1801 	  if (one_over)
1802 	    {
1803 	      fprintf (dump_file, "1.0 / (");
1804 	      dump_integer_part (dump_file, "x", n);
1805 	      if (n > 0)
1806 	        fprintf (dump_file, " * ");
1807 	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1808 	      fprintf (dump_file, ")");
1809 	    }
1810 	  else
1811 	    {
1812 	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1813 	      fprintf (dump_file, " / (");
1814 	      dump_integer_part (dump_file, "x", n);
1815 	      fprintf (dump_file, ")");
1816 	    }
1817 	}
1818       else
1819 	{
1820 	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1821 	  if (n > 0)
1822 	    fprintf (dump_file, " * ");
1823 	  dump_integer_part (dump_file, "x", n);
1824 	}
1825 
1826       fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1827     }
1828 
1829 
1830   tree fract_res = NULL_TREE;
1831   cache[0] = arg0;
1832 
1833   /* Calculate the fractional part of the exponent.  */
1834   for (unsigned i = 0; i < synth_info.deepest; i++)
1835     {
1836       if (synth_info.factors[i])
1837 	{
1838 	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1839 
1840 	  if (!fract_res)
1841 	    fract_res = sqrt_chain;
1842 	  else
1843 	    fract_res = build_and_insert_binop (gsi, loc, "powroot",
1844 						MULT_EXPR, fract_res,
1845 						sqrt_chain);
1846 	}
1847     }
1848 
1849   tree res = NULL_TREE;
1850 
1851   if (neg_exp)
1852     {
1853       if (one_over)
1854 	{
1855 	  if (n > 0)
1856 	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1857 					   fract_res, integer_res);
1858 	  else
1859 	    res = fract_res;
1860 
1861 	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1862 					  build_real (type, dconst1), res);
1863 	}
1864       else
1865 	{
1866 	  res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1867 					 fract_res, integer_res);
1868 	}
1869     }
1870   else
1871     res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1872 				   fract_res, integer_res);
1873   return res;
1874 }
1875 
1876 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1877    with location info LOC.  If possible, create an equivalent and
1878    less expensive sequence of statements prior to GSI, and return an
1879    expression holding the result.  */
1880 
1881 static tree
1882 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1883 			   tree arg0, tree arg1)
1884 {
1885   REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1886   REAL_VALUE_TYPE c2, dconst3;
1887   HOST_WIDE_INT n;
1888   tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1889   machine_mode mode;
1890   bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1891   bool hw_sqrt_exists, c_is_int, c2_is_int;
1892 
1893   dconst1_4 = dconst1;
1894   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1895 
1896   /* If the exponent isn't a constant, there's nothing of interest
1897      to be done.  */
1898   if (TREE_CODE (arg1) != REAL_CST)
1899     return NULL_TREE;
1900 
1901   /* Don't perform the operation if flag_signaling_nans is on
1902      and the operand is a signaling NaN.  */
1903   if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1904       && ((TREE_CODE (arg0) == REAL_CST
1905 	   && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1906 	  || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1907     return NULL_TREE;
1908 
1909   /* If the exponent is equivalent to an integer, expand to an optimal
1910      multiplication sequence when profitable.  */
1911   c = TREE_REAL_CST (arg1);
1912   n = real_to_integer (&c);
1913   real_from_integer (&cint, VOIDmode, n, SIGNED);
1914   c_is_int = real_identical (&c, &cint);
1915 
1916   if (c_is_int
1917       && ((n >= -1 && n <= 2)
1918 	  || (flag_unsafe_math_optimizations
1919 	      && speed_p
1920 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1921     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1922 
1923   /* Attempt various optimizations using sqrt and cbrt.  */
1924   type = TREE_TYPE (arg0);
1925   mode = TYPE_MODE (type);
1926   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1927 
1928   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1929      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1930      sqrt(-0) = -0.  */
1931   if (sqrtfn
1932       && real_equal (&c, &dconsthalf)
1933       && !HONOR_SIGNED_ZEROS (mode))
1934     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1935 
1936   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1937 
1938   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1939      optimizations since 1./3. is not exactly representable.  If x
1940      is negative and finite, the correct value of pow(x,1./3.) is
1941      a NaN with the "invalid" exception raised, because the value
1942      of 1./3. actually has an even denominator.  The correct value
1943      of cbrt(x) is a negative real value.  */
1944   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1945   dconst1_3 = real_value_truncate (mode, dconst_third ());
1946 
1947   if (flag_unsafe_math_optimizations
1948       && cbrtfn
1949       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1950       && real_equal (&c, &dconst1_3))
1951     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1952 
1953   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1954      if we don't have a hardware sqrt insn.  */
1955   dconst1_6 = dconst1_3;
1956   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1957 
1958   if (flag_unsafe_math_optimizations
1959       && sqrtfn
1960       && cbrtfn
1961       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1962       && speed_p
1963       && hw_sqrt_exists
1964       && real_equal (&c, &dconst1_6))
1965     {
1966       /* sqrt(x)  */
1967       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1968 
1969       /* cbrt(sqrt(x))  */
1970       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1971     }
1972 
1973 
1974   /* Attempt to expand the POW as a product of square root chains.
1975      Expand the 0.25 case even when optimising for size.  */
1976   if (flag_unsafe_math_optimizations
1977       && sqrtfn
1978       && hw_sqrt_exists
1979       && (speed_p || real_equal (&c, &dconst1_4))
1980       && !HONOR_SIGNED_ZEROS (mode))
1981     {
1982       unsigned int max_depth = speed_p
1983 				? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1984 				: 2;
1985 
1986       tree expand_with_sqrts
1987 	= expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1988 
1989       if (expand_with_sqrts)
1990 	return expand_with_sqrts;
1991     }
1992 
1993   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1994   n = real_to_integer (&c2);
1995   real_from_integer (&cint, VOIDmode, n, SIGNED);
1996   c2_is_int = real_identical (&c2, &cint);
1997 
1998   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1999 
2000      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
2001      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
2002 
2003      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
2004      different from pow(x, 1./3.) due to rounding and behavior with
2005      negative x, we need to constrain this transformation to unsafe
2006      math and positive x or finite math.  */
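  /* For example, c = 5./3. gives n = 5, producing
     powi(x, 1) * cbrt(x) * cbrt(x); c = -1./3. gives n = -1,
     producing 1.0 / cbrt(x).  */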
2007   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
2008   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
2009   real_round (&c2, mode, &c2);
2010   n = real_to_integer (&c2);
2011   real_from_integer (&cint, VOIDmode, n, SIGNED);
2012   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
2013   real_convert (&c2, mode, &c2);
2014 
2015   if (flag_unsafe_math_optimizations
2016       && cbrtfn
2017       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
2018       && real_identical (&c2, &c)
2019       && !c2_is_int
2020       && optimize_function_for_speed_p (cfun)
2021       && powi_cost (n / 3) <= POWI_MAX_MULTS)
2022     {
2023       tree powi_x_ndiv3 = NULL_TREE;
2024 
2025       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
2026          possible or profitable, give up.  Skip the degenerate case when
2027          abs(n) < 3, where the result is always 1.  */
2028       if (absu_hwi (n) >= 3)
2029 	{
2030 	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
2031 						     abs_hwi (n / 3));
2032 	  if (!powi_x_ndiv3)
2033 	    return NULL_TREE;
2034 	}
2035 
2036       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
2037          as that creates an unnecessary variable.  Instead, just produce
2038          either cbrt(x) or cbrt(x) * cbrt(x).  */
2039       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
2040 
2041       if (absu_hwi (n) % 3 == 1)
2042 	powi_cbrt_x = cbrt_x;
2043       else
2044 	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2045 					      cbrt_x, cbrt_x);
2046 
2047       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
2048       if (absu_hwi (n) < 3)
2049 	result = powi_cbrt_x;
2050       else
2051 	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2052 					 powi_x_ndiv3, powi_cbrt_x);
2053 
2054       /* If n is negative, reciprocate the result.  */
2055       if (n < 0)
2056 	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
2057 					 build_real (type, dconst1), result);
2058 
2059       return result;
2060     }
2061 
2062   /* No optimizations succeeded.  */
2063   return NULL_TREE;
2064 }
2065 
2066 /* ARG is the argument to a cabs builtin call in GSI with location info
2067    LOC.  Create a sequence of statements prior to GSI that calculates
2068    sqrt(R*R + I*I), where R and I are the real and imaginary components
2069    of ARG, respectively.  Return an expression holding the result.  */
2070 
2071 static tree
2072 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
2073 {
2074   tree real_part, imag_part, addend1, addend2, sum, result;
2075   tree type = TREE_TYPE (TREE_TYPE (arg));
2076   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
2077   machine_mode mode = TYPE_MODE (type);
2078 
2079   if (!flag_unsafe_math_optimizations
2080       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
2081       || !sqrtfn
2082       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
2083     return NULL_TREE;
2084 
2085   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
2086 				    REALPART_EXPR, arg);
2087   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2088 				    real_part, real_part);
2089   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
2090 				    IMAGPART_EXPR, arg);
2091   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2092 				    imag_part, imag_part);
2093   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
2094   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
2095 
2096   return result;
2097 }
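
/* For example, with unsafe math optimizations enabled, a call

     double d = cabs (z);

   is expanded into (illustrative GIMPLE)

     re = REALPART_EXPR <z>;
     t1 = re * re;
     im = IMAGPART_EXPR <z>;
     t2 = im * im;
     sum = t1 + t2;
     d = sqrt (sum);

   trading the library call for two multiplies, an add and a hardware
   square root.  R*R + I*I can overflow where a careful cabs
   implementation would not, which is why this is gated on
   flag_unsafe_math_optimizations.  */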
2098 
2099 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
2100    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
2101    an optimal number of multiplies, when n is a constant.  */
2102 
2103 namespace {
2104 
2105 const pass_data pass_data_cse_sincos =
2106 {
2107   GIMPLE_PASS, /* type */
2108   "sincos", /* name */
2109   OPTGROUP_NONE, /* optinfo_flags */
2110   TV_TREE_SINCOS, /* tv_id */
2111   PROP_ssa, /* properties_required */
2112   PROP_gimple_opt_math, /* properties_provided */
2113   0, /* properties_destroyed */
2114   0, /* todo_flags_start */
2115   TODO_update_ssa, /* todo_flags_finish */
2116 };
2117 
2118 class pass_cse_sincos : public gimple_opt_pass
2119 {
2120 public:
2121   pass_cse_sincos (gcc::context *ctxt)
2122     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
2123   {}
2124 
2125   /* opt_pass methods: */
2126   virtual bool gate (function *)
2127     {
2128       /* We no longer require either sincos or cexp, since powi expansion
2129 	 piggybacks on this pass.  */
2130       return optimize;
2131     }
2132 
2133   virtual unsigned int execute (function *);
2134 
2135 }; // class pass_cse_sincos
2136 
2137 unsigned int
2138 pass_cse_sincos::execute (function *fun)
2139 {
2140   basic_block bb;
2141   bool cfg_changed = false;
2142 
2143   calculate_dominance_info (CDI_DOMINATORS);
2144   memset (&sincos_stats, 0, sizeof (sincos_stats));
2145 
2146   FOR_EACH_BB_FN (bb, fun)
2147     {
2148       gimple_stmt_iterator gsi;
2149       bool cleanup_eh = false;
2150 
2151       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2152         {
2153 	  gimple *stmt = gsi_stmt (gsi);
2154 
2155 	  /* Only the last stmt in a bb can throw; there is no need to call
2156 	     gimple_purge_dead_eh_edges if we change something in the middle
2157 	     of a basic block.  */
2158 	  cleanup_eh = false;
2159 
2160 	  if (is_gimple_call (stmt)
2161 	      && gimple_call_lhs (stmt))
2162 	    {
2163 	      tree arg, arg0, arg1, result;
2164 	      HOST_WIDE_INT n;
2165 	      location_t loc;
2166 
2167 	      switch (gimple_call_combined_fn (stmt))
2168 		{
2169 		CASE_CFN_COS:
2170 		CASE_CFN_SIN:
2171 		CASE_CFN_CEXPI:
2172 		  /* Make sure we have either sincos or cexp.  */
2173 		  if (!targetm.libc_has_function (function_c99_math_complex)
2174 		      && !targetm.libc_has_function (function_sincos))
2175 		    break;
2176 
2177 		  arg = gimple_call_arg (stmt, 0);
2178 		  if (TREE_CODE (arg) == SSA_NAME)
2179 		    cfg_changed |= execute_cse_sincos_1 (arg);
2180 		  break;
2181 
2182 		CASE_CFN_POW:
2183 		  arg0 = gimple_call_arg (stmt, 0);
2184 		  arg1 = gimple_call_arg (stmt, 1);
2185 
2186 		  loc = gimple_location (stmt);
2187 		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
2188 
2189 		  if (result)
2190 		    {
2191 		      tree lhs = gimple_get_lhs (stmt);
2192 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2193 		      gimple_set_location (new_stmt, loc);
2194 		      unlink_stmt_vdef (stmt);
2195 		      gsi_replace (&gsi, new_stmt, true);
2196 		      cleanup_eh = true;
2197 		      if (gimple_vdef (stmt))
2198 			release_ssa_name (gimple_vdef (stmt));
2199 		    }
2200 		  break;
2201 
2202 		CASE_CFN_POWI:
2203 		  arg0 = gimple_call_arg (stmt, 0);
2204 		  arg1 = gimple_call_arg (stmt, 1);
2205 		  loc = gimple_location (stmt);
2206 
2207 		  if (real_minus_onep (arg0))
2208 		    {
2209                       tree t0, t1, cond, one, minus_one;
2210 		      gassign *stmt;
2211 
2212 		      t0 = TREE_TYPE (arg0);
2213 		      t1 = TREE_TYPE (arg1);
2214 		      one = build_real (t0, dconst1);
2215 		      minus_one = build_real (t0, dconstm1);
2216 
2217 		      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2218 		      stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2219 						  arg1, build_int_cst (t1, 1));
2220 		      gimple_set_location (stmt, loc);
2221 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2222 
2223 		      result = make_temp_ssa_name (t0, NULL, "powi");
2224 		      stmt = gimple_build_assign (result, COND_EXPR, cond,
2225 						  minus_one, one);
2226 		      gimple_set_location (stmt, loc);
2227 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2228 		    }
2229 		  else
2230 		    {
2231 		      if (!tree_fits_shwi_p (arg1))
2232 			break;
2233 
2234 		      n = tree_to_shwi (arg1);
2235 		      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2236 		    }
2237 
2238 		  if (result)
2239 		    {
2240 		      tree lhs = gimple_get_lhs (stmt);
2241 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2242 		      gimple_set_location (new_stmt, loc);
2243 		      unlink_stmt_vdef (stmt);
2244 		      gsi_replace (&gsi, new_stmt, true);
2245 		      cleanup_eh = true;
2246 		      if (gimple_vdef (stmt))
2247 			release_ssa_name (gimple_vdef (stmt));
2248 		    }
2249 		  break;
2250 
2251 		CASE_CFN_CABS:
2252 		  arg0 = gimple_call_arg (stmt, 0);
2253 		  loc = gimple_location (stmt);
2254 		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2255 
2256 		  if (result)
2257 		    {
2258 		      tree lhs = gimple_get_lhs (stmt);
2259 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2260 		      gimple_set_location (new_stmt, loc);
2261 		      unlink_stmt_vdef (stmt);
2262 		      gsi_replace (&gsi, new_stmt, true);
2263 		      cleanup_eh = true;
2264 		      if (gimple_vdef (stmt))
2265 			release_ssa_name (gimple_vdef (stmt));
2266 		    }
2267 		  break;
2268 
2269 		default:;
2270 		}
2271 	    }
2272 	}
2273       if (cleanup_eh)
2274 	cfg_changed |= gimple_purge_dead_eh_edges (bb);
2275     }
2276 
2277   statistics_counter_event (fun, "sincos statements inserted",
2278 			    sincos_stats.inserted);
2279 
2280   return cfg_changed ? TODO_cleanup_cfg : 0;
2281 }
2282 
2283 } // anon namespace
2284 
2285 gimple_opt_pass *
2286 make_pass_cse_sincos (gcc::context *ctxt)
2287 {
2288   return new pass_cse_sincos (ctxt);
2289 }
2290 
2291 /* Return true if STMT is a type conversion operation that can be stripped
2292    when used in a widening multiply operation.  */
2293 static bool
2294 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2295 {
2296   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2297 
2298   if (TREE_CODE (result_type) == INTEGER_TYPE)
2299     {
2300       tree op_type;
2301       tree inner_op_type;
2302 
2303       if (!CONVERT_EXPR_CODE_P (rhs_code))
2304 	return false;
2305 
2306       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2307 
2308       /* If the type of OP has the same precision as the result, then
2309 	 we can strip this conversion.  The multiply operation will be
2310 	 selected to create the correct extension as a by-product.  */
2311       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2312 	return true;
2313 
2314       /* We can also strip a conversion if it preserves the signed-ness of
2315 	 the operation and doesn't narrow the range.  */
2316       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2317 
2318       /* If the inner-most type is unsigned, then we can strip any
2319 	 intermediate widening operation.  If it's signed, then the
2320 	 intermediate widening operation must also be signed.  */
2321       if ((TYPE_UNSIGNED (inner_op_type)
2322 	   || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2323 	  && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2324 	return true;
2325 
2326       return false;
2327     }
2328 
2329   return rhs_code == FIXED_CONVERT_EXPR;
2330 }
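
/* For example, given 16-bit shorts A and B and a 32-bit int result,
   the conversion feeding each operand of

     prod = (int) A * (int) B;

   is strippable: the converted type has the same precision as the
   result type, so a WIDEN_MULT_EXPR can perform the sign extension
   as a by-product.  */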
2331 
2332 /* Return true if RHS is a suitable operand for a widening multiplication,
2333    assuming a target type of TYPE.
2334    There are two cases:
2335 
2336      - RHS makes some value at least twice as wide.  Store that value
2337        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2338 
2339      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2340        but leave *TYPE_OUT untouched.  */
2341 
2342 static bool
2343 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2344 			tree *new_rhs_out)
2345 {
2346   gimple *stmt;
2347   tree type1, rhs1;
2348 
2349   if (TREE_CODE (rhs) == SSA_NAME)
2350     {
2351       stmt = SSA_NAME_DEF_STMT (rhs);
2352       if (is_gimple_assign (stmt))
2353 	{
2354 	  if (! widening_mult_conversion_strippable_p (type, stmt))
2355 	    rhs1 = rhs;
2356 	  else
2357 	    {
2358 	      rhs1 = gimple_assign_rhs1 (stmt);
2359 
2360 	      if (TREE_CODE (rhs1) == INTEGER_CST)
2361 		{
2362 		  *new_rhs_out = rhs1;
2363 		  *type_out = NULL;
2364 		  return true;
2365 		}
2366 	    }
2367 	}
2368       else
2369 	rhs1 = rhs;
2370 
2371       type1 = TREE_TYPE (rhs1);
2372 
2373       if (TREE_CODE (type1) != TREE_CODE (type)
2374 	  || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2375 	return false;
2376 
2377       *new_rhs_out = rhs1;
2378       *type_out = type1;
2379       return true;
2380     }
2381 
2382   if (TREE_CODE (rhs) == INTEGER_CST)
2383     {
2384       *new_rhs_out = rhs;
2385       *type_out = NULL;
2386       return true;
2387     }
2388 
2389   return false;
2390 }
2391 
2392 /* Return true if STMT performs a widening multiplication, assuming the
2393    output type is TYPE.  If so, store the unwidened types of the operands
2394    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2395    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2396    and *TYPE2_OUT would give the operands of the multiplication.  */
2397 
2398 static bool
2399 is_widening_mult_p (gimple *stmt,
2400 		    tree *type1_out, tree *rhs1_out,
2401 		    tree *type2_out, tree *rhs2_out)
2402 {
2403   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2404 
2405   if (TREE_CODE (type) == INTEGER_TYPE)
2406     {
2407       if (TYPE_OVERFLOW_TRAPS (type))
2408 	return false;
2409     }
2410   else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2411     return false;
2412 
2413   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2414 			       rhs1_out))
2415     return false;
2416 
2417   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2418 			       rhs2_out))
2419     return false;
2420 
2421   if (*type1_out == NULL)
2422     {
2423       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2424 	return false;
2425       *type1_out = *type2_out;
2426     }
2427 
2428   if (*type2_out == NULL)
2429     {
2430       if (!int_fits_type_p (*rhs2_out, *type1_out))
2431 	return false;
2432       *type2_out = *type1_out;
2433     }
2434 
2435   /* Ensure that the larger of the two operands comes first. */
2436   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2437     {
2438       std::swap (*type1_out, *type2_out);
2439       std::swap (*rhs1_out, *rhs2_out);
2440     }
2441 
2442   return true;
2443 }
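
/* For example, for 16-bit shorts A and B and a 32-bit int PROD,
   PROD = (int) A * (int) B is a widening multiplication: this
   returns true with *TYPE1_OUT == *TYPE2_OUT == short and the
   unconverted shorts in *RHS1_OUT and *RHS2_OUT.  PROD = (int) A
   * 1000 also qualifies; the INTEGER_CST initially leaves *TYPE2_OUT
   NULL and then inherits the short type because 1000 fits in it.  */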
2444 
2445 /* Check to see if the CALL statement is an invocation of copysign
2446    with 1. being the first argument.  */
2447 static bool
2448 is_copysign_call_with_1 (gimple *call)
2449 {
2450   gcall *c = dyn_cast <gcall *> (call);
2451   if (! c)
2452     return false;
2453 
2454   enum combined_fn code = gimple_call_combined_fn (c);
2455 
2456   if (code == CFN_LAST)
2457     return false;
2458 
2459   if (builtin_fn_p (code))
2460     {
2461       switch (as_builtin_fn (code))
2462 	{
2463 	CASE_FLT_FN (BUILT_IN_COPYSIGN):
2464 	CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2465 	  return real_onep (gimple_call_arg (c, 0));
2466 	default:
2467 	  return false;
2468 	}
2469     }
2470 
2471   if (internal_fn_p (code))
2472     {
2473       switch (as_internal_fn (code))
2474 	{
2475 	case IFN_COPYSIGN:
2476 	  return real_onep (gimple_call_arg (c, 0));
2477 	default:
2478 	  return false;
2479 	}
2480     }
2481 
2482    return false;
2483 }
2484 
2485 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2486    This only happens when the xorsign optab is defined; if the
2487    pattern is not a xorsign pattern or if expansion fails, FALSE is
2488    returned, otherwise TRUE is returned.  */
2489 static bool
2490 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2491 {
2492   tree treeop0, treeop1, lhs, type;
2493   location_t loc = gimple_location (stmt);
2494   lhs = gimple_assign_lhs (stmt);
2495   treeop0 = gimple_assign_rhs1 (stmt);
2496   treeop1 = gimple_assign_rhs2 (stmt);
2497   type = TREE_TYPE (lhs);
2498   machine_mode mode = TYPE_MODE (type);
2499 
2500   if (HONOR_SNANS (type))
2501     return false;
2502 
2503   if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2504     {
2505       gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2506       if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2507 	{
2508 	  call0 = SSA_NAME_DEF_STMT (treeop1);
2509 	  if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2510 	    return false;
2511 
2512 	  treeop1 = treeop0;
2513 	}
2514       if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2515 	return false;
2516 
2517       gcall *c = as_a <gcall *> (call0);
2518       treeop0 = gimple_call_arg (c, 1);
2519 
2520       gcall *call_stmt
2521 	= gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2522       gimple_set_lhs (call_stmt, lhs);
2523       gimple_set_location (call_stmt, loc);
2524       gsi_replace (gsi, call_stmt, true);
2525       return true;
2526     }
2527 
2528   return false;
2529 }
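
/* For example, x * copysign (1.0, y) has the magnitude of x and the
   sign of x XORed with the sign of y, so when the xorsign optab is
   available the whole pattern becomes a single IFN_XORSIGN (x, y),
   typically implemented by masking out the sign bit of y and XORing
   it into x; no multiplication is needed.  */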
2530 
2531 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2532    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2533    value is true iff we converted the statement.  */
2534 
2535 static bool
2536 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2537 {
2538   tree lhs, rhs1, rhs2, type, type1, type2;
2539   enum insn_code handler;
2540   scalar_int_mode to_mode, from_mode, actual_mode;
2541   optab op;
2542   int actual_precision;
2543   location_t loc = gimple_location (stmt);
2544   bool from_unsigned1, from_unsigned2;
2545 
2546   lhs = gimple_assign_lhs (stmt);
2547   type = TREE_TYPE (lhs);
2548   if (TREE_CODE (type) != INTEGER_TYPE)
2549     return false;
2550 
2551   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2552     return false;
2553 
2554   to_mode = SCALAR_INT_TYPE_MODE (type);
2555   from_mode = SCALAR_INT_TYPE_MODE (type1);
2556   if (to_mode == from_mode)
2557     return false;
2558 
2559   from_unsigned1 = TYPE_UNSIGNED (type1);
2560   from_unsigned2 = TYPE_UNSIGNED (type2);
2561 
2562   if (from_unsigned1 && from_unsigned2)
2563     op = umul_widen_optab;
2564   else if (!from_unsigned1 && !from_unsigned2)
2565     op = smul_widen_optab;
2566   else
2567     op = usmul_widen_optab;
2568 
2569   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2570 						  &actual_mode);
2571 
2572   if (handler == CODE_FOR_nothing)
2573     {
2574       if (op != smul_widen_optab)
2575 	{
2576 	  /* We can use a signed multiply with unsigned types as long as
2577 	     there is a wider mode to use, or it is the smaller of the two
2578 	     types that is unsigned.  Note that type1 >= type2, always.  */
2579 	  if ((TYPE_UNSIGNED (type1)
2580 	       && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2581 	      || (TYPE_UNSIGNED (type2)
2582 		  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2583 	    {
2584 	      if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2585 		  || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2586 		return false;
2587 	    }
2588 
2589 	  op = smul_widen_optab;
2590 	  handler = find_widening_optab_handler_and_mode (op, to_mode,
2591 							  from_mode,
2592 							  &actual_mode);
2593 
2594 	  if (handler == CODE_FOR_nothing)
2595 	    return false;
2596 
2597 	  from_unsigned1 = from_unsigned2 = false;
2598 	}
2599       else
2600 	return false;
2601     }
2602 
2603   /* Ensure that the inputs to the handler are in the correct precision
2604      for the opcode.  This will be the full mode size.  */
2605   actual_precision = GET_MODE_PRECISION (actual_mode);
2606   if (2 * actual_precision > TYPE_PRECISION (type))
2607     return false;
2608   if (actual_precision != TYPE_PRECISION (type1)
2609       || from_unsigned1 != TYPE_UNSIGNED (type1))
2610     rhs1 = build_and_insert_cast (gsi, loc,
2611 				  build_nonstandard_integer_type
2612 				    (actual_precision, from_unsigned1), rhs1);
2613   if (actual_precision != TYPE_PRECISION (type2)
2614       || from_unsigned2 != TYPE_UNSIGNED (type2))
2615     rhs2 = build_and_insert_cast (gsi, loc,
2616 				  build_nonstandard_integer_type
2617 				    (actual_precision, from_unsigned2), rhs2);
2618 
2619   /* Handle constants.  */
2620   if (TREE_CODE (rhs1) == INTEGER_CST)
2621     rhs1 = fold_convert (type1, rhs1);
2622   if (TREE_CODE (rhs2) == INTEGER_CST)
2623     rhs2 = fold_convert (type2, rhs2);
2624 
2625   gimple_assign_set_rhs1 (stmt, rhs1);
2626   gimple_assign_set_rhs2 (stmt, rhs2);
2627   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2628   update_stmt (stmt);
2629   widen_mul_stats.widen_mults_inserted++;
2630   return true;
2631 }
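
/* For example, on a 32-bit target providing the mulsidi3 pattern,

     int a, b;
     long long prod = (long long) a * b;

   starts out as a 64-bit MULT_EXPR of two sign-extended operands and
   is rewritten here into prod = a w* b (WIDEN_MULT_EXPR), which
   expands to a single 32x32->64 multiply instruction.  */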
2632 
2633 /* Process a single gimple statement STMT, which is found at the
2634    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2635    rhs (given by CODE), and try to convert it into a
2636    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2637    is true iff we converted the statement.  */
2638 
2639 static bool
2640 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2641 			    enum tree_code code)
2642 {
2643   gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2644   gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2645   tree type, type1, type2, optype;
2646   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2647   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2648   optab this_optab;
2649   enum tree_code wmult_code;
2650   enum insn_code handler;
2651   scalar_mode to_mode, from_mode, actual_mode;
2652   location_t loc = gimple_location (stmt);
2653   int actual_precision;
2654   bool from_unsigned1, from_unsigned2;
2655 
2656   lhs = gimple_assign_lhs (stmt);
2657   type = TREE_TYPE (lhs);
2658   if (TREE_CODE (type) != INTEGER_TYPE
2659       && TREE_CODE (type) != FIXED_POINT_TYPE)
2660     return false;
2661 
2662   if (code == MINUS_EXPR)
2663     wmult_code = WIDEN_MULT_MINUS_EXPR;
2664   else
2665     wmult_code = WIDEN_MULT_PLUS_EXPR;
2666 
2667   rhs1 = gimple_assign_rhs1 (stmt);
2668   rhs2 = gimple_assign_rhs2 (stmt);
2669 
2670   if (TREE_CODE (rhs1) == SSA_NAME)
2671     {
2672       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2673       if (is_gimple_assign (rhs1_stmt))
2674 	rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2675     }
2676 
2677   if (TREE_CODE (rhs2) == SSA_NAME)
2678     {
2679       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2680       if (is_gimple_assign (rhs2_stmt))
2681 	rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2682     }
2683 
2684   /* Allow for one conversion statement between the multiply
2685      and addition/subtraction statement.  If there is more than one
2686      conversion then we assume they would invalidate this
2687      transformation.  If that's not the case then they should have
2688      been folded before now.  */
2689   if (CONVERT_EXPR_CODE_P (rhs1_code))
2690     {
2691       conv1_stmt = rhs1_stmt;
2692       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2693       if (TREE_CODE (rhs1) == SSA_NAME)
2694 	{
2695 	  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2696 	  if (is_gimple_assign (rhs1_stmt))
2697 	    rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2698 	}
2699       else
2700 	return false;
2701     }
2702   if (CONVERT_EXPR_CODE_P (rhs2_code))
2703     {
2704       conv2_stmt = rhs2_stmt;
2705       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2706       if (TREE_CODE (rhs2) == SSA_NAME)
2707 	{
2708 	  rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2709 	  if (is_gimple_assign (rhs2_stmt))
2710 	    rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2711 	}
2712       else
2713 	return false;
2714     }
2715 
2716   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2717      is_widening_mult_p, but we still need the rhs returns.
2718 
2719      It might also appear that it would be sufficient to use the existing
2720      operands of the widening multiply, but that would limit the choice of
2721      multiply-and-accumulate instructions.
2722 
2723      If the widened-multiplication result has more than one use, it is
2724      probably wiser not to do the conversion.  */
2725   if (code == PLUS_EXPR
2726       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2727     {
2728       if (!has_single_use (rhs1)
2729 	  || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2730 				  &type2, &mult_rhs2))
2731 	return false;
2732       add_rhs = rhs2;
2733       conv_stmt = conv1_stmt;
2734     }
2735   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2736     {
2737       if (!has_single_use (rhs2)
2738 	  || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2739 				  &type2, &mult_rhs2))
2740 	return false;
2741       add_rhs = rhs1;
2742       conv_stmt = conv2_stmt;
2743     }
2744   else
2745     return false;
2746 
2747   to_mode = SCALAR_TYPE_MODE (type);
2748   from_mode = SCALAR_TYPE_MODE (type1);
2749   if (to_mode == from_mode)
2750     return false;
2751 
2752   from_unsigned1 = TYPE_UNSIGNED (type1);
2753   from_unsigned2 = TYPE_UNSIGNED (type2);
2754   optype = type1;
2755 
2756   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2757   if (from_unsigned1 != from_unsigned2)
2758     {
2759       if (!INTEGRAL_TYPE_P (type))
2760 	return false;
2761       /* We can use a signed multiply with unsigned types as long as
2762 	 there is a wider mode to use, or it is the smaller of the two
2763 	 types that is unsigned.  Note that type1 >= type2, always.  */
2764       if ((from_unsigned1
2765 	   && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2766 	  || (from_unsigned2
2767 	      && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2768 	{
2769 	  if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2770 	      || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2771 	    return false;
2772 	}
2773 
2774       from_unsigned1 = from_unsigned2 = false;
2775       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2776 					       false);
2777     }
2778 
2779   /* If there was a conversion between the multiply and addition
2780      then we need to make sure it fits a multiply-and-accumulate.
2781      There should be a single mode change which does not change the
2782      value.  */
2783   if (conv_stmt)
2784     {
2785       /* We use the original, unmodified data types for this.  */
2786       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2787       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2788       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2789       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2790 
2791       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2792 	{
2793 	  /* Conversion is a truncate.  */
2794 	  if (TYPE_PRECISION (to_type) < data_size)
2795 	    return false;
2796 	}
2797       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2798 	{
2799 	  /* Conversion is an extend.  Check it's the right sort.  */
2800 	  if (TYPE_UNSIGNED (from_type) != is_unsigned
2801 	      && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2802 	    return false;
2803 	}
2804       /* else convert is a no-op for our purposes.  */
2805     }
2806 
2807   /* Verify that the machine can perform a widening multiply
2808      accumulate in this mode/signedness combination, otherwise
2809      this transformation is likely to pessimize code.  */
2810   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2811   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2812 						  from_mode, &actual_mode);
2813 
2814   if (handler == CODE_FOR_nothing)
2815     return false;
2816 
2817   /* Ensure that the inputs to the handler are in the correct precision
2818      for the opcode.  This will be the full mode size.  */
2819   actual_precision = GET_MODE_PRECISION (actual_mode);
2820   if (actual_precision != TYPE_PRECISION (type1)
2821       || from_unsigned1 != TYPE_UNSIGNED (type1))
2822     mult_rhs1 = build_and_insert_cast (gsi, loc,
2823 				       build_nonstandard_integer_type
2824 				         (actual_precision, from_unsigned1),
2825 				       mult_rhs1);
2826   if (actual_precision != TYPE_PRECISION (type2)
2827       || from_unsigned2 != TYPE_UNSIGNED (type2))
2828     mult_rhs2 = build_and_insert_cast (gsi, loc,
2829 				       build_nonstandard_integer_type
2830 					 (actual_precision, from_unsigned2),
2831 				       mult_rhs2);
2832 
2833   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2834     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2835 
2836   /* Handle constants.  */
2837   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2838     mult_rhs1 = fold_convert (type1, mult_rhs1);
2839   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2840     mult_rhs2 = fold_convert (type2, mult_rhs2);
2841 
2842   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2843 				  add_rhs);
2844   update_stmt (gsi_stmt (*gsi));
2845   widen_mul_stats.maccs_inserted++;
2846   return true;
2847 }
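
/* For example, on a target providing a signed multiply-accumulate
   pattern such as ARM's maddhisi4 (SMLABB),

     short a, b;
     int acc;
     acc = acc + (int) a * (int) b;

   is rewritten into acc = a w* b + acc (WIDEN_MULT_PLUS_EXPR),
   mapping the widening, the multiply and the addition onto one
   instruction.  */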
2848 
2849 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2850    OP2 and which we know is used in statements that can be, together with the
2851    multiplication, converted to FMAs, perform the transformation.  */
2852 
2853 static void
2854 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2855 {
2856   tree type = TREE_TYPE (mul_result);
2857   gimple *use_stmt;
2858   imm_use_iterator imm_iter;
2859   gcall *fma_stmt;
2860 
2861   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2862     {
2863       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2864       tree addop, mulop1 = op1, result = mul_result;
2865       bool negate_p = false;
2866       gimple_seq seq = NULL;
2867 
2868       if (is_gimple_debug (use_stmt))
2869 	continue;
2870 
2871       if (is_gimple_assign (use_stmt)
2872 	  && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2873 	{
2874 	  result = gimple_assign_lhs (use_stmt);
2875 	  use_operand_p use_p;
2876 	  gimple *neguse_stmt;
2877 	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2878 	  gsi_remove (&gsi, true);
2879 	  release_defs (use_stmt);
2880 
2881 	  use_stmt = neguse_stmt;
2882 	  gsi = gsi_for_stmt (use_stmt);
2883 	  negate_p = true;
2884 	}
2885 
2886       tree cond, else_value, ops[3];
2887       tree_code code;
2888       if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2889 					      ops, &else_value))
2890 	gcc_unreachable ();
2891       addop = ops[0] == result ? ops[1] : ops[0];
2892 
2893       if (code == MINUS_EXPR)
2894 	{
2895 	  if (ops[0] == result)
2896 	    /* a * b - c -> a * b + (-c)  */
2897 	    addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2898 	  else
2899 	    /* a - b * c -> (-b) * c + a */
2900 	    negate_p = !negate_p;
2901 	}
2902 
2903       if (negate_p)
2904 	mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2905 
2906       if (seq)
2907 	gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2908 
2909       if (cond)
2910 	fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2911 					       op2, addop, else_value);
2912       else
2913 	fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2914       gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2915       gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2916 								   use_stmt));
2917       gsi_replace (&gsi, fma_stmt, true);
2918       /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2919 	 regardless of where the negation occurs.  */
2920       gimple *orig_stmt = gsi_stmt (gsi);
2921       if (fold_stmt (&gsi, follow_all_ssa_edges))
2922 	{
2923 	  if (maybe_clean_or_replace_eh_stmt (orig_stmt, gsi_stmt (gsi)))
2924 	    gcc_unreachable ();
2925 	  update_stmt (gsi_stmt (gsi));
2926 	}
2927 
2928       if (dump_file && (dump_flags & TDF_DETAILS))
2929 	{
2930 	  fprintf (dump_file, "Generated FMA ");
2931 	  print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2932 	  fprintf (dump_file, "\n");
2933 	}
2934 
2935       widen_mul_stats.fmas_inserted++;
2936     }
2937 }
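
/* For example, given T = A * B with the uses T + C, T - C and D - T,
   the loop above emits .FMA (A, B, C), .FMA (A, B, -C) and
   .FMA (-A, B, D) respectively; the subsequent fold_stmt with
   follow_all_ssa_edges then canonicalizes the negated forms into the
   FMS, FNMA and FNMS internal functions where the target supports
   them.  */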
2938 
2939 /* Data necessary to perform the actual transformation from a multiplication
2940    and an addition to an FMA after decision is taken it should be done and to
2941    then delete the multiplication statement from the function IL.  */
2942 
2943 struct fma_transformation_info
2944 {
2945   gimple *mul_stmt;
2946   tree mul_result;
2947   tree op1;
2948   tree op2;
2949 };
2950 
2951 /* Structure containing the current state of FMA deferring, i.e. whether we are
2952    deferring, whether to continue deferring, and all data necessary to come
2953    back and perform all deferred transformations.  */
2954 
2955 class fma_deferring_state
2956 {
2957 public:
2958   /* Class constructor.  Pass true as PERFORM_DEFERRING in order to actually
2959      do any deferring.  */
2960 
2961   fma_deferring_state (bool perform_deferring)
2962     : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
2963       m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
2964 
2965   /* List of FMA candidates for which the transformation has been determined
2966      possible but which, at this point in the BB analysis, we do not consider
2967      beneficial.  */
2968   auto_vec<fma_transformation_info, 8> m_candidates;
2969 
2970   /* Set of results of multiplication that are part of an already deferred FMA
2971      candidates.  */
2972   hash_set<tree> m_mul_result_set;
2973 
2974   /* The PHI that supposedly feeds back the result of one FMA to another over
2975      a loop boundary.  */
2976   gphi *m_initial_phi;
2977 
2978   /* Result of the last produced FMA candidate or NULL if there has not been
2979      one.  */
2980   tree m_last_result;
2981 
2982   /* If true, deferring might still be profitable.  If false, transform all
2983      candidates and no longer defer.  */
2984   bool m_deferring_p;
2985 };
2986 
2987 /* Transform all deferred FMA candidates and mark STATE as no longer
2988    deferring.  */
2989 
2990 static void
2991 cancel_fma_deferring (fma_deferring_state *state)
2992 {
2993   if (!state->m_deferring_p)
2994     return;
2995 
2996   for (unsigned i = 0; i < state->m_candidates.length (); i++)
2997     {
2998       if (dump_file && (dump_flags & TDF_DETAILS))
2999 	fprintf (dump_file, "Generating deferred FMA\n");
3000 
3001       const fma_transformation_info &fti = state->m_candidates[i];
3002       convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
3003 
3004       gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
3005       gsi_remove (&gsi, true);
3006       release_defs (fti.mul_stmt);
3007     }
3008   state->m_deferring_p = false;
3009 }
3010 
3011 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
3012    Otherwise return NULL.  */
3013 
3014 static gphi *
3015 result_of_phi (tree op)
3016 {
3017   if (TREE_CODE (op) != SSA_NAME)
3018     return NULL;
3019 
3020   return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3021 }
3022 
3023 /* After processing statements of a BB and recording STATE, return true if the
3024    initial phi is fed by the last FMA candidate result or one such result from
3025    previously processed BBs marked in LAST_RESULT_SET.  */
3026 
3027 static bool
3028 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3029 				      hash_set<tree> *last_result_set)
3030 {
3031   ssa_op_iter iter;
3032   use_operand_p use;
3033   FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3034     {
3035       tree t = USE_FROM_PTR (use);
3036       if (t == state->m_last_result
3037 	  || last_result_set->contains (t))
3038 	return true;
3039     }
3040 
3041   return false;
3042 }
3043 
3044 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3045    with uses in additions and subtractions to form fused multiply-add
3046    operations.  Returns true if successful and MUL_STMT should be removed.
3047 
3048    If STATE indicates that we are deferring FMA transformation, that means
3049    that we do not produce FMAs for basic blocks which look like:
3050 
3051     <bb 6>
3052     # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3053     _65 = _14 * _16;
3054     accumulator_66 = _65 + accumulator_111;
3055 
3056   or its unrolled version, i.e. with several FMA candidates that feed the
3057   result of one into the addend of another.  Instead, we add them to a list in
3058   STATE and if we later discover an FMA candidate that is not part of such a
3059   chain, we go back and perform all previously deferred candidates.  */
3060 
3061 static bool
3062 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3063 		     fma_deferring_state *state)
3064 {
3065   tree mul_result = gimple_get_lhs (mul_stmt);
3066   tree type = TREE_TYPE (mul_result);
3067   gimple *use_stmt, *neguse_stmt;
3068   use_operand_p use_p;
3069   imm_use_iterator imm_iter;
3070 
3071   if (FLOAT_TYPE_P (type)
3072       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3073     return false;
3074 
3075   /* We don't want to do bitfield reduction ops.  */
3076   if (INTEGRAL_TYPE_P (type)
3077       && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3078     return false;
3079 
3080   /* If the target doesn't support it, don't generate it.  We assume that
3081      if fma isn't available then fms, fnma or fnms are not either.  */
3082   optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3083   if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3084     return false;
3085 
3086   /* If the multiplication has zero uses, it is kept around probably because
3087      of -fnon-call-exceptions.  Don't optimize it away in that case,
3088      it is DCE's job.  */
3089   if (has_zero_uses (mul_result))
3090     return false;
3091 
3092   bool check_defer
3093     = (state->m_deferring_p
3094        && (tree_to_shwi (TYPE_SIZE (type))
3095 	   <= PARAM_VALUE (PARAM_AVOID_FMA_MAX_BITS)));
3096   bool defer = check_defer;
3097   bool seen_negate_p = false;
3098   /* Make sure that the multiplication statement becomes dead after
3099      the transformation, i.e. that all uses are transformed to FMAs.
3100      This means we assume that an FMA operation has the same cost
3101      as an addition.  */
3102   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3103     {
3104       tree result = mul_result;
3105       bool negate_p = false;
3106 
3107       use_stmt = USE_STMT (use_p);
3108 
3109       if (is_gimple_debug (use_stmt))
3110 	continue;
3111 
3112       /* For now, restrict this operation to single basic blocks.  In theory
3113 	 we would want to support sinking the multiplication in
3114 	 m = a*b;
3115 	 if ()
3116 	   ma = m + c;
3117 	 else
3118 	   d = m;
3119 	 to form a fma in the then block and sink the multiplication to the
3120 	 else block.  */
3121       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3122 	return false;
3123 
      /* A negate on the multiplication leads to FNMA.  */
      if (is_gimple_assign (use_stmt)
	  && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
	{
	  ssa_op_iter iter;
	  use_operand_p usep;

	  /* If (due to earlier missed optimizations) we have two
	     negates of the same value, treat them as equivalent
	     to a single negate with multiple uses.  */
	  if (seen_negate_p)
	    return false;

	  result = gimple_assign_lhs (use_stmt);

	  /* Make sure the negate statement becomes dead with this
	     single transformation.  */
	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
			       &use_p, &neguse_stmt))
	    return false;

	  /* Make sure the multiplication isn't also used on that stmt.  */
	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
	    if (USE_FROM_PTR (usep) == mul_result)
	      return false;

	  /* Re-validate.  */
	  use_stmt = neguse_stmt;
	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
	    return false;

	  negate_p = seen_negate_p = true;
	}

      tree cond, else_value, ops[3];
      tree_code code;
      if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
					      &else_value))
	return false;

      switch (code)
	{
	case MINUS_EXPR:
	  if (ops[1] == result)
	    negate_p = !negate_p;
	  break;
	case PLUS_EXPR:
	  break;
	default:
	  /* FMA can only be formed from PLUS and MINUS.  */
	  return false;
	}

      if (cond)
	{
	  if (cond == result || else_value == result)
	    return false;
	  if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
	    return false;
	}

      /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
	 we'll visit later, we might be able to get a more profitable
	 match with fnma.
	 OTOH, if we don't, a negate / fma pair has likely lower latency
	 than a mult / subtract pair.  */
      if (code == MINUS_EXPR
	  && !negate_p
	  && ops[0] == result
	  && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
	  && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
	  && TREE_CODE (ops[1]) == SSA_NAME
	  && has_single_use (ops[1]))
	{
	  gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
	  if (is_gimple_assign (stmt2)
	      && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
	    return false;
	}

      /* We can't handle a * b + a * b.  */
      if (ops[0] == ops[1])
	return false;
      /* If deferring, make sure we are not looking at an instruction that
	 wouldn't have existed if we were not.  */
      if (state->m_deferring_p
	  && (state->m_mul_result_set.contains (ops[0])
	      || state->m_mul_result_set.contains (ops[1])))
	return false;

      if (check_defer)
	{
	  tree use_lhs = gimple_get_lhs (use_stmt);
	  if (state->m_last_result)
	    {
	      if (ops[1] == state->m_last_result
		  || ops[0] == state->m_last_result)
		defer = true;
	      else
		defer = false;
	    }
	  else
	    {
	      gcc_checking_assert (!state->m_initial_phi);
	      gphi *phi;
	      if (ops[0] == result)
		phi = result_of_phi (ops[1]);
	      else
		{
		  gcc_assert (ops[1] == result);
		  phi = result_of_phi (ops[0]);
		}

	      if (phi)
		{
		  state->m_initial_phi = phi;
		  defer = true;
		}
	      else
		defer = false;
	    }

	  state->m_last_result = use_lhs;
	  check_defer = false;
	}
      else
	defer = false;

      /* While it is possible to validate whether or not the exact form that
	 we've recognized is available in the backend, the assumption is that
	 if the deferring logic above did not trigger, the transformation is
	 never a loss.  For instance, suppose the target only has the plain FMA
	 pattern available.  Consider a*b-c -> fma(a,b,-c): we've exchanged
	 MUL+SUB for FMA+NEG, which is still two operations.  Consider
	 -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
	 form the two NEGs are independent and could be run in parallel.  */
    }

  if (defer)
    {
      fma_transformation_info fti;
      fti.mul_stmt = mul_stmt;
      fti.mul_result = mul_result;
      fti.op1 = op1;
      fti.op2 = op2;
      state->m_candidates.safe_push (fti);
      state->m_mul_result_set.add (mul_result);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Deferred generating FMA for multiplication ");
	  print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
	  fprintf (dump_file, "\n");
	}

      return false;
    }
  else
    {
      if (state->m_deferring_p)
	cancel_fma_deferring (state);
      convert_mult_to_fma_1 (mul_result, op1, op2);
      return true;
    }
}
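
/* An editorial aside, summarizing the documented semantics of the internal
   functions the recognition above can lead to (a sketch, not a statement
   about this file):

     FMA  (a, b, c) =   a * b + c
     FMS  (a, b, c) =   a * b - c
     FNMA (a, b, c) = - a * b + c
     FNMS (a, b, c) = - a * b - c

   so e.g. x = a * b; r = c - x maps to FNMA (a, b, c), and a negated
   multiplication feeding a subtraction maps to FNMS.  */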


/* Helper function of match_uaddsub_overflow.  Return 1 if USE_STMT
   is an unsigned overflow check of the form ovf != 0 on the result
   of STMT, -1 if USE_STMT is an unsigned overflow check of the form
   ovf == 0, and 0 otherwise.  */
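
/* An illustrative example (hypothetical GIMPLE): given
     r_1 = x_2 + y_3;		<-- STMT
     if (x_2 > r_1) ...		<-- USE_STMT
   the check matches the PLUS_EXPR/GT_EXPR case below and we return 1;
   with the condition inverted to x_2 <= r_1 we would return -1.  */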

static int
uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
{
  enum tree_code ccode = ERROR_MARK;
  tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
  if (gimple_code (use_stmt) == GIMPLE_COND)
    {
      ccode = gimple_cond_code (use_stmt);
      crhs1 = gimple_cond_lhs (use_stmt);
      crhs2 = gimple_cond_rhs (use_stmt);
    }
  else if (is_gimple_assign (use_stmt))
    {
      if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
	{
	  ccode = gimple_assign_rhs_code (use_stmt);
	  crhs1 = gimple_assign_rhs1 (use_stmt);
	  crhs2 = gimple_assign_rhs2 (use_stmt);
	}
      else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
	{
	  tree cond = gimple_assign_rhs1 (use_stmt);
	  if (COMPARISON_CLASS_P (cond))
	    {
	      ccode = TREE_CODE (cond);
	      crhs1 = TREE_OPERAND (cond, 0);
	      crhs2 = TREE_OPERAND (cond, 1);
	    }
	  else
	    return 0;
	}
      else
	return 0;
    }
  else
    return 0;

  if (TREE_CODE_CLASS (ccode) != tcc_comparison)
    return 0;

  enum tree_code code = gimple_assign_rhs_code (stmt);
  tree lhs = gimple_assign_lhs (stmt);
  tree rhs1 = gimple_assign_rhs1 (stmt);
  tree rhs2 = gimple_assign_rhs2 (stmt);

  switch (ccode)
    {
    case GT_EXPR:
    case LE_EXPR:
      /* r = a - b; r > a or r <= a
	 r = a + b; a > r or a <= r or b > r or b <= r.  */
      if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
	  || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
	      && crhs2 == lhs))
	return ccode == GT_EXPR ? 1 : -1;
      break;
    case LT_EXPR:
    case GE_EXPR:
      /* r = a - b; a < r or a >= r
	 r = a + b; r < a or r >= a or r < b or r >= b.  */
      if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
	  || (code == PLUS_EXPR && crhs1 == lhs
	      && (crhs2 == rhs1 || crhs2 == rhs2)))
	return ccode == LT_EXPR ? 1 : -1;
      break;
    default:
      break;
    }
  return 0;
}

/* Recognize for unsigned x
   x = y - z;
   if (x > y)
   where there are other uses of x and replace it with
   _7 = SUB_OVERFLOW (y, z);
   x = REALPART_EXPR <_7>;
   _8 = IMAGPART_EXPR <_7>;
   if (_8)
   and similarly for addition.  */
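
/* For the addition case (an illustrative sketch along the same lines):
   x = y + z;
   if (x < y)
   becomes
   _7 = ADD_OVERFLOW (y, z);
   x = REALPART_EXPR <_7>;
   _8 = IMAGPART_EXPR <_7>;
   if (_8)
   reusing the single IFN_ADD_OVERFLOW result for both x and the check.  */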

static bool
match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
			enum tree_code code)
{
  tree lhs = gimple_assign_lhs (stmt);
  tree type = TREE_TYPE (lhs);
  use_operand_p use_p;
  imm_use_iterator iter;
  bool use_seen = false;
  bool ovf_use_seen = false;
  gimple *use_stmt;

  gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
  if (!INTEGRAL_TYPE_P (type)
      || !TYPE_UNSIGNED (type)
      || has_zero_uses (lhs)
      || has_single_use (lhs)
      || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
			TYPE_MODE (type)) == CODE_FOR_nothing)
    return false;

  FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
    {
      use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
	continue;

      if (uaddsub_overflow_check_p (stmt, use_stmt))
	ovf_use_seen = true;
      else
	use_seen = true;
      if (ovf_use_seen && use_seen)
	break;
    }

  if (!ovf_use_seen || !use_seen)
    return false;

  tree ctype = build_complex_type (type);
  tree rhs1 = gimple_assign_rhs1 (stmt);
  tree rhs2 = gimple_assign_rhs2 (stmt);
  gcall *g = gimple_build_call_internal (code == PLUS_EXPR
					 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
					 2, rhs1, rhs2);
  tree ctmp = make_ssa_name (ctype);
  gimple_call_set_lhs (g, ctmp);
  gsi_insert_before (gsi, g, GSI_SAME_STMT);
  gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
				     build1 (REALPART_EXPR, type, ctmp));
  gsi_replace (gsi, g2, true);
  tree ovf = make_ssa_name (type);
  g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
			    build1 (IMAGPART_EXPR, type, ctmp));
  gsi_insert_after (gsi, g2, GSI_NEW_STMT);

  FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
    {
      if (is_gimple_debug (use_stmt))
	continue;

      int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
      if (ovf_use == 0)
	continue;
      if (gimple_code (use_stmt) == GIMPLE_COND)
	{
	  gcond *cond_stmt = as_a <gcond *> (use_stmt);
	  gimple_cond_set_lhs (cond_stmt, ovf);
	  gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
	  gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
	}
      else
	{
	  gcc_checking_assert (is_gimple_assign (use_stmt));
	  if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
	    {
	      gimple_assign_set_rhs1 (use_stmt, ovf);
	      gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
	      gimple_assign_set_rhs_code (use_stmt,
					  ovf_use == 1 ? NE_EXPR : EQ_EXPR);
	    }
	  else
	    {
	      gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
				   == COND_EXPR);
	      tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
				  boolean_type_node, ovf,
				  build_int_cst (type, 0));
	      gimple_assign_set_rhs1 (use_stmt, cond);
	    }
	}
      update_stmt (use_stmt);
    }
  return true;
}

/* Return true if the target has support for divmod.  */

static bool
target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
{
  /* If the target supports a hardware divmod insn, use it for divmod.  */
  if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
    return true;

  /* Check if a libfunc for divmod is available.  */
  rtx libfunc = optab_libfunc (divmod_optab, mode);
  if (libfunc != NULL_RTX)
    {
      /* If an optab_handler exists for div_optab, perhaps in a wider mode,
	 we don't want to use the libfunc even if it exists for the given
	 mode.  */
      machine_mode div_mode;
      FOR_EACH_MODE_FROM (div_mode, mode)
	if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
	  return false;

      return targetm.expand_divmod_libfunc != NULL;
    }

  return false;
}
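
/* Note: the intent of the checks above (an editorial summary, not a target
   guarantee) is that a hardware division insn, even one operating in a
   wider mode, is assumed cheaper than a divmod libcall; the libcall is used
   only if no div insn exists in MODE or any wider mode and the target knows
   how to expand the call.  */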

/* Check if STMT is a candidate for the divmod transform.  */

static bool
divmod_candidate_p (gassign *stmt)
{
  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
  machine_mode mode = TYPE_MODE (type);
  optab divmod_optab, div_optab;

  if (TYPE_UNSIGNED (type))
    {
      divmod_optab = udivmod_optab;
      div_optab = udiv_optab;
    }
  else
    {
      divmod_optab = sdivmod_optab;
      div_optab = sdiv_optab;
    }

  tree op1 = gimple_assign_rhs1 (stmt);
  tree op2 = gimple_assign_rhs2 (stmt);

  /* Disable the transform if either operand is a constant, since
     division-by-constant may have specialized expansion.  */
  if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
    return false;

  /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
     expand using the [su]divv optabs.  */
  if (TYPE_OVERFLOW_TRAPS (type))
    return false;

  if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
    return false;

  return true;
}
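
/* For example (illustrative): x / 16 or x % 7 are rejected above, because
   division and modulo by a constant are usually expanded into cheaper
   shift/multiply sequences, which a combined DIVMOD call would defeat.  */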

/* This function looks for:
   t1 = a TRUNC_DIV_EXPR b;
   t2 = a TRUNC_MOD_EXPR b;
   and transforms it to the following sequence:
   complex_tmp = DIVMOD (a, b);
   t1 = REALPART_EXPR (complex_tmp);
   t2 = IMAGPART_EXPR (complex_tmp);
   For conditions enabling the transform see divmod_candidate_p().

   The pass has three parts:
   1) Find top_stmt, a trunc_div or trunc_mod stmt that dominates all
      other trunc_div_expr and trunc_mod_expr stmts.
   2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
      to the stmts vector.
   3) Insert a DIVMOD call just before top_stmt and update entries in the
      stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
      IMAGPART_EXPR for mod).  */
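
/* As a concrete (illustrative) example, for

     int q = x / y;
     ...
     int r = x % y;

   the division statement dominating the modulo becomes top_stmt, both
   statements land in the stmts vector, and after the transform both are
   fed from a single complex-valued DIVMOD (x, y) call.  */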

static bool
convert_to_divmod (gassign *stmt)
{
  if (stmt_can_throw_internal (cfun, stmt)
      || !divmod_candidate_p (stmt))
    return false;

  tree op1 = gimple_assign_rhs1 (stmt);
  tree op2 = gimple_assign_rhs2 (stmt);

  imm_use_iterator use_iter;
  gimple *use_stmt;
  auto_vec<gimple *> stmts;

  gimple *top_stmt = stmt;
  basic_block top_bb = gimple_bb (stmt);

  /* Part 1: Try to set top_stmt to the "topmost" stmt that dominates
     at least STMT and possibly other trunc_div/trunc_mod stmts
     having the same operands as STMT.  */

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
    {
      if (is_gimple_assign (use_stmt)
	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
	  && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
	  && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
	{
	  if (stmt_can_throw_internal (cfun, use_stmt))
	    continue;

	  basic_block bb = gimple_bb (use_stmt);

	  if (bb == top_bb)
	    {
	      if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
		top_stmt = use_stmt;
	    }
	  else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
	    {
	      top_bb = bb;
	      top_stmt = use_stmt;
	    }
	}
    }

  tree top_op1 = gimple_assign_rhs1 (top_stmt);
  tree top_op2 = gimple_assign_rhs2 (top_stmt);

  stmts.safe_push (top_stmt);
  bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);

  /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
     to the stmts vector.  STMT itself always ends up in the vector (either
     as top_stmt or added by this loop, since gimple_bb (top_stmt) dominates
     gimple_bb (stmt)), so the vector contains at least one trunc_mod_expr
     stmt.  */

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
    {
      if (is_gimple_assign (use_stmt)
	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
	  && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
	  && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
	{
	  if (use_stmt == top_stmt
	      || stmt_can_throw_internal (cfun, use_stmt)
	      || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
	    continue;

	  stmts.safe_push (use_stmt);
	  if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
	    div_seen = true;
	}
    }

  if (!div_seen)
    return false;

  /* Part 3: Create a call to the internal function DIVMOD:
     divmod_tmp = DIVMOD (op1, op2).  */

  gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
  tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
				 call_stmt, "divmod_tmp");
  gimple_call_set_lhs (call_stmt, res);
  /* We rejected throwing statements above.  */
  gimple_call_set_nothrow (call_stmt, true);

  /* Insert the call before top_stmt.  */
  gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
  gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);

  widen_mul_stats.divmod_calls_inserted++;

  /* Update all statements in the stmts vector:
     lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
     lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>.  */

  for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree new_rhs;

      switch (gimple_assign_rhs_code (use_stmt))
	{
	  case TRUNC_DIV_EXPR:
	    new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
	    break;

	  case TRUNC_MOD_EXPR:
	    new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
	    break;

	  default:
	    gcc_unreachable ();
	}

      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
      update_stmt (use_stmt);
    }

  return true;
}

/* Find integer multiplications where the operands are extended from
   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
   where appropriate.  */

namespace {

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_WIDEN_MUL, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul

/* Walker class to perform the transformation in reverse dominance order.  */

class math_opts_dom_walker : public dom_walker
{
public:
  /* Constructor, CFG_CHANGED_P is a pointer to a boolean flag that will be
     set if walking modifies the CFG.  */

  math_opts_dom_walker (bool *cfg_changed_p)
    : dom_walker (CDI_DOMINATORS), m_last_result_set (),
      m_cfg_changed_p (cfg_changed_p) {}

  /* The actual actions performed in the walk.  */

  virtual void after_dom_children (basic_block);

  /* Set of results of chains of multiply and add statement combinations that
     were not transformed into FMAs because of active deferring.  */
  hash_set<tree> m_last_result_set;

  /* Pointer to a flag of the user that needs to be set if the CFG has been
     modified.  */
  bool *m_cfg_changed_p;
};

void
math_opts_dom_walker::after_dom_children (basic_block bb)
{
  gimple_stmt_iterator gsi;

  fma_deferring_state fma_state (PARAM_VALUE (PARAM_AVOID_FMA_MAX_BITS) > 0);

  for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
    {
      gimple *stmt = gsi_stmt (gsi);
      enum tree_code code;

      if (is_gimple_assign (stmt))
	{
	  code = gimple_assign_rhs_code (stmt);
	  switch (code)
	    {
	    case MULT_EXPR:
	      if (!convert_mult_to_widen (stmt, &gsi)
		  && !convert_expand_mult_copysign (stmt, &gsi)
		  && convert_mult_to_fma (stmt,
					  gimple_assign_rhs1 (stmt),
					  gimple_assign_rhs2 (stmt),
					  &fma_state))
		{
		  gsi_remove (&gsi, true);
		  release_defs (stmt);
		  continue;
		}
	      break;

	    case PLUS_EXPR:
	    case MINUS_EXPR:
	      if (!convert_plusminus_to_widen (&gsi, stmt, code))
		match_uaddsub_overflow (&gsi, stmt, code);
	      break;

	    case TRUNC_MOD_EXPR:
	      convert_to_divmod (as_a<gassign *> (stmt));
	      break;

	    default:;
	    }
	}
      else if (is_gimple_call (stmt))
	{
	  tree fndecl = gimple_call_fndecl (stmt);
	  if (fndecl && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
	    {
	      switch (DECL_FUNCTION_CODE (fndecl))
		{
		case BUILT_IN_POWF:
		case BUILT_IN_POW:
		case BUILT_IN_POWL:
		  if (gimple_call_lhs (stmt)
		      && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
		      && real_equal
		      (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
		       &dconst2)
		      && convert_mult_to_fma (stmt,
					      gimple_call_arg (stmt, 0),
					      gimple_call_arg (stmt, 0),
					      &fma_state))
		    {
		      unlink_stmt_vdef (stmt);
		      if (gsi_remove (&gsi, true)
			  && gimple_purge_dead_eh_edges (bb))
			*m_cfg_changed_p = true;
		      release_defs (stmt);
		      continue;
		    }
		  break;

		default:;
		}
	    }
	  else
	    cancel_fma_deferring (&fma_state);
	}
      gsi_next (&gsi);
    }
  if (fma_state.m_deferring_p
      && fma_state.m_initial_phi)
    {
      gcc_checking_assert (fma_state.m_last_result);
      if (!last_fma_candidate_feeds_initial_phi (&fma_state,
						 &m_last_result_set))
	cancel_fma_deferring (&fma_state);
      else
	m_last_result_set.add (fma_state.m_last_result);
    }
}


unsigned int
pass_optimize_widening_mul::execute (function *fun)
{
  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  renumber_gimple_stmt_uids (cfun);

  math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));

  statistics_counter_event (fun, "widening multiplications inserted",
			    widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
			    widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
			    widen_mul_stats.fmas_inserted);
  statistics_counter_event (fun, "divmod calls inserted",
			    widen_mul_stats.divmod_calls_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}