xref: /openbsd/gnu/gcc/gcc/tree-vect-generic.c (revision 404b540a)
1 /* Lower vector operations to scalar operations.
2    Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 2, or (at your option) any
9 later version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING.  If not, write to the Free
18 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301, USA.  */
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tree.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "expr.h"
28 #include "insn-codes.h"
29 #include "diagnostic.h"
30 #include "optabs.h"
31 #include "machmode.h"
32 #include "langhooks.h"
33 #include "tree-flow.h"
34 #include "tree-gimple.h"
35 #include "tree-iterator.h"
36 #include "tree-pass.h"
37 #include "flags.h"
38 #include "ggc.h"
39 
40 
41 /* Build a constant of type TYPE, made of VALUE's bits replicated
42    every TYPE_SIZE (INNER_TYPE) bits to fit TYPE's precision.  */
43 static tree
build_replicated_const(tree type,tree inner_type,HOST_WIDE_INT value)44 build_replicated_const (tree type, tree inner_type, HOST_WIDE_INT value)
45 {
46   int width = tree_low_cst (TYPE_SIZE (inner_type), 1);
47   int n = HOST_BITS_PER_WIDE_INT / width;
48   unsigned HOST_WIDE_INT low, high, mask;
49   tree ret;
50 
51   gcc_assert (n);
52 
53   if (width == HOST_BITS_PER_WIDE_INT)
54     low = value;
55   else
56     {
57       mask = ((HOST_WIDE_INT)1 << width) - 1;
58       low = (unsigned HOST_WIDE_INT) ~0 / mask * (value & mask);
59     }
60 
61   if (TYPE_PRECISION (type) < HOST_BITS_PER_WIDE_INT)
62     low &= ((HOST_WIDE_INT)1 << TYPE_PRECISION (type)) - 1, high = 0;
63   else if (TYPE_PRECISION (type) == HOST_BITS_PER_WIDE_INT)
64     high = 0;
65   else if (TYPE_PRECISION (type) == 2 * HOST_BITS_PER_WIDE_INT)
66     high = low;
67   else
68     gcc_unreachable ();
69 
70   ret = build_int_cst_wide (type, low, high);
71   return ret;
72 }
73 
/* One-entry cache for word-mode vector types: the scalar word-mode
   element type, the most recently built vector type, and its unit
   count.  GTY(()) registers them as garbage-collection roots.  */
static GTY(()) tree vector_inner_type;
static GTY(()) tree vector_last_type;
static GTY(()) int vector_last_nunits;

/* Return a suitable vector types made of SUBPARTS units each of mode
   "word_mode" (the global variable).  */
static tree
build_word_mode_vector_type (int nunits)
{
  /* First call: create the word-mode element type; the cached vector
     type cannot match yet.  */
  if (!vector_inner_type)
    vector_inner_type = lang_hooks.types.type_for_mode (word_mode, 1);
  else if (vector_last_nunits == nunits)
    {
      gcc_assert (TREE_CODE (vector_last_type) == VECTOR_TYPE);
      return vector_last_type;
    }

  /* We build a new type, but we canonicalize it nevertheless,
     because it still saves some memory.  */
  vector_last_nunits = nunits;
  vector_last_type = type_hash_canon (nunits,
				      build_vector_type (vector_inner_type,
							 nunits));
  return vector_last_type;
}
99 
/* Callback type for per-piece expansion: build the scalar operation
   CODE on pieces of operands A and B at a given bit position and bit
   size, in a given inner type (see expand_vector_piecewise).  */
typedef tree (*elem_op_func) (block_stmt_iterator *,
			      tree, tree, tree, tree, tree, enum tree_code);

/* Extract BITSIZE bits of T at bit position BITPOS as a value of TYPE.
   A NULL_TREE BITPOS means "the whole of T", which only needs the bits
   reinterpreted, not a bit-field extraction.  */
static inline tree
tree_vec_extract (block_stmt_iterator *bsi, tree type,
		  tree t, tree bitsize, tree bitpos)
{
  if (bitpos)
    return gimplify_build3 (bsi, BIT_FIELD_REF, type, t, bitsize, bitpos);
  else
    return gimplify_build1 (bsi, VIEW_CONVERT_EXPR, type, t);
}
112 
113 static tree
do_unop(block_stmt_iterator * bsi,tree inner_type,tree a,tree b ATTRIBUTE_UNUSED,tree bitpos,tree bitsize,enum tree_code code)114 do_unop (block_stmt_iterator *bsi, tree inner_type, tree a,
115 	 tree b ATTRIBUTE_UNUSED, tree bitpos, tree bitsize,
116 	 enum tree_code code)
117 {
118   a = tree_vec_extract (bsi, inner_type, a, bitsize, bitpos);
119   return gimplify_build1 (bsi, code, inner_type, a);
120 }
121 
122 static tree
do_binop(block_stmt_iterator * bsi,tree inner_type,tree a,tree b,tree bitpos,tree bitsize,enum tree_code code)123 do_binop (block_stmt_iterator *bsi, tree inner_type, tree a, tree b,
124 	  tree bitpos, tree bitsize, enum tree_code code)
125 {
126   a = tree_vec_extract (bsi, inner_type, a, bitsize, bitpos);
127   b = tree_vec_extract (bsi, inner_type, b, bitsize, bitpos);
128   return gimplify_build2 (bsi, code, inner_type, a, b);
129 }
130 
131 /* Expand vector addition to scalars.  This does bit twiddling
132    in order to increase parallelism:
133 
134    a + b = (((int) a & 0x7f7f7f7f) + ((int) b & 0x7f7f7f7f)) ^
135            (a ^ b) & 0x80808080
136 
137    a - b =  (((int) a | 0x80808080) - ((int) b & 0x7f7f7f7f)) ^
138             (a ^ ~b) & 0x80808080
139 
140    -b = (0x80808080 - ((int) b & 0x7f7f7f7f)) ^ (~b & 0x80808080)
141 
142    This optimization should be done only if 4 vector items or more
143    fit into a word.  */
144 static tree
do_plus_minus(block_stmt_iterator * bsi,tree word_type,tree a,tree b,tree bitpos ATTRIBUTE_UNUSED,tree bitsize ATTRIBUTE_UNUSED,enum tree_code code)145 do_plus_minus (block_stmt_iterator *bsi, tree word_type, tree a, tree b,
146 	       tree bitpos ATTRIBUTE_UNUSED, tree bitsize ATTRIBUTE_UNUSED,
147 	       enum tree_code code)
148 {
149   tree inner_type = TREE_TYPE (TREE_TYPE (a));
150   unsigned HOST_WIDE_INT max;
151   tree low_bits, high_bits, a_low, b_low, result_low, signs;
152 
153   max = GET_MODE_MASK (TYPE_MODE (inner_type));
154   low_bits = build_replicated_const (word_type, inner_type, max >> 1);
155   high_bits = build_replicated_const (word_type, inner_type, max & ~(max >> 1));
156 
157   a = tree_vec_extract (bsi, word_type, a, bitsize, bitpos);
158   b = tree_vec_extract (bsi, word_type, b, bitsize, bitpos);
159 
160   signs = gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, a, b);
161   b_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, b, low_bits);
162   if (code == PLUS_EXPR)
163     a_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, a, low_bits);
164   else
165     {
166       a_low = gimplify_build2 (bsi, BIT_IOR_EXPR, word_type, a, high_bits);
167       signs = gimplify_build1 (bsi, BIT_NOT_EXPR, word_type, signs);
168     }
169 
170   signs = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, signs, high_bits);
171   result_low = gimplify_build2 (bsi, code, word_type, a_low, b_low);
172   return gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, result_low, signs);
173 }
174 
175 static tree
do_negate(block_stmt_iterator * bsi,tree word_type,tree b,tree unused ATTRIBUTE_UNUSED,tree bitpos ATTRIBUTE_UNUSED,tree bitsize ATTRIBUTE_UNUSED,enum tree_code code ATTRIBUTE_UNUSED)176 do_negate (block_stmt_iterator *bsi, tree word_type, tree b,
177 	   tree unused ATTRIBUTE_UNUSED, tree bitpos ATTRIBUTE_UNUSED,
178 	   tree bitsize ATTRIBUTE_UNUSED,
179 	   enum tree_code code ATTRIBUTE_UNUSED)
180 {
181   tree inner_type = TREE_TYPE (TREE_TYPE (b));
182   HOST_WIDE_INT max;
183   tree low_bits, high_bits, b_low, result_low, signs;
184 
185   max = GET_MODE_MASK (TYPE_MODE (inner_type));
186   low_bits = build_replicated_const (word_type, inner_type, max >> 1);
187   high_bits = build_replicated_const (word_type, inner_type, max & ~(max >> 1));
188 
189   b = tree_vec_extract (bsi, word_type, b, bitsize, bitpos);
190 
191   b_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, b, low_bits);
192   signs = gimplify_build1 (bsi, BIT_NOT_EXPR, word_type, b);
193   signs = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, signs, high_bits);
194   result_low = gimplify_build2 (bsi, MINUS_EXPR, word_type, high_bits, b_low);
195   return gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, result_low, signs);
196 }
197 
/* Expand a vector operation to scalars, by using many operations
   whose type is the vector type's inner type.  F is called once per
   INNER_TYPE-sized piece of A (and B, for binary codes) and the
   results are collected into a CONSTRUCTOR of TYPE.  */
static tree
expand_vector_piecewise (block_stmt_iterator *bsi, elem_op_func f,
			 tree type, tree inner_type,
			 tree a, tree b, enum tree_code code)
{
  VEC(constructor_elt,gc) *v;
  tree part_width = TYPE_SIZE (inner_type);
  tree index = bitsize_int (0);
  int nunits = TYPE_VECTOR_SUBPARTS (type);
  /* Number of original vector elements covered by one INNER_TYPE
     piece (> 1 when computing a word at a time).  */
  int delta = tree_low_cst (part_width, 1)
	      / tree_low_cst (TYPE_SIZE (TREE_TYPE (type)), 1);
  int i;

  v = VEC_alloc(constructor_elt, gc, (nunits + delta - 1) / delta);
  /* Advance DELTA elements and PART_WIDTH bits per iteration.  */
  for (i = 0; i < nunits;
       i += delta, index = int_const_binop (PLUS_EXPR, index, part_width, 0))
    {
      tree result = f (bsi, inner_type, a, b, index, part_width, code);
      constructor_elt *ce = VEC_quick_push (constructor_elt, v, NULL);
      ce->index = NULL_TREE;
      ce->value = result;
    }

  return build_constructor (type, v);
}
225 
/* Expand a vector operation to scalars with the freedom to use
   a scalar integer type, or to use a different size for the items
   in the vector type.  */
static tree
expand_vector_parallel (block_stmt_iterator *bsi, elem_op_func f, tree type,
			tree a, tree b,
			enum tree_code code)
{
  tree result, compute_type;
  enum machine_mode mode;
  int n_words = tree_low_cst (TYPE_SIZE_UNIT (type), 1) / UNITS_PER_WORD;

  /* We have three strategies.  If the type is already correct, just do
     the operation an element at a time.  Else, if the vector is wider than
     one word, do it a word at a time; finally, if the vector is smaller
     than one word, do it as a scalar.  */
  if (TYPE_MODE (TREE_TYPE (type)) == word_mode)
     return expand_vector_piecewise (bsi, f,
				     type, TREE_TYPE (type),
				     a, b, code);
  else if (n_words > 1)
    {
      /* Compute on an N_WORDS-unit vector of word-mode elements; the
	 CONSTRUCTOR returned by expand_vector_piecewise is gimplified
	 into a single SSA value of that word vector type.  */
      tree word_type = build_word_mode_vector_type (n_words);
      result = expand_vector_piecewise (bsi, f,
				        word_type, TREE_TYPE (word_type),
					a, b, code);
      result = gimplify_val (bsi, word_type, result);
    }
  else
    {
      /* Use a single scalar operation with a mode no wider than word_mode.  */
      mode = mode_for_size (tree_low_cst (TYPE_SIZE (type), 1), MODE_INT, 0);
      compute_type = lang_hooks.types.type_for_mode (mode, 1);
      result = f (bsi, compute_type, a, b, NULL_TREE, NULL_TREE, code);
    }

  return result;
}
264 
265 /* Expand a vector operation to scalars; for integer types we can use
266    special bit twiddling tricks to do the sums a word at a time, using
267    function F_PARALLEL instead of F.  These tricks are done only if
268    they can process at least four items, that is, only if the vector
269    holds at least four items and if a word can hold four items.  */
270 static tree
expand_vector_addition(block_stmt_iterator * bsi,elem_op_func f,elem_op_func f_parallel,tree type,tree a,tree b,enum tree_code code)271 expand_vector_addition (block_stmt_iterator *bsi,
272 			elem_op_func f, elem_op_func f_parallel,
273 			tree type, tree a, tree b, enum tree_code code)
274 {
275   int parts_per_word = UNITS_PER_WORD
276 	  	       / tree_low_cst (TYPE_SIZE_UNIT (TREE_TYPE (type)), 1);
277 
278   if (INTEGRAL_TYPE_P (TREE_TYPE (type))
279       && parts_per_word >= 4
280       && TYPE_VECTOR_SUBPARTS (type) >= 4)
281     return expand_vector_parallel (bsi, f_parallel,
282 				   type, a, b, code);
283   else
284     return expand_vector_piecewise (bsi, f,
285 				    type, TREE_TYPE (type),
286 				    a, b, code);
287 }
288 
/* Lower the vector operation CODE applied in RHS (of vector type TYPE)
   to operations on COMPUTE_TYPE, and return the replacement expression.
   When COMPUTE_TYPE is not a hardware vector mode, additions,
   subtractions, negations and bitwise operations on integer elements
   may be done several elements at a time within a word.  */
static tree
expand_vector_operation (block_stmt_iterator *bsi, tree type, tree compute_type,
			 tree rhs, enum tree_code code)
{
  enum machine_mode compute_mode = TYPE_MODE (compute_type);

  /* If the compute mode is not a vector mode (hence we are not decomposing
     a BLKmode vector to smaller, hardware-supported vectors), we may want
     to expand the operations in parallel.  */
  if (GET_MODE_CLASS (compute_mode) != MODE_VECTOR_INT
      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FLOAT)
    switch (code)
      {
      case PLUS_EXPR:
      case MINUS_EXPR:
	/* The word-at-a-time trick changes where overflow occurs, so
	   it is unusable when overflow traps.  */
        if (!TYPE_OVERFLOW_TRAPS (type))
          return expand_vector_addition (bsi, do_binop, do_plus_minus, type,
		      		         TREE_OPERAND (rhs, 0),
					 TREE_OPERAND (rhs, 1), code);
	break;

      case NEGATE_EXPR:
        if (!TYPE_OVERFLOW_TRAPS (type))
          return expand_vector_addition (bsi, do_unop, do_negate, type,
		      		         TREE_OPERAND (rhs, 0),
					 NULL_TREE, code);
	break;

      case BIT_AND_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
	/* Bitwise operations never cross element boundaries, so they
	   can always be done on whole words.  */
        return expand_vector_parallel (bsi, do_binop, type,
		      		       TREE_OPERAND (rhs, 0),
				       TREE_OPERAND (rhs, 1), code);

      case BIT_NOT_EXPR:
        return expand_vector_parallel (bsi, do_unop, type,
		      		       TREE_OPERAND (rhs, 0),
				       NULL_TREE, code);

      default:
	break;
      }

  /* Fall back to one COMPUTE_TYPE-sized piece at a time.  */
  if (TREE_CODE_CLASS (code) == tcc_unary)
    return expand_vector_piecewise (bsi, do_unop, type, compute_type,
				    TREE_OPERAND (rhs, 0),
				    NULL_TREE, code);
  else
    return expand_vector_piecewise (bsi, do_binop, type, compute_type,
				    TREE_OPERAND (rhs, 0),
				    TREE_OPERAND (rhs, 1), code);
}
342 
343 /* Return a type for the widest vector mode whose components are of mode
344    INNER_MODE, or NULL_TREE if none is found.  */
345 static tree
type_for_widest_vector_mode(enum machine_mode inner_mode,optab op)346 type_for_widest_vector_mode (enum machine_mode inner_mode, optab op)
347 {
348   enum machine_mode best_mode = VOIDmode, mode;
349   int best_nunits = 0;
350 
351   if (SCALAR_FLOAT_MODE_P (inner_mode))
352     mode = MIN_MODE_VECTOR_FLOAT;
353   else
354     mode = MIN_MODE_VECTOR_INT;
355 
356   for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
357     if (GET_MODE_INNER (mode) == inner_mode
358         && GET_MODE_NUNITS (mode) > best_nunits
359 	&& op->handlers[mode].insn_code != CODE_FOR_nothing)
360       best_mode = mode, best_nunits = GET_MODE_NUNITS (mode);
361 
362   if (best_mode == VOIDmode)
363     return NULL_TREE;
364   else
365     return lang_hooks.types.type_for_mode (best_mode, 1);
366 }
367 
/* Process one statement.  If we identify a vector operation, expand it.  */

static void
expand_vector_operations_1 (block_stmt_iterator *bsi)
{
  tree stmt = bsi_stmt (*bsi);
  tree *p_lhs, *p_rhs, lhs, rhs, type, compute_type;
  enum tree_code code;
  enum machine_mode compute_mode;
  optab op;

  /* Only assignments -- possibly wrapped in a RETURN_EXPR -- are of
     interest; everything else is left alone.  */
  switch (TREE_CODE (stmt))
    {
    case RETURN_EXPR:
      stmt = TREE_OPERAND (stmt, 0);
      if (!stmt || TREE_CODE (stmt) != MODIFY_EXPR)
	return;

      /* FALLTHRU */

    case MODIFY_EXPR:
      /* Keep pointers so the RHS can be replaced in place below.  */
      p_lhs = &TREE_OPERAND (stmt, 0);
      p_rhs = &TREE_OPERAND (stmt, 1);
      lhs = *p_lhs;
      rhs = *p_rhs;
      break;

    default:
      return;
    }

  type = TREE_TYPE (rhs);
  if (TREE_CODE (type) != VECTOR_TYPE)
    return;

  /* Only unary and binary operations are lowered here.  */
  code = TREE_CODE (rhs);
  if (TREE_CODE_CLASS (code) != tcc_unary
      && TREE_CODE_CLASS (code) != tcc_binary)
    return;

  if (code == NOP_EXPR || code == VIEW_CONVERT_EXPR)
    return;

  gcc_assert (code != CONVERT_EXPR);
  op = optab_for_tree_code (code, type);

  /* For widening vector operations, the relevant type is of the arguments,
     not the widened result.  */
  if (code == WIDEN_SUM_EXPR)
    type = TREE_TYPE (TREE_OPERAND (rhs, 0));

  /* Optabs will try converting a negation into a subtraction, so
     look for it as well.  TODO: negation of floating-point vectors
     might be turned into an exclusive OR toggling the sign bit.  */
  if (op == NULL
      && code == NEGATE_EXPR
      && INTEGRAL_TYPE_P (TREE_TYPE (type)))
    op = optab_for_tree_code (MINUS_EXPR, type);

  /* For very wide vectors, try using a smaller vector mode.  */
  compute_type = type;
  if (TYPE_MODE (type) == BLKmode && op)
    {
      tree vector_compute_type
        = type_for_widest_vector_mode (TYPE_MODE (TREE_TYPE (type)), op);
      if (vector_compute_type != NULL_TREE)
        compute_type = vector_compute_type;
    }

  /* If we are breaking a BLKmode vector into smaller pieces,
     type_for_widest_vector_mode has already looked into the optab,
     so skip these checks.  */
  if (compute_type == type)
    {
      compute_mode = TYPE_MODE (compute_type);
      if ((GET_MODE_CLASS (compute_mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (compute_mode) == MODE_VECTOR_FLOAT)
          && op != NULL
	  && op->handlers[compute_mode].insn_code != CODE_FOR_nothing)
	/* The hardware handles this vector operation directly.  */
	return;
      else
	/* There is no operation in hardware, so fall back to scalars.  */
	compute_type = TREE_TYPE (type);
    }

  gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
  rhs = expand_vector_operation (bsi, type, compute_type, rhs, code);
  /* If the expansion computed the value in a different type (e.g. in
     word-sized chunks), reinterpret the bits as the lhs's type.  */
  if (lang_hooks.types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (rhs)))
    *p_rhs = rhs;
  else
    *p_rhs = gimplify_build1 (bsi, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), rhs);

  mark_stmt_modified (bsi_stmt (*bsi));
}
462 
/* Use this to lower vector operations introduced by the vectorizer,
   if it may need the bit-twiddling tricks implemented in this file.  */

/* Gate for the veclower2 pass: run only when vectorization is on.  */
static bool
gate_expand_vector_operations (void)
{
  return flag_tree_vectorize != 0;
}
471 
472 static unsigned int
expand_vector_operations(void)473 expand_vector_operations (void)
474 {
475   block_stmt_iterator bsi;
476   basic_block bb;
477 
478   FOR_EACH_BB (bb)
479     {
480       for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
481 	{
482 	  expand_vector_operations_1 (&bsi);
483 	  update_stmt_if_modified (bsi_stmt (bsi));
484 	}
485     }
486   return 0;
487 }
488 
/* Pass descriptor for unconditional vector lowering (NULL gate):
   lowers all vector operations left after gimplification.  */
struct tree_opt_pass pass_lower_vector =
{
  "veclower",				/* name */
  0,					/* gate */
  expand_vector_operations,		/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  PROP_cfg,				/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  TODO_dump_func | TODO_ggc_collect
    | TODO_verify_stmts,		/* todo_flags_finish */
  0					/* letter */
};
506 
/* Pass descriptor for the SSA-form variant, gated on the vectorizer
   being enabled; also updates and verifies SSA form afterwards.  */
struct tree_opt_pass pass_lower_vector_ssa =
{
  "veclower2",				/* name */
  gate_expand_vector_operations,	/* gate */
  expand_vector_operations,		/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  PROP_cfg,				/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  TODO_dump_func | TODO_update_ssa	/* todo_flags_finish */
    | TODO_verify_ssa
    | TODO_verify_stmts | TODO_verify_flow,
  0					/* letter */
};
525 
526 #include "gt-tree-vect-generic.h"
527