1 /* GIMPLE store merging and byte swapping passes.
2    Copyright (C) 2009-2018 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 /* The purpose of the store merging pass is to combine multiple memory
22    stores of constant values, values loaded from memory or bitwise operations
23    on those to consecutive memory locations into fewer wider stores.
24    For example, if we have a sequence performing four byte stores to
25    consecutive memory locations:
26    [p     ] := imm1;
27    [p + 1B] := imm2;
28    [p + 2B] := imm3;
29    [p + 3B] := imm4;
30    we can transform this into a single 4-byte store if the target supports it:
31   [p] := imm1:imm2:imm3:imm4 //concatenated immediates according to endianness.
32 
33    Or:
34    [p     ] := [q     ];
35    [p + 1B] := [q + 1B];
36    [p + 2B] := [q + 2B];
37    [p + 3B] := [q + 3B];
38    if there is no overlap, this can be transformed into a single 4-byte
39    load followed by a single 4-byte store.
40 
41    Or:
42    [p     ] := [q     ] ^ imm1;
43    [p + 1B] := [q + 1B] ^ imm2;
44    [p + 2B] := [q + 2B] ^ imm3;
45    [p + 3B] := [q + 3B] ^ imm4;
46    if there is no overlap, this can be transformed into a single 4-byte
47    load, XORed with imm1:imm2:imm3:imm4 and stored using a single 4-byte store.
48 
49    The algorithm is applied to each basic block in three phases:
50 
51    1) Scan through the basic block recording assignments to
52    destinations that can be expressed as a store to memory of a certain size
53    at a certain bit offset from expressions we can handle.  For bit-fields
54    we also note the surrounding bit region, bits that could be stored in
55    a read-modify-write operation when storing the bit-field.  Record store
56    chains to different bases in a hash_map (m_stores) and make sure to
57    terminate such chains when appropriate (for example when the stored
58    values get used subsequently).
59    These stores can be a result of structure element initializers, array stores
60    etc.  A store_immediate_info object is recorded for every such store.
61    Record as many such assignments to a single base as possible until a
62    statement that interferes with the store sequence is encountered.
63    Each store has up to 2 operands, which can be an immediate constant
64    or a memory load, from which the value to be stored can be computed.
65    At most one of the operands can be a constant.  The operands are recorded
66    in store_operand_info struct.
67 
68    2) Analyze the chain of stores recorded in phase 1) (i.e. the vector of
69    store_immediate_info objects) and coalesce contiguous stores into
70    merged_store_group objects.  For bit-fields stores, we don't need to
71    require the stores to be contiguous, just their surrounding bit regions
72    have to be contiguous.  If the expression being stored is different
73    between adjacent stores, such as one store storing a constant and
74    the following one storing a value loaded from memory, or if the loaded
75    memory objects are not adjacent, a new merged_store_group is created as well.
76 
77    For example, given the stores:
78    [p     ] := 0;
79    [p + 1B] := 1;
80    [p + 3B] := 0;
81    [p + 4B] := 1;
82    [p + 5B] := 0;
83    [p + 6B] := 0;
84    This phase would produce two merged_store_group objects, one recording the
85    two bytes stored in the memory region [p : p + 1] and another
86    recording the four bytes stored in the memory region [p + 3 : p + 6].
87 
88    3) The merged_store_group objects produced in phase 2) are processed
89    to generate the sequence of wider stores that set the contiguous memory
90    regions to the sequence of bytes that they should contain.  This may emit
91    multiple stores per store group to handle contiguous stores that are not
92    of a size that is a power of 2.  For example it can try to emit a 40-bit
93    store as a 32-bit store followed by an 8-bit store.
94    We try to emit stores as wide as we can while respecting STRICT_ALIGNMENT or
95    TARGET_SLOW_UNALIGNED_ACCESS rules.
96 
97    Note on endianness and example:
98    Consider 2 contiguous 16-bit stores followed by 2 contiguous 8-bit stores:
99    [p     ] := 0x1234;
100    [p + 2B] := 0x5678;
101    [p + 4B] := 0xab;
102    [p + 5B] := 0xcd;
103 
104    The memory layout for little-endian (LE) and big-endian (BE) must be:
105   p |LE|BE|
106   ---------
107   0 |34|12|
108   1 |12|34|
109   2 |78|56|
110   3 |56|78|
111   4 |ab|ab|
112   5 |cd|cd|
113 
114   To merge these into a single 48-bit merged value 'val' in phase 2)
115   on little-endian we insert stores to higher (consecutive) bitpositions
116   into the most significant bits of the merged value.
117   The final merged value would be: 0xcdab56781234
118 
119   For big-endian we insert stores to higher bitpositions into the least
120   significant bits of the merged value.
121   The final merged value would be: 0x12345678abcd
122 
123   Then, in phase 3), we want to emit this 48-bit value as a 32-bit store
124   followed by a 16-bit store.  Again, we must consider endianness when
125   breaking down the 48-bit value 'val' computed above.
126   For little endian we emit:
127   [p]      (32-bit) := 0x56781234; // val & 0x0000ffffffff;
128   [p + 4B] (16-bit) := 0xcdab;    // (val & 0xffff00000000) >> 32;
129 
130   Whereas for big-endian we emit:
131   [p]      (32-bit) := 0x12345678; // (val & 0xffffffff0000) >> 16;
132   [p + 4B] (16-bit) := 0xabcd;     //  val & 0x00000000ffff;  */
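
/* As an illustrative example (the type and function names below are made up
   for exposition and do not appear elsewhere in this file), source code such
   as:

     struct s { unsigned char a, b, c, d; };

     void
     init (struct s *p)
     {
       p->a = 1;
       p->b = 2;
       p->c = 3;
       p->d = 4;
     }

   contains four adjacent single-byte constant stores that this pass can
   typically merge into one 4-byte store of the endian-dependent constant
   formed from 1, 2, 3 and 4, subject to the alignment rules described
   above.  */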
133 
134 #include "config.h"
135 #include "system.h"
136 #include "coretypes.h"
137 #include "backend.h"
138 #include "tree.h"
139 #include "gimple.h"
140 #include "builtins.h"
141 #include "fold-const.h"
142 #include "tree-pass.h"
143 #include "ssa.h"
144 #include "gimple-pretty-print.h"
145 #include "alias.h"
146 #include "fold-const.h"
147 #include "params.h"
148 #include "print-tree.h"
149 #include "tree-hash-traits.h"
150 #include "gimple-iterator.h"
151 #include "gimplify.h"
152 #include "stor-layout.h"
153 #include "timevar.h"
154 #include "tree-cfg.h"
155 #include "tree-eh.h"
156 #include "target.h"
157 #include "gimplify-me.h"
158 #include "rtl.h"
159 #include "expr.h"	/* For get_bit_range.  */
160 #include "optabs-tree.h"
161 #include "selftest.h"
162 
163 /* The maximum size (in bits) of the stores this pass should generate.  */
164 #define MAX_STORE_BITSIZE (BITS_PER_WORD)
165 #define MAX_STORE_BYTES (MAX_STORE_BITSIZE / BITS_PER_UNIT)
166 
167 /* Limit to bound the number of aliasing checks for loads with the same
168    vuse as the corresponding store.  */
169 #define MAX_STORE_ALIAS_CHECKS 64
170 
171 namespace {
172 
173 struct bswap_stat
174 {
175   /* Number of hand-written 16-bit nop / bswaps found.  */
176   int found_16bit;
177 
178   /* Number of hand-written 32-bit nop / bswaps found.  */
179   int found_32bit;
180 
181   /* Number of hand-written 64-bit nop / bswaps found.  */
182   int found_64bit;
183 } nop_stats, bswap_stats;
184 
185 /* A symbolic number structure is used to detect byte permutation and selection
186    patterns of a source.  To achieve that, its field N contains an artificial
187    number consisting of BITS_PER_MARKER sized markers tracking where each
188    byte comes from in the source:
189 
190    0	   - target byte has the value 0
191    FF	   - target byte has an unknown value (eg. due to sign extension)
192    1..size - marker value is the byte index in the source (0 for lsb).
193 
194    To detect permutations on memory sources (arrays and structures), a symbolic
195    number is also associated:
196    - a base address BASE_ADDR and an OFFSET giving the address of the source;
197    - a range which gives the difference between the highest and lowest accessed
198      memory location to make such a symbolic number;
199    - the address SRC of the source element of lowest address as a convenience
200      to easily get BASE_ADDR + offset + lowest bytepos;
201    - the number N_OPS of expressions bitwise ORed together, representing the
202      approximate cost of the computation.
203 
204    Note 1: the range is different from size as size reflects the size of the
205    type of the current expression.  For instance, for an array char a[],
206    (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
207    (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
208    time a range of 1.
209 
210    Note 2: for non-memory sources, range holds the same value as size.
211 
212    Note 3: SRC points to the SSA_NAME in case of non-memory source.  */
213 
214 struct symbolic_number {
215   uint64_t n;
216   tree type;
217   tree base_addr;
218   tree offset;
219   poly_int64_pod bytepos;
220   tree src;
221   tree alias_set;
222   tree vuse;
223   unsigned HOST_WIDE_INT range;
224   int n_ops;
225 };
226 
227 #define BITS_PER_MARKER 8
228 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
229 #define MARKER_BYTE_UNKNOWN MARKER_MASK
230 #define HEAD_MARKER(n, size) \
231   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
232 
233 /* The number which the find_bswap_or_nop_1 result should match in
234    order to have a nop.  The number is masked according to the size of
235    the symbolic number before using it.  */
236 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
237   (uint64_t)0x08070605 << 32 | 0x04030201)
238 
239 /* The number which the find_bswap_or_nop_1 result should match in
240    order to have a byte swap.  The number is masked according to the
241    size of the symbolic number before using it.  */
242 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
243   (uint64_t)0x01020304 << 32 | 0x05060708)
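
/* As a worked example (illustrative values): restricted to a 4-byte symbolic
   number, the two constants above become 0x04030201 (CMPNOP) and 0x01020304
   (CMPXCHG).  A freshly initialized 32-bit source therefore starts out as
   0x04030201; after matching, say, "x >> 8" on an unsigned source the markers
   become 0x00040302 (the top byte of the result is known to be zero), while
   on a signed source they become 0xff040302 because the sign-extended byte is
   unknown.  A sequence whose markers end up as exactly 0x01020304 is a byte
   swap.  */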
244 
245 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
246    number N.  Return false if the requested operation is not permitted
247    on a symbolic number.  */
248 
249 inline bool
250 do_shift_rotate (enum tree_code code,
251 		 struct symbolic_number *n,
252 		 int count)
253 {
254   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
255   unsigned head_marker;
256 
257   if (count % BITS_PER_UNIT != 0)
258     return false;
259   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
260 
261   /* Zero out the extra bits of N in order to avoid them being shifted
262      into the significant bits.  */
263   if (size < 64 / BITS_PER_MARKER)
264     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
265 
266   switch (code)
267     {
268     case LSHIFT_EXPR:
269       n->n <<= count;
270       break;
271     case RSHIFT_EXPR:
272       head_marker = HEAD_MARKER (n->n, size);
273       n->n >>= count;
274       /* Arithmetic shift of signed type: result is dependent on the value.  */
275       if (!TYPE_UNSIGNED (n->type) && head_marker)
276 	for (i = 0; i < count / BITS_PER_MARKER; i++)
277 	  n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
278 		  << ((size - 1 - i) * BITS_PER_MARKER);
279       break;
280     case LROTATE_EXPR:
281       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
282       break;
283     case RROTATE_EXPR:
284       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
285       break;
286     default:
287       return false;
288     }
289   /* Zero unused bits for size.  */
290   if (size < 64 / BITS_PER_MARKER)
291     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
292   return true;
293 }
294 
295 /* Perform sanity checking for the symbolic number N and the gimple
296    statement STMT.  */
297 
298 inline bool
299 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
300 {
301   tree lhs_type;
302 
303   lhs_type = gimple_expr_type (stmt);
304 
305   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
306     return false;
307 
308   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
309     return false;
310 
311   return true;
312 }
313 
314 /* Initialize the symbolic number N for the bswap pass from the base element
315    SRC manipulated by the bitwise OR expression.  */
316 
317 bool
318 init_symbolic_number (struct symbolic_number *n, tree src)
319 {
320   int size;
321 
322   if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
323     return false;
324 
325   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
326   n->src = src;
327 
328   /* Set up the symbolic number N by setting each byte to a value between 1 and
329      the byte size of SRC.  The highest order byte is set to that size and the
330      lowest order byte to 1.  */
331   n->type = TREE_TYPE (src);
332   size = TYPE_PRECISION (n->type);
333   if (size % BITS_PER_UNIT != 0)
334     return false;
335   size /= BITS_PER_UNIT;
336   if (size > 64 / BITS_PER_MARKER)
337     return false;
338   n->range = size;
339   n->n = CMPNOP;
340   n->n_ops = 1;
341 
342   if (size < 64 / BITS_PER_MARKER)
343     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
344 
345   return true;
346 }
347 
348 /* Check if STMT might be a byte swap or a nop from a memory source and return
349    the answer.  If so, REF is that memory source and the base of the memory area
350    accessed and the offset of the access from that base are recorded in N.  */
351 
352 bool
353 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
354 {
355   /* Leaf node is an array or component ref. Memorize its base and
356      offset from base to compare to other such leaf node.  */
357   poly_int64 bitsize, bitpos, bytepos;
358   machine_mode mode;
359   int unsignedp, reversep, volatilep;
360   tree offset, base_addr;
361 
362   /* Not prepared to handle PDP endian.  */
363   if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
364     return false;
365 
366   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
367     return false;
368 
369   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
370 				   &unsignedp, &reversep, &volatilep);
371 
372   if (TREE_CODE (base_addr) == TARGET_MEM_REF)
373     /* Do not rewrite TARGET_MEM_REF.  */
374     return false;
375   else if (TREE_CODE (base_addr) == MEM_REF)
376     {
377       poly_offset_int bit_offset = 0;
378       tree off = TREE_OPERAND (base_addr, 1);
379 
380       if (!integer_zerop (off))
381 	{
382 	  poly_offset_int boff = mem_ref_offset (base_addr);
383 	  boff <<= LOG2_BITS_PER_UNIT;
384 	  bit_offset += boff;
385 	}
386 
387       base_addr = TREE_OPERAND (base_addr, 0);
388 
389       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
390       if (maybe_lt (bit_offset, 0))
391 	{
392 	  tree byte_offset = wide_int_to_tree
393 	    (sizetype, bits_to_bytes_round_down (bit_offset));
394 	  bit_offset = num_trailing_bits (bit_offset);
395 	  if (offset)
396 	    offset = size_binop (PLUS_EXPR, offset, byte_offset);
397 	  else
398 	    offset = byte_offset;
399 	}
400 
401       bitpos += bit_offset.force_shwi ();
402     }
403   else
404     base_addr = build_fold_addr_expr (base_addr);
405 
406   if (!multiple_p (bitpos, BITS_PER_UNIT, &bytepos))
407     return false;
408   if (!multiple_p (bitsize, BITS_PER_UNIT))
409     return false;
410   if (reversep)
411     return false;
412 
413   if (!init_symbolic_number (n, ref))
414     return false;
415   n->base_addr = base_addr;
416   n->offset = offset;
417   n->bytepos = bytepos;
418   n->alias_set = reference_alias_ptr_type (ref);
419   n->vuse = gimple_vuse (stmt);
420   return true;
421 }
422 
423 /* Compute the symbolic number N representing the result of a bitwise OR on 2
424    symbolic number N1 and N2 whose source statements are respectively
425    SOURCE_STMT1 and SOURCE_STMT2.  */
426 
427 gimple *
428 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
429 			gimple *source_stmt2, struct symbolic_number *n2,
430 			struct symbolic_number *n)
431 {
432   int i, size;
433   uint64_t mask;
434   gimple *source_stmt;
435   struct symbolic_number *n_start;
436 
437   tree rhs1 = gimple_assign_rhs1 (source_stmt1);
438   if (TREE_CODE (rhs1) == BIT_FIELD_REF
439       && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
440     rhs1 = TREE_OPERAND (rhs1, 0);
441   tree rhs2 = gimple_assign_rhs1 (source_stmt2);
442   if (TREE_CODE (rhs2) == BIT_FIELD_REF
443       && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
444     rhs2 = TREE_OPERAND (rhs2, 0);
445 
446   /* Sources are different, cancel bswap if they are not memory locations with
447      the same base (array, structure, ...).  */
448   if (rhs1 != rhs2)
449     {
450       uint64_t inc;
451       HOST_WIDE_INT start1, start2, start_sub, end_sub, end1, end2, end;
452       struct symbolic_number *toinc_n_ptr, *n_end;
453       basic_block bb1, bb2;
454 
455       if (!n1->base_addr || !n2->base_addr
456 	  || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
457 	return NULL;
458 
459       if (!n1->offset != !n2->offset
460 	  || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
461 	return NULL;
462 
463       start1 = 0;
464       if (!(n2->bytepos - n1->bytepos).is_constant (&start2))
465 	return NULL;
466 
467       if (start1 < start2)
468 	{
469 	  n_start = n1;
470 	  start_sub = start2 - start1;
471 	}
472       else
473 	{
474 	  n_start = n2;
475 	  start_sub = start1 - start2;
476 	}
477 
478       bb1 = gimple_bb (source_stmt1);
479       bb2 = gimple_bb (source_stmt2);
480       if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
481 	source_stmt = source_stmt1;
482       else
483 	source_stmt = source_stmt2;
484 
485       /* Find the highest address at which a load is performed and
486 	 compute related info.  */
487       end1 = start1 + (n1->range - 1);
488       end2 = start2 + (n2->range - 1);
489       if (end1 < end2)
490 	{
491 	  end = end2;
492 	  end_sub = end2 - end1;
493 	}
494       else
495 	{
496 	  end = end1;
497 	  end_sub = end1 - end2;
498 	}
499       n_end = (end2 > end1) ? n2 : n1;
500 
501       /* Find symbolic number whose lsb is the most significant.  */
502       if (BYTES_BIG_ENDIAN)
503 	toinc_n_ptr = (n_end == n1) ? n2 : n1;
504       else
505 	toinc_n_ptr = (n_start == n1) ? n2 : n1;
506 
507       n->range = end - MIN (start1, start2) + 1;
508 
509       /* Check that the range of memory covered can be represented by
510 	 a symbolic number.  */
511       if (n->range > 64 / BITS_PER_MARKER)
512 	return NULL;
513 
514       /* Reinterpret byte marks in symbolic number holding the value of
515 	 bigger weight according to target endianness.  */
516       inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
517       size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
518       for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
519 	{
520 	  unsigned marker
521 	    = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
522 	  if (marker && marker != MARKER_BYTE_UNKNOWN)
523 	    toinc_n_ptr->n += inc;
524 	}
525     }
526   else
527     {
528       n->range = n1->range;
529       n_start = n1;
530       source_stmt = source_stmt1;
531     }
532 
533   if (!n1->alias_set
534       || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
535     n->alias_set = n1->alias_set;
536   else
537     n->alias_set = ptr_type_node;
538   n->vuse = n_start->vuse;
539   n->base_addr = n_start->base_addr;
540   n->offset = n_start->offset;
541   n->src = n_start->src;
542   n->bytepos = n_start->bytepos;
543   n->type = n_start->type;
544   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
545 
546   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
547     {
548       uint64_t masked1, masked2;
549 
550       masked1 = n1->n & mask;
551       masked2 = n2->n & mask;
552       if (masked1 && masked2 && masked1 != masked2)
553 	return NULL;
554     }
555   n->n = n1->n | n2->n;
556   n->n_ops = n1->n_ops + n2->n_ops;
557 
558   return source_stmt;
559 }
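
/* As a small worked example of the merge above (illustrative only), take a
   little-endian target and the expression a[0] | (a[1] << 8) on an unsigned
   char array A.  The symbolic number for the load of a[0] is 0x01 at bytepos
   0, and the one for a[1], after the shift, is 0x0100 at bytepos 1.  Since
   the second load starts one byte higher, its non-zero markers are
   incremented by 1, giving 0x0200, and the OR of the two numbers is 0x0201
   with a range of 2, i.e. exactly CMPNOP masked to two bytes: the whole
   expression is a native 16-bit load.  */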
560 
561 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
562    the operation given by the rhs of STMT on the result.  If the operation
563    can be successfully executed, the function returns a gimple stmt whose
564    rhs's first tree is the expression of the source operand, and NULL
565    otherwise.  */
566 
567 gimple *
568 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
569 {
570   enum tree_code code;
571   tree rhs1, rhs2 = NULL;
572   gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
573   enum gimple_rhs_class rhs_class;
574 
575   if (!limit || !is_gimple_assign (stmt))
576     return NULL;
577 
578   rhs1 = gimple_assign_rhs1 (stmt);
579 
580   if (find_bswap_or_nop_load (stmt, rhs1, n))
581     return stmt;
582 
583   /* Handle BIT_FIELD_REF.  */
584   if (TREE_CODE (rhs1) == BIT_FIELD_REF
585       && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
586     {
587       unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
588       unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
589       if (bitpos % BITS_PER_UNIT == 0
590 	  && bitsize % BITS_PER_UNIT == 0
591 	  && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
592 	{
593 	  /* Handle big-endian bit numbering in BIT_FIELD_REF.  */
594 	  if (BYTES_BIG_ENDIAN)
595 	    bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
596 
597 	  /* Shift.  */
598 	  if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
599 	    return NULL;
600 
601 	  /* Mask.  */
602 	  uint64_t mask = 0;
603 	  uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
604 	  for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
605 	       i++, tmp <<= BITS_PER_UNIT)
606 	    mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
607 	  n->n &= mask;
608 
609 	  /* Convert.  */
610 	  n->type = TREE_TYPE (rhs1);
611 	  if (!n->base_addr)
612 	    n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
613 
614 	  return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
615 	}
616 
617       return NULL;
618     }
619 
620   if (TREE_CODE (rhs1) != SSA_NAME)
621     return NULL;
622 
623   code = gimple_assign_rhs_code (stmt);
624   rhs_class = gimple_assign_rhs_class (stmt);
625   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
626 
627   if (rhs_class == GIMPLE_BINARY_RHS)
628     rhs2 = gimple_assign_rhs2 (stmt);
629 
630   /* Handle unary rhs and binary rhs with integer constants as second
631      operand.  */
632 
633   if (rhs_class == GIMPLE_UNARY_RHS
634       || (rhs_class == GIMPLE_BINARY_RHS
635 	  && TREE_CODE (rhs2) == INTEGER_CST))
636     {
637       if (code != BIT_AND_EXPR
638 	  && code != LSHIFT_EXPR
639 	  && code != RSHIFT_EXPR
640 	  && code != LROTATE_EXPR
641 	  && code != RROTATE_EXPR
642 	  && !CONVERT_EXPR_CODE_P (code))
643 	return NULL;
644 
645       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
646 
647       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
648 	 we have to initialize the symbolic number.  */
649       if (!source_stmt1)
650 	{
651 	  if (gimple_assign_load_p (stmt)
652 	      || !init_symbolic_number (n, rhs1))
653 	    return NULL;
654 	  source_stmt1 = stmt;
655 	}
656 
657       switch (code)
658 	{
659 	case BIT_AND_EXPR:
660 	  {
661 	    int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
662 	    uint64_t val = int_cst_value (rhs2), mask = 0;
663 	    uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
664 
665 	    /* Only constants masking full bytes are allowed.  */
666 	    for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
667 	      if ((val & tmp) != 0 && (val & tmp) != tmp)
668 		return NULL;
669 	      else if (val & tmp)
670 		mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
671 
672 	    n->n &= mask;
673 	  }
674 	  break;
675 	case LSHIFT_EXPR:
676 	case RSHIFT_EXPR:
677 	case LROTATE_EXPR:
678 	case RROTATE_EXPR:
679 	  if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
680 	    return NULL;
681 	  break;
682 	CASE_CONVERT:
683 	  {
684 	    int i, type_size, old_type_size;
685 	    tree type;
686 
687 	    type = gimple_expr_type (stmt);
688 	    type_size = TYPE_PRECISION (type);
689 	    if (type_size % BITS_PER_UNIT != 0)
690 	      return NULL;
691 	    type_size /= BITS_PER_UNIT;
692 	    if (type_size > 64 / BITS_PER_MARKER)
693 	      return NULL;
694 
695 	    /* Sign extension: result is dependent on the value.  */
696 	    old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
697 	    if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
698 		&& HEAD_MARKER (n->n, old_type_size))
699 	      for (i = 0; i < type_size - old_type_size; i++)
700 		n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
701 			<< ((type_size - 1 - i) * BITS_PER_MARKER);
702 
703 	    if (type_size < 64 / BITS_PER_MARKER)
704 	      {
705 		/* If STMT casts to a smaller type mask out the bits not
706 		   belonging to the target type.  */
707 		n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
708 	      }
709 	    n->type = type;
710 	    if (!n->base_addr)
711 	      n->range = type_size;
712 	  }
713 	  break;
714 	default:
715 	  return NULL;
716 	};
717       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
718     }
719 
720   /* Handle binary rhs.  */
721 
722   if (rhs_class == GIMPLE_BINARY_RHS)
723     {
724       struct symbolic_number n1, n2;
725       gimple *source_stmt, *source_stmt2;
726 
727       if (code != BIT_IOR_EXPR)
728 	return NULL;
729 
730       if (TREE_CODE (rhs2) != SSA_NAME)
731 	return NULL;
732 
733       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
734 
735       switch (code)
736 	{
737 	case BIT_IOR_EXPR:
738 	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
739 
740 	  if (!source_stmt1)
741 	    return NULL;
742 
743 	  source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
744 
745 	  if (!source_stmt2)
746 	    return NULL;
747 
748 	  if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
749 	    return NULL;
750 
751 	  if (n1.vuse != n2.vuse)
752 	    return NULL;
753 
754 	  source_stmt
755 	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
756 
757 	  if (!source_stmt)
758 	    return NULL;
759 
760 	  if (!verify_symbolic_number_p (n, stmt))
761 	    return NULL;
762 
763 	  break;
764 	default:
765 	  return NULL;
766 	}
767       return source_stmt;
768     }
769   return NULL;
770 }
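
/* For instance (an illustrative value, not a special case in the code), when
   the walk above reaches "x & 0xff00ff00" on a 32-bit symbolic number
   0x04030201, only bytes 1 and 3 are fully covered by the constant, so the
   markers become 0x04000200; a constant such as 0xf0 that covers only part
   of a byte makes the walk give up.  */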
771 
772 /* Helper for find_bswap_or_nop and try_coalesce_bswap to compute
773    *CMPXCHG, *CMPNOP and adjust *N.  */
774 
775 void
776 find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
777 			    uint64_t *cmpnop)
778 {
779   unsigned rsize;
780   uint64_t tmpn, mask;
781 
782   /* The number which the find_bswap_or_nop_1 result should match in order
783      to have a full byte swap.  The number is shifted to the right
784      according to the size of the symbolic number before using it.  */
785   *cmpxchg = CMPXCHG;
786   *cmpnop = CMPNOP;
787 
788   /* Find real size of result (highest non-zero byte).  */
789   if (n->base_addr)
790     for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
791   else
792     rsize = n->range;
793 
794   /* Zero out the bits corresponding to untouched bytes in original gimple
795      expression.  */
796   if (n->range < (int) sizeof (int64_t))
797     {
798       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
799       *cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
800       *cmpnop &= mask;
801     }
802 
803   /* Zero out the bits corresponding to unused bytes in the result of the
804      gimple expression.  */
805   if (rsize < n->range)
806     {
807       if (BYTES_BIG_ENDIAN)
808 	{
809 	  mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
810 	  *cmpxchg &= mask;
811 	  *cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
812 	}
813       else
814 	{
815 	  mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
816 	  *cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
817 	  *cmpnop &= mask;
818 	}
819       n->range = rsize;
820     }
821 
822   n->range *= BITS_PER_UNIT;
823 }
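
/* A short worked example (illustrative): for a 2-byte symbolic number with
   all bytes present, the function above produces *CMPNOP == 0x0201 (the low
   two markers of CMPNOP) and *CMPXCHG == 0x0102 (CMPXCHG shifted right by six
   marker positions), and N->range becomes 16 bits.  */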
824 
825 /* Check if STMT completes a bswap implementation or a read in a given
826    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
827    accordingly.  It also sets N to represent the kind of operations
828    performed: size of the resulting expression and whether it works on
829    a memory source, and if so its alias set and vuse.  Finally, the
830    function returns a stmt whose rhs's first tree is the source
831    expression.  */
832 
833 gimple *
834 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
835 {
836   /* The last parameter determines the depth search limit.  It usually
837      correlates directly to the number n of bytes to be touched.  We
838      increase that number by log2(n) + 1 here in order to also
839      cover signed -> unsigned conversions of the src operand as can be seen
840      in libgcc, and for initial shift/and operation of the src operand.  */
841   int limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
842   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
843   gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
844 
845   if (!ins_stmt)
846     return NULL;
847 
848   uint64_t cmpxchg, cmpnop;
849   find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop);
850 
851   /* A complete byte swap should make the symbolic number start with
852      the largest digit in the highest order byte.  An unchanged symbolic
853      number indicates a read with the same endianness as the target architecture.  */
854   if (n->n == cmpnop)
855     *bswap = false;
856   else if (n->n == cmpxchg)
857     *bswap = true;
858   else
859     return NULL;
860 
861   /* Useless bit manipulation performed by code.  */
862   if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
863     return NULL;
864 
865   return ins_stmt;
866 }
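
/* As an example of what the detection above is aiming at (the function name
   swap32 is hypothetical), a hand-written byte swap such as

     uint32_t
     swap32 (uint32_t x)
     {
       return (x >> 24)
              | ((x >> 8) & 0x0000ff00)
              | ((x << 8) & 0x00ff0000)
              | (x << 24);
     }

   is matched here with *BSWAP set to true, whereas an expression that merely
   reassembles single-byte loads in target byte order is matched with *BSWAP
   set to false and later becomes a plain wider load.  */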
867 
868 const pass_data pass_data_optimize_bswap =
869 {
870   GIMPLE_PASS, /* type */
871   "bswap", /* name */
872   OPTGROUP_NONE, /* optinfo_flags */
873   TV_NONE, /* tv_id */
874   PROP_ssa, /* properties_required */
875   0, /* properties_provided */
876   0, /* properties_destroyed */
877   0, /* todo_flags_start */
878   0, /* todo_flags_finish */
879 };
880 
881 class pass_optimize_bswap : public gimple_opt_pass
882 {
883 public:
884   pass_optimize_bswap (gcc::context *ctxt)
885     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
886   {}
887 
888   /* opt_pass methods: */
889   virtual bool gate (function *)
890     {
891       return flag_expensive_optimizations && optimize && BITS_PER_UNIT == 8;
892     }
893 
894   virtual unsigned int execute (function *);
895 
896 }; // class pass_optimize_bswap
897 
898 /* Perform the bswap optimization: replace the expression computed in the rhs
899    of gsi_stmt (GSI) (or if NULL add instead of replace) by an equivalent
900    bswap, load or load + bswap expression.
901    Which of these alternatives replaces the rhs is given by N->base_addr
902    (non-null if a load is needed) and BSWAP.  The type, VUSE and alias set
903    of the load to perform are also given in N while the builtin bswap
904    invocation is given in FNDECL.  Finally, if a load is involved, INS_STMT
905    refers to one of the load statements involved to construct the rhs in
906    gsi_stmt (GSI) and N->range gives the size of the rhs expression for
907    maintaining some statistics.
908 
909    Note that if the replacement involves a load and if gsi_stmt (GSI) is
910    non-NULL, that stmt is moved just after INS_STMT to do the load with the
911    same VUSE, which can lead to gsi_stmt (GSI) changing basic block.  */
912 
913 tree
914 bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
915 	       tree bswap_type, tree load_type, struct symbolic_number *n,
916 	       bool bswap)
917 {
918   tree src, tmp, tgt = NULL_TREE;
919   gimple *bswap_stmt;
920 
921   gimple *cur_stmt = gsi_stmt (gsi);
922   src = n->src;
923   if (cur_stmt)
924     tgt = gimple_assign_lhs (cur_stmt);
925 
926   /* Need to load the value from memory first.  */
927   if (n->base_addr)
928     {
929       gimple_stmt_iterator gsi_ins = gsi;
930       if (ins_stmt)
931 	gsi_ins = gsi_for_stmt (ins_stmt);
932       tree addr_expr, addr_tmp, val_expr, val_tmp;
933       tree load_offset_ptr, aligned_load_type;
934       gimple *load_stmt;
935       unsigned align = get_object_alignment (src);
936       poly_int64 load_offset = 0;
937 
938       if (cur_stmt)
939 	{
940 	  basic_block ins_bb = gimple_bb (ins_stmt);
941 	  basic_block cur_bb = gimple_bb (cur_stmt);
942 	  if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
943 	    return NULL_TREE;
944 
945 	  /* Move cur_stmt just before one of the load of the original
946 	     to ensure it has the same VUSE.  See PR61517 for what could
947 	     go wrong.  */
948 	  if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
949 	    reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
950 	  gsi_move_before (&gsi, &gsi_ins);
951 	  gsi = gsi_for_stmt (cur_stmt);
952 	}
953       else
954 	gsi = gsi_ins;
955 
956       /* Compute address to load from and cast according to the size
957 	 of the load.  */
958       addr_expr = build_fold_addr_expr (src);
959       if (is_gimple_mem_ref_addr (addr_expr))
960 	addr_tmp = unshare_expr (addr_expr);
961       else
962 	{
963 	  addr_tmp = unshare_expr (n->base_addr);
964 	  if (!is_gimple_mem_ref_addr (addr_tmp))
965 	    addr_tmp = force_gimple_operand_gsi_1 (&gsi, addr_tmp,
966 						   is_gimple_mem_ref_addr,
967 						   NULL_TREE, true,
968 						   GSI_SAME_STMT);
969 	  load_offset = n->bytepos;
970 	  if (n->offset)
971 	    {
972 	      tree off
973 		= force_gimple_operand_gsi (&gsi, unshare_expr (n->offset),
974 					    true, NULL_TREE, true,
975 					    GSI_SAME_STMT);
976 	      gimple *stmt
977 		= gimple_build_assign (make_ssa_name (TREE_TYPE (addr_tmp)),
978 				       POINTER_PLUS_EXPR, addr_tmp, off);
979 	      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
980 	      addr_tmp = gimple_assign_lhs (stmt);
981 	    }
982 	}
983 
984       /* Perform the load.  */
985       aligned_load_type = load_type;
986       if (align < TYPE_ALIGN (load_type))
987 	aligned_load_type = build_aligned_type (load_type, align);
988       load_offset_ptr = build_int_cst (n->alias_set, load_offset);
989       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
990 			      load_offset_ptr);
991 
992       if (!bswap)
993 	{
994 	  if (n->range == 16)
995 	    nop_stats.found_16bit++;
996 	  else if (n->range == 32)
997 	    nop_stats.found_32bit++;
998 	  else
999 	    {
1000 	      gcc_assert (n->range == 64);
1001 	      nop_stats.found_64bit++;
1002 	    }
1003 
1004 	  /* Convert the result of load if necessary.  */
1005 	  if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), load_type))
1006 	    {
1007 	      val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
1008 					    "load_dst");
1009 	      load_stmt = gimple_build_assign (val_tmp, val_expr);
1010 	      gimple_set_vuse (load_stmt, n->vuse);
1011 	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
1012 	      gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
1013 	      update_stmt (cur_stmt);
1014 	    }
1015 	  else if (cur_stmt)
1016 	    {
1017 	      gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
1018 	      gimple_set_vuse (cur_stmt, n->vuse);
1019 	      update_stmt (cur_stmt);
1020 	    }
1021 	  else
1022 	    {
1023 	      tgt = make_ssa_name (load_type);
1024 	      cur_stmt = gimple_build_assign (tgt, MEM_REF, val_expr);
1025 	      gimple_set_vuse (cur_stmt, n->vuse);
1026 	      gsi_insert_before (&gsi, cur_stmt, GSI_SAME_STMT);
1027 	    }
1028 
1029 	  if (dump_file)
1030 	    {
1031 	      fprintf (dump_file,
1032 		       "%d bit load in target endianness found at: ",
1033 		       (int) n->range);
1034 	      print_gimple_stmt (dump_file, cur_stmt, 0);
1035 	    }
1036 	  return tgt;
1037 	}
1038       else
1039 	{
1040 	  val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
1041 	  load_stmt = gimple_build_assign (val_tmp, val_expr);
1042 	  gimple_set_vuse (load_stmt, n->vuse);
1043 	  gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
1044 	}
1045       src = val_tmp;
1046     }
1047   else if (!bswap)
1048     {
1049       gimple *g = NULL;
1050       if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
1051 	{
1052 	  if (!is_gimple_val (src))
1053 	    return NULL_TREE;
1054 	  g = gimple_build_assign (tgt, NOP_EXPR, src);
1055 	}
1056       else if (cur_stmt)
1057 	g = gimple_build_assign (tgt, src);
1058       else
1059 	tgt = src;
1060       if (n->range == 16)
1061 	nop_stats.found_16bit++;
1062       else if (n->range == 32)
1063 	nop_stats.found_32bit++;
1064       else
1065 	{
1066 	  gcc_assert (n->range == 64);
1067 	  nop_stats.found_64bit++;
1068 	}
1069       if (dump_file)
1070 	{
1071 	  fprintf (dump_file,
1072 		   "%d bit reshuffle in target endianness found at: ",
1073 		   (int) n->range);
1074 	  if (cur_stmt)
1075 	    print_gimple_stmt (dump_file, cur_stmt, 0);
1076 	  else
1077 	    {
1078 	      print_generic_expr (dump_file, tgt, 0);
1079 	      fprintf (dump_file, "\n");
1080 	    }
1081 	}
1082       if (cur_stmt)
1083 	gsi_replace (&gsi, g, true);
1084       return tgt;
1085     }
1086   else if (TREE_CODE (src) == BIT_FIELD_REF)
1087     src = TREE_OPERAND (src, 0);
1088 
1089   if (n->range == 16)
1090     bswap_stats.found_16bit++;
1091   else if (n->range == 32)
1092     bswap_stats.found_32bit++;
1093   else
1094     {
1095       gcc_assert (n->range == 64);
1096       bswap_stats.found_64bit++;
1097     }
1098 
1099   tmp = src;
1100 
1101   /* Convert the src expression if necessary.  */
1102   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
1103     {
1104       gimple *convert_stmt;
1105 
1106       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
1107       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
1108       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1109     }
1110 
1111   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16-bit values
1112      are handled this way, as rotation of a 2N-bit value by N bits is generally not
1113      equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
1114      gives 0x03040102 while a bswap for that value is 0x04030201.  */
1115   if (bswap && n->range == 16)
1116     {
1117       tree count = build_int_cst (NULL, BITS_PER_UNIT);
1118       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
1119       bswap_stmt = gimple_build_assign (NULL, src);
1120     }
1121   else
1122     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
1123 
1124   if (tgt == NULL_TREE)
1125     tgt = make_ssa_name (bswap_type);
1126   tmp = tgt;
1127 
1128   /* Convert the result if necessary.  */
1129   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
1130     {
1131       gimple *convert_stmt;
1132 
1133       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
1134       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
1135       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1136     }
1137 
1138   gimple_set_lhs (bswap_stmt, tmp);
1139 
1140   if (dump_file)
1141     {
1142       fprintf (dump_file, "%d bit bswap implementation found at: ",
1143 	       (int) n->range);
1144       if (cur_stmt)
1145 	print_gimple_stmt (dump_file, cur_stmt, 0);
1146       else
1147 	{
1148 	  print_generic_expr (dump_file, tgt, 0);
1149 	  fprintf (dump_file, "\n");
1150 	}
1151     }
1152 
1153   if (cur_stmt)
1154     {
1155       gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
1156       gsi_remove (&gsi, true);
1157     }
1158   else
1159     gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
1160   return tgt;
1161 }
1162 
1163 /* Find manual byte swap implementations as well as loads in a given
1164    endianness.  Byte swaps are turned into a bswap builtin invocation
1165    while endian loads are converted to a bswap builtin invocation or a
1166    simple load according to the target endianness.  */
1167 
1168 unsigned int
1169 pass_optimize_bswap::execute (function *fun)
1170 {
1171   basic_block bb;
1172   bool bswap32_p, bswap64_p;
1173   bool changed = false;
1174   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1175 
1176   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1177 	       && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1178   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1179 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1180 		   || (bswap32_p && word_mode == SImode)));
1181 
1182   /* Determine the argument type of the builtins.  The code later on
1183      assumes that the return and argument type are the same.  */
1184   if (bswap32_p)
1185     {
1186       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1187       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1188     }
1189 
1190   if (bswap64_p)
1191     {
1192       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1193       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1194     }
1195 
1196   memset (&nop_stats, 0, sizeof (nop_stats));
1197   memset (&bswap_stats, 0, sizeof (bswap_stats));
1198   calculate_dominance_info (CDI_DOMINATORS);
1199 
1200   FOR_EACH_BB_FN (bb, fun)
1201     {
1202       gimple_stmt_iterator gsi;
1203 
1204       /* We do a reverse scan for bswap patterns to make sure we get the
1205 	 widest match. As bswap pattern matching doesn't handle previously
1206 	 inserted smaller bswap replacements as sub-patterns, the wider
1207 	 variant wouldn't be detected.  */
1208       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
1209 	{
1210 	  gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
1211 	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
1212 	  enum tree_code code;
1213 	  struct symbolic_number n;
1214 	  bool bswap;
1215 
1216 	  /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
1217 	     might be moved to a different basic block by bswap_replace and gsi
1218 	     must not point to it if that's the case.  Moving the gsi_prev
1219 	     there makes sure that gsi points to the statement previous to
1220 	     cur_stmt while still making sure that all statements are
1221 	     considered in this basic block.  */
1222 	  gsi_prev (&gsi);
1223 
1224 	  if (!is_gimple_assign (cur_stmt))
1225 	    continue;
1226 
1227 	  code = gimple_assign_rhs_code (cur_stmt);
1228 	  switch (code)
1229 	    {
1230 	    case LROTATE_EXPR:
1231 	    case RROTATE_EXPR:
1232 	      if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
1233 		  || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
1234 		     % BITS_PER_UNIT)
1235 		continue;
1236 	      /* Fall through.  */
1237 	    case BIT_IOR_EXPR:
1238 	      break;
1239 	    default:
1240 	      continue;
1241 	    }
1242 
1243 	  ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
1244 
1245 	  if (!ins_stmt)
1246 	    continue;
1247 
1248 	  switch (n.range)
1249 	    {
1250 	    case 16:
1251 	      /* Already in canonical form, nothing to do.  */
1252 	      if (code == LROTATE_EXPR || code == RROTATE_EXPR)
1253 		continue;
1254 	      load_type = bswap_type = uint16_type_node;
1255 	      break;
1256 	    case 32:
1257 	      load_type = uint32_type_node;
1258 	      if (bswap32_p)
1259 		{
1260 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1261 		  bswap_type = bswap32_type;
1262 		}
1263 	      break;
1264 	    case 64:
1265 	      load_type = uint64_type_node;
1266 	      if (bswap64_p)
1267 		{
1268 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1269 		  bswap_type = bswap64_type;
1270 		}
1271 	      break;
1272 	    default:
1273 	      continue;
1274 	    }
1275 
1276 	  if (bswap && !fndecl && n.range != 16)
1277 	    continue;
1278 
1279 	  if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
1280 			     bswap_type, load_type, &n, bswap))
1281 	    changed = true;
1282 	}
1283     }
1284 
1285   statistics_counter_event (fun, "16-bit nop implementations found",
1286 			    nop_stats.found_16bit);
1287   statistics_counter_event (fun, "32-bit nop implementations found",
1288 			    nop_stats.found_32bit);
1289   statistics_counter_event (fun, "64-bit nop implementations found",
1290 			    nop_stats.found_64bit);
1291   statistics_counter_event (fun, "16-bit bswap implementations found",
1292 			    bswap_stats.found_16bit);
1293   statistics_counter_event (fun, "32-bit bswap implementations found",
1294 			    bswap_stats.found_32bit);
1295   statistics_counter_event (fun, "64-bit bswap implementations found",
1296 			    bswap_stats.found_64bit);
1297 
1298   return (changed ? TODO_update_ssa : 0);
1299 }
1300 
1301 } // anon namespace
1302 
1303 gimple_opt_pass *
1304 make_pass_optimize_bswap (gcc::context *ctxt)
1305 {
1306   return new pass_optimize_bswap (ctxt);
1307 }
1308 
1309 namespace {
1310 
1311 /* Struct recording one operand for the store, which is either a constant,
1312    then VAL represents the constant and all the other fields are zero,
1313    or a memory load, then VAL represents the reference, BASE_ADDR is non-NULL
1314    and the other fields also reflect the memory load.  */
1315 
1316 struct store_operand_info
1317 {
1318   tree val;
1319   tree base_addr;
1320   poly_uint64 bitsize;
1321   poly_uint64 bitpos;
1322   poly_uint64 bitregion_start;
1323   poly_uint64 bitregion_end;
1324   gimple *stmt;
1325   bool bit_not_p;
1326   store_operand_info ();
1327 };
1328 
1329 store_operand_info::store_operand_info ()
1330   : val (NULL_TREE), base_addr (NULL_TREE), bitsize (0), bitpos (0),
1331     bitregion_start (0), bitregion_end (0), stmt (NULL), bit_not_p (false)
1332 {
1333 }
1334 
1335 /* Struct recording the information about a single store of an immediate
1336    to memory.  These are created in the first phase and coalesced into
1337    merged_store_group objects in the second phase.  */
1338 
1339 struct store_immediate_info
1340 {
1341   unsigned HOST_WIDE_INT bitsize;
1342   unsigned HOST_WIDE_INT bitpos;
1343   unsigned HOST_WIDE_INT bitregion_start;
1344   /* This is one past the last bit of the bit region.  */
1345   unsigned HOST_WIDE_INT bitregion_end;
1346   gimple *stmt;
1347   unsigned int order;
1348   /* INTEGER_CST for constant stores, MEM_REF for memory copy or
1349      BIT_*_EXPR for logical bitwise operation.
1350      LROTATE_EXPR if it can be only bswap optimized and
1351      ops are not really meaningful.
1352      NOP_EXPR if bswap optimization detected identity, ops
1353      are not meaningful.  */
1354   enum tree_code rhs_code;
1355   /* Two fields for bswap optimization purposes.  */
1356   struct symbolic_number n;
1357   gimple *ins_stmt;
1358   /* True if BIT_{AND,IOR,XOR}_EXPR result is inverted before storing.  */
1359   bool bit_not_p;
1360   /* True if ops have been swapped and thus ops[1] represents
1361      rhs1 of BIT_{AND,IOR,XOR}_EXPR and ops[0] represents rhs2.  */
1362   bool ops_swapped_p;
1363   /* Operands.  For BIT_*_EXPR rhs_code both operands are used, otherwise
1364      just the first one.  */
1365   store_operand_info ops[2];
1366   store_immediate_info (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
1367 			unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
1368 			gimple *, unsigned int, enum tree_code,
1369 			struct symbolic_number &, gimple *, bool,
1370 			const store_operand_info &,
1371 			const store_operand_info &);
1372 };
1373 
1374 store_immediate_info::store_immediate_info (unsigned HOST_WIDE_INT bs,
1375 					    unsigned HOST_WIDE_INT bp,
1376 					    unsigned HOST_WIDE_INT brs,
1377 					    unsigned HOST_WIDE_INT bre,
1378 					    gimple *st,
1379 					    unsigned int ord,
1380 					    enum tree_code rhscode,
1381 					    struct symbolic_number &nr,
1382 					    gimple *ins_stmtp,
1383 					    bool bitnotp,
1384 					    const store_operand_info &op0r,
1385 					    const store_operand_info &op1r)
1386   : bitsize (bs), bitpos (bp), bitregion_start (brs), bitregion_end (bre),
1387     stmt (st), order (ord), rhs_code (rhscode), n (nr),
1388     ins_stmt (ins_stmtp), bit_not_p (bitnotp), ops_swapped_p (false)
1389 #if __cplusplus >= 201103L
1390     , ops { op0r, op1r }
1391 {
1392 }
1393 #else
1394 {
1395   ops[0] = op0r;
1396   ops[1] = op1r;
1397 }
1398 #endif
1399 
1400 /* Struct representing a group of stores to contiguous memory locations.
1401    These are produced by the second phase (coalescing) and consumed in the
1402    third phase that outputs the widened stores.  */
1403 
1404 struct merged_store_group
1405 {
1406   unsigned HOST_WIDE_INT start;
1407   unsigned HOST_WIDE_INT width;
1408   unsigned HOST_WIDE_INT bitregion_start;
1409   unsigned HOST_WIDE_INT bitregion_end;
1410   /* The size of the allocated memory for val and mask.  */
1411   unsigned HOST_WIDE_INT buf_size;
1412   unsigned HOST_WIDE_INT align_base;
1413   poly_uint64 load_align_base[2];
1414 
1415   unsigned int align;
1416   unsigned int load_align[2];
1417   unsigned int first_order;
1418   unsigned int last_order;
1419   unsigned int first_nonmergeable_order;
1420 
1421   auto_vec<store_immediate_info *> stores;
1422   /* We record the first and last original statements in the sequence because
1423      we'll need their vuse/vdef and replacement position.  It's easier to keep
1424      track of them separately as 'stores' is reordered by apply_stores.  */
1425   gimple *last_stmt;
1426   gimple *first_stmt;
1427   unsigned char *val;
1428   unsigned char *mask;
1429 
1430   merged_store_group (store_immediate_info *);
1431   ~merged_store_group ();
1432   void merge_into (store_immediate_info *);
1433   void merge_overlapping (store_immediate_info *);
1434   bool apply_stores ();
1435 private:
1436   void do_merge (store_immediate_info *);
1437 };
1438 
1439 /* Debug helper.  Dump LEN elements of byte array PTR to FD in hex.  */
1440 
1441 static void
1442 dump_char_array (FILE *fd, unsigned char *ptr, unsigned int len)
1443 {
1444   if (!fd)
1445     return;
1446 
1447   for (unsigned int i = 0; i < len; i++)
1448     fprintf (fd, "%x ", ptr[i]);
1449   fprintf (fd, "\n");
1450 }
1451 
1452 /* Shift left the bytes in PTR of SZ elements by AMNT bits, carrying over the
1453    bits between adjacent elements.  AMNT should be within
1454    [0, BITS_PER_UNIT).
1455    Example, AMNT = 2:
1456    00011111|11100000 << 2 = 01111111|10000000
1457    PTR[1]  | PTR[0]         PTR[1]  | PTR[0].  */
1458 
1459 static void
1460 shift_bytes_in_array (unsigned char *ptr, unsigned int sz, unsigned int amnt)
1461 {
1462   if (amnt == 0)
1463     return;
1464 
1465   unsigned char carry_over = 0U;
1466   unsigned char carry_mask = (~0U) << (unsigned char) (BITS_PER_UNIT - amnt);
1467   unsigned char clear_mask = (~0U) << amnt;
1468 
1469   for (unsigned int i = 0; i < sz; i++)
1470     {
1471       unsigned prev_carry_over = carry_over;
1472       carry_over = (ptr[i] & carry_mask) >> (BITS_PER_UNIT - amnt);
1473 
1474       ptr[i] <<= amnt;
1475       if (i != 0)
1476 	{
1477 	  ptr[i] &= clear_mask;
1478 	  ptr[i] |= prev_carry_over;
1479 	}
1480     }
1481 }
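
/* A numeric restatement of the example above (illustrative): with
   BITS_PER_UNIT == 8, calling shift_bytes_in_array on the two-byte buffer
   { 0xe0, 0x1f } with AMNT == 2 yields { 0x80, 0x7f }; the two bits shifted
   out of ptr[0] are carried into the low bits of ptr[1].  */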
1482 
1483 /* Like shift_bytes_in_array but for big-endian.
1484    Shift right the bytes in PTR of SZ elements by AMNT bits, carrying over the
1485    bits between adjacent elements.  AMNT should be within
1486    [0, BITS_PER_UNIT).
1487    Example, AMNT = 2:
1488    00011111|11100000 >> 2 = 00000111|11111000
1489    PTR[0]  | PTR[1]         PTR[0]  | PTR[1].  */
1490 
1491 static void
1492 shift_bytes_in_array_right (unsigned char *ptr, unsigned int sz,
1493 			    unsigned int amnt)
1494 {
1495   if (amnt == 0)
1496     return;
1497 
1498   unsigned char carry_over = 0U;
1499   unsigned char carry_mask = ~(~0U << amnt);
1500 
1501   for (unsigned int i = 0; i < sz; i++)
1502     {
1503       unsigned prev_carry_over = carry_over;
1504       carry_over = ptr[i] & carry_mask;
1505 
1506       carry_over <<= (unsigned char) BITS_PER_UNIT - amnt;
1507       ptr[i] >>= amnt;
1508       ptr[i] |= prev_carry_over;
1509     }
1510 }
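
/* Similarly (illustrative values), shift_bytes_in_array_right on the buffer
   { 0x1f, 0xe0 } with AMNT == 2 yields { 0x07, 0xf8 }; here the two low bits
   of ptr[0] are carried into the high bits of ptr[1].  */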
1511 
1512 /* Clear out LEN bits starting from bit START in the byte array
1513    PTR.  This clears the bits to the *right* of START.
1514    START must be within [0, BITS_PER_UNIT) and counts starting from
1515    the least significant bit.  */
1516 
1517 static void
1518 clear_bit_region_be (unsigned char *ptr, unsigned int start,
1519 		     unsigned int len)
1520 {
1521   if (len == 0)
1522     return;
1523   /* Clear len bits to the right of start.  */
1524   else if (len <= start + 1)
1525     {
1526       unsigned char mask = (~(~0U << len));
1527       mask = mask << (start + 1U - len);
1528       ptr[0] &= ~mask;
1529     }
1530   else if (start != BITS_PER_UNIT - 1)
1531     {
1532       clear_bit_region_be (ptr, start, (start % BITS_PER_UNIT) + 1);
1533       clear_bit_region_be (ptr + 1, BITS_PER_UNIT - 1,
1534 			   len - (start % BITS_PER_UNIT) - 1);
1535     }
1536   else if (start == BITS_PER_UNIT - 1
1537 	   && len > BITS_PER_UNIT)
1538     {
1539       unsigned int nbytes = len / BITS_PER_UNIT;
1540       memset (ptr, 0, nbytes);
1541       if (len % BITS_PER_UNIT != 0)
1542 	clear_bit_region_be (ptr + nbytes, BITS_PER_UNIT - 1,
1543 			     len % BITS_PER_UNIT);
1544     }
1545   else
1546     gcc_unreachable ();
1547 }
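
/* For example (an illustrative call), clear_bit_region_be (ptr, 5, 3) on a
   single byte 0xff clears bits 5, 4 and 3 (counted from the least significant
   bit) and leaves 0xc7.  */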
1548 
1549 /* In the byte array PTR clear the bit region starting at bit
1550    START that is LEN bits wide.
1551    For regions spanning multiple bytes do this recursively until we reach
1552    zero LEN or a region contained within a single byte.  */
1553 
1554 static void
1555 clear_bit_region (unsigned char *ptr, unsigned int start,
1556 		  unsigned int len)
1557 {
1558   /* Degenerate base case.  */
1559   if (len == 0)
1560     return;
1561   else if (start >= BITS_PER_UNIT)
1562     clear_bit_region (ptr + 1, start - BITS_PER_UNIT, len);
1563   /* Second base case.  */
1564   else if ((start + len) <= BITS_PER_UNIT)
1565     {
1566       unsigned char mask = (~0U) << (unsigned char) (BITS_PER_UNIT - len);
1567       mask >>= BITS_PER_UNIT - (start + len);
1568 
1569       ptr[0] &= ~mask;
1570 
1571       return;
1572     }
1573   /* Clear most significant bits in a byte and proceed with the next byte.  */
1574   else if (start != 0)
1575     {
1576       clear_bit_region (ptr, start, BITS_PER_UNIT - start);
1577       clear_bit_region (ptr + 1, 0, len - (BITS_PER_UNIT - start));
1578     }
1579   /* Whole bytes need to be cleared.  */
1580   else if (start == 0 && len > BITS_PER_UNIT)
1581     {
1582       unsigned int nbytes = len / BITS_PER_UNIT;
1583       /* We could recurse on each byte but we clear whole bytes, so a simple
1584 	 memset will do.  */
1585       memset (ptr, '\0', nbytes);
1586       /* Clear the remaining sub-byte region if there is one.  */
1587       if (len % BITS_PER_UNIT != 0)
1588 	clear_bit_region (ptr + nbytes, 0, len % BITS_PER_UNIT);
1589     }
1590   else
1591     gcc_unreachable ();
1592 }
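
/* A minimal illustrative sketch (not compiled, not part of the pass),
   assuming BITS_PER_UNIT == 8: here START counts from the least
   significant bit and the region grows towards more significant bits,
   so START == 2 and LEN == 3 clear bits 2, 3 and 4.  */
#if 0
  unsigned char b = 0xff;
  clear_bit_region (&b, 2, 3);		/* b == 0xe3, i.e. 11100011.  */
#endif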
1593 
1594 /* Write BITLEN bits of EXPR to the byte array PTR at
1595    bit position BITPOS.  PTR should contain TOTAL_BYTES elements.
1596    Return true if the operation succeeded.  */
1597 
1598 static bool
1599 encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos,
1600 		       unsigned int total_bytes)
1601 {
1602   unsigned int first_byte = bitpos / BITS_PER_UNIT;
1603   tree tmp_int = expr;
1604   bool sub_byte_op_p = ((bitlen % BITS_PER_UNIT)
1605 			|| (bitpos % BITS_PER_UNIT)
1606 			|| !int_mode_for_size (bitlen, 0).exists ());
1607 
1608   if (!sub_byte_op_p)
1609     return native_encode_expr (tmp_int, ptr + first_byte, total_bytes) != 0;
1610 
1611   /* LITTLE-ENDIAN
1612      We are writing a non byte-sized quantity or at a position that is not
1613      at a byte boundary.
1614      |--------|--------|--------| ptr + first_byte
1615            ^              ^
1616            xxx xxxxxxxx xxx< bp>
1617            |______EXPR____|
1618 
1619      First native_encode_expr EXPR into a temporary buffer and shift each
1620      byte in the buffer by 'bp' (carrying the bits over as necessary).
1621      |00000000|00xxxxxx|xxxxxxxx| << bp = |000xxxxx|xxxxxxxx|xxx00000|
1622                                               <------bitlen---->< bp>
1623     Then we clear the destination bits:
1624     |---00000|00000000|000-----| ptr + first_byte
1625         <-------bitlen--->< bp>
1626 
1627     Finally we ORR the bytes of the shifted EXPR into the cleared region:
1628     |---xxxxx||xxxxxxxx||xxx-----| ptr + first_byte.
1629 
1630    BIG-ENDIAN
1631    We are writing a non byte-sized quantity or at a position that is not
1632    at a byte boundary.
1633      ptr + first_byte |--------|--------|--------|
1634                             ^              ^
1635                        <bp >xxx xxxxxxxx xxx
1636                             |_____EXPR_____|
1637 
1638      First native_encode_expr EXPR into a temporary buffer and shift each
1639      byte in the buffer to the right (carrying the bits over as necessary).
1640      We shift by as much as needed to align the most significant bit of EXPR
1641      with bitpos:
1642      |00xxxxxx|xxxxxxxx| >> 3 = |00000xxx|xxxxxxxx|xxxxx000|
1643         <---bitlen---->          <bp ><-----bitlen----->
1644     Then we clear the destination bits:
1645     ptr + first_byte |-----000||00000000||00000---|
1646                       <bp ><-------bitlen----->
1647 
1648     Finally we ORR the bytes of the shifted EXPR into the cleared region:
1649     ptr + first_byte |---xxxxx||xxxxxxxx||xxx-----|.
1650     The awkwardness comes from the fact that bitpos is counted from the
1651     most significant bit of a byte.  */
1652 
1653   /* We must be dealing with fixed-size data at this point, since the
1654      total size is also fixed.  */
1655   fixed_size_mode mode = as_a <fixed_size_mode> (TYPE_MODE (TREE_TYPE (expr)));
1656   /* Allocate an extra byte so that we have space to shift into.  */
1657   unsigned int byte_size = GET_MODE_SIZE (mode) + 1;
1658   unsigned char *tmpbuf = XALLOCAVEC (unsigned char, byte_size);
1659   memset (tmpbuf, '\0', byte_size);
1660   /* The store detection code should only have allowed constants that are
1661      accepted by native_encode_expr.  */
1662   if (native_encode_expr (expr, tmpbuf, byte_size - 1) == 0)
1663     gcc_unreachable ();
1664 
1665   /* The native_encode_expr machinery uses TYPE_MODE to determine how many
1666      bytes to write.  This means it can write more than
1667      ROUND_UP (bitlen, BITS_PER_UNIT) / BITS_PER_UNIT bytes (for example
1668      write 8 bytes for a bitlen of 40).  Skip the bytes that are not within
1669      bitlen and zero out the bits that are not relevant as well (that may
1670      contain a sign bit due to sign-extension).  */
1671   unsigned int padding
1672     = byte_size - ROUND_UP (bitlen, BITS_PER_UNIT) / BITS_PER_UNIT - 1;
1673   /* On big-endian the padding is at the 'front' so just skip the initial
1674      bytes.  */
1675   if (BYTES_BIG_ENDIAN)
1676     tmpbuf += padding;
1677 
1678   byte_size -= padding;
1679 
1680   if (bitlen % BITS_PER_UNIT != 0)
1681     {
1682       if (BYTES_BIG_ENDIAN)
1683 	clear_bit_region_be (tmpbuf, BITS_PER_UNIT - 1,
1684 			     BITS_PER_UNIT - (bitlen % BITS_PER_UNIT));
1685       else
1686 	clear_bit_region (tmpbuf, bitlen,
1687 			  byte_size * BITS_PER_UNIT - bitlen);
1688     }
1689   /* Left shifting relies on the last byte being clear if bitlen is
1690      a multiple of BITS_PER_UNIT, which might not be clear if
1691      there are padding bytes.  */
1692   else if (!BYTES_BIG_ENDIAN)
1693     tmpbuf[byte_size - 1] = '\0';
1694 
1695   /* Clear the bit region in PTR where the bits from TMPBUF will be
1696      inserted into.  */
1697   if (BYTES_BIG_ENDIAN)
1698     clear_bit_region_be (ptr + first_byte,
1699 			 BITS_PER_UNIT - 1 - (bitpos % BITS_PER_UNIT), bitlen);
1700   else
1701     clear_bit_region (ptr + first_byte, bitpos % BITS_PER_UNIT, bitlen);
1702 
1703   int shift_amnt;
1704   int bitlen_mod = bitlen % BITS_PER_UNIT;
1705   int bitpos_mod = bitpos % BITS_PER_UNIT;
1706 
1707   bool skip_byte = false;
1708   if (BYTES_BIG_ENDIAN)
1709     {
1710       /* BITPOS and BITLEN are exactly aligned and no shifting
1711 	 is necessary.  */
1712       if (bitpos_mod + bitlen_mod == BITS_PER_UNIT
1713 	  || (bitpos_mod == 0 && bitlen_mod == 0))
1714 	shift_amnt = 0;
1715       /* |. . . . . . . .|
1716 	  <bp >   <blen >.
1717 	 We always shift right for BYTES_BIG_ENDIAN so shift the beginning
1718 	 of the value until it aligns with 'bp' in the next byte over.  */
1719       else if (bitpos_mod + bitlen_mod < BITS_PER_UNIT)
1720 	{
1721 	  shift_amnt = bitlen_mod + bitpos_mod;
1722 	  skip_byte = bitlen_mod != 0;
1723 	}
1724       /* |. . . . . . . .|
1725 	  <----bp--->
1726 	    <---blen---->.
1727 	 Shift the value right within the same byte so it aligns with 'bp'.  */
1728       else
1729 	shift_amnt = bitlen_mod + bitpos_mod - BITS_PER_UNIT;
1730     }
1731   else
1732     shift_amnt = bitpos % BITS_PER_UNIT;
1733 
1734   /* Create the shifted version of EXPR.  */
1735   if (!BYTES_BIG_ENDIAN)
1736     {
1737       shift_bytes_in_array (tmpbuf, byte_size, shift_amnt);
1738       if (shift_amnt == 0)
1739 	byte_size--;
1740     }
1741   else
1742     {
1743       gcc_assert (BYTES_BIG_ENDIAN);
1744       shift_bytes_in_array_right (tmpbuf, byte_size, shift_amnt);
1745       /* If shifting right forced us to move into the next byte skip the now
1746 	 empty byte.  */
1747       if (skip_byte)
1748 	{
1749 	  tmpbuf++;
1750 	  byte_size--;
1751 	}
1752     }
1753 
1754   /* Insert the bits from TMPBUF.  */
1755   for (unsigned int i = 0; i < byte_size; i++)
1756     ptr[first_byte + i] |= tmpbuf[i];
1757 
1758   return true;
1759 }
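
/* A minimal illustrative sketch (not compiled, not part of the pass),
   assuming a little-endian target with BITS_PER_UNIT == 8: writing an
   8-bit constant at bit position 4 takes the sub-byte path above and
   spreads the value across two destination bytes.  */
#if 0
  unsigned char buf[3] = { 0, 0, 0 };
  tree cst = build_int_cst (unsigned_char_type_node, 0xab);
  encode_tree_to_bitpos (cst, buf, 8, 4, 3);
  /* buf now holds { 0xb0, 0x0a, 0x00 }.  */
#endif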
1760 
1761 /* Sorting function for store_immediate_info objects.
1762    Sorts them by bitposition.  */
1763 
1764 static int
1765 sort_by_bitpos (const void *x, const void *y)
1766 {
1767   store_immediate_info *const *tmp = (store_immediate_info * const *) x;
1768   store_immediate_info *const *tmp2 = (store_immediate_info * const *) y;
1769 
1770   if ((*tmp)->bitpos < (*tmp2)->bitpos)
1771     return -1;
1772   else if ((*tmp)->bitpos > (*tmp2)->bitpos)
1773     return 1;
1774   else
1775     /* If the bit positions are the same, use the order, which is
1776        guaranteed to be different.  */
1777     return (*tmp)->order - (*tmp2)->order;
1778 }
1779 
1780 /* Sorting function for store_immediate_info objects.
1781    Sorts them by the order field.  */
1782 
1783 static int
1784 sort_by_order (const void *x, const void *y)
1785 {
1786   store_immediate_info *const *tmp = (store_immediate_info * const *) x;
1787   store_immediate_info *const *tmp2 = (store_immediate_info * const *) y;
1788 
1789   if ((*tmp)->order < (*tmp2)->order)
1790     return -1;
1791   else if ((*tmp)->order > (*tmp2)->order)
1792     return 1;
1793 
1794   gcc_unreachable ();
1795 }
1796 
1797 /* Initialize a merged_store_group object from a store_immediate_info
1798    object.  */
1799 
1800 merged_store_group::merged_store_group (store_immediate_info *info)
1801 {
1802   start = info->bitpos;
1803   width = info->bitsize;
1804   bitregion_start = info->bitregion_start;
1805   bitregion_end = info->bitregion_end;
1806   /* VAL has memory allocated for it in apply_stores once the group
1807      width has been finalized.  */
1808   val = NULL;
1809   mask = NULL;
1810   first_nonmergeable_order = ~0U;
1811   unsigned HOST_WIDE_INT align_bitpos = 0;
1812   get_object_alignment_1 (gimple_assign_lhs (info->stmt),
1813 			  &align, &align_bitpos);
1814   align_base = start - align_bitpos;
1815   for (int i = 0; i < 2; ++i)
1816     {
1817       store_operand_info &op = info->ops[i];
1818       if (op.base_addr == NULL_TREE)
1819 	{
1820 	  load_align[i] = 0;
1821 	  load_align_base[i] = 0;
1822 	}
1823       else
1824 	{
1825 	  get_object_alignment_1 (op.val, &load_align[i], &align_bitpos);
1826 	  load_align_base[i] = op.bitpos - align_bitpos;
1827 	}
1828     }
1829   stores.create (1);
1830   stores.safe_push (info);
1831   last_stmt = info->stmt;
1832   last_order = info->order;
1833   first_stmt = last_stmt;
1834   first_order = last_order;
1835   buf_size = 0;
1836 }
1837 
1838 merged_store_group::~merged_store_group ()
1839 {
1840   if (val)
1841     XDELETEVEC (val);
1842 }
1843 
1844 /* Helper method for merge_into and merge_overlapping to do
1845    the common part.  */
1846 void
1847 merged_store_group::do_merge (store_immediate_info *info)
1848 {
1849   bitregion_start = MIN (bitregion_start, info->bitregion_start);
1850   bitregion_end = MAX (bitregion_end, info->bitregion_end);
1851 
1852   unsigned int this_align;
1853   unsigned HOST_WIDE_INT align_bitpos = 0;
1854   get_object_alignment_1 (gimple_assign_lhs (info->stmt),
1855 			  &this_align, &align_bitpos);
1856   if (this_align > align)
1857     {
1858       align = this_align;
1859       align_base = info->bitpos - align_bitpos;
1860     }
1861   for (int i = 0; i < 2; ++i)
1862     {
1863       store_operand_info &op = info->ops[i];
1864       if (!op.base_addr)
1865 	continue;
1866 
1867       get_object_alignment_1 (op.val, &this_align, &align_bitpos);
1868       if (this_align > load_align[i])
1869 	{
1870 	  load_align[i] = this_align;
1871 	  load_align_base[i] = op.bitpos - align_bitpos;
1872 	}
1873     }
1874 
1875   gimple *stmt = info->stmt;
1876   stores.safe_push (info);
1877   if (info->order > last_order)
1878     {
1879       last_order = info->order;
1880       last_stmt = stmt;
1881     }
1882   else if (info->order < first_order)
1883     {
1884       first_order = info->order;
1885       first_stmt = stmt;
1886     }
1887 }
1888 
1889 /* Merge a store recorded by INFO into this merged store.
1890    The store is not overlapping with the existing recorded
1891    stores.  */
1892 
1893 void
1894 merged_store_group::merge_into (store_immediate_info *info)
1895 {
1896   /* Make sure we're inserting in the position we think we're inserting.  */
1897   gcc_assert (info->bitpos >= start + width
1898 	      && info->bitregion_start <= bitregion_end);
1899 
1900   width = info->bitpos + info->bitsize - start;
1901   do_merge (info);
1902 }
1903 
1904 /* Merge a store described by INFO into this merged store.
1905    INFO overlaps in some way with the current store (i.e. it's not contiguous,
1906    which is handled by merged_store_group::merge_into).  */
1907 
1908 void
1909 merged_store_group::merge_overlapping (store_immediate_info *info)
1910 {
1911   /* If the store extends the size of the group, extend the width.  */
1912   if (info->bitpos + info->bitsize > start + width)
1913     width = info->bitpos + info->bitsize - start;
1914 
1915   do_merge (info);
1916 }
1917 
1918 /* Go through all the recorded stores in this group in program order and
1919    apply their values to the VAL byte array to create the final merged
1920    value.  Return true if the operation succeeded.  */
1921 
1922 bool
1923 merged_store_group::apply_stores ()
1924 {
1925   /* Make sure we have more than one store in the group, otherwise we cannot
1926      merge anything.  */
1927   if (bitregion_start % BITS_PER_UNIT != 0
1928       || bitregion_end % BITS_PER_UNIT != 0
1929       || stores.length () == 1)
1930     return false;
1931 
1932   stores.qsort (sort_by_order);
1933   store_immediate_info *info;
1934   unsigned int i;
1935   /* Create a buffer of a size that is 2 times the number of bytes we're
1936      storing.  That way native_encode_expr can write power-of-2-sized
1937      chunks without overrunning.  */
1938   buf_size = 2 * ((bitregion_end - bitregion_start) / BITS_PER_UNIT);
1939   val = XNEWVEC (unsigned char, 2 * buf_size);
1940   mask = val + buf_size;
1941   memset (val, 0, buf_size);
1942   memset (mask, ~0U, buf_size);
1943 
1944   FOR_EACH_VEC_ELT (stores, i, info)
1945     {
1946       unsigned int pos_in_buffer = info->bitpos - bitregion_start;
1947       tree cst = NULL_TREE;
1948       if (info->ops[0].val && info->ops[0].base_addr == NULL_TREE)
1949 	cst = info->ops[0].val;
1950       else if (info->ops[1].val && info->ops[1].base_addr == NULL_TREE)
1951 	cst = info->ops[1].val;
1952       bool ret = true;
1953       if (cst)
1954 	ret = encode_tree_to_bitpos (cst, val, info->bitsize,
1955 				     pos_in_buffer, buf_size);
1956       if (cst && dump_file && (dump_flags & TDF_DETAILS))
1957 	{
1958 	  if (ret)
1959 	    {
1960 	      fprintf (dump_file, "After writing ");
1961 	      print_generic_expr (dump_file, cst, 0);
1962 	      fprintf (dump_file, " of size " HOST_WIDE_INT_PRINT_DEC
1963 		       " at position %d the merged region contains:\n",
1964 		       info->bitsize, pos_in_buffer);
1965 	      dump_char_array (dump_file, val, buf_size);
1966 	    }
1967 	  else
1968 	    fprintf (dump_file, "Failed to merge stores\n");
1969 	}
1970       if (!ret)
1971 	return false;
1972       unsigned char *m = mask + (pos_in_buffer / BITS_PER_UNIT);
1973       if (BYTES_BIG_ENDIAN)
1974 	clear_bit_region_be (m, (BITS_PER_UNIT - 1
1975 				 - (pos_in_buffer % BITS_PER_UNIT)),
1976 			     info->bitsize);
1977       else
1978 	clear_bit_region (m, pos_in_buffer % BITS_PER_UNIT, info->bitsize);
1979     }
1980   stores.qsort (sort_by_bitpos);
1981   return true;
1982 }
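
/* Illustrative example of the VAL/MASK encoding built above: for a group
   with bit region [0, 16) holding two single-byte constant stores, 0x12 at
   bit position 0 and 0x34 at bit position 8, apply_stores leaves
     val  = { 0x12, 0x34, 0x00, 0x00 }
     mask = { 0x00, 0x00, 0xff, 0xff }
   (BUF_SIZE is twice the region size, so the trailing bytes are unused;
   mask bits stay set wherever no store has written).  */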
1983 
1984 /* Structure describing the store chain.  */
1985 
1986 struct imm_store_chain_info
1987 {
1988   /* Doubly-linked list that imposes an order on chain processing.
1989      PNXP (prev's next pointer) points to the head of a list, or to
1990      the next field in the previous chain in the list.
1991      See pass_store_merging::m_stores_head for more rationale.  */
1992   imm_store_chain_info *next, **pnxp;
1993   tree base_addr;
1994   auto_vec<store_immediate_info *> m_store_info;
1995   auto_vec<merged_store_group *> m_merged_store_groups;
1996 
1997   imm_store_chain_info (imm_store_chain_info *&inspt, tree b_a)
1998   : next (inspt), pnxp (&inspt), base_addr (b_a)
1999   {
2000     inspt = this;
2001     if (next)
2002       {
2003 	gcc_checking_assert (pnxp == next->pnxp);
2004 	next->pnxp = &next;
2005       }
2006   }
2007   ~imm_store_chain_info ()
2008   {
2009     *pnxp = next;
2010     if (next)
2011       {
2012 	gcc_checking_assert (&next == next->pnxp);
2013 	next->pnxp = pnxp;
2014       }
2015   }
2016   bool terminate_and_process_chain ();
2017   bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int);
2018   bool coalesce_immediate_stores ();
2019   bool output_merged_store (merged_store_group *);
2020   bool output_merged_stores ();
2021 };
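
/* A minimal illustrative sketch (not compiled, not part of the pass) of how
   the PNXP links behave; base1 and base2 stand for arbitrary base address
   trees.  */
#if 0
  imm_store_chain_info *head = NULL;
  imm_store_chain_info *c1 = new imm_store_chain_info (head, base1);
  /* head == c1, c1->pnxp == &head.  */
  imm_store_chain_info *c2 = new imm_store_chain_info (head, base2);
  /* head == c2, c2->next == c1, c1->pnxp == &c2->next.  */
  delete c2;
  /* head == c1 again and c1->pnxp == &head.  */
#endif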
2022 
2023 const pass_data pass_data_tree_store_merging = {
2024   GIMPLE_PASS,     /* type */
2025   "store-merging", /* name */
2026   OPTGROUP_NONE,   /* optinfo_flags */
2027   TV_GIMPLE_STORE_MERGING,	 /* tv_id */
2028   PROP_ssa,	/* properties_required */
2029   0,		   /* properties_provided */
2030   0,		   /* properties_destroyed */
2031   0,		   /* todo_flags_start */
2032   TODO_update_ssa, /* todo_flags_finish */
2033 };
2034 
2035 class pass_store_merging : public gimple_opt_pass
2036 {
2037 public:
2038   pass_store_merging (gcc::context *ctxt)
2039     : gimple_opt_pass (pass_data_tree_store_merging, ctxt), m_stores_head ()
2040   {
2041   }
2042 
2043   /* Pass not supported for PDP-endianness, nor for insane hosts
2044      or target character sizes where native_{encode,interpret}_expr
2045      doesn't work properly.  */
2046   virtual bool
2047   gate (function *)
2048   {
2049     return flag_store_merging
2050 	   && WORDS_BIG_ENDIAN == BYTES_BIG_ENDIAN
2051 	   && CHAR_BIT == 8
2052 	   && BITS_PER_UNIT == 8;
2053   }
2054 
2055   virtual unsigned int execute (function *);
2056 
2057 private:
2058   hash_map<tree_operand_hash, struct imm_store_chain_info *> m_stores;
2059 
2060   /* Form a doubly-linked stack of the elements of m_stores, so that
2061      we can iterate over them in a predictable way.  Using this order
2062      avoids extraneous differences in the compiler output just because
2063      of tree pointer variations (e.g. different chains end up in
2064      different positions of m_stores, so they are handled in different
2065      orders, so they allocate or release SSA names in different
2066      orders, and when they get reused, subsequent passes end up
2067      getting different SSA names, which may ultimately change
2068      decisions when going out of SSA).  */
2069   imm_store_chain_info *m_stores_head;
2070 
2071   void process_store (gimple *);
2072   bool terminate_and_process_all_chains ();
2073   bool terminate_all_aliasing_chains (imm_store_chain_info **, gimple *);
2074   bool terminate_and_release_chain (imm_store_chain_info *);
2075 }; // class pass_store_merging
2076 
2077 /* Terminate and process all recorded chains.  Return true if any changes
2078    were made.  */
2079 
2080 bool
2081 pass_store_merging::terminate_and_process_all_chains ()
2082 {
2083   bool ret = false;
2084   while (m_stores_head)
2085     ret |= terminate_and_release_chain (m_stores_head);
2086   gcc_assert (m_stores.elements () == 0);
2087   gcc_assert (m_stores_head == NULL);
2088 
2089   return ret;
2090 }
2091 
2092 /* Terminate all chains that are affected by the statement STMT.
2093    CHAIN_INFO is the chain we should ignore from the checks if
2094    non-NULL.  */
2095 
2096 bool
2097 pass_store_merging::terminate_all_aliasing_chains (imm_store_chain_info
2098 						     **chain_info,
2099 						   gimple *stmt)
2100 {
2101   bool ret = false;
2102 
2103   /* If the statement doesn't touch memory it can't alias.  */
2104   if (!gimple_vuse (stmt))
2105     return false;
2106 
2107   tree store_lhs = gimple_store_p (stmt) ? gimple_get_lhs (stmt) : NULL_TREE;
2108   for (imm_store_chain_info *next = m_stores_head, *cur = next; cur; cur = next)
2109     {
2110       next = cur->next;
2111 
2112       /* We already checked all the stores in chain_info and terminated the
2113 	 chain if necessary.  Skip it here.  */
2114       if (chain_info && *chain_info == cur)
2115 	continue;
2116 
2117       store_immediate_info *info;
2118       unsigned int i;
2119       FOR_EACH_VEC_ELT (cur->m_store_info, i, info)
2120 	{
2121 	  tree lhs = gimple_assign_lhs (info->stmt);
2122 	  if (ref_maybe_used_by_stmt_p (stmt, lhs)
2123 	      || stmt_may_clobber_ref_p (stmt, lhs)
2124 	      || (store_lhs && refs_output_dependent_p (store_lhs, lhs)))
2125 	    {
2126 	      if (dump_file && (dump_flags & TDF_DETAILS))
2127 		{
2128 		  fprintf (dump_file, "stmt causes chain termination:\n");
2129 		  print_gimple_stmt (dump_file, stmt, 0);
2130 		}
2131 	      terminate_and_release_chain (cur);
2132 	      ret = true;
2133 	      break;
2134 	    }
2135 	}
2136     }
2137 
2138   return ret;
2139 }
2140 
2141 /* Helper function.  Terminate the recorded chain storing to base object
2142    BASE.  Return true if the merging and output were successful.  The m_stores
2143    entry is removed after the processing in any case.  */
2144 
2145 bool
2146 pass_store_merging::terminate_and_release_chain (imm_store_chain_info *chain_info)
2147 {
2148   bool ret = chain_info->terminate_and_process_chain ();
2149   m_stores.remove (chain_info->base_addr);
2150   delete chain_info;
2151   return ret;
2152 }
2153 
2154 /* Return true if stmts in between FIRST (inclusive) and LAST (exclusive)
2155    may clobber REF.  FIRST and LAST must be in the same basic block and
2156    have non-NULL vdef.  We want to be able to sink load of REF across
2157    stores between FIRST and LAST, up to right before LAST.  */
2158 
2159 bool
2160 stmts_may_clobber_ref_p (gimple *first, gimple *last, tree ref)
2161 {
2162   ao_ref r;
2163   ao_ref_init (&r, ref);
2164   unsigned int count = 0;
2165   tree vop = gimple_vdef (last);
2166   gimple *stmt;
2167 
2168   gcc_checking_assert (gimple_bb (first) == gimple_bb (last));
2169   do
2170     {
2171       stmt = SSA_NAME_DEF_STMT (vop);
2172       if (stmt_may_clobber_ref_p_1 (stmt, &r))
2173 	return true;
2174       if (gimple_store_p (stmt)
2175 	  && refs_anti_dependent_p (ref, gimple_get_lhs (stmt)))
2176 	return true;
2177       /* Avoid quadratic compile time by bounding the number of checks
2178 	 we perform.  */
2179       if (++count > MAX_STORE_ALIAS_CHECKS)
2180 	return true;
2181       vop = gimple_vuse (stmt);
2182     }
2183   while (stmt != first);
2184   return false;
2185 }
2186 
2187 /* Return true if INFO->ops[IDX] is mergeable with the
2188    corresponding loads already in MERGED_STORE group.
2189    BASE_ADDR is the base address of the whole store group.  */
2190 
2191 bool
2192 compatible_load_p (merged_store_group *merged_store,
2193 		   store_immediate_info *info,
2194 		   tree base_addr, int idx)
2195 {
2196   store_immediate_info *infof = merged_store->stores[0];
2197   if (!info->ops[idx].base_addr
2198       || maybe_ne (info->ops[idx].bitpos - infof->ops[idx].bitpos,
2199 		   info->bitpos - infof->bitpos)
2200       || !operand_equal_p (info->ops[idx].base_addr,
2201 			   infof->ops[idx].base_addr, 0))
2202     return false;
2203 
2204   store_immediate_info *infol = merged_store->stores.last ();
2205   tree load_vuse = gimple_vuse (info->ops[idx].stmt);
2206   /* In this case all vuses should be the same, e.g.
2207      _1 = s.a; _2 = s.b; _3 = _1 | 1; t.a = _3; _4 = _2 | 2; t.b = _4;
2208      or
2209      _1 = s.a; _2 = s.b; t.a = _1; t.b = _2;
2210      and we can emit the coalesced load next to any of those loads.  */
2211   if (gimple_vuse (infof->ops[idx].stmt) == load_vuse
2212       && gimple_vuse (infol->ops[idx].stmt) == load_vuse)
2213     return true;
2214 
2215   /* Otherwise, at least for now require that the load has the same
2216      vuse as the store.  See following examples.  */
2217   if (gimple_vuse (info->stmt) != load_vuse)
2218     return false;
2219 
2220   if (gimple_vuse (infof->stmt) != gimple_vuse (infof->ops[idx].stmt)
2221       || (infof != infol
2222 	  && gimple_vuse (infol->stmt) != gimple_vuse (infol->ops[idx].stmt)))
2223     return false;
2224 
2225   /* If the load is from the same location as the store, already
2226      the construction of the immediate chain info guarantees no intervening
2227      stores, so no further checks are needed.  Example:
2228      _1 = s.a; _2 = _1 & -7; s.a = _2; _3 = s.b; _4 = _3 & -7; s.b = _4;  */
2229   if (known_eq (info->ops[idx].bitpos, info->bitpos)
2230       && operand_equal_p (info->ops[idx].base_addr, base_addr, 0))
2231     return true;
2232 
2233   /* Otherwise, we need to punt if any of the loads can be clobbered by any
2234      of the stores in the group, or any other stores in between those.
2235      Previous calls to compatible_load_p ensured that for all the
2236      merged_store->stores IDX loads, no stmts starting with
2237      merged_store->first_stmt and ending right before merged_store->last_stmt
2238      clobbers those loads.  */
2239   gimple *first = merged_store->first_stmt;
2240   gimple *last = merged_store->last_stmt;
2241   unsigned int i;
2242   store_immediate_info *infoc;
2243   /* The stores are sorted by increasing store bitpos, so if info->stmt store
2244      comes before the so far first load, we'll be changing
2245      merged_store->first_stmt.  In that case we need to give up if
2246      any of the earlier processed loads are clobbered by the stmts in the new
2247      range.  */
2248   if (info->order < merged_store->first_order)
2249     {
2250       FOR_EACH_VEC_ELT (merged_store->stores, i, infoc)
2251 	if (stmts_may_clobber_ref_p (info->stmt, first, infoc->ops[idx].val))
2252 	  return false;
2253       first = info->stmt;
2254     }
2255   /* Similarly, we could change merged_store->last_stmt, so ensure
2256      in that case no stmts in the new range clobber any of the earlier
2257      processed loads.  */
2258   else if (info->order > merged_store->last_order)
2259     {
2260       FOR_EACH_VEC_ELT (merged_store->stores, i, infoc)
2261 	if (stmts_may_clobber_ref_p (last, info->stmt, infoc->ops[idx].val))
2262 	  return false;
2263       last = info->stmt;
2264     }
2265   /* And finally, we'd be adding a new load to the set, ensure it isn't
2266      clobbered in the new range.  */
2267   if (stmts_may_clobber_ref_p (first, last, info->ops[idx].val))
2268     return false;
2269 
2270   /* Otherwise, we are looking for:
2271      _1 = s.a; _2 = _1 ^ 15; t.a = _2; _3 = s.b; _4 = _3 ^ 15; t.b = _4;
2272      or
2273      _1 = s.a; t.a = _1; _2 = s.b; t.b = _2;  */
2274   return true;
2275 }
2276 
2277 /* Add all refs loaded to compute VAL to REFS vector.  */
2278 
2279 void
2280 gather_bswap_load_refs (vec<tree> *refs, tree val)
2281 {
2282   if (TREE_CODE (val) != SSA_NAME)
2283     return;
2284 
2285   gimple *stmt = SSA_NAME_DEF_STMT (val);
2286   if (!is_gimple_assign (stmt))
2287     return;
2288 
2289   if (gimple_assign_load_p (stmt))
2290     {
2291       refs->safe_push (gimple_assign_rhs1 (stmt));
2292       return;
2293     }
2294 
2295   switch (gimple_assign_rhs_class (stmt))
2296     {
2297     case GIMPLE_BINARY_RHS:
2298       gather_bswap_load_refs (refs, gimple_assign_rhs2 (stmt));
2299       /* FALLTHRU */
2300     case GIMPLE_UNARY_RHS:
2301       gather_bswap_load_refs (refs, gimple_assign_rhs1 (stmt));
2302       break;
2303     default:
2304       gcc_unreachable ();
2305     }
2306 }
2307 
2308 /* Check if there are any stores in M_STORE_INFO after index I
2309    (where M_STORE_INFO must be sorted by sort_by_bitpos) that overlap
2310    a potential group ending with END that have their order
2311    smaller than LAST_ORDER.  ALL_INTEGER_CST_P is true if
2312    all the stores already merged and the one under consideration
2313    have rhs_code of INTEGER_CST.  Return true if there are no such stores.
2314    Consider:
2315      MEM[(long long int *)p_28] = 0;
2316      MEM[(long long int *)p_28 + 8B] = 0;
2317      MEM[(long long int *)p_28 + 16B] = 0;
2318      MEM[(long long int *)p_28 + 24B] = 0;
2319      _129 = (int) _130;
2320      MEM[(int *)p_28 + 8B] = _129;
2321      MEM[(int *)p_28].a = -1;
2322    We already have
2323      MEM[(long long int *)p_28] = 0;
2324      MEM[(int *)p_28].a = -1;
2325    stmts in the current group and need to consider if it is safe to
2326    add MEM[(long long int *)p_28 + 8B] = 0; store into the same group.
2327    There is an overlap between that store and the MEM[(int *)p_28 + 8B] = _129;
2328    store though, so if we add the MEM[(long long int *)p_28 + 8B] = 0;
2329    into the group and merging of those 3 stores is successful, merged
2330    stmts will be emitted at the latest store from that group, i.e.
2331    LAST_ORDER, which is the MEM[(int *)p_28].a = -1; store.
2332    The MEM[(int *)p_28 + 8B] = _129; store that originally follows
2333    the MEM[(long long int *)p_28 + 8B] = 0; would now be before it,
2334    so we need to refuse merging MEM[(long long int *)p_28 + 8B] = 0;
2335    into the group.  That way it will be its own store group and will
2336    not be touched.  If ALL_INTEGER_CST_P and there are overlapping
2337    INTEGER_CST stores, those are mergeable using merge_overlapping,
2338    so don't return false for those.  */
2339 
2340 static bool
2341 check_no_overlap (vec<store_immediate_info *> m_store_info, unsigned int i,
2342 		  bool all_integer_cst_p, unsigned int last_order,
2343 		  unsigned HOST_WIDE_INT end)
2344 {
2345   unsigned int len = m_store_info.length ();
2346   for (++i; i < len; ++i)
2347     {
2348       store_immediate_info *info = m_store_info[i];
2349       if (info->bitpos >= end)
2350 	break;
2351       if (info->order < last_order
2352 	  && (!all_integer_cst_p || info->rhs_code != INTEGER_CST))
2353 	return false;
2354     }
2355   return true;
2356 }
2357 
2358 /* Return true if m_store_info[first] and at least one following store
2359    form a group that stores a try_size-bit value which is byte swapped
2360    from a memory load or some value, or is the identity of some value.
2361    This uses the bswap pass APIs.  */
2362 
2363 bool
2364 imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store,
2365 					  unsigned int first,
2366 					  unsigned int try_size)
2367 {
2368   unsigned int len = m_store_info.length (), last = first;
2369   unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize;
2370   if (width >= try_size)
2371     return false;
2372   for (unsigned int i = first + 1; i < len; ++i)
2373     {
2374       if (m_store_info[i]->bitpos != m_store_info[first]->bitpos + width
2375 	  || m_store_info[i]->ins_stmt == NULL)
2376 	return false;
2377       width += m_store_info[i]->bitsize;
2378       if (width >= try_size)
2379 	{
2380 	  last = i;
2381 	  break;
2382 	}
2383     }
2384   if (width != try_size)
2385     return false;
2386 
2387   bool allow_unaligned
2388     = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED);
2389   /* Punt if the combined store would not be aligned and we need alignment.  */
2390   if (!allow_unaligned)
2391     {
2392       unsigned int align = merged_store->align;
2393       unsigned HOST_WIDE_INT align_base = merged_store->align_base;
2394       for (unsigned int i = first + 1; i <= last; ++i)
2395 	{
2396 	  unsigned int this_align;
2397 	  unsigned HOST_WIDE_INT align_bitpos = 0;
2398 	  get_object_alignment_1 (gimple_assign_lhs (m_store_info[i]->stmt),
2399 				  &this_align, &align_bitpos);
2400 	  if (this_align > align)
2401 	    {
2402 	      align = this_align;
2403 	      align_base = m_store_info[i]->bitpos - align_bitpos;
2404 	    }
2405 	}
2406       unsigned HOST_WIDE_INT align_bitpos
2407 	= (m_store_info[first]->bitpos - align_base) & (align - 1);
2408       if (align_bitpos)
2409 	align = least_bit_hwi (align_bitpos);
2410       if (align < try_size)
2411 	return false;
2412     }
2413 
2414   tree type;
2415   switch (try_size)
2416     {
2417     case 16: type = uint16_type_node; break;
2418     case 32: type = uint32_type_node; break;
2419     case 64: type = uint64_type_node; break;
2420     default: gcc_unreachable ();
2421     }
2422   struct symbolic_number n;
2423   gimple *ins_stmt = NULL;
2424   int vuse_store = -1;
2425   unsigned int first_order = merged_store->first_order;
2426   unsigned int last_order = merged_store->last_order;
2427   gimple *first_stmt = merged_store->first_stmt;
2428   gimple *last_stmt = merged_store->last_stmt;
2429   unsigned HOST_WIDE_INT end = merged_store->start + merged_store->width;
2430   store_immediate_info *infof = m_store_info[first];
2431 
2432   for (unsigned int i = first; i <= last; ++i)
2433     {
2434       store_immediate_info *info = m_store_info[i];
2435       struct symbolic_number this_n = info->n;
2436       this_n.type = type;
2437       if (!this_n.base_addr)
2438 	this_n.range = try_size / BITS_PER_UNIT;
2439       else
2440 	/* Update vuse in case it has been changed by output_merged_stores.  */
2441 	this_n.vuse = gimple_vuse (info->ins_stmt);
2442       unsigned int bitpos = info->bitpos - infof->bitpos;
2443       if (!do_shift_rotate (LSHIFT_EXPR, &this_n,
2444 			    BYTES_BIG_ENDIAN
2445 			    ? try_size - info->bitsize - bitpos
2446 			    : bitpos))
2447 	return false;
2448       if (this_n.base_addr && vuse_store)
2449 	{
2450 	  unsigned int j;
2451 	  for (j = first; j <= last; ++j)
2452 	    if (this_n.vuse == gimple_vuse (m_store_info[j]->stmt))
2453 	      break;
2454 	  if (j > last)
2455 	    {
2456 	      if (vuse_store == 1)
2457 		return false;
2458 	      vuse_store = 0;
2459 	    }
2460 	}
2461       if (i == first)
2462 	{
2463 	  n = this_n;
2464 	  ins_stmt = info->ins_stmt;
2465 	}
2466       else
2467 	{
2468 	  if (n.base_addr && n.vuse != this_n.vuse)
2469 	    {
2470 	      if (vuse_store == 0)
2471 		return false;
2472 	      vuse_store = 1;
2473 	    }
2474 	  if (info->order > last_order)
2475 	    {
2476 	      last_order = info->order;
2477 	      last_stmt = info->stmt;
2478 	    }
2479 	  else if (info->order < first_order)
2480 	    {
2481 	      first_order = info->order;
2482 	      first_stmt = info->stmt;
2483 	    }
2484 	  end = MAX (end, info->bitpos + info->bitsize);
2485 
2486 	  ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
2487 					     &this_n, &n);
2488 	  if (ins_stmt == NULL)
2489 	    return false;
2490 	}
2491     }
2492 
2493   uint64_t cmpxchg, cmpnop;
2494   find_bswap_or_nop_finalize (&n, &cmpxchg, &cmpnop);
2495 
2496   /* A complete byte swap should make the symbolic number start with
2497      the largest digit in the highest order byte.  An unchanged symbolic
2498      number indicates a read with the same endianness as the target.  */
2499   if (n.n != cmpnop && n.n != cmpxchg)
2500     return false;
2501 
2502   if (n.base_addr == NULL_TREE && !is_gimple_val (n.src))
2503     return false;
2504 
2505   if (!check_no_overlap (m_store_info, last, false, last_order, end))
2506     return false;
2507 
2508   /* Don't handle memory copy this way if normal non-bswap processing
2509      would handle it too.  */
2510   if (n.n == cmpnop && (unsigned) n.n_ops == last - first + 1)
2511     {
2512       unsigned int i;
2513       for (i = first; i <= last; ++i)
2514 	if (m_store_info[i]->rhs_code != MEM_REF)
2515 	  break;
2516       if (i == last + 1)
2517 	return false;
2518     }
2519 
2520   if (n.n == cmpxchg)
2521     switch (try_size)
2522       {
2523       case 16:
2524 	/* Will emit LROTATE_EXPR.  */
2525 	break;
2526       case 32:
2527 	if (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2528 	    && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)
2529 	  break;
2530 	return false;
2531       case 64:
2532 	if (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2533 	    && optab_handler (bswap_optab, DImode) != CODE_FOR_nothing)
2534 	  break;
2535 	return false;
2536       default:
2537 	gcc_unreachable ();
2538       }
2539 
2540   if (!allow_unaligned && n.base_addr)
2541     {
2542       unsigned int align = get_object_alignment (n.src);
2543       if (align < try_size)
2544 	return false;
2545     }
2546 
2547   /* If each load has vuse of the corresponding store, need to verify
2548      the loads can be sunk right before the last store.  */
2549   if (vuse_store == 1)
2550     {
2551       auto_vec<tree, 64> refs;
2552       for (unsigned int i = first; i <= last; ++i)
2553 	gather_bswap_load_refs (&refs,
2554 				gimple_assign_rhs1 (m_store_info[i]->stmt));
2555 
2556       unsigned int i;
2557       tree ref;
2558       FOR_EACH_VEC_ELT (refs, i, ref)
2559 	if (stmts_may_clobber_ref_p (first_stmt, last_stmt, ref))
2560 	  return false;
2561       n.vuse = NULL_TREE;
2562     }
2563 
2564   infof->n = n;
2565   infof->ins_stmt = ins_stmt;
2566   for (unsigned int i = first; i <= last; ++i)
2567     {
2568       m_store_info[i]->rhs_code = n.n == cmpxchg ? LROTATE_EXPR : NOP_EXPR;
2569       m_store_info[i]->ops[0].base_addr = NULL_TREE;
2570       m_store_info[i]->ops[1].base_addr = NULL_TREE;
2571       if (i != first)
2572 	merged_store->merge_into (m_store_info[i]);
2573     }
2574 
2575   return true;
2576 }
2577 
2578 /* Go through the candidate stores recorded in m_store_info and merge them
2579    into merged_store_group objects recorded into m_merged_store_groups
2580    representing the widened stores.  Return true if coalescing was successful
2581    and the number of widened stores is fewer than the original number
2582    of stores.  */
2583 
2584 bool
2585 imm_store_chain_info::coalesce_immediate_stores ()
2586 {
2587   /* Anything less can't be processed.  */
2588   if (m_store_info.length () < 2)
2589     return false;
2590 
2591   if (dump_file && (dump_flags & TDF_DETAILS))
2592     fprintf (dump_file, "Attempting to coalesce %u stores in chain.\n",
2593 	     m_store_info.length ());
2594 
2595   store_immediate_info *info;
2596   unsigned int i, ignore = 0;
2597 
2598   /* Order the stores by the bitposition they write to.  */
2599   m_store_info.qsort (sort_by_bitpos);
2600 
2601   info = m_store_info[0];
2602   merged_store_group *merged_store = new merged_store_group (info);
2603 
2604   FOR_EACH_VEC_ELT (m_store_info, i, info)
2605     {
2606       if (dump_file && (dump_flags & TDF_DETAILS))
2607 	{
2608 	  fprintf (dump_file, "Store %u:\nbitsize:" HOST_WIDE_INT_PRINT_DEC
2609 			      " bitpos:" HOST_WIDE_INT_PRINT_DEC " val:\n",
2610 		   i, info->bitsize, info->bitpos);
2611 	  print_generic_expr (dump_file, gimple_assign_rhs1 (info->stmt));
2612 	  fprintf (dump_file, "\n------------\n");
2613 	}
2614 
2615       if (i <= ignore)
2616 	continue;
2617 
2618       /* First try to handle group of stores like:
2619 	 p[0] = data >> 24;
2620 	 p[1] = data >> 16;
2621 	 p[2] = data >> 8;
2622 	 p[3] = data;
2623 	 using the bswap framework.  */
2624       if (info->bitpos == merged_store->start + merged_store->width
2625 	  && merged_store->stores.length () == 1
2626 	  && merged_store->stores[0]->ins_stmt != NULL
2627 	  && info->ins_stmt != NULL)
2628 	{
2629 	  unsigned int try_size;
2630 	  for (try_size = 64; try_size >= 16; try_size >>= 1)
2631 	    if (try_coalesce_bswap (merged_store, i - 1, try_size))
2632 	      break;
2633 
2634 	  if (try_size >= 16)
2635 	    {
2636 	      ignore = i + merged_store->stores.length () - 1;
2637 	      m_merged_store_groups.safe_push (merged_store);
2638 	      if (ignore < m_store_info.length ())
2639 		merged_store = new merged_store_group (m_store_info[ignore]);
2640 	      else
2641 		merged_store = NULL;
2642 	      continue;
2643 	    }
2644 	}
2645 
2646       if (info->order >= merged_store->first_nonmergeable_order)
2647 	;
2648 
2649       /* |---store 1---|
2650 	       |---store 2---|
2651 	 Overlapping stores.  */
2652       else if (IN_RANGE (info->bitpos, merged_store->start,
2653 			 merged_store->start + merged_store->width - 1)
2654 	       /* |---store 1---||---store 2---|
2655 		  Handle also the consecutive INTEGER_CST stores case here,
2656 		  as we have here the code to deal with overlaps.  */
2657 	       || (info->bitregion_start <= merged_store->bitregion_end
2658 		   && info->rhs_code == INTEGER_CST
2659 		   && merged_store->stores[0]->rhs_code == INTEGER_CST))
2660 	{
2661 	  /* Only allow overlapping stores of constants.  */
2662 	  if (info->rhs_code == INTEGER_CST
2663 	      && merged_store->stores[0]->rhs_code == INTEGER_CST)
2664 	    {
2665 	      unsigned int last_order
2666 		= MAX (merged_store->last_order, info->order);
2667 	      unsigned HOST_WIDE_INT end
2668 		= MAX (merged_store->start + merged_store->width,
2669 		       info->bitpos + info->bitsize);
2670 	      if (check_no_overlap (m_store_info, i, true, last_order, end))
2671 		{
2672 		  /* check_no_overlap call above made sure there are no
2673 		     overlapping stores with non-INTEGER_CST rhs_code
2674 		     in between the first and last of the stores we've
2675 		     just merged.  If there are any INTEGER_CST rhs_code
2676 		     stores in between, we need to merge_overlapping them
2677 		     even if in the sort_by_bitpos order there are other
2678 		     overlapping stores in between.  Keep those stores as is.
2679 		     Example:
2680 			MEM[(int *)p_28] = 0;
2681 			MEM[(char *)p_28 + 3B] = 1;
2682 			MEM[(char *)p_28 + 1B] = 2;
2683 			MEM[(char *)p_28 + 2B] = MEM[(char *)p_28 + 6B];
2684 		     We can't merge the zero store with the store of two and
2685 		     not merge anything else, because the store of one is
2686 		     in the original order in between those two, but in
2687 		     store_by_bitpos order it comes after the last store that
2688 		     we can't merge with them.  We can merge the first 3 stores
2689 		     and keep the last store as is though.  */
2690 		  unsigned int len = m_store_info.length ();
2691 		  unsigned int try_order = last_order;
2692 		  unsigned int first_nonmergeable_order;
2693 		  unsigned int k;
2694 		  bool last_iter = false;
2695 		  int attempts = 0;
2696 		  do
2697 		    {
2698 		      unsigned int max_order = 0;
2699 		      unsigned first_nonmergeable_int_order = ~0U;
2700 		      unsigned HOST_WIDE_INT this_end = end;
2701 		      k = i;
2702 		      first_nonmergeable_order = ~0U;
2703 		      for (unsigned int j = i + 1; j < len; ++j)
2704 			{
2705 			  store_immediate_info *info2 = m_store_info[j];
2706 			  if (info2->bitpos >= this_end)
2707 			    break;
2708 			  if (info2->order < try_order)
2709 			    {
2710 			      if (info2->rhs_code != INTEGER_CST)
2711 				{
2712 				  /* Normally check_no_overlap makes sure this
2713 				     doesn't happen, but if end grows below,
2714 				     then we need to process more stores than
2715 				     check_no_overlap verified.  Example:
2716 				      MEM[(int *)p_5] = 0;
2717 				      MEM[(short *)p_5 + 3B] = 1;
2718 				      MEM[(char *)p_5 + 4B] = _9;
2719 				      MEM[(char *)p_5 + 2B] = 2;  */
2720 				  k = 0;
2721 				  break;
2722 				}
2723 			      k = j;
2724 			      this_end = MAX (this_end,
2725 					      info2->bitpos + info2->bitsize);
2726 			    }
2727 			  else if (info2->rhs_code == INTEGER_CST
2728 				   && !last_iter)
2729 			    {
2730 			      max_order = MAX (max_order, info2->order + 1);
2731 			      first_nonmergeable_int_order
2732 				= MIN (first_nonmergeable_int_order,
2733 				       info2->order);
2734 			    }
2735 			  else
2736 			    first_nonmergeable_order
2737 			      = MIN (first_nonmergeable_order, info2->order);
2738 			}
2739 		      if (k == 0)
2740 			{
2741 			  if (last_order == try_order)
2742 			    break;
2743 			  /* If this failed, but only because we grew
2744 			     try_order, retry with the last working one,
2745 			     so that we merge at least something.  */
2746 			  try_order = last_order;
2747 			  last_iter = true;
2748 			  continue;
2749 			}
2750 		      last_order = try_order;
2751 		      /* Retry with a larger try_order to see if we could
2752 			 merge some further INTEGER_CST stores.  */
2753 		      if (max_order
2754 			  && (first_nonmergeable_int_order
2755 			      < first_nonmergeable_order))
2756 			{
2757 			  try_order = MIN (max_order,
2758 					   first_nonmergeable_order);
2759 			  try_order
2760 			    = MIN (try_order,
2761 				   merged_store->first_nonmergeable_order);
2762 			  if (try_order > last_order && ++attempts < 16)
2763 			    continue;
2764 			}
2765 		      first_nonmergeable_order
2766 			= MIN (first_nonmergeable_order,
2767 			       first_nonmergeable_int_order);
2768 		      end = this_end;
2769 		      break;
2770 		    }
2771 		  while (1);
2772 
2773 		  if (k != 0)
2774 		    {
2775 		      merged_store->merge_overlapping (info);
2776 
2777 		      merged_store->first_nonmergeable_order
2778 			= MIN (merged_store->first_nonmergeable_order,
2779 			       first_nonmergeable_order);
2780 
2781 		      for (unsigned int j = i + 1; j <= k; j++)
2782 			{
2783 			  store_immediate_info *info2 = m_store_info[j];
2784 			  gcc_assert (info2->bitpos < end);
2785 			  if (info2->order < last_order)
2786 			    {
2787 			      gcc_assert (info2->rhs_code == INTEGER_CST);
2788 			      if (info != info2)
2789 				merged_store->merge_overlapping (info2);
2790 			    }
2791 			  /* Other stores are kept and not merged in any
2792 			     way.  */
2793 			}
2794 		      ignore = k;
2795 		      continue;
2796 		    }
2797 		}
2798 	    }
2799 	}
2800       /* |---store 1---||---store 2---|
2801 	 This store is consecutive to the previous one.
2802 	 Merge it into the current store group.  There can be gaps in between
2803 	 the stores, but there can't be gaps in between bitregions.  */
2804       else if (info->rhs_code != LROTATE_EXPR
2805 	       && info->bitregion_start <= merged_store->bitregion_end
2806 	       && info->rhs_code == merged_store->stores[0]->rhs_code)
2807 	{
2808 	  store_immediate_info *infof = merged_store->stores[0];
2809 
2810 	  /* All the rhs_code ops that take 2 operands are commutative,
2811 	     swap the operands if it could make the operands compatible.  */
2812 	  if (infof->ops[0].base_addr
2813 	      && infof->ops[1].base_addr
2814 	      && info->ops[0].base_addr
2815 	      && info->ops[1].base_addr
2816 	      && known_eq (info->ops[1].bitpos - infof->ops[0].bitpos,
2817 			   info->bitpos - infof->bitpos)
2818 	      && operand_equal_p (info->ops[1].base_addr,
2819 				  infof->ops[0].base_addr, 0))
2820 	    {
2821 	      std::swap (info->ops[0], info->ops[1]);
2822 	      info->ops_swapped_p = true;
2823 	    }
2824 	  if ((infof->ops[0].base_addr
2825 	       ? compatible_load_p (merged_store, info, base_addr, 0)
2826 	       : !info->ops[0].base_addr)
2827 	      && (infof->ops[1].base_addr
2828 		  ? compatible_load_p (merged_store, info, base_addr, 1)
2829 		  : !info->ops[1].base_addr)
2830 	      && check_no_overlap (m_store_info, i, false,
2831 				   MAX (merged_store->last_order,
2832 					info->order),
2833 				   MAX (merged_store->start
2834 					+ merged_store->width,
2835 					info->bitpos + info->bitsize)))
2836 	    {
2837 	      merged_store->merge_into (info);
2838 	      continue;
2839 	    }
2840 	}
2841 
2842       /* |---store 1---| <gap> |---store 2---|.
2843 	 Gap between stores or the rhs not compatible.  Start a new group.  */
2844 
2845       /* Try to apply all the stores recorded for the group to determine
2846 	 the bitpattern they write and discard it if that fails.
2847 	 This will also reject single-store groups.  */
2848       if (!merged_store->apply_stores ())
2849 	delete merged_store;
2850       else
2851 	m_merged_store_groups.safe_push (merged_store);
2852 
2853       merged_store = new merged_store_group (info);
2854     }
2855 
2856   /* Record or discard the last store group.  */
2857   if (merged_store)
2858     {
2859       if (!merged_store->apply_stores ())
2860 	delete merged_store;
2861       else
2862 	m_merged_store_groups.safe_push (merged_store);
2863     }
2864 
2865   gcc_assert (m_merged_store_groups.length () <= m_store_info.length ());
2866   bool success
2867     = !m_merged_store_groups.is_empty ()
2868       && m_merged_store_groups.length () < m_store_info.length ();
2869 
2870   if (success && dump_file)
2871     fprintf (dump_file, "Coalescing successful!\n"
2872 			"Merged into %u stores\n",
2873 	     m_merged_store_groups.length ());
2874 
2875   return success;
2876 }
2877 
2878 /* Return the type to use for the merged stores or loads described by STMTS.
2879    This is needed to get the alias sets right.  If IS_LOAD, look for rhs,
2880    otherwise lhs.  Additionally set *CLIQUEP and *BASEP to MR_DEPENDENCE_*
2881    of the MEM_REFs if any.  */
2882 
2883 static tree
2884 get_alias_type_for_stmts (vec<gimple *> &stmts, bool is_load,
2885 			  unsigned short *cliquep, unsigned short *basep)
2886 {
2887   gimple *stmt;
2888   unsigned int i;
2889   tree type = NULL_TREE;
2890   tree ret = NULL_TREE;
2891   *cliquep = 0;
2892   *basep = 0;
2893 
2894   FOR_EACH_VEC_ELT (stmts, i, stmt)
2895     {
2896       tree ref = is_load ? gimple_assign_rhs1 (stmt)
2897 			 : gimple_assign_lhs (stmt);
2898       tree type1 = reference_alias_ptr_type (ref);
2899       tree base = get_base_address (ref);
2900 
2901       if (i == 0)
2902 	{
2903 	  if (TREE_CODE (base) == MEM_REF)
2904 	    {
2905 	      *cliquep = MR_DEPENDENCE_CLIQUE (base);
2906 	      *basep = MR_DEPENDENCE_BASE (base);
2907 	    }
2908 	  ret = type = type1;
2909 	  continue;
2910 	}
2911       if (!alias_ptr_types_compatible_p (type, type1))
2912 	ret = ptr_type_node;
2913       if (TREE_CODE (base) != MEM_REF
2914 	  || *cliquep != MR_DEPENDENCE_CLIQUE (base)
2915 	  || *basep != MR_DEPENDENCE_BASE (base))
2916 	{
2917 	  *cliquep = 0;
2918 	  *basep = 0;
2919 	}
2920     }
2921   return ret;
2922 }
2923 
2924 /* Return the location_t information we can find among the statements
2925    in STMTS.  */
2926 
2927 static location_t
2928 get_location_for_stmts (vec<gimple *> &stmts)
2929 {
2930   gimple *stmt;
2931   unsigned int i;
2932 
2933   FOR_EACH_VEC_ELT (stmts, i, stmt)
2934     if (gimple_has_location (stmt))
2935       return gimple_location (stmt);
2936 
2937   return UNKNOWN_LOCATION;
2938 }
2939 
2940 /* Used to describe a store resulting from splitting a wide store into
2941    smaller regularly-sized stores in split_group.  */
2942 
2943 struct split_store
2944 {
2945   unsigned HOST_WIDE_INT bytepos;
2946   unsigned HOST_WIDE_INT size;
2947   unsigned HOST_WIDE_INT align;
2948   auto_vec<store_immediate_info *> orig_stores;
2949   /* True if there is a single orig stmt covering the whole split store.  */
2950   bool orig;
2951   split_store (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
2952 	       unsigned HOST_WIDE_INT);
2953 };
2954 
2955 /* Simple constructor.  */
2956 
2957 split_store::split_store (unsigned HOST_WIDE_INT bp,
2958 			  unsigned HOST_WIDE_INT sz,
2959 			  unsigned HOST_WIDE_INT al)
2960 			  : bytepos (bp), size (sz), align (al), orig (false)
2961 {
2962   orig_stores.create (0);
2963 }
2964 
2965 /* Record all stores in GROUP that write to the region that starts at BITPOS
2966    and is BITSIZE bits wide.  Record infos for such statements in STORES if
2967    non-NULL.  The stores in GROUP must be sorted by bitposition.  Return INFO
2968    if there is exactly one original store in the range.  */
2969 
2970 static store_immediate_info *
2971 find_constituent_stores (struct merged_store_group *group,
2972 			 vec<store_immediate_info *> *stores,
2973 			 unsigned int *first,
2974 			 unsigned HOST_WIDE_INT bitpos,
2975 			 unsigned HOST_WIDE_INT bitsize)
2976 {
2977   store_immediate_info *info, *ret = NULL;
2978   unsigned int i;
2979   bool second = false;
2980   bool update_first = true;
2981   unsigned HOST_WIDE_INT end = bitpos + bitsize;
2982   for (i = *first; group->stores.iterate (i, &info); ++i)
2983     {
2984       unsigned HOST_WIDE_INT stmt_start = info->bitpos;
2985       unsigned HOST_WIDE_INT stmt_end = stmt_start + info->bitsize;
2986       if (stmt_end <= bitpos)
2987 	{
2988 	  /* BITPOS passed to this function never decreases from within the
2989 	     same split_group call, so optimize and don't scan info records
2990 	     which are known to end before or at BITPOS next time.
2991 	     Only do it if all stores before this one also pass this.  */
2992 	  if (update_first)
2993 	    *first = i + 1;
2994 	  continue;
2995 	}
2996       else
2997 	update_first = false;
2998 
2999       /* The stores in GROUP are ordered by bitposition so if we're past
3000 	 the region for this group return early.  */
3001       if (stmt_start >= end)
3002 	return ret;
3003 
3004       if (stores)
3005 	{
3006 	  stores->safe_push (info);
3007 	  if (ret)
3008 	    {
3009 	      ret = NULL;
3010 	      second = true;
3011 	    }
3012 	}
3013       else if (ret)
3014 	return NULL;
3015       if (!second)
3016 	ret = info;
3017     }
3018   return ret;
3019 }
3020 
3021 /* Return how many SSA_NAMEs used to compute the value to store in the INFO
3022    store have multiple uses.  If any SSA_NAME has multiple uses, also
3023    count statements needed to compute it.  */
3024 
3025 static unsigned
3026 count_multiple_uses (store_immediate_info *info)
3027 {
3028   gimple *stmt = info->stmt;
3029   unsigned ret = 0;
3030   switch (info->rhs_code)
3031     {
3032     case INTEGER_CST:
3033       return 0;
3034     case BIT_AND_EXPR:
3035     case BIT_IOR_EXPR:
3036     case BIT_XOR_EXPR:
3037       if (info->bit_not_p)
3038 	{
3039 	  if (!has_single_use (gimple_assign_rhs1 (stmt)))
3040 	    ret = 1; /* Fall through below to return
3041 			the BIT_NOT_EXPR stmt and then
3042 			BIT_{AND,IOR,XOR}_EXPR and anything it
3043 			uses.  */
3044 	  else
3045 	    /* After this, stmt is the BIT_NOT_EXPR.  */
3046 	    stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
3047 	}
3048       if (!has_single_use (gimple_assign_rhs1 (stmt)))
3049 	{
3050 	  ret += 1 + info->ops[0].bit_not_p;
3051 	  if (info->ops[1].base_addr)
3052 	    ret += 1 + info->ops[1].bit_not_p;
3053 	  return ret + 1;
3054 	}
3055       stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
3056       /* stmt is now the BIT_*_EXPR.  */
3057       if (!has_single_use (gimple_assign_rhs1 (stmt)))
3058 	ret += 1 + info->ops[info->ops_swapped_p].bit_not_p;
3059       else if (info->ops[info->ops_swapped_p].bit_not_p)
3060 	{
3061 	  gimple *stmt2 = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
3062 	  if (!has_single_use (gimple_assign_rhs1 (stmt2)))
3063 	    ++ret;
3064 	}
3065       if (info->ops[1].base_addr == NULL_TREE)
3066 	{
3067 	  gcc_checking_assert (!info->ops_swapped_p);
3068 	  return ret;
3069 	}
3070       if (!has_single_use (gimple_assign_rhs2 (stmt)))
3071 	ret += 1 + info->ops[1 - info->ops_swapped_p].bit_not_p;
3072       else if (info->ops[1 - info->ops_swapped_p].bit_not_p)
3073 	{
3074 	  gimple *stmt2 = SSA_NAME_DEF_STMT (gimple_assign_rhs2 (stmt));
3075 	  if (!has_single_use (gimple_assign_rhs1 (stmt2)))
3076 	    ++ret;
3077 	}
3078       return ret;
3079     case MEM_REF:
3080       if (!has_single_use (gimple_assign_rhs1 (stmt)))
3081 	return 1 + info->ops[0].bit_not_p;
3082       else if (info->ops[0].bit_not_p)
3083 	{
3084 	  stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
3085 	  if (!has_single_use (gimple_assign_rhs1 (stmt)))
3086 	    return 1;
3087 	}
3088       return 0;
3089     default:
3090       gcc_unreachable ();
3091     }
3092 }
3093 
3094 /* Split a merged store described by GROUP by populating the SPLIT_STORES
3095    vector (if non-NULL) with split_store structs describing the byte offset
3096    (from the base), the bit size and alignment of each store as well as the
3097    original statements involved in each such split group.
3098    This is to separate the splitting strategy from the statement
3099    building/emission/linking done in output_merged_store.
3100    Return number of new stores.
3101    If ALLOW_UNALIGNED_STORE is false, then all stores must be aligned.
3102    If ALLOW_UNALIGNED_LOAD is false, then all loads must be aligned.
3103    If SPLIT_STORES is NULL, it is just a dry run to count number of
3104    new stores.  */
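
/* A rough sketch of the splitting, assuming 8-bit bytes and a 32-bit
   MAX_STORE_BITSIZE: a group whose bit region covers 6 bytes would
   typically be split into a 4-byte store followed by a 2-byte store
   when unaligned stores are allowed, and into smaller suitably aligned
   pieces otherwise.  Bytes whose mask is all ones are padding; runs of
   them are skipped entirely or used to shrink a candidate store.  */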
3105 
3106 static unsigned int
3107 split_group (merged_store_group *group, bool allow_unaligned_store,
3108 	     bool allow_unaligned_load,
3109 	     vec<struct split_store *> *split_stores,
3110 	     unsigned *total_orig,
3111 	     unsigned *total_new)
3112 {
3113   unsigned HOST_WIDE_INT pos = group->bitregion_start;
3114   unsigned HOST_WIDE_INT size = group->bitregion_end - pos;
3115   unsigned HOST_WIDE_INT bytepos = pos / BITS_PER_UNIT;
3116   unsigned HOST_WIDE_INT group_align = group->align;
3117   unsigned HOST_WIDE_INT align_base = group->align_base;
3118   unsigned HOST_WIDE_INT group_load_align = group_align;
3119   bool any_orig = false;
3120 
3121   gcc_assert ((size % BITS_PER_UNIT == 0) && (pos % BITS_PER_UNIT == 0));
3122 
3123   if (group->stores[0]->rhs_code == LROTATE_EXPR
3124       || group->stores[0]->rhs_code == NOP_EXPR)
3125     {
3126       /* For the bswap framework using sets of stores, all the checking
3127 	 has been done earlier in try_coalesce_bswap and the group needs
3128 	 to be emitted as a single store.  */
3129       if (total_orig)
3130 	{
3131 	  /* Avoid the old/new stmt count heuristics.  It should always
3132 	     be beneficial.  */
3133 	  total_new[0] = 1;
3134 	  total_orig[0] = 2;
3135 	}
3136 
3137       if (split_stores)
3138 	{
3139 	  unsigned HOST_WIDE_INT align_bitpos
3140 	    = (group->start - align_base) & (group_align - 1);
3141 	  unsigned HOST_WIDE_INT align = group_align;
3142 	  if (align_bitpos)
3143 	    align = least_bit_hwi (align_bitpos);
3144 	  bytepos = group->start / BITS_PER_UNIT;
3145 	  struct split_store *store
3146 	    = new split_store (bytepos, group->width, align);
3147 	  unsigned int first = 0;
3148 	  find_constituent_stores (group, &store->orig_stores,
3149 				   &first, group->start, group->width);
3150 	  split_stores->safe_push (store);
3151 	}
3152 
3153       return 1;
3154     }
3155 
3156   unsigned int ret = 0, first = 0;
3157   unsigned HOST_WIDE_INT try_pos = bytepos;
3158 
3159   if (total_orig)
3160     {
3161       unsigned int i;
3162       store_immediate_info *info = group->stores[0];
3163 
3164       total_new[0] = 0;
3165       total_orig[0] = 1; /* The orig store.  */
3166       info = group->stores[0];
3167       if (info->ops[0].base_addr)
3168 	total_orig[0]++;
3169       if (info->ops[1].base_addr)
3170 	total_orig[0]++;
3171       switch (info->rhs_code)
3172 	{
3173 	case BIT_AND_EXPR:
3174 	case BIT_IOR_EXPR:
3175 	case BIT_XOR_EXPR:
3176 	  total_orig[0]++; /* The orig BIT_*_EXPR stmt.  */
3177 	  break;
3178 	default:
3179 	  break;
3180 	}
3181       total_orig[0] *= group->stores.length ();
3182 
3183       FOR_EACH_VEC_ELT (group->stores, i, info)
3184 	{
3185 	  total_new[0] += count_multiple_uses (info);
3186 	  total_orig[0] += (info->bit_not_p
3187 			    + info->ops[0].bit_not_p
3188 			    + info->ops[1].bit_not_p);
3189 	}
3190     }
3191 
3192   if (!allow_unaligned_load)
3193     for (int i = 0; i < 2; ++i)
3194       if (group->load_align[i])
3195 	group_load_align = MIN (group_load_align, group->load_align[i]);
3196 
3197   while (size > 0)
3198     {
3199       if ((allow_unaligned_store || group_align <= BITS_PER_UNIT)
3200 	  && group->mask[try_pos - bytepos] == (unsigned char) ~0U)
3201 	{
3202 	  /* Skip padding bytes.  */
3203 	  ++try_pos;
3204 	  size -= BITS_PER_UNIT;
3205 	  continue;
3206 	}
3207 
3208       unsigned HOST_WIDE_INT try_bitpos = try_pos * BITS_PER_UNIT;
3209       unsigned int try_size = MAX_STORE_BITSIZE, nonmasked;
3210       unsigned HOST_WIDE_INT align_bitpos
3211 	= (try_bitpos - align_base) & (group_align - 1);
3212       unsigned HOST_WIDE_INT align = group_align;
3213       if (align_bitpos)
3214 	align = least_bit_hwi (align_bitpos);
3215       if (!allow_unaligned_store)
3216 	try_size = MIN (try_size, align);
3217       if (!allow_unaligned_load)
3218 	{
3219 	  /* If we can't do or don't want to do unaligned stores
3220 	     as well as loads, we need to take the loads into account
3221 	     as well.  */
3222 	  unsigned HOST_WIDE_INT load_align = group_load_align;
3223 	  align_bitpos = (try_bitpos - align_base) & (load_align - 1);
3224 	  if (align_bitpos)
3225 	    load_align = least_bit_hwi (align_bitpos);
3226 	  for (int i = 0; i < 2; ++i)
3227 	    if (group->load_align[i])
3228 	      {
3229 		align_bitpos
3230 		  = known_alignment (try_bitpos
3231 				     - group->stores[0]->bitpos
3232 				     + group->stores[0]->ops[i].bitpos
3233 				     - group->load_align_base[i]);
3234 		if (align_bitpos & (group_load_align - 1))
3235 		  {
3236 		    unsigned HOST_WIDE_INT a = least_bit_hwi (align_bitpos);
3237 		    load_align = MIN (load_align, a);
3238 		  }
3239 	      }
3240 	  try_size = MIN (try_size, load_align);
3241 	}
3242       store_immediate_info *info
3243 	= find_constituent_stores (group, NULL, &first, try_bitpos, try_size);
3244       if (info)
3245 	{
3246 	  /* If there is just one original statement for the range, see if
3247 	     we can just reuse the original store which could be even larger
3248 	     than try_size.  */
3249 	  unsigned HOST_WIDE_INT stmt_end
3250 	    = ROUND_UP (info->bitpos + info->bitsize, BITS_PER_UNIT);
3251 	  info = find_constituent_stores (group, NULL, &first, try_bitpos,
3252 					  stmt_end - try_bitpos);
3253 	  if (info && info->bitpos >= try_bitpos)
3254 	    {
3255 	      try_size = stmt_end - try_bitpos;
3256 	      goto found;
3257 	    }
3258 	}
3259 
3260       /* Approximate store bitsize for the case when there are no padding
3261 	 bits.  */
3262       while (try_size > size)
3263 	try_size /= 2;
3264       /* Now look for whole padding bytes at the end of that bitsize.  */
3265       for (nonmasked = try_size / BITS_PER_UNIT; nonmasked > 0; --nonmasked)
3266 	if (group->mask[try_pos - bytepos + nonmasked - 1]
3267 	    != (unsigned char) ~0U)
3268 	  break;
3269       if (nonmasked == 0)
3270 	{
3271 	  /* If entire try_size range is padding, skip it.  */
3272 	  try_pos += try_size / BITS_PER_UNIT;
3273 	  size -= try_size;
3274 	  continue;
3275 	}
3276       /* Otherwise try to decrease try_size if second half, last 3 quarters
3277 	 etc. are padding.  */
3278       nonmasked *= BITS_PER_UNIT;
3279       while (nonmasked <= try_size / 2)
3280 	try_size /= 2;
3281       if (!allow_unaligned_store && group_align > BITS_PER_UNIT)
3282 	{
3283 	  /* Now look for whole padding bytes at the start of that bitsize.  */
3284 	  unsigned int try_bytesize = try_size / BITS_PER_UNIT, masked;
3285 	  for (masked = 0; masked < try_bytesize; ++masked)
3286 	    if (group->mask[try_pos - bytepos + masked] != (unsigned char) ~0U)
3287 	      break;
3288 	  masked *= BITS_PER_UNIT;
3289 	  gcc_assert (masked < try_size);
3290 	  if (masked >= try_size / 2)
3291 	    {
3292 	      while (masked >= try_size / 2)
3293 		{
3294 		  try_size /= 2;
3295 		  try_pos += try_size / BITS_PER_UNIT;
3296 		  size -= try_size;
3297 		  masked -= try_size;
3298 		}
3299 	      /* Need to recompute the alignment, so just retry at the new
3300 		 position.  */
3301 	      continue;
3302 	    }
3303 	}
3304 
3305     found:
3306       ++ret;
3307 
3308       if (split_stores)
3309 	{
3310 	  struct split_store *store
3311 	    = new split_store (try_pos, try_size, align);
3312 	  info = find_constituent_stores (group, &store->orig_stores,
3313 					  &first, try_bitpos, try_size);
3314 	  if (info
3315 	      && info->bitpos >= try_bitpos
3316 	      && info->bitpos + info->bitsize <= try_bitpos + try_size)
3317 	    {
3318 	      store->orig = true;
3319 	      any_orig = true;
3320 	    }
3321 	  split_stores->safe_push (store);
3322 	}
3323 
3324       try_pos += try_size / BITS_PER_UNIT;
3325       size -= try_size;
3326     }
3327 
3328   if (total_orig)
3329     {
3330       unsigned int i;
3331       struct split_store *store;
3332       /* If we are reusing some original stores and any of the
3333 	 original SSA_NAMEs had multiple uses, we need to subtract
3334 	 those now before we add the new ones.  */
3335       if (total_new[0] && any_orig)
3336 	{
3337 	  FOR_EACH_VEC_ELT (*split_stores, i, store)
3338 	    if (store->orig)
3339 	      total_new[0] -= count_multiple_uses (store->orig_stores[0]);
3340 	}
3341       total_new[0] += ret; /* The new store.  */
3342       store_immediate_info *info = group->stores[0];
3343       if (info->ops[0].base_addr)
3344 	total_new[0] += ret;
3345       if (info->ops[1].base_addr)
3346 	total_new[0] += ret;
3347       switch (info->rhs_code)
3348 	{
3349 	case BIT_AND_EXPR:
3350 	case BIT_IOR_EXPR:
3351 	case BIT_XOR_EXPR:
3352 	  total_new[0] += ret; /* The new BIT_*_EXPR stmt.  */
3353 	  break;
3354 	default:
3355 	  break;
3356 	}
3357       FOR_EACH_VEC_ELT (*split_stores, i, store)
3358 	{
3359 	  unsigned int j;
3360 	  bool bit_not_p[3] = { false, false, false };
3361 	  /* If all orig_stores have certain bit_not_p set, then
3362 	     we'd use a BIT_NOT_EXPR stmt and need to account for it.
3363 	     If some orig_stores have certain bit_not_p set, then
3364 	     we'd use a BIT_XOR_EXPR with a mask and need to account for
3365 	     it.  */
3366 	  FOR_EACH_VEC_ELT (store->orig_stores, j, info)
3367 	    {
3368 	      if (info->ops[0].bit_not_p)
3369 		bit_not_p[0] = true;
3370 	      if (info->ops[1].bit_not_p)
3371 		bit_not_p[1] = true;
3372 	      if (info->bit_not_p)
3373 		bit_not_p[2] = true;
3374 	    }
3375 	  total_new[0] += bit_not_p[0] + bit_not_p[1] + bit_not_p[2];
3376 	}
3377 
3378     }
3379 
3380   return ret;
3381 }
3382 
3383 /* Return the operation through which the operand IDX (if < 2) or
3384    result (IDX == 2) should be inverted.  If NOP_EXPR, no inversion
3385    is done, if BIT_NOT_EXPR, all bits are inverted, if BIT_XOR_EXPR,
3386    the bits should be xored with mask.  */
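
/* A small sketch, assuming little-endian byte order: if every
   constituent store of the split store negates its operand, e.g.
     p[0] = ~_1;
     p[1] = ~_2;
   a single BIT_NOT_EXPR on the merged operand suffices.  If only p[0]
   negated its operand, the merged operand would instead be xored with a
   mask along the lines of 0x00ff so that just the bits belonging to
   p[0] are inverted; the exact mask is computed below from the bit
   positions of the constituent stores.  */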
3387 
3388 static enum tree_code
3389 invert_op (split_store *split_store, int idx, tree int_type, tree &mask)
3390 {
3391   unsigned int i;
3392   store_immediate_info *info;
3393   unsigned int cnt = 0;
3394   bool any_paddings = false;
3395   FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
3396     {
3397       bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
3398       if (bit_not_p)
3399 	{
3400 	  ++cnt;
3401 	  tree lhs = gimple_assign_lhs (info->stmt);
3402 	  if (INTEGRAL_TYPE_P (TREE_TYPE (lhs))
3403 	      && TYPE_PRECISION (TREE_TYPE (lhs)) < info->bitsize)
3404 	    any_paddings = true;
3405 	}
3406     }
3407   mask = NULL_TREE;
3408   if (cnt == 0)
3409     return NOP_EXPR;
3410   if (cnt == split_store->orig_stores.length () && !any_paddings)
3411     return BIT_NOT_EXPR;
3412 
3413   unsigned HOST_WIDE_INT try_bitpos = split_store->bytepos * BITS_PER_UNIT;
3414   unsigned buf_size = split_store->size / BITS_PER_UNIT;
3415   unsigned char *buf
3416     = XALLOCAVEC (unsigned char, buf_size);
3417   memset (buf, ~0U, buf_size);
3418   FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
3419     {
3420       bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
3421       if (!bit_not_p)
3422 	continue;
3423       /* Clear regions with bit_not_p and invert afterwards, rather than
3424 	 clear regions with !bit_not_p, so that gaps in between stores aren't
3425 	 set in the mask.  */
3426       unsigned HOST_WIDE_INT bitsize = info->bitsize;
3427       unsigned HOST_WIDE_INT prec = bitsize;
3428       unsigned int pos_in_buffer = 0;
3429       if (any_paddings)
3430 	{
3431 	  tree lhs = gimple_assign_lhs (info->stmt);
3432 	  if (INTEGRAL_TYPE_P (TREE_TYPE (lhs))
3433 	      && TYPE_PRECISION (TREE_TYPE (lhs)) < bitsize)
3434 	    prec = TYPE_PRECISION (TREE_TYPE (lhs));
3435 	}
3436       if (info->bitpos < try_bitpos)
3437 	{
3438 	  gcc_assert (info->bitpos + bitsize > try_bitpos);
3439 	  if (!BYTES_BIG_ENDIAN)
3440 	    {
3441 	      if (prec <= try_bitpos - info->bitpos)
3442 		continue;
3443 	      prec -= try_bitpos - info->bitpos;
3444 	    }
3445 	  bitsize -= try_bitpos - info->bitpos;
3446 	  if (BYTES_BIG_ENDIAN && prec > bitsize)
3447 	    prec = bitsize;
3448 	}
3449       else
3450 	pos_in_buffer = info->bitpos - try_bitpos;
3451       if (prec < bitsize)
3452 	{
3453 	  /* If this is a bool inversion, invert just the least significant
3454 	     prec bits rather than all bits of it.  */
3455 	  if (BYTES_BIG_ENDIAN)
3456 	    {
3457 	      pos_in_buffer += bitsize - prec;
3458 	      if (pos_in_buffer >= split_store->size)
3459 		continue;
3460 	    }
3461 	  bitsize = prec;
3462 	}
3463       if (pos_in_buffer + bitsize > split_store->size)
3464 	bitsize = split_store->size - pos_in_buffer;
3465       unsigned char *p = buf + (pos_in_buffer / BITS_PER_UNIT);
3466       if (BYTES_BIG_ENDIAN)
3467 	clear_bit_region_be (p, (BITS_PER_UNIT - 1
3468 				 - (pos_in_buffer % BITS_PER_UNIT)), bitsize);
3469       else
3470 	clear_bit_region (p, pos_in_buffer % BITS_PER_UNIT, bitsize);
3471     }
3472   for (unsigned int i = 0; i < buf_size; ++i)
3473     buf[i] = ~buf[i];
3474   mask = native_interpret_expr (int_type, buf, buf_size);
3475   return BIT_XOR_EXPR;
3476 }
3477 
3478 /* Given a merged store group GROUP output the widened version of it.
3479    The store chain is against the base object BASE.
3480    Try store sizes of at most MAX_STORE_BITSIZE bits wide and don't output
3481    unaligned stores for STRICT_ALIGNMENT targets or if it's too expensive.
3482    Make sure that the number of statements output is less than the number of
3483    original statements.  If a better sequence is possible emit it and
3484    return true.  */
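
/* A minimal sketch of the masking path used below, assuming
   little-endian byte order: if the recorded stores only write two 4-bit
   bit-fields at bit offsets 0 and 16 of a 4-byte bit region, say with
   constants 5 and 9, the emitted sequence looks roughly like
     _1 = MEM[(unsigned int *)p];
     _2 = _1 & 0xfff0fff0;
     _3 = _2 | 0x00090005;
     MEM[(unsigned int *)p] = _3;
   i.e. the bits that are not stored are preserved via a
   read-modify-write, while fully covered regions are stored directly
   without the load.  */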
3485 
3486 bool
3487 imm_store_chain_info::output_merged_store (merged_store_group *group)
3488 {
3489   split_store *split_store;
3490   unsigned int i;
3491   unsigned HOST_WIDE_INT start_byte_pos
3492     = group->bitregion_start / BITS_PER_UNIT;
3493 
3494   unsigned int orig_num_stmts = group->stores.length ();
3495   if (orig_num_stmts < 2)
3496     return false;
3497 
3498   auto_vec<struct split_store *, 32> split_stores;
3499   bool allow_unaligned_store
3500     = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED);
3501   bool allow_unaligned_load = allow_unaligned_store;
3502   if (allow_unaligned_store)
3503     {
3504       /* If unaligned stores are allowed, see how many stores we'd emit
3505 	 for unaligned and how many stores we'd emit for aligned stores.
3506 	 Only use unaligned stores if it allows fewer stores than aligned.  */
3507       unsigned aligned_cnt
3508 	= split_group (group, false, allow_unaligned_load, NULL, NULL, NULL);
3509       unsigned unaligned_cnt
3510 	= split_group (group, true, allow_unaligned_load, NULL, NULL, NULL);
3511       if (aligned_cnt <= unaligned_cnt)
3512 	allow_unaligned_store = false;
3513     }
3514   unsigned total_orig, total_new;
3515   split_group (group, allow_unaligned_store, allow_unaligned_load,
3516 	       &split_stores, &total_orig, &total_new);
3517 
3518   if (split_stores.length () >= orig_num_stmts)
3519     {
3520       /* We didn't manage to reduce the number of statements.  Bail out.  */
3521       if (dump_file && (dump_flags & TDF_DETAILS))
3522 	fprintf (dump_file, "Exceeded original number of stmts (%u)."
3523 			    "  Not profitable to emit new sequence.\n",
3524 		 orig_num_stmts);
3525       FOR_EACH_VEC_ELT (split_stores, i, split_store)
3526 	delete split_store;
3527       return false;
3528     }
3529   if (total_orig <= total_new)
3530     {
3531       /* If the estimated number of new statements is not smaller than
3532 	 the estimated number of original statements, bail out too.  */
3533       if (dump_file && (dump_flags & TDF_DETAILS))
3534 	fprintf (dump_file, "Estimated number of original stmts (%u)"
3535 			    " not larger than estimated number of new"
3536 			    " stmts (%u).\n",
3537 		 total_orig, total_new);
3538       FOR_EACH_VEC_ELT (split_stores, i, split_store)
3539 	delete split_store;
3540       return false;
3541     }
3542 
3543   gimple_stmt_iterator last_gsi = gsi_for_stmt (group->last_stmt);
3544   gimple_seq seq = NULL;
3545   tree last_vdef, new_vuse;
3546   last_vdef = gimple_vdef (group->last_stmt);
3547   new_vuse = gimple_vuse (group->last_stmt);
3548   tree bswap_res = NULL_TREE;
3549 
3550   if (group->stores[0]->rhs_code == LROTATE_EXPR
3551       || group->stores[0]->rhs_code == NOP_EXPR)
3552     {
3553       tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
3554       gimple *ins_stmt = group->stores[0]->ins_stmt;
3555       struct symbolic_number *n = &group->stores[0]->n;
3556       bool bswap = group->stores[0]->rhs_code == LROTATE_EXPR;
3557 
3558       switch (n->range)
3559 	{
3560 	case 16:
3561 	  load_type = bswap_type = uint16_type_node;
3562 	  break;
3563 	case 32:
3564 	  load_type = uint32_type_node;
3565 	  if (bswap)
3566 	    {
3567 	      fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
3568 	      bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
3569 	    }
3570 	  break;
3571 	case 64:
3572 	  load_type = uint64_type_node;
3573 	  if (bswap)
3574 	    {
3575 	      fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
3576 	      bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
3577 	    }
3578 	  break;
3579 	default:
3580 	  gcc_unreachable ();
3581 	}
3582 
3583       /* If the loads each have the vuse of the corresponding store,
3584 	 we've checked the aliasing already in try_coalesce_bswap and
3585 	 we want to sink the needed load into seq.  So we need to use
3586 	 new_vuse on the load.  */
3587       if (n->base_addr)
3588 	{
3589 	  if (n->vuse == NULL)
3590 	    {
3591 	      n->vuse = new_vuse;
3592 	      ins_stmt = NULL;
3593 	    }
3594 	  else
3595 	    /* Update vuse in case it has been changed by output_merged_stores.  */
3596 	    n->vuse = gimple_vuse (ins_stmt);
3597 	}
3598       bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
3599 				 bswap_type, load_type, n, bswap);
3600       gcc_assert (bswap_res);
3601     }
3602 
3603   gimple *stmt = NULL;
3604   auto_vec<gimple *, 32> orig_stmts;
3605   gimple_seq this_seq;
3606   tree addr = force_gimple_operand_1 (unshare_expr (base_addr), &this_seq,
3607 				      is_gimple_mem_ref_addr, NULL_TREE);
3608   gimple_seq_add_seq_without_update (&seq, this_seq);
3609 
3610   tree load_addr[2] = { NULL_TREE, NULL_TREE };
3611   gimple_seq load_seq[2] = { NULL, NULL };
3612   gimple_stmt_iterator load_gsi[2] = { gsi_none (), gsi_none () };
3613   for (int j = 0; j < 2; ++j)
3614     {
3615       store_operand_info &op = group->stores[0]->ops[j];
3616       if (op.base_addr == NULL_TREE)
3617 	continue;
3618 
3619       store_immediate_info *infol = group->stores.last ();
3620       if (gimple_vuse (op.stmt) == gimple_vuse (infol->ops[j].stmt))
3621 	{
3622 	  /* We can't pick the location randomly; while we've verified
3623 	     all the loads have the same vuse, they can still be in different
3624 	     basic blocks and we need to pick the one from the last bb:
3625 	     int x = q[0];
3626 	     if (x == N) return;
3627 	     int y = q[1];
3628 	     p[0] = x;
3629 	     p[1] = y;
3630 	     otherwise if we put the wider load at the q[0] load, we might
3631 	     segfault if q[1] is not mapped.  */
3632 	  basic_block bb = gimple_bb (op.stmt);
3633 	  gimple *ostmt = op.stmt;
3634 	  store_immediate_info *info;
3635 	  FOR_EACH_VEC_ELT (group->stores, i, info)
3636 	    {
3637 	      gimple *tstmt = info->ops[j].stmt;
3638 	      basic_block tbb = gimple_bb (tstmt);
3639 	      if (dominated_by_p (CDI_DOMINATORS, tbb, bb))
3640 		{
3641 		  ostmt = tstmt;
3642 		  bb = tbb;
3643 		}
3644 	    }
3645 	  load_gsi[j] = gsi_for_stmt (ostmt);
3646 	  load_addr[j]
3647 	    = force_gimple_operand_1 (unshare_expr (op.base_addr),
3648 				      &load_seq[j], is_gimple_mem_ref_addr,
3649 				      NULL_TREE);
3650 	}
3651       else if (operand_equal_p (base_addr, op.base_addr, 0))
3652 	load_addr[j] = addr;
3653       else
3654 	{
3655 	  load_addr[j]
3656 	    = force_gimple_operand_1 (unshare_expr (op.base_addr),
3657 				      &this_seq, is_gimple_mem_ref_addr,
3658 				      NULL_TREE);
3659 	  gimple_seq_add_seq_without_update (&seq, this_seq);
3660 	}
3661     }
3662 
3663   FOR_EACH_VEC_ELT (split_stores, i, split_store)
3664     {
3665       unsigned HOST_WIDE_INT try_size = split_store->size;
3666       unsigned HOST_WIDE_INT try_pos = split_store->bytepos;
3667       unsigned HOST_WIDE_INT align = split_store->align;
3668       tree dest, src;
3669       location_t loc;
3670       if (split_store->orig)
3671 	{
3672 	  /* If there is just a single constituent store which covers
3673 	     the whole area, just reuse the lhs and rhs.  */
3674 	  gimple *orig_stmt = split_store->orig_stores[0]->stmt;
3675 	  dest = gimple_assign_lhs (orig_stmt);
3676 	  src = gimple_assign_rhs1 (orig_stmt);
3677 	  loc = gimple_location (orig_stmt);
3678 	}
3679       else
3680 	{
3681 	  store_immediate_info *info;
3682 	  unsigned short clique, base;
3683 	  unsigned int k;
3684 	  FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3685 	    orig_stmts.safe_push (info->stmt);
3686 	  tree offset_type
3687 	    = get_alias_type_for_stmts (orig_stmts, false, &clique, &base);
3688 	  loc = get_location_for_stmts (orig_stmts);
3689 	  orig_stmts.truncate (0);
3690 
3691 	  tree int_type = build_nonstandard_integer_type (try_size, UNSIGNED);
3692 	  int_type = build_aligned_type (int_type, align);
3693 	  dest = fold_build2 (MEM_REF, int_type, addr,
3694 			      build_int_cst (offset_type, try_pos));
3695 	  if (TREE_CODE (dest) == MEM_REF)
3696 	    {
3697 	      MR_DEPENDENCE_CLIQUE (dest) = clique;
3698 	      MR_DEPENDENCE_BASE (dest) = base;
3699 	    }
3700 
3701 	  tree mask = integer_zero_node;
3702 	  if (!bswap_res)
3703 	    mask = native_interpret_expr (int_type,
3704 					  group->mask + try_pos
3705 					  - start_byte_pos,
3706 					  group->buf_size);
3707 
3708 	  tree ops[2];
3709 	  for (int j = 0;
3710 	       j < 1 + (split_store->orig_stores[0]->ops[1].val != NULL_TREE);
3711 	       ++j)
3712 	    {
3713 	      store_operand_info &op = split_store->orig_stores[0]->ops[j];
3714 	      if (bswap_res)
3715 		ops[j] = bswap_res;
3716 	      else if (op.base_addr)
3717 		{
3718 		  FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3719 		    orig_stmts.safe_push (info->ops[j].stmt);
3720 
3721 		  offset_type = get_alias_type_for_stmts (orig_stmts, true,
3722 							  &clique, &base);
3723 		  location_t load_loc = get_location_for_stmts (orig_stmts);
3724 		  orig_stmts.truncate (0);
3725 
3726 		  unsigned HOST_WIDE_INT load_align = group->load_align[j];
3727 		  unsigned HOST_WIDE_INT align_bitpos
3728 		    = known_alignment (try_pos * BITS_PER_UNIT
3729 				       - split_store->orig_stores[0]->bitpos
3730 				       + op.bitpos);
3731 		  if (align_bitpos & (load_align - 1))
3732 		    load_align = least_bit_hwi (align_bitpos);
3733 
3734 		  tree load_int_type
3735 		    = build_nonstandard_integer_type (try_size, UNSIGNED);
3736 		  load_int_type
3737 		    = build_aligned_type (load_int_type, load_align);
3738 
3739 		  poly_uint64 load_pos
3740 		    = exact_div (try_pos * BITS_PER_UNIT
3741 				 - split_store->orig_stores[0]->bitpos
3742 				 + op.bitpos,
3743 				 BITS_PER_UNIT);
3744 		  ops[j] = fold_build2 (MEM_REF, load_int_type, load_addr[j],
3745 					build_int_cst (offset_type, load_pos));
3746 		  if (TREE_CODE (ops[j]) == MEM_REF)
3747 		    {
3748 		      MR_DEPENDENCE_CLIQUE (ops[j]) = clique;
3749 		      MR_DEPENDENCE_BASE (ops[j]) = base;
3750 		    }
3751 		  if (!integer_zerop (mask))
3752 		    /* The load might load some bits (that will be masked off
3753 		       later on) uninitialized, avoid -W*uninitialized
3754 		       warnings in that case.  */
3755 		    TREE_NO_WARNING (ops[j]) = 1;
3756 
3757 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3758 					      ops[j]);
3759 		  gimple_set_location (stmt, load_loc);
3760 		  if (gsi_bb (load_gsi[j]))
3761 		    {
3762 		      gimple_set_vuse (stmt, gimple_vuse (op.stmt));
3763 		      gimple_seq_add_stmt_without_update (&load_seq[j], stmt);
3764 		    }
3765 		  else
3766 		    {
3767 		      gimple_set_vuse (stmt, new_vuse);
3768 		      gimple_seq_add_stmt_without_update (&seq, stmt);
3769 		    }
3770 		  ops[j] = gimple_assign_lhs (stmt);
3771 		  tree xor_mask;
3772 		  enum tree_code inv_op
3773 		    = invert_op (split_store, j, int_type, xor_mask);
3774 		  if (inv_op != NOP_EXPR)
3775 		    {
3776 		      stmt = gimple_build_assign (make_ssa_name (int_type),
3777 						  inv_op, ops[j], xor_mask);
3778 		      gimple_set_location (stmt, load_loc);
3779 		      ops[j] = gimple_assign_lhs (stmt);
3780 
3781 		      if (gsi_bb (load_gsi[j]))
3782 			gimple_seq_add_stmt_without_update (&load_seq[j],
3783 							    stmt);
3784 		      else
3785 			gimple_seq_add_stmt_without_update (&seq, stmt);
3786 		    }
3787 		}
3788 	      else
3789 		ops[j] = native_interpret_expr (int_type,
3790 						group->val + try_pos
3791 						- start_byte_pos,
3792 						group->buf_size);
3793 	    }
3794 
3795 	  switch (split_store->orig_stores[0]->rhs_code)
3796 	    {
3797 	    case BIT_AND_EXPR:
3798 	    case BIT_IOR_EXPR:
3799 	    case BIT_XOR_EXPR:
3800 	      FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3801 		{
3802 		  tree rhs1 = gimple_assign_rhs1 (info->stmt);
3803 		  orig_stmts.safe_push (SSA_NAME_DEF_STMT (rhs1));
3804 		}
3805 	      location_t bit_loc;
3806 	      bit_loc = get_location_for_stmts (orig_stmts);
3807 	      orig_stmts.truncate (0);
3808 
3809 	      stmt
3810 		= gimple_build_assign (make_ssa_name (int_type),
3811 				       split_store->orig_stores[0]->rhs_code,
3812 				       ops[0], ops[1]);
3813 	      gimple_set_location (stmt, bit_loc);
3814 	      /* If there is just one load and there is a separate
3815 		 load_seq[0], emit the bitwise op right after it.  */
3816 	      if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
3817 		gimple_seq_add_stmt_without_update (&load_seq[0], stmt);
3818 	      /* Otherwise, if at least one load is in seq, we need to
3819 		 emit the bitwise op right before the store.  If there
3820 		 are two loads and they are emitted somewhere else, it would
3821 		 be better to emit the bitwise op as early as possible;
3822 		 we don't track where that would be possible right now
3823 		 though.  */
3824 	      else
3825 		gimple_seq_add_stmt_without_update (&seq, stmt);
3826 	      src = gimple_assign_lhs (stmt);
3827 	      tree xor_mask;
3828 	      enum tree_code inv_op;
3829 	      inv_op = invert_op (split_store, 2, int_type, xor_mask);
3830 	      if (inv_op != NOP_EXPR)
3831 		{
3832 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3833 					      inv_op, src, xor_mask);
3834 		  gimple_set_location (stmt, bit_loc);
3835 		  if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
3836 		    gimple_seq_add_stmt_without_update (&load_seq[0], stmt);
3837 		  else
3838 		    gimple_seq_add_stmt_without_update (&seq, stmt);
3839 		  src = gimple_assign_lhs (stmt);
3840 		}
3841 	      break;
3842 	    case LROTATE_EXPR:
3843 	    case NOP_EXPR:
3844 	      src = ops[0];
3845 	      if (!is_gimple_val (src))
3846 		{
3847 		  stmt = gimple_build_assign (make_ssa_name (TREE_TYPE (src)),
3848 					      src);
3849 		  gimple_seq_add_stmt_without_update (&seq, stmt);
3850 		  src = gimple_assign_lhs (stmt);
3851 		}
3852 	      if (!useless_type_conversion_p (int_type, TREE_TYPE (src)))
3853 		{
3854 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3855 					      NOP_EXPR, src);
3856 		  gimple_seq_add_stmt_without_update (&seq, stmt);
3857 		  src = gimple_assign_lhs (stmt);
3858 		}
3859 	      inv_op = invert_op (split_store, 2, int_type, xor_mask);
3860 	      if (inv_op != NOP_EXPR)
3861 		{
3862 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3863 					      inv_op, src, xor_mask);
3864 		  gimple_set_location (stmt, loc);
3865 		  gimple_seq_add_stmt_without_update (&seq, stmt);
3866 		  src = gimple_assign_lhs (stmt);
3867 		}
3868 	      break;
3869 	    default:
3870 	      src = ops[0];
3871 	      break;
3872 	    }
3873 
3874 	  if (!integer_zerop (mask))
3875 	    {
3876 	      tree tem = make_ssa_name (int_type);
3877 	      tree load_src = unshare_expr (dest);
3878 	      /* The load might load some or all bits uninitialized;
3879 		 avoid -W*uninitialized warnings in that case.
3880 		 As an optimization, it would be nice if, when all the bits
3881 		 are provably uninitialized (no stores at all yet, or the
3882 		 previous store was a CLOBBER), we optimized away the load
3883 		 and replaced it e.g. with 0.  */
3884 	      TREE_NO_WARNING (load_src) = 1;
3885 	      stmt = gimple_build_assign (tem, load_src);
3886 	      gimple_set_location (stmt, loc);
3887 	      gimple_set_vuse (stmt, new_vuse);
3888 	      gimple_seq_add_stmt_without_update (&seq, stmt);
3889 
3890 	      /* FIXME: If there is a single chunk of zero bits in mask,
3891 		 perhaps use BIT_INSERT_EXPR instead?  */
3892 	      stmt = gimple_build_assign (make_ssa_name (int_type),
3893 					  BIT_AND_EXPR, tem, mask);
3894 	      gimple_set_location (stmt, loc);
3895 	      gimple_seq_add_stmt_without_update (&seq, stmt);
3896 	      tem = gimple_assign_lhs (stmt);
3897 
3898 	      if (TREE_CODE (src) == INTEGER_CST)
3899 		src = wide_int_to_tree (int_type,
3900 					wi::bit_and_not (wi::to_wide (src),
3901 							 wi::to_wide (mask)));
3902 	      else
3903 		{
3904 		  tree nmask
3905 		    = wide_int_to_tree (int_type,
3906 					wi::bit_not (wi::to_wide (mask)));
3907 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3908 					      BIT_AND_EXPR, src, nmask);
3909 		  gimple_set_location (stmt, loc);
3910 		  gimple_seq_add_stmt_without_update (&seq, stmt);
3911 		  src = gimple_assign_lhs (stmt);
3912 		}
3913 	      stmt = gimple_build_assign (make_ssa_name (int_type),
3914 					  BIT_IOR_EXPR, tem, src);
3915 	      gimple_set_location (stmt, loc);
3916 	      gimple_seq_add_stmt_without_update (&seq, stmt);
3917 	      src = gimple_assign_lhs (stmt);
3918 	    }
3919 	}
3920 
3921       stmt = gimple_build_assign (dest, src);
3922       gimple_set_location (stmt, loc);
3923       gimple_set_vuse (stmt, new_vuse);
3924       gimple_seq_add_stmt_without_update (&seq, stmt);
3925 
3926       tree new_vdef;
3927       if (i < split_stores.length () - 1)
3928 	new_vdef = make_ssa_name (gimple_vop (cfun), stmt);
3929       else
3930 	new_vdef = last_vdef;
3931 
3932       gimple_set_vdef (stmt, new_vdef);
3933       SSA_NAME_DEF_STMT (new_vdef) = stmt;
3934       new_vuse = new_vdef;
3935     }
3936 
3937   FOR_EACH_VEC_ELT (split_stores, i, split_store)
3938     delete split_store;
3939 
3940   gcc_assert (seq);
3941   if (dump_file)
3942     {
3943       fprintf (dump_file,
3944 	       "New sequence of %u stmts to replace old one of %u stmts\n",
3945 	       split_stores.length (), orig_num_stmts);
3946       if (dump_flags & TDF_DETAILS)
3947 	print_gimple_seq (dump_file, seq, 0, TDF_VOPS | TDF_MEMSYMS);
3948     }
3949   gsi_insert_seq_after (&last_gsi, seq, GSI_SAME_STMT);
3950   for (int j = 0; j < 2; ++j)
3951     if (load_seq[j])
3952       gsi_insert_seq_after (&load_gsi[j], load_seq[j], GSI_SAME_STMT);
3953 
3954   return true;
3955 }
3956 
3957 /* Process the merged_store_group objects created in the coalescing phase.
3958    The stores are all against the base object BASE.
3959    Try to output the widened stores and delete the original statements if
3960    successful.  Return true iff any changes were made.  */
3961 
3962 bool
3963 imm_store_chain_info::output_merged_stores ()
3964 {
3965   unsigned int i;
3966   merged_store_group *merged_store;
3967   bool ret = false;
3968   FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_store)
3969     {
3970       if (output_merged_store (merged_store))
3971 	{
3972 	  unsigned int j;
3973 	  store_immediate_info *store;
3974 	  FOR_EACH_VEC_ELT (merged_store->stores, j, store)
3975 	    {
3976 	      gimple *stmt = store->stmt;
3977 	      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
3978 	      gsi_remove (&gsi, true);
3979 	      if (stmt != merged_store->last_stmt)
3980 		{
3981 		  unlink_stmt_vdef (stmt);
3982 		  release_defs (stmt);
3983 		}
3984 	    }
3985 	  ret = true;
3986 	}
3987     }
3988   if (ret && dump_file)
3989     fprintf (dump_file, "Merging successful!\n");
3990 
3991   return ret;
3992 }
3993 
3994 /* Coalesce the store_immediate_info objects recorded against the base object
3995    BASE in the first phase and output them.
3996    Delete the allocated structures.
3997    Return true if any changes were made.  */
3998 
3999 bool
4000 imm_store_chain_info::terminate_and_process_chain ()
4001 {
4002   /* Process store chain.  */
4003   bool ret = false;
4004   if (m_store_info.length () > 1)
4005     {
4006       ret = coalesce_immediate_stores ();
4007       if (ret)
4008 	ret = output_merged_stores ();
4009     }
4010 
4011   /* Delete all the entries we allocated ourselves.  */
4012   store_immediate_info *info;
4013   unsigned int i;
4014   FOR_EACH_VEC_ELT (m_store_info, i, info)
4015     delete info;
4016 
4017   merged_store_group *merged_info;
4018   FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_info)
4019     delete merged_info;
4020 
4021   return ret;
4022 }
4023 
4024 /* Return true iff LHS is a destination potentially interesting for
4025    store merging.  In practice these are the codes that get_inner_reference
4026    can process.  */
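
/* For instance, destinations such as x.f (COMPONENT_REF), a[i]
   (ARRAY_REF), *p (MEM_REF) or an explicit BIT_FIELD_REF qualify,
   whereas an assignment to a bare SSA_NAME or to a whole DECL does
   not.  */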
4027 
4028 static bool
4029 lhs_valid_for_store_merging_p (tree lhs)
4030 {
4031   tree_code code = TREE_CODE (lhs);
4032 
4033   if (code == ARRAY_REF || code == ARRAY_RANGE_REF || code == MEM_REF
4034       || code == COMPONENT_REF || code == BIT_FIELD_REF)
4035     return true;
4036 
4037   return false;
4038 }
4039 
4040 /* Return true if the tree RHS is a constant we want to consider
4041    during store merging.  In practice accept all codes that
4042    native_encode_expr accepts.  */
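
/* For instance, integer, real or vector constants whose mode has a
   fixed, known size are accepted, since native_encode_expr can produce
   their byte representation; constants of variable-sized modes are
   rejected.  */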
4043 
4044 static bool
4045 rhs_valid_for_store_merging_p (tree rhs)
4046 {
4047   unsigned HOST_WIDE_INT size;
4048   return (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (rhs))).is_constant (&size)
4049 	  && native_encode_expr (rhs, NULL, size) != 0);
4050 }
4051 
4052 /* If MEM is a memory reference usable for store merging (either as
4053    store destination or for loads), return the non-NULL base_addr
4054    and set *PBITSIZE, *PBITPOS, *PBITREGION_START and *PBITREGION_END.
4055    Otherwise return NULL, *PBITSIZE should still be valid even for that
4056    case.  */
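
/* An illustrative sketch: for a reference like MEM[p_1 + 4B].f, where
   F is a 2-byte field at byte offset 2 within the pointed-to structure,
   the function would return p_1 as the base address with *PBITSIZE set
   to 16 and *PBITPOS set to 48, because the byte offset of the MEM_REF
   is folded into the bit position as described below.  */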
4057 
4058 static tree
4059 mem_valid_for_store_merging (tree mem, poly_uint64 *pbitsize,
4060 			     poly_uint64 *pbitpos,
4061 			     poly_uint64 *pbitregion_start,
4062 			     poly_uint64 *pbitregion_end)
4063 {
4064   poly_int64 bitsize, bitpos;
4065   poly_uint64 bitregion_start = 0, bitregion_end = 0;
4066   machine_mode mode;
4067   int unsignedp = 0, reversep = 0, volatilep = 0;
4068   tree offset;
4069   tree base_addr = get_inner_reference (mem, &bitsize, &bitpos, &offset, &mode,
4070 					&unsignedp, &reversep, &volatilep);
4071   *pbitsize = bitsize;
4072   if (known_eq (bitsize, 0))
4073     return NULL_TREE;
4074 
4075   if (TREE_CODE (mem) == COMPONENT_REF
4076       && DECL_BIT_FIELD_TYPE (TREE_OPERAND (mem, 1)))
4077     {
4078       get_bit_range (&bitregion_start, &bitregion_end, mem, &bitpos, &offset);
4079       if (maybe_ne (bitregion_end, 0U))
4080 	bitregion_end += 1;
4081     }
4082 
4083   if (reversep)
4084     return NULL_TREE;
4085 
4086   /* We do not want to rewrite TARGET_MEM_REFs.  */
4087   if (TREE_CODE (base_addr) == TARGET_MEM_REF)
4088     return NULL_TREE;
4089   /* In some cases get_inner_reference may return a
4090      MEM_REF [ptr + byteoffset].  For the purposes of this pass
4091      canonicalize the base_addr to MEM_REF [ptr] and take
4092      byteoffset into account in the bitpos.  This occurs in
4093      PR 23684 and this way we can catch more chains.  */
4094   else if (TREE_CODE (base_addr) == MEM_REF)
4095     {
4096       poly_offset_int byte_off = mem_ref_offset (base_addr);
4097       poly_offset_int bit_off = byte_off << LOG2_BITS_PER_UNIT;
4098       bit_off += bitpos;
4099       if (known_ge (bit_off, 0) && bit_off.to_shwi (&bitpos))
4100 	{
4101 	  if (maybe_ne (bitregion_end, 0U))
4102 	    {
4103 	      bit_off = byte_off << LOG2_BITS_PER_UNIT;
4104 	      bit_off += bitregion_start;
4105 	      if (bit_off.to_uhwi (&bitregion_start))
4106 		{
4107 		  bit_off = byte_off << LOG2_BITS_PER_UNIT;
4108 		  bit_off += bitregion_end;
4109 		  if (!bit_off.to_uhwi (&bitregion_end))
4110 		    bitregion_end = 0;
4111 		}
4112 	      else
4113 		bitregion_end = 0;
4114 	    }
4115 	}
4116       else
4117 	return NULL_TREE;
4118       base_addr = TREE_OPERAND (base_addr, 0);
4119     }
4120   /* get_inner_reference returns the base object, get at its
4121      address now.  */
4122   else
4123     {
4124       if (maybe_lt (bitpos, 0))
4125 	return NULL_TREE;
4126       base_addr = build_fold_addr_expr (base_addr);
4127     }
4128 
4129   if (known_eq (bitregion_end, 0U))
4130     {
4131       bitregion_start = round_down_to_byte_boundary (bitpos);
4132       bitregion_end = bitpos;
4133       bitregion_end = round_up_to_byte_boundary (bitregion_end + bitsize);
4134     }
4135 
4136   if (offset != NULL_TREE)
4137     {
4138       /* If the access is variable offset then a base decl has to be
4139 	 address-taken to be able to emit pointer-based stores to it.
4140 	 ???  We might be able to get away with re-using the original
4141 	 base up to the first variable part and then wrapping that inside
4142 	 a BIT_FIELD_REF.  */
4143       tree base = get_base_address (base_addr);
4144       if (! base
4145 	  || (DECL_P (base) && ! TREE_ADDRESSABLE (base)))
4146 	return NULL_TREE;
4147 
4148       base_addr = build2 (POINTER_PLUS_EXPR, TREE_TYPE (base_addr),
4149 			  base_addr, offset);
4150     }
4151 
4152   *pbitsize = bitsize;
4153   *pbitpos = bitpos;
4154   *pbitregion_start = bitregion_start;
4155   *pbitregion_end = bitregion_end;
4156   return base_addr;
4157 }
4158 
4159 /* Return true if STMT is a load that can be used for store merging.
4160    In that case fill in *OP.  BITSIZE, BITPOS, BITREGION_START and
4161    BITREGION_END are properties of the corresponding store.  */
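
/* For example, for a store p[1] = _2 fed by
     _1 = q[1];
     _2 = ~_1;
   this function follows the single BIT_NOT_EXPR, records the load from
   q[1] in *OP and sets op->bit_not_p, provided the load's bit region
   covers the store's bit region as checked below.  A double negation
   is rejected so that the multiple-use accounting stays simple.  */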
4162 
4163 static bool
4164 handled_load (gimple *stmt, store_operand_info *op,
4165 	      poly_uint64 bitsize, poly_uint64 bitpos,
4166 	      poly_uint64 bitregion_start, poly_uint64 bitregion_end)
4167 {
4168   if (!is_gimple_assign (stmt))
4169     return false;
4170   if (gimple_assign_rhs_code (stmt) == BIT_NOT_EXPR)
4171     {
4172       tree rhs1 = gimple_assign_rhs1 (stmt);
4173       if (TREE_CODE (rhs1) == SSA_NAME
4174 	  && handled_load (SSA_NAME_DEF_STMT (rhs1), op, bitsize, bitpos,
4175 			   bitregion_start, bitregion_end))
4176 	{
4177 	  /* Don't allow _1 = load; _2 = ~_1; _3 = ~_2; which should have
4178 	     been optimized earlier, but if allowed here, would confuse the
4179 	     multiple uses counting.  */
4180 	  if (op->bit_not_p)
4181 	    return false;
4182 	  op->bit_not_p = !op->bit_not_p;
4183 	  return true;
4184 	}
4185       return false;
4186     }
4187   if (gimple_vuse (stmt)
4188       && gimple_assign_load_p (stmt)
4189       && !stmt_can_throw_internal (stmt)
4190       && !gimple_has_volatile_ops (stmt))
4191     {
4192       tree mem = gimple_assign_rhs1 (stmt);
4193       op->base_addr
4194 	= mem_valid_for_store_merging (mem, &op->bitsize, &op->bitpos,
4195 				       &op->bitregion_start,
4196 				       &op->bitregion_end);
4197       if (op->base_addr != NULL_TREE
4198 	  && known_eq (op->bitsize, bitsize)
4199 	  && multiple_p (op->bitpos - bitpos, BITS_PER_UNIT)
4200 	  && known_ge (op->bitpos - op->bitregion_start,
4201 		       bitpos - bitregion_start)
4202 	  && known_ge (op->bitregion_end - op->bitpos,
4203 		       bitregion_end - bitpos))
4204 	{
4205 	  op->stmt = stmt;
4206 	  op->val = mem;
4207 	  op->bit_not_p = false;
4208 	  return true;
4209 	}
4210     }
4211   return false;
4212 }
4213 
4214 /* Record the store STMT for store merging optimization if it can be
4215    optimized.  */
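
/* Roughly, the right-hand sides recognized here are of the forms
     p[0] = 0x12;              constant (INTEGER_CST)
     p[0] = q[0];              plain load (MEM_REF)
     p[0] = q[0] ^ 0x12;       load combined with a constant
     p[0] = q[0] & r[0];       load combined with another load
   each optionally wrapped in bitwise negations, plus value computations
   that the bswap machinery recognizes as a byte swap or identity of a
   wider value.  This is only a sketch; the precise conditions are
   checked below.  */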
4216 
4217 void
4218 pass_store_merging::process_store (gimple *stmt)
4219 {
4220   tree lhs = gimple_assign_lhs (stmt);
4221   tree rhs = gimple_assign_rhs1 (stmt);
4222   poly_uint64 bitsize, bitpos;
4223   poly_uint64 bitregion_start, bitregion_end;
4224   tree base_addr
4225     = mem_valid_for_store_merging (lhs, &bitsize, &bitpos,
4226 				   &bitregion_start, &bitregion_end);
4227   if (known_eq (bitsize, 0U))
4228     return;
4229 
4230   bool invalid = (base_addr == NULL_TREE
4231 		  || (maybe_gt (bitsize,
4232 				(unsigned int) MAX_BITSIZE_MODE_ANY_INT)
4233 		      && (TREE_CODE (rhs) != INTEGER_CST)));
4234   enum tree_code rhs_code = ERROR_MARK;
4235   bool bit_not_p = false;
4236   struct symbolic_number n;
4237   gimple *ins_stmt = NULL;
4238   store_operand_info ops[2];
4239   if (invalid)
4240     ;
4241   else if (rhs_valid_for_store_merging_p (rhs))
4242     {
4243       rhs_code = INTEGER_CST;
4244       ops[0].val = rhs;
4245     }
4246   else if (TREE_CODE (rhs) != SSA_NAME)
4247     invalid = true;
4248   else
4249     {
4250       gimple *def_stmt = SSA_NAME_DEF_STMT (rhs), *def_stmt1, *def_stmt2;
4251       if (!is_gimple_assign (def_stmt))
4252 	invalid = true;
4253       else if (handled_load (def_stmt, &ops[0], bitsize, bitpos,
4254 			     bitregion_start, bitregion_end))
4255 	rhs_code = MEM_REF;
4256       else if (gimple_assign_rhs_code (def_stmt) == BIT_NOT_EXPR)
4257 	{
4258 	  tree rhs1 = gimple_assign_rhs1 (def_stmt);
4259 	  if (TREE_CODE (rhs1) == SSA_NAME
4260 	      && is_gimple_assign (SSA_NAME_DEF_STMT (rhs1)))
4261 	    {
4262 	      bit_not_p = true;
4263 	      def_stmt = SSA_NAME_DEF_STMT (rhs1);
4264 	    }
4265 	}
4266       if (rhs_code == ERROR_MARK && !invalid)
4267 	switch ((rhs_code = gimple_assign_rhs_code (def_stmt)))
4268 	  {
4269 	  case BIT_AND_EXPR:
4270 	  case BIT_IOR_EXPR:
4271 	  case BIT_XOR_EXPR:
4272 	    tree rhs1, rhs2;
4273 	    rhs1 = gimple_assign_rhs1 (def_stmt);
4274 	    rhs2 = gimple_assign_rhs2 (def_stmt);
4275 	    invalid = true;
4276 	    if (TREE_CODE (rhs1) != SSA_NAME)
4277 	      break;
4278 	    def_stmt1 = SSA_NAME_DEF_STMT (rhs1);
4279 	    if (!is_gimple_assign (def_stmt1)
4280 		|| !handled_load (def_stmt1, &ops[0], bitsize, bitpos,
4281 				  bitregion_start, bitregion_end))
4282 	      break;
4283 	    if (rhs_valid_for_store_merging_p (rhs2))
4284 	      ops[1].val = rhs2;
4285 	    else if (TREE_CODE (rhs2) != SSA_NAME)
4286 	      break;
4287 	    else
4288 	      {
4289 		def_stmt2 = SSA_NAME_DEF_STMT (rhs2);
4290 		if (!is_gimple_assign (def_stmt2))
4291 		  break;
4292 		else if (!handled_load (def_stmt2, &ops[1], bitsize, bitpos,
4293 					bitregion_start, bitregion_end))
4294 		  break;
4295 	      }
4296 	    invalid = false;
4297 	    break;
4298 	  default:
4299 	    invalid = true;
4300 	    break;
4301 	  }
4302       unsigned HOST_WIDE_INT const_bitsize;
4303       if (bitsize.is_constant (&const_bitsize)
4304 	  && multiple_p (const_bitsize, BITS_PER_UNIT)
4305 	  && multiple_p (bitpos, BITS_PER_UNIT)
4306 	  && const_bitsize <= 64
4307 	  && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
4308 	{
4309 	  ins_stmt = find_bswap_or_nop_1 (def_stmt, &n, 12);
4310 	  if (ins_stmt)
4311 	    {
4312 	      uint64_t nn = n.n;
4313 	      for (unsigned HOST_WIDE_INT i = 0;
4314 		   i < const_bitsize;
4315 		   i += BITS_PER_UNIT, nn >>= BITS_PER_MARKER)
4316 		if ((nn & MARKER_MASK) == 0
4317 		    || (nn & MARKER_MASK) == MARKER_BYTE_UNKNOWN)
4318 		  {
4319 		    ins_stmt = NULL;
4320 		    break;
4321 		  }
4322 	      if (ins_stmt)
4323 		{
4324 		  if (invalid)
4325 		    {
4326 		      rhs_code = LROTATE_EXPR;
4327 		      ops[0].base_addr = NULL_TREE;
4328 		      ops[1].base_addr = NULL_TREE;
4329 		    }
4330 		  invalid = false;
4331 		}
4332 	    }
4333 	}
4334     }
4335 
4336   unsigned HOST_WIDE_INT const_bitsize, const_bitpos;
4337   unsigned HOST_WIDE_INT const_bitregion_start, const_bitregion_end;
4338   if (invalid
4339       || !bitsize.is_constant (&const_bitsize)
4340       || !bitpos.is_constant (&const_bitpos)
4341       || !bitregion_start.is_constant (&const_bitregion_start)
4342       || !bitregion_end.is_constant (&const_bitregion_end))
4343     {
4344       terminate_all_aliasing_chains (NULL, stmt);
4345       return;
4346     }
4347 
4348   if (!ins_stmt)
4349     memset (&n, 0, sizeof (n));
4350 
4351   struct imm_store_chain_info **chain_info = NULL;
4352   if (base_addr)
4353     chain_info = m_stores.get (base_addr);
4354 
4355   store_immediate_info *info;
4356   if (chain_info)
4357     {
4358       unsigned int ord = (*chain_info)->m_store_info.length ();
4359       info = new store_immediate_info (const_bitsize, const_bitpos,
4360 				       const_bitregion_start,
4361 				       const_bitregion_end,
4362 				       stmt, ord, rhs_code, n, ins_stmt,
4363 				       bit_not_p, ops[0], ops[1]);
4364       if (dump_file && (dump_flags & TDF_DETAILS))
4365 	{
4366 	  fprintf (dump_file, "Recording immediate store from stmt:\n");
4367 	  print_gimple_stmt (dump_file, stmt, 0);
4368 	}
4369       (*chain_info)->m_store_info.safe_push (info);
4370       terminate_all_aliasing_chains (chain_info, stmt);
4371       /* If we reach the limit of stores to merge in a chain, terminate
4372 	 and process the chain now.  */
4373       if ((*chain_info)->m_store_info.length ()
4374 	  == (unsigned int) PARAM_VALUE (PARAM_MAX_STORES_TO_MERGE))
4375 	{
4376 	  if (dump_file && (dump_flags & TDF_DETAILS))
4377 	    fprintf (dump_file,
4378 		     "Reached maximum number of statements to merge:\n");
4379 	  terminate_and_release_chain (*chain_info);
4380 	}
4381       return;
4382     }
4383 
4384   /* Does the store alias any existing chain?  */
4385   terminate_all_aliasing_chains (NULL, stmt);
4386   /* Start a new chain.  */
4387   struct imm_store_chain_info *new_chain
4388     = new imm_store_chain_info (m_stores_head, base_addr);
4389   info = new store_immediate_info (const_bitsize, const_bitpos,
4390 				   const_bitregion_start,
4391 				   const_bitregion_end,
4392 				   stmt, 0, rhs_code, n, ins_stmt,
4393 				   bit_not_p, ops[0], ops[1]);
4394   new_chain->m_store_info.safe_push (info);
4395   m_stores.put (base_addr, new_chain);
4396   if (dump_file && (dump_flags & TDF_DETAILS))
4397     {
4398       fprintf (dump_file, "Starting new chain with statement:\n");
4399       print_gimple_stmt (dump_file, stmt, 0);
4400       fprintf (dump_file, "The base object is:\n");
4401       print_generic_expr (dump_file, base_addr);
4402       fprintf (dump_file, "\n");
4403     }
4404 }
4405 
4406 /* Entry point for the pass.  Go over each basic block recording chains of
4407    immediate stores.  Upon encountering a terminating statement (as defined
4408    by stmt_terminates_chain_p) process the recorded stores and emit the widened
4409    variants.  */
4410 
4411 unsigned int
4412 pass_store_merging::execute (function *fun)
4413 {
4414   basic_block bb;
4415   hash_set<gimple *> orig_stmts;
4416 
4417   calculate_dominance_info (CDI_DOMINATORS);
4418 
4419   FOR_EACH_BB_FN (bb, fun)
4420     {
4421       gimple_stmt_iterator gsi;
4422       unsigned HOST_WIDE_INT num_statements = 0;
4423       /* Record the original statements so that we can keep track of
4424 	 statements emitted in this pass and not re-process new
4425 	 statements.  */
4426       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
4427 	{
4428 	  if (is_gimple_debug (gsi_stmt (gsi)))
4429 	    continue;
4430 
4431 	  if (++num_statements >= 2)
4432 	    break;
4433 	}
4434 
4435       if (num_statements < 2)
4436 	continue;
4437 
4438       if (dump_file && (dump_flags & TDF_DETAILS))
4439 	fprintf (dump_file, "Processing basic block <%d>:\n", bb->index);
4440 
4441       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
4442 	{
4443 	  gimple *stmt = gsi_stmt (gsi);
4444 
4445 	  if (is_gimple_debug (stmt))
4446 	    continue;
4447 
4448 	  if (gimple_has_volatile_ops (stmt))
4449 	    {
4450 	      /* Terminate all chains.  */
4451 	      if (dump_file && (dump_flags & TDF_DETAILS))
4452 		fprintf (dump_file, "Volatile access terminates "
4453 				    "all chains\n");
4454 	      terminate_and_process_all_chains ();
4455 	      continue;
4456 	    }
4457 
4458 	  if (gimple_assign_single_p (stmt) && gimple_vdef (stmt)
4459 	      && !stmt_can_throw_internal (stmt)
4460 	      && lhs_valid_for_store_merging_p (gimple_assign_lhs (stmt)))
4461 	    process_store (stmt);
4462 	  else
4463 	    terminate_all_aliasing_chains (NULL, stmt);
4464 	}
4465       terminate_and_process_all_chains ();
4466     }
4467   return 0;
4468 }
4469 
4470 } // anon namespace
4471 
4472 /* Construct and return a store merging pass object.  */
4473 
4474 gimple_opt_pass *
4475 make_pass_store_merging (gcc::context *ctxt)
4476 {
4477   return new pass_store_merging (ctxt);
4478 }
4479 
4480 #if CHECKING_P
4481 
4482 namespace selftest {
4483 
4484 /* Selftests for store merging helpers.  */
4485 
4486 /* Assert that all elements of the byte arrays X and Y, both of length N,
4487    are equal.  */
4488 
4489 static void
4490 verify_array_eq (unsigned char *x, unsigned char *y, unsigned int n)
4491 {
4492   for (unsigned int i = 0; i < n; i++)
4493     {
4494       if (x[i] != y[i])
4495 	{
4496 	  fprintf (stderr, "Arrays do not match.  X:\n");
4497 	  dump_char_array (stderr, x, n);
4498 	  fprintf (stderr, "Y:\n");
4499 	  dump_char_array (stderr, y, n);
4500 	}
4501       ASSERT_EQ (x[i], y[i]);
4502     }
4503 }
4504 
4505 /* Test shift_bytes_in_array and check that it carries bits across
4506    between bytes correctly.  */
4507 
4508 static void
4509 verify_shift_bytes_in_array (void)
4510 {
4511    /* byte 1   | byte 0
4512       00011111 | 11100000.  */
4513   unsigned char orig[2] = { 0xe0, 0x1f };
4514   unsigned char in[2];
4515   memcpy (in, orig, sizeof orig);
4516 
4517   unsigned char expected[2] = { 0x80, 0x7f };
4518   shift_bytes_in_array (in, sizeof (in), 2);
4519   verify_array_eq (in, expected, sizeof (in));
4520 
4521   memcpy (in, orig, sizeof orig);
4522   memcpy (expected, orig, sizeof orig);
4523   /* Check that shifting by zero doesn't change anything.  */
4524   shift_bytes_in_array (in, sizeof (in), 0);
4525   verify_array_eq (in, expected, sizeof (in));
4526 
4527 }
4528 
4529 /* Test shift_bytes_in_array_right and check that it carries bits across
4530    between bytes correctly.  */
4531 
4532 static void
4533 verify_shift_bytes_in_array_right (void)
4534 {
4535    /* byte 0   | byte 1
4536       00011111 | 11100000.  */
4537   unsigned char orig[2] = { 0x1f, 0xe0};
4538   unsigned char in[2];
4539   memcpy (in, orig, sizeof orig);
4540   unsigned char expected[2] = { 0x07, 0xf8};
4541   shift_bytes_in_array_right (in, sizeof (in), 2);
4542   verify_array_eq (in, expected, sizeof (in));
4543 
4544   memcpy (in, orig, sizeof orig);
4545   memcpy (expected, orig, sizeof orig);
4546   /* Check that shifting by zero doesn't change anything.  */
4547   shift_bytes_in_array_right (in, sizeof (in), 0);
4548   verify_array_eq (in, expected, sizeof (in));
4549 }
4550 
4551 /* Test that clear_bit_region clears exactly the bits asked for and
4552    nothing more.  */
4553 
4554 static void
4555 verify_clear_bit_region (void)
4556 {
4557   /* Start with all bits set and test clearing various patterns in them.  */
4558   unsigned char orig[3] = { 0xff, 0xff, 0xff};
4559   unsigned char in[3];
4560   unsigned char expected[3];
4561   memcpy (in, orig, sizeof in);
4562 
4563   /* Check zeroing out all the bits.  */
4564   clear_bit_region (in, 0, 3 * BITS_PER_UNIT);
4565   expected[0] = expected[1] = expected[2] = 0;
4566   verify_array_eq (in, expected, sizeof in);
4567 
4568   memcpy (in, orig, sizeof in);
4569   /* Leave the first and last bits intact.  */
4570   clear_bit_region (in, 1, 3 * BITS_PER_UNIT - 2);
4571   expected[0] = 0x1;
4572   expected[1] = 0;
4573   expected[2] = 0x80;
4574   verify_array_eq (in, expected, sizeof in);
4575 }
4576 
4577 /* Test that clear_bit_region_be clears exactly the bits asked for and
4578    nothing more.  */
4579 
4580 static void
4581 verify_clear_bit_region_be (void)
4582 {
4583   /* Start with all bits set and test clearing various patterns in them.  */
4584   unsigned char orig[3] = { 0xff, 0xff, 0xff};
4585   unsigned char in[3];
4586   unsigned char expected[3];
4587   memcpy (in, orig, sizeof in);
4588 
4589   /* Check zeroing out all the bits.  */
4590   clear_bit_region_be (in, BITS_PER_UNIT - 1, 3 * BITS_PER_UNIT);
4591   expected[0] = expected[1] = expected[2] = 0;
4592   verify_array_eq (in, expected, sizeof in);
4593 
4594   memcpy (in, orig, sizeof in);
4595   /* Leave the first and last bits intact.  */
4596   clear_bit_region_be (in, BITS_PER_UNIT - 2, 3 * BITS_PER_UNIT - 2);
4597   expected[0] = 0x80;
4598   expected[1] = 0;
4599   expected[2] = 0x1;
4600   verify_array_eq (in, expected, sizeof in);
4601 }
4602 
4603 
4604 /* Run all of the selftests within this file.  */
4605 
4606 void
4607 store_merging_c_tests (void)
4608 {
4609   verify_shift_bytes_in_array ();
4610   verify_shift_bytes_in_array_right ();
4611   verify_clear_bit_region ();
4612   verify_clear_bit_region_be ();
4613 }
4614 
4615 } // namespace selftest
4616 #endif /* CHECKING_P.  */
4617