1 /* GIMPLE store merging and byte swapping passes.
2    Copyright (C) 2009-2018 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
/* The purpose of the store merging pass is to combine multiple memory stores
   of constant values, values loaded from memory, or bitwise operations on
   those, to consecutive memory locations, into fewer wider stores.
   For example, if we have a sequence performing four byte stores to
   consecutive memory locations:
26    [p     ] := imm1;
27    [p + 1B] := imm2;
28    [p + 2B] := imm3;
29    [p + 3B] := imm4;
30    we can transform this into a single 4-byte store if the target supports it:
31   [p] := imm1:imm2:imm3:imm4 //concatenated immediates according to endianness.
32 
33    Or:
34    [p     ] := [q     ];
35    [p + 1B] := [q + 1B];
36    [p + 2B] := [q + 2B];
37    [p + 3B] := [q + 3B];
   if there is no overlap, this can be transformed into a single 4-byte
   load followed by a single 4-byte store.
40 
41    Or:
42    [p     ] := [q     ] ^ imm1;
43    [p + 1B] := [q + 1B] ^ imm2;
44    [p + 2B] := [q + 2B] ^ imm3;
45    [p + 3B] := [q + 3B] ^ imm4;
   if there is no overlap, this can be transformed into a single 4-byte
   load, xored with imm1:imm2:imm3:imm4 and stored using a single 4-byte store.
48 
49    The algorithm is applied to each basic block in three phases:
50 
51    1) Scan through the basic block recording assignments to
52    destinations that can be expressed as a store to memory of a certain size
53    at a certain bit offset from expressions we can handle.  For bit-fields
54    we also note the surrounding bit region, bits that could be stored in
55    a read-modify-write operation when storing the bit-field.  Record store
56    chains to different bases in a hash_map (m_stores) and make sure to
   terminate such chains when appropriate (for example when the stored
58    values get used subsequently).
59    These stores can be a result of structure element initializers, array stores
60    etc.  A store_immediate_info object is recorded for every such store.
61    Record as many such assignments to a single base as possible until a
62    statement that interferes with the store sequence is encountered.
63    Each store has up to 2 operands, which can be an immediate constant
64    or a memory load, from which the value to be stored can be computed.
65    At most one of the operands can be a constant.  The operands are recorded
66    in store_operand_info struct.
67 
68    2) Analyze the chain of stores recorded in phase 1) (i.e. the vector of
69    store_immediate_info objects) and coalesce contiguous stores into
70    merged_store_group objects.  For bit-fields stores, we don't need to
71    require the stores to be contiguous, just their surrounding bit regions
72    have to be contiguous.  If the expression being stored is different
   between adjacent stores, such as one store storing a constant and the
   following one storing a value loaded from memory, or if the loaded memory
75    objects are not adjacent, a new merged_store_group is created as well.
76 
77    For example, given the stores:
78    [p     ] := 0;
79    [p + 1B] := 1;
80    [p + 3B] := 0;
81    [p + 4B] := 1;
82    [p + 5B] := 0;
83    [p + 6B] := 0;
84    This phase would produce two merged_store_group objects, one recording the
85    two bytes stored in the memory region [p : p + 1] and another
86    recording the four bytes stored in the memory region [p + 3 : p + 6].
87 
88    3) The merged_store_group objects produced in phase 2) are processed
89    to generate the sequence of wider stores that set the contiguous memory
   regions to the sequence of bytes that corresponds to them.  This may emit
91    multiple stores per store group to handle contiguous stores that are not
92    of a size that is a power of 2.  For example it can try to emit a 40-bit
93    store as a 32-bit store followed by an 8-bit store.
94    We try to emit as wide stores as we can while respecting STRICT_ALIGNMENT or
95    TARGET_SLOW_UNALIGNED_ACCESS rules.
96 
97    Note on endianness and example:
98    Consider 2 contiguous 16-bit stores followed by 2 contiguous 8-bit stores:
99    [p     ] := 0x1234;
100    [p + 2B] := 0x5678;
101    [p + 4B] := 0xab;
102    [p + 5B] := 0xcd;
103 
104    The memory layout for little-endian (LE) and big-endian (BE) must be:
105   p |LE|BE|
106   ---------
107   0 |34|12|
108   1 |12|34|
109   2 |78|56|
110   3 |56|78|
111   4 |ab|ab|
112   5 |cd|cd|
113 
114   To merge these into a single 48-bit merged value 'val' in phase 2)
115   on little-endian we insert stores to higher (consecutive) bitpositions
116   into the most significant bits of the merged value.
117   The final merged value would be: 0xcdab56781234
118 
119   For big-endian we insert stores to higher bitpositions into the least
120   significant bits of the merged value.
121   The final merged value would be: 0x12345678abcd
122 
123   Then, in phase 3), we want to emit this 48-bit value as a 32-bit store
124   followed by a 16-bit store.  Again, we must consider endianness when
125   breaking down the 48-bit value 'val' computed above.
126   For little endian we emit:
127   [p]      (32-bit) := 0x56781234; // val & 0x0000ffffffff;
128   [p + 4B] (16-bit) := 0xcdab;    // (val & 0xffff00000000) >> 32;
129 
130   Whereas for big-endian we emit:
131   [p]      (32-bit) := 0x12345678; // (val & 0xffffffff0000) >> 16;
132   [p + 4B] (16-bit) := 0xabcd;     //  val & 0x00000000ffff;  */
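
/* As an illustration (not part of the compiler itself), a C fragment such as

     struct S { char a, b, c, d; };
     void
     init_s (struct S *s)
     {
       s->a = 0;
       s->b = 1;
       s->c = 0;
       s->d = 1;
     }

   is the kind of input this pass targets: on a little-endian target that
   supports unaligned 32-bit stores, the four byte stores above may be
   replaced by a single 32-bit store of the constant 0x01000100.  The struct
   and function names are only illustrative.  */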
133 
134 #include "config.h"
135 #include "system.h"
136 #include "coretypes.h"
137 #include "backend.h"
138 #include "tree.h"
139 #include "gimple.h"
140 #include "builtins.h"
141 #include "fold-const.h"
142 #include "tree-pass.h"
143 #include "ssa.h"
144 #include "gimple-pretty-print.h"
145 #include "alias.h"
146 #include "fold-const.h"
147 #include "params.h"
148 #include "print-tree.h"
149 #include "tree-hash-traits.h"
150 #include "gimple-iterator.h"
151 #include "gimplify.h"
152 #include "stor-layout.h"
153 #include "timevar.h"
154 #include "tree-cfg.h"
155 #include "tree-eh.h"
156 #include "target.h"
157 #include "gimplify-me.h"
158 #include "rtl.h"
159 #include "expr.h"	/* For get_bit_range.  */
160 #include "optabs-tree.h"
161 #include "selftest.h"
162 
163 /* The maximum size (in bits) of the stores this pass should generate.  */
164 #define MAX_STORE_BITSIZE (BITS_PER_WORD)
165 #define MAX_STORE_BYTES (MAX_STORE_BITSIZE / BITS_PER_UNIT)
166 
167 /* Limit to bound the number of aliasing checks for loads with the same
168    vuse as the corresponding store.  */
169 #define MAX_STORE_ALIAS_CHECKS 64
170 
171 namespace {
172 
173 struct bswap_stat
174 {
175   /* Number of hand-written 16-bit nop / bswaps found.  */
176   int found_16bit;
177 
178   /* Number of hand-written 32-bit nop / bswaps found.  */
179   int found_32bit;
180 
181   /* Number of hand-written 64-bit nop / bswaps found.  */
182   int found_64bit;
183 } nop_stats, bswap_stats;
184 
185 /* A symbolic number structure is used to detect byte permutation and selection
186    patterns of a source.  To achieve that, its field N contains an artificial
   number consisting of BITS_PER_MARKER sized markers tracking where each
   byte comes from in the source:
189 
190    0	   - target byte has the value 0
191    FF	   - target byte has an unknown value (eg. due to sign extension)
192    1..size - marker value is the byte index in the source (0 for lsb).
193 
   To detect permutations on memory sources (arrays and structures), a symbolic
   number also records:
196    - a base address BASE_ADDR and an OFFSET giving the address of the source;
197    - a range which gives the difference between the highest and lowest accessed
198      memory location to make such a symbolic number;
199    - the address SRC of the source element of lowest address as a convenience
200      to easily get BASE_ADDR + offset + lowest bytepos;
   - the number N_OPS of expressions bitwise ored together, which represents
     the approximate cost of the computation.
203 
204    Note 1: the range is different from size as size reflects the size of the
205    type of the current expression.  For instance, for an array char a[],
206    (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
207    (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
208    time a range of 1.
209 
210    Note 2: for non-memory sources, range holds the same value as size.
211 
212    Note 3: SRC points to the SSA_NAME in case of non-memory source.  */
213 
214 struct symbolic_number {
215   uint64_t n;
216   tree type;
217   tree base_addr;
218   tree offset;
219   poly_int64_pod bytepos;
220   tree src;
221   tree alias_set;
222   tree vuse;
223   unsigned HOST_WIDE_INT range;
224   int n_ops;
225 };
226 
227 #define BITS_PER_MARKER 8
228 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
229 #define MARKER_BYTE_UNKNOWN MARKER_MASK
230 #define HEAD_MARKER(n, size) \
231   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
232 
233 /* The number which the find_bswap_or_nop_1 result should match in
234    order to have a nop.  The number is masked according to the size of
235    the symbolic number before using it.  */
236 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
237   (uint64_t)0x08070605 << 32 | 0x04030201)
238 
239 /* The number which the find_bswap_or_nop_1 result should match in
240    order to have a byte swap.  The number is masked according to the
241    size of the symbolic number before using it.  */
242 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
243   (uint64_t)0x01020304 << 32 | 0x05060708)
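
/* For reference, with BITS_PER_MARKER == 8 and 64-bit symbolic numbers the
   two constants above evaluate to
     CMPNOP  == 0x0807060504030201
     CMPXCHG == 0x0102030405060708
   i.e. the identity permutation of the eight byte markers and its
   byte-reversed counterpart.  */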
244 
245 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
246    number N.  Return false if the requested operation is not permitted
247    on a symbolic number.  */
248 
249 inline bool
250 do_shift_rotate (enum tree_code code,
251 		 struct symbolic_number *n,
252 		 int count)
253 {
254   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
255   unsigned head_marker;
256 
257   if (count % BITS_PER_UNIT != 0)
258     return false;
259   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
260 
261   /* Zero out the extra bits of N in order to avoid them being shifted
262      into the significant bits.  */
263   if (size < 64 / BITS_PER_MARKER)
264     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
265 
266   switch (code)
267     {
268     case LSHIFT_EXPR:
269       n->n <<= count;
270       break;
271     case RSHIFT_EXPR:
272       head_marker = HEAD_MARKER (n->n, size);
273       n->n >>= count;
274       /* Arithmetic shift of signed type: result is dependent on the value.  */
275       if (!TYPE_UNSIGNED (n->type) && head_marker)
276 	for (i = 0; i < count / BITS_PER_MARKER; i++)
277 	  n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
278 		  << ((size - 1 - i) * BITS_PER_MARKER);
279       break;
280     case LROTATE_EXPR:
281       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
282       break;
283     case RROTATE_EXPR:
284       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
285       break;
286     default:
287       return false;
288     }
289   /* Zero unused bits for size.  */
290   if (size < 64 / BITS_PER_MARKER)
291     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
292   return true;
293 }
294 
295 /* Perform sanity checking for the symbolic number N and the gimple
296    statement STMT.  */
297 
298 inline bool
299 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
300 {
301   tree lhs_type;
302 
303   lhs_type = gimple_expr_type (stmt);
304 
305   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
306     return false;
307 
308   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
309     return false;
310 
311   return true;
312 }
313 
314 /* Initialize the symbolic number N for the bswap pass from the base element
315    SRC manipulated by the bitwise OR expression.  */
316 
317 bool
318 init_symbolic_number (struct symbolic_number *n, tree src)
319 {
320   int size;
321 
322   if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
323     return false;
324 
325   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
326   n->src = src;
327 
  /* Set up the symbolic number N by setting each byte marker to a value
     between 1 and the byte size of SRC.  The highest order byte is set to
     that size and the lowest order byte to 1.  */
331   n->type = TREE_TYPE (src);
332   size = TYPE_PRECISION (n->type);
333   if (size % BITS_PER_UNIT != 0)
334     return false;
335   size /= BITS_PER_UNIT;
336   if (size > 64 / BITS_PER_MARKER)
337     return false;
338   n->range = size;
339   n->n = CMPNOP;
340   n->n_ops = 1;
341 
342   if (size < 64 / BITS_PER_MARKER)
343     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
344 
345   return true;
346 }
347 
/* Check whether STMT might be a byte swap or a nop from a memory source and
   return the answer.  If so, REF is that memory source and the base of the
   memory area accessed and the offset of the access from that base are
   recorded in N.  */
351 
352 bool
353 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
354 {
355   /* Leaf node is an array or component ref. Memorize its base and
356      offset from base to compare to other such leaf node.  */
357   poly_int64 bitsize, bitpos, bytepos;
358   machine_mode mode;
359   int unsignedp, reversep, volatilep;
360   tree offset, base_addr;
361 
362   /* Not prepared to handle PDP endian.  */
363   if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
364     return false;
365 
366   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
367     return false;
368 
369   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
370 				   &unsignedp, &reversep, &volatilep);
371 
372   if (TREE_CODE (base_addr) == TARGET_MEM_REF)
373     /* Do not rewrite TARGET_MEM_REF.  */
374     return false;
375   else if (TREE_CODE (base_addr) == MEM_REF)
376     {
377       poly_offset_int bit_offset = 0;
378       tree off = TREE_OPERAND (base_addr, 1);
379 
380       if (!integer_zerop (off))
381 	{
382 	  poly_offset_int boff = mem_ref_offset (base_addr);
383 	  boff <<= LOG2_BITS_PER_UNIT;
384 	  bit_offset += boff;
385 	}
386 
387       base_addr = TREE_OPERAND (base_addr, 0);
388 
389       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
390       if (maybe_lt (bit_offset, 0))
391 	{
392 	  tree byte_offset = wide_int_to_tree
393 	    (sizetype, bits_to_bytes_round_down (bit_offset));
394 	  bit_offset = num_trailing_bits (bit_offset);
395 	  if (offset)
396 	    offset = size_binop (PLUS_EXPR, offset, byte_offset);
397 	  else
398 	    offset = byte_offset;
399 	}
400 
401       bitpos += bit_offset.force_shwi ();
402     }
403   else
404     base_addr = build_fold_addr_expr (base_addr);
405 
406   if (!multiple_p (bitpos, BITS_PER_UNIT, &bytepos))
407     return false;
408   if (!multiple_p (bitsize, BITS_PER_UNIT))
409     return false;
410   if (reversep)
411     return false;
412 
413   if (!init_symbolic_number (n, ref))
414     return false;
415   n->base_addr = base_addr;
416   n->offset = offset;
417   n->bytepos = bytepos;
418   n->alias_set = reference_alias_ptr_type (ref);
419   n->vuse = gimple_vuse (stmt);
420   return true;
421 }
422 
423 /* Compute the symbolic number N representing the result of a bitwise OR on 2
   symbolic numbers N1 and N2 whose source statements are respectively
425    SOURCE_STMT1 and SOURCE_STMT2.  */
426 
427 gimple *
428 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
429 			gimple *source_stmt2, struct symbolic_number *n2,
430 			struct symbolic_number *n)
431 {
432   int i, size;
433   uint64_t mask;
434   gimple *source_stmt;
435   struct symbolic_number *n_start;
436 
437   tree rhs1 = gimple_assign_rhs1 (source_stmt1);
438   if (TREE_CODE (rhs1) == BIT_FIELD_REF
439       && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
440     rhs1 = TREE_OPERAND (rhs1, 0);
441   tree rhs2 = gimple_assign_rhs1 (source_stmt2);
442   if (TREE_CODE (rhs2) == BIT_FIELD_REF
443       && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
444     rhs2 = TREE_OPERAND (rhs2, 0);
445 
  /* Sources are different, cancel bswap if they are not memory locations
     with the same base (array, structure, ...).  */
448   if (rhs1 != rhs2)
449     {
450       uint64_t inc;
451       HOST_WIDE_INT start1, start2, start_sub, end_sub, end1, end2, end;
452       struct symbolic_number *toinc_n_ptr, *n_end;
453       basic_block bb1, bb2;
454 
455       if (!n1->base_addr || !n2->base_addr
456 	  || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
457 	return NULL;
458 
459       if (!n1->offset != !n2->offset
460 	  || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
461 	return NULL;
462 
463       start1 = 0;
464       if (!(n2->bytepos - n1->bytepos).is_constant (&start2))
465 	return NULL;
466 
467       if (start1 < start2)
468 	{
469 	  n_start = n1;
470 	  start_sub = start2 - start1;
471 	}
472       else
473 	{
474 	  n_start = n2;
475 	  start_sub = start1 - start2;
476 	}
477 
478       bb1 = gimple_bb (source_stmt1);
479       bb2 = gimple_bb (source_stmt2);
480       if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
481 	source_stmt = source_stmt1;
482       else
483 	source_stmt = source_stmt2;
484 
485       /* Find the highest address at which a load is performed and
486 	 compute related info.  */
487       end1 = start1 + (n1->range - 1);
488       end2 = start2 + (n2->range - 1);
489       if (end1 < end2)
490 	{
491 	  end = end2;
492 	  end_sub = end2 - end1;
493 	}
494       else
495 	{
496 	  end = end1;
497 	  end_sub = end1 - end2;
498 	}
499       n_end = (end2 > end1) ? n2 : n1;
500 
501       /* Find symbolic number whose lsb is the most significant.  */
502       if (BYTES_BIG_ENDIAN)
503 	toinc_n_ptr = (n_end == n1) ? n2 : n1;
504       else
505 	toinc_n_ptr = (n_start == n1) ? n2 : n1;
506 
507       n->range = end - MIN (start1, start2) + 1;
508 
509       /* Check that the range of memory covered can be represented by
510 	 a symbolic number.  */
511       if (n->range > 64 / BITS_PER_MARKER)
512 	return NULL;
513 
514       /* Reinterpret byte marks in symbolic number holding the value of
515 	 bigger weight according to target endianness.  */
516       inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
517       size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
518       for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
519 	{
520 	  unsigned marker
521 	    = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
522 	  if (marker && marker != MARKER_BYTE_UNKNOWN)
523 	    toinc_n_ptr->n += inc;
524 	}
525     }
526   else
527     {
528       n->range = n1->range;
529       n_start = n1;
530       source_stmt = source_stmt1;
531     }
532 
533   if (!n1->alias_set
534       || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
535     n->alias_set = n1->alias_set;
536   else
537     n->alias_set = ptr_type_node;
538   n->vuse = n_start->vuse;
539   n->base_addr = n_start->base_addr;
540   n->offset = n_start->offset;
541   n->src = n_start->src;
542   n->bytepos = n_start->bytepos;
543   n->type = n_start->type;
544   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
545 
546   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
547     {
548       uint64_t masked1, masked2;
549 
550       masked1 = n1->n & mask;
551       masked2 = n2->n & mask;
552       if (masked1 && masked2 && masked1 != masked2)
553 	return NULL;
554     }
555   n->n = n1->n | n2->n;
556   n->n_ops = n1->n_ops + n2->n_ops;
557 
558   return source_stmt;
559 }
560 
561 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
562    the operation given by the rhs of STMT on the result.  If the operation
563    could successfully be executed the function returns a gimple stmt whose
564    rhs's first tree is the expression of the source operand and NULL
565    otherwise.  */
566 
567 gimple *
568 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
569 {
570   enum tree_code code;
571   tree rhs1, rhs2 = NULL;
572   gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
573   enum gimple_rhs_class rhs_class;
574 
575   if (!limit || !is_gimple_assign (stmt))
576     return NULL;
577 
578   rhs1 = gimple_assign_rhs1 (stmt);
579 
580   if (find_bswap_or_nop_load (stmt, rhs1, n))
581     return stmt;
582 
583   /* Handle BIT_FIELD_REF.  */
584   if (TREE_CODE (rhs1) == BIT_FIELD_REF
585       && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
586     {
587       unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
588       unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
589       if (bitpos % BITS_PER_UNIT == 0
590 	  && bitsize % BITS_PER_UNIT == 0
591 	  && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
592 	{
593 	  /* Handle big-endian bit numbering in BIT_FIELD_REF.  */
594 	  if (BYTES_BIG_ENDIAN)
595 	    bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
596 
597 	  /* Shift.  */
598 	  if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
599 	    return NULL;
600 
601 	  /* Mask.  */
602 	  uint64_t mask = 0;
603 	  uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
604 	  for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
605 	       i++, tmp <<= BITS_PER_UNIT)
606 	    mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
607 	  n->n &= mask;
608 
609 	  /* Convert.  */
610 	  n->type = TREE_TYPE (rhs1);
611 	  if (!n->base_addr)
612 	    n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
613 
614 	  return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
615 	}
616 
617       return NULL;
618     }
619 
620   if (TREE_CODE (rhs1) != SSA_NAME)
621     return NULL;
622 
623   code = gimple_assign_rhs_code (stmt);
624   rhs_class = gimple_assign_rhs_class (stmt);
625   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
626 
627   if (rhs_class == GIMPLE_BINARY_RHS)
628     rhs2 = gimple_assign_rhs2 (stmt);
629 
630   /* Handle unary rhs and binary rhs with integer constants as second
631      operand.  */
632 
633   if (rhs_class == GIMPLE_UNARY_RHS
634       || (rhs_class == GIMPLE_BINARY_RHS
635 	  && TREE_CODE (rhs2) == INTEGER_CST))
636     {
637       if (code != BIT_AND_EXPR
638 	  && code != LSHIFT_EXPR
639 	  && code != RSHIFT_EXPR
640 	  && code != LROTATE_EXPR
641 	  && code != RROTATE_EXPR
642 	  && !CONVERT_EXPR_CODE_P (code))
643 	return NULL;
644 
645       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
646 
647       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
648 	 we have to initialize the symbolic number.  */
649       if (!source_stmt1)
650 	{
651 	  if (gimple_assign_load_p (stmt)
652 	      || !init_symbolic_number (n, rhs1))
653 	    return NULL;
654 	  source_stmt1 = stmt;
655 	}
656 
657       switch (code)
658 	{
659 	case BIT_AND_EXPR:
660 	  {
661 	    int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
662 	    uint64_t val = int_cst_value (rhs2), mask = 0;
663 	    uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
664 
665 	    /* Only constants masking full bytes are allowed.  */
666 	    for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
667 	      if ((val & tmp) != 0 && (val & tmp) != tmp)
668 		return NULL;
669 	      else if (val & tmp)
670 		mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
671 
672 	    n->n &= mask;
673 	  }
674 	  break;
675 	case LSHIFT_EXPR:
676 	case RSHIFT_EXPR:
677 	case LROTATE_EXPR:
678 	case RROTATE_EXPR:
679 	  if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
680 	    return NULL;
681 	  break;
682 	CASE_CONVERT:
683 	  {
684 	    int i, type_size, old_type_size;
685 	    tree type;
686 
687 	    type = gimple_expr_type (stmt);
688 	    type_size = TYPE_PRECISION (type);
689 	    if (type_size % BITS_PER_UNIT != 0)
690 	      return NULL;
691 	    type_size /= BITS_PER_UNIT;
692 	    if (type_size > 64 / BITS_PER_MARKER)
693 	      return NULL;
694 
695 	    /* Sign extension: result is dependent on the value.  */
696 	    old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
697 	    if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
698 		&& HEAD_MARKER (n->n, old_type_size))
699 	      for (i = 0; i < type_size - old_type_size; i++)
700 		n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
701 			<< ((type_size - 1 - i) * BITS_PER_MARKER);
702 
703 	    if (type_size < 64 / BITS_PER_MARKER)
704 	      {
705 		/* If STMT casts to a smaller type mask out the bits not
706 		   belonging to the target type.  */
707 		n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
708 	      }
709 	    n->type = type;
710 	    if (!n->base_addr)
711 	      n->range = type_size;
712 	  }
713 	  break;
714 	default:
715 	  return NULL;
716 	};
717       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
718     }
719 
720   /* Handle binary rhs.  */
721 
722   if (rhs_class == GIMPLE_BINARY_RHS)
723     {
724       struct symbolic_number n1, n2;
725       gimple *source_stmt, *source_stmt2;
726 
727       if (code != BIT_IOR_EXPR)
728 	return NULL;
729 
730       if (TREE_CODE (rhs2) != SSA_NAME)
731 	return NULL;
732 
733       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
734 
735       switch (code)
736 	{
737 	case BIT_IOR_EXPR:
738 	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
739 
740 	  if (!source_stmt1)
741 	    return NULL;
742 
743 	  source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
744 
745 	  if (!source_stmt2)
746 	    return NULL;
747 
748 	  if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
749 	    return NULL;
750 
751 	  if (n1.vuse != n2.vuse)
752 	    return NULL;
753 
754 	  source_stmt
755 	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
756 
757 	  if (!source_stmt)
758 	    return NULL;
759 
760 	  if (!verify_symbolic_number_p (n, stmt))
761 	    return NULL;
762 
763 	  break;
764 	default:
765 	  return NULL;
766 	}
767       return source_stmt;
768     }
769   return NULL;
770 }
771 
772 /* Helper for find_bswap_or_nop and try_coalesce_bswap to compute
773    *CMPXCHG, *CMPNOP and adjust *N.  */
774 
775 void
776 find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg,
777 			    uint64_t *cmpnop)
778 {
779   unsigned rsize;
780   uint64_t tmpn, mask;
781 
782   /* The number which the find_bswap_or_nop_1 result should match in order
783      to have a full byte swap.  The number is shifted to the right
784      according to the size of the symbolic number before using it.  */
785   *cmpxchg = CMPXCHG;
786   *cmpnop = CMPNOP;
787 
788   /* Find real size of result (highest non-zero byte).  */
789   if (n->base_addr)
790     for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
791   else
792     rsize = n->range;
793 
794   /* Zero out the bits corresponding to untouched bytes in original gimple
795      expression.  */
796   if (n->range < (int) sizeof (int64_t))
797     {
798       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
799       *cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
800       *cmpnop &= mask;
801     }
802 
803   /* Zero out the bits corresponding to unused bytes in the result of the
804      gimple expression.  */
805   if (rsize < n->range)
806     {
807       if (BYTES_BIG_ENDIAN)
808 	{
809 	  mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
810 	  *cmpxchg &= mask;
811 	  *cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
812 	}
813       else
814 	{
815 	  mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
816 	  *cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
817 	  *cmpnop &= mask;
818 	}
819       n->range = rsize;
820     }
821 
822   n->range *= BITS_PER_UNIT;
823 }
824 
825 /* Check if STMT completes a bswap implementation or a read in a given
826    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
827    accordingly.  It also sets N to represent the kind of operations
828    performed: size of the resulting expression and whether it works on
   a memory source, and if so alias-set and vuse.  Finally, the
830    function returns a stmt whose rhs's first tree is the source
831    expression.  */
832 
833 gimple *
834 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
835 {
  /* The last parameter determines the search depth limit.  It usually
     correlates directly to the number n of bytes to be touched.  We
     increase that number by log2(n) + 1 here in order to also
     cover signed -> unsigned conversions of the src operand as can be seen
     in libgcc, and for the initial shift/and operation of the src operand.  */
841   int limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
842   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
843   gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
844 
845   if (!ins_stmt)
846     return NULL;
847 
848   uint64_t cmpxchg, cmpnop;
849   find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop);
850 
  /* A complete byte swap should make the symbolic number start with
     the largest digit in the highest order byte.  An unchanged symbolic
     number indicates a read with the same endianness as the target
     architecture.  */
854   if (n->n == cmpnop)
855     *bswap = false;
856   else if (n->n == cmpxchg)
857     *bswap = true;
858   else
859     return NULL;
860 
861   /* Useless bit manipulation performed by code.  */
862   if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
863     return NULL;
864 
865   return ins_stmt;
866 }
867 
868 const pass_data pass_data_optimize_bswap =
869 {
870   GIMPLE_PASS, /* type */
871   "bswap", /* name */
872   OPTGROUP_NONE, /* optinfo_flags */
873   TV_NONE, /* tv_id */
874   PROP_ssa, /* properties_required */
875   0, /* properties_provided */
876   0, /* properties_destroyed */
877   0, /* todo_flags_start */
878   0, /* todo_flags_finish */
879 };
880 
881 class pass_optimize_bswap : public gimple_opt_pass
882 {
883 public:
884   pass_optimize_bswap (gcc::context *ctxt)
885     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
886   {}
887 
888   /* opt_pass methods: */
889   virtual bool gate (function *)
890     {
891       return flag_expensive_optimizations && optimize && BITS_PER_UNIT == 8;
892     }
893 
894   virtual unsigned int execute (function *);
895 
896 }; // class pass_optimize_bswap
897 
898 /* Perform the bswap optimization: replace the expression computed in the rhs
899    of gsi_stmt (GSI) (or if NULL add instead of replace) by an equivalent
900    bswap, load or load + bswap expression.
   Which of these alternatives replaces the rhs is given by N->base_addr
   (non null if a load is needed) and BSWAP.  The type, VUSE and alias-set
   of the load to perform are also given in N while the builtin bswap
   invocation is given in FNDECL.  Finally, if a load is involved, INS_STMT
   refers to one of the
905    load statements involved to construct the rhs in gsi_stmt (GSI) and
906    N->range gives the size of the rhs expression for maintaining some
907    statistics.
908 
   Note that if the replacement involves a load and if gsi_stmt (GSI) is
   non-NULL, that stmt is moved just before INS_STMT to do the load with the
   same VUSE, which can lead to gsi_stmt (GSI) changing basic block.  */
912 
913 tree
914 bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
915 	       tree bswap_type, tree load_type, struct symbolic_number *n,
916 	       bool bswap)
917 {
918   tree src, tmp, tgt = NULL_TREE;
919   gimple *bswap_stmt;
920 
921   gimple *cur_stmt = gsi_stmt (gsi);
922   src = n->src;
923   if (cur_stmt)
924     tgt = gimple_assign_lhs (cur_stmt);
925 
926   /* Need to load the value from memory first.  */
927   if (n->base_addr)
928     {
929       gimple_stmt_iterator gsi_ins = gsi;
930       if (ins_stmt)
931 	gsi_ins = gsi_for_stmt (ins_stmt);
932       tree addr_expr, addr_tmp, val_expr, val_tmp;
933       tree load_offset_ptr, aligned_load_type;
934       gimple *load_stmt;
935       unsigned align = get_object_alignment (src);
936       poly_int64 load_offset = 0;
937 
938       if (cur_stmt)
939 	{
940 	  basic_block ins_bb = gimple_bb (ins_stmt);
941 	  basic_block cur_bb = gimple_bb (cur_stmt);
942 	  if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
943 	    return NULL_TREE;
944 
945 	  /* Move cur_stmt just before one of the load of the original
946 	     to ensure it has the same VUSE.  See PR61517 for what could
947 	     go wrong.  */
948 	  if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
949 	    reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
950 	  gsi_move_before (&gsi, &gsi_ins);
951 	  gsi = gsi_for_stmt (cur_stmt);
952 	}
953       else
954 	gsi = gsi_ins;
955 
956       /* Compute address to load from and cast according to the size
957 	 of the load.  */
958       addr_expr = build_fold_addr_expr (src);
959       if (is_gimple_mem_ref_addr (addr_expr))
960 	addr_tmp = unshare_expr (addr_expr);
961       else
962 	{
963 	  addr_tmp = unshare_expr (n->base_addr);
964 	  if (!is_gimple_mem_ref_addr (addr_tmp))
965 	    addr_tmp = force_gimple_operand_gsi_1 (&gsi, addr_tmp,
966 						   is_gimple_mem_ref_addr,
967 						   NULL_TREE, true,
968 						   GSI_SAME_STMT);
969 	  load_offset = n->bytepos;
970 	  if (n->offset)
971 	    {
972 	      tree off
973 		= force_gimple_operand_gsi (&gsi, unshare_expr (n->offset),
974 					    true, NULL_TREE, true,
975 					    GSI_SAME_STMT);
976 	      gimple *stmt
977 		= gimple_build_assign (make_ssa_name (TREE_TYPE (addr_tmp)),
978 				       POINTER_PLUS_EXPR, addr_tmp, off);
979 	      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
980 	      addr_tmp = gimple_assign_lhs (stmt);
981 	    }
982 	}
983 
984       /* Perform the load.  */
985       aligned_load_type = load_type;
986       if (align < TYPE_ALIGN (load_type))
987 	aligned_load_type = build_aligned_type (load_type, align);
988       load_offset_ptr = build_int_cst (n->alias_set, load_offset);
989       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
990 			      load_offset_ptr);
991 
992       if (!bswap)
993 	{
994 	  if (n->range == 16)
995 	    nop_stats.found_16bit++;
996 	  else if (n->range == 32)
997 	    nop_stats.found_32bit++;
998 	  else
999 	    {
1000 	      gcc_assert (n->range == 64);
1001 	      nop_stats.found_64bit++;
1002 	    }
1003 
1004 	  /* Convert the result of load if necessary.  */
1005 	  if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), load_type))
1006 	    {
1007 	      val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
1008 					    "load_dst");
1009 	      load_stmt = gimple_build_assign (val_tmp, val_expr);
1010 	      gimple_set_vuse (load_stmt, n->vuse);
1011 	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
1012 	      gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
1013 	      update_stmt (cur_stmt);
1014 	    }
1015 	  else if (cur_stmt)
1016 	    {
1017 	      gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
1018 	      gimple_set_vuse (cur_stmt, n->vuse);
1019 	      update_stmt (cur_stmt);
1020 	    }
1021 	  else
1022 	    {
1023 	      tgt = make_ssa_name (load_type);
1024 	      cur_stmt = gimple_build_assign (tgt, MEM_REF, val_expr);
1025 	      gimple_set_vuse (cur_stmt, n->vuse);
1026 	      gsi_insert_before (&gsi, cur_stmt, GSI_SAME_STMT);
1027 	    }
1028 
1029 	  if (dump_file)
1030 	    {
1031 	      fprintf (dump_file,
1032 		       "%d bit load in target endianness found at: ",
1033 		       (int) n->range);
1034 	      print_gimple_stmt (dump_file, cur_stmt, 0);
1035 	    }
1036 	  return tgt;
1037 	}
1038       else
1039 	{
1040 	  val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
1041 	  load_stmt = gimple_build_assign (val_tmp, val_expr);
1042 	  gimple_set_vuse (load_stmt, n->vuse);
1043 	  gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
1044 	}
1045       src = val_tmp;
1046     }
1047   else if (!bswap)
1048     {
1049       gimple *g = NULL;
1050       if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
1051 	{
1052 	  if (!is_gimple_val (src))
1053 	    return NULL_TREE;
1054 	  g = gimple_build_assign (tgt, NOP_EXPR, src);
1055 	}
1056       else if (cur_stmt)
1057 	g = gimple_build_assign (tgt, src);
1058       else
1059 	tgt = src;
1060       if (n->range == 16)
1061 	nop_stats.found_16bit++;
1062       else if (n->range == 32)
1063 	nop_stats.found_32bit++;
1064       else
1065 	{
1066 	  gcc_assert (n->range == 64);
1067 	  nop_stats.found_64bit++;
1068 	}
1069       if (dump_file)
1070 	{
1071 	  fprintf (dump_file,
1072 		   "%d bit reshuffle in target endianness found at: ",
1073 		   (int) n->range);
1074 	  if (cur_stmt)
1075 	    print_gimple_stmt (dump_file, cur_stmt, 0);
1076 	  else
1077 	    {
1078 	      print_generic_expr (dump_file, tgt, 0);
1079 	      fprintf (dump_file, "\n");
1080 	    }
1081 	}
1082       if (cur_stmt)
1083 	gsi_replace (&gsi, g, true);
1084       return tgt;
1085     }
1086   else if (TREE_CODE (src) == BIT_FIELD_REF)
1087     src = TREE_OPERAND (src, 0);
1088 
1089   if (n->range == 16)
1090     bswap_stats.found_16bit++;
1091   else if (n->range == 32)
1092     bswap_stats.found_32bit++;
1093   else
1094     {
1095       gcc_assert (n->range == 64);
1096       bswap_stats.found_64bit++;
1097     }
1098 
1099   tmp = src;
1100 
1101   /* Convert the src expression if necessary.  */
1102   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
1103     {
1104       gimple *convert_stmt;
1105 
1106       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
1107       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
1108       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1109     }
1110 
1111   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit values
1112      are considered as rotation of 2N bit values by N bits is generally not
1113      equivalent to a bswap.  Consider for instance 0x01020304 r>> 16 which
1114      gives 0x03040102 while a bswap for that value is 0x04030201.  */
1115   if (bswap && n->range == 16)
1116     {
1117       tree count = build_int_cst (NULL, BITS_PER_UNIT);
1118       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
1119       bswap_stmt = gimple_build_assign (NULL, src);
1120     }
1121   else
1122     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
1123 
1124   if (tgt == NULL_TREE)
1125     tgt = make_ssa_name (bswap_type);
1126   tmp = tgt;
1127 
1128   /* Convert the result if necessary.  */
1129   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
1130     {
1131       gimple *convert_stmt;
1132 
1133       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
1134       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
1135       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1136     }
1137 
1138   gimple_set_lhs (bswap_stmt, tmp);
1139 
1140   if (dump_file)
1141     {
1142       fprintf (dump_file, "%d bit bswap implementation found at: ",
1143 	       (int) n->range);
1144       if (cur_stmt)
1145 	print_gimple_stmt (dump_file, cur_stmt, 0);
1146       else
1147 	{
1148 	  print_generic_expr (dump_file, tgt, 0);
1149 	  fprintf (dump_file, "\n");
1150 	}
1151     }
1152 
1153   if (cur_stmt)
1154     {
1155       gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
1156       gsi_remove (&gsi, true);
1157     }
1158   else
1159     gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT);
1160   return tgt;
1161 }
1162 
/* Find manual byte swap implementations as well as loads in a given
   endianness.  Byte swaps are turned into a bswap builtin invocation
   while endian loads are converted to a bswap builtin invocation or a
   simple load according to the target endianness.  */
1167 
1168 unsigned int
1169 pass_optimize_bswap::execute (function *fun)
1170 {
1171   basic_block bb;
1172   bool bswap32_p, bswap64_p;
1173   bool changed = false;
1174   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1175 
1176   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1177 	       && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1178   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1179 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1180 		   || (bswap32_p && word_mode == SImode)));
1181 
1182   /* Determine the argument type of the builtins.  The code later on
1183      assumes that the return and argument type are the same.  */
1184   if (bswap32_p)
1185     {
1186       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1187       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1188     }
1189 
1190   if (bswap64_p)
1191     {
1192       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1193       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1194     }
1195 
1196   memset (&nop_stats, 0, sizeof (nop_stats));
1197   memset (&bswap_stats, 0, sizeof (bswap_stats));
1198   calculate_dominance_info (CDI_DOMINATORS);
1199 
1200   FOR_EACH_BB_FN (bb, fun)
1201     {
1202       gimple_stmt_iterator gsi;
1203 
1204       /* We do a reverse scan for bswap patterns to make sure we get the
1205 	 widest match. As bswap pattern matching doesn't handle previously
1206 	 inserted smaller bswap replacements as sub-patterns, the wider
1207 	 variant wouldn't be detected.  */
1208       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
1209 	{
1210 	  gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
1211 	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
1212 	  enum tree_code code;
1213 	  struct symbolic_number n;
1214 	  bool bswap;
1215 
	  /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
	     might be moved to a different basic block by bswap_replace and gsi
	     must not point to it if that's the case.  Moving the gsi_prev
	     there makes sure that gsi points to the statement previous to
	     cur_stmt while still making sure that all statements are
	     considered in this basic block.  */
1222 	  gsi_prev (&gsi);
1223 
1224 	  if (!is_gimple_assign (cur_stmt))
1225 	    continue;
1226 
1227 	  code = gimple_assign_rhs_code (cur_stmt);
1228 	  switch (code)
1229 	    {
1230 	    case LROTATE_EXPR:
1231 	    case RROTATE_EXPR:
1232 	      if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
1233 		  || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
1234 		     % BITS_PER_UNIT)
1235 		continue;
1236 	      /* Fall through.  */
1237 	    case BIT_IOR_EXPR:
1238 	      break;
1239 	    default:
1240 	      continue;
1241 	    }
1242 
1243 	  ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
1244 
1245 	  if (!ins_stmt)
1246 	    continue;
1247 
1248 	  switch (n.range)
1249 	    {
1250 	    case 16:
1251 	      /* Already in canonical form, nothing to do.  */
1252 	      if (code == LROTATE_EXPR || code == RROTATE_EXPR)
1253 		continue;
1254 	      load_type = bswap_type = uint16_type_node;
1255 	      break;
1256 	    case 32:
1257 	      load_type = uint32_type_node;
1258 	      if (bswap32_p)
1259 		{
1260 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1261 		  bswap_type = bswap32_type;
1262 		}
1263 	      break;
1264 	    case 64:
1265 	      load_type = uint64_type_node;
1266 	      if (bswap64_p)
1267 		{
1268 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1269 		  bswap_type = bswap64_type;
1270 		}
1271 	      break;
1272 	    default:
1273 	      continue;
1274 	    }
1275 
1276 	  if (bswap && !fndecl && n.range != 16)
1277 	    continue;
1278 
1279 	  if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl,
1280 			     bswap_type, load_type, &n, bswap))
1281 	    changed = true;
1282 	}
1283     }
1284 
1285   statistics_counter_event (fun, "16-bit nop implementations found",
1286 			    nop_stats.found_16bit);
1287   statistics_counter_event (fun, "32-bit nop implementations found",
1288 			    nop_stats.found_32bit);
1289   statistics_counter_event (fun, "64-bit nop implementations found",
1290 			    nop_stats.found_64bit);
1291   statistics_counter_event (fun, "16-bit bswap implementations found",
1292 			    bswap_stats.found_16bit);
1293   statistics_counter_event (fun, "32-bit bswap implementations found",
1294 			    bswap_stats.found_32bit);
1295   statistics_counter_event (fun, "64-bit bswap implementations found",
1296 			    bswap_stats.found_64bit);
1297 
1298   return (changed ? TODO_update_ssa : 0);
1299 }
1300 
1301 } // anon namespace
1302 
1303 gimple_opt_pass *
1304 make_pass_optimize_bswap (gcc::context *ctxt)
1305 {
1306   return new pass_optimize_bswap (ctxt);
1307 }
1308 
1309 namespace {
1310 
1311 /* Struct recording one operand for the store, which is either a constant,
1312    then VAL represents the constant and all the other fields are zero,
1313    or a memory load, then VAL represents the reference, BASE_ADDR is non-NULL
1314    and the other fields also reflect the memory load.  */
1315 
1316 struct store_operand_info
1317 {
1318   tree val;
1319   tree base_addr;
1320   poly_uint64 bitsize;
1321   poly_uint64 bitpos;
1322   poly_uint64 bitregion_start;
1323   poly_uint64 bitregion_end;
1324   gimple *stmt;
1325   bool bit_not_p;
1326   store_operand_info ();
1327 };
1328 
1329 store_operand_info::store_operand_info ()
1330   : val (NULL_TREE), base_addr (NULL_TREE), bitsize (0), bitpos (0),
1331     bitregion_start (0), bitregion_end (0), stmt (NULL), bit_not_p (false)
1332 {
1333 }
1334 
1335 /* Struct recording the information about a single store of an immediate
1336    to memory.  These are created in the first phase and coalesced into
1337    merged_store_group objects in the second phase.  */
1338 
1339 struct store_immediate_info
1340 {
1341   unsigned HOST_WIDE_INT bitsize;
1342   unsigned HOST_WIDE_INT bitpos;
1343   unsigned HOST_WIDE_INT bitregion_start;
1344   /* This is one past the last bit of the bit region.  */
1345   unsigned HOST_WIDE_INT bitregion_end;
1346   gimple *stmt;
1347   unsigned int order;
1348   /* INTEGER_CST for constant stores, MEM_REF for memory copy or
1349      BIT_*_EXPR for logical bitwise operation.
1350      LROTATE_EXPR if it can be only bswap optimized and
1351      ops are not really meaningful.
1352      NOP_EXPR if bswap optimization detected identity, ops
1353      are not meaningful.  */
1354   enum tree_code rhs_code;
1355   /* Two fields for bswap optimization purposes.  */
1356   struct symbolic_number n;
1357   gimple *ins_stmt;
1358   /* True if BIT_{AND,IOR,XOR}_EXPR result is inverted before storing.  */
1359   bool bit_not_p;
1360   /* True if ops have been swapped and thus ops[1] represents
1361      rhs1 of BIT_{AND,IOR,XOR}_EXPR and ops[0] represents rhs2.  */
1362   bool ops_swapped_p;
1363   /* Operands.  For BIT_*_EXPR rhs_code both operands are used, otherwise
1364      just the first one.  */
1365   store_operand_info ops[2];
1366   store_immediate_info (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
1367 			unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
1368 			gimple *, unsigned int, enum tree_code,
1369 			struct symbolic_number &, gimple *, bool,
1370 			const store_operand_info &,
1371 			const store_operand_info &);
1372 };
1373 
1374 store_immediate_info::store_immediate_info (unsigned HOST_WIDE_INT bs,
1375 					    unsigned HOST_WIDE_INT bp,
1376 					    unsigned HOST_WIDE_INT brs,
1377 					    unsigned HOST_WIDE_INT bre,
1378 					    gimple *st,
1379 					    unsigned int ord,
1380 					    enum tree_code rhscode,
1381 					    struct symbolic_number &nr,
1382 					    gimple *ins_stmtp,
1383 					    bool bitnotp,
1384 					    const store_operand_info &op0r,
1385 					    const store_operand_info &op1r)
1386   : bitsize (bs), bitpos (bp), bitregion_start (brs), bitregion_end (bre),
1387     stmt (st), order (ord), rhs_code (rhscode), n (nr),
1388     ins_stmt (ins_stmtp), bit_not_p (bitnotp), ops_swapped_p (false)
1389 #if __cplusplus >= 201103L
1390     , ops { op0r, op1r }
1391 {
1392 }
1393 #else
1394 {
1395   ops[0] = op0r;
1396   ops[1] = op1r;
1397 }
1398 #endif
1399 
1400 /* Struct representing a group of stores to contiguous memory locations.
1401    These are produced by the second phase (coalescing) and consumed in the
1402    third phase that outputs the widened stores.  */
1403 
1404 struct merged_store_group
1405 {
1406   unsigned HOST_WIDE_INT start;
1407   unsigned HOST_WIDE_INT width;
1408   unsigned HOST_WIDE_INT bitregion_start;
1409   unsigned HOST_WIDE_INT bitregion_end;
1410   /* The size of the allocated memory for val and mask.  */
1411   unsigned HOST_WIDE_INT buf_size;
1412   unsigned HOST_WIDE_INT align_base;
1413   poly_uint64 load_align_base[2];
1414 
1415   unsigned int align;
1416   unsigned int load_align[2];
1417   unsigned int first_order;
1418   unsigned int last_order;
1419   unsigned int first_nonmergeable_order;
1420 
1421   auto_vec<store_immediate_info *> stores;
1422   /* We record the first and last original statements in the sequence because
1423      we'll need their vuse/vdef and replacement position.  It's easier to keep
1424      track of them separately as 'stores' is reordered by apply_stores.  */
1425   gimple *last_stmt;
1426   gimple *first_stmt;
1427   unsigned char *val;
1428   unsigned char *mask;
1429 
1430   merged_store_group (store_immediate_info *);
1431   ~merged_store_group ();
1432   void merge_into (store_immediate_info *);
1433   void merge_overlapping (store_immediate_info *);
1434   bool apply_stores ();
1435 private:
1436   void do_merge (store_immediate_info *);
1437 };
1438 
1439 /* Debug helper.  Dump LEN elements of byte array PTR to FD in hex.  */
1440 
1441 static void
1442 dump_char_array (FILE *fd, unsigned char *ptr, unsigned int len)
1443 {
1444   if (!fd)
1445     return;
1446 
1447   for (unsigned int i = 0; i < len; i++)
1448     fprintf (fd, "%x ", ptr[i]);
1449   fprintf (fd, "\n");
1450 }
1451 
1452 /* Shift left the bytes in PTR of SZ elements by AMNT bits, carrying over the
1453    bits between adjacent elements.  AMNT should be within
1454    [0, BITS_PER_UNIT).
1455    Example, AMNT = 2:
1456    00011111|11100000 << 2 = 01111111|10000000
1457    PTR[1]  | PTR[0]         PTR[1]  | PTR[0].  */
1458 
1459 static void
1460 shift_bytes_in_array (unsigned char *ptr, unsigned int sz, unsigned int amnt)
1461 {
1462   if (amnt == 0)
1463     return;
1464 
1465   unsigned char carry_over = 0U;
1466   unsigned char carry_mask = (~0U) << (unsigned char) (BITS_PER_UNIT - amnt);
1467   unsigned char clear_mask = (~0U) << amnt;
1468 
1469   for (unsigned int i = 0; i < sz; i++)
1470     {
1471       unsigned prev_carry_over = carry_over;
1472       carry_over = (ptr[i] & carry_mask) >> (BITS_PER_UNIT - amnt);
1473 
1474       ptr[i] <<= amnt;
1475       if (i != 0)
1476 	{
1477 	  ptr[i] &= clear_mask;
1478 	  ptr[i] |= prev_carry_over;
1479 	}
1480     }
1481 }
1482 
1483 /* Like shift_bytes_in_array but for big-endian.
1484    Shift right the bytes in PTR of SZ elements by AMNT bits, carrying over the
1485    bits between adjacent elements.  AMNT should be within
1486    [0, BITS_PER_UNIT).
1487    Example, AMNT = 2:
1488    00011111|11100000 >> 2 = 00000111|11111000
1489    PTR[0]  | PTR[1]         PTR[0]  | PTR[1].  */
1490 
1491 static void
1492 shift_bytes_in_array_right (unsigned char *ptr, unsigned int sz,
1493 			    unsigned int amnt)
1494 {
1495   if (amnt == 0)
1496     return;
1497 
1498   unsigned char carry_over = 0U;
1499   unsigned char carry_mask = ~(~0U << amnt);
1500 
1501   for (unsigned int i = 0; i < sz; i++)
1502     {
1503       unsigned prev_carry_over = carry_over;
1504       carry_over = ptr[i] & carry_mask;
1505 
1506       carry_over <<= (unsigned char) BITS_PER_UNIT - amnt;
1507       ptr[i] >>= amnt;
1508       ptr[i] |= prev_carry_over;
1509     }
1510 }
1511 
1512 /* Clear out LEN bits starting from bit START in the byte array
1513    PTR.  This clears the bits to the *right* from START.
1514    START must be within [0, BITS_PER_UNIT) and counts starting from
1515    the least significant bit.  */
1516 
1517 static void
1518 clear_bit_region_be (unsigned char *ptr, unsigned int start,
1519 		     unsigned int len)
1520 {
1521   if (len == 0)
1522     return;
1523   /* Clear len bits to the right of start.  */
1524   else if (len <= start + 1)
1525     {
1526       unsigned char mask = (~(~0U << len));
1527       mask = mask << (start + 1U - len);
1528       ptr[0] &= ~mask;
1529     }
1530   else if (start != BITS_PER_UNIT - 1)
1531     {
1532       clear_bit_region_be (ptr, start, (start % BITS_PER_UNIT) + 1);
1533       clear_bit_region_be (ptr + 1, BITS_PER_UNIT - 1,
1534 			   len - (start % BITS_PER_UNIT) - 1);
1535     }
1536   else if (start == BITS_PER_UNIT - 1
1537 	   && len > BITS_PER_UNIT)
1538     {
1539       unsigned int nbytes = len / BITS_PER_UNIT;
1540       memset (ptr, 0, nbytes);
1541       if (len % BITS_PER_UNIT != 0)
1542 	clear_bit_region_be (ptr + nbytes, BITS_PER_UNIT - 1,
1543 			     len % BITS_PER_UNIT);
1544     }
1545   else
1546     gcc_unreachable ();
1547 }
1548 
/* In the byte array PTR clear the bit region starting at bit
   START that is LEN bits wide.
1551    For regions spanning multiple bytes do this recursively until we reach
1552    zero LEN or a region contained within a single byte.  */
1553 
1554 static void
1555 clear_bit_region (unsigned char *ptr, unsigned int start,
1556 		  unsigned int len)
1557 {
1558   /* Degenerate base case.  */
1559   if (len == 0)
1560     return;
1561   else if (start >= BITS_PER_UNIT)
1562     clear_bit_region (ptr + 1, start - BITS_PER_UNIT, len);
1563   /* Second base case.  */
1564   else if ((start + len) <= BITS_PER_UNIT)
1565     {
1566       unsigned char mask = (~0U) << (unsigned char) (BITS_PER_UNIT - len);
1567       mask >>= BITS_PER_UNIT - (start + len);
1568 
1569       ptr[0] &= ~mask;
1570 
1571       return;
1572     }
1573   /* Clear most significant bits in a byte and proceed with the next byte.  */
1574   else if (start != 0)
1575     {
1576       clear_bit_region (ptr, start, BITS_PER_UNIT - start);
1577       clear_bit_region (ptr + 1, 0, len - (BITS_PER_UNIT - start));
1578     }
1579   /* Whole bytes need to be cleared.  */
1580   else if (start == 0 && len > BITS_PER_UNIT)
1581     {
1582       unsigned int nbytes = len / BITS_PER_UNIT;
1583       /* We could recurse on each byte but we clear whole bytes, so a simple
1584 	 memset will do.  */
1585       memset (ptr, '\0', nbytes);
1586       /* Clear the remaining sub-byte region if there is one.  */
1587       if (len % BITS_PER_UNIT != 0)
1588 	clear_bit_region (ptr + nbytes, 0, len % BITS_PER_UNIT);
1589     }
1590   else
1591     gcc_unreachable ();
1592 }
1593 
1594 /* Write BITLEN bits of EXPR to the byte array PTR at
1595    bit position BITPOS.  PTR should contain TOTAL_BYTES elements.
1596    Return true if the operation succeeded.  */
1597 
1598 static bool
1599 encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos,
1600 		       unsigned int total_bytes)
1601 {
1602   unsigned int first_byte = bitpos / BITS_PER_UNIT;
1603   tree tmp_int = expr;
1604   bool sub_byte_op_p = ((bitlen % BITS_PER_UNIT)
1605 			|| (bitpos % BITS_PER_UNIT)
1606 			|| !int_mode_for_size (bitlen, 0).exists ());
1607 
1608   if (!sub_byte_op_p)
1609     return native_encode_expr (tmp_int, ptr + first_byte, total_bytes) != 0;
1610 
1611   /* LITTLE-ENDIAN
1612      We are writing a non byte-sized quantity or at a position that is not
1613      at a byte boundary.
1614      |--------|--------|--------| ptr + first_byte
1615            ^              ^
1616            xxx xxxxxxxx xxx< bp>
1617            |______EXPR____|
1618 
1619      First native_encode_expr EXPR into a temporary buffer and shift each
1620      byte in the buffer by 'bp' (carrying the bits over as necessary).
1621      |00000000|00xxxxxx|xxxxxxxx| << bp = |000xxxxx|xxxxxxxx|xxx00000|
1622                                               <------bitlen---->< bp>
1623     Then we clear the destination bits:
1624     |---00000|00000000|000-----| ptr + first_byte
1625         <-------bitlen--->< bp>
1626 
1627     Finally we ORR the bytes of the shifted EXPR into the cleared region:
1628     |---xxxxx||xxxxxxxx||xxx-----| ptr + first_byte.
1629 
1630    BIG-ENDIAN
   We are writing a non-byte-sized quantity or at a position that is not
1632    at a byte boundary.
1633      ptr + first_byte |--------|--------|--------|
1634                             ^              ^
1635                        <bp >xxx xxxxxxxx xxx
1636                             |_____EXPR_____|
1637 
1638      First native_encode_expr EXPR into a temporary buffer and shift each
     byte in the buffer to the right (carrying the bits over as necessary).
1640      We shift by as much as needed to align the most significant bit of EXPR
1641      with bitpos:
1642      |00xxxxxx|xxxxxxxx| >> 3 = |00000xxx|xxxxxxxx|xxxxx000|
1643         <---bitlen---->          <bp ><-----bitlen----->
1644     Then we clear the destination bits:
1645     ptr + first_byte |-----000||00000000||00000---|
1646                       <bp ><-------bitlen----->
1647 
1648     Finally we ORR the bytes of the shifted EXPR into the cleared region:
1649     ptr + first_byte |---xxxxx||xxxxxxxx||xxx-----|.
1650     The awkwardness comes from the fact that bitpos is counted from the
1651     most significant bit of a byte.  */
1652 
1653   /* We must be dealing with fixed-size data at this point, since the
1654      total size is also fixed.  */
1655   fixed_size_mode mode = as_a <fixed_size_mode> (TYPE_MODE (TREE_TYPE (expr)));
1656   /* Allocate an extra byte so that we have space to shift into.  */
1657   unsigned int byte_size = GET_MODE_SIZE (mode) + 1;
1658   unsigned char *tmpbuf = XALLOCAVEC (unsigned char, byte_size);
1659   memset (tmpbuf, '\0', byte_size);
1660   /* The store detection code should only have allowed constants that are
1661      accepted by native_encode_expr.  */
1662   if (native_encode_expr (expr, tmpbuf, byte_size - 1) == 0)
1663     gcc_unreachable ();
1664 
1665   /* The native_encode_expr machinery uses TYPE_MODE to determine how many
1666      bytes to write.  This means it can write more than
1667      ROUND_UP (bitlen, BITS_PER_UNIT) / BITS_PER_UNIT bytes (for example
1668      write 8 bytes for a bitlen of 40).  Skip the bytes that are not within
1669      bitlen and zero out the bits that are not relevant as well (that may
1670      contain a sign bit due to sign-extension).  */
1671   unsigned int padding
1672     = byte_size - ROUND_UP (bitlen, BITS_PER_UNIT) / BITS_PER_UNIT - 1;
1673   /* On big-endian the padding is at the 'front' so just skip the initial
1674      bytes.  */
1675   if (BYTES_BIG_ENDIAN)
1676     tmpbuf += padding;
1677 
1678   byte_size -= padding;
1679 
1680   if (bitlen % BITS_PER_UNIT != 0)
1681     {
1682       if (BYTES_BIG_ENDIAN)
1683 	clear_bit_region_be (tmpbuf, BITS_PER_UNIT - 1,
1684 			     BITS_PER_UNIT - (bitlen % BITS_PER_UNIT));
1685       else
1686 	clear_bit_region (tmpbuf, bitlen,
1687 			  byte_size * BITS_PER_UNIT - bitlen);
1688     }
  /* Left shifting relies on the last byte being zero when BITLEN is
     a multiple of BITS_PER_UNIT, which might not be the case when
     there are padding bytes.  */
1692   else if (!BYTES_BIG_ENDIAN)
1693     tmpbuf[byte_size - 1] = '\0';
1694 
  /* Clear the bit region in PTR into which the bits from TMPBUF will be
     inserted.  */
1697   if (BYTES_BIG_ENDIAN)
1698     clear_bit_region_be (ptr + first_byte,
1699 			 BITS_PER_UNIT - 1 - (bitpos % BITS_PER_UNIT), bitlen);
1700   else
1701     clear_bit_region (ptr + first_byte, bitpos % BITS_PER_UNIT, bitlen);
1702 
1703   int shift_amnt;
1704   int bitlen_mod = bitlen % BITS_PER_UNIT;
1705   int bitpos_mod = bitpos % BITS_PER_UNIT;
1706 
1707   bool skip_byte = false;
1708   if (BYTES_BIG_ENDIAN)
1709     {
1710       /* BITPOS and BITLEN are exactly aligned and no shifting
1711 	 is necessary.  */
1712       if (bitpos_mod + bitlen_mod == BITS_PER_UNIT
1713 	  || (bitpos_mod == 0 && bitlen_mod == 0))
1714 	shift_amnt = 0;
1715       /* |. . . . . . . .|
1716 	  <bp >   <blen >.
1717 	 We always shift right for BYTES_BIG_ENDIAN so shift the beginning
1718 	 of the value until it aligns with 'bp' in the next byte over.  */
1719       else if (bitpos_mod + bitlen_mod < BITS_PER_UNIT)
1720 	{
1721 	  shift_amnt = bitlen_mod + bitpos_mod;
1722 	  skip_byte = bitlen_mod != 0;
1723 	}
1724       /* |. . . . . . . .|
1725 	  <----bp--->
1726 	    <---blen---->.
1727 	 Shift the value right within the same byte so it aligns with 'bp'.  */
1728       else
1729 	shift_amnt = bitlen_mod + bitpos_mod - BITS_PER_UNIT;
1730     }
1731   else
1732     shift_amnt = bitpos % BITS_PER_UNIT;
1733 
1734   /* Create the shifted version of EXPR.  */
1735   if (!BYTES_BIG_ENDIAN)
1736     {
1737       shift_bytes_in_array (tmpbuf, byte_size, shift_amnt);
1738       if (shift_amnt == 0)
1739 	byte_size--;
1740     }
1741   else
1742     {
1743       gcc_assert (BYTES_BIG_ENDIAN);
1744       shift_bytes_in_array_right (tmpbuf, byte_size, shift_amnt);
1745       /* If shifting right forced us to move into the next byte skip the now
1746 	 empty byte.  */
1747       if (skip_byte)
1748 	{
1749 	  tmpbuf++;
1750 	  byte_size--;
1751 	}
1752     }
1753 
1754   /* Insert the bits from TMPBUF.  */
1755   for (unsigned int i = 0; i < byte_size; i++)
1756     ptr[first_byte + i] |= tmpbuf[i];
1757 
1758   return true;
1759 }
1760 
1761 /* Sorting function for store_immediate_info objects.
1762    Sorts them by bitposition.  */
1763 
1764 static int
1765 sort_by_bitpos (const void *x, const void *y)
1766 {
1767   store_immediate_info *const *tmp = (store_immediate_info * const *) x;
1768   store_immediate_info *const *tmp2 = (store_immediate_info * const *) y;
1769 
1770   if ((*tmp)->bitpos < (*tmp2)->bitpos)
1771     return -1;
1772   else if ((*tmp)->bitpos > (*tmp2)->bitpos)
1773     return 1;
1774   else
    /* If the bit positions are the same, fall back on the order, which is
       guaranteed to be different.  */
1777     return (*tmp)->order - (*tmp2)->order;
1778 }
1779 
1780 /* Sorting function for store_immediate_info objects.
1781    Sorts them by the order field.  */
1782 
1783 static int
1784 sort_by_order (const void *x, const void *y)
1785 {
1786   store_immediate_info *const *tmp = (store_immediate_info * const *) x;
1787   store_immediate_info *const *tmp2 = (store_immediate_info * const *) y;
1788 
1789   if ((*tmp)->order < (*tmp2)->order)
1790     return -1;
1791   else if ((*tmp)->order > (*tmp2)->order)
1792     return 1;
1793 
1794   gcc_unreachable ();
1795 }
1796 
1797 /* Initialize a merged_store_group object from a store_immediate_info
1798    object.  */
1799 
1800 merged_store_group::merged_store_group (store_immediate_info *info)
1801 {
1802   start = info->bitpos;
1803   width = info->bitsize;
1804   bitregion_start = info->bitregion_start;
1805   bitregion_end = info->bitregion_end;
1806   /* VAL has memory allocated for it in apply_stores once the group
1807      width has been finalized.  */
1808   val = NULL;
1809   mask = NULL;
1810   first_nonmergeable_order = ~0U;
1811   unsigned HOST_WIDE_INT align_bitpos = 0;
1812   get_object_alignment_1 (gimple_assign_lhs (info->stmt),
1813 			  &align, &align_bitpos);
1814   align_base = start - align_bitpos;
1815   for (int i = 0; i < 2; ++i)
1816     {
1817       store_operand_info &op = info->ops[i];
1818       if (op.base_addr == NULL_TREE)
1819 	{
1820 	  load_align[i] = 0;
1821 	  load_align_base[i] = 0;
1822 	}
1823       else
1824 	{
1825 	  get_object_alignment_1 (op.val, &load_align[i], &align_bitpos);
1826 	  load_align_base[i] = op.bitpos - align_bitpos;
1827 	}
1828     }
1829   stores.create (1);
1830   stores.safe_push (info);
1831   last_stmt = info->stmt;
1832   last_order = info->order;
1833   first_stmt = last_stmt;
1834   first_order = last_order;
1835   buf_size = 0;
1836 }
1837 
1838 merged_store_group::~merged_store_group ()
1839 {
1840   if (val)
1841     XDELETEVEC (val);
1842 }
1843 
1844 /* Helper method for merge_into and merge_overlapping to do
1845    the common part.  */
1846 void
1847 merged_store_group::do_merge (store_immediate_info *info)
1848 {
1849   bitregion_start = MIN (bitregion_start, info->bitregion_start);
1850   bitregion_end = MAX (bitregion_end, info->bitregion_end);
1851 
1852   unsigned int this_align;
1853   unsigned HOST_WIDE_INT align_bitpos = 0;
1854   get_object_alignment_1 (gimple_assign_lhs (info->stmt),
1855 			  &this_align, &align_bitpos);
1856   if (this_align > align)
1857     {
1858       align = this_align;
1859       align_base = info->bitpos - align_bitpos;
1860     }
1861   for (int i = 0; i < 2; ++i)
1862     {
1863       store_operand_info &op = info->ops[i];
1864       if (!op.base_addr)
1865 	continue;
1866 
1867       get_object_alignment_1 (op.val, &this_align, &align_bitpos);
1868       if (this_align > load_align[i])
1869 	{
1870 	  load_align[i] = this_align;
1871 	  load_align_base[i] = op.bitpos - align_bitpos;
1872 	}
1873     }
1874 
1875   gimple *stmt = info->stmt;
1876   stores.safe_push (info);
1877   if (info->order > last_order)
1878     {
1879       last_order = info->order;
1880       last_stmt = stmt;
1881     }
1882   else if (info->order < first_order)
1883     {
1884       first_order = info->order;
1885       first_stmt = stmt;
1886     }
1887 }
1888 
1889 /* Merge a store recorded by INFO into this merged store.
   The store does not overlap with the existing recorded
   stores.  */
1892 
1893 void
1894 merged_store_group::merge_into (store_immediate_info *info)
1895 {
1896   /* Make sure we're inserting in the position we think we're inserting.  */
1897   gcc_assert (info->bitpos >= start + width
1898 	      && info->bitregion_start <= bitregion_end);
1899 
1900   width = info->bitpos + info->bitsize - start;
1901   do_merge (info);
1902 }
1903 
1904 /* Merge a store described by INFO into this merged store.
   INFO overlaps in some way with the current store (i.e. it is not the
   contiguous case handled by merged_store_group::merge_into).  */
1907 
1908 void
1909 merged_store_group::merge_overlapping (store_immediate_info *info)
1910 {
1911   /* If the store extends the size of the group, extend the width.  */
1912   if (info->bitpos + info->bitsize > start + width)
1913     width = info->bitpos + info->bitsize - start;
1914 
1915   do_merge (info);
1916 }
1917 
1918 /* Go through all the recorded stores in this group in program order and
1919    apply their values to the VAL byte array to create the final merged
1920    value.  Return true if the operation succeeded.  */
1921 
1922 bool
1923 merged_store_group::apply_stores ()
1924 {
1925   /* Make sure we have more than one store in the group, otherwise we cannot
1926      merge anything.  */
1927   if (bitregion_start % BITS_PER_UNIT != 0
1928       || bitregion_end % BITS_PER_UNIT != 0
1929       || stores.length () == 1)
1930     return false;
1931 
1932   stores.qsort (sort_by_order);
1933   store_immediate_info *info;
1934   unsigned int i;
1935   /* Create a buffer of a size that is 2 times the number of bytes we're
1936      storing.  That way native_encode_expr can write power-of-2-sized
1937      chunks without overrunning.  */
1938   buf_size = 2 * ((bitregion_end - bitregion_start) / BITS_PER_UNIT);
1939   val = XNEWVEC (unsigned char, 2 * buf_size);
1940   mask = val + buf_size;
1941   memset (val, 0, buf_size);
1942   memset (mask, ~0U, buf_size);
1943 
1944   FOR_EACH_VEC_ELT (stores, i, info)
1945     {
1946       unsigned int pos_in_buffer = info->bitpos - bitregion_start;
1947       tree cst = NULL_TREE;
1948       if (info->ops[0].val && info->ops[0].base_addr == NULL_TREE)
1949 	cst = info->ops[0].val;
1950       else if (info->ops[1].val && info->ops[1].base_addr == NULL_TREE)
1951 	cst = info->ops[1].val;
1952       bool ret = true;
1953       if (cst)
1954 	ret = encode_tree_to_bitpos (cst, val, info->bitsize,
1955 				     pos_in_buffer, buf_size);
1956       if (cst && dump_file && (dump_flags & TDF_DETAILS))
1957 	{
1958 	  if (ret)
1959 	    {
1960 	      fprintf (dump_file, "After writing ");
1961 	      print_generic_expr (dump_file, cst, 0);
1962 	      fprintf (dump_file, " of size " HOST_WIDE_INT_PRINT_DEC
1963 		       " at position %d the merged region contains:\n",
1964 		       info->bitsize, pos_in_buffer);
1965 	      dump_char_array (dump_file, val, buf_size);
1966 	    }
1967 	  else
1968 	    fprintf (dump_file, "Failed to merge stores\n");
1969 	}
1970       if (!ret)
1971 	return false;
1972       unsigned char *m = mask + (pos_in_buffer / BITS_PER_UNIT);
1973       if (BYTES_BIG_ENDIAN)
1974 	clear_bit_region_be (m, (BITS_PER_UNIT - 1
1975 				 - (pos_in_buffer % BITS_PER_UNIT)),
1976 			     info->bitsize);
1977       else
1978 	clear_bit_region (m, pos_in_buffer % BITS_PER_UNIT, info->bitsize);
1979     }
1980   stores.qsort (sort_by_bitpos);
1981   return true;
1982 }
1983 
1984 /* Structure describing the store chain.  */
1985 
1986 struct imm_store_chain_info
1987 {
1988   /* Doubly-linked list that imposes an order on chain processing.
1989      PNXP (prev's next pointer) points to the head of a list, or to
1990      the next field in the previous chain in the list.
1991      See pass_store_merging::m_stores_head for more rationale.  */
1992   imm_store_chain_info *next, **pnxp;
1993   tree base_addr;
1994   auto_vec<store_immediate_info *> m_store_info;
1995   auto_vec<merged_store_group *> m_merged_store_groups;
1996 
1997   imm_store_chain_info (imm_store_chain_info *&inspt, tree b_a)
1998   : next (inspt), pnxp (&inspt), base_addr (b_a)
1999   {
2000     inspt = this;
2001     if (next)
2002       {
2003 	gcc_checking_assert (pnxp == next->pnxp);
2004 	next->pnxp = &next;
2005       }
2006   }
2007   ~imm_store_chain_info ()
2008   {
2009     *pnxp = next;
2010     if (next)
2011       {
2012 	gcc_checking_assert (&next == next->pnxp);
2013 	next->pnxp = pnxp;
2014       }
2015   }
2016   bool terminate_and_process_chain ();
2017   bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int);
2018   bool coalesce_immediate_stores ();
2019   bool output_merged_store (merged_store_group *);
2020   bool output_merged_stores ();
2021 };
2022 
2023 const pass_data pass_data_tree_store_merging = {
2024   GIMPLE_PASS,     /* type */
2025   "store-merging", /* name */
2026   OPTGROUP_NONE,   /* optinfo_flags */
2027   TV_GIMPLE_STORE_MERGING,	 /* tv_id */
2028   PROP_ssa,	/* properties_required */
2029   0,		   /* properties_provided */
2030   0,		   /* properties_destroyed */
2031   0,		   /* todo_flags_start */
2032   TODO_update_ssa, /* todo_flags_finish */
2033 };
2034 
2035 class pass_store_merging : public gimple_opt_pass
2036 {
2037 public:
2038   pass_store_merging (gcc::context *ctxt)
2039     : gimple_opt_pass (pass_data_tree_store_merging, ctxt), m_stores_head ()
2040   {
2041   }
2042 
2043   /* Pass not supported for PDP-endianness, nor for insane hosts
2044      or target character sizes where native_{encode,interpret}_expr
2045      doesn't work properly.  */
2046   virtual bool
2047   gate (function *)
2048   {
2049     return flag_store_merging
2050 	   && WORDS_BIG_ENDIAN == BYTES_BIG_ENDIAN
2051 	   && CHAR_BIT == 8
2052 	   && BITS_PER_UNIT == 8;
2053   }
2054 
2055   virtual unsigned int execute (function *);
2056 
2057 private:
2058   hash_map<tree_operand_hash, struct imm_store_chain_info *> m_stores;
2059 
2060   /* Form a doubly-linked stack of the elements of m_stores, so that
2061      we can iterate over them in a predictable way.  Using this order
2062      avoids extraneous differences in the compiler output just because
2063      of tree pointer variations (e.g. different chains end up in
2064      different positions of m_stores, so they are handled in different
2065      orders, so they allocate or release SSA names in different
2066      orders, and when they get reused, subsequent passes end up
2067      getting different SSA names, which may ultimately change
2068      decisions when going out of SSA).  */
2069   imm_store_chain_info *m_stores_head;
2070 
2071   void process_store (gimple *);
2072   bool terminate_and_process_all_chains ();
2073   bool terminate_all_aliasing_chains (imm_store_chain_info **, gimple *);
2074   bool terminate_and_release_chain (imm_store_chain_info *);
2075 }; // class pass_store_merging
2076 
2077 /* Terminate and process all recorded chains.  Return true if any changes
2078    were made.  */
2079 
2080 bool
2081 pass_store_merging::terminate_and_process_all_chains ()
2082 {
2083   bool ret = false;
2084   while (m_stores_head)
2085     ret |= terminate_and_release_chain (m_stores_head);
2086   gcc_assert (m_stores.elements () == 0);
2087   gcc_assert (m_stores_head == NULL);
2088 
2089   return ret;
2090 }
2091 
2092 /* Terminate all chains that are affected by the statement STMT.
   If CHAIN_INFO is non-NULL, it is the chain that should be skipped
   during the checks.  */
2095 
2096 bool
2097 pass_store_merging::terminate_all_aliasing_chains (imm_store_chain_info
2098 						     **chain_info,
2099 						   gimple *stmt)
2100 {
2101   bool ret = false;
2102 
2103   /* If the statement doesn't touch memory it can't alias.  */
2104   if (!gimple_vuse (stmt))
2105     return false;
2106 
2107   tree store_lhs = gimple_store_p (stmt) ? gimple_get_lhs (stmt) : NULL_TREE;
2108   for (imm_store_chain_info *next = m_stores_head, *cur = next; cur; cur = next)
2109     {
2110       next = cur->next;
2111 
2112       /* We already checked all the stores in chain_info and terminated the
2113 	 chain if necessary.  Skip it here.  */
2114       if (chain_info && *chain_info == cur)
2115 	continue;
2116 
2117       store_immediate_info *info;
2118       unsigned int i;
2119       FOR_EACH_VEC_ELT (cur->m_store_info, i, info)
2120 	{
2121 	  tree lhs = gimple_assign_lhs (info->stmt);
2122 	  if (ref_maybe_used_by_stmt_p (stmt, lhs)
2123 	      || stmt_may_clobber_ref_p (stmt, lhs)
2124 	      || (store_lhs && refs_output_dependent_p (store_lhs, lhs)))
2125 	    {
2126 	      if (dump_file && (dump_flags & TDF_DETAILS))
2127 		{
2128 		  fprintf (dump_file, "stmt causes chain termination:\n");
2129 		  print_gimple_stmt (dump_file, stmt, 0);
2130 		}
2131 	      terminate_and_release_chain (cur);
2132 	      ret = true;
2133 	      break;
2134 	    }
2135 	}
2136     }
2137 
2138   return ret;
2139 }
2140 
2141 /* Helper function.  Terminate the recorded chain storing to base object
2142    BASE.  Return true if the merging and output was successful.  The m_stores
2143    entry is removed after the processing in any case.  */
2144 
2145 bool
2146 pass_store_merging::terminate_and_release_chain (imm_store_chain_info *chain_info)
2147 {
2148   bool ret = chain_info->terminate_and_process_chain ();
2149   m_stores.remove (chain_info->base_addr);
2150   delete chain_info;
2151   return ret;
2152 }
2153 
2154 /* Return true if stmts in between FIRST (inclusive) and LAST (exclusive)
2155    may clobber REF.  FIRST and LAST must be in the same basic block and
2156    have non-NULL vdef.  We want to be able to sink load of REF across
2157    stores between FIRST and LAST, up to right before LAST.  */
2158 
2159 bool
2160 stmts_may_clobber_ref_p (gimple *first, gimple *last, tree ref)
2161 {
2162   ao_ref r;
2163   ao_ref_init (&r, ref);
2164   unsigned int count = 0;
2165   tree vop = gimple_vdef (last);
2166   gimple *stmt;
2167 
2168   gcc_checking_assert (gimple_bb (first) == gimple_bb (last));
2169   do
2170     {
2171       stmt = SSA_NAME_DEF_STMT (vop);
2172       if (stmt_may_clobber_ref_p_1 (stmt, &r))
2173 	return true;
2174       if (gimple_store_p (stmt)
2175 	  && refs_anti_dependent_p (ref, gimple_get_lhs (stmt)))
2176 	return true;
2177       /* Avoid quadratic compile time by bounding the number of checks
2178 	 we perform.  */
2179       if (++count > MAX_STORE_ALIAS_CHECKS)
2180 	return true;
2181       vop = gimple_vuse (stmt);
2182     }
2183   while (stmt != first);
2184   return false;
2185 }
2186 
2187 /* Return true if INFO->ops[IDX] is mergeable with the
2188    corresponding loads already in MERGED_STORE group.
2189    BASE_ADDR is the base address of the whole store group.  */
2190 
2191 bool
2192 compatible_load_p (merged_store_group *merged_store,
2193 		   store_immediate_info *info,
2194 		   tree base_addr, int idx)
2195 {
2196   store_immediate_info *infof = merged_store->stores[0];
2197   if (!info->ops[idx].base_addr
2198       || maybe_ne (info->ops[idx].bitpos - infof->ops[idx].bitpos,
2199 		   info->bitpos - infof->bitpos)
2200       || !operand_equal_p (info->ops[idx].base_addr,
2201 			   infof->ops[idx].base_addr, 0))
2202     return false;
2203 
2204   store_immediate_info *infol = merged_store->stores.last ();
2205   tree load_vuse = gimple_vuse (info->ops[idx].stmt);
2206   /* In this case all vuses should be the same, e.g.
2207      _1 = s.a; _2 = s.b; _3 = _1 | 1; t.a = _3; _4 = _2 | 2; t.b = _4;
2208      or
2209      _1 = s.a; _2 = s.b; t.a = _1; t.b = _2;
2210      and we can emit the coalesced load next to any of those loads.  */
2211   if (gimple_vuse (infof->ops[idx].stmt) == load_vuse
2212       && gimple_vuse (infol->ops[idx].stmt) == load_vuse)
2213     return true;
2214 
2215   /* Otherwise, at least for now require that the load has the same
2216      vuse as the store.  See following examples.  */
2217   if (gimple_vuse (info->stmt) != load_vuse)
2218     return false;
2219 
2220   if (gimple_vuse (infof->stmt) != gimple_vuse (infof->ops[idx].stmt)
2221       || (infof != infol
2222 	  && gimple_vuse (infol->stmt) != gimple_vuse (infol->ops[idx].stmt)))
2223     return false;
2224 
2225   /* If the load is from the same location as the store, already
2226      the construction of the immediate chain info guarantees no intervening
2227      stores, so no further checks are needed.  Example:
2228      _1 = s.a; _2 = _1 & -7; s.a = _2; _3 = s.b; _4 = _3 & -7; s.b = _4;  */
2229   if (known_eq (info->ops[idx].bitpos, info->bitpos)
2230       && operand_equal_p (info->ops[idx].base_addr, base_addr, 0))
2231     return true;
2232 
2233   /* Otherwise, we need to punt if any of the loads can be clobbered by any
2234      of the stores in the group, or any other stores in between those.
2235      Previous calls to compatible_load_p ensured that for all the
2236      merged_store->stores IDX loads, no stmts starting with
2237      merged_store->first_stmt and ending right before merged_store->last_stmt
2238      clobbers those loads.  */
2239   gimple *first = merged_store->first_stmt;
2240   gimple *last = merged_store->last_stmt;
2241   unsigned int i;
2242   store_immediate_info *infoc;
  /* The stores are sorted by increasing store bitpos, so if the store in
     info->stmt comes before the so far first store stmt, we'll be changing
     merged_store->first_stmt.  In that case we need to give up if
     any of the earlier processed loads are clobbered by the stmts in the
     new range.  */
2248   if (info->order < merged_store->first_order)
2249     {
2250       FOR_EACH_VEC_ELT (merged_store->stores, i, infoc)
2251 	if (stmts_may_clobber_ref_p (info->stmt, first, infoc->ops[idx].val))
2252 	  return false;
2253       first = info->stmt;
2254     }
2255   /* Similarly, we could change merged_store->last_stmt, so ensure
2256      in that case no stmts in the new range clobber any of the earlier
2257      processed loads.  */
2258   else if (info->order > merged_store->last_order)
2259     {
2260       FOR_EACH_VEC_ELT (merged_store->stores, i, infoc)
2261 	if (stmts_may_clobber_ref_p (last, info->stmt, infoc->ops[idx].val))
2262 	  return false;
2263       last = info->stmt;
2264     }
  /* And finally, since we'd be adding a new load to the set, ensure it
     isn't clobbered in the new range.  */
2267   if (stmts_may_clobber_ref_p (first, last, info->ops[idx].val))
2268     return false;
2269 
2270   /* Otherwise, we are looking for:
2271      _1 = s.a; _2 = _1 ^ 15; t.a = _2; _3 = s.b; _4 = _3 ^ 15; t.b = _4;
2272      or
2273      _1 = s.a; t.a = _1; _2 = s.b; t.b = _2;  */
2274   return true;
2275 }
2276 
2277 /* Add all refs loaded to compute VAL to REFS vector.  */
2278 
2279 void
2280 gather_bswap_load_refs (vec<tree> *refs, tree val)
2281 {
2282   if (TREE_CODE (val) != SSA_NAME)
2283     return;
2284 
2285   gimple *stmt = SSA_NAME_DEF_STMT (val);
2286   if (!is_gimple_assign (stmt))
2287     return;
2288 
2289   if (gimple_assign_load_p (stmt))
2290     {
2291       refs->safe_push (gimple_assign_rhs1 (stmt));
2292       return;
2293     }
2294 
2295   switch (gimple_assign_rhs_class (stmt))
2296     {
2297     case GIMPLE_BINARY_RHS:
2298       gather_bswap_load_refs (refs, gimple_assign_rhs2 (stmt));
2299       /* FALLTHRU */
2300     case GIMPLE_UNARY_RHS:
2301       gather_bswap_load_refs (refs, gimple_assign_rhs1 (stmt));
2302       break;
2303     default:
2304       gcc_unreachable ();
2305     }
2306 }
2307 
2308 /* Check if there are any stores in M_STORE_INFO after index I
2309    (where M_STORE_INFO must be sorted by sort_by_bitpos) that overlap
   a potential group ending at END and that have their order
   smaller than LAST_ORDER.  RHS_CODE is the kind of store in the
2312    group.  Return true if there are no such stores.
2313    Consider:
2314      MEM[(long long int *)p_28] = 0;
2315      MEM[(long long int *)p_28 + 8B] = 0;
2316      MEM[(long long int *)p_28 + 16B] = 0;
2317      MEM[(long long int *)p_28 + 24B] = 0;
2318      _129 = (int) _130;
2319      MEM[(int *)p_28 + 8B] = _129;
2320      MEM[(int *)p_28].a = -1;
2321    We already have
2322      MEM[(long long int *)p_28] = 0;
2323      MEM[(int *)p_28].a = -1;
2324    stmts in the current group and need to consider if it is safe to
2325    add MEM[(long long int *)p_28 + 8B] = 0; store into the same group.
2326    There is an overlap between that store and the MEM[(int *)p_28 + 8B] = _129;
2327    store though, so if we add the MEM[(long long int *)p_28 + 8B] = 0;
2328    into the group and merging of those 3 stores is successful, merged
2329    stmts will be emitted at the latest store from that group, i.e.
2330    LAST_ORDER, which is the MEM[(int *)p_28].a = -1; store.
2331    The MEM[(int *)p_28 + 8B] = _129; store that originally follows
2332    the MEM[(long long int *)p_28 + 8B] = 0; would now be before it,
2333    so we need to refuse merging MEM[(long long int *)p_28 + 8B] = 0;
2334    into the group.  That way it will be its own store group and will
2335    not be touched.  If RHS_CODE is INTEGER_CST and there are overlapping
2336    INTEGER_CST stores, those are mergeable using merge_overlapping,
2337    so don't return false for those.  */
2338 
2339 static bool
2340 check_no_overlap (vec<store_immediate_info *> m_store_info, unsigned int i,
2341 		  enum tree_code rhs_code, unsigned int last_order,
2342 		  unsigned HOST_WIDE_INT end)
2343 {
2344   unsigned int len = m_store_info.length ();
2345   for (++i; i < len; ++i)
2346     {
2347       store_immediate_info *info = m_store_info[i];
2348       if (info->bitpos >= end)
2349 	break;
2350       if (info->order < last_order
2351 	  && (rhs_code != INTEGER_CST || info->rhs_code != INTEGER_CST))
2352 	return false;
2353     }
2354   return true;
2355 }
2356 
/* Return true if m_store_info[first] and at least one following store
   form a group which stores a try_size-bit value that is byte swapped
   from a memory load or from some other value, or is the identity of
   some value.  This uses the bswap pass APIs.  */
2361 
2362 bool
2363 imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store,
2364 					  unsigned int first,
2365 					  unsigned int try_size)
2366 {
2367   unsigned int len = m_store_info.length (), last = first;
2368   unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize;
2369   if (width >= try_size)
2370     return false;
2371   for (unsigned int i = first + 1; i < len; ++i)
2372     {
2373       if (m_store_info[i]->bitpos != m_store_info[first]->bitpos + width
2374 	  || m_store_info[i]->ins_stmt == NULL)
2375 	return false;
2376       width += m_store_info[i]->bitsize;
2377       if (width >= try_size)
2378 	{
2379 	  last = i;
2380 	  break;
2381 	}
2382     }
2383   if (width != try_size)
2384     return false;
2385 
2386   bool allow_unaligned
2387     = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED);
2388   /* Punt if the combined store would not be aligned and we need alignment.  */
2389   if (!allow_unaligned)
2390     {
2391       unsigned int align = merged_store->align;
2392       unsigned HOST_WIDE_INT align_base = merged_store->align_base;
2393       for (unsigned int i = first + 1; i <= last; ++i)
2394 	{
2395 	  unsigned int this_align;
2396 	  unsigned HOST_WIDE_INT align_bitpos = 0;
2397 	  get_object_alignment_1 (gimple_assign_lhs (m_store_info[i]->stmt),
2398 				  &this_align, &align_bitpos);
2399 	  if (this_align > align)
2400 	    {
2401 	      align = this_align;
2402 	      align_base = m_store_info[i]->bitpos - align_bitpos;
2403 	    }
2404 	}
2405       unsigned HOST_WIDE_INT align_bitpos
2406 	= (m_store_info[first]->bitpos - align_base) & (align - 1);
2407       if (align_bitpos)
2408 	align = least_bit_hwi (align_bitpos);
2409       if (align < try_size)
2410 	return false;
2411     }
2412 
2413   tree type;
2414   switch (try_size)
2415     {
2416     case 16: type = uint16_type_node; break;
2417     case 32: type = uint32_type_node; break;
2418     case 64: type = uint64_type_node; break;
2419     default: gcc_unreachable ();
2420     }
2421   struct symbolic_number n;
2422   gimple *ins_stmt = NULL;
2423   int vuse_store = -1;
2424   unsigned int first_order = merged_store->first_order;
2425   unsigned int last_order = merged_store->last_order;
2426   gimple *first_stmt = merged_store->first_stmt;
2427   gimple *last_stmt = merged_store->last_stmt;
2428   unsigned HOST_WIDE_INT end = merged_store->start + merged_store->width;
2429   store_immediate_info *infof = m_store_info[first];
2430 
2431   for (unsigned int i = first; i <= last; ++i)
2432     {
2433       store_immediate_info *info = m_store_info[i];
2434       struct symbolic_number this_n = info->n;
2435       this_n.type = type;
2436       if (!this_n.base_addr)
2437 	this_n.range = try_size / BITS_PER_UNIT;
2438       else
	/* Update vuse in case it has been changed by output_merged_stores.  */
2440 	this_n.vuse = gimple_vuse (info->ins_stmt);
2441       unsigned int bitpos = info->bitpos - infof->bitpos;
2442       if (!do_shift_rotate (LSHIFT_EXPR, &this_n,
2443 			    BYTES_BIG_ENDIAN
2444 			    ? try_size - info->bitsize - bitpos
2445 			    : bitpos))
2446 	return false;
2447       if (this_n.base_addr && vuse_store)
2448 	{
2449 	  unsigned int j;
2450 	  for (j = first; j <= last; ++j)
2451 	    if (this_n.vuse == gimple_vuse (m_store_info[j]->stmt))
2452 	      break;
2453 	  if (j > last)
2454 	    {
2455 	      if (vuse_store == 1)
2456 		return false;
2457 	      vuse_store = 0;
2458 	    }
2459 	}
2460       if (i == first)
2461 	{
2462 	  n = this_n;
2463 	  ins_stmt = info->ins_stmt;
2464 	}
2465       else
2466 	{
2467 	  if (n.base_addr && n.vuse != this_n.vuse)
2468 	    {
2469 	      if (vuse_store == 0)
2470 		return false;
2471 	      vuse_store = 1;
2472 	    }
2473 	  if (info->order > last_order)
2474 	    {
2475 	      last_order = info->order;
2476 	      last_stmt = info->stmt;
2477 	    }
2478 	  else if (info->order < first_order)
2479 	    {
2480 	      first_order = info->order;
2481 	      first_stmt = info->stmt;
2482 	    }
2483 	  end = MAX (end, info->bitpos + info->bitsize);
2484 
2485 	  ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
2486 					     &this_n, &n);
2487 	  if (ins_stmt == NULL)
2488 	    return false;
2489 	}
2490     }
2491 
2492   uint64_t cmpxchg, cmpnop;
2493   find_bswap_or_nop_finalize (&n, &cmpxchg, &cmpnop);
2494 
  /* A complete byte swap should make the symbolic number start with
     the largest digit in the highest order byte.  An unchanged symbolic
     number indicates a read with the same endianness as the target
     architecture.  */
2498   if (n.n != cmpnop && n.n != cmpxchg)
2499     return false;
2500 
2501   if (n.base_addr == NULL_TREE && !is_gimple_val (n.src))
2502     return false;
2503 
2504   if (!check_no_overlap (m_store_info, last, LROTATE_EXPR, last_order, end))
2505     return false;
2506 
2507   /* Don't handle memory copy this way if normal non-bswap processing
2508      would handle it too.  */
2509   if (n.n == cmpnop && (unsigned) n.n_ops == last - first + 1)
2510     {
2511       unsigned int i;
2512       for (i = first; i <= last; ++i)
2513 	if (m_store_info[i]->rhs_code != MEM_REF)
2514 	  break;
2515       if (i == last + 1)
2516 	return false;
2517     }
2518 
2519   if (n.n == cmpxchg)
2520     switch (try_size)
2521       {
2522       case 16:
2523 	/* Will emit LROTATE_EXPR.  */
2524 	break;
2525       case 32:
2526 	if (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2527 	    && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing)
2528 	  break;
2529 	return false;
2530       case 64:
2531 	if (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2532 	    && optab_handler (bswap_optab, DImode) != CODE_FOR_nothing)
2533 	  break;
2534 	return false;
2535       default:
2536 	gcc_unreachable ();
2537       }
2538 
2539   if (!allow_unaligned && n.base_addr)
2540     {
2541       unsigned int align = get_object_alignment (n.src);
2542       if (align < try_size)
2543 	return false;
2544     }
2545 
  /* If each load has the vuse of the corresponding store, we need to verify
     that the loads can be sunk right before the last store.  */
2548   if (vuse_store == 1)
2549     {
2550       auto_vec<tree, 64> refs;
2551       for (unsigned int i = first; i <= last; ++i)
2552 	gather_bswap_load_refs (&refs,
2553 				gimple_assign_rhs1 (m_store_info[i]->stmt));
2554 
2555       unsigned int i;
2556       tree ref;
2557       FOR_EACH_VEC_ELT (refs, i, ref)
2558 	if (stmts_may_clobber_ref_p (first_stmt, last_stmt, ref))
2559 	  return false;
2560       n.vuse = NULL_TREE;
2561     }
2562 
2563   infof->n = n;
2564   infof->ins_stmt = ins_stmt;
2565   for (unsigned int i = first; i <= last; ++i)
2566     {
2567       m_store_info[i]->rhs_code = n.n == cmpxchg ? LROTATE_EXPR : NOP_EXPR;
2568       m_store_info[i]->ops[0].base_addr = NULL_TREE;
2569       m_store_info[i]->ops[1].base_addr = NULL_TREE;
2570       if (i != first)
2571 	merged_store->merge_into (m_store_info[i]);
2572     }
2573 
2574   return true;
2575 }
2576 
2577 /* Go through the candidate stores recorded in m_store_info and merge them
2578    into merged_store_group objects recorded into m_merged_store_groups
2579    representing the widened stores.  Return true if coalescing was successful
2580    and the number of widened stores is fewer than the original number
2581    of stores.  */
2582 
2583 bool
2584 imm_store_chain_info::coalesce_immediate_stores ()
2585 {
2586   /* Anything less can't be processed.  */
2587   if (m_store_info.length () < 2)
2588     return false;
2589 
2590   if (dump_file && (dump_flags & TDF_DETAILS))
2591     fprintf (dump_file, "Attempting to coalesce %u stores in chain.\n",
2592 	     m_store_info.length ());
2593 
2594   store_immediate_info *info;
2595   unsigned int i, ignore = 0;
2596 
2597   /* Order the stores by the bitposition they write to.  */
2598   m_store_info.qsort (sort_by_bitpos);
2599 
2600   info = m_store_info[0];
2601   merged_store_group *merged_store = new merged_store_group (info);
2602 
2603   FOR_EACH_VEC_ELT (m_store_info, i, info)
2604     {
2605       if (dump_file && (dump_flags & TDF_DETAILS))
2606 	{
2607 	  fprintf (dump_file, "Store %u:\nbitsize:" HOST_WIDE_INT_PRINT_DEC
2608 			      " bitpos:" HOST_WIDE_INT_PRINT_DEC " val:\n",
2609 		   i, info->bitsize, info->bitpos);
2610 	  print_generic_expr (dump_file, gimple_assign_rhs1 (info->stmt));
2611 	  fprintf (dump_file, "\n------------\n");
2612 	}
2613 
2614       if (i <= ignore)
2615 	continue;
2616 
2617       /* First try to handle group of stores like:
2618 	 p[0] = data >> 24;
2619 	 p[1] = data >> 16;
2620 	 p[2] = data >> 8;
2621 	 p[3] = data;
2622 	 using the bswap framework.  */
2623       if (info->bitpos == merged_store->start + merged_store->width
2624 	  && merged_store->stores.length () == 1
2625 	  && merged_store->stores[0]->ins_stmt != NULL
2626 	  && info->ins_stmt != NULL)
2627 	{
2628 	  unsigned int try_size;
2629 	  for (try_size = 64; try_size >= 16; try_size >>= 1)
2630 	    if (try_coalesce_bswap (merged_store, i - 1, try_size))
2631 	      break;
2632 
2633 	  if (try_size >= 16)
2634 	    {
2635 	      ignore = i + merged_store->stores.length () - 1;
2636 	      m_merged_store_groups.safe_push (merged_store);
2637 	      if (ignore < m_store_info.length ())
2638 		merged_store = new merged_store_group (m_store_info[ignore]);
2639 	      else
2640 		merged_store = NULL;
2641 	      continue;
2642 	    }
2643 	}
2644 
2645       if (info->order >= merged_store->first_nonmergeable_order)
2646 	;
2647 
2648       /* |---store 1---|
2649 	       |---store 2---|
2650 	 Overlapping stores.  */
2651       else if (IN_RANGE (info->bitpos, merged_store->start,
2652 			 merged_store->start + merged_store->width - 1))
2653 	{
2654 	  /* Only allow overlapping stores of constants.  */
2655 	  if (info->rhs_code == INTEGER_CST
2656 	      && merged_store->stores[0]->rhs_code == INTEGER_CST)
2657 	    {
2658 	      unsigned int last_order
2659 		= MAX (merged_store->last_order, info->order);
2660 	      unsigned HOST_WIDE_INT end
2661 		= MAX (merged_store->start + merged_store->width,
2662 		       info->bitpos + info->bitsize);
2663 	      if (check_no_overlap (m_store_info, i, INTEGER_CST,
2664 				    last_order, end))
2665 		{
2666 		  /* check_no_overlap call above made sure there are no
2667 		     overlapping stores with non-INTEGER_CST rhs_code
2668 		     in between the first and last of the stores we've
2669 		     just merged.  If there are any INTEGER_CST rhs_code
2670 		     stores in between, we need to merge_overlapping them
2671 		     even if in the sort_by_bitpos order there are other
2672 		     overlapping stores in between.  Keep those stores as is.
2673 		     Example:
2674 			MEM[(int *)p_28] = 0;
2675 			MEM[(char *)p_28 + 3B] = 1;
2676 			MEM[(char *)p_28 + 1B] = 2;
2677 			MEM[(char *)p_28 + 2B] = MEM[(char *)p_28 + 6B];
2678 		     We can't merge the zero store with the store of two and
2679 		     not merge anything else, because the store of one is
2680 		     in the original order in between those two, but in
		     sort_by_bitpos order it comes after the last store that
2682 		     we can't merge with them.  We can merge the first 3 stores
2683 		     and keep the last store as is though.  */
2684 		  unsigned int len = m_store_info.length ();
2685 		  unsigned int try_order = last_order;
2686 		  unsigned int first_nonmergeable_order;
2687 		  unsigned int k;
2688 		  bool last_iter = false;
2689 		  int attempts = 0;
2690 		  do
2691 		    {
2692 		      unsigned int max_order = 0;
2693 		      unsigned first_nonmergeable_int_order = ~0U;
2694 		      unsigned HOST_WIDE_INT this_end = end;
2695 		      k = i;
2696 		      first_nonmergeable_order = ~0U;
2697 		      for (unsigned int j = i + 1; j < len; ++j)
2698 			{
2699 			  store_immediate_info *info2 = m_store_info[j];
2700 			  if (info2->bitpos >= this_end)
2701 			    break;
2702 			  if (info2->order < try_order)
2703 			    {
2704 			      if (info2->rhs_code != INTEGER_CST)
2705 				{
2706 				  /* Normally check_no_overlap makes sure this
2707 				     doesn't happen, but if end grows below,
2708 				     then we need to process more stores than
2709 				     check_no_overlap verified.  Example:
2710 				      MEM[(int *)p_5] = 0;
2711 				      MEM[(short *)p_5 + 3B] = 1;
2712 				      MEM[(char *)p_5 + 4B] = _9;
2713 				      MEM[(char *)p_5 + 2B] = 2;  */
2714 				  k = 0;
2715 				  break;
2716 				}
2717 			      k = j;
2718 			      this_end = MAX (this_end,
2719 					      info2->bitpos + info2->bitsize);
2720 			    }
2721 			  else if (info2->rhs_code == INTEGER_CST
2722 				   && !last_iter)
2723 			    {
2724 			      max_order = MAX (max_order, info2->order + 1);
2725 			      first_nonmergeable_int_order
2726 				= MIN (first_nonmergeable_int_order,
2727 				       info2->order);
2728 			    }
2729 			  else
2730 			    first_nonmergeable_order
2731 			      = MIN (first_nonmergeable_order, info2->order);
2732 			}
2733 		      if (k == 0)
2734 			{
2735 			  if (last_order == try_order)
2736 			    break;
2737 			  /* If this failed, but only because we grew
2738 			     try_order, retry with the last working one,
2739 			     so that we merge at least something.  */
2740 			  try_order = last_order;
2741 			  last_iter = true;
2742 			  continue;
2743 			}
2744 		      last_order = try_order;
2745 		      /* Retry with a larger try_order to see if we could
2746 			 merge some further INTEGER_CST stores.  */
2747 		      if (max_order
2748 			  && (first_nonmergeable_int_order
2749 			      < first_nonmergeable_order))
2750 			{
2751 			  try_order = MIN (max_order,
2752 					   first_nonmergeable_order);
2753 			  try_order
2754 			    = MIN (try_order,
2755 				   merged_store->first_nonmergeable_order);
2756 			  if (try_order > last_order && ++attempts < 16)
2757 			    continue;
2758 			}
2759 		      first_nonmergeable_order
2760 			= MIN (first_nonmergeable_order,
2761 			       first_nonmergeable_int_order);
2762 		      end = this_end;
2763 		      break;
2764 		    }
2765 		  while (1);
2766 
2767 		  if (k != 0)
2768 		    {
2769 		      merged_store->merge_overlapping (info);
2770 
2771 		      merged_store->first_nonmergeable_order
2772 			= MIN (merged_store->first_nonmergeable_order,
2773 			       first_nonmergeable_order);
2774 
2775 		      for (unsigned int j = i + 1; j <= k; j++)
2776 			{
2777 			  store_immediate_info *info2 = m_store_info[j];
2778 			  gcc_assert (info2->bitpos < end);
2779 			  if (info2->order < last_order)
2780 			    {
2781 			      gcc_assert (info2->rhs_code == INTEGER_CST);
2782 			      if (info != info2)
2783 				merged_store->merge_overlapping (info2);
2784 			    }
2785 			  /* Other stores are kept and not merged in any
2786 			     way.  */
2787 			}
2788 		      ignore = k;
2789 		      continue;
2790 		    }
2791 		}
2792 	    }
2793 	}
2794       /* |---store 1---||---store 2---|
2795 	 This store is consecutive to the previous one.
2796 	 Merge it into the current store group.  There can be gaps in between
2797 	 the stores, but there can't be gaps in between bitregions.  */
2798       else if (info->rhs_code != LROTATE_EXPR
2799 	       && info->bitregion_start <= merged_store->bitregion_end
2800 	       && info->rhs_code == merged_store->stores[0]->rhs_code)
2801 	{
2802 	  store_immediate_info *infof = merged_store->stores[0];
2803 
	  /* All the rhs_code ops that take 2 operands are commutative;
	     swap the operands if that could make them compatible.  */
2806 	  if (infof->ops[0].base_addr
2807 	      && infof->ops[1].base_addr
2808 	      && info->ops[0].base_addr
2809 	      && info->ops[1].base_addr
2810 	      && known_eq (info->ops[1].bitpos - infof->ops[0].bitpos,
2811 			   info->bitpos - infof->bitpos)
2812 	      && operand_equal_p (info->ops[1].base_addr,
2813 				  infof->ops[0].base_addr, 0))
2814 	    {
2815 	      std::swap (info->ops[0], info->ops[1]);
2816 	      info->ops_swapped_p = true;
2817 	    }
2818 	  if ((infof->ops[0].base_addr
2819 	       ? compatible_load_p (merged_store, info, base_addr, 0)
2820 	       : !info->ops[0].base_addr)
2821 	      && (infof->ops[1].base_addr
2822 		  ? compatible_load_p (merged_store, info, base_addr, 1)
2823 		  : !info->ops[1].base_addr)
2824 	      && check_no_overlap (m_store_info, i, info->rhs_code,
2825 				   MAX (merged_store->last_order,
2826 					info->order),
2827 				   MAX (merged_store->start
2828 					+ merged_store->width,
2829 					info->bitpos + info->bitsize)))
2830 	    {
2831 	      merged_store->merge_into (info);
2832 	      continue;
2833 	    }
2834 	}
2835 
2836       /* |---store 1---| <gap> |---store 2---|.
2837 	 Gap between stores or the rhs not compatible.  Start a new group.  */
2838 
2839       /* Try to apply all the stores recorded for the group to determine
2840 	 the bitpattern they write and discard it if that fails.
2841 	 This will also reject single-store groups.  */
2842       if (!merged_store->apply_stores ())
2843 	delete merged_store;
2844       else
2845 	m_merged_store_groups.safe_push (merged_store);
2846 
2847       merged_store = new merged_store_group (info);
2848     }
2849 
2850   /* Record or discard the last store group.  */
2851   if (merged_store)
2852     {
2853       if (!merged_store->apply_stores ())
2854 	delete merged_store;
2855       else
2856 	m_merged_store_groups.safe_push (merged_store);
2857     }
2858 
2859   gcc_assert (m_merged_store_groups.length () <= m_store_info.length ());
2860   bool success
2861     = !m_merged_store_groups.is_empty ()
2862       && m_merged_store_groups.length () < m_store_info.length ();
2863 
2864   if (success && dump_file)
2865     fprintf (dump_file, "Coalescing successful!\n"
2866 			"Merged into %u stores\n",
2867 	     m_merged_store_groups.length ());
2868 
2869   return success;
2870 }
2871 
2872 /* Return the type to use for the merged stores or loads described by STMTS.
2873    This is needed to get the alias sets right.  If IS_LOAD, look for rhs,
2874    otherwise lhs.  Additionally set *CLIQUEP and *BASEP to MR_DEPENDENCE_*
2875    of the MEM_REFs if any.  */
2876 
2877 static tree
2878 get_alias_type_for_stmts (vec<gimple *> &stmts, bool is_load,
2879 			  unsigned short *cliquep, unsigned short *basep)
2880 {
2881   gimple *stmt;
2882   unsigned int i;
2883   tree type = NULL_TREE;
2884   tree ret = NULL_TREE;
2885   *cliquep = 0;
2886   *basep = 0;
2887 
2888   FOR_EACH_VEC_ELT (stmts, i, stmt)
2889     {
2890       tree ref = is_load ? gimple_assign_rhs1 (stmt)
2891 			 : gimple_assign_lhs (stmt);
2892       tree type1 = reference_alias_ptr_type (ref);
2893       tree base = get_base_address (ref);
2894 
2895       if (i == 0)
2896 	{
2897 	  if (TREE_CODE (base) == MEM_REF)
2898 	    {
2899 	      *cliquep = MR_DEPENDENCE_CLIQUE (base);
2900 	      *basep = MR_DEPENDENCE_BASE (base);
2901 	    }
2902 	  ret = type = type1;
2903 	  continue;
2904 	}
2905       if (!alias_ptr_types_compatible_p (type, type1))
2906 	ret = ptr_type_node;
2907       if (TREE_CODE (base) != MEM_REF
2908 	  || *cliquep != MR_DEPENDENCE_CLIQUE (base)
2909 	  || *basep != MR_DEPENDENCE_BASE (base))
2910 	{
2911 	  *cliquep = 0;
2912 	  *basep = 0;
2913 	}
2914     }
2915   return ret;
2916 }
2917 
2918 /* Return the location_t information we can find among the statements
2919    in STMTS.  */
2920 
2921 static location_t
2922 get_location_for_stmts (vec<gimple *> &stmts)
2923 {
2924   gimple *stmt;
2925   unsigned int i;
2926 
2927   FOR_EACH_VEC_ELT (stmts, i, stmt)
2928     if (gimple_has_location (stmt))
2929       return gimple_location (stmt);
2930 
2931   return UNKNOWN_LOCATION;
2932 }
2933 
/* Used to describe a store resulting from splitting a wide store into
   smaller regularly-sized stores in split_group.  */
2936 
2937 struct split_store
2938 {
2939   unsigned HOST_WIDE_INT bytepos;
2940   unsigned HOST_WIDE_INT size;
2941   unsigned HOST_WIDE_INT align;
2942   auto_vec<store_immediate_info *> orig_stores;
2943   /* True if there is a single orig stmt covering the whole split store.  */
2944   bool orig;
2945   split_store (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT,
2946 	       unsigned HOST_WIDE_INT);
2947 };
2948 
2949 /* Simple constructor.  */
2950 
2951 split_store::split_store (unsigned HOST_WIDE_INT bp,
2952 			  unsigned HOST_WIDE_INT sz,
2953 			  unsigned HOST_WIDE_INT al)
2954 			  : bytepos (bp), size (sz), align (al), orig (false)
2955 {
2956   orig_stores.create (0);
2957 }
2958 
/* Record all stores in GROUP that write to the region starting at BITPOS
   and of size BITSIZE.  Record infos for such statements in STORES if
2961    non-NULL.  The stores in GROUP must be sorted by bitposition.  Return INFO
2962    if there is exactly one original store in the range.  */
2963 
2964 static store_immediate_info *
2965 find_constituent_stores (struct merged_store_group *group,
2966 			 vec<store_immediate_info *> *stores,
2967 			 unsigned int *first,
2968 			 unsigned HOST_WIDE_INT bitpos,
2969 			 unsigned HOST_WIDE_INT bitsize)
2970 {
2971   store_immediate_info *info, *ret = NULL;
2972   unsigned int i;
2973   bool second = false;
2974   bool update_first = true;
2975   unsigned HOST_WIDE_INT end = bitpos + bitsize;
2976   for (i = *first; group->stores.iterate (i, &info); ++i)
2977     {
2978       unsigned HOST_WIDE_INT stmt_start = info->bitpos;
2979       unsigned HOST_WIDE_INT stmt_end = stmt_start + info->bitsize;
2980       if (stmt_end <= bitpos)
2981 	{
	  /* BITPOS passed to this function never decreases within the
	     same split_group call, so optimize and don't scan info records
2984 	     which are known to end before or at BITPOS next time.
2985 	     Only do it if all stores before this one also pass this.  */
2986 	  if (update_first)
2987 	    *first = i + 1;
2988 	  continue;
2989 	}
2990       else
2991 	update_first = false;
2992 
2993       /* The stores in GROUP are ordered by bitposition so if we're past
2994 	 the region for this group return early.  */
2995       if (stmt_start >= end)
2996 	return ret;
2997 
2998       if (stores)
2999 	{
3000 	  stores->safe_push (info);
3001 	  if (ret)
3002 	    {
3003 	      ret = NULL;
3004 	      second = true;
3005 	    }
3006 	}
3007       else if (ret)
3008 	return NULL;
3009       if (!second)
3010 	ret = info;
3011     }
3012   return ret;
3013 }
3014 
/* Return how many SSA_NAMEs used to compute the value stored by the INFO
   store have multiple uses.  If any SSA_NAME has multiple uses, also
   count the statements needed to compute it.  */
3018 
3019 static unsigned
3020 count_multiple_uses (store_immediate_info *info)
3021 {
3022   gimple *stmt = info->stmt;
3023   unsigned ret = 0;
3024   switch (info->rhs_code)
3025     {
3026     case INTEGER_CST:
3027       return 0;
3028     case BIT_AND_EXPR:
3029     case BIT_IOR_EXPR:
3030     case BIT_XOR_EXPR:
3031       if (info->bit_not_p)
3032 	{
3033 	  if (!has_single_use (gimple_assign_rhs1 (stmt)))
3034 	    ret = 1; /* Fall through below to return
3035 			the BIT_NOT_EXPR stmt and then
3036 			BIT_{AND,IOR,XOR}_EXPR and anything it
3037 			uses.  */
3038 	  else
	    /* After this, stmt is the BIT_NOT_EXPR.  */
3040 	    stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
3041 	}
3042       if (!has_single_use (gimple_assign_rhs1 (stmt)))
3043 	{
3044 	  ret += 1 + info->ops[0].bit_not_p;
3045 	  if (info->ops[1].base_addr)
3046 	    ret += 1 + info->ops[1].bit_not_p;
3047 	  return ret + 1;
3048 	}
3049       stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
3050       /* stmt is now the BIT_*_EXPR.  */
3051       if (!has_single_use (gimple_assign_rhs1 (stmt)))
3052 	ret += 1 + info->ops[info->ops_swapped_p].bit_not_p;
3053       else if (info->ops[info->ops_swapped_p].bit_not_p)
3054 	{
3055 	  gimple *stmt2 = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
3056 	  if (!has_single_use (gimple_assign_rhs1 (stmt2)))
3057 	    ++ret;
3058 	}
3059       if (info->ops[1].base_addr == NULL_TREE)
3060 	{
3061 	  gcc_checking_assert (!info->ops_swapped_p);
3062 	  return ret;
3063 	}
3064       if (!has_single_use (gimple_assign_rhs2 (stmt)))
3065 	ret += 1 + info->ops[1 - info->ops_swapped_p].bit_not_p;
3066       else if (info->ops[1 - info->ops_swapped_p].bit_not_p)
3067 	{
3068 	  gimple *stmt2 = SSA_NAME_DEF_STMT (gimple_assign_rhs2 (stmt));
3069 	  if (!has_single_use (gimple_assign_rhs1 (stmt2)))
3070 	    ++ret;
3071 	}
3072       return ret;
3073     case MEM_REF:
3074       if (!has_single_use (gimple_assign_rhs1 (stmt)))
3075 	return 1 + info->ops[0].bit_not_p;
3076       else if (info->ops[0].bit_not_p)
3077 	{
3078 	  stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
3079 	  if (!has_single_use (gimple_assign_rhs1 (stmt)))
3080 	    return 1;
3081 	}
3082       return 0;
3083     default:
3084       gcc_unreachable ();
3085     }
3086 }
3087 
3088 /* Split a merged store described by GROUP by populating the SPLIT_STORES
3089    vector (if non-NULL) with split_store structs describing the byte offset
3090    (from the base), the bit size and alignment of each store as well as the
3091    original statements involved in each such split group.
3092    This is to separate the splitting strategy from the statement
3093    building/emission/linking done in output_merged_store.
3094    Return number of new stores.
3095    If ALLOW_UNALIGNED_STORE is false, then all stores must be aligned.
3096    If ALLOW_UNALIGNED_LOAD is false, then all loads must be aligned.
3097    If SPLIT_STORES is NULL, it is just a dry run to count number of
3098    new stores.  */
3099 
3100 static unsigned int
3101 split_group (merged_store_group *group, bool allow_unaligned_store,
3102 	     bool allow_unaligned_load,
3103 	     vec<struct split_store *> *split_stores,
3104 	     unsigned *total_orig,
3105 	     unsigned *total_new)
3106 {
3107   unsigned HOST_WIDE_INT pos = group->bitregion_start;
3108   unsigned HOST_WIDE_INT size = group->bitregion_end - pos;
3109   unsigned HOST_WIDE_INT bytepos = pos / BITS_PER_UNIT;
3110   unsigned HOST_WIDE_INT group_align = group->align;
3111   unsigned HOST_WIDE_INT align_base = group->align_base;
3112   unsigned HOST_WIDE_INT group_load_align = group_align;
3113   bool any_orig = false;
3114 
3115   gcc_assert ((size % BITS_PER_UNIT == 0) && (pos % BITS_PER_UNIT == 0));
3116 
3117   if (group->stores[0]->rhs_code == LROTATE_EXPR
3118       || group->stores[0]->rhs_code == NOP_EXPR)
3119     {
      /* For the bswap framework using sets of stores, all the checking
	 has been done earlier in try_coalesce_bswap and the group needs
	 to be emitted as a single store.  */
3123       if (total_orig)
3124 	{
	  /* Avoid the old/new stmt count heuristics.  This should always
	     be beneficial.  */
3127 	  total_new[0] = 1;
3128 	  total_orig[0] = 2;
3129 	}
3130 
3131       if (split_stores)
3132 	{
3133 	  unsigned HOST_WIDE_INT align_bitpos
3134 	    = (group->start - align_base) & (group_align - 1);
3135 	  unsigned HOST_WIDE_INT align = group_align;
3136 	  if (align_bitpos)
3137 	    align = least_bit_hwi (align_bitpos);
3138 	  bytepos = group->start / BITS_PER_UNIT;
3139 	  struct split_store *store
3140 	    = new split_store (bytepos, group->width, align);
3141 	  unsigned int first = 0;
3142 	  find_constituent_stores (group, &store->orig_stores,
3143 				   &first, group->start, group->width);
3144 	  split_stores->safe_push (store);
3145 	}
3146 
3147       return 1;
3148     }
3149 
3150   unsigned int ret = 0, first = 0;
3151   unsigned HOST_WIDE_INT try_pos = bytepos;
3152 
3153   if (total_orig)
3154     {
3155       unsigned int i;
3156       store_immediate_info *info = group->stores[0];
3157 
3158       total_new[0] = 0;
3159       total_orig[0] = 1; /* The orig store.  */
3160       info = group->stores[0];
3161       if (info->ops[0].base_addr)
3162 	total_orig[0]++;
3163       if (info->ops[1].base_addr)
3164 	total_orig[0]++;
3165       switch (info->rhs_code)
3166 	{
3167 	case BIT_AND_EXPR:
3168 	case BIT_IOR_EXPR:
3169 	case BIT_XOR_EXPR:
3170 	  total_orig[0]++; /* The orig BIT_*_EXPR stmt.  */
3171 	  break;
3172 	default:
3173 	  break;
3174 	}
3175       total_orig[0] *= group->stores.length ();
3176 
3177       FOR_EACH_VEC_ELT (group->stores, i, info)
3178 	{
3179 	  total_new[0] += count_multiple_uses (info);
3180 	  total_orig[0] += (info->bit_not_p
3181 			    + info->ops[0].bit_not_p
3182 			    + info->ops[1].bit_not_p);
3183 	}
3184     }
3185 
3186   if (!allow_unaligned_load)
3187     for (int i = 0; i < 2; ++i)
3188       if (group->load_align[i])
3189 	group_load_align = MIN (group_load_align, group->load_align[i]);
3190 
3191   while (size > 0)
3192     {
3193       if ((allow_unaligned_store || group_align <= BITS_PER_UNIT)
3194 	  && group->mask[try_pos - bytepos] == (unsigned char) ~0U)
3195 	{
3196 	  /* Skip padding bytes.  */
3197 	  ++try_pos;
3198 	  size -= BITS_PER_UNIT;
3199 	  continue;
3200 	}
3201 
3202       unsigned HOST_WIDE_INT try_bitpos = try_pos * BITS_PER_UNIT;
3203       unsigned int try_size = MAX_STORE_BITSIZE, nonmasked;
3204       unsigned HOST_WIDE_INT align_bitpos
3205 	= (try_bitpos - align_base) & (group_align - 1);
3206       unsigned HOST_WIDE_INT align = group_align;
3207       if (align_bitpos)
3208 	align = least_bit_hwi (align_bitpos);
3209       if (!allow_unaligned_store)
3210 	try_size = MIN (try_size, align);
3211       if (!allow_unaligned_load)
3212 	{
	  /* If we can't do or don't want to do unaligned loads, we need
	     to take the load alignment into account as well as the
	     store alignment.  */
3216 	  unsigned HOST_WIDE_INT load_align = group_load_align;
3217 	  align_bitpos = (try_bitpos - align_base) & (load_align - 1);
3218 	  if (align_bitpos)
3219 	    load_align = least_bit_hwi (align_bitpos);
3220 	  for (int i = 0; i < 2; ++i)
3221 	    if (group->load_align[i])
3222 	      {
3223 		align_bitpos
3224 		  = known_alignment (try_bitpos
3225 				     - group->stores[0]->bitpos
3226 				     + group->stores[0]->ops[i].bitpos
3227 				     - group->load_align_base[i]);
3228 		if (align_bitpos & (group_load_align - 1))
3229 		  {
3230 		    unsigned HOST_WIDE_INT a = least_bit_hwi (align_bitpos);
3231 		    load_align = MIN (load_align, a);
3232 		  }
3233 	      }
3234 	  try_size = MIN (try_size, load_align);
3235 	}
3236       store_immediate_info *info
3237 	= find_constituent_stores (group, NULL, &first, try_bitpos, try_size);
3238       if (info)
3239 	{
3240 	  /* If there is just one original statement for the range, see if
3241 	     we can just reuse the original store which could be even larger
3242 	     than try_size.  */
3243 	  unsigned HOST_WIDE_INT stmt_end
3244 	    = ROUND_UP (info->bitpos + info->bitsize, BITS_PER_UNIT);
3245 	  info = find_constituent_stores (group, NULL, &first, try_bitpos,
3246 					  stmt_end - try_bitpos);
3247 	  if (info && info->bitpos >= try_bitpos)
3248 	    {
3249 	      try_size = stmt_end - try_bitpos;
3250 	      goto found;
3251 	    }
3252 	}
3253 
3254       /* Approximate store bitsize for the case when there are no padding
3255 	 bits.  */
3256       while (try_size > size)
3257 	try_size /= 2;
3258       /* Now look for whole padding bytes at the end of that bitsize.  */
3259       for (nonmasked = try_size / BITS_PER_UNIT; nonmasked > 0; --nonmasked)
3260 	if (group->mask[try_pos - bytepos + nonmasked - 1]
3261 	    != (unsigned char) ~0U)
3262 	  break;
3263       if (nonmasked == 0)
3264 	{
3265 	  /* If entire try_size range is padding, skip it.  */
3266 	  try_pos += try_size / BITS_PER_UNIT;
3267 	  size -= try_size;
3268 	  continue;
3269 	}
      /* Otherwise try to decrease try_size if the second half, the last
	 3 quarters etc. are padding.  */
3272       nonmasked *= BITS_PER_UNIT;
3273       while (nonmasked <= try_size / 2)
3274 	try_size /= 2;
3275       if (!allow_unaligned_store && group_align > BITS_PER_UNIT)
3276 	{
3277 	  /* Now look for whole padding bytes at the start of that bitsize.  */
3278 	  unsigned int try_bytesize = try_size / BITS_PER_UNIT, masked;
3279 	  for (masked = 0; masked < try_bytesize; ++masked)
3280 	    if (group->mask[try_pos - bytepos + masked] != (unsigned char) ~0U)
3281 	      break;
3282 	  masked *= BITS_PER_UNIT;
3283 	  gcc_assert (masked < try_size);
3284 	  if (masked >= try_size / 2)
3285 	    {
3286 	      while (masked >= try_size / 2)
3287 		{
3288 		  try_size /= 2;
3289 		  try_pos += try_size / BITS_PER_UNIT;
3290 		  size -= try_size;
3291 		  masked -= try_size;
3292 		}
3293 	      /* Need to recompute the alignment, so just retry at the new
3294 		 position.  */
3295 	      continue;
3296 	    }
3297 	}
3298 
3299     found:
3300       ++ret;
3301 
3302       if (split_stores)
3303 	{
3304 	  struct split_store *store
3305 	    = new split_store (try_pos, try_size, align);
3306 	  info = find_constituent_stores (group, &store->orig_stores,
3307 					  &first, try_bitpos, try_size);
3308 	  if (info
3309 	      && info->bitpos >= try_bitpos
3310 	      && info->bitpos + info->bitsize <= try_bitpos + try_size)
3311 	    {
3312 	      store->orig = true;
3313 	      any_orig = true;
3314 	    }
3315 	  split_stores->safe_push (store);
3316 	}
3317 
3318       try_pos += try_size / BITS_PER_UNIT;
3319       size -= try_size;
3320     }
3321 
3322   if (total_orig)
3323     {
3324       unsigned int i;
3325       struct split_store *store;
3326       /* If we are reusing some original stores and any of the
3327 	 original SSA_NAMEs had multiple uses, we need to subtract
3328 	 those now before we add the new ones.  */
3329       if (total_new[0] && any_orig)
3330 	{
3331 	  FOR_EACH_VEC_ELT (*split_stores, i, store)
3332 	    if (store->orig)
3333 	      total_new[0] -= count_multiple_uses (store->orig_stores[0]);
3334 	}
3335       total_new[0] += ret; /* The new store.  */
3336       store_immediate_info *info = group->stores[0];
3337       if (info->ops[0].base_addr)
3338 	total_new[0] += ret;
3339       if (info->ops[1].base_addr)
3340 	total_new[0] += ret;
3341       switch (info->rhs_code)
3342 	{
3343 	case BIT_AND_EXPR:
3344 	case BIT_IOR_EXPR:
3345 	case BIT_XOR_EXPR:
3346 	  total_new[0] += ret; /* The new BIT_*_EXPR stmt.  */
3347 	  break;
3348 	default:
3349 	  break;
3350 	}
3351       FOR_EACH_VEC_ELT (*split_stores, i, store)
3352 	{
3353 	  unsigned int j;
3354 	  bool bit_not_p[3] = { false, false, false };
	  /* If all orig_stores have a particular bit_not_p flag set, then
	     we'd use a BIT_NOT_EXPR stmt and need to account for it.
	     If only some of them have it set, then we'd use a
	     BIT_XOR_EXPR with a mask and need to account for it.  */
3360 	  FOR_EACH_VEC_ELT (store->orig_stores, j, info)
3361 	    {
3362 	      if (info->ops[0].bit_not_p)
3363 		bit_not_p[0] = true;
3364 	      if (info->ops[1].bit_not_p)
3365 		bit_not_p[1] = true;
3366 	      if (info->bit_not_p)
3367 		bit_not_p[2] = true;
3368 	    }
3369 	  total_new[0] += bit_not_p[0] + bit_not_p[1] + bit_not_p[2];
3370 	}
3371 
3372     }
3373 
3374   return ret;
3375 }
3376 
3377 /* Return the operation through which the operand IDX (if < 2) or
3378    result (IDX == 2) should be inverted.  If NOP_EXPR, no inversion
3379    is done, if BIT_NOT_EXPR, all bits are inverted, if BIT_XOR_EXPR,
3380    the bits should be xored with mask.  */
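/* For example (an illustrative sketch): if every constituent store of the
   split group stores a bitwise-negated load, e.g.
     _1 = *q;        _2 = ~_1;  *p = _2;
     _3 = *(q + 1);  _4 = ~_3;  *(p + 1) = _4;
   then invert_op for IDX 0 returns BIT_NOT_EXPR (assuming no padding),
   while if only the first store negates its load it returns BIT_XOR_EXPR
   with MASK covering just the bits of that first store, so the merged
   value is partially xored rather than fully inverted.  */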
3381 
3382 static enum tree_code
3383 invert_op (split_store *split_store, int idx, tree int_type, tree &mask)
3384 {
3385   unsigned int i;
3386   store_immediate_info *info;
3387   unsigned int cnt = 0;
3388   bool any_paddings = false;
3389   FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
3390     {
3391       bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
3392       if (bit_not_p)
3393 	{
3394 	  ++cnt;
3395 	  tree lhs = gimple_assign_lhs (info->stmt);
3396 	  if (INTEGRAL_TYPE_P (TREE_TYPE (lhs))
3397 	      && TYPE_PRECISION (TREE_TYPE (lhs)) < info->bitsize)
3398 	    any_paddings = true;
3399 	}
3400     }
3401   mask = NULL_TREE;
3402   if (cnt == 0)
3403     return NOP_EXPR;
3404   if (cnt == split_store->orig_stores.length () && !any_paddings)
3405     return BIT_NOT_EXPR;
3406 
3407   unsigned HOST_WIDE_INT try_bitpos = split_store->bytepos * BITS_PER_UNIT;
3408   unsigned buf_size = split_store->size / BITS_PER_UNIT;
3409   unsigned char *buf
3410     = XALLOCAVEC (unsigned char, buf_size);
3411   memset (buf, ~0U, buf_size);
3412   FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
3413     {
3414       bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
3415       if (!bit_not_p)
3416 	continue;
      /* Clear regions with bit_not_p and invert afterwards, rather than
	 clearing regions with !bit_not_p, so that gaps in between stores
	 aren't set in the mask.  */
3420       unsigned HOST_WIDE_INT bitsize = info->bitsize;
3421       unsigned HOST_WIDE_INT prec = bitsize;
3422       unsigned int pos_in_buffer = 0;
3423       if (any_paddings)
3424 	{
3425 	  tree lhs = gimple_assign_lhs (info->stmt);
3426 	  if (INTEGRAL_TYPE_P (TREE_TYPE (lhs))
3427 	      && TYPE_PRECISION (TREE_TYPE (lhs)) < bitsize)
3428 	    prec = TYPE_PRECISION (TREE_TYPE (lhs));
3429 	}
3430       if (info->bitpos < try_bitpos)
3431 	{
3432 	  gcc_assert (info->bitpos + bitsize > try_bitpos);
3433 	  if (!BYTES_BIG_ENDIAN)
3434 	    {
3435 	      if (prec <= try_bitpos - info->bitpos)
3436 		continue;
3437 	      prec -= try_bitpos - info->bitpos;
3438 	    }
3439 	  bitsize -= try_bitpos - info->bitpos;
3440 	  if (BYTES_BIG_ENDIAN && prec > bitsize)
3441 	    prec = bitsize;
3442 	}
3443       else
3444 	pos_in_buffer = info->bitpos - try_bitpos;
3445       if (prec < bitsize)
3446 	{
	  /* If this is a bool inversion, invert just the least significant
	     prec bits rather than all of its bits.  */
3449 	  if (BYTES_BIG_ENDIAN)
3450 	    {
3451 	      pos_in_buffer += bitsize - prec;
3452 	      if (pos_in_buffer >= split_store->size)
3453 		continue;
3454 	    }
3455 	  bitsize = prec;
3456 	}
3457       if (pos_in_buffer + bitsize > split_store->size)
3458 	bitsize = split_store->size - pos_in_buffer;
3459       unsigned char *p = buf + (pos_in_buffer / BITS_PER_UNIT);
3460       if (BYTES_BIG_ENDIAN)
3461 	clear_bit_region_be (p, (BITS_PER_UNIT - 1
3462 				 - (pos_in_buffer % BITS_PER_UNIT)), bitsize);
3463       else
3464 	clear_bit_region (p, pos_in_buffer % BITS_PER_UNIT, bitsize);
3465     }
3466   for (unsigned int i = 0; i < buf_size; ++i)
3467     buf[i] = ~buf[i];
3468   mask = native_interpret_expr (int_type, buf, buf_size);
3469   return BIT_XOR_EXPR;
3470 }
3471 
3472 /* Given a merged store group GROUP output the widened version of it.
3473    The store chain is against the base object BASE.
   Try store sizes of at most MAX_STORE_BITSIZE bits and don't output
   unaligned stores for STRICT_ALIGNMENT targets or if it's too expensive.
   Make sure that the number of statements output is less than the number of
   original statements.  If a better sequence is possible, emit it and
3478    return true.  */
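/* A hedged sketch of the emitted code (illustrative, little-endian, no
   padding): merging the four 1-byte constant stores
     p[0] = 1;  p[1] = 2;  p[2] = 3;  p[3] = 4;
   would emit something along the lines of
     MEM[(unsigned int *)p] = 0x04030201;
   whereas a group containing padding bits additionally loads the
   destination, masks it and IORs the new bits in before storing.  */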
3479 
3480 bool
3481 imm_store_chain_info::output_merged_store (merged_store_group *group)
3482 {
3483   split_store *split_store;
3484   unsigned int i;
3485   unsigned HOST_WIDE_INT start_byte_pos
3486     = group->bitregion_start / BITS_PER_UNIT;
3487 
3488   unsigned int orig_num_stmts = group->stores.length ();
3489   if (orig_num_stmts < 2)
3490     return false;
3491 
3492   auto_vec<struct split_store *, 32> split_stores;
3493   bool allow_unaligned_store
3494     = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED);
3495   bool allow_unaligned_load = allow_unaligned_store;
3496   if (allow_unaligned_store)
3497     {
3498       /* If unaligned stores are allowed, see how many stores we'd emit
3499 	 for unaligned and how many stores we'd emit for aligned stores.
3500 	 Only use unaligned stores if it allows fewer stores than aligned.  */
3501       unsigned aligned_cnt
3502 	= split_group (group, false, allow_unaligned_load, NULL, NULL, NULL);
3503       unsigned unaligned_cnt
3504 	= split_group (group, true, allow_unaligned_load, NULL, NULL, NULL);
3505       if (aligned_cnt <= unaligned_cnt)
3506 	allow_unaligned_store = false;
3507     }
3508   unsigned total_orig, total_new;
3509   split_group (group, allow_unaligned_store, allow_unaligned_load,
3510 	       &split_stores, &total_orig, &total_new);
3511 
3512   if (split_stores.length () >= orig_num_stmts)
3513     {
3514       /* We didn't manage to reduce the number of statements.  Bail out.  */
3515       if (dump_file && (dump_flags & TDF_DETAILS))
3516 	fprintf (dump_file, "Exceeded original number of stmts (%u)."
3517 			    "  Not profitable to emit new sequence.\n",
3518 		 orig_num_stmts);
3519       FOR_EACH_VEC_ELT (split_stores, i, split_store)
3520 	delete split_store;
3521       return false;
3522     }
3523   if (total_orig <= total_new)
3524     {
3525       /* If number of estimated new statements is above estimated original
3526 	 statements, bail out too.  */
3527       if (dump_file && (dump_flags & TDF_DETAILS))
3528 	fprintf (dump_file, "Estimated number of original stmts (%u)"
3529 			    " not larger than estimated number of new"
3530 			    " stmts (%u).\n",
3531 		 total_orig, total_new);
3532       FOR_EACH_VEC_ELT (split_stores, i, split_store)
3533 	delete split_store;
3534       return false;
3535     }
3536 
3537   gimple_stmt_iterator last_gsi = gsi_for_stmt (group->last_stmt);
3538   gimple_seq seq = NULL;
3539   tree last_vdef, new_vuse;
3540   last_vdef = gimple_vdef (group->last_stmt);
3541   new_vuse = gimple_vuse (group->last_stmt);
3542   tree bswap_res = NULL_TREE;
3543 
3544   if (group->stores[0]->rhs_code == LROTATE_EXPR
3545       || group->stores[0]->rhs_code == NOP_EXPR)
3546     {
3547       tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
3548       gimple *ins_stmt = group->stores[0]->ins_stmt;
3549       struct symbolic_number *n = &group->stores[0]->n;
3550       bool bswap = group->stores[0]->rhs_code == LROTATE_EXPR;
3551 
3552       switch (n->range)
3553 	{
3554 	case 16:
3555 	  load_type = bswap_type = uint16_type_node;
3556 	  break;
3557 	case 32:
3558 	  load_type = uint32_type_node;
3559 	  if (bswap)
3560 	    {
3561 	      fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
3562 	      bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
3563 	    }
3564 	  break;
3565 	case 64:
3566 	  load_type = uint64_type_node;
3567 	  if (bswap)
3568 	    {
3569 	      fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
3570 	      bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
3571 	    }
3572 	  break;
3573 	default:
3574 	  gcc_unreachable ();
3575 	}
3576 
      /* If each load has the vuse of the corresponding store, we've
	 checked the aliasing already in try_coalesce_bswap and want to
	 sink the load into seq.  So we need to use new_vuse on the
	 load.  */
3581       if (n->base_addr)
3582 	{
3583 	  if (n->vuse == NULL)
3584 	    {
3585 	      n->vuse = new_vuse;
3586 	      ins_stmt = NULL;
3587 	    }
3588 	  else
	    /* Update vuse in case it has been changed by output_merged_stores.  */
3590 	    n->vuse = gimple_vuse (ins_stmt);
3591 	}
3592       bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl,
3593 				 bswap_type, load_type, n, bswap);
3594       gcc_assert (bswap_res);
3595     }
3596 
3597   gimple *stmt = NULL;
3598   auto_vec<gimple *, 32> orig_stmts;
3599   gimple_seq this_seq;
3600   tree addr = force_gimple_operand_1 (unshare_expr (base_addr), &this_seq,
3601 				      is_gimple_mem_ref_addr, NULL_TREE);
3602   gimple_seq_add_seq_without_update (&seq, this_seq);
3603 
3604   tree load_addr[2] = { NULL_TREE, NULL_TREE };
3605   gimple_seq load_seq[2] = { NULL, NULL };
3606   gimple_stmt_iterator load_gsi[2] = { gsi_none (), gsi_none () };
3607   for (int j = 0; j < 2; ++j)
3608     {
3609       store_operand_info &op = group->stores[0]->ops[j];
3610       if (op.base_addr == NULL_TREE)
3611 	continue;
3612 
3613       store_immediate_info *infol = group->stores.last ();
3614       if (gimple_vuse (op.stmt) == gimple_vuse (infol->ops[j].stmt))
3615 	{
	  /* We can't pick the location randomly; while we've verified
	     all the loads have the same vuse, they can still be in
	     different basic blocks and we need to pick the one from the
	     last bb:
3619 	     int x = q[0];
3620 	     if (x == N) return;
3621 	     int y = q[1];
3622 	     p[0] = x;
3623 	     p[1] = y;
3624 	     otherwise if we put the wider load at the q[0] load, we might
3625 	     segfault if q[1] is not mapped.  */
3626 	  basic_block bb = gimple_bb (op.stmt);
3627 	  gimple *ostmt = op.stmt;
3628 	  store_immediate_info *info;
3629 	  FOR_EACH_VEC_ELT (group->stores, i, info)
3630 	    {
3631 	      gimple *tstmt = info->ops[j].stmt;
3632 	      basic_block tbb = gimple_bb (tstmt);
3633 	      if (dominated_by_p (CDI_DOMINATORS, tbb, bb))
3634 		{
3635 		  ostmt = tstmt;
3636 		  bb = tbb;
3637 		}
3638 	    }
3639 	  load_gsi[j] = gsi_for_stmt (ostmt);
3640 	  load_addr[j]
3641 	    = force_gimple_operand_1 (unshare_expr (op.base_addr),
3642 				      &load_seq[j], is_gimple_mem_ref_addr,
3643 				      NULL_TREE);
3644 	}
3645       else if (operand_equal_p (base_addr, op.base_addr, 0))
3646 	load_addr[j] = addr;
3647       else
3648 	{
3649 	  load_addr[j]
3650 	    = force_gimple_operand_1 (unshare_expr (op.base_addr),
3651 				      &this_seq, is_gimple_mem_ref_addr,
3652 				      NULL_TREE);
3653 	  gimple_seq_add_seq_without_update (&seq, this_seq);
3654 	}
3655     }
3656 
3657   FOR_EACH_VEC_ELT (split_stores, i, split_store)
3658     {
3659       unsigned HOST_WIDE_INT try_size = split_store->size;
3660       unsigned HOST_WIDE_INT try_pos = split_store->bytepos;
3661       unsigned HOST_WIDE_INT align = split_store->align;
3662       tree dest, src;
3663       location_t loc;
3664       if (split_store->orig)
3665 	{
3666 	  /* If there is just a single constituent store which covers
3667 	     the whole area, just reuse the lhs and rhs.  */
3668 	  gimple *orig_stmt = split_store->orig_stores[0]->stmt;
3669 	  dest = gimple_assign_lhs (orig_stmt);
3670 	  src = gimple_assign_rhs1 (orig_stmt);
3671 	  loc = gimple_location (orig_stmt);
3672 	}
3673       else
3674 	{
3675 	  store_immediate_info *info;
3676 	  unsigned short clique, base;
3677 	  unsigned int k;
3678 	  FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3679 	    orig_stmts.safe_push (info->stmt);
3680 	  tree offset_type
3681 	    = get_alias_type_for_stmts (orig_stmts, false, &clique, &base);
3682 	  loc = get_location_for_stmts (orig_stmts);
3683 	  orig_stmts.truncate (0);
3684 
3685 	  tree int_type = build_nonstandard_integer_type (try_size, UNSIGNED);
3686 	  int_type = build_aligned_type (int_type, align);
3687 	  dest = fold_build2 (MEM_REF, int_type, addr,
3688 			      build_int_cst (offset_type, try_pos));
3689 	  if (TREE_CODE (dest) == MEM_REF)
3690 	    {
3691 	      MR_DEPENDENCE_CLIQUE (dest) = clique;
3692 	      MR_DEPENDENCE_BASE (dest) = base;
3693 	    }
3694 
3695 	  tree mask = integer_zero_node;
3696 	  if (!bswap_res)
3697 	    mask = native_interpret_expr (int_type,
3698 					  group->mask + try_pos
3699 					  - start_byte_pos,
3700 					  group->buf_size);
3701 
3702 	  tree ops[2];
3703 	  for (int j = 0;
3704 	       j < 1 + (split_store->orig_stores[0]->ops[1].val != NULL_TREE);
3705 	       ++j)
3706 	    {
3707 	      store_operand_info &op = split_store->orig_stores[0]->ops[j];
3708 	      if (bswap_res)
3709 		ops[j] = bswap_res;
3710 	      else if (op.base_addr)
3711 		{
3712 		  FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3713 		    orig_stmts.safe_push (info->ops[j].stmt);
3714 
3715 		  offset_type = get_alias_type_for_stmts (orig_stmts, true,
3716 							  &clique, &base);
3717 		  location_t load_loc = get_location_for_stmts (orig_stmts);
3718 		  orig_stmts.truncate (0);
3719 
3720 		  unsigned HOST_WIDE_INT load_align = group->load_align[j];
3721 		  unsigned HOST_WIDE_INT align_bitpos
3722 		    = known_alignment (try_pos * BITS_PER_UNIT
3723 				       - split_store->orig_stores[0]->bitpos
3724 				       + op.bitpos);
3725 		  if (align_bitpos & (load_align - 1))
3726 		    load_align = least_bit_hwi (align_bitpos);
3727 
3728 		  tree load_int_type
3729 		    = build_nonstandard_integer_type (try_size, UNSIGNED);
3730 		  load_int_type
3731 		    = build_aligned_type (load_int_type, load_align);
3732 
3733 		  poly_uint64 load_pos
3734 		    = exact_div (try_pos * BITS_PER_UNIT
3735 				 - split_store->orig_stores[0]->bitpos
3736 				 + op.bitpos,
3737 				 BITS_PER_UNIT);
3738 		  ops[j] = fold_build2 (MEM_REF, load_int_type, load_addr[j],
3739 					build_int_cst (offset_type, load_pos));
3740 		  if (TREE_CODE (ops[j]) == MEM_REF)
3741 		    {
3742 		      MR_DEPENDENCE_CLIQUE (ops[j]) = clique;
3743 		      MR_DEPENDENCE_BASE (ops[j]) = base;
3744 		    }
3745 		  if (!integer_zerop (mask))
3746 		    /* The load might load some bits (that will be masked off
3747 		       later on) uninitialized, avoid -W*uninitialized
3748 		       warnings in that case.  */
3749 		    TREE_NO_WARNING (ops[j]) = 1;
3750 
3751 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3752 					      ops[j]);
3753 		  gimple_set_location (stmt, load_loc);
3754 		  if (gsi_bb (load_gsi[j]))
3755 		    {
3756 		      gimple_set_vuse (stmt, gimple_vuse (op.stmt));
3757 		      gimple_seq_add_stmt_without_update (&load_seq[j], stmt);
3758 		    }
3759 		  else
3760 		    {
3761 		      gimple_set_vuse (stmt, new_vuse);
3762 		      gimple_seq_add_stmt_without_update (&seq, stmt);
3763 		    }
3764 		  ops[j] = gimple_assign_lhs (stmt);
3765 		  tree xor_mask;
3766 		  enum tree_code inv_op
3767 		    = invert_op (split_store, j, int_type, xor_mask);
3768 		  if (inv_op != NOP_EXPR)
3769 		    {
3770 		      stmt = gimple_build_assign (make_ssa_name (int_type),
3771 						  inv_op, ops[j], xor_mask);
3772 		      gimple_set_location (stmt, load_loc);
3773 		      ops[j] = gimple_assign_lhs (stmt);
3774 
3775 		      if (gsi_bb (load_gsi[j]))
3776 			gimple_seq_add_stmt_without_update (&load_seq[j],
3777 							    stmt);
3778 		      else
3779 			gimple_seq_add_stmt_without_update (&seq, stmt);
3780 		    }
3781 		}
3782 	      else
3783 		ops[j] = native_interpret_expr (int_type,
3784 						group->val + try_pos
3785 						- start_byte_pos,
3786 						group->buf_size);
3787 	    }
3788 
3789 	  switch (split_store->orig_stores[0]->rhs_code)
3790 	    {
3791 	    case BIT_AND_EXPR:
3792 	    case BIT_IOR_EXPR:
3793 	    case BIT_XOR_EXPR:
3794 	      FOR_EACH_VEC_ELT (split_store->orig_stores, k, info)
3795 		{
3796 		  tree rhs1 = gimple_assign_rhs1 (info->stmt);
3797 		  orig_stmts.safe_push (SSA_NAME_DEF_STMT (rhs1));
3798 		}
3799 	      location_t bit_loc;
3800 	      bit_loc = get_location_for_stmts (orig_stmts);
3801 	      orig_stmts.truncate (0);
3802 
3803 	      stmt
3804 		= gimple_build_assign (make_ssa_name (int_type),
3805 				       split_store->orig_stores[0]->rhs_code,
3806 				       ops[0], ops[1]);
3807 	      gimple_set_location (stmt, bit_loc);
3808 	      /* If there is just one load and there is a separate
3809 		 load_seq[0], emit the bitwise op right after it.  */
3810 	      if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
3811 		gimple_seq_add_stmt_without_update (&load_seq[0], stmt);
	      /* Otherwise, if at least one load is in seq, we need to
		 emit the bitwise op right before the store.  If there
		 are two loads and they are emitted somewhere else, it
		 would be better to emit the bitwise op as early as
		 possible; we don't track where that would be possible
		 right now though.  */
3818 	      else
3819 		gimple_seq_add_stmt_without_update (&seq, stmt);
3820 	      src = gimple_assign_lhs (stmt);
3821 	      tree xor_mask;
3822 	      enum tree_code inv_op;
3823 	      inv_op = invert_op (split_store, 2, int_type, xor_mask);
3824 	      if (inv_op != NOP_EXPR)
3825 		{
3826 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3827 					      inv_op, src, xor_mask);
3828 		  gimple_set_location (stmt, bit_loc);
3829 		  if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
3830 		    gimple_seq_add_stmt_without_update (&load_seq[0], stmt);
3831 		  else
3832 		    gimple_seq_add_stmt_without_update (&seq, stmt);
3833 		  src = gimple_assign_lhs (stmt);
3834 		}
3835 	      break;
3836 	    case LROTATE_EXPR:
3837 	    case NOP_EXPR:
3838 	      src = ops[0];
3839 	      if (!is_gimple_val (src))
3840 		{
3841 		  stmt = gimple_build_assign (make_ssa_name (TREE_TYPE (src)),
3842 					      src);
3843 		  gimple_seq_add_stmt_without_update (&seq, stmt);
3844 		  src = gimple_assign_lhs (stmt);
3845 		}
3846 	      if (!useless_type_conversion_p (int_type, TREE_TYPE (src)))
3847 		{
3848 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3849 					      NOP_EXPR, src);
3850 		  gimple_seq_add_stmt_without_update (&seq, stmt);
3851 		  src = gimple_assign_lhs (stmt);
3852 		}
3853 	      inv_op = invert_op (split_store, 2, int_type, xor_mask);
3854 	      if (inv_op != NOP_EXPR)
3855 		{
3856 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3857 					      inv_op, src, xor_mask);
3858 		  gimple_set_location (stmt, loc);
3859 		  gimple_seq_add_stmt_without_update (&seq, stmt);
3860 		  src = gimple_assign_lhs (stmt);
3861 		}
3862 	      break;
3863 	    default:
3864 	      src = ops[0];
3865 	      break;
3866 	    }
3867 
3868 	  if (!integer_zerop (mask))
3869 	    {
3870 	      tree tem = make_ssa_name (int_type);
3871 	      tree load_src = unshare_expr (dest);
	      /* The load might load some or all bits uninitialized,
		 avoid -W*uninitialized warnings in that case.
		 As an optimization, if all the bits were provably
		 uninitialized (no stores at all yet or the previous
		 store was a CLOBBER), it would be nice to optimize away
		 the load and replace it e.g. with 0.  */
3878 	      TREE_NO_WARNING (load_src) = 1;
3879 	      stmt = gimple_build_assign (tem, load_src);
3880 	      gimple_set_location (stmt, loc);
3881 	      gimple_set_vuse (stmt, new_vuse);
3882 	      gimple_seq_add_stmt_without_update (&seq, stmt);
3883 
3884 	      /* FIXME: If there is a single chunk of zero bits in mask,
3885 		 perhaps use BIT_INSERT_EXPR instead?  */
3886 	      stmt = gimple_build_assign (make_ssa_name (int_type),
3887 					  BIT_AND_EXPR, tem, mask);
3888 	      gimple_set_location (stmt, loc);
3889 	      gimple_seq_add_stmt_without_update (&seq, stmt);
3890 	      tem = gimple_assign_lhs (stmt);
3891 
3892 	      if (TREE_CODE (src) == INTEGER_CST)
3893 		src = wide_int_to_tree (int_type,
3894 					wi::bit_and_not (wi::to_wide (src),
3895 							 wi::to_wide (mask)));
3896 	      else
3897 		{
3898 		  tree nmask
3899 		    = wide_int_to_tree (int_type,
3900 					wi::bit_not (wi::to_wide (mask)));
3901 		  stmt = gimple_build_assign (make_ssa_name (int_type),
3902 					      BIT_AND_EXPR, src, nmask);
3903 		  gimple_set_location (stmt, loc);
3904 		  gimple_seq_add_stmt_without_update (&seq, stmt);
3905 		  src = gimple_assign_lhs (stmt);
3906 		}
3907 	      stmt = gimple_build_assign (make_ssa_name (int_type),
3908 					  BIT_IOR_EXPR, tem, src);
3909 	      gimple_set_location (stmt, loc);
3910 	      gimple_seq_add_stmt_without_update (&seq, stmt);
3911 	      src = gimple_assign_lhs (stmt);
3912 	    }
3913 	}
3914 
3915       stmt = gimple_build_assign (dest, src);
3916       gimple_set_location (stmt, loc);
3917       gimple_set_vuse (stmt, new_vuse);
3918       gimple_seq_add_stmt_without_update (&seq, stmt);
3919 
3920       tree new_vdef;
3921       if (i < split_stores.length () - 1)
3922 	new_vdef = make_ssa_name (gimple_vop (cfun), stmt);
3923       else
3924 	new_vdef = last_vdef;
3925 
3926       gimple_set_vdef (stmt, new_vdef);
3927       SSA_NAME_DEF_STMT (new_vdef) = stmt;
3928       new_vuse = new_vdef;
3929     }
3930 
3931   FOR_EACH_VEC_ELT (split_stores, i, split_store)
3932     delete split_store;
3933 
3934   gcc_assert (seq);
3935   if (dump_file)
3936     {
3937       fprintf (dump_file,
3938 	       "New sequence of %u stmts to replace old one of %u stmts\n",
3939 	       split_stores.length (), orig_num_stmts);
3940       if (dump_flags & TDF_DETAILS)
3941 	print_gimple_seq (dump_file, seq, 0, TDF_VOPS | TDF_MEMSYMS);
3942     }
3943   gsi_insert_seq_after (&last_gsi, seq, GSI_SAME_STMT);
3944   for (int j = 0; j < 2; ++j)
3945     if (load_seq[j])
3946       gsi_insert_seq_after (&load_gsi[j], load_seq[j], GSI_SAME_STMT);
3947 
3948   return true;
3949 }
3950 
3951 /* Process the merged_store_group objects created in the coalescing phase.
3952    The stores are all against the base object BASE.
3953    Try to output the widened stores and delete the original statements if
3954    successful.  Return true iff any changes were made.  */
3955 
3956 bool
3957 imm_store_chain_info::output_merged_stores ()
3958 {
3959   unsigned int i;
3960   merged_store_group *merged_store;
3961   bool ret = false;
3962   FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_store)
3963     {
3964       if (output_merged_store (merged_store))
3965 	{
3966 	  unsigned int j;
3967 	  store_immediate_info *store;
3968 	  FOR_EACH_VEC_ELT (merged_store->stores, j, store)
3969 	    {
3970 	      gimple *stmt = store->stmt;
3971 	      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
3972 	      gsi_remove (&gsi, true);
3973 	      if (stmt != merged_store->last_stmt)
3974 		{
3975 		  unlink_stmt_vdef (stmt);
3976 		  release_defs (stmt);
3977 		}
3978 	    }
3979 	  ret = true;
3980 	}
3981     }
3982   if (ret && dump_file)
3983     fprintf (dump_file, "Merging successful!\n");
3984 
3985   return ret;
3986 }
3987 
3988 /* Coalesce the store_immediate_info objects recorded against the base object
3989    BASE in the first phase and output them.
3990    Delete the allocated structures.
3991    Return true if any changes were made.  */
3992 
3993 bool
3994 imm_store_chain_info::terminate_and_process_chain ()
3995 {
3996   /* Process store chain.  */
3997   bool ret = false;
3998   if (m_store_info.length () > 1)
3999     {
4000       ret = coalesce_immediate_stores ();
4001       if (ret)
4002 	ret = output_merged_stores ();
4003     }
4004 
4005   /* Delete all the entries we allocated ourselves.  */
4006   store_immediate_info *info;
4007   unsigned int i;
4008   FOR_EACH_VEC_ELT (m_store_info, i, info)
4009     delete info;
4010 
4011   merged_store_group *merged_info;
4012   FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_info)
4013     delete merged_info;
4014 
4015   return ret;
4016 }
4017 
4018 /* Return true iff LHS is a destination potentially interesting for
4019    store merging.  In practice these are the codes that get_inner_reference
4020    can process.  */
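/* For instance (illustrative), destinations such as s.f, a[i], *p or a
   BIT_FIELD_REF are handled, while a store to a plain SSA_NAME or to a
   bare VAR_DECL as a whole is not considered here.  */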
4021 
4022 static bool
4023 lhs_valid_for_store_merging_p (tree lhs)
4024 {
4025   tree_code code = TREE_CODE (lhs);
4026 
4027   if (code == ARRAY_REF || code == ARRAY_RANGE_REF || code == MEM_REF
4028       || code == COMPONENT_REF || code == BIT_FIELD_REF)
4029     return true;
4030 
4031   return false;
4032 }
4033 
4034 /* Return true if the tree RHS is a constant we want to consider
4035    during store merging.  In practice accept all codes that
4036    native_encode_expr accepts.  */
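/* For instance an INTEGER_CST or REAL_CST of a fixed-size mode is
   accepted, while constants that native_encode_expr cannot serialize
   into bytes, or whose size is not a compile-time constant, are
   rejected.  */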
4037 
4038 static bool
4039 rhs_valid_for_store_merging_p (tree rhs)
4040 {
4041   unsigned HOST_WIDE_INT size;
4042   return (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (rhs))).is_constant (&size)
4043 	  && native_encode_expr (rhs, NULL, size) != 0);
4044 }
4045 
4046 /* If MEM is a memory reference usable for store merging (either as
4047    store destination or for loads), return the non-NULL base_addr
4048    and set *PBITSIZE, *PBITPOS, *PBITREGION_START and *PBITREGION_END.
   Otherwise return NULL_TREE; *PBITSIZE is still valid even in that
   case.  */
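/* A hypothetical example: for the reference MEM[(int *)p + 4B] this
   returns P as the base address and sets *PBITPOS to 32, folding the
   constant byte offset into the bit position; for a bit-field member
   such as s.b it additionally fills in *PBITREGION_START and
   *PBITREGION_END with the bit region that a read-modify-write of the
   bit-field may touch.  */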
4051 
4052 static tree
4053 mem_valid_for_store_merging (tree mem, poly_uint64 *pbitsize,
4054 			     poly_uint64 *pbitpos,
4055 			     poly_uint64 *pbitregion_start,
4056 			     poly_uint64 *pbitregion_end)
4057 {
4058   poly_int64 bitsize, bitpos;
4059   poly_uint64 bitregion_start = 0, bitregion_end = 0;
4060   machine_mode mode;
4061   int unsignedp = 0, reversep = 0, volatilep = 0;
4062   tree offset;
4063   tree base_addr = get_inner_reference (mem, &bitsize, &bitpos, &offset, &mode,
4064 					&unsignedp, &reversep, &volatilep);
4065   *pbitsize = bitsize;
4066   if (known_eq (bitsize, 0))
4067     return NULL_TREE;
4068 
4069   if (TREE_CODE (mem) == COMPONENT_REF
4070       && DECL_BIT_FIELD_TYPE (TREE_OPERAND (mem, 1)))
4071     {
4072       get_bit_range (&bitregion_start, &bitregion_end, mem, &bitpos, &offset);
4073       if (maybe_ne (bitregion_end, 0U))
4074 	bitregion_end += 1;
4075     }
4076 
4077   if (reversep)
4078     return NULL_TREE;
4079 
4080   /* We do not want to rewrite TARGET_MEM_REFs.  */
4081   if (TREE_CODE (base_addr) == TARGET_MEM_REF)
4082     return NULL_TREE;
4083   /* In some cases get_inner_reference may return a
4084      MEM_REF [ptr + byteoffset].  For the purposes of this pass
4085      canonicalize the base_addr to MEM_REF [ptr] and take
4086      byteoffset into account in the bitpos.  This occurs in
4087      PR 23684 and this way we can catch more chains.  */
4088   else if (TREE_CODE (base_addr) == MEM_REF)
4089     {
4090       poly_offset_int byte_off = mem_ref_offset (base_addr);
4091       poly_offset_int bit_off = byte_off << LOG2_BITS_PER_UNIT;
4092       bit_off += bitpos;
4093       if (known_ge (bit_off, 0) && bit_off.to_shwi (&bitpos))
4094 	{
4095 	  if (maybe_ne (bitregion_end, 0U))
4096 	    {
4097 	      bit_off = byte_off << LOG2_BITS_PER_UNIT;
4098 	      bit_off += bitregion_start;
4099 	      if (bit_off.to_uhwi (&bitregion_start))
4100 		{
4101 		  bit_off = byte_off << LOG2_BITS_PER_UNIT;
4102 		  bit_off += bitregion_end;
4103 		  if (!bit_off.to_uhwi (&bitregion_end))
4104 		    bitregion_end = 0;
4105 		}
4106 	      else
4107 		bitregion_end = 0;
4108 	    }
4109 	}
4110       else
4111 	return NULL_TREE;
4112       base_addr = TREE_OPERAND (base_addr, 0);
4113     }
4114   /* get_inner_reference returns the base object, get at its
4115      address now.  */
4116   else
4117     {
4118       if (maybe_lt (bitpos, 0))
4119 	return NULL_TREE;
4120       base_addr = build_fold_addr_expr (base_addr);
4121     }
4122 
4123   if (known_eq (bitregion_end, 0U))
4124     {
4125       bitregion_start = round_down_to_byte_boundary (bitpos);
4126       bitregion_end = bitpos;
4127       bitregion_end = round_up_to_byte_boundary (bitregion_end + bitsize);
4128     }
4129 
4130   if (offset != NULL_TREE)
4131     {
      /* If the access has a variable offset then a base decl has to be
	 address-taken to be able to emit pointer-based stores to it.
4134 	 ???  We might be able to get away with re-using the original
4135 	 base up to the first variable part and then wrapping that inside
4136 	 a BIT_FIELD_REF.  */
4137       tree base = get_base_address (base_addr);
4138       if (! base
4139 	  || (DECL_P (base) && ! TREE_ADDRESSABLE (base)))
4140 	return NULL_TREE;
4141 
4142       base_addr = build2 (POINTER_PLUS_EXPR, TREE_TYPE (base_addr),
4143 			  base_addr, offset);
4144     }
4145 
4146   *pbitsize = bitsize;
4147   *pbitpos = bitpos;
4148   *pbitregion_start = bitregion_start;
4149   *pbitregion_end = bitregion_end;
4150   return base_addr;
4151 }
4152 
4153 /* Return true if STMT is a load that can be used for store merging.
4154    In that case fill in *OP.  BITSIZE, BITPOS, BITREGION_START and
4155    BITREGION_END are properties of the corresponding store.  */
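/* For instance (illustrative GIMPLE):
     _1 = MEM[(char *)q + 1B];
     _2 = ~_1;
     MEM[(char *)p + 1B] = _2;
   calling this on the definition of _2 recurses through the BIT_NOT_EXPR,
   fills in *OP from the load of *(q + 1B) and sets OP->bit_not_p.  */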
4156 
4157 static bool
4158 handled_load (gimple *stmt, store_operand_info *op,
4159 	      poly_uint64 bitsize, poly_uint64 bitpos,
4160 	      poly_uint64 bitregion_start, poly_uint64 bitregion_end)
4161 {
4162   if (!is_gimple_assign (stmt))
4163     return false;
4164   if (gimple_assign_rhs_code (stmt) == BIT_NOT_EXPR)
4165     {
4166       tree rhs1 = gimple_assign_rhs1 (stmt);
4167       if (TREE_CODE (rhs1) == SSA_NAME
4168 	  && handled_load (SSA_NAME_DEF_STMT (rhs1), op, bitsize, bitpos,
4169 			   bitregion_start, bitregion_end))
4170 	{
	  /* Don't allow _1 = load; _2 = ~_1; _3 = ~_2; which should have
	     been optimized earlier, but if allowed here, would confuse the
	     multiple uses counting.  */
4174 	  if (op->bit_not_p)
4175 	    return false;
4176 	  op->bit_not_p = !op->bit_not_p;
4177 	  return true;
4178 	}
4179       return false;
4180     }
4181   if (gimple_vuse (stmt)
4182       && gimple_assign_load_p (stmt)
4183       && !stmt_can_throw_internal (stmt)
4184       && !gimple_has_volatile_ops (stmt))
4185     {
4186       tree mem = gimple_assign_rhs1 (stmt);
4187       op->base_addr
4188 	= mem_valid_for_store_merging (mem, &op->bitsize, &op->bitpos,
4189 				       &op->bitregion_start,
4190 				       &op->bitregion_end);
4191       if (op->base_addr != NULL_TREE
4192 	  && known_eq (op->bitsize, bitsize)
4193 	  && multiple_p (op->bitpos - bitpos, BITS_PER_UNIT)
4194 	  && known_ge (op->bitpos - op->bitregion_start,
4195 		       bitpos - bitregion_start)
4196 	  && known_ge (op->bitregion_end - op->bitpos,
4197 		       bitregion_end - bitpos))
4198 	{
4199 	  op->stmt = stmt;
4200 	  op->val = mem;
4201 	  op->bit_not_p = false;
4202 	  return true;
4203 	}
4204     }
4205   return false;
4206 }
4207 
4208 /* Record the store STMT for store merging optimization if it can be
4209    optimized.  */
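/* The right-hand sides recognized here include, illustratively:
     *p = 5;                               constant (rhs_code INTEGER_CST)
     _1 = *q;  *p = _1;                    copy from memory (MEM_REF)
     _1 = *q;  _2 = _1 ^ 3;  *p = _2;      bitwise op of a load (BIT_XOR_EXPR)
   plus byte-swap/identity patterns detected via find_bswap_or_nop_1;
   other statements only terminate the chains they may alias.  */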
4210 
4211 void
4212 pass_store_merging::process_store (gimple *stmt)
4213 {
4214   tree lhs = gimple_assign_lhs (stmt);
4215   tree rhs = gimple_assign_rhs1 (stmt);
4216   poly_uint64 bitsize, bitpos;
4217   poly_uint64 bitregion_start, bitregion_end;
4218   tree base_addr
4219     = mem_valid_for_store_merging (lhs, &bitsize, &bitpos,
4220 				   &bitregion_start, &bitregion_end);
4221   if (known_eq (bitsize, 0U))
4222     return;
4223 
4224   bool invalid = (base_addr == NULL_TREE
4225 		  || (maybe_gt (bitsize,
4226 				(unsigned int) MAX_BITSIZE_MODE_ANY_INT)
4227 		      && (TREE_CODE (rhs) != INTEGER_CST)));
4228   enum tree_code rhs_code = ERROR_MARK;
4229   bool bit_not_p = false;
4230   struct symbolic_number n;
4231   gimple *ins_stmt = NULL;
4232   store_operand_info ops[2];
4233   if (invalid)
4234     ;
4235   else if (rhs_valid_for_store_merging_p (rhs))
4236     {
4237       rhs_code = INTEGER_CST;
4238       ops[0].val = rhs;
4239     }
4240   else if (TREE_CODE (rhs) != SSA_NAME)
4241     invalid = true;
4242   else
4243     {
4244       gimple *def_stmt = SSA_NAME_DEF_STMT (rhs), *def_stmt1, *def_stmt2;
4245       if (!is_gimple_assign (def_stmt))
4246 	invalid = true;
4247       else if (handled_load (def_stmt, &ops[0], bitsize, bitpos,
4248 			     bitregion_start, bitregion_end))
4249 	rhs_code = MEM_REF;
4250       else if (gimple_assign_rhs_code (def_stmt) == BIT_NOT_EXPR)
4251 	{
4252 	  tree rhs1 = gimple_assign_rhs1 (def_stmt);
4253 	  if (TREE_CODE (rhs1) == SSA_NAME
4254 	      && is_gimple_assign (SSA_NAME_DEF_STMT (rhs1)))
4255 	    {
4256 	      bit_not_p = true;
4257 	      def_stmt = SSA_NAME_DEF_STMT (rhs1);
4258 	    }
4259 	}
4260       if (rhs_code == ERROR_MARK && !invalid)
4261 	switch ((rhs_code = gimple_assign_rhs_code (def_stmt)))
4262 	  {
4263 	  case BIT_AND_EXPR:
4264 	  case BIT_IOR_EXPR:
4265 	  case BIT_XOR_EXPR:
4266 	    tree rhs1, rhs2;
4267 	    rhs1 = gimple_assign_rhs1 (def_stmt);
4268 	    rhs2 = gimple_assign_rhs2 (def_stmt);
4269 	    invalid = true;
4270 	    if (TREE_CODE (rhs1) != SSA_NAME)
4271 	      break;
4272 	    def_stmt1 = SSA_NAME_DEF_STMT (rhs1);
4273 	    if (!is_gimple_assign (def_stmt1)
4274 		|| !handled_load (def_stmt1, &ops[0], bitsize, bitpos,
4275 				  bitregion_start, bitregion_end))
4276 	      break;
4277 	    if (rhs_valid_for_store_merging_p (rhs2))
4278 	      ops[1].val = rhs2;
4279 	    else if (TREE_CODE (rhs2) != SSA_NAME)
4280 	      break;
4281 	    else
4282 	      {
4283 		def_stmt2 = SSA_NAME_DEF_STMT (rhs2);
4284 		if (!is_gimple_assign (def_stmt2))
4285 		  break;
4286 		else if (!handled_load (def_stmt2, &ops[1], bitsize, bitpos,
4287 					bitregion_start, bitregion_end))
4288 		  break;
4289 	      }
4290 	    invalid = false;
4291 	    break;
4292 	  default:
4293 	    invalid = true;
4294 	    break;
4295 	  }
4296       unsigned HOST_WIDE_INT const_bitsize;
4297       if (bitsize.is_constant (&const_bitsize)
4298 	  && multiple_p (const_bitsize, BITS_PER_UNIT)
4299 	  && multiple_p (bitpos, BITS_PER_UNIT)
4300 	  && const_bitsize <= 64
4301 	  && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
4302 	{
4303 	  ins_stmt = find_bswap_or_nop_1 (def_stmt, &n, 12);
4304 	  if (ins_stmt)
4305 	    {
4306 	      uint64_t nn = n.n;
4307 	      for (unsigned HOST_WIDE_INT i = 0;
4308 		   i < const_bitsize;
4309 		   i += BITS_PER_UNIT, nn >>= BITS_PER_MARKER)
4310 		if ((nn & MARKER_MASK) == 0
4311 		    || (nn & MARKER_MASK) == MARKER_BYTE_UNKNOWN)
4312 		  {
4313 		    ins_stmt = NULL;
4314 		    break;
4315 		  }
4316 	      if (ins_stmt)
4317 		{
4318 		  if (invalid)
4319 		    {
4320 		      rhs_code = LROTATE_EXPR;
4321 		      ops[0].base_addr = NULL_TREE;
4322 		      ops[1].base_addr = NULL_TREE;
4323 		    }
4324 		  invalid = false;
4325 		}
4326 	    }
4327 	}
4328     }
4329 
4330   unsigned HOST_WIDE_INT const_bitsize, const_bitpos;
4331   unsigned HOST_WIDE_INT const_bitregion_start, const_bitregion_end;
4332   if (invalid
4333       || !bitsize.is_constant (&const_bitsize)
4334       || !bitpos.is_constant (&const_bitpos)
4335       || !bitregion_start.is_constant (&const_bitregion_start)
4336       || !bitregion_end.is_constant (&const_bitregion_end))
4337     {
4338       terminate_all_aliasing_chains (NULL, stmt);
4339       return;
4340     }
4341 
4342   if (!ins_stmt)
4343     memset (&n, 0, sizeof (n));
4344 
4345   struct imm_store_chain_info **chain_info = NULL;
4346   if (base_addr)
4347     chain_info = m_stores.get (base_addr);
4348 
4349   store_immediate_info *info;
4350   if (chain_info)
4351     {
4352       unsigned int ord = (*chain_info)->m_store_info.length ();
4353       info = new store_immediate_info (const_bitsize, const_bitpos,
4354 				       const_bitregion_start,
4355 				       const_bitregion_end,
4356 				       stmt, ord, rhs_code, n, ins_stmt,
4357 				       bit_not_p, ops[0], ops[1]);
4358       if (dump_file && (dump_flags & TDF_DETAILS))
4359 	{
4360 	  fprintf (dump_file, "Recording immediate store from stmt:\n");
4361 	  print_gimple_stmt (dump_file, stmt, 0);
4362 	}
4363       (*chain_info)->m_store_info.safe_push (info);
4364       terminate_all_aliasing_chains (chain_info, stmt);
      /* If we reach the limit of stores to merge in a chain, terminate
	 and process the chain now.  */
4367       if ((*chain_info)->m_store_info.length ()
4368 	  == (unsigned int) PARAM_VALUE (PARAM_MAX_STORES_TO_MERGE))
4369 	{
4370 	  if (dump_file && (dump_flags & TDF_DETAILS))
4371 	    fprintf (dump_file,
4372 		     "Reached maximum number of statements to merge:\n");
4373 	  terminate_and_release_chain (*chain_info);
4374 	}
4375       return;
4376     }
4377 
  /* Does this store alias any existing chain?  */
4379   terminate_all_aliasing_chains (NULL, stmt);
4380   /* Start a new chain.  */
4381   struct imm_store_chain_info *new_chain
4382     = new imm_store_chain_info (m_stores_head, base_addr);
4383   info = new store_immediate_info (const_bitsize, const_bitpos,
4384 				   const_bitregion_start,
4385 				   const_bitregion_end,
4386 				   stmt, 0, rhs_code, n, ins_stmt,
4387 				   bit_not_p, ops[0], ops[1]);
4388   new_chain->m_store_info.safe_push (info);
4389   m_stores.put (base_addr, new_chain);
4390   if (dump_file && (dump_flags & TDF_DETAILS))
4391     {
4392       fprintf (dump_file, "Starting new chain with statement:\n");
4393       print_gimple_stmt (dump_file, stmt, 0);
4394       fprintf (dump_file, "The base object is:\n");
4395       print_generic_expr (dump_file, base_addr);
4396       fprintf (dump_file, "\n");
4397     }
4398 }
4399 
4400 /* Entry point for the pass.  Go over each basic block recording chains of
4401    immediate stores.  Upon encountering a terminating statement (as defined
4402    by stmt_terminates_chain_p) process the recorded stores and emit the widened
4403    variants.  */
4404 
4405 unsigned int
4406 pass_store_merging::execute (function *fun)
4407 {
4408   basic_block bb;
4409   hash_set<gimple *> orig_stmts;
4410 
4411   calculate_dominance_info (CDI_DOMINATORS);
4412 
4413   FOR_EACH_BB_FN (bb, fun)
4414     {
4415       gimple_stmt_iterator gsi;
4416       unsigned HOST_WIDE_INT num_statements = 0;
      /* Quickly check whether the basic block contains at least two
	 non-debug statements; single-statement blocks have nothing to
	 merge.  */
4420       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
4421 	{
4422 	  if (is_gimple_debug (gsi_stmt (gsi)))
4423 	    continue;
4424 
4425 	  if (++num_statements >= 2)
4426 	    break;
4427 	}
4428 
4429       if (num_statements < 2)
4430 	continue;
4431 
4432       if (dump_file && (dump_flags & TDF_DETAILS))
4433 	fprintf (dump_file, "Processing basic block <%d>:\n", bb->index);
4434 
4435       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
4436 	{
4437 	  gimple *stmt = gsi_stmt (gsi);
4438 
4439 	  if (is_gimple_debug (stmt))
4440 	    continue;
4441 
4442 	  if (gimple_has_volatile_ops (stmt))
4443 	    {
4444 	      /* Terminate all chains.  */
4445 	      if (dump_file && (dump_flags & TDF_DETAILS))
4446 		fprintf (dump_file, "Volatile access terminates "
4447 				    "all chains\n");
4448 	      terminate_and_process_all_chains ();
4449 	      continue;
4450 	    }
4451 
4452 	  if (gimple_assign_single_p (stmt) && gimple_vdef (stmt)
4453 	      && !stmt_can_throw_internal (stmt)
4454 	      && lhs_valid_for_store_merging_p (gimple_assign_lhs (stmt)))
4455 	    process_store (stmt);
4456 	  else
4457 	    terminate_all_aliasing_chains (NULL, stmt);
4458 	}
4459       terminate_and_process_all_chains ();
4460     }
4461   return 0;
4462 }
4463 
4464 } // anon namespace
4465 
4466 /* Construct and return a store merging pass object.  */
4467 
4468 gimple_opt_pass *
4469 make_pass_store_merging (gcc::context *ctxt)
4470 {
4471   return new pass_store_merging (ctxt);
4472 }
4473 
4474 #if CHECKING_P
4475 
4476 namespace selftest {
4477 
4478 /* Selftests for store merging helpers.  */
4479 
/* Assert that all elements of the byte arrays X and Y, both of length N,
   are equal.  */
4482 
4483 static void
4484 verify_array_eq (unsigned char *x, unsigned char *y, unsigned int n)
4485 {
4486   for (unsigned int i = 0; i < n; i++)
4487     {
4488       if (x[i] != y[i])
4489 	{
4490 	  fprintf (stderr, "Arrays do not match.  X:\n");
4491 	  dump_char_array (stderr, x, n);
4492 	  fprintf (stderr, "Y:\n");
4493 	  dump_char_array (stderr, y, n);
4494 	}
4495       ASSERT_EQ (x[i], y[i]);
4496     }
4497 }
4498 
/* Test shift_bytes_in_array and that it carries bits across bytes
   correctly.  */
4501 
4502 static void
4503 verify_shift_bytes_in_array (void)
4504 {
4505    /* byte 1   | byte 0
4506       00011111 | 11100000.  */
4507   unsigned char orig[2] = { 0xe0, 0x1f };
4508   unsigned char in[2];
4509   memcpy (in, orig, sizeof orig);
4510 
4511   unsigned char expected[2] = { 0x80, 0x7f };
4512   shift_bytes_in_array (in, sizeof (in), 2);
4513   verify_array_eq (in, expected, sizeof (in));
4514 
4515   memcpy (in, orig, sizeof orig);
4516   memcpy (expected, orig, sizeof orig);
4517   /* Check that shifting by zero doesn't change anything.  */
4518   shift_bytes_in_array (in, sizeof (in), 0);
4519   verify_array_eq (in, expected, sizeof (in));
4520 
4521 }
4522 
/* Test shift_bytes_in_array_right and that it carries bits across bytes
   correctly.  */
4525 
4526 static void
4527 verify_shift_bytes_in_array_right (void)
4528 {
   /* byte 0   | byte 1
      00011111 | 11100000.  */
4531   unsigned char orig[2] = { 0x1f, 0xe0};
4532   unsigned char in[2];
4533   memcpy (in, orig, sizeof orig);
4534   unsigned char expected[2] = { 0x07, 0xf8};
4535   shift_bytes_in_array_right (in, sizeof (in), 2);
4536   verify_array_eq (in, expected, sizeof (in));
4537 
4538   memcpy (in, orig, sizeof orig);
4539   memcpy (expected, orig, sizeof orig);
4540   /* Check that shifting by zero doesn't change anything.  */
4541   shift_bytes_in_array_right (in, sizeof (in), 0);
4542   verify_array_eq (in, expected, sizeof (in));
4543 }
4544 
/* Test that clear_bit_region clears exactly the bits asked and
   nothing more.  */
4547 
4548 static void
4549 verify_clear_bit_region (void)
4550 {
4551   /* Start with all bits set and test clearing various patterns in them.  */
4552   unsigned char orig[3] = { 0xff, 0xff, 0xff};
4553   unsigned char in[3];
4554   unsigned char expected[3];
4555   memcpy (in, orig, sizeof in);
4556 
4557   /* Check zeroing out all the bits.  */
4558   clear_bit_region (in, 0, 3 * BITS_PER_UNIT);
4559   expected[0] = expected[1] = expected[2] = 0;
4560   verify_array_eq (in, expected, sizeof in);
4561 
4562   memcpy (in, orig, sizeof in);
4563   /* Leave the first and last bits intact.  */
4564   clear_bit_region (in, 1, 3 * BITS_PER_UNIT - 2);
4565   expected[0] = 0x1;
4566   expected[1] = 0;
4567   expected[2] = 0x80;
4568   verify_array_eq (in, expected, sizeof in);
4569 }
4570 
/* Test that clear_bit_region_be clears exactly the bits asked and
   nothing more.  */
4573 
4574 static void
4575 verify_clear_bit_region_be (void)
4576 {
4577   /* Start with all bits set and test clearing various patterns in them.  */
4578   unsigned char orig[3] = { 0xff, 0xff, 0xff};
4579   unsigned char in[3];
4580   unsigned char expected[3];
4581   memcpy (in, orig, sizeof in);
4582 
4583   /* Check zeroing out all the bits.  */
4584   clear_bit_region_be (in, BITS_PER_UNIT - 1, 3 * BITS_PER_UNIT);
4585   expected[0] = expected[1] = expected[2] = 0;
4586   verify_array_eq (in, expected, sizeof in);
4587 
4588   memcpy (in, orig, sizeof in);
4589   /* Leave the first and last bits intact.  */
4590   clear_bit_region_be (in, BITS_PER_UNIT - 2, 3 * BITS_PER_UNIT - 2);
4591   expected[0] = 0x80;
4592   expected[1] = 0;
4593   expected[2] = 0x1;
4594   verify_array_eq (in, expected, sizeof in);
4595 }
4596 
4597 
4598 /* Run all of the selftests within this file.  */
4599 
4600 void
4601 store_merging_c_tests (void)
4602 {
4603   verify_shift_bytes_in_array ();
4604   verify_shift_bytes_in_array_right ();
4605   verify_clear_bit_region ();
4606   verify_clear_bit_region_be ();
4607 }
4608 
4609 } // namespace selftest
4610 #endif /* CHECKING_P.  */
4611