/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
#include "profile-count.h"
#include "predict.h"

/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx	= operands[1];
  rtx align_rtx = operands[3];
  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just let the compiler call
     memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;
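
  /* E.g. with clear_step of 16, the checks above allow inline clears
     of up to 48 bytes (3 stores) at -Os and up to 128 bytes (8
     stores) otherwise; anything larger is left to memset.  */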

  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* clear 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* clear 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* clear 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
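
/* Note: this clear expander is reached from the setmem patterns in
   rs6000.md; returning 0 there makes the pattern FAIL so the middle
   end falls back to a memset library call.  */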

/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    default:
      gcc_unreachable ();
      break;
    }
}

/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
   the largest allowable mode.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align, bool word_mode_ok)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
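
  /* E.g. bytes of 5 with align of 4 gives maxread of 8: reading up to
     the remainder rounded up to the alignment stays within the aligned
     region containing the block, so it cannot fault on a new page.  */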

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && offset >= UNITS_PER_WORD-bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* Final fallback: do one byte at a time.  */
  return QImode;
}

/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
			   unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
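  /* offset & -offset isolates the lowest set bit of OFFSET, i.e. the
     largest power of two dividing it; pointer+OFFSET cannot be assumed
     any more aligned than that (e.g. base_align 16, offset 12 -> 4).  */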
  return MIN (base_align, offset & -offset);
}

/* Prepare address and then do a load.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   ADDR is the address to be loaded.
   ORIG_ADDR is the original address expression.  */
static void
do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
			       rtx orig_addr)
{
  rtx mem = gen_rtx_MEM (mode, addr);
  MEM_COPY_ATTRIBUTES (mem, orig_addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));
  do_load_for_compare (dest, mem, mode);
  return;
}

/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.
   BR_PROB is the estimated branch probability for the branch.

   If CR is NULL_RTX, then a new register of CMPMODE is generated
   and used for the comparison.
   If A and B are both NULL_RTX, then CR must not be NULL_RTX, and the
   compare is not generated so you can use this with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
	   rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob)
{
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
	      || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, br_prob);
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}
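
/* A typical use, as in the cleanup code in expand_compare_loop below:

     do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
		final_label, profile_probability::unlikely ());

   which compares cmp_rem against zero in a fresh CCmode register and
   branches to final_label when they are equal.  */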

/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison rtx for the isel condition.
   SRC_T is the isel source if the condition is true.
   SRC_F is the isel source if the condition is false.
   CR is the condition register for the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
}

/* Emit a subtract of the proper mode for DEST.

   DEST is the destination register for the subtract.
   SRC1 is the first subtract input.
   SRC2 is the second subtract input.

   Computes DEST = SRC1-SRC2.  */
static void
do_sub3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_subdi3 (dest, src1, src2));
  else
    emit_insn (gen_subsi3 (dest, src1, src2));
}

/* Emit an add of the proper mode for DEST.

   DEST is the destination register for the add.
   SRC1 is the first add input.
   SRC2 is the second add input.

   Computes DEST = SRC1+SRC2.  */
static void
do_add3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_adddi3 (dest, src1, src2));
  else
    emit_insn (gen_addsi3 (dest, src1, src2));
}

/* Emit an and of the proper mode for DEST.

   DEST is the destination register for the and.
   SRC1 is the first and input.
   SRC2 is the second and input.

   Computes DEST = SRC1&SRC2.  */
static void
do_and3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_anddi3 (dest, src1, src2));
  else
    emit_insn (gen_andsi3 (dest, src1, src2));
}

/* Emit a cmpb of the proper mode for DEST.

   DEST is the destination register for the cmpb.
   SRC1 is the first input.
   SRC2 is the second input.

   Computes cmpb of SRC1, SRC2.  */
static void
do_cmpb3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_cmpbdi3 (dest, src1, src2));
  else
    emit_insn (gen_cmpbsi3 (dest, src1, src2));
}

/* Emit a rotl of the proper mode for DEST.

   DEST is the destination register for the rotate.
   SRC1 is the rotate input.
   SRC2 is the rotate count.

   Computes DEST = SRC1 rotated left by SRC2.  */
static void
do_rotl3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_rotldi3 (dest, src1, src2));
  else
    emit_insn (gen_rotlsi3 (dest, src1, src2));
}

/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
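
  /* shift_amount now holds load_mode_size - cmp_rem, the number of
     bytes loaded beyond the remainder; below it is scaled to a bit
     count and used to shift those unwanted low-order bytes off both
     loaded values.  */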

  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}

/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			HOST_WIDE_INT bytes_rem, rtx diff,
			rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
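  /* Back up the addresses by load_mode_size - bytes_rem so the load
     ends exactly at the end of the block; e.g. an 8-byte load with 3
     bytes remaining starts 5 bytes before the current address.  */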
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}

/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to compare?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
	if (minalign < 8)
	  max_bytes = 0;
	else
	  max_bytes = 128;
      else
	if (minalign < 8)
	  max_bytes = 32;
	else
	  max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
	max_bytes = 0;
      else
	if (minalign < 8)
	  max_bytes = 128;
	else
	  max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
      if (bytes_is_const)
	max_bytes = 191;
      else
	max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv1 */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv2 */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv1 */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv2 */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But we expect only to
     have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
	/* Do not expect length longer than word_mode.  */
	return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
	{
	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
	  bytes_rtx = force_reg (word_mode,
				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
						bytes_rtx));
	}
      else
	/* Make sure it's in a register before we get started.  */
	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* max iters and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);
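  /* E.g. for 64-bit code loop_bytes is 16 and l2lb is 4: a shift
     right by l2lb turns the length into the loop iteration count, and
     masking with -loop_bytes gives the bytes the loop will handle.  */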

  if (bytes_is_const && (max_bytes < load_mode_size
			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx_insn *j;

  /* Example of generated code for 35 bytes aligned 1 byte.

	     mtctr 8
	     li 6,0
	     li 5,8
     .L13:
	     ldbrx 7,3,6
	     ldbrx 9,10,6
	     ldbrx 0,3,5
	     ldbrx 4,10,5
	     addi 6,6,16
	     addi 5,5,16
	     subfc. 9,9,7
	     bne 0,.L10
	     subfc. 9,4,0
	     bdnzt 2,.L13
	     bne 0,.L10
	     add 3,3,6
	     add 10,10,6
	     addi 9,3,-5
	     ldbrx 7,0,9
	     addi 9,10,-5
	     ldbrx 9,0,9
	     subfc 9,9,7
	     .p2align 4,,15
     .L10:
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
	call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
	remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
	load 8B, shift off bytes past length, compare
	load 8B ending at last byte and compare
	load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call
  */

  /* If bytes is not const, compare length and branch directly
     to the cleanup code that can handle 0-16 bytes if length
     is >= 16.  Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
	{
	  no_remainder_code = true;
	  niter = max_loop_iter;
	  library_call_label = gen_label_rtx ();
	}
      else
	{
	  niter = bytes / loop_bytes;
	}
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
	 possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
		 NULL_RTX, library_call_label, profile_probability::even ());

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, cleanup_label, profile_probability::even ());

      /* Loop compare bytes and iterations if bytes>max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
	{
	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
	}
      else
	{
	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
	}

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
	{
	  /* P9 has fast isel so we use one compare and two isel.  */
	  cr = gen_reg_rtx (CCmode);
	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
					     GEN_INT (max_bytes));
	  emit_move_insn (cr, compare_rtx);
	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
	}
      else
	{
	  rtx lab_after = gen_label_rtx ();
	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
		     NULL_RTX, lab_after, profile_probability::even ());
	  emit_move_insn (loop_cmp, mb_reg);
	  emit_move_insn (iter, mi_reg);
	  emit_label (lab_after);
	}

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
  /* For P9 we need just one of these, since multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if the remaining byte
	 count is < 16 in the runtime case either.  Compute number of
	 loop iterations.  We compare 2*word_mode per iteration so 16B
	 for 64-bit code and 8B for 32-bit.  Set up two induction
	 variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
	 pseudo, cse will get rid of it and then the allocator will
	 see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));
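
      /* iv1 indexes the first word compared each iteration and iv2
	 the second; both advance by loop_bytes (2 * load_mode_size)
	 per iteration.  */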

      /* Inner loop to compare 2*word_mode.  */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
				     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
				     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label, profile_probability::unlikely ());

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	}

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      else
	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      add_reg_br_prob_note (j, profile_probability::likely ());
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
	       diff_label, profile_probability::unlikely ());
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
	       diff_label, profile_probability::unlikely ());

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
	 have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder, and if we are here then diff is 0, so just
	 return 0.  */
      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
	{
	  /* If we're dealing with runtime length, we have to check if
	     it's zero after the loop.  When length is known at compile
	     time the no-remainder condition is dealt with above.  By
	     doing this after cleanup_label, we also deal with the
	     case where length is 0 at the start and we bypass the
	     loop with a branch to cleanup_label.  */
	  emit_move_insn (target, const0_rtx);
	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		     NULL_RTX, final_label, profile_probability::unlikely ());
	}

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
	{
	  /* If remainder length < word length, branch to final
	     cleanup compare.  */

	  if (!bytes_is_const)
	    {
	      do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
			 NULL_RTX, final_cleanup, profile_probability::even ());
	    }

	  /* Load and compare 8B.  */
	  do_load_for_compare_from_addr (load_mode, d1_1,
					 src1_addr, orig_src1);
	  do_load_for_compare_from_addr (load_mode, d2_1,
					 src2_addr, orig_src2);

	  /* Compare the word, see if we need to do the last partial.  */
	  if (TARGET_P9_MISC)
	    {
	      /* Generate a compare, and convert with a setb later.  */
	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	      emit_insn (gen_rtx_SET (dcond, cmp));
	    }
	  else
	    {
	      dcond = gen_reg_rtx (CCmode);
	      if (word_mode == DImode)
		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	      else
		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	    }

	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		     dcond, diff_label, profile_probability::even ());

	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
	  emit_move_insn (cmp_rem_before, cmp_rem);
	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
	  if (bytes_is_const)
	    bytes_remaining -= load_mode_size;
	  else
	    /* See if remaining length is now zero.  We previously set
	       target to 0 so we can just jump to the end.  */
	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
		       final_label, profile_probability::unlikely ());
	}

      /* Cases:
	 bytes_is_const
	   We can always shift back to do an overlapping compare
	   of the last chunk because we know length >= 8.

	 !bytes_is_const
	   align>=load_mode_size
	     Read word_mode and mask
	   align<load_mode_size
	     avoid stepping past end

	  Three strategies:
	  * decrement address and do overlapping compare
	  * read word_mode and mask
	  * carefully avoid crossing 4k boundary
       */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
	  && align1 >= load_mode_size && align2 >= load_mode_size)
	{
	  /* Alignment is larger than word_mode so we do not need to be
	     concerned with extra page crossings.  But, we do not know
	     that the length is larger than load_mode_size so we might
	     end up comparing against data before the block if we try
	     an overlapping compare.  Also we use this on P7 for fixed length
	     remainder because P7 doesn't like overlapping unaligned.
	     Strategy: load 8B, shift off bytes past length, and compare.  */
	  emit_label (final_cleanup);
	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);
	}
      else if (bytes_remaining && bytes_is_const)
	{
	  /* We do not do loop expand if length < 32 so we know at the
	     end we can do an overlapping compare.
	     Strategy: shift address back and do word_mode load that
	     ends at the end of the block.  */
	  emit_label (final_cleanup);
	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);
	}
      else if (!bytes_is_const)
	{
	  rtx handle4k_label = gen_label_rtx ();
	  rtx nonconst_overlap = gen_label_rtx ();
	  emit_label (nonconst_overlap);

	  /* Here we have to handle the case where we have runtime
	     length which may be too short for overlap compare, and
	     alignment is not at least load_mode_size so we have to
	     tread carefully to avoid stepping across 4k boundaries.  */

	  /* If the length after the loop was larger than word_mode
	     size, we can just do an overlapping compare and we're
	     done.  We fall through to this code from the word_mode
	     compare that precedes this.  */
	  do_overlap_load_compare (load_mode, false, 0, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);

	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* If we couldn't do the overlap compare we have to be more
	     careful of the 4k boundary.  Test to see if either
	     address is less than word_mode_size away from a 4k
	     boundary.  If not, then we can do a load/shift/compare
	     and we are done.  We come to this code if length was less
	     than word_mode_size.  */

	  emit_label (final_cleanup);

	  /* We can still avoid the slow case if the length was larger
	     than one loop iteration, in which case go do the overlap
	     load compare path.  */
	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
		     NULL_RTX, nonconst_overlap, profile_probability::even ());

	  rtx rem4k = gen_reg_rtx (word_mode);
	  rtx dist1 = gen_reg_rtx (word_mode);
	  rtx dist2 = gen_reg_rtx (word_mode);
	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
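	  /* dist <= rem4k means (addr & 0xfff) + cmp_rem <= 4096, i.e.
	     the block ends at or before the next 4k boundary, in which
	     case a full word_mode load could touch a page the block
	     does not; branch to the byte-at-a-time path for that.  */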
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
		     handle4k_label, profile_probability::very_unlikely ());
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
		     handle4k_label, profile_probability::very_unlikely ());

	  /* We don't have a 4k boundary to deal with, so do
	     a load/shift/compare and jump to diff.  */

	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);

	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* Finally in the unlikely case we are inching up to a
	     4k boundary we use a compact lbzx/compare loop to do
	     it a byte at a time.  */

	  emit_label (handle4k_label);

	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
	  emit_move_insn (ctr, cmp_rem);
	  rtx ixreg = gen_reg_rtx (Pmode);
	  emit_move_insn (ixreg, const0_rtx);

	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
	  rtx d1 = gen_reg_rtx (word_mode);
	  rtx d2 = gen_reg_rtx (word_mode);

	  rtx fc_loop = gen_label_rtx ();
	  emit_label (fc_loop);

	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

	  do_add3 (ixreg, ixreg, const1_rtx);

	  rtx cond = gen_reg_rtx (CCmode);
	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);

	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
	  if (TARGET_64BIT)
	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  else
	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  add_reg_br_prob_note (j, profile_probability::likely ());
	  JUMP_LABEL (j) = fc_loop;
	  LABEL_NUSES (fc_loop) += 1;

	  if (TARGET_64BIT)
	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	  else
	    emit_move_insn (target, diff);

	  /* Since we are comparing bytes, the difference can be used
	     as the final result and we are done here.  */
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
	  JUMP_LABEL (j) = final_label;
	  LABEL_NUSES (final_label) += 1;
	  emit_barrier ();
	}
    }

  emit_label (diff_label);
  /* Difference handling, 64->32 conversion.  */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works
     (the code below uses an equivalent "or L,L,H" in place of the
     rldimi):
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
	{
	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntddi2 (diff, diff));
	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	}
      else
	{
	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntdsi2 (diff, diff));
	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
	}
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  cmp_rem is the remaining bytes that
	 were not compared by the loop and is the expected amount to be
	 compared by memcmp.  If we don't find a difference in the loop
	 compare, do the library call directly instead of doing a small
	 compare just to get to an arbitrary boundary before calling it
	 anyway.  Also, update addresses to point to the next word to
	 examine.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
	{
	  emit_move_insn (len_rtx, cmp_rem);
	  do_add3 (src1_addr, src1_addr, iv1);
	  do_add3 (src2_addr, src2_addr, iv1);
	}
      else
	emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			       target, LCT_NORMAL, GET_MODE (target),
			       src1_addr, Pmode,
			       src2_addr, Pmode,
			       len_rtx, GET_MODE (len_rtx));
    }

  /* Emit final_label.  */
  emit_label (final_label);
  return true;
}
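
/* Note that expand_compare_loop above is also used by
   expand_block_compare below, both when the length is not a
   compile-time constant and when a constant length is outside the
   limit for the unrolled straight-line expansion.  */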

/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to compare?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);
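  /* Presumably because without ldbrx the bswapdi2 needed by
     do_load_for_compare would expand to a long multi-insn sequence.  */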

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
	  || (base_align == 2 && bytes > 32)))
    return false;

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
             ldbrx 10,31,8
             ldbrx 9,7,8
             subfc. 9,9,10
             bne 0,.L6487
             addi 9,12,8
             addi 5,11,8
             ldbrx 10,0,9
             ldbrx 9,0,5
             subfc. 9,9,10
             bne 0,.L6487
             addi 9,12,16
             lhbrx 10,0,9
             addi 9,11,16
             lhbrx 9,0,9
             subf 9,9,10
             b .L6488
             .p2align 4,,15
     .L6487: #convert_label
             popcntd 9,9
             subfe 10,10,10
             or 9,9,10
     .L6488: #final_label
             extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
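	  /* E.g. an 8-byte load with cmp_bytes of 5 shifts right by 24
	     bits.  The loads are big-endian ordered (bswapped on LE),
	     so the 3 bytes past the compare remainder are the
	     low-order bytes and are discarded.  */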
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
	{
	  /* Target is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that needed 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
1531 	  if (generate_6432_conversion && !final_label)
1532 	    final_label = gen_label_rtx ();
1533 
	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      emit_insn (gen_movsi (target,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::unlikely ());
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (target,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL (j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block? We need the 64->32
	     conversion even if target size == load_mode size because
	     the subtract generates one extra bit.  */
	  generate_6432_conversion = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		  /* Generate a compare, and convert with a setb later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		/* Generate a subfc. and use the longer
		   sequence for conversion.  */
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
		else
		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::likely ());
	      JUMP_LABEL (j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
	emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
	 while maintaining <0 / ==0 / >0 properties.  This sequence works:
	 subfc L,A,B
	 subfe H,H,H
	 popcntd L,L
	 rldimi L,H,6,0

	 This is an alternate one Segher cooked up if somebody
	 wants to expand this for something that doesn't have popcntd:
	 subfc L,a,b
	 subfe H,x,x
	 addic t,L,-1
	 subfe v,t,L
	 or z,v,H

	 And finally, p9 can just do this:
	 cmpld A,B
	 setb r */

      if (TARGET_P9_MISC)
	{
	  emit_insn (gen_setb_unsigned (target, cond));
	}
      else
	{
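	  /* The carry from the final subfc above is still live here.
	     subfe CA,CA,CA yields 0 if there was no borrow and -1 if
	     src1's chunk was unsigned-less-than src2's; popcntd of the
	     difference is zero iff the chunks were equal, otherwise
	     positive.  OR'ing the two gives the <0 / ==0 / >0 result.  */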
	  if (TARGET_64BIT)
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
	    }
	  else
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
	    }
	}
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}

/* Generate alignment check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch to if there is a page crossing.
   SRC is the string pointer to be examined.
   BYTES is the max number of bytes to compare.  */
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  rtx src_check = copy_addr_to_reg (XEXP (src, 0));
  if (GET_MODE (src_check) == SImode)
    emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
  else
    emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
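  /* Branch if (SRC & 0xfff) >= 4096 - BYTES, i.e. if an access of
     BYTES bytes at SRC could cross into the next 4K page.  */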
  rtx cond = gen_reg_rtx (CCmode);
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
					 GEN_INT (4096 - bytes)));

  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
				     lab_ref, pc_rtx);
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, profile_probability::unlikely ());
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}

/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be extended to handle a non-constant
     length.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
  if (no_length)
    /* Use this as a stand-in to determine the mode to use.  */
    bytes = rs6000_string_compare_inline_limit * word_mode_size;
  else
    bytes = UINTVAL (bytes_rtx);

  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
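  /* The inline limit counts load_mode-sized chunks, so scale it by
     the chunk size to get the number of bytes to compare inline.  */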
  compare_length = rs6000_string_compare_inline_limit * load_mode_size;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
	compare_length = bytes;
      else
	equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < 8)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
	 rldicl r8,r3,0,52
	 cmpldi cr7,r8,4096-16
	 bgt    cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
	 the subsequent code generation are in agreement so we do not
	 go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < 8)
	{
	  align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
	  base_align = align_test;
	}
      else
	{
	  align_test = ROUND_UP (align_test, 8);
	  base_align = 8;
	}

      if (align1 < 8)
	expand_strncmp_align_check (strncmp_label, src1, align_test);
      if (align2 < 8)
	expand_strncmp_align_check (strncmp_label, src2, align_test);

      /* Now generate the following sequence:
	 - branch to begin_compare
	 - strncmp_label
	 - call to strncmp
	 - branch to final_label
	 - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}

      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
	}
      else
	{
	  /* -m32 -mpowerpc64 results in word_mode being DImode even
	     though otherwise it is 32-bit.  The length arg to strncmp
	     is a size_t which will be the same size as pointers.  */
	  rtx len_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));

	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
				   len_rtx, Pmode);
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);

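  /* Get register copies of both base addresses; every chunk below is
     addressed as base + current offset.  */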
  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));

  /* Generate sequence of ld/ldbrx, cmpb to compare out
     to the length specified.  */
  unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
  while (bytes_to_compare > 0)
    {
      /* Compare sequence:
	 check each 8B with: ld/ld/cmpb/cmpb/orc./bne

	 cleanup code at end:
	 cntlzd        get bit of first zero/diff byte
	 subfic        convert for rldcl use
	 rldcl rldcl   extract diff/zero byte
	 subf          subtract for final result

	 The last compare can branch around the cleanup code if the
	 result is zero because the strings are exactly equal.  */
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes_to_compare, align,
					       word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes_to_compare = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes_to_compare;

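      /* The byte-reversed loads normally used on little-endian
	 (l[hwd]brx) exist only in indexed form, so the offset must be
	 placed in a register in that case; otherwise a constant
	 displacement works.  */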
      rtx offset_rtx;
      if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
	offset_rtx = GEN_INT (offset);
      else
	{
	  offset_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (offset_rtx, GEN_INT (offset));
	}
      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);

      /* We must always left-align the data we read, and
	 clear any bytes to the right that are beyond the string.
	 Otherwise the cmpb sequence won't produce the correct
	 results.  However if there is only one byte left, we
	 can just subtract to get the final result so the shifts
	 and clears are not needed.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      /* Loading just a single byte is a special case.  If we are
	 loading more than that, we have to check whether we are
	 looking at the entire chunk of data.  If not, rotate left and
	 clear right so that bytes we aren't supposed to look at are
	 zeroed, and the first byte we are supposed to compare is
	 leftmost.  */

      if (load_mode_size != 1)
	{
	  if (load_mode_size < word_mode_size)
	    {
	      /* Rotate left first.  */
	      rtx sh = GEN_INT (BITS_PER_UNIT
				* (word_mode_size - load_mode_size));
	      do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
	      do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
	    }

	  if (cmp_bytes < word_mode_size)
	    {
	      /* Now clear right.  This plus the rotate can be
		 turned into a rldicr instruction.  */
	      HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
	      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
	      do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
	      do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
	    }
	}

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not the end of the comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, next block if not found.
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, call strcmp/strncmp.
	 3: Compared the requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
	{
	  /* Branch to cleanup code, otherwise fall through to do
	     more compares.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	}
      else
	/* Branch to end and produce result of 0.  */
	dst_label = final_move_label;

      if (load_mode_size == 1)
	{
	  /* Special case for comparing just a single byte.  */
	  if (equality_compare_rest)
	    {
	      /* Use subf./bne to branch to final_move_label if the
		 byte differs, otherwise fall through to the strncmp
		 call.  We must also check for a zero byte here as we
		 must not make the library call if this is the end of
		 the string.  */

	      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx cond = gen_reg_rtx (CCmode);
	      rtx diff_rtx = gen_rtx_MINUS (word_mode,
					    tmp_reg_src1, tmp_reg_src2);
	      rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
	      rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
						 lab_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::unlikely ());
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;

	      /* Check for zero byte here before fall through to
		 library call.  This catches the case where the
		 strings are equal and end in a zero byte at this
		 position.  */

	      rtx cond0 = gen_reg_rtx (CCmode);
	      emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
						      const0_rtx));

	      rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);

	      rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
						  lab_ref, pc_rtx);
	      rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
	      add_reg_br_prob_note (j0, profile_probability::unlikely ());
	      JUMP_LABEL (j0) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	    }
	  else
	    {
	      /* This is the last byte to be compared so we can use
		 subf to compute the final result and branch
		 unconditionally to final_move_label.  */

	      do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	      emit_barrier ();
	    }
	}
      else
	{
	  rtx cmpb_zero = gen_reg_rtx (word_mode);
	  rtx cmpb_diff = gen_reg_rtx (word_mode);
	  rtx zero_reg = gen_reg_rtx (word_mode);
	  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
	  rtx cond = gen_reg_rtx (CCmode);

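	  /* cmpb sets 0xff in each byte position where its operands'
	     bytes are equal: cmpb_diff marks the positions where the
	     two chunks match, and cmpb_zero marks zero bytes in src1.
	     NOT cmpb_diff IOR cmpb_zero is therefore nonzero iff some
	     byte differs or the string ends within this chunk.  */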
	  emit_move_insn (zero_reg, GEN_INT (0));
	  do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
	  do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
	  rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
	  rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

	  rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

	  rtx cmp_rtx;
	  if (remain == 0 && !equality_compare_rest)
	    cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					     lab_ref, pc_rtx);
	  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	  add_reg_br_prob_note (j, profile_probability::unlikely ());
	  JUMP_LABEL (j) = dst_label;
	  LABEL_NUSES (dst_label) += 1;
	}

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
	}
      else
	{
	  rtx len_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
				   len_rtx, Pmode);
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  /* Generate the final sequence that identifies the differing
     byte and generates the final result, taking into account
     zero bytes:

     cntlzd            get bit of first zero/diff byte
     addi              convert for rldcl use
     rldcl rldcl       extract diff/zero byte
     subf              subtract for final result
  */

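  /* cntlzd of the orc. result gives the bit index of the first 0xff
     marker byte; adding 8 turns that into a left-rotate count that
     brings the differing (or zero) byte into the low-order byte of
     each operand, where it is masked out and subtracted.  */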
  rtx rot_amt = gen_reg_rtx (word_mode);

  rtx rot1_1 = gen_reg_rtx (word_mode);
  rtx rot1_2 = gen_reg_rtx (word_mode);
  rtx rot2_1 = gen_reg_rtx (word_mode);
  rtx rot2_2 = gen_reg_rtx (word_mode);

  if (word_mode == SImode)
    {
      emit_insn (gen_clzsi2 (rot_amt, result_reg));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
    }
  else
    {
      emit_insn (gen_clzdi2 (rot_amt, result_reg));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
    }

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
			gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}

/* Generate loads and stores for a move of v4si mode using lvx/stvx.
   This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
   keep combine from changing what instruction gets used.

   DEST is the destination for the data.
   SRC is the source of the data for the move.  */

static rtx
gen_lvx_v4si_move (rtx dest, rtx src)
{
  gcc_assert (MEM_P (dest) ^ MEM_P (src));
  gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);

  if (MEM_P (dest))
    return gen_altivec_stvx_v4si_internal (dest, src);
  else
    return gen_altivec_lvx_v4si_internal (dest, src);
}

/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

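/* Maximum number of load/store pairs batched together; loads are
   emitted as they are generated, and the queued stores are flushed
   after each group of MAX_MOVE_REG moves (see below).  */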
#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src	= operands[1];
  rtx bytes_rtx	= operands[2];
  rtx align_rtx = operands[3];
  int constp	= (GET_CODE (bytes_rtx) == CONST_INT);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
	rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
	 when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
	{
	  move_bytes = 16;
	  mode = V4SImode;
	  gen_func.mov = gen_lvx_v4si_move;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  move_bytes = 8;
	  mode = DImode;
	  gen_func.mov = gen_movdi;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per load and/or store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	      addr = XEXP (orig_src, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_src = replace_equiv_address (orig_src, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  move_bytes = 4;
	  mode = SImode;
	  gen_func.mov = gen_movsi;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  move_bytes = 2;
	  mode = HImode;
	  gen_func.mov = gen_movhi;
	}
      else /* move 1 byte at a time */
	{
	  move_bytes = 1;
	  mode = QImode;
	  gen_func.mov = gen_movqi;
	}

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

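      /* Emit each load immediately but queue its store; the queue is
	 flushed below after MAX_MOVE_REG moves (or at the end), so a
	 group of loads issues before any of its stores, giving the
	 scheduler more freedom.  */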
      if (mode != BLKmode)
	{
	  rtx tmp_reg = gen_reg_rtx (mode);

	  emit_insn ((*gen_func.mov) (tmp_reg, src));
	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
	}

      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}

      if (mode == BLKmode)
	{
	  /* Move the address into scratch registers.  The movmemsi
	     patterns require zero offset.  */
	  if (!REG_P (XEXP (src, 0)))
	    {
	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
	      src = replace_equiv_address (src, src_reg);
	    }
	  set_mem_size (src, move_bytes);

	  if (!REG_P (XEXP (dest, 0)))
	    {
	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
	      dest = replace_equiv_address (dest, dest_reg);
	    }
	  set_mem_size (dest, move_bytes);

	  emit_insn ((*gen_func.movmemsi) (dest, src,
					   GEN_INT (move_bytes & 31),
					   align_rtx));
	}
    }

  return 1;
}