1 /* Subroutines used to expand string and block move, clear,
2    compare and other operations for PowerPC.
3    Copyright (C) 1991-2021 Free Software Foundation, Inc.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published
9    by the Free Software Foundation; either version 3, or (at your
10    option) any later version.
11 
12    GCC is distributed in the hope that it will be useful, but WITHOUT
13    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15    License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "tm_p.h"
31 #include "ira.h"
32 #include "print-tree.h"
33 #include "varasm.h"
34 #include "explow.h"
35 #include "expr.h"
36 #include "output.h"
37 #include "target.h"
38 #include "profile-count.h"
39 #include "predict.h"
40 
41 /* Expand a block clear operation, and return 1 if successful.  Return 0
42    if we should let the compiler generate normal code.
43 
44    operands[0] is the destination
45    operands[1] is the length
46    operands[3] is the alignment */
47 
48 int
expand_block_clear(rtx operands[])49 expand_block_clear (rtx operands[])
50 {
51   rtx orig_dest = operands[0];
52   rtx bytes_rtx	= operands[1];
53   rtx align_rtx = operands[3];
54   bool constp	= CONST_INT_P (bytes_rtx);
55   HOST_WIDE_INT align;
56   HOST_WIDE_INT bytes;
57   int offset;
58   int clear_bytes;
59   int clear_step;
60 
61   /* If this is not a fixed size move, just call memcpy */
62   if (! constp)
63     return 0;
64 
65   /* This must be a fixed size alignment  */
66   gcc_assert (CONST_INT_P (align_rtx));
67   align = INTVAL (align_rtx) * BITS_PER_UNIT;
68 
69   /* Anything to clear? */
70   bytes = INTVAL (bytes_rtx);
71   if (bytes <= 0)
72     return 1;
73 
74   /* Use the builtin memset after a point, to avoid huge code bloat.
75      When optimize_size, avoid any significant code bloat; calling
76      memset is about 4 instructions, so allow for one instruction to
77      load zero and three to do clearing.  */
78   if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
79     clear_step = 16;
80   else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
81     clear_step = 8;
82   else
83     clear_step = 4;
84 
85   if (optimize_size && bytes > 3 * clear_step)
86     return 0;
87   if (! optimize_size && bytes > 8 * clear_step)
88     return 0;
89 
90   bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
91 
92   for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
93     {
94       machine_mode mode = BLKmode;
95       rtx dest;
96 
97       if (TARGET_ALTIVEC
98 	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
99 	{
100 	  clear_bytes = 16;
101 	  mode = V4SImode;
102 	}
103       else if (bytes >= 8 && TARGET_POWERPC64
104 	       && (align >= 64 || !STRICT_ALIGNMENT))
105 	{
106 	  clear_bytes = 8;
107 	  mode = DImode;
108 	  if (offset == 0 && align < 64)
109 	    {
110 	      rtx addr;
111 
112 	      /* If the address form is reg+offset with offset not a
113 		 multiple of four, reload into reg indirect form here
114 		 rather than waiting for reload.  This way we get one
115 		 reload, not one per store.  */
116 	      addr = XEXP (orig_dest, 0);
117 	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
118 		  && CONST_INT_P (XEXP (addr, 1))
119 		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
120 		{
121 		  addr = copy_addr_to_reg (addr);
122 		  orig_dest = replace_equiv_address (orig_dest, addr);
123 		}
124 	    }
125 	}
126       else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
127 	{			/* move 4 bytes */
128 	  clear_bytes = 4;
129 	  mode = SImode;
130 	}
131       else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
132 	{			/* move 2 bytes */
133 	  clear_bytes = 2;
134 	  mode = HImode;
135 	}
136       else /* move 1 byte at a time */
137 	{
138 	  clear_bytes = 1;
139 	  mode = QImode;
140 	}
141 
142       dest = adjust_address (orig_dest, mode, offset);
143 
144       emit_move_insn (dest, CONST0_RTX (mode));
145     }
146 
147   return 1;
148 }
149 
150 /* Figure out the correct instructions to generate to load data for
151    block compare.  MODE is used for the read from memory, and
152    data is zero extended if REG is wider than MODE.  If LE code
153    is being generated, bswap loads are used.
154 
155    REG is the destination register to move the data into.
156    MEM is the memory block being read.
157    MODE is the mode of memory to use for the read.  */
158 static void
do_load_for_compare(rtx reg,rtx mem,machine_mode mode)159 do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
160 {
161   switch (GET_MODE (reg))
162     {
163     case E_V16QImode:
164       switch (mode)
165 	{
166 	case E_V16QImode:
167 	  if (!BYTES_BIG_ENDIAN)
168 	    {
169 	      if (TARGET_P9_VECTOR)
170 		emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
171 	      else
172 		{
173 		  rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
174 						      V16QImode, 0);
175 		  gcc_assert (MEM_P (mem));
176 		  rtx addr = XEXP (mem, 0);
177 		  rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
178 		  MEM_COPY_ATTRIBUTES (mem_v2di, mem);
179 		  set_mem_size (mem, GET_MODE_SIZE (V2DImode));
180 		  emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
181 		}
182 	    }
183 	  else
184 	    emit_insn (gen_vsx_movv2di_64bit (reg, mem));
185 	  break;
186 	default:
187 	  gcc_unreachable ();
188 	}
189       break;
190     case E_DImode:
191       switch (mode)
192 	{
193 	case E_QImode:
194 	  emit_insn (gen_zero_extendqidi2 (reg, mem));
195 	  break;
196 	case E_HImode:
197 	  {
198 	    rtx src = mem;
199 	    if (!BYTES_BIG_ENDIAN)
200 	      {
201 		src = gen_reg_rtx (HImode);
202 		emit_insn (gen_bswaphi2 (src, mem));
203 	      }
204 	    emit_insn (gen_zero_extendhidi2 (reg, src));
205 	    break;
206 	  }
207 	case E_SImode:
208 	  {
209 	    rtx src = mem;
210 	    if (!BYTES_BIG_ENDIAN)
211 	      {
212 		src = gen_reg_rtx (SImode);
213 		emit_insn (gen_bswapsi2 (src, mem));
214 	      }
215 	    emit_insn (gen_zero_extendsidi2 (reg, src));
216 	  }
217 	  break;
218 	case E_DImode:
219 	  if (!BYTES_BIG_ENDIAN)
220 	    emit_insn (gen_bswapdi2 (reg, mem));
221 	  else
222 	    emit_insn (gen_movdi (reg, mem));
223 	  break;
224 	default:
225 	  gcc_unreachable ();
226 	}
227       break;
228 
229     case E_SImode:
230       switch (mode)
231 	{
232 	case E_QImode:
233 	  emit_insn (gen_zero_extendqisi2 (reg, mem));
234 	  break;
235 	case E_HImode:
236 	  {
237 	    rtx src = mem;
238 	    if (!BYTES_BIG_ENDIAN)
239 	      {
240 		src = gen_reg_rtx (HImode);
241 		emit_insn (gen_bswaphi2 (src, mem));
242 	      }
243 	    emit_insn (gen_zero_extendhisi2 (reg, src));
244 	    break;
245 	  }
246 	case E_SImode:
247 	  if (!BYTES_BIG_ENDIAN)
248 	    emit_insn (gen_bswapsi2 (reg, mem));
249 	  else
250 	    emit_insn (gen_movsi (reg, mem));
251 	  break;
252 	case E_DImode:
253 	  /* DImode is larger than the destination reg so is not expected.  */
254 	  gcc_unreachable ();
255 	  break;
256 	default:
257 	  gcc_unreachable ();
258 	}
259       break;
260 
261     case E_QImode:
262       gcc_assert (mode == E_QImode);
263       emit_move_insn (reg, mem);
264       break;
265 
266     default:
267       gcc_unreachable ();
268       break;
269     }
270 }
271 
272 /* Select the mode to be used for reading the next chunk of bytes
273    in the compare.
274 
275    OFFSET is the current read offset from the beginning of the block.
276    BYTES is the number of bytes remaining to be read.
277    ALIGN is the minimum alignment of the memory blocks being compared in bytes.  */
278 static machine_mode
select_block_compare_mode(unsigned HOST_WIDE_INT offset,unsigned HOST_WIDE_INT bytes,unsigned HOST_WIDE_INT align)279 select_block_compare_mode (unsigned HOST_WIDE_INT offset,
280 			   unsigned HOST_WIDE_INT bytes,
281 			   unsigned HOST_WIDE_INT align)
282 {
283   /* First see if we can do a whole load unit
284      as that will be more efficient than a larger load + shift.  */
285 
286   /* If big, use biggest chunk.
287      If exactly chunk size, use that size.
288      If remainder can be done in one piece with shifting, do that.
289      Do largest chunk possible without violating alignment rules.  */
290 
291   /* The most we can read without potential page crossing.  */
292   unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
293 
294   /* If we have an LE target without ldbrx and word_mode is DImode,
295      then we must avoid using word_mode.  */
296   int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
297 		       && word_mode == DImode);
298 
299   if (word_mode_ok && bytes >= UNITS_PER_WORD)
300     return word_mode;
301   else if (bytes == GET_MODE_SIZE (SImode))
302     return SImode;
303   else if (bytes == GET_MODE_SIZE (HImode))
304     return HImode;
305   else if (bytes == GET_MODE_SIZE (QImode))
306     return QImode;
307   else if (bytes < GET_MODE_SIZE (SImode)
308 	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
309 	   && offset >= GET_MODE_SIZE (SImode) - bytes)
310     /* This matches the case were we have SImode and 3 bytes
311        and offset >= 1 and permits us to move back one and overlap
312        with the previous read, thus avoiding having to shift
313        unwanted bytes off of the input.  */
314     return SImode;
315   else if (word_mode_ok && bytes < UNITS_PER_WORD
316 	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
317 	   && offset >= UNITS_PER_WORD-bytes)
318     /* Similarly, if we can use DImode it will get matched here and
319        can do an overlapping read that ends at the end of the block.  */
320     return word_mode;
321   else if (word_mode_ok && maxread >= UNITS_PER_WORD)
322     /* It is safe to do all remaining in one load of largest size,
323        possibly with a shift to get rid of unwanted bytes.  */
324     return word_mode;
325   else if (maxread >= GET_MODE_SIZE (SImode))
326     /* It is safe to do all remaining in one SImode load,
327        possibly with a shift to get rid of unwanted bytes.  */
328     return SImode;
329   else if (bytes > GET_MODE_SIZE (SImode))
330     return SImode;
331   else if (bytes > GET_MODE_SIZE (HImode))
332     return HImode;
333 
334   /* final fallback is do one byte */
335   return QImode;
336 }
337 
338 /* Compute the alignment of pointer+OFFSET where the original alignment
339    of pointer was BASE_ALIGN.  */
340 static unsigned HOST_WIDE_INT
compute_current_alignment(unsigned HOST_WIDE_INT base_align,unsigned HOST_WIDE_INT offset)341 compute_current_alignment (unsigned HOST_WIDE_INT base_align,
342 			   unsigned HOST_WIDE_INT offset)
343 {
344   if (offset == 0)
345     return base_align;
346   return MIN (base_align, offset & -offset);
347 }
348 
349 /* Prepare address and then do a load.
350 
351    MODE is the mode to use for the load.
352    DEST is the destination register for the data.
353    ADDR is the address to be loaded.
354    ORIG_ADDR is the original address expression.  */
355 static void
do_load_for_compare_from_addr(machine_mode mode,rtx dest,rtx addr,rtx orig_addr)356 do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
357 			       rtx orig_addr)
358 {
359   rtx mem = gen_rtx_MEM (mode, addr);
360   MEM_COPY_ATTRIBUTES (mem, orig_addr);
361   set_mem_size (mem, GET_MODE_SIZE (mode));
362   do_load_for_compare (dest, mem, mode);
363   return;
364 }
365 
366 /* Do a branch for an if/else decision.
367 
368    CMPMODE is the mode to use for the comparison.
369    COMPARISON is the rtx code for the compare needed.
370    A is the first thing to be compared.
371    B is the second thing to be compared.
372    CR is the condition code reg input, or NULL_RTX.
373    TRUE_LABEL is the label to branch to if the condition is true.
374    P is the estimated branch probability for the branch.
375 
376    The return value is the CR used for the comparison.
377    If CR is null_rtx, then a new register of CMPMODE is generated.
378    If A and B are both null_rtx, then CR must not be null, and the
379    compare is not generated so you can use this with a dot form insn.  */
380 
381 static void
do_ifelse(machine_mode cmpmode,rtx_code comparison,rtx a,rtx b,rtx cr,rtx true_label,profile_probability br_prob)382 do_ifelse (machine_mode cmpmode, rtx_code comparison,
383 	   rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob)
384 {
385   gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
386 	      || (a != NULL_RTX && b != NULL_RTX));
387 
388   if (cr != NULL_RTX)
389     gcc_assert (GET_MODE (cr) == cmpmode);
390   else
391     cr = gen_reg_rtx (cmpmode);
392 
393   rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
394 
395   if (a != NULL_RTX)
396     emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
397 
398   rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
399 
400   rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
401   rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
402   add_reg_br_prob_note (j, br_prob);
403   JUMP_LABEL (j) = true_label;
404   LABEL_NUSES (true_label) += 1;
405 }
406 
407 /* Emit an isel of the proper mode for DEST.
408 
409    DEST is the isel destination register.
410    SRC1 is the isel source if CR is true.
411    SRC2 is the isel source if CR is false.
412    CR is the condition for the isel.  */
413 static void
do_isel(rtx dest,rtx cmp,rtx src_t,rtx src_f,rtx cr)414 do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
415 {
416   if (GET_MODE (dest) == DImode)
417     emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
418   else
419     emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
420 }
421 
422 /* Emit a subtract of the proper mode for DEST.
423 
424    DEST is the destination register for the subtract.
425    SRC1 is the first subtract input.
426    SRC2 is the second subtract input.
427 
428    Computes DEST = SRC1-SRC2.  */
429 static void
do_sub3(rtx dest,rtx src1,rtx src2)430 do_sub3 (rtx dest, rtx src1, rtx src2)
431 {
432   if (GET_MODE (dest) == DImode)
433     emit_insn (gen_subdi3 (dest, src1, src2));
434   else
435     emit_insn (gen_subsi3 (dest, src1, src2));
436 }
437 
438 /* Emit an add of the proper mode for DEST.
439 
440    DEST is the destination register for the add.
441    SRC1 is the first add input.
442    SRC2 is the second add input.
443 
444    Computes DEST = SRC1+SRC2.  */
445 static void
do_add3(rtx dest,rtx src1,rtx src2)446 do_add3 (rtx dest, rtx src1, rtx src2)
447 {
448   if (GET_MODE (dest) == DImode)
449     emit_insn (gen_adddi3 (dest, src1, src2));
450   else
451     emit_insn (gen_addsi3 (dest, src1, src2));
452 }
453 
454 /* Emit an and of the proper mode for DEST.
455 
456    DEST is the destination register for the and.
457    SRC1 is the first and input.
458    SRC2 is the second and input.
459 
460    Computes DEST = SRC1&SRC2.  */
461 static void
do_and3(rtx dest,rtx src1,rtx src2)462 do_and3 (rtx dest, rtx src1, rtx src2)
463 {
464   if (GET_MODE (dest) == DImode)
465     emit_insn (gen_anddi3 (dest, src1, src2));
466   else
467     emit_insn (gen_andsi3 (dest, src1, src2));
468 }
469 
470 /* Emit an cmpb of the proper mode for DEST.
471 
472    DEST is the destination register for the cmpb.
473    SRC1 is the first input.
474    SRC2 is the second input.
475 
476    Computes cmpb of SRC1, SRC2.  */
477 static void
do_cmpb3(rtx dest,rtx src1,rtx src2)478 do_cmpb3 (rtx dest, rtx src1, rtx src2)
479 {
480   if (GET_MODE (dest) == DImode)
481     emit_insn (gen_cmpbdi3 (dest, src1, src2));
482   else
483     emit_insn (gen_cmpbsi3 (dest, src1, src2));
484 }
485 
486 /* Emit a rotl of the proper mode for DEST.
487 
488    DEST is the destination register for the and.
489    SRC1 is the first and input.
490    SRC2 is the second and input.
491 
492    Computes DEST = SRC1 rotated left by SRC2.  */
493 static void
do_rotl3(rtx dest,rtx src1,rtx src2)494 do_rotl3 (rtx dest, rtx src1, rtx src2)
495 {
496   if (GET_MODE (dest) == DImode)
497     emit_insn (gen_rotldi3 (dest, src1, src2));
498   else
499     emit_insn (gen_rotlsi3 (dest, src1, src2));
500 }
501 
502 /* Generate rtl for a load, shift, and compare of less than a full word.
503 
504    LOAD_MODE is the machine mode for the loads.
505    DIFF is the reg for the difference.
506    CMP_REM is the reg containing the remaining bytes to compare.
507    DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
508    SRC1_ADDR is the first source address.
509    SRC2_ADDR is the second source address.
510    ORIG_SRC1 is the original first source block's address rtx.
511    ORIG_SRC2 is the original second source block's address rtx.  */
512 static void
do_load_mask_compare(const machine_mode load_mode,rtx diff,rtx cmp_rem,rtx dcond,rtx src1_addr,rtx src2_addr,rtx orig_src1,rtx orig_src2)513 do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
514 		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
515 {
516   HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
517   rtx shift_amount = gen_reg_rtx (word_mode);
518   rtx d1 = gen_reg_rtx (word_mode);
519   rtx d2 = gen_reg_rtx (word_mode);
520 
521   do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
522   do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
523   do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
524 
525   if (word_mode == DImode)
526     {
527       emit_insn (gen_ashldi3 (shift_amount, shift_amount,
528 			      GEN_INT (LOG2_BITS_PER_UNIT)));
529       emit_insn (gen_lshrdi3 (d1, d1,
530 			      gen_lowpart (SImode, shift_amount)));
531       emit_insn (gen_lshrdi3 (d2, d2,
532 			      gen_lowpart (SImode, shift_amount)));
533     }
534   else
535     {
536       emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
537 			      GEN_INT (LOG2_BITS_PER_UNIT)));
538       emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
539       emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
540     }
541 
542   if (TARGET_P9_MISC)
543     {
544       /* Generate a compare, and convert with a setb later.  */
545       rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
546       emit_insn (gen_rtx_SET (dcond, cmp));
547     }
548   else
549     {
550       if (word_mode == DImode)
551 	emit_insn (gen_subfdi3_carry (diff, d2, d1));
552       else
553 	emit_insn (gen_subfsi3_carry (diff, d2, d1));
554     }
555 }
556 
557 /* Generate rtl for an overlapping load and compare of less than a
558    full load_mode.  This assumes that the previous word is part of the
559    block being compared so it's ok to back up part of a word so we can
560    compare the last unaligned full word that ends at the end of the block.
561 
562    LOAD_MODE is the machine mode for the loads.
563    ISCONST tells whether the remaining length is a constant or in a register.
564    BYTES_REM is the remaining length if ISCONST is true.
565    DIFF is the reg for the difference.
566    CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
567    DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
568    SRC1_ADDR is the first source address.
569    SRC2_ADDR is the second source address.
570    ORIG_SRC1 is the original first source block's address rtx.
571    ORIG_SRC2 is the original second source block's address rtx.  */
572 static void
do_overlap_load_compare(machine_mode load_mode,bool isConst,HOST_WIDE_INT bytes_rem,rtx diff,rtx cmp_rem,rtx dcond,rtx src1_addr,rtx src2_addr,rtx orig_src1,rtx orig_src2)573 do_overlap_load_compare (machine_mode load_mode, bool isConst,
574 			HOST_WIDE_INT bytes_rem, rtx diff,
575 			rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
576 			rtx orig_src1, rtx orig_src2)
577 {
578   HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
579   HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
580   rtx d1 = gen_reg_rtx (word_mode);
581   rtx d2 = gen_reg_rtx (word_mode);
582 
583   rtx addr1, addr2;
584   if (!isConst || addr_adj)
585     {
586       rtx adj_reg = gen_reg_rtx (word_mode);
587       if (isConst)
588 	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
589       else
590 	{
591 	  rtx reg_lms = gen_reg_rtx (word_mode);
592 	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
593 	  do_sub3 (adj_reg, cmp_rem, reg_lms);
594 	}
595 
596       addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
597       addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
598     }
599   else
600     {
601       addr1 = src1_addr;
602       addr2 = src2_addr;
603     }
604 
605   do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
606   do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
607 
608   if (TARGET_P9_MISC)
609     {
610       /* Generate a compare, and convert with a setb later.  */
611       rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
612       emit_insn (gen_rtx_SET (dcond, cmp));
613     }
614   else
615     {
616       if (word_mode == DImode)
617 	emit_insn (gen_subfdi3_carry (diff, d2, d1));
618       else
619 	emit_insn (gen_subfsi3_carry (diff, d2, d1));
620     }
621 }
622 
/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
   instructions.  The block is processed 16 bytes (V16QImode) at a
   time; each chunk is followed by a conditional branch on CR6 either
   to the cleanup code or to the final-move label.

   BYTES_TO_COMPARE is the number of bytes to be compared.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   OFF_REG is the register to use for the string offset for loads.
   S1DATA is the register for loading the first string.
   S2DATA is the register for loading the second string.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.  It must not be
   null; if the label it points to is null and a cleanup branch is needed,
   a new label is created, and the (possibly new) label is stored back
   through the pointer before returning.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.
   CHECKZERO indicates whether the sequence should check for zero bytes
   for use doing strncmp, or not (for use doing memcmp).  */
static void
expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
			 rtx orig_src1, rtx orig_src2,
			 rtx s1addr, rtx s2addr, rtx off_reg,
			 rtx s1data, rtx s2data, rtx vec_result,
			 bool equality_compare_rest, rtx *p_cleanup_label,
			 rtx final_move_label, bool checkzero)
{
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx zero_reg = NULL;

  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  /* Put both base addresses into the caller-supplied registers; every
     load below addresses memory as base + OFF_REG.  */
  emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
  emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));

  /* The non-P9 zero check compares against a vector of zeros, so
     materialize one once, outside the loop.  */
  if (checkzero && !TARGET_P9_VECTOR)
    {
      zero_reg = gen_reg_rtx (V16QImode);
      emit_move_insn (zero_reg, CONST0_RTX (V16QImode));
    }

  while (bytes_to_compare > 0)
    {
      /* VEC/VSX compare sequence for P8:
	 check each 16B with:
	 lxvd2x 32,28,8
	 lxvd2x 33,29,8
	 vcmpequb 2,0,1  # compare strings
	 vcmpequb 4,0,3  # compare w/ 0
	 xxlorc 37,36,34       # first FF byte is either mismatch or end of string
	 vcmpequb. 7,5,3  # reg 7 contains 0
	 bnl 6,.Lmismatch

	 For the P8 LE case, we use lxvd2x and compare full 16 bytes
	 but then use vgbbd and a shift to get two bytes with the
	 information we need in the correct order.

	 VEC/VSX compare sequence if TARGET_P9_VECTOR:
	 lxvb16x/lxvb16x     # load 16B of each string
	 vcmpnezb.           # produces difference location or zero byte location
	 bne 6,.Lmismatch

	 Use the overlapping compare trick for the last block if it is
	 less than 16 bytes.
      */

      load_mode = V16QImode;
      load_mode_size = GET_MODE_SIZE (load_mode);

      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else
	{
	  /* Move this load back so it doesn't go past the end.  P8/P9
	     can do this efficiently.  This is never called with less
	     than 16 bytes so we should always be able to do this.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  /* NOTE(review): this store is dead; cmp_bytes is overwritten
	     two statements below before being read.  */
	  cmp_bytes = bytes_to_compare;
	  gcc_assert (offset > extra_bytes);
	  offset -= extra_bytes;
	  /* A full vector is compared, and bytes_to_compare is set to
	     the same amount so that REMAIN below is zero and the loop
	     exits after this iteration.  */
	  cmp_bytes = load_mode_size;
	  bytes_to_compare = cmp_bytes;
	}

      /* The offset currently used is always kept in off_reg so that the
	 cleanup code on P8 can use it to extract the differing byte.  */
      emit_move_insn (off_reg, GEN_INT (offset));

      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: next block
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: call strcmp/strncmp
	 3: compared requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      /* Bytes still to be compared after this chunk.  */
      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      if (checkzero)
	{
	  if (TARGET_P9_VECTOR)
	    emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
	  else
	    {
	      /* Emit instructions to do comparison and zero check.  */
	      rtx cmp_res = gen_reg_rtx (load_mode);
	      rtx cmp_zero = gen_reg_rtx (load_mode);
	      /* cmp_combined is only written here and not read again in
		 this function; presumably the predicate compare is
		 emitted for its CR6 result -- TODO confirm.  */
	      rtx cmp_combined = gen_reg_rtx (load_mode);
	      emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
	      emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
	      emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
	      emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg));
	    }
	}
      else
	emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data));

      /* Cases 1 and 2 above branch to the cleanup code; only case 3
	 (nothing remains and no library call is needed) branches to
	 the final move label.  */
      bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
      rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
      rtx dst_label;
      rtx cmp_rtx;
      if (branch_to_cleanup)
	{
	  /* Branch to cleanup code, otherwise fall through to do more
	     compares.  P8 and P9 use different CR bits because on P8
	     we are looking at the result of a comparsion vs a
	     register of zeroes so the all-true condition means no
	     difference or zero was found.  On P9, vcmpnezb sets a byte
	     to 0xff if there is a mismatch or zero, so the all-false
	     condition indicates we found no difference or zero.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	  if (TARGET_P9_VECTOR && checkzero)
	    cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
	}
      else
	{
	  /* Branch to final return or fall through to cleanup,
	     result is already set to 0.  */
	  dst_label = final_move_label;
	  if (TARGET_P9_VECTOR && checkzero)
	    cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
	}

      /* Emit pc = cmp_rtx ? dst_label : pc, and keep the label's use
	 count and the jump's JUMP_LABEL in sync by hand.  */
      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					 lab_ref, pc_rtx);
      rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      add_reg_br_prob_note (j2, profile_probability::likely ());
      JUMP_LABEL (j2) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }
  /* Hand the cleanup label (possibly created above) back to the caller.  */
  *p_cleanup_label = cleanup_label;
  return;
}
799 
800 /* Generate the final sequence that identifies the differing
801    byte and generates the final result, taking into account
802    zero bytes:
803 
804    P8:
805         vgbbd 0,0
806         vsldoi 0,0,0,9
807         mfvsrd 9,32
808         addi 10,9,-1    # count trailing zero bits
809         andc 9,10,9
810         popcntd 9,9
811         lbzx 10,28,9    # use that offset to load differing byte
812         lbzx 3,29,9
813         subf 3,3,10     # subtract for final result
814 
815    P9:
816 	 vclzlsbb            # counts trailing bytes with lsb=0
817 	 vextublx            # extract differing byte
818 
819    STR1 is the reg rtx for data from string 1.
820    STR2 is the reg rtx for data from string 2.
821    RESULT is the reg rtx for the comparison result.
822    S1ADDR is the register to use for the base address of the first string.
823    S2ADDR is the register to use for the base address of the second string.
824    ORIG_SRC1 is the unmodified rtx for the first string.
825    ORIG_SRC2 is the unmodified rtx for the second string.
826    OFF_REG is the register to use for the string offset for loads.
827    VEC_RESULT is the rtx for the vector result indicating the byte difference.  */
828 
829 static void
emit_final_compare_vec(rtx str1,rtx str2,rtx result,rtx s1addr,rtx s2addr,rtx orig_src1,rtx orig_src2,rtx off_reg,rtx vec_result)830 emit_final_compare_vec (rtx str1, rtx str2, rtx result,
831 			rtx s1addr, rtx s2addr,
832 			rtx orig_src1, rtx orig_src2,
833 			rtx off_reg, rtx vec_result)
834 {
835 
836   if (TARGET_P9_VECTOR)
837     {
838       rtx diffix = gen_reg_rtx (SImode);
839       rtx chr1 = gen_reg_rtx (SImode);
840       rtx chr2 = gen_reg_rtx (SImode);
841       rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
842       rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
843       emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
844       emit_insn (gen_vextublx (chr1, diffix, str1));
845       emit_insn (gen_vextublx (chr2, diffix, str2));
846       do_sub3 (result, chr1_di, chr2_di);
847     }
848   else
849     {
850       gcc_assert (TARGET_P8_VECTOR);
851       rtx diffix = gen_reg_rtx (DImode);
852       rtx result_gbbd = gen_reg_rtx (V16QImode);
853       /* Since each byte of the input is either 00 or FF, the bytes in
854 	 dw0 and dw1 after vgbbd are all identical to each other.  */
855       emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
856       /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
857 	 For BE, we shift by 7 and get AB in the high two bytes then CLZ.  */
858       rtx result_shifted = gen_reg_rtx (V16QImode);
859       int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
860       emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
861 					   result_gbbd, GEN_INT (shift_amt)));
862 
863       rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
864       emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
865       rtx count = gen_reg_rtx (DImode);
866 
867       if (BYTES_BIG_ENDIAN)
868 	emit_insn (gen_clzdi2 (count, diffix));
869       else
870 	emit_insn (gen_ctzdi2 (count, diffix));
871 
872       /* P8 doesn't have a good solution for extracting one byte from
873 	 a vsx reg like vextublx on P9 so we just compute the offset
874 	 of the differing byte and load it from each string.  */
875       do_add3 (off_reg, off_reg, count);
876 
877       rtx chr1 = gen_reg_rtx (QImode);
878       rtx chr2 = gen_reg_rtx (QImode);
879       rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
880       do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
881       rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
882       do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
883       machine_mode rmode = GET_MODE (result);
884       rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
885       rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
886       do_sub3 (result, chr1_rm, chr2_rm);
887     }
888 
889   return;
890 }
891 
892 /* Expand a block compare operation using loop code, and return true
893    if successful.  Return false if we should let the compiler generate
894    normal code, probably a memcmp call.
895 
896    OPERANDS[0] is the target (result).
897    OPERANDS[1] is the first source.
898    OPERANDS[2] is the second source.
899    OPERANDS[3] is the length.
900    OPERANDS[4] is the alignment.  */
901 bool
expand_compare_loop(rtx operands[])902 expand_compare_loop (rtx operands[])
903 {
904   rtx target = operands[0];
905   rtx orig_src1 = operands[1];
906   rtx orig_src2 = operands[2];
907   rtx bytes_rtx = operands[3];
908   rtx align_rtx = operands[4];
909 
910   /* This case is complicated to handle because the subtract
911      with carry instructions do not generate the 64-bit
912      carry and so we must emit code to calculate it ourselves.
913      We choose not to implement this yet.  */
914   if (TARGET_32BIT && TARGET_POWERPC64)
915     return false;
916 
917   /* Allow non-const length.  */
918   int bytes_is_const = CONST_INT_P (bytes_rtx);
919 
920   /* This must be a fixed size alignment.  */
921   if (!CONST_INT_P (align_rtx))
922     return false;
923 
924   HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
925   HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
926   HOST_WIDE_INT minalign = MIN (align1, align2);
927 
928   bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
929 
930   gcc_assert (GET_MODE (target) == SImode);
931 
932   /* Anything to move?	*/
933   HOST_WIDE_INT bytes = 0;
934   if (bytes_is_const)
935     bytes = INTVAL (bytes_rtx);
936 
937   if (bytes_is_const && bytes == 0)
938     return true;
939 
940   /* Limit the amount we compare, if known statically.  */
941   HOST_WIDE_INT max_bytes;
942   switch (rs6000_tune)
943     {
944     case PROCESSOR_POWER7:
945       if (!bytes_is_const)
946 	if (minalign < 8)
947 	  max_bytes = 0;
948 	else
949 	  max_bytes = 128;
950       else
951 	if (minalign < 8)
952 	  max_bytes = 32;
953 	else
954 	  max_bytes = 128;
955       break;
956     case PROCESSOR_POWER8:
957       if (!bytes_is_const)
958 	max_bytes = 0;
959       else
960 	if (minalign < 8)
961 	  max_bytes = 128;
962 	else
963 	  max_bytes = 64;
964       break;
965     case PROCESSOR_POWER9:
966     case PROCESSOR_POWER10:
967       if (bytes_is_const)
968 	max_bytes = 191;
969       else
970 	max_bytes = 0;
971       break;
972     default:
973       max_bytes = 128;
974     }
975 
976   /* Allow the option to override the default.  */
977   if (rs6000_block_compare_inline_loop_limit >= 0)
978     max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;
979 
980   if (max_bytes == 0)
981     return false;
982 
983   rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
984   rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
985   HOST_WIDE_INT niter;
986   rtx iter = gen_reg_rtx (word_mode);
987   rtx iv1 = gen_reg_rtx (word_mode);
988   rtx iv2 = gen_reg_rtx (word_mode);
989   rtx d1_1 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv1 */
990   rtx d1_2 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv2 */
991   rtx d2_1 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv1 */
992   rtx d2_2 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv2 */
993 
994   /* Strip unneeded subreg from length if there is one.  */
995   if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
996     bytes_rtx = SUBREG_REG (bytes_rtx);
997   /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
998    maybe have to deal with the case were bytes_rtx is SImode and
999    word_mode is DImode.  */
1000   if (!bytes_is_const)
1001     {
1002       if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
1003 	/* Do not expect length longer than word_mode.  */
1004 	return false;
1005       else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
1006 	{
1007 	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
1008 	  bytes_rtx = force_reg (word_mode,
1009 				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
1010 						bytes_rtx));
1011 	}
1012       else
1013 	/* Make sure it's in a register before we get started.  */
1014 	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
1015     }
1016 
1017   machine_mode load_mode = word_mode;
1018   HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
1019 
1020   /* Number of bytes per iteration of the unrolled loop.  */
1021   HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
1022   /* max iters and bytes compared in the loop.  */
1023   HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
1024   HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
1025   int l2lb = floor_log2 (loop_bytes);
1026 
1027   if (bytes_is_const && (max_bytes < load_mode_size
1028 			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
1029     return false;
1030 
1031   bool no_remainder_code = false;
1032   rtx final_label = gen_label_rtx ();
1033   rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1034   rtx diff_label = gen_label_rtx ();
1035   rtx library_call_label = NULL;
1036   rtx cleanup_label = gen_label_rtx ();
1037 
1038   rtx cr;
1039 
1040   rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
1041   rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
1042 
1043   /* Difference found is stored here before jump to diff_label.  */
1044   rtx diff = gen_reg_rtx (word_mode);
1045   rtx_insn *j;
1046 
1047   /* Example of generated code for 35 bytes aligned 1 byte.
1048 
1049 	     mtctr 8
1050 	     li 6,0
1051 	     li 5,8
1052      .L13:
1053 	     ldbrx 7,3,6
1054 	     ldbrx 9,10,6
1055 	     ldbrx 0,3,5
1056 	     ldbrx 4,10,5
1057 	     addi 6,6,16
1058 	     addi 5,5,16
1059 	     subfc. 9,9,7
1060 	     bne 0,.L10
1061 	     subfc. 9,4,0
1062 	     bdnzt 2,.L13
1063 	     bne 0,.L10
1064 	     add 3,3,6
1065 	     add 10,10,6
1066 	     addi 9,3,-5
1067 	     ldbrx 7,0,9
1068 	     addi 9,10,-5
1069 	     ldbrx 9,0,9
1070 	     subfc 9,9,7
1071 	     .p2align 4,,15
1072      .L10:
1073 	     popcntd 9,9
1074 	     subfe 10,10,10
1075 	     or 9,9,10
1076 
1077      Compiled with -fno-reorder-blocks for clarity.  */
1078 
1079   /* Structure of what we're going to do:
1080      Two separate lengths: what we will compare before bailing to library
1081 	call (max_bytes), and the total length to be checked.
1082      if length <= 16, branch to linear cleanup code starting with
1083 	remainder length check (length not known at compile time)
1084      set up 2 iv's and load count reg, compute remainder length
1085      unrollx2 compare loop
1086      if loop exit due to a difference, branch to difference handling code
1087      if remainder length < 8, branch to final cleanup compare
1088      load and compare 8B
1089      final cleanup comparison (depends on alignment and length)
1090 	load 8B, shift off bytes past length, compare
1091 	load 8B ending at last byte and compare
1092 	load/compare 1 byte at a time (short block abutting 4k boundary)
1093      difference handling, 64->32 conversion
1094      final result
1095      branch around memcmp call
1096      memcmp library call
1097   */
1098 
1099   /* If bytes is not const, compare length and branch directly
1100      to the cleanup code that can handle 0-16 bytes if length
1101      is >= 16.  Stash away bytes-max_bytes for the library call.  */
1102   if (bytes_is_const)
1103     {
1104       /* These need to be set for some of the places we may jump to.  */
1105       if (bytes > max_bytes)
1106 	{
1107 	  no_remainder_code = true;
1108 	  niter = max_loop_iter;
1109 	  library_call_label = gen_label_rtx ();
1110 	}
1111       else
1112 	{
1113 	  niter = bytes / loop_bytes;
1114 	}
1115       emit_move_insn (iter, GEN_INT (niter));
1116       emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
1117       emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
1118     }
1119   else
1120     {
1121       library_call_label = gen_label_rtx ();
1122 
1123       /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
1124       emit_move_insn (cmp_rem, bytes_rtx);
1125 
1126       /* Check for > max_bytes bytes.  We want to bail out as quickly as
1127 	 possible if we have to go over to memcmp.  */
1128       do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
1129 		 NULL_RTX, library_call_label, profile_probability::even ());
1130 
1131       /* Check for < loop_bytes bytes.  */
1132       do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
1133 		 NULL_RTX, cleanup_label, profile_probability::even ());
1134 
1135       /* Loop compare bytes and iterations if bytes>max_bytes.  */
1136       rtx mb_reg = gen_reg_rtx (word_mode);
1137       emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
1138       rtx mi_reg = gen_reg_rtx (word_mode);
1139       emit_move_insn (mi_reg, GEN_INT (max_loop_iter));
1140 
1141       /* Compute number of loop iterations if bytes <= max_bytes.  */
1142       if (word_mode == DImode)
1143 	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
1144       else
1145 	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));
1146 
1147       /* Compute bytes to compare in loop if bytes <= max_bytes.  */
1148       rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
1149       if (word_mode == DImode)
1150 	{
1151 	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
1152 	}
1153       else
1154 	{
1155 	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
1156 	}
1157 
1158       /* Check for bytes <= max_bytes.  */
1159       if (TARGET_ISEL)
1160 	{
1161 	  /* P9 has fast isel so we use one compare and two isel.  */
1162 	  cr = gen_reg_rtx (CCmode);
1163 	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
1164 					     GEN_INT (max_bytes));
1165 	  emit_move_insn (cr, compare_rtx);
1166 	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
1167 	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
1168 	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
1169 	}
1170       else
1171 	{
1172 	  rtx lab_after = gen_label_rtx ();
1173 	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
1174 		     NULL_RTX, lab_after, profile_probability::even ());
1175 	  emit_move_insn (loop_cmp, mb_reg);
1176 	  emit_move_insn (iter, mi_reg);
1177 	  emit_label (lab_after);
1178 	}
1179 
1180       /* Now compute remainder bytes which isn't used until after the loop.  */
1181       do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
1182     }
1183 
1184   rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
1185   /* For p9 we need to have just one of these as multiple places define
1186      it and it gets used by the setb at the end.  */
1187   if (TARGET_P9_MISC)
1188     dcond = gen_reg_rtx (CCUNSmode);
1189 
1190   if (!bytes_is_const || bytes >= loop_bytes)
1191     {
1192       /* It should not be possible to come here if remaining bytes is
1193 	 < 16 in the runtime case either.  Compute number of loop
1194 	 iterations.  We compare 2*word_mode per iteration so 16B for
1195 	 64-bit code and 8B for 32-bit.  Set up two induction
1196 	 variables and load count register.  */
1197 
1198       /* HACK ALERT: create hard reg for CTR here.  If we just use a
1199 	 pseudo, cse will get rid of it and then the allocator will
1200 	 see it used in the lshr above and won't give us ctr.  */
1201       rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
1202       emit_move_insn (ctr, iter);
1203       emit_move_insn (diff, GEN_INT (0));
1204       emit_move_insn (iv1, GEN_INT (0));
1205       emit_move_insn (iv2, GEN_INT (load_mode_size));
1206 
1207       /* inner loop to compare 2*word_mode */
1208       rtx loop_top_label = gen_label_rtx ();
1209       emit_label (loop_top_label);
1210 
1211       rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
1212       rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
1213 
1214       do_load_for_compare_from_addr (load_mode, d1_1,
1215 				     src1_ix1, orig_src1);
1216       do_load_for_compare_from_addr (load_mode, d2_1,
1217 				     src2_ix1, orig_src2);
1218       do_add3 (iv1, iv1, GEN_INT (loop_bytes));
1219 
1220       rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
1221       rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
1222 
1223       do_load_for_compare_from_addr (load_mode, d1_2,
1224 				     src1_ix2, orig_src1);
1225       do_load_for_compare_from_addr (load_mode, d2_2,
1226 				     src2_ix2, orig_src2);
1227       do_add3 (iv2, iv2, GEN_INT (loop_bytes));
1228 
1229       if (TARGET_P9_MISC)
1230 	{
1231 	  /* Generate a compare, and convert with a setb later.  */
1232 	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
1233 	  emit_insn (gen_rtx_SET (dcond, cmp));
1234 	}
1235       else
1236 	{
1237 	  dcond = gen_reg_rtx (CCmode);
1238 	  if (word_mode == DImode)
1239 	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1240 	  else
1241 	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1242 	}
1243 
1244       do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
1245 		 dcond, diff_label, profile_probability::unlikely ());
1246 
1247       if (TARGET_P9_MISC)
1248 	{
1249 	  /* Generate a compare, and convert with a setb later.  */
1250 	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
1251 	  emit_insn (gen_rtx_SET (dcond, cmp));
1252 	}
1253       else
1254 	{
1255 	  dcond = gen_reg_rtx (CCmode);
1256 	  if (word_mode == DImode)
1257 	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
1258 	  else
1259 	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
1260 	}
1261 
1262       rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
1263       if (TARGET_64BIT)
1264 	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
1265 					   eqrtx, dcond));
1266       else
1267 	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
1268 					   eqrtx, dcond));
1269       add_reg_br_prob_note (j, profile_probability::likely ());
1270       JUMP_LABEL (j) = loop_top_label;
1271       LABEL_NUSES (loop_top_label) += 1;
1272     }
1273 
1274   HOST_WIDE_INT bytes_remaining = 0;
1275   if (bytes_is_const)
1276     bytes_remaining = (bytes % loop_bytes);
1277 
1278   /* If diff is nonzero, branch to difference handling
1279      code.  If we exit here with a nonzero diff, it is
1280      because the second word differed.  */
1281   if (TARGET_P9_MISC)
1282     do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
1283 	       diff_label, profile_probability::unlikely ());
1284   else
1285     do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
1286 	       diff_label, profile_probability::unlikely ());
1287 
1288   if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
1289     {
1290       /* If the length is known at compile time, then we will always
1291 	 have a remainder to go to the library call with.  */
1292       rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
1293       j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
1294       JUMP_LABEL (j) = library_call_label;
1295       LABEL_NUSES (library_call_label) += 1;
1296       emit_barrier ();
1297     }
1298 
1299   if (bytes_is_const && bytes_remaining == 0)
1300     {
1301       /* No remainder and if we are here then diff is 0 so just return 0 */
1302       if (TARGET_64BIT)
1303 	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1304       else
1305 	emit_move_insn (target, diff);
1306       j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1307       JUMP_LABEL (j) = final_label;
1308       LABEL_NUSES (final_label) += 1;
1309       emit_barrier ();
1310     }
1311   else if (!no_remainder_code)
1312     {
1313       /* Update addresses to point to the next word to examine.  */
1314       do_add3 (src1_addr, src1_addr, iv1);
1315       do_add3 (src2_addr, src2_addr, iv1);
1316 
1317       emit_label (cleanup_label);
1318 
1319       if (!bytes_is_const)
1320 	{
1321 	  /* If we're dealing with runtime length, we have to check if
1322 	     it's zero after the loop.  When length is known at compile
1323 	     time the no-remainder condition is dealt with above.  By
1324 	     doing this after cleanup_label, we also deal with the
1325 	     case where length is 0 at the start and we bypass the
1326 	     loop with a branch to cleanup_label.  */
1327 	  emit_move_insn (target, const0_rtx);
1328 	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
1329 		     NULL_RTX, final_label, profile_probability::unlikely ());
1330 	}
1331 
1332       rtx final_cleanup = gen_label_rtx ();
1333       rtx cmp_rem_before = gen_reg_rtx (word_mode);
1334       /* Compare one more word_mode chunk if needed.  */
1335       if (!bytes_is_const || bytes_remaining >= load_mode_size)
1336 	{
1337 	  /* If remainder length < word length, branch to final
1338 	     cleanup compare.  */
1339 
1340 	  if (!bytes_is_const)
1341 	    {
1342 	      do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
1343 			 NULL_RTX, final_cleanup, profile_probability::even ());
1344 	    }
1345 
1346 	  /* load and compare 8B */
1347 	  do_load_for_compare_from_addr (load_mode, d1_1,
1348 					 src1_addr, orig_src1);
1349 	  do_load_for_compare_from_addr (load_mode, d2_1,
1350 					 src2_addr, orig_src2);
1351 
1352 	  /* Compare the word, see if we need to do the last partial.  */
1353 	  if (TARGET_P9_MISC)
1354 	    {
1355 	      /* Generate a compare, and convert with a setb later.  */
1356 	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
1357 	      emit_insn (gen_rtx_SET (dcond, cmp));
1358 	    }
1359 	  else
1360 	    {
1361 	      dcond = gen_reg_rtx (CCmode);
1362 	      if (word_mode == DImode)
1363 		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1364 	      else
1365 		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1366 	    }
1367 
1368 	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
1369 		     dcond, diff_label, profile_probability::even ());
1370 
1371 	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
1372 	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
1373 	  emit_move_insn (cmp_rem_before, cmp_rem);
1374 	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
1375 	  if (bytes_is_const)
1376 	    bytes_remaining -= load_mode_size;
1377 	  else
1378 	    /* See if remaining length is now zero.  We previously set
1379 	       target to 0 so we can just jump to the end.  */
1380 	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
1381 		       final_label, profile_probability::unlikely ());
1382 	}
1383 
1384       /* Cases:
1385 	 bytes_is_const
1386 	   We can always shift back to do an overlapping compare
1387 	   of the last chunk because we know length >= 8.
1388 
1389 	 !bytes_is_const
1390 	   align>=load_mode_size
1391 	     Read word_mode and mask
1392 	   align<load_mode_size
1393 	     avoid stepping past end
1394 
1395 	  Three strategies:
1396 	  * decrement address and do overlapping compare
1397 	  * read word_mode and mask
1398 	  * carefully avoid crossing 4k boundary
1399        */
1400 
1401       if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
1402 	  && align1 >= load_mode_size && align2 >= load_mode_size)
1403 	{
1404 	  /* Alignment is larger than word_mode so we do not need to be
1405 	     concerned with extra page crossings.  But, we do not know
1406 	     that the length is larger than load_mode_size so we might
1407 	     end up compareing against data before the block if we try
1408 	     an overlapping compare.  Also we use this on P7 for fixed length
1409 	     remainder because P7 doesn't like overlapping unaligned.
1410 	     Strategy: load 8B, shift off bytes past length, and compare.  */
1411 	  emit_label (final_cleanup);
1412 	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1413 				src1_addr, src2_addr, orig_src1, orig_src2);
1414 	}
1415       else if (bytes_remaining && bytes_is_const)
1416 	{
1417 	  /* We do not do loop expand if length < 32 so we know at the
1418 	     end we can do an overlapping compare.
1419 	     Strategy: shift address back and do word_mode load that
1420 	     ends at the end of the block.  */
1421 	  emit_label (final_cleanup);
1422 	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
1423 				   cmp_rem, dcond, src1_addr, src2_addr,
1424 				   orig_src1, orig_src2);
1425 	}
1426       else if (!bytes_is_const)
1427 	{
1428 	  rtx handle4k_label = gen_label_rtx ();
1429 	  rtx nonconst_overlap = gen_label_rtx ();
1430 	  emit_label (nonconst_overlap);
1431 
1432 	  /* Here we have to handle the case where whe have runtime
1433 	     length which may be too short for overlap compare, and
1434 	     alignment is not at least load_mode_size so we have to
1435 	     tread carefully to avoid stepping across 4k boundaries.  */
1436 
1437 	  /* If the length after the loop was larger than word_mode
1438 	     size, we can just do an overlapping compare and we're
1439 	     done.  We fall through to this code from the word_mode
1440 	     compare that preceeds this.  */
1441 	  do_overlap_load_compare (load_mode, false, 0, diff,
1442 				   cmp_rem, dcond, src1_addr, src2_addr,
1443 				   orig_src1, orig_src2);
1444 
1445 	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
1446 	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1447 	  JUMP_LABEL (j) = diff_label;
1448 	  LABEL_NUSES (diff_label) += 1;
1449 	  emit_barrier ();
1450 
1451 	  /* If we couldn't do the overlap compare we have to be more
1452 	     careful of the 4k boundary.  Test to see if either
1453 	     address is less than word_mode_size away from a 4k
1454 	     boundary.  If not, then we can do a load/shift/compare
1455 	     and we are done.  We come to this code if length was less
1456 	     than word_mode_size.  */
1457 
1458 	  emit_label (final_cleanup);
1459 
1460 	  /* We can still avoid the slow case if the length was larger
1461 	     than one loop iteration, in which case go do the overlap
1462 	     load compare path.  */
1463 	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
1464 		     NULL_RTX, nonconst_overlap, profile_probability::even ());
1465 
1466 	  rtx rem4k = gen_reg_rtx (word_mode);
1467 	  rtx dist1 = gen_reg_rtx (word_mode);
1468 	  rtx dist2 = gen_reg_rtx (word_mode);
1469 	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
1470 	  if (word_mode == SImode)
1471 	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
1472 	  else
1473 	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
1474 	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
1475 		     handle4k_label, profile_probability::very_unlikely ());
1476 	  if (word_mode == SImode)
1477 	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
1478 	  else
1479 	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
1480 	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
1481 		     handle4k_label, profile_probability::very_unlikely ());
1482 
1483 	  /* We don't have a 4k boundary to deal with, so do
1484 	     a load/shift/compare and jump to diff.  */
1485 
1486 	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1487 				src1_addr, src2_addr, orig_src1, orig_src2);
1488 
1489 	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1490 	  JUMP_LABEL (j) = diff_label;
1491 	  LABEL_NUSES (diff_label) += 1;
1492 	  emit_barrier ();
1493 
1494 	  /* Finally in the unlikely case we are inching up to a
1495 	     4k boundary we use a compact lbzx/compare loop to do
1496 	     it a byte at a time.  */
1497 
1498 	  emit_label (handle4k_label);
1499 
1500 	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
1501 	  emit_move_insn (ctr, cmp_rem);
1502 	  rtx ixreg = gen_reg_rtx (Pmode);
1503 	  emit_move_insn (ixreg, const0_rtx);
1504 
1505 	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
1506 	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
1507 	  rtx d1 = gen_reg_rtx (word_mode);
1508 	  rtx d2 = gen_reg_rtx (word_mode);
1509 
1510 	  rtx fc_loop = gen_label_rtx ();
1511 	  emit_label (fc_loop);
1512 
1513 	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
1514 	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
1515 
1516 	  do_add3 (ixreg, ixreg, const1_rtx);
1517 
1518 	  rtx cond = gen_reg_rtx (CCmode);
1519 	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
1520 	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);
1521 
1522 	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
1523 	  if (TARGET_64BIT)
1524 	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
1525 					       eqrtx, cond));
1526 	  else
1527 	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
1528 					       eqrtx, cond));
1529 	  add_reg_br_prob_note (j, profile_probability::likely ());
1530 	  JUMP_LABEL (j) = fc_loop;
1531 	  LABEL_NUSES (fc_loop) += 1;
1532 
1533 	  if (TARGET_64BIT)
1534 	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1535 	  else
1536 	    emit_move_insn (target, diff);
1537 
1538 	  /* Since we are comparing bytes, the difference can be used
1539 	     as the final result and we are done here.  */
1540 	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1541 	  JUMP_LABEL (j) = final_label;
1542 	  LABEL_NUSES (final_label) += 1;
1543 	  emit_barrier ();
1544 	}
1545     }
1546 
1547   emit_label (diff_label);
1548   /* difference handling, 64->32 conversion */
1549 
1550   /* We need to produce DI result from sub, then convert to target SI
1551      while maintaining <0 / ==0 / >0 properties.  This sequence works:
1552      subfc L,A,B
1553      subfe H,H,H
1554      popcntd L,L
1555      rldimi L,H,6,0
1556 
1557      This is an alternate one Segher cooked up if somebody
1558      wants to expand this for something that doesn't have popcntd:
1559      subfc L,a,b
1560      subfe H,x,x
1561      addic t,L,-1
1562      subfe v,t,L
1563      or z,v,H
1564 
1565      And finally, p9 can just do this:
1566      cmpld A,B
1567      setb r */
1568 
1569   if (TARGET_P9_MISC)
1570     emit_insn (gen_setb_unsigned (target, dcond));
1571   else
1572     {
1573       if (TARGET_64BIT)
1574 	{
1575 	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
1576 	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1577 	  emit_insn (gen_popcntddi2 (diff, diff));
1578 	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
1579 	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1580 	}
1581       else
1582 	{
1583 	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
1584 	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1585 	  emit_insn (gen_popcntdsi2 (diff, diff));
1586 	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
1587 	}
1588     }
1589 
1590   if (library_call_label != NULL)
1591     {
1592       /* Branch around memcmp call.  */
1593       j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1594       JUMP_LABEL (j) = final_label;
1595       LABEL_NUSES (final_label) += 1;
1596       emit_barrier ();
1597 
1598       /* Make memcmp library call.  cmp_rem is the remaining bytes that
1599 	 were compared and cmp_rem is the expected amount to be compared
1600 	 by memcmp.  If we don't find a difference in the loop compare, do
1601 	 the library call directly instead of doing a small compare just
1602 	 to get to an arbitrary boundary before calling it anyway.
1603 	 Also, update addresses to point to the next word to examine.  */
1604       emit_label (library_call_label);
1605 
1606       rtx len_rtx = gen_reg_rtx (word_mode);
1607       if (bytes_is_const)
1608 	{
1609 	  emit_move_insn (len_rtx, cmp_rem);
1610 	  do_add3 (src1_addr, src1_addr, iv1);
1611 	  do_add3 (src2_addr, src2_addr, iv1);
1612 	}
1613       else
1614 	emit_move_insn (len_rtx, bytes_rtx);
1615 
1616       tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
1617       emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1618 			       target, LCT_NORMAL, GET_MODE (target),
1619 			       src1_addr, Pmode,
1620 			       src2_addr, Pmode,
1621 			       len_rtx, GET_MODE (len_rtx));
1622     }
1623 
1624   /* emit final_label */
1625   emit_label (final_label);
1626   return true;
1627 }
1628 
1629 /* Generate code to convert a DImode-plus-carry subtract result into
1630    a SImode result that has the same <0 / ==0 / >0 properties to
1631    produce the final result from memcmp.
1632 
1633    TARGET is the rtx for the register to receive the memcmp result.
1634    SUB_RESULT is the rtx for the register contining the subtract result.  */
1635 
1636 void
generate_6432_conversion(rtx target,rtx sub_result)1637 generate_6432_conversion(rtx target, rtx sub_result)
1638 {
1639   /* We need to produce DI result from sub, then convert to target SI
1640      while maintaining <0 / ==0 / >0 properties.  This sequence works:
1641      subfc L,A,B
1642      subfe H,H,H
1643      popcntd L,L
1644      rldimi L,H,6,0
1645 
1646      This is an alternate one Segher cooked up if somebody
1647      wants to expand this for something that doesn't have popcntd:
1648      subfc L,a,b
1649      subfe H,x,x
1650      addic t,L,-1
1651      subfe v,t,L
1652      or z,v,H
1653 
1654      And finally, p9 can just do this:
1655      cmpld A,B
1656      setb r */
1657 
1658   if (TARGET_64BIT)
1659     {
1660       rtx tmp_reg_ca = gen_reg_rtx (DImode);
1661       emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1662       rtx popcnt = gen_reg_rtx (DImode);
1663       emit_insn (gen_popcntddi2 (popcnt, sub_result));
1664       rtx tmp2 = gen_reg_rtx (DImode);
1665       emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
1666       emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
1667     }
1668   else
1669     {
1670       rtx tmp_reg_ca = gen_reg_rtx (SImode);
1671       emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1672       rtx popcnt = gen_reg_rtx (SImode);
1673       emit_insn (gen_popcntdsi2 (popcnt, sub_result));
1674       emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
1675     }
1676 }
1677 
1678 /* Generate memcmp expansion using in-line non-loop GPR instructions.
1679    The bool return indicates whether code for a 64->32 conversion
1680    should be generated.
1681 
1682    BYTES is the number of bytes to be compared.
1683    BASE_ALIGN is the minimum alignment for both blocks to compare.
1684    ORIG_SRC1 is the original pointer to the first block to compare.
1685    ORIG_SRC2 is the original pointer to the second block to compare.
1686    SUB_RESULT is the reg rtx for the result from the final subtract.
1687    COND is rtx for a condition register that will be used for the final
1688    compare on power9 or better.
1689    FINAL_RESULT is the reg rtx for the final memcmp result.
1690    P_CONVERT_LABEL is a pointer to rtx that will be used to store the
1691    label generated for a branch to the 64->32 code, if such a branch
1692    is needed.
1693    P_FINAL_LABEL is a pointer to rtx that will be used to store the label
1694    for the end of the memcmp if a branch there is needed.
1695 */
1696 
bool
expand_block_compare_gpr(unsigned HOST_WIDE_INT bytes, unsigned int base_align,
			 rtx orig_src1, rtx orig_src2,
			 rtx sub_result, rtx cond, rtx final_result,
			 rtx *p_convert_label, rtx *p_final_label)
{
  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
             ldbrx 10,31,8
             ldbrx 9,7,8
             subfc. 9,9,10
             bne 0,.L6487
             addi 9,12,8
             addi 5,11,8
             ldbrx 10,0,9
             ldbrx 9,0,5
             subfc. 9,9,10
             bne 0,.L6487
             addi 9,12,16
             lhbrx 10,0,9
             addi 9,11,16
             lhbrx 9,0,9
             subf 9,9,10
             b .L6488
             .p2align 4,,15
     .L6487: #convert_label
             popcntd 9,9
             subfe 10,10,10
             or 9,9,10
     .L6488: #final_label
             extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  /* OFFSET is the byte offset of the next chunk within both blocks and
     CMP_BYTES is the number of bytes consumed by the current chunk.  */
  unsigned HOST_WIDE_INT offset = 0;
  unsigned int load_mode_size;
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;
  /* The same two word_mode registers receive the loaded data of every
     chunk.  */
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* Set once any chunk takes the path that requires the 64->32
     conversion; this is the function's return value.  */
  bool need_6432_conv = false;
  /* Labels are created lazily, only when a branch to them is emitted.  */
  rtx convert_label = NULL;
  rtx final_label = NULL;
  machine_mode load_mode;

  /* Emit one load/compare group per iteration until all bytes are
     covered.  */
  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  /* Only back up when OFFSET is greater than EXTRA_BYTES;
	     otherwise CMP_BYTES stays short and the extra bytes are
	     shifted off below.  */
	  if (extra_bytes < offset)
	    {
	      /* After backing up, the load covers exactly the last
		 LOAD_MODE_SIZE bytes, so treat it as a full-size final
		 chunk.  */
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      /* Make sure each address is a plain register before loading.  */
      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      /* Bytes still to be compared after this chunk; zero means this is
	 the last chunk.  */
      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode))
	{
	  /* Final_result is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that needs a 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
	  if (need_6432_conv && !final_label)
	    final_label = gen_label_rtx ();

	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      /* The difference is copied to FINAL_RESULT before the branch
		 so it is already in place if the branch is taken.  */
	      emit_insn (gen_movsi (final_result,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::unlikely ());
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      /* Last block: a plain subtract gives the final result.  */
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (final_result,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (final_result, tmp_reg_src1, tmp_reg_src2));

	      /* If a final label exists, jump to it unconditionally.  */
	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL (j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block? We need the 64->32
	     conversion even if final_result size == load_mode size because
	     the subtract generates one extra bit.  */
	  need_6432_conv = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		/* Generate a compare, and convert with a setb later.
		   Use cond that is passed in because the caller needs
		   to use it for the 64->32 conversion later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		{
		  /* Generate a subfc. and use the longer sequence for
		     conversion.  Cond is not used outside this
		     function in this case.  */
		  cond = gen_reg_rtx (CCmode);
		  if (TARGET_64BIT)
		    emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2,
						       tmp_reg_src1, cond));
		  else
		    emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2,
						       tmp_reg_src1, cond));
		}

	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::likely ());
	      JUMP_LABEL (j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      /* Advance past the bytes handled by this chunk.  */
      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  /* Hand back only the labels that were actually created; the caller's
     values are left untouched otherwise.  */
  if (convert_label)
    *p_convert_label = convert_label;
  if (final_label)
    *p_final_label = final_label;
  return need_6432_conv;
}
1936 
1937 /* Expand a block compare operation, and return true if successful.
1938    Return false if we should let the compiler generate normal code,
1939    probably a memcmp call.
1940 
1941    OPERANDS[0] is the target (result).
1942    OPERANDS[1] is the first source.
1943    OPERANDS[2] is the second source.
1944    OPERANDS[3] is the length.
1945    OPERANDS[4] is the alignment.  */
1946 bool
expand_block_compare(rtx operands[])1947 expand_block_compare (rtx operands[])
1948 {
1949   rtx target = operands[0];
1950   rtx orig_src1 = operands[1];
1951   rtx orig_src2 = operands[2];
1952   rtx bytes_rtx = operands[3];
1953   rtx align_rtx = operands[4];
1954 
1955   /* This case is complicated to handle because the subtract
1956      with carry instructions do not generate the 64-bit
1957      carry and so we must emit code to calculate it ourselves.
1958      We choose not to implement this yet.  */
1959   if (TARGET_32BIT && TARGET_POWERPC64)
1960     return false;
1961 
1962   bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
1963 
1964   /* Allow this param to shut off all expansion.  */
1965   if (rs6000_block_compare_inline_limit == 0)
1966     return false;
1967 
1968   /* targetm.slow_unaligned_access -- don't do unaligned stuff.
1969      However slow_unaligned_access returns true on P7 even though the
1970      performance of this code is good there.  */
1971   if (!isP7
1972       && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
1973 	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
1974     return false;
1975 
1976   /* Unaligned l*brx traps on P7 so don't do this.  However this should
1977      not affect much because LE isn't really supported on P7 anyway.  */
1978   if (isP7 && !BYTES_BIG_ENDIAN)
1979     return false;
1980 
1981   /* If this is not a fixed size compare, try generating loop code and
1982      if that fails just call memcmp.  */
1983   if (!CONST_INT_P (bytes_rtx))
1984     return expand_compare_loop (operands);
1985 
1986   /* This must be a fixed size alignment.  */
1987   if (!CONST_INT_P (align_rtx))
1988     return false;
1989 
1990   unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
1991 
1992   gcc_assert (GET_MODE (target) == SImode);
1993 
1994   /* Anything to move?  */
1995   unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
1996   if (bytes == 0)
1997     return true;
1998 
1999   /* P7/P8 code uses cond for subfc. but P9 uses
2000      it for cmpld which needs CCUNSmode.  */
2001   rtx cond = NULL;
2002   if (TARGET_P9_MISC)
2003     cond = gen_reg_rtx (CCUNSmode);
2004 
2005   /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
2006      least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
2007      at least POWER8.  That way we can rely on overlapping compares to
2008      do the final comparison of less than 16 bytes.  Also I do not
2009      want to deal with making this work for 32 bits.  In addition, we
2010      have to make sure that we have at least P8_VECTOR (we don't allow
2011      P9_VECTOR without P8_VECTOR).  */
2012   int use_vec = (bytes >= 33 && !TARGET_32BIT
2013 		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
2014 
2015   /* We don't want to generate too much code.  The loop code can take
2016      over for lengths greater than 31 bytes.  */
2017   unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
2018 
2019   /* Don't generate too much code if vsx was disabled.  */
2020   if (!use_vec && max_bytes > 1)
2021     max_bytes = ((max_bytes + 1) / 2) - 1;
2022 
2023   if (!IN_RANGE (bytes, 1, max_bytes))
2024     return expand_compare_loop (operands);
2025 
2026   /* The code generated for p7 and older is not faster than glibc
2027      memcmp if alignment is small and length is not short, so bail
2028      out to avoid those conditions.  */
2029   if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
2030       && ((base_align == 1 && bytes > 16)
2031 	  || (base_align == 2 && bytes > 32)))
2032     return false;
2033 
2034   rtx final_label = NULL;
2035 
2036   if (use_vec)
2037     {
2038       rtx final_move_label = gen_label_rtx ();
2039       rtx s1addr = gen_reg_rtx (Pmode);
2040       rtx s2addr = gen_reg_rtx (Pmode);
2041       rtx off_reg = gen_reg_rtx (Pmode);
2042       rtx cleanup_label = NULL;
2043       rtx vec_result = gen_reg_rtx (V16QImode);
2044       rtx s1data = gen_reg_rtx (V16QImode);
2045       rtx s2data = gen_reg_rtx (V16QImode);
2046       rtx result_reg = gen_reg_rtx (word_mode);
2047       emit_move_insn (result_reg, GEN_INT (0));
2048 
2049       expand_cmp_vec_sequence (bytes, orig_src1, orig_src2,
2050 			       s1addr, s2addr, off_reg, s1data, s2data,
2051 			       vec_result, false,
2052 			       &cleanup_label, final_move_label, false);
2053 
2054       if (cleanup_label)
2055 	emit_label (cleanup_label);
2056 
2057       emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result));
2058 
2059       emit_final_compare_vec (s1data, s2data, result_reg,
2060 			      s1addr, s2addr, orig_src1, orig_src2,
2061 			      off_reg, vec_result);
2062 
2063       emit_label (final_move_label);
2064       emit_insn (gen_movsi (target,
2065 			    gen_lowpart (SImode, result_reg)));
2066     }
2067   else
2068     { /* generate GPR code */
2069 
2070       rtx convert_label = NULL;
2071       rtx sub_result = gen_reg_rtx (word_mode);
2072       bool need_6432_conversion =
2073 	expand_block_compare_gpr(bytes, base_align,
2074 				 orig_src1, orig_src2,
2075 				 sub_result, cond, target,
2076 				 &convert_label, &final_label);
2077 
2078       if (need_6432_conversion)
2079 	{
2080 	  if (convert_label)
2081 	    emit_label (convert_label);
2082 	  if (TARGET_P9_MISC)
2083 	    emit_insn (gen_setb_unsigned (target, cond));
2084 	  else
2085 	    generate_6432_conversion(target, sub_result);
2086 	}
2087     }
2088 
2089   if (final_label)
2090     emit_label (final_label);
2091 
2092   return true;
2093 }
2094 
2095 /* Generate page crossing check and branch code to set up for
2096    strncmp when we don't have DI alignment.
2097    STRNCMP_LABEL is the label to branch if there is a page crossing.
2098    SRC_ADDR is the string address to be examined.
2099    BYTES is the max number of bytes to compare.  */
2100 static void
expand_strncmp_align_check(rtx strncmp_label,rtx src_addr,HOST_WIDE_INT bytes)2101 expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
2102 {
2103   rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
2104   rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
2105   do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
2106   rtx cond = gen_reg_rtx (CCmode);
2107   emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
2108 					 GEN_INT (4096 - bytes)));
2109 
2110   rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);
2111 
2112   rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2113 				     lab_ref, pc_rtx);
2114   rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2115   add_reg_br_prob_note (j, profile_probability::unlikely ());
2116   JUMP_LABEL (j) = strncmp_label;
2117   LABEL_NUSES (strncmp_label) += 1;
2118 }
2119 
2120 /* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
2121    BYTES_TO_COMPARE is the number of bytes to be compared.
2122    BASE_ALIGN is the smaller of the alignment of the two strings.
2123    ORIG_SRC1 is the unmodified rtx for the first string.
2124    ORIG_SRC2 is the unmodified rtx for the second string.
2125    TMP_REG_SRC1 is the register for loading the first string.
2126    TMP_REG_SRC2 is the register for loading the second string.
2127    RESULT_REG is the rtx for the result register.
2128    EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
2129    to strcmp/strncmp if we have equality at the end of the inline comparison.
2130    P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
2131    to clean up and generate the final comparison result.
2132    FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
2133    set the final result.  */
static void
expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
			     unsigned int base_align,
			     rtx orig_src1, rtx orig_src2,
			     rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
			     bool equality_compare_rest, rtx *p_cleanup_label,
			     rtx final_move_label)
{
  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
  machine_mode load_mode;
  unsigned int load_mode_size;
  /* CMP_BYTES is the number of bytes consumed by the current chunk and
     OFFSET is the byte offset of that chunk within both strings.  */
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  /* Put both string addresses in registers once, up front; every chunk
     address is formed as base + offset.  */
  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
  gcc_assert (p_cleanup_label != NULL);
  /* Start from the caller's cleanup label, if any; one is created below
     on first use otherwise.  */
  rtx cleanup_label = *p_cleanup_label;

  while (bytes_to_compare > 0)
    {
      /* GPR compare sequence:
         check each 8B with: ld/ld/cmpb/cmpb/orc./bne

         cleanup code at end:
         cntlzd        get bit of first zero/diff byte
         addi          convert for rldcl use
         rldcl rldcl   extract diff/zero byte
         subf          subtract for final result

         The last compare can branch around the cleanup code if the
         result is zero because the strings are exactly equal.  */

      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  /* Only back up when OFFSET is greater than EXTRA_BYTES;
	     otherwise CMP_BYTES stays short and the extra bytes are
	     masked off below.  */
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes_to_compare = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes_to_compare;

      /* The offset is used as an immediate for big endian or when
	 TARGET_AVOID_XFORM is set; otherwise it is first moved into a
	 register.  */
      rtx offset_rtx;
      if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
	offset_rtx = GEN_INT (offset);
      else
	{
	  offset_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (offset_rtx, GEN_INT (offset));
	}
      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);

      do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);

      /* We must always left-align the data we read, and
	 clear any bytes to the right that are beyond the string.
	 Otherwise the cmpb sequence won't produce the correct
	 results.  However if there is only one byte left, we
	 can just subtract to get the final result so the shifts
	 and clears are not needed.  */

      /* Bytes still to be compared after this chunk; zero means this is
	 the last chunk of the inline comparison.  */
      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      /* Loading just a single byte is a special case.  If we are
	 loading more than that, we have to check whether we are
	 looking at the entire chunk of data.  If not, rotate left and
	 clear right so that bytes we aren't supposed to look at are
	 zeroed, and the first byte we are supposed to compare is
	 leftmost.  */
      if (load_mode_size != 1)
	{
	  if (load_mode_size < word_mode_size)
	    {
	      /* Rotate left first.  */
	      rtx sh = GEN_INT (BITS_PER_UNIT
				* (word_mode_size - load_mode_size));
	      do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
	      do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
	    }

	  if (cmp_bytes < word_mode_size)
	    {
	      /* Now clear right.  This plus the rotate can be
		 turned into a rldicr instruction.  */
	      HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
	      /* All ones shifted left by MB: keeps the high CMP_BYTES
		 bytes and zeroes the rest.  */
	      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
	      do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
	      do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
	    }
	}

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, next block if not found.
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, call strcmp/strncmp
	 3: compared requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      /* Pick the branch target for this chunk (used by the multi-byte
	 path below).  */
      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
	{
	  /* Branch to cleanup code, otherwise fall through to do
	     more compares.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	}
      else
	/* Branch to end and produce result of 0.  */
	dst_label = final_move_label;

      if (load_mode_size == 1)
	{
	  /* Special case for comparing just single byte.  */
	  if (equality_compare_rest)
	    {
	      /* Use subf./bne to branch to final_move_label if the
		 byte differs, otherwise fall through to the strncmp
		 call.  We must also check for a zero byte here as we
		 must not make the library call if this is the end of
		 the string.  */

	      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx cond = gen_reg_rtx (CCmode);
	      rtx diff_rtx = gen_rtx_MINUS (word_mode,
					    tmp_reg_src1, tmp_reg_src2);
	      rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
	      rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
						 lab_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::unlikely ());
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;

	      /* Check for zero byte here before fall through to
		 library call.  This catches the case where the
		 strings are equal and end in a zero byte at this
		 position.  */

	      rtx cond0 = gen_reg_rtx (CCmode);
	      emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
						      const0_rtx));

	      rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);

	      rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
						 lab_ref, pc_rtx);
	      rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
	      add_reg_br_prob_note (j0, profile_probability::unlikely ());
	      JUMP_LABEL (j0) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	    }
	  else
	    {
	      /* This is the last byte to be compared so we can use
		 subf to compute the final result and branch
		 unconditionally to final_move_label.  */

	      do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	      emit_barrier ();
	    }
	}
      else
	{
	  /* Multi-byte chunk: the cmpb/cmpb/orc. part of the sequence
	     described at the top of the loop.  */
	  rtx cmpb_zero = gen_reg_rtx (word_mode);
	  rtx cmpb_diff = gen_reg_rtx (word_mode);
	  rtx zero_reg = gen_reg_rtx (word_mode);
	  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
	  rtx cond = gen_reg_rtx (CCmode);

	  emit_move_insn (zero_reg, GEN_INT (0));
	  do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
	  do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
	  /* RESULT_REG = ~cmpb_diff | cmpb_zero, with the condition
	     register set from that value.  */
	  rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
	  rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

	  rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

	  /* On the very last chunk with no follow-up library call, branch
	     (to final_move_label) when the value is zero; otherwise
	     branch (to the cleanup label) when it is nonzero.  */
	  rtx cmp_rtx;
	  if (remain == 0 && !equality_compare_rest)
	    cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					     lab_ref, pc_rtx);
	  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	  add_reg_br_prob_note (j, profile_probability::unlikely ());
	  JUMP_LABEL (j) = dst_label;
	  LABEL_NUSES (dst_label) += 1;
	}

      /* Advance past the bytes handled by this chunk.  */
      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  /* Hand the cleanup label (possibly created above, possibly still
     NULL) back to the caller.  */
  *p_cleanup_label = cleanup_label;
  return;
}
2360 
2361 /* Generate the final sequence that identifies the differing
2362    byte and generates the final result, taking into account
2363    zero bytes:
2364 
2365    cntlzd            get bit of first zero/diff byte
2366    addi              convert for rldcl use
2367    rldcl rldcl       extract diff/zero byte
2368    subf              subtract for final result
2369 
2370    STR1 is the reg rtx for data from string 1.
2371    STR2 is the reg rtx for data from string 2.
2372    RESULT is the reg rtx for the comparison result.  */
2373 
2374 static void
emit_final_str_compare_gpr(rtx str1,rtx str2,rtx result)2375 emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
2376 {
2377   machine_mode m = GET_MODE (str1);
2378   rtx rot_amt = gen_reg_rtx (m);
2379 
2380   rtx rot1_1 = gen_reg_rtx (m);
2381   rtx rot1_2 = gen_reg_rtx (m);
2382   rtx rot2_1 = gen_reg_rtx (m);
2383   rtx rot2_2 = gen_reg_rtx (m);
2384 
2385   if (m == SImode)
2386     {
2387       emit_insn (gen_clzsi2 (rot_amt, result));
2388       emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
2389       emit_insn (gen_rotlsi3 (rot1_1, str1,
2390 			      gen_lowpart (SImode, rot_amt)));
2391       emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2392       emit_insn (gen_rotlsi3 (rot2_1, str2,
2393 			      gen_lowpart (SImode, rot_amt)));
2394       emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2395       emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
2396     }
2397   else if (m == DImode)
2398     {
2399       emit_insn (gen_clzdi2 (rot_amt, result));
2400       emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
2401       emit_insn (gen_rotldi3 (rot1_1, str1,
2402 			      gen_lowpart (SImode, rot_amt)));
2403       emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2404       emit_insn (gen_rotldi3 (rot2_1, str2,
2405 			      gen_lowpart (SImode, rot_amt)));
2406       emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2407       emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
2408     }
2409   else
2410     gcc_unreachable ();
2411 
2412   return;
2413 }
2414 
2415 /* Expand a string compare operation with length, and return
2416    true if successful.  Return false if we should let the
2417    compiler generate normal code, probably a strncmp call.
2418 
2419    OPERANDS[0] is the target (result).
2420    OPERANDS[1] is the first source.
2421    OPERANDS[2] is the second source.
2422    If NO_LENGTH is zero, then:
2423    OPERANDS[3] is the length.
2424    OPERANDS[4] is the alignment in bytes.
2425    If NO_LENGTH is nonzero, then:
2426    OPERANDS[3] is the alignment in bytes.  */
2427 bool
expand_strn_compare(rtx operands[],int no_length)2428 expand_strn_compare (rtx operands[], int no_length)
2429 {
2430   rtx target = operands[0];
2431   rtx orig_src1 = operands[1];
2432   rtx orig_src2 = operands[2];
2433   rtx bytes_rtx, align_rtx;
2434   if (no_length)
2435     {
2436       bytes_rtx = NULL;
2437       align_rtx = operands[3];
2438     }
2439   else
2440     {
2441       bytes_rtx = operands[3];
2442       align_rtx = operands[4];
2443     }
2444 
2445   rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
2446   rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
2447 
2448   /* If we have a length, it must be constant.  This simplifies things
2449      a bit as we don't have to generate code to check if we've exceeded
2450      the length.  Later this could be expanded to handle this case.  */
2451   if (!no_length && !CONST_INT_P (bytes_rtx))
2452     return false;
2453 
2454   /* This must be a fixed size alignment.  */
2455   if (!CONST_INT_P (align_rtx))
2456     return false;
2457 
2458   unsigned int base_align = UINTVAL (align_rtx);
2459   unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
2460   unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
2461 
2462   /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
2463   if (targetm.slow_unaligned_access (word_mode, align1)
2464       || targetm.slow_unaligned_access (word_mode, align2))
2465     return false;
2466 
2467   gcc_assert (GET_MODE (target) == SImode);
2468 
2469   unsigned int required_align = 8;
2470 
2471   unsigned HOST_WIDE_INT offset = 0;
2472   unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
2473   unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
2474 
2475   if (no_length)
2476     bytes = rs6000_string_compare_inline_limit;
2477   else
2478     bytes = UINTVAL (bytes_rtx);
2479 
2480   /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
2481      least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
2482      at least POWER8.  That way we can rely on overlapping compares to
2483      do the final comparison of less than 16 bytes.  Also I do not
2484      want to deal with making this work for 32 bits.  In addition, we
2485      have to make sure that we have at least P8_VECTOR (we don't allow
2486      P9_VECTOR without P8_VECTOR).  */
2487   int use_vec = (bytes >= 16 && !TARGET_32BIT
2488 		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
2489 
2490   if (use_vec)
2491     required_align = 16;
2492 
2493   machine_mode load_mode;
2494   rtx tmp_reg_src1, tmp_reg_src2;
2495   if (use_vec)
2496     {
2497       load_mode = V16QImode;
2498       tmp_reg_src1 = gen_reg_rtx (V16QImode);
2499       tmp_reg_src2 = gen_reg_rtx (V16QImode);
2500     }
2501   else
2502     {
2503       load_mode = select_block_compare_mode (0, bytes, base_align);
2504       tmp_reg_src1 = gen_reg_rtx (word_mode);
2505       tmp_reg_src2 = gen_reg_rtx (word_mode);
2506     }
2507 
2508   compare_length = rs6000_string_compare_inline_limit;
2509 
2510   /* If we have equality at the end of the last compare and we have not
2511      found the end of the string, we need to call strcmp/strncmp to
2512      compare the remainder.  */
2513   bool equality_compare_rest = false;
2514 
2515   if (no_length)
2516     {
2517       bytes = compare_length;
2518       equality_compare_rest = true;
2519     }
2520   else
2521     {
2522       if (bytes <= compare_length)
2523 	compare_length = bytes;
2524       else
2525 	equality_compare_rest = true;
2526     }
2527 
2528   rtx result_reg = gen_reg_rtx (word_mode);
2529   rtx final_move_label = gen_label_rtx ();
2530   rtx final_label = gen_label_rtx ();
2531   rtx begin_compare_label = NULL;
2532 
2533   if (base_align < required_align)
2534     {
2535       /* Generate code that checks distance to 4k boundary for this case.  */
2536       begin_compare_label = gen_label_rtx ();
2537       rtx strncmp_label = gen_label_rtx ();
2538       rtx jmp;
2539 
2540       /* Strncmp for power8 in glibc does this:
2541 	 rldicl r8,r3,0,52
2542 	 cmpldi cr7,r8,4096-16
2543 	 bgt    cr7,L(pagecross) */
2544 
2545       /* Make sure that the length we use for the alignment test and
2546          the subsequent code generation are in agreement so we do not
2547          go past the length we tested for a 4k boundary crossing.  */
2548       unsigned HOST_WIDE_INT align_test = compare_length;
2549       if (align_test < required_align)
2550         {
2551           align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
2552           base_align = align_test;
2553         }
2554       else
2555         {
2556           align_test = ROUND_UP (align_test, required_align);
2557           base_align = required_align;
2558         }
2559 
2560       if (align1 < required_align)
2561         expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
2562       if (align2 < required_align)
2563         expand_strncmp_align_check (strncmp_label, src2_addr, align_test);
2564 
2565       /* Now generate the following sequence:
2566 	 - branch to begin_compare
2567 	 - strncmp_label
2568 	 - call to strncmp
2569 	 - branch to final_label
2570 	 - begin_compare_label */
2571 
2572       rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
2573       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
2574       JUMP_LABEL (jmp) = begin_compare_label;
2575       LABEL_NUSES (begin_compare_label) += 1;
2576       emit_barrier ();
2577 
2578       emit_label (strncmp_label);
2579 
2580       if (no_length)
2581 	{
2582 	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2583 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2584 				   target, LCT_NORMAL, GET_MODE (target),
2585 				   force_reg (Pmode, src1_addr), Pmode,
2586 				   force_reg (Pmode, src2_addr), Pmode);
2587 	}
2588       else
2589 	{
2590 	  /* -m32 -mpowerpc64 results in word_mode being DImode even
2591 	     though otherwise it is 32-bit.  The length arg to strncmp
2592 	     is a size_t which will be the same size as pointers.  */
2593 	  rtx len_rtx = gen_reg_rtx (Pmode);
2594 	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
2595 
2596 	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2597 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2598 				   target, LCT_NORMAL, GET_MODE (target),
2599 				   force_reg (Pmode, src1_addr), Pmode,
2600 				   force_reg (Pmode, src2_addr), Pmode,
2601 				   len_rtx, Pmode);
2602 	}
2603 
2604       rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2605       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2606       JUMP_LABEL (jmp) = final_label;
2607       LABEL_NUSES (final_label) += 1;
2608       emit_barrier ();
2609       emit_label (begin_compare_label);
2610     }
2611 
2612   rtx cleanup_label = NULL;
2613   rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;
2614 
2615   /* Generate a sequence of GPR or VEC/VSX instructions to compare out
2616      to the length specified.  */
2617   if (use_vec)
2618     {
2619       s1addr = gen_reg_rtx (Pmode);
2620       s2addr = gen_reg_rtx (Pmode);
2621       off_reg = gen_reg_rtx (Pmode);
2622       vec_result = gen_reg_rtx (load_mode);
2623       emit_move_insn (result_reg, GEN_INT (0));
2624       expand_cmp_vec_sequence (compare_length,
2625 			       orig_src1, orig_src2,
2626 			       s1addr, s2addr, off_reg,
2627 			       tmp_reg_src1, tmp_reg_src2,
2628 			       vec_result,
2629 			       equality_compare_rest,
2630 			       &cleanup_label, final_move_label, true);
2631     }
2632   else
2633     expand_strncmp_gpr_sequence (compare_length, base_align,
2634 				 orig_src1, orig_src2,
2635 				 tmp_reg_src1, tmp_reg_src2,
2636 				 result_reg,
2637 				 equality_compare_rest,
2638 				 &cleanup_label, final_move_label);
2639 
2640   offset = compare_length;
2641 
2642   if (equality_compare_rest)
2643     {
2644       /* Update pointers past what has been compared already.  */
2645       rtx src1 = force_reg (Pmode,
2646 			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
2647       rtx src2 = force_reg (Pmode,
2648 			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));
2649 
2650       /* Construct call to strcmp/strncmp to compare the rest of the string.  */
2651       if (no_length)
2652 	{
2653 	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2654 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2655 				   target, LCT_NORMAL, GET_MODE (target),
2656 				   src1, Pmode, src2, Pmode);
2657 	}
2658       else
2659 	{
2660 	  rtx len_rtx = gen_reg_rtx (Pmode);
2661 	  emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
2662 	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2663 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2664 				   target, LCT_NORMAL, GET_MODE (target),
2665 				   src1, Pmode, src2, Pmode, len_rtx, Pmode);
2666 	}
2667 
2668       rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2669       rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2670       JUMP_LABEL (jmp) = final_label;
2671       LABEL_NUSES (final_label) += 1;
2672       emit_barrier ();
2673     }
2674 
2675   if (cleanup_label)
2676     emit_label (cleanup_label);
2677 
2678   if (use_vec)
2679     emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
2680 			    s1addr, s2addr, orig_src1, orig_src2,
2681 			    off_reg, vec_result);
2682   else
2683     emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);
2684 
2685   emit_label (final_move_label);
2686   emit_insn (gen_movsi (target,
2687 			gen_lowpart (SImode, result_reg)));
2688   emit_label (final_label);
2689   return true;
2690 }
2691 
2692 /* Generate loads and stores for a move of v4si mode using lvx/stvx.
2693    This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
2694    keep combine from changing what instruction gets used.
2695 
2696    DEST is the destination for the data.
2697    SRC is the source of the data for the move.  */
2698 
2699 static rtx
gen_lvx_v4si_move(rtx dest,rtx src)2700 gen_lvx_v4si_move (rtx dest, rtx src)
2701 {
2702   gcc_assert (MEM_P (dest) ^ MEM_P (src));
2703   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
2704 
2705   if (MEM_P (dest))
2706     return gen_altivec_stvx_v4si_internal (dest, src);
2707   else
2708     return gen_altivec_lvx_v4si_internal (dest, src);
2709 }
2710 
2711 static rtx
gen_lxvl_stxvl_move(rtx dest,rtx src,int length)2712 gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
2713 {
2714   gcc_assert (MEM_P (dest) ^ MEM_P (src));
2715   gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
2716   gcc_assert (length <= 16);
2717 
2718   bool is_store = MEM_P (dest);
2719   rtx addr;
2720 
2721   /* If the address form is not a simple register, make it so.  */
2722   if (is_store)
2723     addr = XEXP (dest, 0);
2724   else
2725     addr = XEXP (src, 0);
2726 
2727   if (!REG_P (addr))
2728     addr = force_reg (Pmode, addr);
2729 
2730   rtx len = force_reg (DImode, gen_int_mode (length, DImode));
2731   if (is_store)
2732     return gen_stxvl (src, addr, len);
2733   else
2734     return gen_lxvl (dest, addr, len);
2735 }
2736 
2737 /* Expand a block move operation, and return 1 if successful.  Return 0
2738    if we should let the compiler generate normal code.
2739 
2740    operands[0] is the destination
2741    operands[1] is the source
2742    operands[2] is the length
2743    operands[3] is the alignment */
2744 
2745 #define MAX_MOVE_REG 4
2746 
2747 int
expand_block_move(rtx operands[],bool might_overlap)2748 expand_block_move (rtx operands[], bool might_overlap)
2749 {
2750   rtx orig_dest = operands[0];
2751   rtx orig_src	= operands[1];
2752   rtx bytes_rtx	= operands[2];
2753   rtx align_rtx = operands[3];
2754   int constp	= CONST_INT_P (bytes_rtx);
2755   int align;
2756   int bytes;
2757   int offset;
2758   int move_bytes;
2759   rtx loads[MAX_MOVE_REG];
2760   rtx stores[MAX_MOVE_REG];
2761   int num_reg = 0;
2762 
2763   /* If this is not a fixed size move, just call memcpy */
2764   if (! constp)
2765     return 0;
2766 
2767   /* This must be a fixed size alignment */
2768   gcc_assert (CONST_INT_P (align_rtx));
2769   align = INTVAL (align_rtx) * BITS_PER_UNIT;
2770 
2771   /* Anything to move? */
2772   bytes = INTVAL (bytes_rtx);
2773   if (bytes <= 0)
2774     return 1;
2775 
2776   if (bytes > rs6000_block_move_inline_limit)
2777     return 0;
2778 
2779   int orig_bytes = bytes;
2780   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
2781     {
2782       union {
2783 	rtx (*mov) (rtx, rtx);
2784 	rtx (*movlen) (rtx, rtx, int);
2785       } gen_func;
2786       machine_mode mode = BLKmode;
2787       rtx src, dest;
2788       bool move_with_length = false;
2789 
2790       /* Use OOmode for paired vsx load/store.  Use V2DI for single
2791 	 unaligned vsx load/store, for consistency with what other
2792 	 expansions (compare) already do, and so we can use lxvd2x on
2793 	 p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, VSX
2794 	 with length < 16 (if allowed), then gpr load/store.  */
2795 
2796       if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
2797 	  && TARGET_BLOCK_OPS_VECTOR_PAIR
2798 	  && bytes >= 32
2799 	  && (align >= 256 || !STRICT_ALIGNMENT))
2800 	{
2801 	  move_bytes = 32;
2802 	  mode = OOmode;
2803 	  gen_func.mov = gen_movoo;
2804 	}
2805       else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
2806 	       && VECTOR_MEM_VSX_P (V2DImode)
2807 	       && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
2808 	{
2809 	  move_bytes = 16;
2810 	  mode = V2DImode;
2811 	  gen_func.mov = gen_vsx_movv2di_64bit;
2812 	}
2813       else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
2814 	       && TARGET_POWER10 && bytes < 16
2815 	       && orig_bytes > 16
2816 	       && !(bytes == 1 || bytes == 2
2817 		    || bytes == 4 || bytes == 8)
2818 	       && (align >= 128 || !STRICT_ALIGNMENT))
2819 	{
2820 	  /* Only use lxvl/stxvl if it could replace multiple ordinary
2821 	     loads+stores.  Also don't use it unless we likely already
2822 	     did one vsx copy so we aren't mixing gpr and vsx.  */
2823 	  move_bytes = bytes;
2824 	  mode = V16QImode;
2825 	  gen_func.movlen = gen_lxvl_stxvl_move;
2826 	  move_with_length = true;
2827 	}
2828       else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
2829 	{
2830 	  move_bytes = 16;
2831 	  mode = V4SImode;
2832 	  gen_func.mov = gen_lvx_v4si_move;
2833 	}
2834       else if (bytes >= 8 && TARGET_POWERPC64
2835 	       && (align >= 64 || !STRICT_ALIGNMENT))
2836 	{
2837 	  move_bytes = 8;
2838 	  mode = DImode;
2839 	  gen_func.mov = gen_movdi;
2840 	  if (offset == 0 && align < 64)
2841 	    {
2842 	      rtx addr;
2843 
2844 	      /* If the address form is reg+offset with offset not a
2845 		 multiple of four, reload into reg indirect form here
2846 		 rather than waiting for reload.  This way we get one
2847 		 reload, not one per load and/or store.  */
2848 	      addr = XEXP (orig_dest, 0);
2849 	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2850 		  && CONST_INT_P (XEXP (addr, 1))
2851 		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2852 		{
2853 		  addr = copy_addr_to_reg (addr);
2854 		  orig_dest = replace_equiv_address (orig_dest, addr);
2855 		}
2856 	      addr = XEXP (orig_src, 0);
2857 	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2858 		  && CONST_INT_P (XEXP (addr, 1))
2859 		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2860 		{
2861 		  addr = copy_addr_to_reg (addr);
2862 		  orig_src = replace_equiv_address (orig_src, addr);
2863 		}
2864 	    }
2865 	}
2866       else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
2867 	{			/* move 4 bytes */
2868 	  move_bytes = 4;
2869 	  mode = SImode;
2870 	  gen_func.mov = gen_movsi;
2871 	}
2872       else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
2873 	{			/* move 2 bytes */
2874 	  move_bytes = 2;
2875 	  mode = HImode;
2876 	  gen_func.mov = gen_movhi;
2877 	}
2878       else /* move 1 byte at a time */
2879 	{
2880 	  move_bytes = 1;
2881 	  mode = QImode;
2882 	  gen_func.mov = gen_movqi;
2883 	}
2884 
2885       /* If we can't succeed in doing the move in one pass, we can't
2886 	 do it in the might_overlap case.  Bail out and return
2887 	 failure.  We test num_reg + 1 >= MAX_MOVE_REG here to check
2888 	 the same condition as the test of num_reg >= MAX_MOVE_REG
2889 	 that is done below after the increment of num_reg.  */
2890       if (might_overlap && num_reg + 1 >= MAX_MOVE_REG
2891 	  && bytes > move_bytes)
2892 	return 0;
2893 
2894       /* Mode is always set to something other than BLKmode by one of the
2895 	 cases of the if statement above.  */
2896       gcc_assert (mode != BLKmode);
2897 
2898       src = adjust_address (orig_src, mode, offset);
2899       dest = adjust_address (orig_dest, mode, offset);
2900 
2901       rtx tmp_reg = gen_reg_rtx (mode);
2902 
2903       if (move_with_length)
2904 	{
2905 	  loads[num_reg]    = (*gen_func.movlen) (tmp_reg, src, move_bytes);
2906 	  stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
2907 	}
2908       else
2909 	{
2910 	  loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
2911 	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
2912 	}
2913 
2914       /* Emit loads and stores saved up.  */
2915       if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
2916 	{
2917 	  int i;
2918 	  for (i = 0; i < num_reg; i++)
2919 	    emit_insn (loads[i]);
2920 	  for (i = 0; i < num_reg; i++)
2921 	    emit_insn (stores[i]);
2922 	  num_reg = 0;
2923 	}
2924 
2925     }
2926 
2927   return 1;
2928 }
2929