1 /* Subroutines used to expand string and block move, clear,
2 compare and other operations for PowerPC.
3 Copyright (C) 1991-2018 Free Software Foundation, Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "tm_p.h"
31 #include "ira.h"
32 #include "print-tree.h"
33 #include "varasm.h"
34 #include "explow.h"
35 #include "expr.h"
36 #include "output.h"
37 #include "target.h"
38 #include "profile-count.h"
39 #include "predict.h"
40
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx	= operands[1];
  rtx align_rtx = operands[3];
  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  /* Convert the byte alignment from the expander into bits.  */
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  /* clear_step is the widest single store we can use: 16B vector,
     8B doubleword, or 4B word, depending on target and alignment.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  /* Punt to a memset call when the inline expansion would be too big:
     3 stores when optimizing for size, 8 otherwise.  */
  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  /* For blocks of 32+ bytes, unaligned VSX stores are worthwhile even
     without 128-bit alignment, when the target does them efficiently.  */
  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

  /* Emit the widest store legal at each point; CLEAR_BYTES advances
     the offset and shrinks the remaining byte count each iteration.  */
  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      /* Store a zero of the chosen mode at the current offset.  */
      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
149
/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  /* Dispatch first on the width of the destination register, then on
     the width of the memory read.  On little-endian the bytes are
     swapped after loading so the comparisons below behave as if the
     data were big-endian (most significant byte first).  */
  switch (GET_MODE (reg))
    {
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  /* A single byte needs no swap; just zero-extend it.  */
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		/* Byte-swap into a temporary before extending.  */
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  /* Full-width load: swap directly into REG on LE, plain move
	     on BE.  */
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    default:
      /* Only SImode and DImode destination registers are supported.  */
      gcc_unreachable ();
      break;
    }
}
238
/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
   the largest allowable mode.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align, bool word_mode_ok)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  Rounding
     BYTES up to the alignment is safe because the block cannot end
     before the next ALIGN boundary without crossing a page.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  /* NOTE: the order of these tests matters; earlier cases take
     priority over the overlap and shift fallbacks below.  */
  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case were we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && offset >= UNITS_PER_WORD-bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* final fallback is do one byte */
  return QImode;
}
299
300 /* Compute the alignment of pointer+OFFSET where the original alignment
301 of pointer was BASE_ALIGN. */
302 static unsigned HOST_WIDE_INT
compute_current_alignment(unsigned HOST_WIDE_INT base_align,unsigned HOST_WIDE_INT offset)303 compute_current_alignment (unsigned HOST_WIDE_INT base_align,
304 unsigned HOST_WIDE_INT offset)
305 {
306 if (offset == 0)
307 return base_align;
308 return MIN (base_align, offset & -offset);
309 }
310
311 /* Prepare address and then do a load.
312
313 MODE is the mode to use for the load.
314 DEST is the destination register for the data.
315 ADDR is the address to be loaded.
316 ORIG_ADDR is the original address expression. */
317 static void
do_load_for_compare_from_addr(machine_mode mode,rtx dest,rtx addr,rtx orig_addr)318 do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
319 rtx orig_addr)
320 {
321 rtx mem = gen_rtx_MEM (mode, addr);
322 MEM_COPY_ATTRIBUTES (mem, orig_addr);
323 set_mem_size (mem, GET_MODE_SIZE (mode));
324 do_load_for_compare (dest, mem, mode);
325 return;
326 }
327
/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.
   BR_PROB is the estimated branch probability for the branch.

   If CR is null_rtx, a new register of CMPMODE is generated for the
   comparison result.
   If A and B are both null_rtx, then CR must not be null, and the
   compare is not generated so you can use this with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
	   rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob)
{
  /* Either both compare operands are given, or neither is and a
     pre-set condition register must be supplied instead.  */
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
	      || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  /* Emit the compare only when operands were supplied; otherwise CR
     was already set by an earlier (e.g. dot-form) instruction.  */
  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  /* Conditional jump to TRUE_LABEL; fall through otherwise.  */
  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, br_prob);
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}
368
/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison rtx controlling the selection.
   SRC_T is the isel source if the condition is true.
   SRC_F is the isel source if the condition is false.
   CR is the condition code register for the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  /* DImode destinations use the 64-bit pattern; anything else falls
     through to the 32-bit one.  */
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
}
383
384 /* Emit a subtract of the proper mode for DEST.
385
386 DEST is the destination register for the subtract.
387 SRC1 is the first subtract input.
388 SRC2 is the second subtract input.
389
390 Computes DEST = SRC1-SRC2. */
391 static void
do_sub3(rtx dest,rtx src1,rtx src2)392 do_sub3 (rtx dest, rtx src1, rtx src2)
393 {
394 if (GET_MODE (dest) == DImode)
395 emit_insn (gen_subdi3 (dest, src1, src2));
396 else
397 emit_insn (gen_subsi3 (dest, src1, src2));
398 }
399
400 /* Emit an add of the proper mode for DEST.
401
402 DEST is the destination register for the add.
403 SRC1 is the first add input.
404 SRC2 is the second add input.
405
406 Computes DEST = SRC1+SRC2. */
407 static void
do_add3(rtx dest,rtx src1,rtx src2)408 do_add3 (rtx dest, rtx src1, rtx src2)
409 {
410 if (GET_MODE (dest) == DImode)
411 emit_insn (gen_adddi3 (dest, src1, src2));
412 else
413 emit_insn (gen_addsi3 (dest, src1, src2));
414 }
415
416 /* Emit an and of the proper mode for DEST.
417
418 DEST is the destination register for the and.
419 SRC1 is the first and input.
420 SRC2 is the second and input.
421
422 Computes DEST = SRC1&SRC2. */
423 static void
do_and3(rtx dest,rtx src1,rtx src2)424 do_and3 (rtx dest, rtx src1, rtx src2)
425 {
426 if (GET_MODE (dest) == DImode)
427 emit_insn (gen_anddi3 (dest, src1, src2));
428 else
429 emit_insn (gen_andsi3 (dest, src1, src2));
430 }
431
432 /* Emit an cmpb of the proper mode for DEST.
433
434 DEST is the destination register for the cmpb.
435 SRC1 is the first input.
436 SRC2 is the second input.
437
438 Computes cmpb of SRC1, SRC2. */
439 static void
do_cmpb3(rtx dest,rtx src1,rtx src2)440 do_cmpb3 (rtx dest, rtx src1, rtx src2)
441 {
442 if (GET_MODE (dest) == DImode)
443 emit_insn (gen_cmpbdi3 (dest, src1, src2));
444 else
445 emit_insn (gen_cmpbsi3 (dest, src1, src2));
446 }
447
448 /* Emit a rotl of the proper mode for DEST.
449
450 DEST is the destination register for the and.
451 SRC1 is the first and input.
452 SRC2 is the second and input.
453
454 Computes DEST = SRC1 rotated left by SRC2. */
455 static void
do_rotl3(rtx dest,rtx src1,rtx src2)456 do_rotl3 (rtx dest, rtx src1, rtx src2)
457 {
458 if (GET_MODE (dest) == DImode)
459 emit_insn (gen_rotldi3 (dest, src1, src2));
460 else
461 emit_insn (gen_rotlsi3 (dest, src1, src2));
462 }
463
/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  /* Load a full LOAD_MODE chunk from each source, then compute how
     many bytes of it are past the end of the block:
     load_mode_size - cmp_rem.  */
  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  /* Convert the excess byte count to a bit count and shift the
     unwanted low-order bytes off both loaded values, so only the
     bytes inside the block take part in the compare.  */
  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9 path: carry-setting subtract leaves the difference in
	 DIFF for later conversion to the memcmp-style result.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
518
/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			HOST_WIDE_INT bytes_rem, rtx diff,
			rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  /* How far to back up so a full LOAD_MODE read ends exactly at the
     end of the block (only meaningful when ISCONST).  */
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	/* Known length: back up by the constant adjustment.  */
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  /* Runtime length: adjustment is cmp_rem - load_mode_size,
	     computed into a register.  */
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      /* Constant length exactly LOAD_MODE wide: no adjustment.  */
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9 path: carry-setting subtract leaves the difference in
	 DIFF for later conversion to the memcmp-style result.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
584
585 /* Expand a block compare operation using loop code, and return true
586 if successful. Return false if we should let the compiler generate
587 normal code, probably a memcmp call.
588
589 OPERANDS[0] is the target (result).
590 OPERANDS[1] is the first source.
591 OPERANDS[2] is the second source.
592 OPERANDS[3] is the length.
593 OPERANDS[4] is the alignment. */
594 bool
expand_compare_loop(rtx operands[])595 expand_compare_loop (rtx operands[])
596 {
597 rtx target = operands[0];
598 rtx orig_src1 = operands[1];
599 rtx orig_src2 = operands[2];
600 rtx bytes_rtx = operands[3];
601 rtx align_rtx = operands[4];
602
603 /* This case is complicated to handle because the subtract
604 with carry instructions do not generate the 64-bit
605 carry and so we must emit code to calculate it ourselves.
606 We choose not to implement this yet. */
607 if (TARGET_32BIT && TARGET_POWERPC64)
608 return false;
609
610 /* Allow non-const length. */
611 int bytes_is_const = CONST_INT_P (bytes_rtx);
612
613 /* This must be a fixed size alignment. */
614 if (!CONST_INT_P (align_rtx))
615 return false;
616
617 HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
618 HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
619 HOST_WIDE_INT minalign = MIN (align1, align2);
620
621 bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
622
623 gcc_assert (GET_MODE (target) == SImode);
624
625 /* Anything to move? */
626 HOST_WIDE_INT bytes = 0;
627 if (bytes_is_const)
628 bytes = INTVAL (bytes_rtx);
629
630 if (bytes_is_const && bytes == 0)
631 return true;
632
633 /* Limit the amount we compare, if known statically. */
634 HOST_WIDE_INT max_bytes;
635 switch (rs6000_tune)
636 {
637 case PROCESSOR_POWER7:
638 if (!bytes_is_const)
639 if (minalign < 8)
640 max_bytes = 0;
641 else
642 max_bytes = 128;
643 else
644 if (minalign < 8)
645 max_bytes = 32;
646 else
647 max_bytes = 128;
648 break;
649 case PROCESSOR_POWER8:
650 if (!bytes_is_const)
651 max_bytes = 0;
652 else
653 if (minalign < 8)
654 max_bytes = 128;
655 else
656 max_bytes = 64;
657 break;
658 case PROCESSOR_POWER9:
659 if (bytes_is_const)
660 max_bytes = 191;
661 else
662 max_bytes = 0;
663 break;
664 default:
665 max_bytes = 128;
666 }
667
668 /* Allow the option to override the default. */
669 if (rs6000_block_compare_inline_loop_limit >= 0)
670 max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;
671
672 if (max_bytes == 0)
673 return false;
674
675 rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */
676 rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. */
677 HOST_WIDE_INT niter;
678 rtx iter = gen_reg_rtx (word_mode);
679 rtx iv1 = gen_reg_rtx (word_mode);
680 rtx iv2 = gen_reg_rtx (word_mode);
681 rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */
682 rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */
683 rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */
684 rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */
685
686 /* Strip unneeded subreg from length if there is one. */
687 if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
688 bytes_rtx = SUBREG_REG (bytes_rtx);
689 /* Extend bytes_rtx to word_mode if needed. But, we expect only to
690 maybe have to deal with the case were bytes_rtx is SImode and
691 word_mode is DImode. */
692 if (!bytes_is_const)
693 {
694 if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
695 /* Do not expect length longer than word_mode. */
696 return false;
697 else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
698 {
699 bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
700 bytes_rtx = force_reg (word_mode,
701 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
702 bytes_rtx));
703 }
704 else
705 /* Make sure it's in a register before we get started. */
706 bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
707 }
708
709 machine_mode load_mode = word_mode;
710 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
711
712 /* Number of bytes per iteration of the unrolled loop. */
713 HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
714 /* max iters and bytes compared in the loop. */
715 HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
716 HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
717 int l2lb = floor_log2 (loop_bytes);
718
719 if (bytes_is_const && (max_bytes < load_mode_size
720 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
721 return false;
722
723 bool no_remainder_code = false;
724 rtx final_label = gen_label_rtx ();
725 rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
726 rtx diff_label = gen_label_rtx ();
727 rtx library_call_label = NULL;
728 rtx cleanup_label = gen_label_rtx ();
729
730 rtx cr;
731
732 rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
733 rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
734
735 /* Difference found is stored here before jump to diff_label. */
736 rtx diff = gen_reg_rtx (word_mode);
737 rtx_insn *j;
738
739 /* Example of generated code for 35 bytes aligned 1 byte.
740
741 mtctr 8
742 li 6,0
743 li 5,8
744 .L13:
745 ldbrx 7,3,6
746 ldbrx 9,10,6
747 ldbrx 0,3,5
748 ldbrx 4,10,5
749 addi 6,6,16
750 addi 5,5,16
751 subfc. 9,9,7
752 bne 0,.L10
753 subfc. 9,4,0
754 bdnzt 2,.L13
755 bne 0,.L10
756 add 3,3,6
757 add 10,10,6
758 addi 9,3,-5
759 ldbrx 7,0,9
760 addi 9,10,-5
761 ldbrx 9,0,9
762 subfc 9,9,7
763 .p2align 4,,15
764 .L10:
765 popcntd 9,9
766 subfe 10,10,10
767 or 9,9,10
768
769 Compiled with -fno-reorder-blocks for clarity. */
770
771 /* Structure of what we're going to do:
772 Two separate lengths: what we will compare before bailing to library
773 call (max_bytes), and the total length to be checked.
774 if length <= 16, branch to linear cleanup code starting with
775 remainder length check (length not known at compile time)
776 set up 2 iv's and load count reg, compute remainder length
777 unrollx2 compare loop
778 if loop exit due to a difference, branch to difference handling code
779 if remainder length < 8, branch to final cleanup compare
780 load and compare 8B
781 final cleanup comparison (depends on alignment and length)
782 load 8B, shift off bytes past length, compare
783 load 8B ending at last byte and compare
784 load/compare 1 byte at a time (short block abutting 4k boundary)
785 difference handling, 64->32 conversion
786 final result
787 branch around memcmp call
788 memcmp library call
789 */
790
791 /* If bytes is not const, compare length and branch directly
792 to the cleanup code that can handle 0-16 bytes if length
793 is >= 16. Stash away bytes-max_bytes for the library call. */
794 if (bytes_is_const)
795 {
796 /* These need to be set for some of the places we may jump to. */
797 if (bytes > max_bytes)
798 {
799 no_remainder_code = true;
800 niter = max_loop_iter;
801 library_call_label = gen_label_rtx ();
802 }
803 else
804 {
805 niter = bytes / loop_bytes;
806 }
807 emit_move_insn (iter, GEN_INT (niter));
808 emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
809 emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
810 }
811 else
812 {
813 library_call_label = gen_label_rtx ();
814
815 /* If we go to the cleanup code, it expects length to be in cmp_rem. */
816 emit_move_insn (cmp_rem, bytes_rtx);
817
818 /* Check for > max_bytes bytes. We want to bail out as quickly as
819 possible if we have to go over to memcmp. */
820 do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
821 NULL_RTX, library_call_label, profile_probability::even ());
822
823 /* Check for < loop_bytes bytes. */
824 do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
825 NULL_RTX, cleanup_label, profile_probability::even ());
826
827 /* Loop compare bytes and iterations if bytes>max_bytes. */
828 rtx mb_reg = gen_reg_rtx (word_mode);
829 emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
830 rtx mi_reg = gen_reg_rtx (word_mode);
831 emit_move_insn (mi_reg, GEN_INT (max_loop_iter));
832
833 /* Compute number of loop iterations if bytes <= max_bytes. */
834 if (word_mode == DImode)
835 emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
836 else
837 emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));
838
839 /* Compute bytes to compare in loop if bytes <= max_bytes. */
840 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
841 if (word_mode == DImode)
842 {
843 emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
844 }
845 else
846 {
847 emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
848 }
849
850 /* Check for bytes <= max_bytes. */
851 if (TARGET_ISEL)
852 {
853 /* P9 has fast isel so we use one compare and two isel. */
854 cr = gen_reg_rtx (CCmode);
855 rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
856 GEN_INT (max_bytes));
857 emit_move_insn (cr, compare_rtx);
858 rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
859 do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
860 do_isel (iter, cmp_rtx, iter, mi_reg, cr);
861 }
862 else
863 {
864 rtx lab_after = gen_label_rtx ();
865 do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
866 NULL_RTX, lab_after, profile_probability::even ());
867 emit_move_insn (loop_cmp, mb_reg);
868 emit_move_insn (iter, mi_reg);
869 emit_label (lab_after);
870 }
871
872 /* Now compute remainder bytes which isn't used until after the loop. */
873 do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
874 }
875
876 rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */
877 /* For p9 we need to have just one of these as multiple places define
878 it and it gets used by the setb at the end. */
879 if (TARGET_P9_MISC)
880 dcond = gen_reg_rtx (CCUNSmode);
881
882 if (!bytes_is_const || bytes >= loop_bytes)
883 {
884 /* It should not be possible to come here if remaining bytes is
885 < 16 in the runtime case either. Compute number of loop
886 iterations. We compare 2*word_mode per iteration so 16B for
887 64-bit code and 8B for 32-bit. Set up two induction
888 variables and load count register. */
889
890 /* HACK ALERT: create hard reg for CTR here. If we just use a
891 pseudo, cse will get rid of it and then the allocator will
892 see it used in the lshr above and won't give us ctr. */
893 rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
894 emit_move_insn (ctr, iter);
895 emit_move_insn (diff, GEN_INT (0));
896 emit_move_insn (iv1, GEN_INT (0));
897 emit_move_insn (iv2, GEN_INT (load_mode_size));
898
899 /* inner loop to compare 2*word_mode */
900 rtx loop_top_label = gen_label_rtx ();
901 emit_label (loop_top_label);
902
903 rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
904 rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
905
906 do_load_for_compare_from_addr (load_mode, d1_1,
907 src1_ix1, orig_src1);
908 do_load_for_compare_from_addr (load_mode, d2_1,
909 src2_ix1, orig_src2);
910 do_add3 (iv1, iv1, GEN_INT (loop_bytes));
911
912 rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
913 rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
914
915 do_load_for_compare_from_addr (load_mode, d1_2,
916 src1_ix2, orig_src1);
917 do_load_for_compare_from_addr (load_mode, d2_2,
918 src2_ix2, orig_src2);
919 do_add3 (iv2, iv2, GEN_INT (loop_bytes));
920
921 if (TARGET_P9_MISC)
922 {
923 /* Generate a compare, and convert with a setb later. */
924 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
925 emit_insn (gen_rtx_SET (dcond, cmp));
926 }
927 else
928 {
929 dcond = gen_reg_rtx (CCmode);
930 if (word_mode == DImode)
931 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
932 else
933 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
934 }
935
936 do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
937 dcond, diff_label, profile_probability::unlikely ());
938
939 if (TARGET_P9_MISC)
940 {
941 /* Generate a compare, and convert with a setb later. */
942 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
943 emit_insn (gen_rtx_SET (dcond, cmp));
944 }
945 else
946 {
947 dcond = gen_reg_rtx (CCmode);
948 if (word_mode == DImode)
949 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
950 else
951 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
952 }
953
954 rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
955 if (TARGET_64BIT)
956 j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
957 eqrtx, dcond));
958 else
959 j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
960 eqrtx, dcond));
961 add_reg_br_prob_note (j, profile_probability::likely ());
962 JUMP_LABEL (j) = loop_top_label;
963 LABEL_NUSES (loop_top_label) += 1;
964 }
965
966 HOST_WIDE_INT bytes_remaining = 0;
967 if (bytes_is_const)
968 bytes_remaining = (bytes % loop_bytes);
969
970 /* If diff is nonzero, branch to difference handling
971 code. If we exit here with a nonzero diff, it is
972 because the second word differed. */
973 if (TARGET_P9_MISC)
974 do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
975 diff_label, profile_probability::unlikely ());
976 else
977 do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
978 diff_label, profile_probability::unlikely ());
979
980 if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
981 {
982 /* If the length is known at compile time, then we will always
983 have a remainder to go to the library call with. */
984 rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
985 j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
986 JUMP_LABEL (j) = library_call_label;
987 LABEL_NUSES (library_call_label) += 1;
988 emit_barrier ();
989 }
990
991 if (bytes_is_const && bytes_remaining == 0)
992 {
993 /* No remainder and if we are here then diff is 0 so just return 0 */
994 if (TARGET_64BIT)
995 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
996 else
997 emit_move_insn (target, diff);
998 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
999 JUMP_LABEL (j) = final_label;
1000 LABEL_NUSES (final_label) += 1;
1001 emit_barrier ();
1002 }
1003 else if (!no_remainder_code)
1004 {
1005 /* Update addresses to point to the next word to examine. */
1006 do_add3 (src1_addr, src1_addr, iv1);
1007 do_add3 (src2_addr, src2_addr, iv1);
1008
1009 emit_label (cleanup_label);
1010
1011 if (!bytes_is_const)
1012 {
1013 /* If we're dealing with runtime length, we have to check if
1014 it's zero after the loop. When length is known at compile
1015 time the no-remainder condition is dealt with above. By
1016 doing this after cleanup_label, we also deal with the
1017 case where length is 0 at the start and we bypass the
1018 loop with a branch to cleanup_label. */
1019 emit_move_insn (target, const0_rtx);
1020 do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
1021 NULL_RTX, final_label, profile_probability::unlikely ());
1022 }
1023
1024 rtx final_cleanup = gen_label_rtx ();
1025 rtx cmp_rem_before = gen_reg_rtx (word_mode);
1026 /* Compare one more word_mode chunk if needed. */
1027 if (!bytes_is_const || bytes_remaining >= load_mode_size)
1028 {
1029 /* If remainder length < word length, branch to final
1030 cleanup compare. */
1031
1032 if (!bytes_is_const)
1033 {
1034 do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
1035 NULL_RTX, final_cleanup, profile_probability::even ());
1036 }
1037
1038 /* load and compare 8B */
1039 do_load_for_compare_from_addr (load_mode, d1_1,
1040 src1_addr, orig_src1);
1041 do_load_for_compare_from_addr (load_mode, d2_1,
1042 src2_addr, orig_src2);
1043
1044 /* Compare the word, see if we need to do the last partial. */
1045 if (TARGET_P9_MISC)
1046 {
1047 /* Generate a compare, and convert with a setb later. */
1048 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
1049 emit_insn (gen_rtx_SET (dcond, cmp));
1050 }
1051 else
1052 {
1053 dcond = gen_reg_rtx (CCmode);
1054 if (word_mode == DImode)
1055 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1056 else
1057 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1058 }
1059
1060 do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
1061 dcond, diff_label, profile_probability::even ());
1062
1063 do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
1064 do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
1065 emit_move_insn (cmp_rem_before, cmp_rem);
1066 do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
1067 if (bytes_is_const)
1068 bytes_remaining -= load_mode_size;
1069 else
1070 /* See if remaining length is now zero. We previously set
1071 target to 0 so we can just jump to the end. */
1072 do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
1073 final_label, profile_probability::unlikely ());
1074 }
1075
1076 /* Cases:
1077 bytes_is_const
1078 We can always shift back to do an overlapping compare
1079 of the last chunk because we know length >= 8.
1080
1081 !bytes_is_const
1082 align>=load_mode_size
1083 Read word_mode and mask
1084 align<load_mode_size
1085 avoid stepping past end
1086
1087 Three strategies:
1088 * decrement address and do overlapping compare
1089 * read word_mode and mask
1090 * carefully avoid crossing 4k boundary
1091 */
1092
1093 if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
1094 && align1 >= load_mode_size && align2 >= load_mode_size)
1095 {
1096 /* Alignment is larger than word_mode so we do not need to be
1097 concerned with extra page crossings. But, we do not know
1098 that the length is larger than load_mode_size so we might
1099 end up compareing against data before the block if we try
1100 an overlapping compare. Also we use this on P7 for fixed length
1101 remainder because P7 doesn't like overlapping unaligned.
1102 Strategy: load 8B, shift off bytes past length, and compare. */
1103 emit_label (final_cleanup);
1104 do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1105 src1_addr, src2_addr, orig_src1, orig_src2);
1106 }
1107 else if (bytes_remaining && bytes_is_const)
1108 {
1109 /* We do not do loop expand if length < 32 so we know at the
1110 end we can do an overlapping compare.
1111 Strategy: shift address back and do word_mode load that
1112 ends at the end of the block. */
1113 emit_label (final_cleanup);
1114 do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
1115 cmp_rem, dcond, src1_addr, src2_addr,
1116 orig_src1, orig_src2);
1117 }
1118 else if (!bytes_is_const)
1119 {
1120 rtx handle4k_label = gen_label_rtx ();
1121 rtx nonconst_overlap = gen_label_rtx ();
1122 emit_label (nonconst_overlap);
1123
1124 /* Here we have to handle the case where whe have runtime
1125 length which may be too short for overlap compare, and
1126 alignment is not at least load_mode_size so we have to
1127 tread carefully to avoid stepping across 4k boundaries. */
1128
1129 /* If the length after the loop was larger than word_mode
1130 size, we can just do an overlapping compare and we're
1131 done. We fall through to this code from the word_mode
1132 compare that preceeds this. */
1133 do_overlap_load_compare (load_mode, false, 0, diff,
1134 cmp_rem, dcond, src1_addr, src2_addr,
1135 orig_src1, orig_src2);
1136
1137 rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
1138 j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1139 JUMP_LABEL (j) = diff_label;
1140 LABEL_NUSES (diff_label) += 1;
1141 emit_barrier ();
1142
1143 /* If we couldn't do the overlap compare we have to be more
1144 careful of the 4k boundary. Test to see if either
1145 address is less than word_mode_size away from a 4k
1146 boundary. If not, then we can do a load/shift/compare
1147 and we are done. We come to this code if length was less
1148 than word_mode_size. */
1149
1150 emit_label (final_cleanup);
1151
1152 /* We can still avoid the slow case if the length was larger
1153 than one loop iteration, in which case go do the overlap
1154 load compare path. */
1155 do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
1156 NULL_RTX, nonconst_overlap, profile_probability::even ());
1157
1158 rtx rem4k = gen_reg_rtx (word_mode);
1159 rtx dist1 = gen_reg_rtx (word_mode);
1160 rtx dist2 = gen_reg_rtx (word_mode);
1161 do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
1162 if (word_mode == SImode)
1163 emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
1164 else
1165 emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
1166 do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
1167 handle4k_label, profile_probability::very_unlikely ());
1168 if (word_mode == SImode)
1169 emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
1170 else
1171 emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
1172 do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
1173 handle4k_label, profile_probability::very_unlikely ());
1174
1175 /* We don't have a 4k boundary to deal with, so do
1176 a load/shift/compare and jump to diff. */
1177
1178 do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1179 src1_addr, src2_addr, orig_src1, orig_src2);
1180
1181 j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1182 JUMP_LABEL (j) = diff_label;
1183 LABEL_NUSES (diff_label) += 1;
1184 emit_barrier ();
1185
1186 /* Finally in the unlikely case we are inching up to a
1187 4k boundary we use a compact lbzx/compare loop to do
1188 it a byte at a time. */
1189
1190 emit_label (handle4k_label);
1191
1192 rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
1193 emit_move_insn (ctr, cmp_rem);
1194 rtx ixreg = gen_reg_rtx (Pmode);
1195 emit_move_insn (ixreg, const0_rtx);
1196
1197 rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
1198 rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
1199 rtx d1 = gen_reg_rtx (word_mode);
1200 rtx d2 = gen_reg_rtx (word_mode);
1201
1202 rtx fc_loop = gen_label_rtx ();
1203 emit_label (fc_loop);
1204
1205 do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
1206 do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
1207
1208 do_add3 (ixreg, ixreg, const1_rtx);
1209
1210 rtx cond = gen_reg_rtx (CCmode);
1211 rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
1212 rs6000_emit_dot_insn (diff, subexpr, 2, cond);
1213
1214 rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
1215 if (TARGET_64BIT)
1216 j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
1217 eqrtx, cond));
1218 else
1219 j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
1220 eqrtx, cond));
1221 add_reg_br_prob_note (j, profile_probability::likely ());
1222 JUMP_LABEL (j) = fc_loop;
1223 LABEL_NUSES (fc_loop) += 1;
1224
1225 if (TARGET_64BIT)
1226 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1227 else
1228 emit_move_insn (target, diff);
1229
1230 /* Since we are comparing bytes, the difference can be used
1231 as the final result and we are done here. */
1232 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1233 JUMP_LABEL (j) = final_label;
1234 LABEL_NUSES (final_label) += 1;
1235 emit_barrier ();
1236 }
1237 }
1238
1239 emit_label (diff_label);
1240 /* difference handling, 64->32 conversion */
1241
1242 /* We need to produce DI result from sub, then convert to target SI
1243 while maintaining <0 / ==0 / >0 properties. This sequence works:
1244 subfc L,A,B
1245 subfe H,H,H
1246 popcntd L,L
1247 rldimi L,H,6,0
1248
1249 This is an alternate one Segher cooked up if somebody
1250 wants to expand this for something that doesn't have popcntd:
1251 subfc L,a,b
1252 subfe H,x,x
1253 addic t,L,-1
1254 subfe v,t,L
1255 or z,v,H
1256
1257 And finally, p9 can just do this:
1258 cmpld A,B
1259 setb r */
1260
1261 if (TARGET_P9_MISC)
1262 emit_insn (gen_setb_unsigned (target, dcond));
1263 else
1264 {
1265 if (TARGET_64BIT)
1266 {
1267 rtx tmp_reg_ca = gen_reg_rtx (DImode);
1268 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1269 emit_insn (gen_popcntddi2 (diff, diff));
1270 emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
1271 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1272 }
1273 else
1274 {
1275 rtx tmp_reg_ca = gen_reg_rtx (SImode);
1276 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1277 emit_insn (gen_popcntdsi2 (diff, diff));
1278 emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
1279 }
1280 }
1281
1282 if (library_call_label != NULL)
1283 {
1284 /* Branch around memcmp call. */
1285 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1286 JUMP_LABEL (j) = final_label;
1287 LABEL_NUSES (final_label) += 1;
1288 emit_barrier ();
1289
1290 /* Make memcmp library call. cmp_rem is the remaining bytes that
1291 were compared and cmp_rem is the expected amount to be compared
1292 by memcmp. If we don't find a difference in the loop compare, do
1293 the library call directly instead of doing a small compare just
1294 to get to an arbitrary boundary before calling it anyway.
1295 Also, update addresses to point to the next word to examine. */
1296 emit_label (library_call_label);
1297
1298 rtx len_rtx = gen_reg_rtx (word_mode);
1299 if (bytes_is_const)
1300 {
1301 emit_move_insn (len_rtx, cmp_rem);
1302 do_add3 (src1_addr, src1_addr, iv1);
1303 do_add3 (src2_addr, src2_addr, iv1);
1304 }
1305 else
1306 emit_move_insn (len_rtx, bytes_rtx);
1307
1308 tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
1309 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1310 target, LCT_NORMAL, GET_MODE (target),
1311 src1_addr, Pmode,
1312 src2_addr, Pmode,
1313 len_rtx, GET_MODE (len_rtx));
1314 }
1315
1316 /* emit final_label */
1317 emit_label (final_label);
1318 return true;
1319 }
1320
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  A zero-length compare trivially succeeds with
     no code emitted (result register is left as-is by the caller's
     expansion of memcmp (x, y, 0) == 0).  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
	  || (base_align == 2 && bytes > 32)))
    return false;

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
	     ldbrx 10,31,8
	     ldbrx 9,7,8
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,8
	     addi 5,11,8
	     ldbrx 10,0,9
	     ldbrx 9,0,5
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,16
	     lhbrx 10,0,9
	     addi 9,11,16
	     lhbrx 9,0,9
	     subf 9,9,10
	     b .L6488
	     .p2align 4,,15
	     .L6487: #convert_label
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10
	     .L6488: #final_label
	     extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  /* Emit one compare chunk per iteration, consuming cmp_bytes each time.  */
  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      /* Force both addresses into registers so the loads below are
	 simple reg+offset forms.  */
      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  Only reached on the P7 path above
	     where the load may have read past the requested bytes.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
	{
	  /* Target is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that needed 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
	  if (generate_6432_conversion && !final_label)
	    final_label = gen_label_rtx ();

	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      emit_insn (gen_movsi (target,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::unlikely ());
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      /* Last block: the plain subtract is the final result.  */
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (target,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL (j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block?  We need the 64->32
	     conversion even if target size == load_mode size because
	     the subtract generates one extra bit.  */
	  generate_6432_conversion = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		  /* Generate a compare, and convert with a setb later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		/* Generate a subfc. and use the longer
		   sequence for conversion.  */
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
		else
		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      add_reg_br_prob_note (j, profile_probability::likely ());
	      JUMP_LABEL (j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
	emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
	 while maintaining <0 / ==0 / >0 properties.  This sequence works:
	 subfc L,A,B
	 subfe H,H,H
	 popcntd L,L
	 rldimi L,H,6,0

	 This is an alternate one Segher cooked up if somebody
	 wants to expand this for something that doesn't have popcntd:
	 subfc L,a,b
	 subfe H,x,x
	 addic t,L,-1
	 subfe v,t,L
	 or z,v,H

	 And finally, p9 can just do this:
	 cmpld A,B
	 setb r */

      if (TARGET_P9_MISC)
	{
	  emit_insn (gen_setb_unsigned (target, cond));
	}
      else
	{
	  if (TARGET_64BIT)
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
	    }
	  else
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
	    }
	}
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}
1692
1693 /* Generate alignment check and branch code to set up for
1694 strncmp when we don't have DI alignment.
1695 STRNCMP_LABEL is the label to branch if there is a page crossing.
1696 SRC is the string pointer to be examined.
1697 BYTES is the max number of bytes to compare. */
1698 static void
expand_strncmp_align_check(rtx strncmp_label,rtx src,HOST_WIDE_INT bytes)1699 expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
1700 {
1701 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
1702 rtx src_check = copy_addr_to_reg (XEXP (src, 0));
1703 if (GET_MODE (src_check) == SImode)
1704 emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
1705 else
1706 emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
1707 rtx cond = gen_reg_rtx (CCmode);
1708 emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
1709 GEN_INT (4096 - bytes)));
1710
1711 rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);
1712
1713 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
1714 lab_ref, pc_rtx);
1715 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1716 add_reg_br_prob_note (j, profile_probability::unlikely ());
1717 JUMP_LABEL (j) = strncmp_label;
1718 LABEL_NUSES (strncmp_label) += 1;
1719 }
1720
1721 /* Expand a string compare operation with length, and return
1722 true if successful. Return false if we should let the
1723 compiler generate normal code, probably a strncmp call.
1724
1725 OPERANDS[0] is the target (result).
1726 OPERANDS[1] is the first source.
1727 OPERANDS[2] is the second source.
1728 If NO_LENGTH is zero, then:
1729 OPERANDS[3] is the length.
1730 OPERANDS[4] is the alignment in bytes.
1731 If NO_LENGTH is nonzero, then:
1732 OPERANDS[3] is the alignment in bytes. */
1733 bool
expand_strn_compare(rtx operands[],int no_length)1734 expand_strn_compare (rtx operands[], int no_length)
1735 {
1736 rtx target = operands[0];
1737 rtx orig_src1 = operands[1];
1738 rtx orig_src2 = operands[2];
1739 rtx bytes_rtx, align_rtx;
1740 if (no_length)
1741 {
1742 bytes_rtx = NULL;
1743 align_rtx = operands[3];
1744 }
1745 else
1746 {
1747 bytes_rtx = operands[3];
1748 align_rtx = operands[4];
1749 }
1750 unsigned HOST_WIDE_INT cmp_bytes = 0;
1751 rtx src1 = orig_src1;
1752 rtx src2 = orig_src2;
1753
1754 /* If we have a length, it must be constant. This simplifies things
1755 a bit as we don't have to generate code to check if we've exceeded
1756 the length. Later this could be expanded to handle this case. */
1757 if (!no_length && !CONST_INT_P (bytes_rtx))
1758 return false;
1759
1760 /* This must be a fixed size alignment. */
1761 if (!CONST_INT_P (align_rtx))
1762 return false;
1763
1764 unsigned int base_align = UINTVAL (align_rtx);
1765 int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
1766 int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
1767
1768 /* targetm.slow_unaligned_access -- don't do unaligned stuff. */
1769 if (targetm.slow_unaligned_access (word_mode, align1)
1770 || targetm.slow_unaligned_access (word_mode, align2))
1771 return false;
1772
1773 gcc_assert (GET_MODE (target) == SImode);
1774
1775 /* If we have an LE target without ldbrx and word_mode is DImode,
1776 then we must avoid using word_mode. */
1777 int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
1778 && word_mode == DImode);
1779
1780 unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
1781
1782 unsigned HOST_WIDE_INT offset = 0;
1783 unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */
1784 unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */
1785 if (no_length)
1786 /* Use this as a standin to determine the mode to use. */
1787 bytes = rs6000_string_compare_inline_limit * word_mode_size;
1788 else
1789 bytes = UINTVAL (bytes_rtx);
1790
1791 machine_mode load_mode =
1792 select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
1793 unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
1794 compare_length = rs6000_string_compare_inline_limit * load_mode_size;
1795
1796 /* If we have equality at the end of the last compare and we have not
1797 found the end of the string, we need to call strcmp/strncmp to
1798 compare the remainder. */
1799 bool equality_compare_rest = false;
1800
1801 if (no_length)
1802 {
1803 bytes = compare_length;
1804 equality_compare_rest = true;
1805 }
1806 else
1807 {
1808 if (bytes <= compare_length)
1809 compare_length = bytes;
1810 else
1811 equality_compare_rest = true;
1812 }
1813
1814 rtx result_reg = gen_reg_rtx (word_mode);
1815 rtx final_move_label = gen_label_rtx ();
1816 rtx final_label = gen_label_rtx ();
1817 rtx begin_compare_label = NULL;
1818
1819 if (base_align < 8)
1820 {
1821 /* Generate code that checks distance to 4k boundary for this case. */
1822 begin_compare_label = gen_label_rtx ();
1823 rtx strncmp_label = gen_label_rtx ();
1824 rtx jmp;
1825
1826 /* Strncmp for power8 in glibc does this:
1827 rldicl r8,r3,0,52
1828 cmpldi cr7,r8,4096-16
1829 bgt cr7,L(pagecross) */
1830
1831 /* Make sure that the length we use for the alignment test and
1832 the subsequent code generation are in agreement so we do not
1833 go past the length we tested for a 4k boundary crossing. */
1834 unsigned HOST_WIDE_INT align_test = compare_length;
1835 if (align_test < 8)
1836 {
1837 align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
1838 base_align = align_test;
1839 }
1840 else
1841 {
1842 align_test = ROUND_UP (align_test, 8);
1843 base_align = 8;
1844 }
1845
1846 if (align1 < 8)
1847 expand_strncmp_align_check (strncmp_label, src1, align_test);
1848 if (align2 < 8)
1849 expand_strncmp_align_check (strncmp_label, src2, align_test);
1850
1851 /* Now generate the following sequence:
1852 - branch to begin_compare
1853 - strncmp_label
1854 - call to strncmp
1855 - branch to final_label
1856 - begin_compare_label */
1857
1858 rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
1859 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
1860 JUMP_LABEL (jmp) = begin_compare_label;
1861 LABEL_NUSES (begin_compare_label) += 1;
1862 emit_barrier ();
1863
1864 emit_label (strncmp_label);
1865
1866 if (!REG_P (XEXP (src1, 0)))
1867 {
1868 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
1869 src1 = replace_equiv_address (src1, src1_reg);
1870 }
1871
1872 if (!REG_P (XEXP (src2, 0)))
1873 {
1874 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
1875 src2 = replace_equiv_address (src2, src2_reg);
1876 }
1877
1878 if (no_length)
1879 {
1880 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
1881 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1882 target, LCT_NORMAL, GET_MODE (target),
1883 force_reg (Pmode, XEXP (src1, 0)), Pmode,
1884 force_reg (Pmode, XEXP (src2, 0)), Pmode);
1885 }
1886 else
1887 {
1888 /* -m32 -mpowerpc64 results in word_mode being DImode even
1889 though otherwise it is 32-bit. The length arg to strncmp
1890 is a size_t which will be the same size as pointers. */
1891 rtx len_rtx = gen_reg_rtx (Pmode);
1892 emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
1893
1894 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
1895 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1896 target, LCT_NORMAL, GET_MODE (target),
1897 force_reg (Pmode, XEXP (src1, 0)), Pmode,
1898 force_reg (Pmode, XEXP (src2, 0)), Pmode,
1899 len_rtx, Pmode);
1900 }
1901
1902 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1903 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
1904 JUMP_LABEL (jmp) = final_label;
1905 LABEL_NUSES (final_label) += 1;
1906 emit_barrier ();
1907 emit_label (begin_compare_label);
1908 }
1909
1910 rtx cleanup_label = NULL;
1911 rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
1912 rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
1913
1914 rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
1915 rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
1916
1917 /* Generate sequence of ld/ldbrx, cmpb to compare out
1918 to the length specified. */
1919 unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
1920 while (bytes_to_compare > 0)
1921 {
1922 /* Compare sequence:
1923 check each 8B with: ld/ld/cmpb/cmpb/orc./bne
1924
1925 cleanup code at end:
1926 cntlzd get bit of first zero/diff byte
1927 subfic convert for rldcl use
1928 rldcl rldcl extract diff/zero byte
1929 subf subtract for final result
1930
1931 The last compare can branch around the cleanup code if the
1932 result is zero because the strings are exactly equal. */
1933 unsigned int align = compute_current_alignment (base_align, offset);
1934 if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1935 load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
1936 word_mode_ok);
1937 else
1938 load_mode = select_block_compare_mode (0, bytes_to_compare, align,
1939 word_mode_ok);
1940 load_mode_size = GET_MODE_SIZE (load_mode);
1941 if (bytes_to_compare >= load_mode_size)
1942 cmp_bytes = load_mode_size;
1943 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1944 {
1945 /* Move this load back so it doesn't go past the end.
1946 P8/P9 can do this efficiently. */
1947 unsigned int extra_bytes = load_mode_size - bytes_to_compare;
1948 cmp_bytes = bytes_to_compare;
1949 if (extra_bytes < offset)
1950 {
1951 offset -= extra_bytes;
1952 cmp_bytes = load_mode_size;
1953 bytes_to_compare = cmp_bytes;
1954 }
1955 }
1956 else
1957 /* P7 and earlier can't do the overlapping load trick fast,
1958 so this forces a non-overlapping load and a shift to get
1959 rid of the extra bytes. */
1960 cmp_bytes = bytes_to_compare;
1961
1962 rtx offset_rtx;
1963 if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
1964 offset_rtx = GEN_INT (offset);
1965 else
1966 {
1967 offset_rtx = gen_reg_rtx (Pmode);
1968 emit_move_insn (offset_rtx, GEN_INT (offset));
1969 }
1970 rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
1971 rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);
1972 do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
1973 do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);
1974
1975 /* We must always left-align the data we read, and
1976 clear any bytes to the right that are beyond the string.
1977 Otherwise the cmpb sequence won't produce the correct
1978 results. However if there is only one byte left, we
1979 can just subtract to get the final result so the shifts
1980 and clears are not needed. */
1981
1982 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
1983
1984 /* Loading just a single byte is a special case. If we are
1985 loading more than that, we have to check whether we are
1986 looking at the entire chunk of data. If not, rotate left and
1987 clear right so that bytes we aren't supposed to look at are
1988 zeroed, and the first byte we are supposed to compare is
1989 leftmost. */
1990
1991 if (load_mode_size != 1)
1992 {
1993 if (load_mode_size < word_mode_size)
1994 {
1995 /* Rotate left first. */
1996 rtx sh = GEN_INT (BITS_PER_UNIT
1997 * (word_mode_size - load_mode_size));
1998 do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
1999 do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
2000 }
2001
2002 if (cmp_bytes < word_mode_size)
2003 {
2004 /* Now clear right. This plus the rotate can be
2005 turned into a rldicr instruction. */
2006 HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
2007 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
2008 do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
2009 do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
2010 }
2011 }
2012
2013 /* Cases to handle. A and B are chunks of the two strings.
2014 1: Not end of comparison:
2015 A != B: branch to cleanup code to compute result.
2016 A == B: check for 0 byte, next block if not found.
2017 2: End of the inline comparison:
2018 A != B: branch to cleanup code to compute result.
2019 A == B: check for 0 byte, call strcmp/strncmp
2020 3: compared requested N bytes:
2021 A == B: branch to result 0.
2022 A != B: cleanup code to compute result. */
2023
2024 rtx dst_label;
2025 if (remain > 0 || equality_compare_rest)
2026 {
2027 /* Branch to cleanup code, otherwise fall through to do
2028 more compares. */
2029 if (!cleanup_label)
2030 cleanup_label = gen_label_rtx ();
2031 dst_label = cleanup_label;
2032 }
2033 else
2034 /* Branch to end and produce result of 0. */
2035 dst_label = final_move_label;
2036
2037 if (load_mode_size == 1)
2038 {
2039 /* Special case for comparing just single byte. */
2040 if (equality_compare_rest)
2041 {
2042 /* Use subf./bne to branch to final_move_label if the
2043 byte differs, otherwise fall through to the strncmp
2044 call. We must also check for a zero byte here as we
2045 must not make the library call if this is the end of
2046 the string. */
2047
2048 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2049 rtx cond = gen_reg_rtx (CCmode);
2050 rtx diff_rtx = gen_rtx_MINUS (word_mode,
2051 tmp_reg_src1, tmp_reg_src2);
2052 rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
2053 rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
2054
2055 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2056 lab_ref, pc_rtx);
2057 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2058 add_reg_br_prob_note (j, profile_probability::unlikely ());
2059 JUMP_LABEL (j) = final_move_label;
2060 LABEL_NUSES (final_move_label) += 1;
2061
2062 /* Check for zero byte here before fall through to
2063 library call. This catches the case where the
2064 strings are equal and end in a zero byte at this
2065 position. */
2066
2067 rtx cond0 = gen_reg_rtx (CCmode);
2068 emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
2069 const0_rtx));
2070
2071 rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);
2072
2073 rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
2074 lab_ref, pc_rtx);
2075 rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
2076 add_reg_br_prob_note (j0, profile_probability::unlikely ());
2077 JUMP_LABEL (j0) = final_move_label;
2078 LABEL_NUSES (final_move_label) += 1;
2079 }
2080 else
2081 {
2082 /* This is the last byte to be compared so we can use
2083 subf to compute the final result and branch
2084 unconditionally to final_move_label. */
2085
2086 do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);
2087
2088 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2089 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2090 JUMP_LABEL (j) = final_move_label;
2091 LABEL_NUSES (final_move_label) += 1;
2092 emit_barrier ();
2093 }
2094 }
2095 else
2096 {
2097 rtx cmpb_zero = gen_reg_rtx (word_mode);
2098 rtx cmpb_diff = gen_reg_rtx (word_mode);
2099 rtx zero_reg = gen_reg_rtx (word_mode);
2100 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
2101 rtx cond = gen_reg_rtx (CCmode);
2102
2103 emit_move_insn (zero_reg, GEN_INT (0));
2104 do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
2105 do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
2106 rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
2107 rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);
2108
2109 rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);
2110
2111 rtx cmp_rtx;
2112 if (remain == 0 && !equality_compare_rest)
2113 cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
2114 else
2115 cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
2116
2117 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2118 lab_ref, pc_rtx);
2119 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2120 add_reg_br_prob_note (j, profile_probability::unlikely ());
2121 JUMP_LABEL (j) = dst_label;
2122 LABEL_NUSES (dst_label) += 1;
2123 }
2124
2125 offset += cmp_bytes;
2126 bytes_to_compare -= cmp_bytes;
2127 }
2128
2129 if (equality_compare_rest)
2130 {
2131 /* Update pointers past what has been compared already. */
2132 src1 = adjust_address (orig_src1, load_mode, offset);
2133 src2 = adjust_address (orig_src2, load_mode, offset);
2134
2135 if (!REG_P (XEXP (src1, 0)))
2136 {
2137 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
2138 src1 = replace_equiv_address (src1, src1_reg);
2139 }
2140 set_mem_size (src1, load_mode_size);
2141
2142 if (!REG_P (XEXP (src2, 0)))
2143 {
2144 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
2145 src2 = replace_equiv_address (src2, src2_reg);
2146 }
2147 set_mem_size (src2, load_mode_size);
2148
2149 /* Construct call to strcmp/strncmp to compare the rest of the string. */
2150 if (no_length)
2151 {
2152 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2153 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2154 target, LCT_NORMAL, GET_MODE (target),
2155 force_reg (Pmode, XEXP (src1, 0)), Pmode,
2156 force_reg (Pmode, XEXP (src2, 0)), Pmode);
2157 }
2158 else
2159 {
2160 rtx len_rtx = gen_reg_rtx (Pmode);
2161 emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
2162 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2163 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2164 target, LCT_NORMAL, GET_MODE (target),
2165 force_reg (Pmode, XEXP (src1, 0)), Pmode,
2166 force_reg (Pmode, XEXP (src2, 0)), Pmode,
2167 len_rtx, Pmode);
2168 }
2169
2170 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2171 rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2172 JUMP_LABEL (jmp) = final_label;
2173 LABEL_NUSES (final_label) += 1;
2174 emit_barrier ();
2175 }
2176
2177 if (cleanup_label)
2178 emit_label (cleanup_label);
2179
2180 /* Generate the final sequence that identifies the differing
2181 byte and generates the final result, taking into account
2182 zero bytes:
2183
2184 cntlzd get bit of first zero/diff byte
2185 addi convert for rldcl use
2186 rldcl rldcl extract diff/zero byte
2187 subf subtract for final result
2188 */
2189
2190 rtx rot_amt = gen_reg_rtx (word_mode);
2191
2192 rtx rot1_1 = gen_reg_rtx (word_mode);
2193 rtx rot1_2 = gen_reg_rtx (word_mode);
2194 rtx rot2_1 = gen_reg_rtx (word_mode);
2195 rtx rot2_2 = gen_reg_rtx (word_mode);
2196
2197 if (word_mode == SImode)
2198 {
2199 emit_insn (gen_clzsi2 (rot_amt, result_reg));
2200 emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
2201 emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
2202 gen_lowpart (SImode, rot_amt)));
2203 emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2204 emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
2205 gen_lowpart (SImode, rot_amt)));
2206 emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2207 emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
2208 }
2209 else
2210 {
2211 emit_insn (gen_clzdi2 (rot_amt, result_reg));
2212 emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
2213 emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
2214 gen_lowpart (SImode, rot_amt)));
2215 emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2216 emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
2217 gen_lowpart (SImode, rot_amt)));
2218 emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2219 emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
2220 }
2221
2222 emit_label (final_move_label);
2223 emit_insn (gen_movsi (target,
2224 gen_lowpart (SImode, result_reg)));
2225 emit_label (final_label);
2226 return true;
2227 }
2228
2229 /* Generate loads and stores for a move of v4si mode using lvx/stvx.
2230 This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
2231 keep combine from changing what instruction gets used.
2232
2233 DEST is the destination for the data.
2234 SRC is the source of the data for the move. */
2235
2236 static rtx
gen_lvx_v4si_move(rtx dest,rtx src)2237 gen_lvx_v4si_move (rtx dest, rtx src)
2238 {
2239 gcc_assert (MEM_P (dest) ^ MEM_P (src));
2240 gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
2241
2242 if (MEM_P (dest))
2243 return gen_altivec_stvx_v4si_internal (dest, src);
2244 else
2245 return gen_altivec_lvx_v4si_internal (dest, src);
2246 }
2247
2248 /* Expand a block move operation, and return 1 if successful. Return 0
2249 if we should let the compiler generate normal code.
2250
2251 operands[0] is the destination
2252 operands[1] is the source
2253 operands[2] is the length
2254 operands[3] is the alignment */
2255
2256 #define MAX_MOVE_REG 4
2257
2258 int
expand_block_move(rtx operands[])2259 expand_block_move (rtx operands[])
2260 {
2261 rtx orig_dest = operands[0];
2262 rtx orig_src = operands[1];
2263 rtx bytes_rtx = operands[2];
2264 rtx align_rtx = operands[3];
2265 int constp = (GET_CODE (bytes_rtx) == CONST_INT);
2266 int align;
2267 int bytes;
2268 int offset;
2269 int move_bytes;
2270 rtx stores[MAX_MOVE_REG];
2271 int num_reg = 0;
2272
2273 /* If this is not a fixed size move, just call memcpy */
2274 if (! constp)
2275 return 0;
2276
2277 /* This must be a fixed size alignment */
2278 gcc_assert (GET_CODE (align_rtx) == CONST_INT);
2279 align = INTVAL (align_rtx) * BITS_PER_UNIT;
2280
2281 /* Anything to move? */
2282 bytes = INTVAL (bytes_rtx);
2283 if (bytes <= 0)
2284 return 1;
2285
2286 if (bytes > rs6000_block_move_inline_limit)
2287 return 0;
2288
2289 for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
2290 {
2291 union {
2292 rtx (*movmemsi) (rtx, rtx, rtx, rtx);
2293 rtx (*mov) (rtx, rtx);
2294 } gen_func;
2295 machine_mode mode = BLKmode;
2296 rtx src, dest;
2297
2298 /* Altivec first, since it will be faster than a string move
2299 when it applies, and usually not significantly larger. */
2300 if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
2301 {
2302 move_bytes = 16;
2303 mode = V4SImode;
2304 gen_func.mov = gen_lvx_v4si_move;
2305 }
2306 else if (bytes >= 8 && TARGET_POWERPC64
2307 && (align >= 64 || !STRICT_ALIGNMENT))
2308 {
2309 move_bytes = 8;
2310 mode = DImode;
2311 gen_func.mov = gen_movdi;
2312 if (offset == 0 && align < 64)
2313 {
2314 rtx addr;
2315
2316 /* If the address form is reg+offset with offset not a
2317 multiple of four, reload into reg indirect form here
2318 rather than waiting for reload. This way we get one
2319 reload, not one per load and/or store. */
2320 addr = XEXP (orig_dest, 0);
2321 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2322 && GET_CODE (XEXP (addr, 1)) == CONST_INT
2323 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2324 {
2325 addr = copy_addr_to_reg (addr);
2326 orig_dest = replace_equiv_address (orig_dest, addr);
2327 }
2328 addr = XEXP (orig_src, 0);
2329 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2330 && GET_CODE (XEXP (addr, 1)) == CONST_INT
2331 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2332 {
2333 addr = copy_addr_to_reg (addr);
2334 orig_src = replace_equiv_address (orig_src, addr);
2335 }
2336 }
2337 }
2338 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
2339 { /* move 4 bytes */
2340 move_bytes = 4;
2341 mode = SImode;
2342 gen_func.mov = gen_movsi;
2343 }
2344 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
2345 { /* move 2 bytes */
2346 move_bytes = 2;
2347 mode = HImode;
2348 gen_func.mov = gen_movhi;
2349 }
2350 else /* move 1 byte at a time */
2351 {
2352 move_bytes = 1;
2353 mode = QImode;
2354 gen_func.mov = gen_movqi;
2355 }
2356
2357 src = adjust_address (orig_src, mode, offset);
2358 dest = adjust_address (orig_dest, mode, offset);
2359
2360 if (mode != BLKmode)
2361 {
2362 rtx tmp_reg = gen_reg_rtx (mode);
2363
2364 emit_insn ((*gen_func.mov) (tmp_reg, src));
2365 stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
2366 }
2367
2368 if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
2369 {
2370 int i;
2371 for (i = 0; i < num_reg; i++)
2372 emit_insn (stores[i]);
2373 num_reg = 0;
2374 }
2375
2376 if (mode == BLKmode)
2377 {
2378 /* Move the address into scratch registers. The movmemsi
2379 patterns require zero offset. */
2380 if (!REG_P (XEXP (src, 0)))
2381 {
2382 rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
2383 src = replace_equiv_address (src, src_reg);
2384 }
2385 set_mem_size (src, move_bytes);
2386
2387 if (!REG_P (XEXP (dest, 0)))
2388 {
2389 rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
2390 dest = replace_equiv_address (dest, dest_reg);
2391 }
2392 set_mem_size (dest, move_bytes);
2393
2394 emit_insn ((*gen_func.movmemsi) (dest, src,
2395 GEN_INT (move_bytes & 31),
2396 align_rtx));
2397 }
2398 }
2399
2400 return 1;
2401 }
2402