1; RUN: llc -mcpu=pwr9 -O3 -verify-machineinstrs -ppc-vsr-nums-as-vr \
2; RUN:     -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu \
3; RUN:     < %s | FileCheck %s
4
5; RUN: llc -mcpu=pwr9 -O3 -verify-machineinstrs -ppc-vsr-nums-as-vr \
6; RUN:     -ppc-asm-full-reg-names -mtriple=powerpc64-unknown-linux-gnu \
7; RUN:     < %s | FileCheck %s --check-prefix=P9BE
8
9; Function Attrs: norecurse nounwind readonly
; test_pre_inc_disable_1(pix1, i_stride_pix1, pix2) -> signext i32
;
; Sum of absolute differences between the eight bytes at %pix2 and eight
; consecutive 8-byte rows read from %pix1, where successive rows are
; %i_stride_pix1 bytes apart. The IR loop body handles two rows per trip and
; runs until the row counter (stepping by 2) equals 8. The %pix2 bytes are
; loaded once in the entry block and reused for every row.
;
; The directives that follow the define line pin the expected pwr9 assembly,
; first for the little-endian RUN line and then for the big-endian RUN line.
; NOTE(review): the function name presumably refers to PowerPC pre-increment
; (update-form) memory access formation; the file itself does not state the
; intent, so confirm against the originating change.
; NOTE(review): the first little-endian addis pattern omits the trailing comma
; that the other addis patterns in this function carry; it looks cosmetic.
10define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 signext %i_stride_pix1, i8* nocapture readonly %pix2) {
; Little-endian expectations: entry block (loop-invariant pix2 setup).
11; CHECK-LABEL: test_pre_inc_disable_1:
12; CHECK:   # %bb.0: # %entry
13; CHECK:    lfd f0, 0(r5)
14; CHECK:    addis r5, r2
15; CHECK:    addi r5, r5,
16; CHECK:    lxvx v2, 0, r5
17; CHECK:    addis r5, r2,
18; CHECK:    addi r5, r5,
19; CHECK:    lxvx v4, 0, r5
20; CHECK:    xxpermdi v5, f0, f0, 2
21; CHECK:    xxlxor v3, v3, v3
22; CHECK-DAG: vperm v[[VR1:[0-9]+]], v5, v3, v4
23; CHECK-DAG: vperm v[[VR2:[0-9]+]], v3, v5, v2
24; CHECK-DAG: xvnegsp v[[VR3:[0-9]+]], v[[VR1]]
25; CHECK-DAG: xvnegsp v[[VR4:[0-9]+]], v[[VR2]]
26
; Little-endian expectations: loop body (two rows per trip), then the return.
27; CHECK:  .LBB0_1: # %for.cond1.preheader
28; CHECK:    lfd f0, 0(r3)
29; CHECK:    xxpermdi v1, f0, f0, 2
30; CHECK:    vperm v6, v3, v1, v2
31; CHECK:    vperm v1, v1, v3, v4
32; CHECK-DAG:    xvnegsp v6, v6
33; CHECK-DAG:    xvnegsp v1, v1
34; CHECK-DAG: vabsduw v1, v1, v[[VR3]]
35; CHECK-DAG: vabsduw v6, v6, v[[VR4]]
36; CHECK:    vadduwm v1, v1, v6
37; CHECK:    xxswapd v6, v1
38; CHECK:    vadduwm v1, v1, v6
39; CHECK:    xxspltw v6, v1, 2
40; CHECK:    vadduwm v1, v1, v6
41; CHECK:    vextuwrx r7, r5, v1
42; CHECK:    ldux r8, r3, r4
43; CHECK:    add r3, r3, r4
44; CHECK:    add r6, r7, r6
45; CHECK:    mtvsrd f0, r8
46; CHECK:    xxswapd v1, vs0
47; CHECK:    vperm v6, v3, v1, v2
48; CHECK:    vperm v1, v1, v3, v4
49; CHECK-DAG: xvnegsp v6, v6
50; CHECK-DAG: xvnegsp v1, v1
51; CHECK-DAG: vabsduw v1, v1, v[[VR3]]
52; CHECK-DAG: vabsduw v6, v6, v[[VR4]]
53; CHECK:    vadduwm v1, v1, v6
54; CHECK:    xxswapd v6, v1
55; CHECK:    vadduwm v1, v1, v6
56; CHECK:    xxspltw v6, v1, 2
57; CHECK:    vadduwm v1, v1, v6
58; CHECK:    vextuwrx r7, r5, v1
59; CHECK:    add r6, r7, r6
60; CHECK:    bdnz .LBB0_1
61; CHECK:    extsw r3, r6
62; CHECK:    blr
63
; Big-endian expectations: entry block (loop-invariant pix2 setup).
64; P9BE-LABEL: test_pre_inc_disable_1:
65; P9BE:    lfd f0, 0(r5)
66; P9BE:    addis r5, r2,
67; P9BE:    addi r5, r5,
68; P9BE:    lxvx v2, 0, r5
69; P9BE:    addis r5, r2,
70; P9BE:    addi r5, r5,
71; P9BE:    lxvx v4, 0, r5
72; P9BE:    xxlor v5, vs0, vs0
73; P9BE:    xxlxor v3, v3, v3
74; P9BE-DAG: li r5, 0
75; P9BE-DAG: vperm v[[VR1:[0-9]+]], v3, v5, v2
76; P9BE-DAG: vperm v[[VR2:[0-9]+]], v3, v5, v4
77; P9BE-DAG: xvnegsp v[[VR3:[0-9]+]], v[[VR1]]
78; P9BE-DAG: xvnegsp v[[VR4:[0-9]+]], v[[VR2]]
79
; Big-endian expectations: loop body (two rows per trip), then the return.
; In the second row the permute masks are applied in the opposite order, so
; the captured VR3 and VR4 operands swap places relative to the first row.
80; P9BE:  .LBB0_1: # %for.cond1.preheader
81; P9BE:    lfd f0, 0(r3)
82; P9BE:    xxlor v1, vs0, vs0
83; P9BE:    vperm v6, v3, v1, v4
84; P9BE:    vperm v1, v3, v1, v2
85; P9BE-DAG: xvnegsp v6, v6
86; P9BE-DAG: xvnegsp v1, v1
87; P9BE-DAG: vabsduw v1, v1, v[[VR3]]
88; P9BE-DAG: vabsduw v6, v6, v[[VR4]]
89; P9BE:    vadduwm v1, v6, v1
90; P9BE:    xxswapd v6, v1
91; P9BE:    vadduwm v1, v1, v6
92; P9BE:    xxspltw v6, v1, 1
93; P9BE:    vadduwm v1, v1, v6
94; P9BE:    vextuwlx r[[GR1:[0-9]+]], r5, v1
95; P9BE:    add r6, r[[GR1]], r6
96; P9BE:    ldux r[[GR2:[0-9]+]], r3, r4
97; P9BE:    add r3, r3, r4
98; P9BE:    mtvsrd v1, r[[GR2]]
99; P9BE:    vperm v6, v3, v1, v2
100; P9BE:    vperm v1, v3, v1, v4
101; P9BE-DAG: xvnegsp v6, v6
102; P9BE-DAG: xvnegsp v1, v1
103; P9BE-DAG: vabsduw v1, v1, v[[VR4]]
104; P9BE-DAG: vabsduw v6, v6, v[[VR3]]
105; P9BE:    vadduwm v1, v1, v6
106; P9BE:    xxswapd v6, v1
107; P9BE:    vadduwm v1, v1, v6
108; P9BE:    xxspltw v6, v1, 1
109; P9BE:    vadduwm v1, v1, v6
110; P9BE:    vextuwlx r7, r5, v1
111; P9BE:    add r6, r7, r6
112; P9BE:    bdnz .LBB0_1
113; P9BE:    extsw r3, r6
114; P9BE:    blr
115entry:
; Widen the stride to 64 bits, then load the eight pix2 bytes once (byte
; aligned) and zero-extend them to i32 lanes for reuse in every row.
116  %idx.ext = sext i32 %i_stride_pix1 to i64
117  %0 = bitcast i8* %pix2 to <8 x i8>*
118  %1 = load <8 x i8>, <8 x i8>* %0, align 1
119  %2 = zext <8 x i8> %1 to <8 x i32>
120  br label %for.cond1.preheader
121
122for.cond1.preheader:                              ; preds = %for.cond1.preheader, %entry
; Loop-carried state: row counter, running sum, and current pix1 row pointer.
123  %y.024 = phi i32 [ 0, %entry ], [ %inc9.1, %for.cond1.preheader ]
124  %i_sum.023 = phi i32 [ 0, %entry ], [ %op.extra.1, %for.cond1.preheader ]
125  %pix1.addr.022 = phi i8* [ %pix1, %entry ], [ %add.ptr.1, %for.cond1.preheader ]
; First row: load 8 bytes, zero-extend, subtract the pix2 lanes, and take the
; absolute value per lane (negate where the difference is negative).
126  %3 = bitcast i8* %pix1.addr.022 to <8 x i8>*
127  %4 = load <8 x i8>, <8 x i8>* %3, align 1
128  %5 = zext <8 x i8> %4 to <8 x i32>
129  %6 = sub nsw <8 x i32> %5, %2
130  %7 = icmp slt <8 x i32> %6, zeroinitializer
131  %8 = sub nsw <8 x i32> zeroinitializer, %6
132  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
; Horizontal add: fold lanes 4-7 onto 0-3, then lanes 2-3 onto 0-1, then
; lane 1 onto lane 0; lane 0 then holds the row total.
133  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
134  %bin.rdx = add nsw <8 x i32> %9, %rdx.shuf
135  %rdx.shuf32 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
136  %bin.rdx33 = add nsw <8 x i32> %bin.rdx, %rdx.shuf32
137  %rdx.shuf34 = shufflevector <8 x i32> %bin.rdx33, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
138  %bin.rdx35 = add nsw <8 x i32> %bin.rdx33, %rdx.shuf34
139  %10 = extractelement <8 x i32> %bin.rdx35, i32 0
140  %op.extra = add nsw i32 %10, %i_sum.023
; Second row: same computation on the row one stride further along.
141  %add.ptr = getelementptr inbounds i8, i8* %pix1.addr.022, i64 %idx.ext
142  %11 = bitcast i8* %add.ptr to <8 x i8>*
143  %12 = load <8 x i8>, <8 x i8>* %11, align 1
144  %13 = zext <8 x i8> %12 to <8 x i32>
145  %14 = sub nsw <8 x i32> %13, %2
146  %15 = icmp slt <8 x i32> %14, zeroinitializer
147  %16 = sub nsw <8 x i32> zeroinitializer, %14
148  %17 = select <8 x i1> %15, <8 x i32> %16, <8 x i32> %14
149  %rdx.shuf.1 = shufflevector <8 x i32> %17, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
150  %bin.rdx.1 = add nsw <8 x i32> %17, %rdx.shuf.1
151  %rdx.shuf32.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
152  %bin.rdx33.1 = add nsw <8 x i32> %bin.rdx.1, %rdx.shuf32.1
153  %rdx.shuf34.1 = shufflevector <8 x i32> %bin.rdx33.1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
154  %bin.rdx35.1 = add nsw <8 x i32> %bin.rdx33.1, %rdx.shuf34.1
155  %18 = extractelement <8 x i32> %bin.rdx35.1, i32 0
156  %op.extra.1 = add nsw i32 %18, %op.extra
; Step the pointer past the second row, advance the counter by two rows, and
; leave the loop once eight rows have been consumed.
157  %add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext
158  %inc9.1 = add nuw nsw i32 %y.024, 2
159  %exitcond.1 = icmp eq i32 %inc9.1, 8
160  br i1 %exitcond.1, label %for.cond.cleanup, label %for.cond1.preheader
161
162for.cond.cleanup:                                 ; preds = %for.cond1.preheader
; Return the accumulated sum over all rows.
163  ret i32 %op.extra.1
164}
165
166; Function Attrs: norecurse nounwind readonly
; test_pre_inc_disable_2(pix1, pix2) -> signext i32
;
; Straight-line variant with no loop: loads eight bytes from %pix1 and eight
; bytes from %pix2, takes the per-lane absolute difference on zero-extended
; i32 lanes, and returns the sum of the eight lanes.
;
; The directives that follow the define line pin the expected pwr9 assembly,
; first for the little-endian RUN line and then for the big-endian RUN line.
; Both sequences end in a vabsduw / vadduwm reduction and a word extract.
167define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* nocapture readonly %pix2) {
; Little-endian expectations.
168; CHECK-LABEL: test_pre_inc_disable_2:
169; CHECK:    lfd f0, 0(r3)
170; CHECK:    addis r3, r2,
171; CHECK:    addi r3, r3, .LCPI1_0@toc@l
172; CHECK:    lxvx v4, 0, r3
173; CHECK:    addis r3, r2,
174; CHECK:    xxpermdi v2, f0, f0, 2
175; CHECK:    lfd f0, 0(r4)
176; CHECK:    addi r3, r3, .LCPI1_1@toc@l
177; CHECK:    xxlxor v3, v3, v3
178; CHECK:    lxvx v0, 0, r3
179; CHECK:    xxpermdi v1, f0, f0, 2
180; CHECK:    vperm v5, v2, v3, v4
181; CHECK:    vperm v2, v3, v2, v0
182; CHECK:    vperm v0, v3, v1, v0
183; CHECK:    vperm v3, v1, v3, v4
184; CHECK:    vabsduw v2, v2, v0
185; CHECK:    vabsduw v3, v5, v3
186; CHECK:    vadduwm v2, v3, v2
187; CHECK:    xxswapd v3, v2
188; CHECK:    vadduwm v2, v2, v3
189; CHECK:    xxspltw v3, v2, 2
190; CHECK:    vadduwm v2, v2, v3
191; CHECK:    vextuwrx r3, r3, v2
192; CHECK:    extsw r3, r3
193; CHECK:    blr
194
; Big-endian expectations.
195; P9BE-LABEL: test_pre_inc_disable_2:
196; P9BE:    lfd f0, 0(r3)
197; P9BE:    addis r3, r2,
198; P9BE:    addi r3, r3,
199; P9BE:    lxvx v4, 0, r3
200; P9BE:    addis r3, r2,
201; P9BE:    addi r3, r3,
202; P9BE:    xxlor v2, vs0, vs0
203; P9BE:    lfd f0, 0(r4)
204; P9BE:    lxvx v0, 0, r3
205; P9BE:    xxlxor v3, v3, v3
206; P9BE:    xxlor v1, vs0, vs0
207; P9BE:    vperm v5, v3, v2, v4
208; P9BE:    vperm v2, v3, v2, v0
209; P9BE:    vperm v0, v3, v1, v0
210; P9BE:    vperm v3, v3, v1, v4
211; P9BE:    vabsduw v2, v2, v0
212; P9BE:    vabsduw v3, v5, v3
213; P9BE:    vadduwm v2, v3, v2
214; P9BE:    xxswapd v3, v2
215; P9BE:    vadduwm v2, v2, v3
216; P9BE:    xxspltw v3, v2, 1
217; P9BE:    vadduwm v2, v2, v3
218; P9BE:    vextuwlx r3, r3, v2
219; P9BE:    extsw r3, r3
220; P9BE:    blr
221entry:
; Load both 8-byte inputs (byte aligned) and zero-extend each to i32 lanes.
222  %0 = bitcast i8* %pix1 to <8 x i8>*
223  %1 = load <8 x i8>, <8 x i8>* %0, align 1
224  %2 = zext <8 x i8> %1 to <8 x i32>
225  %3 = bitcast i8* %pix2 to <8 x i8>*
226  %4 = load <8 x i8>, <8 x i8>* %3, align 1
227  %5 = zext <8 x i8> %4 to <8 x i32>
; Per-lane absolute difference: negate the lanes whose difference is negative.
228  %6 = sub nsw <8 x i32> %2, %5
229  %7 = icmp slt <8 x i32> %6, zeroinitializer
230  %8 = sub nsw <8 x i32> zeroinitializer, %6
231  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
; Horizontal add: fold lanes 4-7 onto 0-3, then lanes 2-3 onto 0-1, then
; lane 1 onto lane 0; lane 0 is the returned total.
232  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
233  %bin.rdx = add nsw <8 x i32> %9, %rdx.shuf
234  %rdx.shuf12 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
235  %bin.rdx13 = add nsw <8 x i32> %bin.rdx, %rdx.shuf12
236  %rdx.shuf14 = shufflevector <8 x i32> %bin.rdx13, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
237  %bin.rdx15 = add nsw <8 x i32> %bin.rdx13, %rdx.shuf14
238  %10 = extractelement <8 x i32> %bin.rdx15, i32 0
239  ret i32 %10
240}
241
242
243; Generated from C source:
244;
245;#include <stdint.h>
246;#include <stdlib.h>
247;int test_pre_inc_disable_1( uint8_t *pix1, int i_stride_pix1, uint8_t *pix2 ) {
248;    int i_sum = 0;
249;    for( int y = 0; y < 8; y++ ) {
250;        for( int x = 0; x < 8; x++) {
251;            i_sum += abs( pix1[x] - pix2[x] );
252;        }
253;        pix1 += i_stride_pix1;
254;    }
255;    return i_sum;
256;}
257
258;int test_pre_inc_disable_2( uint8_t *pix1, uint8_t *pix2 ) {
259;  int i_sum = 0;
260;  for( int x = 0; x < 8; x++ ) {
261;    i_sum += abs( pix1[x] - pix2[x] );
262;  }
263;
264;  return i_sum;
265;}
266
267