; RUN: llc -mcpu=pwr9 -O3 -verify-machineinstrs -ppc-vsr-nums-as-vr \
; RUN:   -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu \
; RUN:   < %s | FileCheck %s

; RUN: llc -mcpu=pwr9 -O3 -verify-machineinstrs -ppc-vsr-nums-as-vr \
; RUN:   -ppc-asm-full-reg-names -mtriple=powerpc64-unknown-linux-gnu \
; RUN:   < %s | FileCheck %s --check-prefix=P9BE

; Function Attrs: norecurse nounwind readonly
define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 signext %i_stride_pix1, i8* nocapture readonly %pix2) {
; CHECK-LABEL: test_pre_inc_disable_1:
; CHECK: # %bb.0: # %entry
; CHECK: lfd f0, 0(r5)
; CHECK: addis r5, r2
; CHECK: addi r5, r5,
; CHECK: lxvx v2, 0, r5
; CHECK: addis r5, r2,
; CHECK: addi r5, r5,
; CHECK: lxvx v4, 0, r5
; CHECK: xxpermdi v5, f0, f0, 2
; CHECK: xxlxor v3, v3, v3
; CHECK-DAG: vperm v[[VR1:[0-9]+]], v5, v3, v4
; CHECK-DAG: vperm v[[VR2:[0-9]+]], v3, v5, v2
; CHECK-DAG: xvnegsp v[[VR3:[0-9]+]], v[[VR1]]
; CHECK-DAG: xvnegsp v[[VR4:[0-9]+]], v[[VR2]]

; CHECK: .LBB0_1: # %for.cond1.preheader
; CHECK: lfd f0, 0(r3)
; CHECK: xxpermdi v1, f0, f0, 2
; CHECK: vperm v6, v3, v1, v2
; CHECK: vperm v1, v1, v3, v4
; CHECK-DAG: xvnegsp v6, v6
; CHECK-DAG: xvnegsp v1, v1
; CHECK-DAG: vabsduw v1, v1, v[[VR3]]
; CHECK-DAG: vabsduw v6, v6, v[[VR4]]
; CHECK: vadduwm v1, v1, v6
; CHECK: xxswapd v6, v1
; CHECK: vadduwm v1, v1, v6
; CHECK: xxspltw v6, v1, 2
; CHECK: vadduwm v1, v1, v6
; CHECK: vextuwrx r7, r5, v1
; CHECK: ldux r8, r3, r4
; CHECK: add r3, r3, r4
; CHECK: add r6, r7, r6
; CHECK: mtvsrd f0, r8
; CHECK: xxswapd v1, vs0
; CHECK: vperm v6, v3, v1, v2
; CHECK: vperm v1, v1, v3, v4
; CHECK-DAG: xvnegsp v6, v6
; CHECK-DAG: xvnegsp v1, v1
; CHECK-DAG: vabsduw v1, v1, v[[VR3]]
; CHECK-DAG: vabsduw v6, v6, v[[VR4]]
; CHECK: vadduwm v1, v1, v6
; CHECK: xxswapd v6, v1
; CHECK: vadduwm v1, v1, v6
; CHECK: xxspltw v6, v1, 2
; CHECK: vadduwm v1, v1, v6
; CHECK: vextuwrx r7, r5, v1
; CHECK: add r6, r7, r6
; CHECK: bdnz .LBB0_1
; CHECK: extsw r3, r6
; CHECK: blr

; P9BE-LABEL: test_pre_inc_disable_1:
; P9BE: lfd f0, 0(r5)
; P9BE: addis r5, r2,
; P9BE: addi r5, r5,
; P9BE: lxvx v2, 0, r5
; P9BE: addis r5, r2,
; P9BE: addi r5, r5,
; P9BE: lxvx v4, 0, r5
; P9BE: xxlor v5, vs0, vs0
; P9BE: xxlxor v3, v3, v3
; P9BE-DAG: li r5, 0
; P9BE-DAG: vperm v[[VR1:[0-9]+]], v3, v5, v2
; P9BE-DAG: vperm v[[VR2:[0-9]+]], v3, v5, v4
; P9BE-DAG: xvnegsp v[[VR3:[0-9]+]], v[[VR1]]
; P9BE-DAG: xvnegsp v[[VR4:[0-9]+]], v[[VR2]]

; P9BE: .LBB0_1: # %for.cond1.preheader
; P9BE: lfd f0, 0(r3)
; P9BE: xxlor v1, vs0, vs0
; P9BE: vperm v6, v3, v1, v4
; P9BE: vperm v1, v3, v1, v2
; P9BE-DAG: xvnegsp v6, v6
; P9BE-DAG: xvnegsp v1, v1
; P9BE-DAG: vabsduw v1, v1, v[[VR3]]
; P9BE-DAG: vabsduw v6, v6, v[[VR4]]
; P9BE: vadduwm v1, v6, v1
; P9BE: xxswapd v6, v1
; P9BE: vadduwm v1, v1, v6
; P9BE: xxspltw v6, v1, 1
; P9BE: vadduwm v1, v1, v6
; P9BE: vextuwlx r[[GR1:[0-9]+]], r5, v1
; P9BE: add r6, r[[GR1]], r6
; P9BE: ldux r[[GR2:[0-9]+]], r3, r4
; P9BE: add r3, r3, r4
; P9BE: mtvsrd v1, r[[GR2]]
; P9BE: vperm v6, v3, v1, v2
; P9BE: vperm v1, v3, v1, v4
; P9BE-DAG: xvnegsp v6, v6
; P9BE-DAG: xvnegsp v1, v1
; P9BE-DAG: vabsduw v1, v1, v[[VR4]]
; P9BE-DAG: vabsduw v6, v6, v[[VR3]]
; P9BE: vadduwm v1, v1, v6
; P9BE: xxswapd v6, v1
; P9BE: vadduwm v1, v1, v6
; P9BE: xxspltw v6, v1, 1
; P9BE: vadduwm v1, v1, v6
; P9BE: vextuwlx r7, r5, v1
; P9BE: add r6, r7, r6
; P9BE: bdnz .LBB0_1
; P9BE: extsw r3, r6
; P9BE: blr
entry:
  %idx.ext = sext i32 %i_stride_pix1 to i64
  %0 = bitcast i8* %pix2 to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  br label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %for.cond1.preheader, %entry
  %y.024 = phi i32 [ 0, %entry ], [ %inc9.1, %for.cond1.preheader ]
  %i_sum.023 = phi i32 [ 0, %entry ], [ %op.extra.1, %for.cond1.preheader ]
  %pix1.addr.022 = phi i8* [ %pix1, %entry ], [ %add.ptr.1, %for.cond1.preheader ]
  %3 = bitcast i8* %pix1.addr.022 to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = sub nsw <8 x i32> %5, %2
  %7 = icmp slt <8 x i32> %6, zeroinitializer
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add nsw <8 x i32> %9, %rdx.shuf
  %rdx.shuf32 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx33 = add nsw <8 x i32> %bin.rdx, %rdx.shuf32
  %rdx.shuf34 = shufflevector <8 x i32> %bin.rdx33, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx35 = add nsw <8 x i32> %bin.rdx33, %rdx.shuf34
  %10 = extractelement <8 x i32> %bin.rdx35, i32 0
  %op.extra = add nsw i32 %10, %i_sum.023
  %add.ptr = getelementptr inbounds i8, i8* %pix1.addr.022, i64 %idx.ext
  %11 = bitcast i8* %add.ptr to <8 x i8>*
  %12 = load <8 x i8>, <8 x i8>* %11, align 1
  %13 = zext <8 x i8> %12 to <8 x i32>
  %14 = sub nsw <8 x i32> %13, %2
  %15 = icmp slt <8 x i32> %14, zeroinitializer
  %16 = sub nsw <8 x i32> zeroinitializer, %14
  %17 = select <8 x i1> %15, <8 x i32> %16, <8 x i32> %14
  %rdx.shuf.1 = shufflevector <8 x i32> %17, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = add nsw <8 x i32> %17, %rdx.shuf.1
  %rdx.shuf32.1 = shufflevector <8 x i32> %bin.rdx.1, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx33.1 = add nsw <8 x i32> %bin.rdx.1, %rdx.shuf32.1
  %rdx.shuf34.1 = shufflevector <8 x i32> %bin.rdx33.1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx35.1 = add nsw <8 x i32> %bin.rdx33.1, %rdx.shuf34.1
  %18 = extractelement <8 x i32> %bin.rdx35.1, i32 0
  %op.extra.1 = add nsw i32 %18, %op.extra
  %add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext
  %inc9.1 = add nuw nsw i32 %y.024, 2
  %exitcond.1 = icmp eq i32 %inc9.1, 8
  br i1 %exitcond.1, label %for.cond.cleanup, label %for.cond1.preheader

for.cond.cleanup:                                 ; preds = %for.cond1.preheader
  ret i32 %op.extra.1
}

; Function Attrs: norecurse nounwind readonly
define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* nocapture readonly %pix2) {
; CHECK-LABEL: test_pre_inc_disable_2:
; CHECK: lfd f0, 0(r3)
; CHECK: addis r3, r2,
; CHECK: addi r3, r3, .LCPI1_0@toc@l
; CHECK: lxvx v4, 0, r3
; CHECK: addis r3, r2,
; CHECK: xxpermdi v2, f0, f0, 2
; CHECK: lfd f0, 0(r4)
; CHECK: addi r3, r3, .LCPI1_1@toc@l
; CHECK: xxlxor v3, v3, v3
; CHECK: lxvx v0, 0, r3
; CHECK: xxpermdi v1, f0, f0, 2
; CHECK: vperm v5, v2, v3, v4
; CHECK: vperm v2, v3, v2, v0
; CHECK: vperm v0, v3, v1, v0
; CHECK: vperm v3, v1, v3, v4
; CHECK: vabsduw v2, v2, v0
; CHECK: vabsduw v3, v5, v3
; CHECK: vadduwm v2, v3, v2
; CHECK: xxswapd v3, v2
; CHECK: vadduwm v2, v2, v3
; CHECK: xxspltw v3, v2, 2
; CHECK: vadduwm v2, v2, v3
; CHECK: vextuwrx r3, r3, v2
; CHECK: extsw r3, r3
; CHECK: blr

; P9BE-LABEL: test_pre_inc_disable_2:
; P9BE: lfd f0, 0(r3)
; P9BE: addis r3, r2,
; P9BE: addi r3, r3,
; P9BE: lxvx v4, 0, r3
; P9BE: addis r3, r2,
; P9BE: addi r3, r3,
; P9BE: xxlor v2, vs0, vs0
; P9BE: lfd f0, 0(r4)
; P9BE: lxvx v0, 0, r3
; P9BE: xxlxor v3, v3, v3
; P9BE: xxlor v1, vs0, vs0
; P9BE: vperm v5, v3, v2, v4
; P9BE: vperm v2, v3, v2, v0
; P9BE: vperm v0, v3, v1, v0
; P9BE: vperm v3, v3, v1, v4
; P9BE: vabsduw v2, v2, v0
; P9BE: vabsduw v3, v5, v3
; P9BE: vadduwm v2, v3, v2
; P9BE: xxswapd v3, v2
; P9BE: vadduwm v2, v2, v3
; P9BE: xxspltw v3, v2, 1
; P9BE: vadduwm v2, v2, v3
; P9BE: vextuwlx r3, r3, v2
; P9BE: extsw r3, r3
; P9BE: blr
entry:
  %0 = bitcast i8* %pix1 to <8 x i8>*
  %1 = load <8 x i8>, <8 x i8>* %0, align 1
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = bitcast i8* %pix2 to <8 x i8>*
  %4 = load <8 x i8>, <8 x i8>* %3, align 1
  %5 = zext <8 x i8> %4 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp slt <8 x i32> %6, zeroinitializer
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add nsw <8 x i32> %9, %rdx.shuf
  %rdx.shuf12 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx13 = add nsw <8 x i32> %bin.rdx, %rdx.shuf12
  %rdx.shuf14 = shufflevector <8 x i32> %bin.rdx13, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx15 = add nsw <8 x i32> %bin.rdx13, %rdx.shuf14
  %10 = extractelement <8 x i32> %bin.rdx15, i32 0
  ret i32 %10
}


; Generated from C source:
;
;#include <stdint.h>
;#include <stdlib.h>
;int test_pre_inc_disable_1( uint8_t *pix1, int i_stride_pix1, uint8_t *pix2 ) {
;  int i_sum = 0;
;  for( int y = 0; y < 8; y++ ) {
;    for( int x = 0; x < 8; x++) {
;      i_sum += abs( pix1[x] - pix2[x] );
;    }
;    pix1 += i_stride_pix1;
;  }
;  return i_sum;
;}

;int test_pre_inc_disable_2( uint8_t *pix1, uint8_t *pix2 ) {
;  int i_sum = 0;
;  for( int x = 0; x < 8; x++ ) {
;    i_sum += abs( pix1[x] - pix2[x] );
;  }
;
;  return i_sum;
;}
