; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512

; Tests for SSE2 and below, without SSE3+.
; The CHECK lines below are machine-generated; rerun the update script after
; changing any IR rather than editing assertions by hand.

; Insert a scalar double into element 0 of a loaded <2 x double>.
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-SSE-LABEL: test1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test1:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}

; Insert a scalar double into element 1 of a loaded <2 x double>.
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-SSE-LABEL: test2:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test2:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test2:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    movaps %xmm1, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test2:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm1
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}


; Extract/insert sequence that should become a single unpcklps.
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
; X86-SSE-LABEL: test3:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test3:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test3:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test3:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %B  ; <<4 x float>> [#uses=2]
  %tmp3 = load <4 x float>, <4 x float>* %A  ; <<4 x float>> [#uses=2]
  %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0  ; <float> [#uses=1]
  %tmp7 = extractelement <4 x float> %tmp, i32 0  ; <float> [#uses=1]
  %tmp8 = extractelement <4 x float> %tmp3, i32 1  ; <float> [#uses=1]
  %tmp9 = extractelement <4 x float> %tmp, i32 1  ; <float> [#uses=1]
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp13, <4 x float>* %res
  ret void
}

; Single-input shuffle; index 6 is from the undef operand.
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
; X86-SSE-LABEL: test4:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test4:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test4:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test4:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp5, <4 x float>* %res
  ret void
}

; Scalar load + zero-fill + byte/word interleave with zero.
define <4 x i32> @test5(i8** %ptr) nounwind {
; X86-SSE-LABEL: test5:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl (%eax), %eax
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm0, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test5:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl (%eax), %eax
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test5:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq (%rdi), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm0, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test5:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movq (%rdi), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-AVX-NEXT:    retq
  %tmp = load i8*, i8** %ptr  ; <i8*> [#uses=1]
  %tmp.upgrd.1 = bitcast i8* %tmp to float*  ; <float*> [#uses=1]
  %tmp.upgrd.2 = load float, float* %tmp.upgrd.1  ; <float> [#uses=1]
  %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0  ; <<4 x float>> [#uses=1]
  %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1  ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3  ; <<4 x float>> [#uses=1]
  %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>  ; <<16 x i8>> [#uses=1]
  %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >  ; <<16 x i8>> [#uses=1]
  %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>  ; <<8 x i16>> [#uses=1]
  %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >  ; <<8 x i16>> [#uses=1]
  %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>  ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %tmp36
}

; Identity shuffle (only lane 0 of the first operand is defined) folds to a copy.
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; X86-SSE-LABEL: test6:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test6:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test6:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test6:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp1 = load <4 x float>, <4 x float>* %A  ; <<4 x float>> [#uses=1]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp2, <4 x float>* %res
  ret void
}

; All-zero shuffle stored to a null pointer; should fold to a zeroing store.
define void @test7() nounwind {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, 0
; AVX-NEXT:    ret{{[l|q]}}
  bitcast <4 x i32> zeroinitializer to <4 x float>  ; <<4 x float>>:1 [#uses=1]
  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer  ; <<4 x float>>:2 [#uses=1]
  store <4 x float> %2, <4 x float>* null
  ret void
}

@x = external dso_local global [4 x i32]

; Four scalar loads from @x should merge into one unaligned vector load.
define <2 x i64> @test8() nounwind {
; X86-SSE-LABEL: test8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups x, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups x, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test8:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups x(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups x(%rip), %xmm0
; X64-AVX-NEXT:    retq
  %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0)  ; <i32> [#uses=1]
  %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1)  ; <i32> [#uses=1]
  %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2)  ; <i32> [#uses=1]
  %tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3)  ; <i32> [#uses=1]
  %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0  ; <<4 x i32>> [#uses=1]
  %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1  ; <<4 x i32>> [#uses=1]
  %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2  ; <<4 x i32>> [#uses=1]
  %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3  ; <<4 x i32>> [#uses=1]
  %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>  ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp16
}

; Build a <4 x float> from four scalar arguments (extra i32 shifts the
; argument slots relative to test10).
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test9:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test9:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test9:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test9:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %a, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

; Same as test9 but without the leading dummy argument.
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test10:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test10:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test10:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test10:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %a, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

; Build a <2 x double> from two scalar arguments.
define <2 x double> @test11(double %a, double %b) nounwind {
; X86-SSE-LABEL: test11:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test11:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test11:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test11:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <2 x double> undef, double %a, i32 0  ; <<2 x double>> [#uses=1]
  %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1  ; <<2 x double>> [#uses=1]
  ret <2 x double> %tmp7
}

; Two shuffles of the same load feeding an fadd; uses null as the memory address.
define void @test12() nounwind {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd 0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    addps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm2, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps 0, %xmm0
; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, 0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, 0
; AVX512-NEXT:    ret{{[l|q]}}
  %tmp1 = load <4 x float>, <4 x float>* null  ; <<4 x float>> [#uses=2]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >  ; <<4 x float>> [#uses=1]
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >  ; <<4 x float>> [#uses=1]
  %tmp4 = fadd <4 x float> %tmp2, %tmp3  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp4, <4 x float>* null
  ret void
}

; Two-input shuffle with a repeated lane from the first operand.
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; X86-SSE-LABEL: test13:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test13:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test13:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdx), %xmm0
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test13:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdx), %xmm0
; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <4 x float>, <4 x float>* %B  ; <<4 x float>> [#uses=1]
  %tmp5 = load <4 x float>, <4 x float>* %C  ; <<4 x float>> [#uses=1]
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp11, <4 x float>* %res
  ret void
}

; Low halves of an add and a sub combined with movlhps.
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-SSE-LABEL: test14:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm1
; X86-SSE-NEXT:    movaps (%eax), %xmm2
; X86-SSE-NEXT:    movaps %xmm2, %xmm0
; X86-SSE-NEXT:    addps %xmm1, %xmm0
; X86-SSE-NEXT:    subps %xmm1, %xmm2
; X86-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test14:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps (%eax), %xmm1
; X86-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X86-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test14:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movaps (%rdi), %xmm2
; X64-SSE-NEXT:    movaps %xmm2, %xmm0
; X64-SSE-NEXT:    addps %xmm1, %xmm0
; X64-SSE-NEXT:    subps %xmm1, %xmm2
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test14:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm1
; X64-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %y  ; <<4 x float>> [#uses=2]
  %tmp5 = load <4 x float>, <4 x float>* %x  ; <<4 x float>> [#uses=2]
  %tmp9 = fadd <4 x float> %tmp5, %tmp  ; <<4 x float>> [#uses=1]
  %tmp21 = fsub <4 x float> %tmp5, %tmp  ; <<4 x float>> [#uses=1]
  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp27
}

; High halves of two loads combined with unpckhpd.
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-SSE-LABEL: test15:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test15:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test15:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test15:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-AVX-NEXT:    retq
entry:
  %tmp = load <4 x float>, <4 x float>* %y  ; <<4 x float>> [#uses=1]
  %tmp3 = load <4 x float>, <4 x float>* %x  ; <<4 x float>> [#uses=1]
  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp4
}

; PR8900

define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
; X86-SSE-LABEL: test16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps 96(%eax), %xmm0
; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps 96(%eax), %xmm0
; X86-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test16:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps 96(%rdi), %xmm0
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps 96(%rdi), %xmm0
; X64-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-AVX-NEXT:    retq
  %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
  %i6 = load <4 x double>, <4 x double>* %i5, align 32
  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %i7
}

; PR9009
define fastcc void @test17() nounwind {
; X86-SSE-LABEL: test17:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: test17:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: test17:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-SSE-LABEL: test17:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-SSE-NEXT:    movaps %xmm0, (%rax)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: test17:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: test17:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX512-NEXT:    retq
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, <4 x float> * undef
  ret void
}

; PR9210
define <4 x float> @f(<4 x double>) nounwind {
; SSE-LABEL: f:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    cvtpd2ps %xmm1, %xmm1
; SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vcvtpd2ps %ymm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    ret{{[l|q]}}
entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
}

; Shuffle with a zero vector folds to a zero-extending move.
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
; SSE-LABEL: test_insert_64_zext:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_insert_64_zext:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %1
}

; i128 mask that clears the low 32 bits of the vector.
define <4 x i32> @PR19721(<4 x i32> %i) {
; X86-SSE-LABEL: PR19721:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; AVX-LABEL: PR19721:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
;
; X64-SSE-LABEL: PR19721:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
  %bc = bitcast <4 x i32> %i to i128
  %insert = and i128 %bc, -4294967296
  %bc2 = bitcast i128 %insert to <4 x i32>
  ret <4 x i32> %bc2
}

; 32-bit vector multiply; SSE2 has no pmulld so it is synthesized from pmuludq.
define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: test_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %m = mul <4 x i32> %x, %y
  ret <4 x i32> %m
}