; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -show-mc-encoding < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,X86-SSE1
; RUN: llc -show-mc-encoding < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,X86-SSE2
; RUN: llc -show-mc-encoding < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc -show-mc-encoding < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc -show-mc-encoding < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,X64-SSE1
; RUN: llc -show-mc-encoding < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,X64-SSE2
; RUN: llc -show-mc-encoding < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc -show-mc-encoding < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
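; NOTE: The i8 immediate on @llvm.x86.sse.cmp.ss below selects the SSE compare
; predicate: 0 = eq, 1 = lt, 2 = le, 3 = unord, 4 = neq, 5 = nlt, 6 = nle,
; 7 = ord.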

define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_add_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    addps %xmm1, %xmm0 # encoding: [0x0f,0x58,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_add_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_add_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = fadd <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x58,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_add_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x58,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_add_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fadd = fadd float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fadd, i32 0
  ret <4 x float> %res
}

define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_and_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0 # encoding: [0x0f,0x54,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_and_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_and_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = and <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

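; _mm_andnot_ps computes (~a0) & a1. SSE1 has a dedicated andnps, but with
; SSE2 available the NOT is done in the integer domain (pcmpeqd all-ones +
; pxor) before pand, and AVX512 folds the NOT into a single vpternlogq.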
define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X86-SSE1-LABEL: test_mm_andnot_ps:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
; X86-SSE1-NEXT:    retl # encoding: [0xc3]
;
; X86-SSE2-LABEL: test_mm_andnot_ps:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm2 # encoding: [0x66,0x0f,0x76,0xd2]
; X86-SSE2-NEXT:    pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2]
; X86-SSE2-NEXT:    pand %xmm1, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc1]
; X86-SSE2-NEXT:    retl # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_andnot_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_andnot_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; X64-SSE1-LABEL: test_mm_andnot_ps:
; X64-SSE1:       # %bb.0:
; X64-SSE1-NEXT:    andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
; X64-SSE1-NEXT:    retq # encoding: [0xc3]
;
; X64-SSE2-LABEL: test_mm_andnot_ps:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    pcmpeqd %xmm2, %xmm2 # encoding: [0x66,0x0f,0x76,0xd2]
; X64-SSE2-NEXT:    pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2]
; X64-SSE2-NEXT:    pand %xmm1, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc1]
; X64-SSE2-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <4 x i32> %not, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

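; NOTE: With AVX512VL the packed compares below go through a mask register:
; vcmpps writes %k0 and vpmovm2d rematerializes the all-ones/all-zero lanes
; that the sext in the IR expects.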
define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpeq_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpeqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x00]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpeq_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpeq_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x00]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpeq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpeqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x00]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpeq_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x00]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

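; The cmpps/cmpss immediate has no ge/gt predicates, so _mm_cmpge_* and
; _mm_cmpgt_* commute the operands and use le/lt instead; for the _ss forms
; the swap puts the compare result in %a1's register, hence the extra
; movss/vblendps to merge element 0 back under %a0's upper elements.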
define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpge_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x02]
; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpge_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x02]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpge_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x02]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ole <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpge_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x02]
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpge_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpgt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x01]
; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpgt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x01]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpgt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x01]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp olt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpgt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x01]
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpgt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmple_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x02]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmple_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x02]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmple_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ole <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmple_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x02]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmple_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x02]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmplt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x01]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmplt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x01]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmplt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp olt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmplt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x01]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmplt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x01]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
  ret <4 x float> %res
}

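; NOTE: cmpneq is the unordered complement of cmpeq, so the IR uses fcmp une
; (NaN operands compare not-equal) rather than fcmp one.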
define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpneq_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpneqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x04]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpneq_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpneqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x04]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpneq_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpneqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x04]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp une <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpneq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpneqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x04]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpneq_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpneqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x04]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnge_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x06]
; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpnge_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpnleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x06]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpnge_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpnleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x06]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ugt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnge_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x06]
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpnge_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpngt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x05]
; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpngt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpnltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x05]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpngt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpnltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x05]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp uge <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpngt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x05]
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpngt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnle_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x06]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpnle_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpnleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x06]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpnle_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpnleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x06]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ugt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnle_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x06]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpnle_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpnless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x06]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnlt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x05]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpnlt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpnltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x05]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpnlt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpnltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x05]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp uge <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnlt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x05]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpnlt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpnltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x05]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpord_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x07]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpord_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x07]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpord_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x07]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ord <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpord_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x07]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpord_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x07]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpunord_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpunordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x03]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpunord_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpunordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x03]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpunord_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpunordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x03]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp uno <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpunord_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpunordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x03]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpunord_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x03]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
  ret <4 x float> %res
}

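; comiss/vcomiss only set ZF, PF and CF, and an unordered compare sets all
; three, so the eq/neq tests below also have to check PF (setnp/setp) to give
; the right answer for NaN inputs; ge/gt/le/lt get by with a single setcc.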
define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comieq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comieq_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comieq_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comige_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comige_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comige_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comigt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comigt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comigt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comile_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comile_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comile_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comilt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comilt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comilt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comineq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comineq_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comineq_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone

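; NOTE: cvtss2si (opcode 0x2d) converts according to the current MXCSR
; rounding mode; the cvttss2si tests further down use opcode 0x2c, which
; always truncates toward zero.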
define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_cvt_ss2si:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cvt_ss2si:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cvt_ss2si:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X86-SSE-LABEL: test_mm_cvtsi32_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x2a,0x44,0x24,0x04]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_cvtsi32_ss:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_cvtsi32_ss:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_cvtsi32_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    cvtsi2ss %edi, %xmm0 # encoding: [0xf3,0x0f,0x2a,0xc7]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_cvtsi32_ss:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0xc7]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_cvtsi32_ss:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc7]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone

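; On 32-bit targets a float is returned in x87 st(0), so _mm_cvtss_f32 has to
; spill element 0 and reload it with flds; on x86-64 the result is already in
; %xmm0 and the function collapses to a plain return.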
define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X86-SSE-LABEL: test_mm_cvtss_f32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_cvtss_f32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_cvtss_f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_cvtss_f32:
; X64:       # %bb.0:
; X64-NEXT:    retq # encoding: [0xc3]
  %res = extractelement <4 x float> %a0, i32 0
  ret float %res
}

define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_cvtss_si32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cvtss_si32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cvtss_si32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}

define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_cvttss_si:
; SSE:       # %bb.0:
; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cvttss_si:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cvttss_si:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_cvttss_si32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cvttss_si32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cvttss_si32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %res
}

define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_div_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    divps %xmm1, %xmm0 # encoding: [0x0f,0x5e,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_div_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5e,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_div_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5e,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = fdiv <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5e,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_div_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5e,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_div_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5e,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fdiv = fdiv float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
  ret <4 x float> %res
}

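; The _MM_GET_* helpers below read MXCSR with stmxcsr through a stack slot and
; mask out one field each: exception mask = 0x1F80, exception state = 0x3F,
; flush-to-zero = 0x8000, rounding mode = 0x6000.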
define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; X86-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
; X86-SSE-NEXT:    # imm = 0x1F80
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
; X86-AVX-NEXT:    # imm = 0x1F80
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
; X64-SSE-NEXT:    # imm = 0x1F80
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
; X64-AVX-NEXT:    # imm = 0x1F80
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 8064
  ret i32 %4
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone

define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; X86-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 63
  ret i32 %4
}

define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; X86-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
; X86-SSE-NEXT:    # imm = 0x8000
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
; X86-AVX-NEXT:    # imm = 0x8000
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
; X64-SSE-NEXT:    # imm = 0x8000
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
; X64-AVX-NEXT:    # imm = 0x8000
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 32768
  ret i32 %4
}

define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; X86-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
; X86-SSE-NEXT:    # imm = 0x6000
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
; X86-AVX-NEXT:    # imm = 0x6000
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
; X64-SSE-NEXT:    # imm = 0x6000
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
; X64-AVX-NEXT:    # imm = 0x6000
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 24576
  ret i32 %4
}

define i32 @test_mm_getcsr() nounwind {
; X86-SSE-LABEL: test_mm_getcsr:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_mm_getcsr:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_getcsr:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_mm_getcsr:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  ret i32 %3
}

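; NOTE: _mm_load_ps requires a 16-byte aligned pointer (align 16 on the load
; below), which is what allows the movaps/vmovaps lowering.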
define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_load_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_load_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_load_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_load_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_load_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_load_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 16
  ret <4 x float> %res
}

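; _mm_load_ps1 splats a single scalar load: SSE targets use movss followed by
; shufps $0, while AVX targets can do it in one vbroadcastss.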
define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_load_ps1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_load_ps1:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_load_ps1:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_load_ps1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_load_ps1:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_load_ps1:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}

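; _mm_load_ss: load one float into lane 0 and zero the upper lanes ((v)movss).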
define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_load_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_load_ss:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovss (%eax), %xmm0 # encoding: [0xc5,0xfa,0x10,0x00]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_load_ss:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_load_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_load_ss:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
; X64-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_load_ss:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
; X64-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ld = load float, float* %a0, align 1
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

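; _mm_load1_ps: same IR and lowering as _mm_load_ps1 above.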
define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_load1_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_load1_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_load1_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_load1_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_load1_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_load1_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}

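; _mm_loadh_pi: load two floats into the upper half while keeping the lower
; half ((v)movhps).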
define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X86-SSE-LABEL: test_mm_loadh_pi:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movhps (%eax), %xmm0 # encoding: [0x0f,0x16,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadh_pi:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovhps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_loadh_pi:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovhps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0x00]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_loadh_pi:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movhps (%rdi), %xmm0 # encoding: [0x0f,0x16,0x07]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadh_pi:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovhps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0x07]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_loadh_pi:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovhps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0x07]
; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

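; _mm_loadl_pi: load two floats into the lower half while keeping the upper
; half ((v)movlps).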
define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X86-SSE-LABEL: test_mm_loadl_pi:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movlps (%eax), %xmm0 # encoding: [0x0f,0x12,0x00]
; X86-SSE-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadl_pi:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovlps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x12,0x00]
; X86-AVX1-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_loadl_pi:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovlps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x12,0x00]
; X86-AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_loadl_pi:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movlps (%rdi), %xmm0 # encoding: [0x0f,0x12,0x07]
; X64-SSE-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadl_pi:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovlps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x12,0x07]
; X64-AVX1-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_loadl_pi:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovlps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x12,0x07]
; X64-AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %res
}

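; _mm_loadr_ps: aligned load with the four elements reversed; SSE uses
; movaps+shufps, AVX folds the load into vpermilps.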
define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_loadr_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadr_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $27, (%eax), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
; X86-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_loadr_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vpermilps $27, (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
; X86-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_loadr_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadr_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $27, (%rdi), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
; X64-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_loadr_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpermilps $27, (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
; X64-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %ld = load <4 x float>, <4 x float>* %arg0, align 16
  %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}

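; _mm_loadu_ps: unaligned (align 1) load, selected as movups/vmovups.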
define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_loadu_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movups (%eax), %xmm0 # encoding: [0x0f,0x10,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadu_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_loadu_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_loadu_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups (%rdi), %xmm0 # encoding: [0x0f,0x10,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadu_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_loadu_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 1
  ret <4 x float> %res
}

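; The _mm_max_*/_mm_min_* tests call the llvm.x86.sse.{max,min}.{ps,ss}
; intrinsics directly and expect the matching (v)maxps/(v)maxss/(v)minps/(v)minss.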
define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_max_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    maxps %xmm1, %xmm0 # encoding: [0x0f,0x5f,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_max_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_max_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_max_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    maxss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5f,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_max_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5f,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_max_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_min_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    minps %xmm1, %xmm0 # encoding: [0x0f,0x5d,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_min_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vminps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_min_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_min_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    minss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5d,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_min_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5d,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_min_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

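; _mm_move_ss: take lane 0 from %a1 and lanes 1-3 from %a0; SSE uses register
; movss, AVX prefers an immediate vblendps.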
define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_move_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_move_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_movehl_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps %xmm1, %xmm0 # encoding: [0x0f,0x12,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_movehl_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x15,0xc0]
; AVX1-NEXT:    # xmm0 = xmm1[1],xmm0[1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_movehl_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x15,0xc0]
; AVX512-NEXT:    # xmm0 = xmm1[1],xmm0[1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_movelh_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_movelh_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_movelh_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

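; _mm_movemask_ps: pack the four sign bits into a GPR ((v)movmskps).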
define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_movemask_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movmskps %xmm0, %eax # encoding: [0x0f,0x50,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_movemask_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovmskps %xmm0, %eax # encoding: [0xc5,0xf8,0x50,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_mul_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm0 # encoding: [0x0f,0x59,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_mul_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x59,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_mul_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x59,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = fmul <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x59,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_mul_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x59,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_mul_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x59,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fmul = fmul float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fmul, i32 0
  ret <4 x float> %res
}

define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_or_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0 # encoding: [0x0f,0x56,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_or_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_or_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = or <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

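; _mm_prefetch with rw=0, locality=0 (_MM_HINT_NTA) selects prefetchnta.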
define void @test_mm_prefetch(i8* %a0) {
; X86-LABEL: test_mm_prefetch:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    prefetchnta (%eax) # encoding: [0x0f,0x18,0x00]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_prefetch:
; X64:       # %bb.0:
; X64-NEXT:    prefetchnta (%rdi) # encoding: [0x0f,0x18,0x07]
; X64-NEXT:    retq # encoding: [0xc3]
  call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
  ret void
}
declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone

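; _mm_rcp_*/_mm_rsqrt_*: hardware approximations of the reciprocal and the
; reciprocal square root, (v)rcpps/(v)rcpss and (v)rsqrtps/(v)rsqrtss.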
define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_rcp_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm0 # encoding: [0x0f,0x53,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_rcp_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x53,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
; SSE-LABEL: test_mm_rcp_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x53,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_rcp_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x53,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
  ret <4 x float> %rcp
}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_rsqrt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm0 # encoding: [0x0f,0x52,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_rsqrt_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x52,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
; SSE-LABEL: test_mm_rsqrt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x52,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_rsqrt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x52,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
  ret <4 x float> %rsqrt
}
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

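; The _MM_SET_* MXCSR helpers below share one read-modify-write pattern:
; stmxcsr to a stack slot, clear the field being set (exception masks 0x1F80,
; exception state 0x3F, flush-to-zero 0x8000, rounding control 0x6000), or in
; the new bits, then ldmxcsr.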
define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
; X86-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-SSE-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
; X86-SSE-NEXT:    # imm = 0xE07F
; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-AVX-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
; X86-AVX-NEXT:    # imm = 0xE07F
; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
; X64-SSE-NEXT:    # imm = 0xE07F
; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
; X64-AVX-NEXT:    # imm = 0xE07F
; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -8065
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone

define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
; X86-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-SSE-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-AVX-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -64
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
; X86-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-SSE-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
; X86-SSE-NEXT:    # imm = 0xFFFF7FFF
; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-AVX-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
; X86-AVX-NEXT:    # imm = 0xFFFF7FFF
; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
; X64-SSE-NEXT:    # imm = 0xFFFF7FFF
; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
; X64-AVX-NEXT:    # imm = 0xFFFF7FFF
; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -32769
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

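; _mm_set_ps: build <a3, a2, a1, a0> by element insertion; on X86 the scalars
; are loaded from the stack, on X64 they arrive in %xmm0-%xmm3.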
define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X86-SSE-LABEL: test_mm_set_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04]
; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_set_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSE-NEXT:    unpcklps %xmm2, %xmm3 # encoding: [0x0f,0x14,0xda]
; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-SSE-NEXT:    movlhps %xmm1, %xmm3 # encoding: [0x0f,0x16,0xd9]
; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm1[0]
; X64-SSE-NEXT:    movaps %xmm3, %xmm0 # encoding: [0x0f,0x28,0xc3]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_set_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
; X64-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
; X64-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
; X64-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_set_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
; X64-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
; X64-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
; X64-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a3, i32 0
  %res1  = insertelement <4 x float> %res0, float %a2, i32 1
  %res2  = insertelement <4 x float> %res1, float %a1, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
; X86-SSE-LABEL: test_mm_set_ps1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set_ps1:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set_ps1:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_set_ps1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_set_ps1:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_set_ps1:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

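; _MM_SET_ROUNDING_MODE: the same MXCSR read-modify-write pattern, updating
; the rounding-control field (bits 13-14).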
define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
; X86-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-SSE-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
; X86-SSE-NEXT:    # imm = 0x9FFF
; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-AVX-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
; X86-AVX-NEXT:    # imm = 0x9FFF
; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
; X64-SSE-NEXT:    # imm = 0x9FFF
; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
; X64-AVX-NEXT:    # imm = 0x9FFF
; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -24577
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

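; _mm_set_ss: scalar into lane 0 with zeroed upper lanes; the zeros are
; materialized with (v)xorps and merged by movss (SSE) or vblendps (AVX).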
define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X86-SSE-LABEL: test_mm_set_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
; X86-SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set_ss:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; X86-AVX1-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set_ss:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; X86-AVX512-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_set_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    xorps %xmm1, %xmm1 # encoding: [0x0f,0x57,0xc9]
; X64-SSE-NEXT:    movss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0x10,0xc8]
; X64-SSE-NEXT:    # xmm1 = xmm0[0],xmm1[1,2,3]
; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_mm_set_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; X64-AVX-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
; X64-AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3  = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
; X86-SSE-LABEL: test_mm_set1_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set1_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set1_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_set1_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_set1_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_set1_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

define void @test_mm_setcsr(i32 %a0) nounwind {
; X86-SSE-LABEL: test_mm_setcsr:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
; X86-SSE-NEXT:    ldmxcsr (%eax) # encoding: [0x0f,0xae,0x10]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_mm_setcsr:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
; X86-AVX-NEXT:    vldmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x10]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_setcsr:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_mm_setcsr:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %st = alloca i32, align 4
  store i32 %a0, i32* %st, align 4
  %bc = bitcast i32* %st to i8*
  call void @llvm.x86.sse.ldmxcsr(i8* %bc)
  ret void
}

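; _mm_setr_ps: _mm_set_ps with the argument order reversed, so %a0 ends up in
; lane 0.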
define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X86-SSE-LABEL: test_mm_setr_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_setr_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
; X86-AVX1-NEXT:    # xmm3 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
; X86-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
; X86-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_setr_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
; X86-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
; X86-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_setr_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps %xmm3, %xmm2 # encoding: [0x0f,0x14,0xd3]
; X64-SSE-NEXT:    # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2]
; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_setr_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX1-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX1-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_setr_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X64-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a1, i32 1
  %res2  = insertelement <4 x float> %res1, float %a2, i32 2
  %res3  = insertelement <4 x float> %res2, float %a3, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_setzero_ps() {
; SSE-LABEL: test_mm_setzero_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_setzero_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_setzero_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  ret <4 x float> zeroinitializer
}

define void @test_mm_sfence() nounwind {
; CHECK-LABEL: test_mm_sfence:
; CHECK:       # %bb.0:
; CHECK-NEXT:    sfence # encoding: [0x0f,0xae,0xf8]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  call void @llvm.x86.sse.sfence()
  ret void
}
declare void @llvm.x86.sse.sfence() nounwind readnone

define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_shuffle_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps $0, %xmm1, %xmm0 # encoding: [0x0f,0xc6,0xc1,0x00]
; SSE-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_shuffle_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
; AVX1-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_shuffle_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
; AVX512-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_sqrt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0 # encoding: [0x0f,0x51,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_sqrt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x51,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_sqrt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone

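; _mm_sqrt_ss computes the square root of element 0 only; the insertelement preserves the upper three elements of %a0.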
define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
; SSE-LABEL: test_mm_sqrt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_sqrt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_sqrt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext = extractelement <4 x float> %a0, i32 0
  %sqrt = call float @llvm.sqrt.f32(float %ext)
  %ins = insertelement <4 x float> %a0, float %sqrt, i32 0
  ret <4 x float> %ins
}
declare float @llvm.sqrt.f32(float) nounwind readnone

define float @test_mm_sqrt_ss_scalar(float %a0) {
; X86-SSE-LABEL: test_mm_sqrt_ss_scalar:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_sqrt_ss_scalar:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_sqrt_ss_scalar:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX512-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX512-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_sqrt_ss_scalar:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_sqrt_ss_scalar:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_sqrt_ss_scalar:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %sqrt = call float @llvm.sqrt.f32(float %a0)
  ret float %sqrt
}

define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_store_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_store_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_store_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_store_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16
  ret void
}

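; _mm_store_ps1 splats element 0 before the aligned store; it produces the same IR (and code) as _mm_store1_ps below.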
define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_store_ps1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store_ps1:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_store_ps1:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_store_ps1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store_ps1:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_store_ps1:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_store_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss %xmm0, (%eax) # encoding: [0xf3,0x0f,0x11,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store_ss:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovss %xmm0, (%eax) # encoding: [0xc5,0xfa,0x11,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_store_ss:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovss %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_store_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movss %xmm0, (%rdi) # encoding: [0xf3,0x0f,0x11,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store_ss:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_store_ss:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ext = extractelement <4 x float> %a1, i32 0
  store float %ext, float* %a0, align 1
  ret void
}

define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_store1_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store1_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_store1_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_store1_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store1_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_store1_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

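; _mm_storeh_pi bitcasts to <2 x i64> and stores element 1; SSE1 has no 64-bit XMM extract, so those targets go through a stack slot.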
define void @test_mm_storeh_pi(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X86-SSE1-LABEL: test_mm_storeh_pi:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %ebp # encoding: [0x55]
; X86-SSE1-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
; X86-SSE1-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
; X86-SSE1-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
; X86-SSE1-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
; X86-SSE1-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-SSE1-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
; X86-SSE1-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
; X86-SSE1-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
; X86-SSE1-NEXT:    popl %ebp # encoding: [0x5d]
; X86-SSE1-NEXT:    retl # encoding: [0xc3]
;
; X86-SSE2-LABEL: test_mm_storeh_pi:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE2-NEXT:    movhps %xmm0, (%eax) # encoding: [0x0f,0x17,0x00]
; X86-SSE2-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storeh_pi:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovhps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x17,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storeh_pi:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovhps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x17,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE1-LABEL: test_mm_storeh_pi:
; X64-SSE1:       # %bb.0:
; X64-SSE1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
; X64-SSE1-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xf0]
; X64-SSE1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE1-NEXT:    retq # encoding: [0xc3]
;
; X64-SSE2-LABEL: test_mm_storeh_pi:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    punpckhqdq %xmm0, %xmm0 # encoding: [0x66,0x0f,0x6d,0xc0]
; X64-SSE2-NEXT:    # xmm0 = xmm0[1,1]
; X64-SSE2-NEXT:    movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0]
; X64-SSE2-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE2-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storeh_pi:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpextrq $1, %xmm0, %rax # encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storeh_pi:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpextrq $1, %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc  = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 1
  store i64 %ext, i64* %ptr
  ret void
}

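; Variant that extracts the upper two floats with a shufflevector instead; every target can then select movhps directly.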
define void @test_mm_storeh_pi2(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X86-SSE-LABEL: test_mm_storeh_pi2:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movhps %xmm0, (%eax) # encoding: [0x0f,0x17,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storeh_pi2:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovhps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x17,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storeh_pi2:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovhps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x17,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_storeh_pi2:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movhps %xmm0, (%rdi) # encoding: [0x0f,0x17,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storeh_pi2:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovhps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x17,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storeh_pi2:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovhps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x17,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a0 to <2 x float>*
  %ext = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
  store <2 x float> %ext, <2 x float>* %ptr
  ret void
}

define void @test_mm_storel_pi(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X86-SSE1-LABEL: test_mm_storel_pi:
; X86-SSE1:       # %bb.0:
; X86-SSE1-NEXT:    pushl %ebp # encoding: [0x55]
; X86-SSE1-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
; X86-SSE1-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
; X86-SSE1-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
; X86-SSE1-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
; X86-SSE1-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
; X86-SSE1-NEXT:    movl (%esp), %ecx # encoding: [0x8b,0x0c,0x24]
; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-SSE1-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
; X86-SSE1-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
; X86-SSE1-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
; X86-SSE1-NEXT:    popl %ebp # encoding: [0x5d]
; X86-SSE1-NEXT:    retl # encoding: [0xc3]
;
; X86-SSE2-LABEL: test_mm_storel_pi:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE2-NEXT:    movlps %xmm0, (%eax) # encoding: [0x0f,0x13,0x00]
; X86-SSE2-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storel_pi:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovlps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x13,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storel_pi:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE1-LABEL: test_mm_storel_pi:
; X64-SSE1:       # %bb.0:
; X64-SSE1-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
; X64-SSE1-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
; X64-SSE1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE1-NEXT:    retq # encoding: [0xc3]
;
; X64-SSE2-LABEL: test_mm_storel_pi:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0]
; X64-SSE2-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE2-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storel_pi:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovq %xmm0, %rax # encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storel_pi:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc  = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 0
  store i64 %ext, i64* %ptr
  ret void
}

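; The <2 x float> store used below lowers to movlps on every target, avoiding the GPR round trip seen in test_mm_storel_pi.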
; FIXME: Switch the frontend to use this code.
define void @test_mm_storel_pi2(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X86-SSE-LABEL: test_mm_storel_pi2:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movlps %xmm0, (%eax) # encoding: [0x0f,0x13,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storel_pi2:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovlps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x13,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storel_pi2:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_storel_pi2:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movlps %xmm0, (%rdi) # encoding: [0x0f,0x13,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storel_pi2:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovlps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x13,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storel_pi2:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovlps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a0 to <2 x float>*
  %ext = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  store <2 x float> %ext, <2 x float>* %ptr
  ret void
}

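; _mm_storer_ps reverses the element order before the aligned store (shufps/vpermilps with immediate 27 = 0b00011011, i.e. [3,2,1,0]).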
define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_storer_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storer_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
; X86-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storer_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
; X86-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_storer_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storer_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
; X64-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storer_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
; X64-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

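; An align 1 store selects the unaligned movups/vmovups forms.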
define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_storeu_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storeu_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storeu_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_storeu_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups %xmm0, (%rdi) # encoding: [0x0f,0x11,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storeu_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storeu_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 1
  ret void
}

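; The !nontemporal metadata on the store selects the streaming movntps/vmovntps forms.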
define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_stream_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movntps %xmm0, (%eax) # encoding: [0x0f,0x2b,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_stream_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovntps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x2b,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_stream_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovntps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_stream_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movntps %xmm0, (%rdi) # encoding: [0x0f,0x2b,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_stream_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovntps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x2b,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_stream_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
  ret void
}

define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_sub_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    subps %xmm1, %xmm0 # encoding: [0x0f,0x5c,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_sub_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5c,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_sub_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5c,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = fsub <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5c,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_sub_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5c,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_sub_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5c,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fsub = fsub float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fsub, i32 0
  ret <4 x float> %res
}

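; _MM_TRANSPOSE4_PS transposes a 4x4 matrix in place: unpcklps/unpckhps interleave row pairs, then 64-bit moves (movlhps/movhlps, or vunpckhpd on AVX) assemble the transposed rows.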
define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
; X86-SSE-LABEL: test_MM_TRANSPOSE4_PS:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %esi # encoding: [0x56]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-SSE-NEXT:    movaps (%esi), %xmm0 # encoding: [0x0f,0x28,0x06]
; X86-SSE-NEXT:    movaps (%edx), %xmm1 # encoding: [0x0f,0x28,0x0a]
; X86-SSE-NEXT:    movaps (%ecx), %xmm2 # encoding: [0x0f,0x28,0x11]
; X86-SSE-NEXT:    movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18]
; X86-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
; X86-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X86-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
; X86-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
; X86-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X86-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
; X86-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
; X86-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
; X86-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
; X86-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
; X86-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
; X86-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
; X86-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
; X86-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
; X86-SSE-NEXT:    movaps %xmm1, (%esi) # encoding: [0x0f,0x29,0x0e]
; X86-SSE-NEXT:    movaps %xmm5, (%edx) # encoding: [0x0f,0x29,0x2a]
; X86-SSE-NEXT:    movaps %xmm3, (%ecx) # encoding: [0x0f,0x29,0x19]
; X86-SSE-NEXT:    movaps %xmm2, (%eax) # encoding: [0x0f,0x29,0x10]
; X86-SSE-NEXT:    popl %esi # encoding: [0x5e]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %esi # encoding: [0x56]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-AVX1-NEXT:    vmovaps (%esi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x06]
; X86-AVX1-NEXT:    vmovaps (%edx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
; X86-AVX1-NEXT:    vmovaps (%ecx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x11]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm3 # encoding: [0xc5,0xf8,0x28,0x18]
; X86-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
; X86-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
; X86-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
; X86-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
; X86-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
; X86-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
; X86-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
; X86-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
; X86-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
; X86-AVX1-NEXT:    vmovaps %xmm2, (%esi) # encoding: [0xc5,0xf8,0x29,0x16]
; X86-AVX1-NEXT:    vmovaps %xmm3, (%edx) # encoding: [0xc5,0xf8,0x29,0x1a]
; X86-AVX1-NEXT:    vmovaps %xmm4, (%ecx) # encoding: [0xc5,0xf8,0x29,0x21]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    popl %esi # encoding: [0x5e]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    pushl %esi # encoding: [0x56]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-AVX512-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
; X86-AVX512-NEXT:    vmovaps (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0a]
; X86-AVX512-NEXT:    vmovaps (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x11]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x18]
; X86-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
; X86-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
; X86-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
; X86-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
; X86-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
; X86-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
; X86-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
; X86-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
; X86-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
; X86-AVX512-NEXT:    vmovaps %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x16]
; X86-AVX512-NEXT:    vmovaps %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1a]
; X86-AVX512-NEXT:    vmovaps %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x21]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    popl %esi # encoding: [0x5e]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_TRANSPOSE4_PS:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
; X64-SSE-NEXT:    movaps (%rsi), %xmm1 # encoding: [0x0f,0x28,0x0e]
; X64-SSE-NEXT:    movaps (%rdx), %xmm2 # encoding: [0x0f,0x28,0x12]
; X64-SSE-NEXT:    movaps (%rcx), %xmm3 # encoding: [0x0f,0x28,0x19]
; X64-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
; X64-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
; X64-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X64-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
; X64-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
; X64-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X64-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
; X64-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
; X64-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
; X64-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
; X64-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
; X64-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
; X64-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
; X64-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
; X64-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
; X64-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
; X64-SSE-NEXT:    movaps %xmm1, (%rdi) # encoding: [0x0f,0x29,0x0f]
; X64-SSE-NEXT:    movaps %xmm5, (%rsi) # encoding: [0x0f,0x29,0x2e]
; X64-SSE-NEXT:    movaps %xmm3, (%rdx) # encoding: [0x0f,0x29,0x1a]
; X64-SSE-NEXT:    movaps %xmm2, (%rcx) # encoding: [0x0f,0x29,0x11]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
; X64-AVX1-NEXT:    vmovaps (%rsi), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0e]
; X64-AVX1-NEXT:    vmovaps (%rdx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x12]
; X64-AVX1-NEXT:    vmovaps (%rcx), %xmm3 # encoding: [0xc5,0xf8,0x28,0x19]
; X64-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
; X64-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
; X64-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
; X64-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
; X64-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
; X64-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
; X64-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
; X64-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
; X64-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
; X64-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
; X64-AVX1-NEXT:    vmovaps %xmm2, (%rdi) # encoding: [0xc5,0xf8,0x29,0x17]
; X64-AVX1-NEXT:    vmovaps %xmm3, (%rsi) # encoding: [0xc5,0xf8,0x29,0x1e]
; X64-AVX1-NEXT:    vmovaps %xmm4, (%rdx) # encoding: [0xc5,0xf8,0x29,0x22]
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rcx) # encoding: [0xc5,0xf8,0x29,0x01]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-AVX512-NEXT:    vmovaps (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0e]
; X64-AVX512-NEXT:    vmovaps (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x12]
; X64-AVX512-NEXT:    vmovaps (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x19]
; X64-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
; X64-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
; X64-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
; X64-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
; X64-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
; X64-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
; X64-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
; X64-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
; X64-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
; X64-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
; X64-AVX512-NEXT:    vmovaps %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x17]
; X64-AVX512-NEXT:    vmovaps %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1e]
; X64-AVX512-NEXT:    vmovaps %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x22]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x01]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %row0 = load <4 x float>, <4 x float>* %a0, align 16
  %row1 = load <4 x float>, <4 x float>* %a1, align 16
  %row2 = load <4 x float>, <4 x float>* %a2, align 16
  %row3 = load <4 x float>, <4 x float>* %a3, align 16
  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  store <4 x float> %res0, <4 x float>* %a0, align 16
  store <4 x float> %res1, <4 x float>* %a1, align 16
  store <4 x float> %res2, <4 x float>* %a2, align 16
  store <4 x float> %res3, <4 x float>* %a3, align 16
  ret void
}

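; ucomiss is an unordered compare, so equality must also check PF: the setnp/sete/andb sequence computes 'equal and not NaN'.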
3344define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3345; SSE-LABEL: test_mm_ucomieq_ss:
3346; SSE:       # %bb.0:
3347; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3348; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3349; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3350; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3351; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3352; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3353;
3354; AVX1-LABEL: test_mm_ucomieq_ss:
3355; AVX1:       # %bb.0:
3356; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3357; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3358; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3359; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3360; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3361; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3362;
3363; AVX512-LABEL: test_mm_ucomieq_ss:
3364; AVX512:       # %bb.0:
3365; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3366; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3367; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3368; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3369; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3370; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3371  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
3372  ret i32 %res
3373}
3374declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
3375
3376define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3377; SSE-LABEL: test_mm_ucomige_ss:
3378; SSE:       # %bb.0:
3379; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3380; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3381; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3382; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3383;
3384; AVX1-LABEL: test_mm_ucomige_ss:
3385; AVX1:       # %bb.0:
3386; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3387; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3388; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3389; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3390;
3391; AVX512-LABEL: test_mm_ucomige_ss:
3392; AVX512:       # %bb.0:
3393; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3394; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3395; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3396; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3397  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
3398  ret i32 %res
3399}
3400declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
3401
define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomigt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomigt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomigt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone

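; setbe (CF = 1 or ZF = 1) would wrongly return 1 for unordered inputs, so the
; compare operands are swapped: a0 <= a1 is computed as a1 >= a0 via setae.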
define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomile_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomile_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomile_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone

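; As with ucomile, the operands are swapped: a0 < a1 is computed as a1 > a0
; via seta, which returns 0 for unordered (NaN) inputs.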
define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomilt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomilt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomilt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone

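; The inverse of ucomieq: setne fires when ZF = 0 and setp when the compare is
; unordered (PF = 1); or-ing them returns 1 for "not equal or NaN".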
define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomineq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomineq_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomineq_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone

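; _mm_undefined_ps returns an undef value, so no instructions other than the
; return are expected on any target.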
define <4 x float> @test_mm_undefined_ps() {
; CHECK-LABEL: test_mm_undefined_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  ret <4 x float> undef
}

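; Shuffle mask <2, 6, 3, 7> interleaves the upper halves of the two vectors,
; matching unpckhps: result = a0[2], a1[2], a0[3], a1[3].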
define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_unpackhi_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
; SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_unpackhi_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
; AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_unpackhi_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
; AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %res
}

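; Shuffle mask <0, 4, 1, 5> interleaves the lower halves of the two vectors,
; matching unpcklps: result = a0[0], a1[0], a0[1], a1[1].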
define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_unpacklo_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_unpacklo_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x14,0xc1]
; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_unpacklo_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1]
; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %res
}

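; LLVM IR has no bitwise operations on floating-point types, so the vectors
; are bitcast to <4 x i32>, xor'd, and bitcast back, matching clang's
; expansion of _mm_xor_ps.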
define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_xor_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm0 # encoding: [0x0f,0x57,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_xor_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_xor_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = xor <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

!0 = !{i32 1}