1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
3; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
4; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
5; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
6; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
7; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
8
9; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
10
11define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
12; SSE-LABEL: test_mm_add_ps:
13; SSE:       # %bb.0:
14; SSE-NEXT:    addps %xmm1, %xmm0 # encoding: [0x0f,0x58,0xc1]
15; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
16;
17; AVX1-LABEL: test_mm_add_ps:
18; AVX1:       # %bb.0:
19; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1]
20; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
21;
22; AVX512-LABEL: test_mm_add_ps:
23; AVX512:       # %bb.0:
24; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
25; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
26  %res = fadd <4 x float> %a0, %a1
27  ret <4 x float> %res
28}
29
30define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
31; SSE-LABEL: test_mm_add_ss:
32; SSE:       # %bb.0:
33; SSE-NEXT:    addss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x58,0xc1]
34; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
35;
36; AVX1-LABEL: test_mm_add_ss:
37; AVX1:       # %bb.0:
38; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x58,0xc1]
39; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
40;
41; AVX512-LABEL: test_mm_add_ss:
42; AVX512:       # %bb.0:
43; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
44; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
45  %ext0 = extractelement <4 x float> %a0, i32 0
46  %ext1 = extractelement <4 x float> %a1, i32 0
47  %fadd = fadd float %ext0, %ext1
48  %res = insertelement <4 x float> %a0, float %fadd, i32 0
49  ret <4 x float> %res
50}
51
52define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
53; SSE-LABEL: test_mm_and_ps:
54; SSE:       # %bb.0:
55; SSE-NEXT:    andps %xmm1, %xmm0 # encoding: [0x0f,0x54,0xc1]
56; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
57;
58; AVX1-LABEL: test_mm_and_ps:
59; AVX1:       # %bb.0:
60; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0xc1]
61; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
62;
63; AVX512-LABEL: test_mm_and_ps:
64; AVX512:       # %bb.0:
65; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1]
66; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
67  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
68  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
69  %res = and <4 x i32> %arg0, %arg1
70  %bc = bitcast <4 x i32> %res to <4 x float>
71  ret <4 x float> %bc
72}
73
74define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
75; SSE-LABEL: test_mm_andnot_ps:
76; SSE:       # %bb.0:
77; SSE-NEXT:    andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
78; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
79;
80; AVX1-LABEL: test_mm_andnot_ps:
81; AVX1:       # %bb.0:
82; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1]
83; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
84;
85; AVX512-LABEL: test_mm_andnot_ps:
86; AVX512:       # %bb.0:
87; AVX512-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1]
88; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
89  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
90  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
91  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
92  %res = and <4 x i32> %not, %arg1
93  %bc = bitcast <4 x i32> %res to <4 x float>
94  ret <4 x float> %bc
95}
96
97define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
98; SSE-LABEL: test_mm_cmpeq_ps:
99; SSE:       # %bb.0:
100; SSE-NEXT:    cmpeqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x00]
101; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
102;
103; AVX1-LABEL: test_mm_cmpeq_ps:
104; AVX1:       # %bb.0:
105; AVX1-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
106; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
107;
108; AVX512-LABEL: test_mm_cmpeq_ps:
109; AVX512:       # %bb.0:
110; AVX512-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x00]
111; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
112; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
113  %cmp = fcmp oeq <4 x float> %a0, %a1
114  %sext = sext <4 x i1> %cmp to <4 x i32>
115  %res = bitcast <4 x i32> %sext to <4 x float>
116  ret <4 x float> %res
117}
118
119define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
120; SSE-LABEL: test_mm_cmpeq_ss:
121; SSE:       # %bb.0:
122; SSE-NEXT:    cmpeqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x00]
123; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
124;
125; AVX-LABEL: test_mm_cmpeq_ss:
126; AVX:       # %bb.0:
127; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x00]
128; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
129  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
130  ret <4 x float> %res
131}
132declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
133
134define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
135; SSE-LABEL: test_mm_cmpge_ps:
136; SSE:       # %bb.0:
137; SSE-NEXT:    cmpleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x02]
138; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
139; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
140;
141; AVX1-LABEL: test_mm_cmpge_ps:
142; AVX1:       # %bb.0:
143; AVX1-NEXT:    vcmpleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x02]
144; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
145;
146; AVX512-LABEL: test_mm_cmpge_ps:
147; AVX512:       # %bb.0:
148; AVX512-NEXT:    vcmpleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x02]
149; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
150; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
151  %cmp = fcmp ole <4 x float> %a1, %a0
152  %sext = sext <4 x i1> %cmp to <4 x i32>
153  %res = bitcast <4 x i32> %sext to <4 x float>
154  ret <4 x float> %res
155}
156
157define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
158; SSE-LABEL: test_mm_cmpge_ss:
159; SSE:       # %bb.0:
160; SSE-NEXT:    cmpless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x02]
161; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
162; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
163; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
164;
165; AVX-LABEL: test_mm_cmpge_ss:
166; AVX:       # %bb.0:
167; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
168; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
169; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
170; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
171  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
172  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
173  ret <4 x float> %res
174}
175
176define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
177; SSE-LABEL: test_mm_cmpgt_ps:
178; SSE:       # %bb.0:
179; SSE-NEXT:    cmpltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x01]
180; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
181; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
182;
183; AVX1-LABEL: test_mm_cmpgt_ps:
184; AVX1:       # %bb.0:
185; AVX1-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x01]
186; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
187;
188; AVX512-LABEL: test_mm_cmpgt_ps:
189; AVX512:       # %bb.0:
190; AVX512-NEXT:    vcmpltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x01]
191; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
192; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
193  %cmp = fcmp olt <4 x float> %a1, %a0
194  %sext = sext <4 x i1> %cmp to <4 x i32>
195  %res = bitcast <4 x i32> %sext to <4 x float>
196  ret <4 x float> %res
197}
198
199define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
200; SSE-LABEL: test_mm_cmpgt_ss:
201; SSE:       # %bb.0:
202; SSE-NEXT:    cmpltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x01]
203; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
204; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
205; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
206;
207; AVX-LABEL: test_mm_cmpgt_ss:
208; AVX:       # %bb.0:
209; AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
210; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
211; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
212; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
213  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
214  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
215  ret <4 x float> %res
216}
217
218define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
219; SSE-LABEL: test_mm_cmple_ps:
220; SSE:       # %bb.0:
221; SSE-NEXT:    cmpleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x02]
222; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
223;
224; AVX1-LABEL: test_mm_cmple_ps:
225; AVX1:       # %bb.0:
226; AVX1-NEXT:    vcmpleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x02]
227; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
228;
229; AVX512-LABEL: test_mm_cmple_ps:
230; AVX512:       # %bb.0:
231; AVX512-NEXT:    vcmpleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
232; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
233; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
234  %cmp = fcmp ole <4 x float> %a0, %a1
235  %sext = sext <4 x i1> %cmp to <4 x i32>
236  %res = bitcast <4 x i32> %sext to <4 x float>
237  ret <4 x float> %res
238}
239
240define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
241; SSE-LABEL: test_mm_cmple_ss:
242; SSE:       # %bb.0:
243; SSE-NEXT:    cmpless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x02]
244; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
245;
246; AVX-LABEL: test_mm_cmple_ss:
247; AVX:       # %bb.0:
248; AVX-NEXT:    vcmpless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x02]
249; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
250  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
251  ret <4 x float> %res
252}
253
254define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
255; SSE-LABEL: test_mm_cmplt_ps:
256; SSE:       # %bb.0:
257; SSE-NEXT:    cmpltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x01]
258; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
259;
260; AVX1-LABEL: test_mm_cmplt_ps:
261; AVX1:       # %bb.0:
262; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x01]
263; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
264;
265; AVX512-LABEL: test_mm_cmplt_ps:
266; AVX512:       # %bb.0:
267; AVX512-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
268; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
269; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
270  %cmp = fcmp olt <4 x float> %a0, %a1
271  %sext = sext <4 x i1> %cmp to <4 x i32>
272  %res = bitcast <4 x i32> %sext to <4 x float>
273  ret <4 x float> %res
274}
275
276define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
277; SSE-LABEL: test_mm_cmplt_ss:
278; SSE:       # %bb.0:
279; SSE-NEXT:    cmpltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x01]
280; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
281;
282; AVX-LABEL: test_mm_cmplt_ss:
283; AVX:       # %bb.0:
284; AVX-NEXT:    vcmpltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x01]
285; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
286  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
287  ret <4 x float> %res
288}
289
290define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
291; SSE-LABEL: test_mm_cmpneq_ps:
292; SSE:       # %bb.0:
293; SSE-NEXT:    cmpneqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x04]
294; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
295;
296; AVX1-LABEL: test_mm_cmpneq_ps:
297; AVX1:       # %bb.0:
298; AVX1-NEXT:    vcmpneqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x04]
299; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
300;
301; AVX512-LABEL: test_mm_cmpneq_ps:
302; AVX512:       # %bb.0:
303; AVX512-NEXT:    vcmpneqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x04]
304; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
305; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
306  %cmp = fcmp une <4 x float> %a0, %a1
307  %sext = sext <4 x i1> %cmp to <4 x i32>
308  %res = bitcast <4 x i32> %sext to <4 x float>
309  ret <4 x float> %res
310}
311
312define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
313; SSE-LABEL: test_mm_cmpneq_ss:
314; SSE:       # %bb.0:
315; SSE-NEXT:    cmpneqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x04]
316; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
317;
318; AVX-LABEL: test_mm_cmpneq_ss:
319; AVX:       # %bb.0:
320; AVX-NEXT:    vcmpneqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x04]
321; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
322  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
323  ret <4 x float> %res
324}
325
326define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
327; SSE-LABEL: test_mm_cmpnge_ps:
328; SSE:       # %bb.0:
329; SSE-NEXT:    cmpnleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x06]
330; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
331; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
332;
333; AVX1-LABEL: test_mm_cmpnge_ps:
334; AVX1:       # %bb.0:
335; AVX1-NEXT:    vcmpnleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x06]
336; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
337;
338; AVX512-LABEL: test_mm_cmpnge_ps:
339; AVX512:       # %bb.0:
340; AVX512-NEXT:    vcmpnleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x06]
341; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
342; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
343  %cmp = fcmp ugt <4 x float> %a1, %a0
344  %sext = sext <4 x i1> %cmp to <4 x i32>
345  %res = bitcast <4 x i32> %sext to <4 x float>
346  ret <4 x float> %res
347}
348
349define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
350; SSE-LABEL: test_mm_cmpnge_ss:
351; SSE:       # %bb.0:
352; SSE-NEXT:    cmpnless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x06]
353; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
354; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
355; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
356;
357; AVX-LABEL: test_mm_cmpnge_ss:
358; AVX:       # %bb.0:
359; AVX-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
360; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
361; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
362; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
363  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
364  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
365  ret <4 x float> %res
366}
367
368define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
369; SSE-LABEL: test_mm_cmpngt_ps:
370; SSE:       # %bb.0:
371; SSE-NEXT:    cmpnltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x05]
372; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
373; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
374;
375; AVX1-LABEL: test_mm_cmpngt_ps:
376; AVX1:       # %bb.0:
377; AVX1-NEXT:    vcmpnltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x05]
378; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
379;
380; AVX512-LABEL: test_mm_cmpngt_ps:
381; AVX512:       # %bb.0:
382; AVX512-NEXT:    vcmpnltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x05]
383; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
384; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
385  %cmp = fcmp uge <4 x float> %a1, %a0
386  %sext = sext <4 x i1> %cmp to <4 x i32>
387  %res = bitcast <4 x i32> %sext to <4 x float>
388  ret <4 x float> %res
389}
390
391define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
392; SSE-LABEL: test_mm_cmpngt_ss:
393; SSE:       # %bb.0:
394; SSE-NEXT:    cmpnltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x05]
395; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
396; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
397; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
398;
399; AVX-LABEL: test_mm_cmpngt_ss:
400; AVX:       # %bb.0:
401; AVX-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
402; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
403; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
404; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
405  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
406  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
407  ret <4 x float> %res
408}
409
410define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
411; SSE-LABEL: test_mm_cmpnle_ps:
412; SSE:       # %bb.0:
413; SSE-NEXT:    cmpnleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x06]
414; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
415;
416; AVX1-LABEL: test_mm_cmpnle_ps:
417; AVX1:       # %bb.0:
418; AVX1-NEXT:    vcmpnleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x06]
419; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
420;
421; AVX512-LABEL: test_mm_cmpnle_ps:
422; AVX512:       # %bb.0:
423; AVX512-NEXT:    vcmpnleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x06]
424; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
425; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
426  %cmp = fcmp ugt <4 x float> %a0, %a1
427  %sext = sext <4 x i1> %cmp to <4 x i32>
428  %res = bitcast <4 x i32> %sext to <4 x float>
429  ret <4 x float> %res
430}
431
432define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
433; SSE-LABEL: test_mm_cmpnle_ss:
434; SSE:       # %bb.0:
435; SSE-NEXT:    cmpnless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x06]
436; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
437;
438; AVX-LABEL: test_mm_cmpnle_ss:
439; AVX:       # %bb.0:
440; AVX-NEXT:    vcmpnless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x06]
441; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
442  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
443  ret <4 x float> %res
444}
445
446define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
447; SSE-LABEL: test_mm_cmpnlt_ps:
448; SSE:       # %bb.0:
449; SSE-NEXT:    cmpnltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x05]
450; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
451;
452; AVX1-LABEL: test_mm_cmpnlt_ps:
453; AVX1:       # %bb.0:
454; AVX1-NEXT:    vcmpnltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x05]
455; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
456;
457; AVX512-LABEL: test_mm_cmpnlt_ps:
458; AVX512:       # %bb.0:
459; AVX512-NEXT:    vcmpnltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x05]
460; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
461; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
462  %cmp = fcmp uge <4 x float> %a0, %a1
463  %sext = sext <4 x i1> %cmp to <4 x i32>
464  %res = bitcast <4 x i32> %sext to <4 x float>
465  ret <4 x float> %res
466}
467
468define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
469; SSE-LABEL: test_mm_cmpnlt_ss:
470; SSE:       # %bb.0:
471; SSE-NEXT:    cmpnltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x05]
472; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
473;
474; AVX-LABEL: test_mm_cmpnlt_ss:
475; AVX:       # %bb.0:
476; AVX-NEXT:    vcmpnltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x05]
477; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
478  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
479  ret <4 x float> %res
480}
481
482define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
483; SSE-LABEL: test_mm_cmpord_ps:
484; SSE:       # %bb.0:
485; SSE-NEXT:    cmpordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x07]
486; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
487;
488; AVX1-LABEL: test_mm_cmpord_ps:
489; AVX1:       # %bb.0:
490; AVX1-NEXT:    vcmpordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x07]
491; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
492;
493; AVX512-LABEL: test_mm_cmpord_ps:
494; AVX512:       # %bb.0:
495; AVX512-NEXT:    vcmpordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x07]
496; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
497; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
498  %cmp = fcmp ord <4 x float> %a0, %a1
499  %sext = sext <4 x i1> %cmp to <4 x i32>
500  %res = bitcast <4 x i32> %sext to <4 x float>
501  ret <4 x float> %res
502}
503
504define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
505; SSE-LABEL: test_mm_cmpord_ss:
506; SSE:       # %bb.0:
507; SSE-NEXT:    cmpordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x07]
508; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
509;
510; AVX-LABEL: test_mm_cmpord_ss:
511; AVX:       # %bb.0:
512; AVX-NEXT:    vcmpordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x07]
513; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
514  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
515  ret <4 x float> %res
516}
517
518define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
519; SSE-LABEL: test_mm_cmpunord_ps:
520; SSE:       # %bb.0:
521; SSE-NEXT:    cmpunordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x03]
522; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
523;
524; AVX1-LABEL: test_mm_cmpunord_ps:
525; AVX1:       # %bb.0:
526; AVX1-NEXT:    vcmpunordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x03]
527; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
528;
529; AVX512-LABEL: test_mm_cmpunord_ps:
530; AVX512:       # %bb.0:
531; AVX512-NEXT:    vcmpunordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x03]
532; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
533; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
534  %cmp = fcmp uno <4 x float> %a0, %a1
535  %sext = sext <4 x i1> %cmp to <4 x i32>
536  %res = bitcast <4 x i32> %sext to <4 x float>
537  ret <4 x float> %res
538}
539
540define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
541; SSE-LABEL: test_mm_cmpunord_ss:
542; SSE:       # %bb.0:
543; SSE-NEXT:    cmpunordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x03]
544; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
545;
546; AVX-LABEL: test_mm_cmpunord_ss:
547; AVX:       # %bb.0:
548; AVX-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x03]
549; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
550  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
551  ret <4 x float> %res
552}
553
554define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
555; SSE-LABEL: test_mm_comieq_ss:
556; SSE:       # %bb.0:
557; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
558; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
559; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
560; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
561; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
562; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
563;
564; AVX1-LABEL: test_mm_comieq_ss:
565; AVX1:       # %bb.0:
566; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
567; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
568; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
569; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
570; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
571; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
572;
573; AVX512-LABEL: test_mm_comieq_ss:
574; AVX512:       # %bb.0:
575; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
576; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
577; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
578; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
579; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
580; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
581  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
582  ret i32 %res
583}
584declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
585
586define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
587; SSE-LABEL: test_mm_comige_ss:
588; SSE:       # %bb.0:
589; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
590; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
591; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
592; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
593;
594; AVX1-LABEL: test_mm_comige_ss:
595; AVX1:       # %bb.0:
596; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
597; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
598; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
599; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
600;
601; AVX512-LABEL: test_mm_comige_ss:
602; AVX512:       # %bb.0:
603; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
604; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
605; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
606; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
607  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
608  ret i32 %res
609}
610declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
611
612define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
613; SSE-LABEL: test_mm_comigt_ss:
614; SSE:       # %bb.0:
615; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
616; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
617; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
618; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
619;
620; AVX1-LABEL: test_mm_comigt_ss:
621; AVX1:       # %bb.0:
622; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
623; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
624; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
625; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
626;
627; AVX512-LABEL: test_mm_comigt_ss:
628; AVX512:       # %bb.0:
629; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
630; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
631; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
632; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
633  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
634  ret i32 %res
635}
636declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
637
638define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
639; SSE-LABEL: test_mm_comile_ss:
640; SSE:       # %bb.0:
641; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
642; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
643; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
644; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
645;
646; AVX1-LABEL: test_mm_comile_ss:
647; AVX1:       # %bb.0:
648; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
649; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
650; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
651; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
652;
653; AVX512-LABEL: test_mm_comile_ss:
654; AVX512:       # %bb.0:
655; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
656; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
657; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
658; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
659  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
660  ret i32 %res
661}
662declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
663
664define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
665; SSE-LABEL: test_mm_comilt_ss:
666; SSE:       # %bb.0:
667; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
668; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
669; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
670; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
671;
672; AVX1-LABEL: test_mm_comilt_ss:
673; AVX1:       # %bb.0:
674; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
675; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
676; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
677; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
678;
679; AVX512-LABEL: test_mm_comilt_ss:
680; AVX512:       # %bb.0:
681; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
682; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
683; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
684; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
685  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
686  ret i32 %res
687}
688declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
689
690define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
691; SSE-LABEL: test_mm_comineq_ss:
692; SSE:       # %bb.0:
693; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
694; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
695; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
696; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
697; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
698; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
699;
700; AVX1-LABEL: test_mm_comineq_ss:
701; AVX1:       # %bb.0:
702; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
703; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
704; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
705; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
706; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
707; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
708;
709; AVX512-LABEL: test_mm_comineq_ss:
710; AVX512:       # %bb.0:
711; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
712; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
713; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
714; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
715; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
716; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
717  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
718  ret i32 %res
719}
720declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
721
722define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
723; SSE-LABEL: test_mm_cvt_ss2si:
724; SSE:       # %bb.0:
725; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
726; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
727;
728; AVX1-LABEL: test_mm_cvt_ss2si:
729; AVX1:       # %bb.0:
730; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
731; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
732;
733; AVX512-LABEL: test_mm_cvt_ss2si:
734; AVX512:       # %bb.0:
735; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
736; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
737  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
738  ret i32 %res
739}
740declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
741
742define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
743; X86-SSE-LABEL: test_mm_cvtsi32_ss:
744; X86-SSE:       # %bb.0:
745; X86-SSE-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x2a,0x44,0x24,0x04]
746; X86-SSE-NEXT:    retl # encoding: [0xc3]
747;
748; X86-AVX1-LABEL: test_mm_cvtsi32_ss:
749; X86-AVX1:       # %bb.0:
750; X86-AVX1-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
751; X86-AVX1-NEXT:    retl # encoding: [0xc3]
752;
753; X86-AVX512-LABEL: test_mm_cvtsi32_ss:
754; X86-AVX512:       # %bb.0:
755; X86-AVX512-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
756; X86-AVX512-NEXT:    retl # encoding: [0xc3]
757;
758; X64-SSE-LABEL: test_mm_cvtsi32_ss:
759; X64-SSE:       # %bb.0:
760; X64-SSE-NEXT:    cvtsi2ssl %edi, %xmm0 # encoding: [0xf3,0x0f,0x2a,0xc7]
761; X64-SSE-NEXT:    retq # encoding: [0xc3]
762;
763; X64-AVX1-LABEL: test_mm_cvtsi32_ss:
764; X64-AVX1:       # %bb.0:
765; X64-AVX1-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0xc7]
766; X64-AVX1-NEXT:    retq # encoding: [0xc3]
767;
768; X64-AVX512-LABEL: test_mm_cvtsi32_ss:
769; X64-AVX512:       # %bb.0:
770; X64-AVX512-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc7]
771; X64-AVX512-NEXT:    retq # encoding: [0xc3]
772  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
773  ret <4 x float> %res
774}
775declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
776
777define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
778; X86-SSE-LABEL: test_mm_cvtss_f32:
779; X86-SSE:       # %bb.0:
780; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
781; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
782; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
783; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
784; X86-SSE-NEXT:    retl # encoding: [0xc3]
785;
786; X86-AVX1-LABEL: test_mm_cvtss_f32:
787; X86-AVX1:       # %bb.0:
788; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
789; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
790; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
791; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
792; X86-AVX1-NEXT:    retl # encoding: [0xc3]
793;
794; X86-AVX512-LABEL: test_mm_cvtss_f32:
795; X86-AVX512:       # %bb.0:
796; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
797; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
798; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
799; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
800; X86-AVX512-NEXT:    retl # encoding: [0xc3]
801;
802; X64-LABEL: test_mm_cvtss_f32:
803; X64:       # %bb.0:
804; X64-NEXT:    retq # encoding: [0xc3]
805  %res = extractelement <4 x float> %a0, i32 0
806  ret float %res
807}
808
809define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
810; SSE-LABEL: test_mm_cvtss_si32:
811; SSE:       # %bb.0:
812; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
813; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
814;
815; AVX1-LABEL: test_mm_cvtss_si32:
816; AVX1:       # %bb.0:
817; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
818; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
819;
820; AVX512-LABEL: test_mm_cvtss_si32:
821; AVX512:       # %bb.0:
822; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
823; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
824  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
825  ret i32 %res
826}
827
828define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
829; SSE-LABEL: test_mm_cvttss_si:
830; SSE:       # %bb.0:
831; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
832; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
833;
834; AVX1-LABEL: test_mm_cvttss_si:
835; AVX1:       # %bb.0:
836; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
837; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
838;
839; AVX512-LABEL: test_mm_cvttss_si:
840; AVX512:       # %bb.0:
841; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
842; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
843  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
844  ret i32 %res
845}
846declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
847
848define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
849; SSE-LABEL: test_mm_cvttss_si32:
850; SSE:       # %bb.0:
851; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
852; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
853;
854; AVX1-LABEL: test_mm_cvttss_si32:
855; AVX1:       # %bb.0:
856; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
857; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
858;
859; AVX512-LABEL: test_mm_cvttss_si32:
860; AVX512:       # %bb.0:
861; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
862; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
863  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
864  ret i32 %res
865}
866
867define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
868; SSE-LABEL: test_mm_div_ps:
869; SSE:       # %bb.0:
870; SSE-NEXT:    divps %xmm1, %xmm0 # encoding: [0x0f,0x5e,0xc1]
871; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
872;
873; AVX1-LABEL: test_mm_div_ps:
874; AVX1:       # %bb.0:
875; AVX1-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5e,0xc1]
876; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
877;
878; AVX512-LABEL: test_mm_div_ps:
879; AVX512:       # %bb.0:
880; AVX512-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5e,0xc1]
881; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
882  %res = fdiv <4 x float> %a0, %a1
883  ret <4 x float> %res
884}
885
886define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
887; SSE-LABEL: test_mm_div_ss:
888; SSE:       # %bb.0:
889; SSE-NEXT:    divss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5e,0xc1]
890; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
891;
892; AVX1-LABEL: test_mm_div_ss:
893; AVX1:       # %bb.0:
894; AVX1-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5e,0xc1]
895; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
896;
897; AVX512-LABEL: test_mm_div_ss:
898; AVX512:       # %bb.0:
899; AVX512-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5e,0xc1]
900; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
901  %ext0 = extractelement <4 x float> %a0, i32 0
902  %ext1 = extractelement <4 x float> %a1, i32 0
903  %fdiv = fdiv float %ext0, %ext1
904  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
905  ret <4 x float> %res
906}
907
908define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
909; X86-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
910; X86-SSE:       # %bb.0:
911; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
912; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
913; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
914; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
915; X86-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
916; X86-SSE-NEXT:    # imm = 0x1F80
917; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
918; X86-SSE-NEXT:    retl # encoding: [0xc3]
919;
920; X86-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
921; X86-AVX:       # %bb.0:
922; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
923; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
924; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
925; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
926; X86-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
927; X86-AVX-NEXT:    # imm = 0x1F80
928; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
929; X86-AVX-NEXT:    retl # encoding: [0xc3]
930;
931; X64-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
932; X64-SSE:       # %bb.0:
933; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
934; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
935; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
936; X64-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
937; X64-SSE-NEXT:    # imm = 0x1F80
938; X64-SSE-NEXT:    retq # encoding: [0xc3]
939;
940; X64-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
941; X64-AVX:       # %bb.0:
942; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
943; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
944; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
945; X64-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
946; X64-AVX-NEXT:    # imm = 0x1F80
947; X64-AVX-NEXT:    retq # encoding: [0xc3]
948  %1 = alloca i32, align 4
949  %2 = bitcast i32* %1 to i8*
950  call void @llvm.x86.sse.stmxcsr(i8* %2)
951  %3 = load i32, i32* %1, align 4
952  %4 = and i32 %3, 8064
953  ret i32 %4
954}
955declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone
956
957define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
958; X86-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
959; X86-SSE:       # %bb.0:
960; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
961; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
962; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
963; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
964; X86-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
965; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
966; X86-SSE-NEXT:    retl # encoding: [0xc3]
967;
968; X86-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
969; X86-AVX:       # %bb.0:
970; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
971; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
972; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
973; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
974; X86-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
975; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
976; X86-AVX-NEXT:    retl # encoding: [0xc3]
977;
978; X64-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
979; X64-SSE:       # %bb.0:
980; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
981; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
982; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
983; X64-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
984; X64-SSE-NEXT:    retq # encoding: [0xc3]
985;
986; X64-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
987; X64-AVX:       # %bb.0:
988; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
989; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
990; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
991; X64-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
992; X64-AVX-NEXT:    retq # encoding: [0xc3]
993  %1 = alloca i32, align 4
994  %2 = bitcast i32* %1 to i8*
995  call void @llvm.x86.sse.stmxcsr(i8* %2)
996  %3 = load i32, i32* %1, align 4
997  %4 = and i32 %3, 63
998  ret i32 %4
999}
1000
1001define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
1002; X86-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
1003; X86-SSE:       # %bb.0:
1004; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1005; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1006; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
1007; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1008; X86-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
1009; X86-SSE-NEXT:    # imm = 0x8000
1010; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
1011; X86-SSE-NEXT:    retl # encoding: [0xc3]
1012;
1013; X86-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
1014; X86-AVX:       # %bb.0:
1015; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1016; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1017; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
1018; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1019; X86-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
1020; X86-AVX-NEXT:    # imm = 0x8000
1021; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
1022; X86-AVX-NEXT:    retl # encoding: [0xc3]
1023;
1024; X64-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
1025; X64-SSE:       # %bb.0:
1026; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1027; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1028; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1029; X64-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
1030; X64-SSE-NEXT:    # imm = 0x8000
1031; X64-SSE-NEXT:    retq # encoding: [0xc3]
1032;
1033; X64-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
1034; X64-AVX:       # %bb.0:
1035; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1036; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1037; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1038; X64-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
1039; X64-AVX-NEXT:    # imm = 0x8000
1040; X64-AVX-NEXT:    retq # encoding: [0xc3]
1041  %1 = alloca i32, align 4
1042  %2 = bitcast i32* %1 to i8*
1043  call void @llvm.x86.sse.stmxcsr(i8* %2)
1044  %3 = load i32, i32* %1, align 4
1045  %4 = and i32 %3, 32768
1046  ret i32 %4
1047}
1048
1049define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
1050; X86-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
1051; X86-SSE:       # %bb.0:
1052; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1053; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1054; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
1055; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1056; X86-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
1057; X86-SSE-NEXT:    # imm = 0x6000
1058; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
1059; X86-SSE-NEXT:    retl # encoding: [0xc3]
1060;
1061; X86-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
1062; X86-AVX:       # %bb.0:
1063; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1064; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1065; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
1066; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1067; X86-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
1068; X86-AVX-NEXT:    # imm = 0x6000
1069; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
1070; X86-AVX-NEXT:    retl # encoding: [0xc3]
1071;
1072; X64-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
1073; X64-SSE:       # %bb.0:
1074; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1075; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1076; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1077; X64-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
1078; X64-SSE-NEXT:    # imm = 0x6000
1079; X64-SSE-NEXT:    retq # encoding: [0xc3]
1080;
1081; X64-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
1082; X64-AVX:       # %bb.0:
1083; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1084; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1085; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1086; X64-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
1087; X64-AVX-NEXT:    # imm = 0x6000
1088; X64-AVX-NEXT:    retq # encoding: [0xc3]
1089  %1 = alloca i32, align 4
1090  %2 = bitcast i32* %1 to i8*
1091  call void @llvm.x86.sse.stmxcsr(i8* %2)
1092  %3 = load i32, i32* %1, align 4
1093  %4 = and i32 %3, 24576
1094  ret i32 %4
1095}
1096
1097define i32 @test_mm_getcsr() nounwind {
1098; X86-SSE-LABEL: test_mm_getcsr:
1099; X86-SSE:       # %bb.0:
1100; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1101; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1102; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
1103; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1104; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
1105; X86-SSE-NEXT:    retl # encoding: [0xc3]
1106;
1107; X86-AVX-LABEL: test_mm_getcsr:
1108; X86-AVX:       # %bb.0:
1109; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1110; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1111; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
1112; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1113; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
1114; X86-AVX-NEXT:    retl # encoding: [0xc3]
1115;
1116; X64-SSE-LABEL: test_mm_getcsr:
1117; X64-SSE:       # %bb.0:
1118; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1119; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1120; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1121; X64-SSE-NEXT:    retq # encoding: [0xc3]
1122;
1123; X64-AVX-LABEL: test_mm_getcsr:
1124; X64-AVX:       # %bb.0:
1125; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1126; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1127; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1128; X64-AVX-NEXT:    retq # encoding: [0xc3]
1129  %1 = alloca i32, align 4
1130  %2 = bitcast i32* %1 to i8*
1131  call void @llvm.x86.sse.stmxcsr(i8* %2)
1132  %3 = load i32, i32* %1, align 4
1133  ret i32 %3
1134}
1135
1136define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
1137; X86-SSE-LABEL: test_mm_load_ps:
1138; X86-SSE:       # %bb.0:
1139; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1140; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
1141; X86-SSE-NEXT:    retl # encoding: [0xc3]
1142;
1143; X86-AVX1-LABEL: test_mm_load_ps:
1144; X86-AVX1:       # %bb.0:
1145; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1146; X86-AVX1-NEXT:    vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00]
1147; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1148;
1149; X86-AVX512-LABEL: test_mm_load_ps:
1150; X86-AVX512:       # %bb.0:
1151; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1152; X86-AVX512-NEXT:    vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00]
1153; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1154;
1155; X64-SSE-LABEL: test_mm_load_ps:
1156; X64-SSE:       # %bb.0:
1157; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
1158; X64-SSE-NEXT:    retq # encoding: [0xc3]
1159;
1160; X64-AVX1-LABEL: test_mm_load_ps:
1161; X64-AVX1:       # %bb.0:
1162; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
1163; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1164;
1165; X64-AVX512-LABEL: test_mm_load_ps:
1166; X64-AVX512:       # %bb.0:
1167; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
1168; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1169  %arg0 = bitcast float* %a0 to <4 x float>*
1170  %res = load <4 x float>, <4 x float>* %arg0, align 16
1171  ret <4 x float> %res
1172}
1173
1174define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
1175; X86-SSE-LABEL: test_mm_load_ps1:
1176; X86-SSE:       # %bb.0:
1177; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1178; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
1179; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1180; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1181; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1182; X86-SSE-NEXT:    retl # encoding: [0xc3]
1183;
1184; X86-AVX1-LABEL: test_mm_load_ps1:
1185; X86-AVX1:       # %bb.0:
1186; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1187; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
1188; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1189;
1190; X86-AVX512-LABEL: test_mm_load_ps1:
1191; X86-AVX512:       # %bb.0:
1192; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1193; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
1194; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1195;
1196; X64-SSE-LABEL: test_mm_load_ps1:
1197; X64-SSE:       # %bb.0:
1198; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
1199; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1200; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1201; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1202; X64-SSE-NEXT:    retq # encoding: [0xc3]
1203;
1204; X64-AVX1-LABEL: test_mm_load_ps1:
1205; X64-AVX1:       # %bb.0:
1206; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
1207; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1208;
1209; X64-AVX512-LABEL: test_mm_load_ps1:
1210; X64-AVX512:       # %bb.0:
1211; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
1212; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1213  %ld = load float, float* %a0, align 4
1214  %res0 = insertelement <4 x float> undef, float %ld, i32 0
1215  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
1216  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
1217  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
1218  ret <4 x float> %res3
1219}
1220
1221define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
1222; X86-SSE-LABEL: test_mm_load_ss:
1223; X86-SSE:       # %bb.0:
1224; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1225; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
1226; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1227; X86-SSE-NEXT:    retl # encoding: [0xc3]
1228;
1229; X86-AVX1-LABEL: test_mm_load_ss:
1230; X86-AVX1:       # %bb.0:
1231; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1232; X86-AVX1-NEXT:    vmovss (%eax), %xmm0 # encoding: [0xc5,0xfa,0x10,0x00]
1233; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
1234; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1235;
1236; X86-AVX512-LABEL: test_mm_load_ss:
1237; X86-AVX512:       # %bb.0:
1238; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1239; X86-AVX512-NEXT:    vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
1240; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
1241; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1242;
1243; X64-SSE-LABEL: test_mm_load_ss:
1244; X64-SSE:       # %bb.0:
1245; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
1246; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1247; X64-SSE-NEXT:    retq # encoding: [0xc3]
1248;
1249; X64-AVX1-LABEL: test_mm_load_ss:
1250; X64-AVX1:       # %bb.0:
1251; X64-AVX1-NEXT:    vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
1252; X64-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
1253; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1254;
1255; X64-AVX512-LABEL: test_mm_load_ss:
1256; X64-AVX512:       # %bb.0:
1257; X64-AVX512-NEXT:    vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
1258; X64-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
1259; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1260  %ld = load float, float* %a0, align 1
1261  %res0 = insertelement <4 x float> undef, float %ld, i32 0
1262  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
1263  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
1264  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
1265  ret <4 x float> %res3
1266}
1267
1268define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
1269; X86-SSE-LABEL: test_mm_load1_ps:
1270; X86-SSE:       # %bb.0:
1271; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1272; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
1273; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1274; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1275; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1276; X86-SSE-NEXT:    retl # encoding: [0xc3]
1277;
1278; X86-AVX1-LABEL: test_mm_load1_ps:
1279; X86-AVX1:       # %bb.0:
1280; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1281; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
1282; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1283;
1284; X86-AVX512-LABEL: test_mm_load1_ps:
1285; X86-AVX512:       # %bb.0:
1286; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1287; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
1288; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1289;
1290; X64-SSE-LABEL: test_mm_load1_ps:
1291; X64-SSE:       # %bb.0:
1292; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
1293; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1294; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1295; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1296; X64-SSE-NEXT:    retq # encoding: [0xc3]
1297;
1298; X64-AVX1-LABEL: test_mm_load1_ps:
1299; X64-AVX1:       # %bb.0:
1300; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
1301; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1302;
1303; X64-AVX512-LABEL: test_mm_load1_ps:
1304; X64-AVX512:       # %bb.0:
1305; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
1306; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1307  %ld = load float, float* %a0, align 4
1308  %res0 = insertelement <4 x float> undef, float %ld, i32 0
1309  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
1310  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
1311  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
1312  ret <4 x float> %res3
1313}
1314
1315define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
1316; X86-SSE-LABEL: test_mm_loadh_pi:
1317; X86-SSE:       # %bb.0:
1318; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1319; X86-SSE-NEXT:    movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
1320; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
1321; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
1322; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
1323; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
1324; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1325; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
1326; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1327; X86-SSE-NEXT:    retl # encoding: [0xc3]
1328;
1329; X86-AVX1-LABEL: test_mm_loadh_pi:
1330; X86-AVX1:       # %bb.0:
1331; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1332; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
1333; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
1334; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
1335; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1336; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1337;
1338; X86-AVX512-LABEL: test_mm_loadh_pi:
1339; X86-AVX512:       # %bb.0:
1340; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1341; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
1342; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
1343; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
1344; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1345; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1346;
1347; X64-SSE-LABEL: test_mm_loadh_pi:
1348; X64-SSE:       # %bb.0:
1349; X64-SSE-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
1350; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
1351; X64-SSE-NEXT:    shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
1352; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
1353; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
1354; X64-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
1355; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
1356; X64-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
1357; X64-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
1358; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1359; X64-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
1360; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1361; X64-SSE-NEXT:    retq # encoding: [0xc3]
1362;
1363; X64-AVX1-LABEL: test_mm_loadh_pi:
1364; X64-AVX1:       # %bb.0:
1365; X64-AVX1-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x07]
1366; X64-AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
1367; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1368;
1369; X64-AVX512-LABEL: test_mm_loadh_pi:
1370; X64-AVX512:       # %bb.0:
1371; X64-AVX512-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x07]
1372; X64-AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
1373; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1374  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
1375  %ld  = load <2 x float>, <2 x float>* %ptr
1376  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1377  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1378  ret <4 x float> %res
1379}
1380
1381define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
1382; X86-SSE-LABEL: test_mm_loadl_pi:
1383; X86-SSE:       # %bb.0:
1384; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1385; X86-SSE-NEXT:    movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
1386; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
1387; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
1388; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
1389; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
1390; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1391; X86-SSE-NEXT:    shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
1392; X86-SSE-NEXT:    # xmm1 = xmm1[0,1],xmm0[2,3]
1393; X86-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
1394; X86-SSE-NEXT:    retl # encoding: [0xc3]
1395;
1396; X86-AVX1-LABEL: test_mm_loadl_pi:
1397; X86-AVX1:       # %bb.0:
1398; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1399; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
1400; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
1401; X86-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
1402; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
1403; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1404;
1405; X86-AVX512-LABEL: test_mm_loadl_pi:
1406; X86-AVX512:       # %bb.0:
1407; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1408; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
1409; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
1410; X86-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
1411; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
1412; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1413;
1414; X64-SSE-LABEL: test_mm_loadl_pi:
1415; X64-SSE:       # %bb.0:
1416; X64-SSE-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
1417; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
1418; X64-SSE-NEXT:    shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
1419; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
1420; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
1421; X64-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
1422; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
1423; X64-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
1424; X64-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
1425; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1426; X64-SSE-NEXT:    shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
1427; X64-SSE-NEXT:    # xmm1 = xmm1[0,1],xmm0[2,3]
1428; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
1429; X64-SSE-NEXT:    retq # encoding: [0xc3]
1430;
1431; X64-AVX1-LABEL: test_mm_loadl_pi:
1432; X64-AVX1:       # %bb.0:
1433; X64-AVX1-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x07]
1434; X64-AVX1-NEXT:    # xmm0 = mem[0],xmm0[1]
1435; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1436;
1437; X64-AVX512-LABEL: test_mm_loadl_pi:
1438; X64-AVX512:       # %bb.0:
1439; X64-AVX512-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x07]
1440; X64-AVX512-NEXT:    # xmm0 = mem[0],xmm0[1]
1441; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1442  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
1443  %ld  = load <2 x float>, <2 x float>* %ptr
1444  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1445  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1446  ret <4 x float> %res
1447}
1448
1449define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
1450; X86-SSE-LABEL: test_mm_loadr_ps:
1451; X86-SSE:       # %bb.0:
1452; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1453; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
1454; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
1455; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
1456; X86-SSE-NEXT:    retl # encoding: [0xc3]
1457;
1458; X86-AVX1-LABEL: test_mm_loadr_ps:
1459; X86-AVX1:       # %bb.0:
1460; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1461; X86-AVX1-NEXT:    vpermilps $27, (%eax), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
1462; X86-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
1463; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1464;
1465; X86-AVX512-LABEL: test_mm_loadr_ps:
1466; X86-AVX512:       # %bb.0:
1467; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1468; X86-AVX512-NEXT:    vpermilps $27, (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
1469; X86-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
1470; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1471;
1472; X64-SSE-LABEL: test_mm_loadr_ps:
1473; X64-SSE:       # %bb.0:
1474; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
1475; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
1476; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
1477; X64-SSE-NEXT:    retq # encoding: [0xc3]
1478;
1479; X64-AVX1-LABEL: test_mm_loadr_ps:
1480; X64-AVX1:       # %bb.0:
1481; X64-AVX1-NEXT:    vpermilps $27, (%rdi), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
1482; X64-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
1483; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1484;
1485; X64-AVX512-LABEL: test_mm_loadr_ps:
1486; X64-AVX512:       # %bb.0:
1487; X64-AVX512-NEXT:    vpermilps $27, (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
1488; X64-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
1489; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1490  %arg0 = bitcast float* %a0 to <4 x float>*
1491  %ld = load <4 x float>, <4 x float>* %arg0, align 16
1492  %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1493  ret <4 x float> %res
1494}
1495
1496define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
1497; X86-SSE-LABEL: test_mm_loadu_ps:
1498; X86-SSE:       # %bb.0:
1499; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1500; X86-SSE-NEXT:    movups (%eax), %xmm0 # encoding: [0x0f,0x10,0x00]
1501; X86-SSE-NEXT:    retl # encoding: [0xc3]
1502;
1503; X86-AVX1-LABEL: test_mm_loadu_ps:
1504; X86-AVX1:       # %bb.0:
1505; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1506; X86-AVX1-NEXT:    vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00]
1507; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1508;
1509; X86-AVX512-LABEL: test_mm_loadu_ps:
1510; X86-AVX512:       # %bb.0:
1511; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1512; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00]
1513; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1514;
1515; X64-SSE-LABEL: test_mm_loadu_ps:
1516; X64-SSE:       # %bb.0:
1517; X64-SSE-NEXT:    movups (%rdi), %xmm0 # encoding: [0x0f,0x10,0x07]
1518; X64-SSE-NEXT:    retq # encoding: [0xc3]
1519;
1520; X64-AVX1-LABEL: test_mm_loadu_ps:
1521; X64-AVX1:       # %bb.0:
1522; X64-AVX1-NEXT:    vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07]
1523; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1524;
1525; X64-AVX512-LABEL: test_mm_loadu_ps:
1526; X64-AVX512:       # %bb.0:
1527; X64-AVX512-NEXT:    vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
1528; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1529  %arg0 = bitcast float* %a0 to <4 x float>*
1530  %res = load <4 x float>, <4 x float>* %arg0, align 1
1531  ret <4 x float> %res
1532}
1533
1534define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
1535; SSE-LABEL: test_mm_max_ps:
1536; SSE:       # %bb.0:
1537; SSE-NEXT:    maxps %xmm1, %xmm0 # encoding: [0x0f,0x5f,0xc1]
1538; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1539;
1540; AVX1-LABEL: test_mm_max_ps:
1541; AVX1:       # %bb.0:
1542; AVX1-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0xc1]
1543; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1544;
1545; AVX512-LABEL: test_mm_max_ps:
1546; AVX512:       # %bb.0:
1547; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
1548; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1549  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
1550  ret <4 x float> %res
1551}
1552declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
1553
1554define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
1555; SSE-LABEL: test_mm_max_ss:
1556; SSE:       # %bb.0:
1557; SSE-NEXT:    maxss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5f,0xc1]
1558; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1559;
1560; AVX1-LABEL: test_mm_max_ss:
1561; AVX1:       # %bb.0:
1562; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5f,0xc1]
1563; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1564;
1565; AVX512-LABEL: test_mm_max_ss:
1566; AVX512:       # %bb.0:
1567; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1]
1568; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1569  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
1570  ret <4 x float> %res
1571}
1572declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
1573
1574define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
1575; SSE-LABEL: test_mm_min_ps:
1576; SSE:       # %bb.0:
1577; SSE-NEXT:    minps %xmm1, %xmm0 # encoding: [0x0f,0x5d,0xc1]
1578; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1579;
1580; AVX1-LABEL: test_mm_min_ps:
1581; AVX1:       # %bb.0:
1582; AVX1-NEXT:    vminps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0xc1]
1583; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1584;
1585; AVX512-LABEL: test_mm_min_ps:
1586; AVX512:       # %bb.0:
1587; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
1588; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1589  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
1590  ret <4 x float> %res
1591}
1592declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
1593
1594define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
1595; SSE-LABEL: test_mm_min_ss:
1596; SSE:       # %bb.0:
1597; SSE-NEXT:    minss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5d,0xc1]
1598; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1599;
1600; AVX1-LABEL: test_mm_min_ss:
1601; AVX1:       # %bb.0:
1602; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5d,0xc1]
1603; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1604;
1605; AVX512-LABEL: test_mm_min_ss:
1606; AVX512:       # %bb.0:
1607; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1]
1608; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1609  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
1610  ret <4 x float> %res
1611}
1612declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
1613
1614define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
1615; SSE-LABEL: test_mm_move_ss:
1616; SSE:       # %bb.0:
1617; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
1618; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
1619; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1620;
1621; AVX-LABEL: test_mm_move_ss:
1622; AVX:       # %bb.0:
1623; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
1624; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
1625; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1626  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1627  ret <4 x float> %res
1628}
1629
1630define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
1631; SSE-LABEL: test_mm_movehl_ps:
1632; SSE:       # %bb.0:
1633; SSE-NEXT:    movhlps %xmm1, %xmm0 # encoding: [0x0f,0x12,0xc1]
1634; SSE-NEXT:    # xmm0 = xmm1[1],xmm0[1]
1635; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1636;
1637; AVX1-LABEL: test_mm_movehl_ps:
1638; AVX1:       # %bb.0:
1639; AVX1-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x15,0xc0]
1640; AVX1-NEXT:    # xmm0 = xmm1[1],xmm0[1]
1641; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1642;
1643; AVX512-LABEL: test_mm_movehl_ps:
1644; AVX512:       # %bb.0:
1645; AVX512-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x15,0xc0]
1646; AVX512-NEXT:    # xmm0 = xmm1[1],xmm0[1]
1647; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1648  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
1649  ret <4 x float> %res
1650}
1651
1652define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
1653; SSE-LABEL: test_mm_movelh_ps:
1654; SSE:       # %bb.0:
1655; SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
1656; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1657; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1658;
1659; AVX1-LABEL: test_mm_movelh_ps:
1660; AVX1:       # %bb.0:
1661; AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
1662; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1663; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1664;
1665; AVX512-LABEL: test_mm_movelh_ps:
1666; AVX512:       # %bb.0:
1667; AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
1668; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1669; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1670  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1671  ret <4 x float> %res
1672}
1673
1674define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
1675; SSE-LABEL: test_mm_movemask_ps:
1676; SSE:       # %bb.0:
1677; SSE-NEXT:    movmskps %xmm0, %eax # encoding: [0x0f,0x50,0xc0]
1678; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1679;
1680; AVX-LABEL: test_mm_movemask_ps:
1681; AVX:       # %bb.0:
1682; AVX-NEXT:    vmovmskps %xmm0, %eax # encoding: [0xc5,0xf8,0x50,0xc0]
1683; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1684  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
1685  ret i32 %res
1686}
1687declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
1688
1689define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
1690; SSE-LABEL: test_mm_mul_ps:
1691; SSE:       # %bb.0:
1692; SSE-NEXT:    mulps %xmm1, %xmm0 # encoding: [0x0f,0x59,0xc1]
1693; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1694;
1695; AVX1-LABEL: test_mm_mul_ps:
1696; AVX1:       # %bb.0:
1697; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x59,0xc1]
1698; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1699;
1700; AVX512-LABEL: test_mm_mul_ps:
1701; AVX512:       # %bb.0:
1702; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x59,0xc1]
1703; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1704  %res = fmul <4 x float> %a0, %a1
1705  ret <4 x float> %res
1706}
1707
1708define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
1709; SSE-LABEL: test_mm_mul_ss:
1710; SSE:       # %bb.0:
1711; SSE-NEXT:    mulss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x59,0xc1]
1712; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1713;
1714; AVX1-LABEL: test_mm_mul_ss:
1715; AVX1:       # %bb.0:
1716; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x59,0xc1]
1717; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1718;
1719; AVX512-LABEL: test_mm_mul_ss:
1720; AVX512:       # %bb.0:
1721; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x59,0xc1]
1722; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1723  %ext0 = extractelement <4 x float> %a0, i32 0
1724  %ext1 = extractelement <4 x float> %a1, i32 0
1725  %fmul = fmul float %ext0, %ext1
1726  %res = insertelement <4 x float> %a0, float %fmul, i32 0
1727  ret <4 x float> %res
1728}
1729
1730define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
1731; SSE-LABEL: test_mm_or_ps:
1732; SSE:       # %bb.0:
1733; SSE-NEXT:    orps %xmm1, %xmm0 # encoding: [0x0f,0x56,0xc1]
1734; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1735;
1736; AVX1-LABEL: test_mm_or_ps:
1737; AVX1:       # %bb.0:
1738; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0xc1]
1739; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1740;
1741; AVX512-LABEL: test_mm_or_ps:
1742; AVX512:       # %bb.0:
1743; AVX512-NEXT:    vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1]
1744; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1745  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
1746  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
1747  %res = or <4 x i32> %arg0, %arg1
1748  %bc = bitcast <4 x i32> %res to <4 x float>
1749  ret <4 x float> %bc
1750}
1751
1752define void @test_mm_prefetch(i8* %a0) {
1753; X86-LABEL: test_mm_prefetch:
1754; X86:       # %bb.0:
1755; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1756; X86-NEXT:    prefetchnta (%eax) # encoding: [0x0f,0x18,0x00]
1757; X86-NEXT:    retl # encoding: [0xc3]
1758;
1759; X64-LABEL: test_mm_prefetch:
1760; X64:       # %bb.0:
1761; X64-NEXT:    prefetchnta (%rdi) # encoding: [0x0f,0x18,0x07]
1762; X64-NEXT:    retq # encoding: [0xc3]
1763  call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
1764  ret void
1765}
1766declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone
1767
1768define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
1769; SSE-LABEL: test_mm_rcp_ps:
1770; SSE:       # %bb.0:
1771; SSE-NEXT:    rcpps %xmm0, %xmm0 # encoding: [0x0f,0x53,0xc0]
1772; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1773;
1774; AVX-LABEL: test_mm_rcp_ps:
1775; AVX:       # %bb.0:
1776; AVX-NEXT:    vrcpps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x53,0xc0]
1777; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1778  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
1779  ret <4 x float> %res
1780}
1781declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
1782
1783define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
1784; SSE-LABEL: test_mm_rcp_ss:
1785; SSE:       # %bb.0:
1786; SSE-NEXT:    rcpss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x53,0xc0]
1787; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1788;
1789; AVX-LABEL: test_mm_rcp_ss:
1790; AVX:       # %bb.0:
1791; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x53,0xc0]
1792; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1793  %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
1794  ret <4 x float> %rcp
1795}
1796declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
1797
1798define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
1799; SSE-LABEL: test_mm_rsqrt_ps:
1800; SSE:       # %bb.0:
1801; SSE-NEXT:    rsqrtps %xmm0, %xmm0 # encoding: [0x0f,0x52,0xc0]
1802; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1803;
1804; AVX-LABEL: test_mm_rsqrt_ps:
1805; AVX:       # %bb.0:
1806; AVX-NEXT:    vrsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x52,0xc0]
1807; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1808  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
1809  ret <4 x float> %res
1810}
1811declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
1812
1813define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
1814; SSE-LABEL: test_mm_rsqrt_ss:
1815; SSE:       # %bb.0:
1816; SSE-NEXT:    rsqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x52,0xc0]
1817; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1818;
1819; AVX-LABEL: test_mm_rsqrt_ss:
1820; AVX:       # %bb.0:
1821; AVX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x52,0xc0]
1822; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1823  %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
1824  ret <4 x float> %rsqrt
1825}
1826declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
1827
1828define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
1829; X86-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
1830; X86-SSE:       # %bb.0:
1831; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1832; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1833; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1834; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
1835; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1836; X86-SSE-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
1837; X86-SSE-NEXT:    # imm = 0xE07F
1838; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1839; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1840; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
1841; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
1842; X86-SSE-NEXT:    retl # encoding: [0xc3]
1843;
1844; X86-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
1845; X86-AVX:       # %bb.0:
1846; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1847; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1848; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1849; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
1850; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1851; X86-AVX-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
1852; X86-AVX-NEXT:    # imm = 0xE07F
1853; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1854; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1855; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
1856; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
1857; X86-AVX-NEXT:    retl # encoding: [0xc3]
1858;
1859; X64-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
1860; X64-SSE:       # %bb.0:
1861; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1862; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1863; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1864; X64-SSE-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
1865; X64-SSE-NEXT:    # imm = 0xE07F
1866; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1867; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1868; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
1869; X64-SSE-NEXT:    retq # encoding: [0xc3]
1870;
1871; X64-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
1872; X64-AVX:       # %bb.0:
1873; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1874; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1875; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1876; X64-AVX-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
1877; X64-AVX-NEXT:    # imm = 0xE07F
1878; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1879; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1880; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
1881; X64-AVX-NEXT:    retq # encoding: [0xc3]
1882  %1 = alloca i32, align 4
1883  %2 = bitcast i32* %1 to i8*
1884  call void @llvm.x86.sse.stmxcsr(i8* %2)
1885  %3 = load i32, i32* %1
1886  %4 = and i32 %3, -8065
1887  %5 = or i32 %4, %a0
1888  store i32 %5, i32* %1
1889  call void @llvm.x86.sse.ldmxcsr(i8* %2)
1890  ret void
1891}
1892declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
1893
1894define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
1895; X86-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
1896; X86-SSE:       # %bb.0:
1897; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1898; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1899; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1900; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
1901; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1902; X86-SSE-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
1903; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1904; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1905; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
1906; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
1907; X86-SSE-NEXT:    retl # encoding: [0xc3]
1908;
1909; X86-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
1910; X86-AVX:       # %bb.0:
1911; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1912; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1913; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1914; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
1915; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1916; X86-AVX-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
1917; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1918; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1919; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
1920; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
1921; X86-AVX-NEXT:    retl # encoding: [0xc3]
1922;
1923; X64-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
1924; X64-SSE:       # %bb.0:
1925; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1926; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1927; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1928; X64-SSE-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
1929; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1930; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1931; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
1932; X64-SSE-NEXT:    retq # encoding: [0xc3]
1933;
1934; X64-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
1935; X64-AVX:       # %bb.0:
1936; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1937; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1938; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1939; X64-AVX-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
1940; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1941; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1942; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
1943; X64-AVX-NEXT:    retq # encoding: [0xc3]
1944  %1 = alloca i32, align 4
1945  %2 = bitcast i32* %1 to i8*
1946  call void @llvm.x86.sse.stmxcsr(i8* %2)
1947  %3 = load i32, i32* %1
1948  %4 = and i32 %3, -64
1949  %5 = or i32 %4, %a0
1950  store i32 %5, i32* %1
1951  call void @llvm.x86.sse.ldmxcsr(i8* %2)
1952  ret void
1953}
1954
1955define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
1956; X86-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
1957; X86-SSE:       # %bb.0:
1958; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1959; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1960; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1961; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
1962; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1963; X86-SSE-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
1964; X86-SSE-NEXT:    # imm = 0xFFFF7FFF
1965; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1966; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1967; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
1968; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
1969; X86-SSE-NEXT:    retl # encoding: [0xc3]
1970;
1971; X86-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
1972; X86-AVX:       # %bb.0:
1973; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1974; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1975; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1976; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
1977; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1978; X86-AVX-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
1979; X86-AVX-NEXT:    # imm = 0xFFFF7FFF
1980; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1981; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1982; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
1983; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
1984; X86-AVX-NEXT:    retl # encoding: [0xc3]
1985;
1986; X64-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
1987; X64-SSE:       # %bb.0:
1988; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1989; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1990; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1991; X64-SSE-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
1992; X64-SSE-NEXT:    # imm = 0xFFFF7FFF
1993; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1994; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1995; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
1996; X64-SSE-NEXT:    retq # encoding: [0xc3]
1997;
1998; X64-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
1999; X64-AVX:       # %bb.0:
2000; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2001; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
2002; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
2003; X64-AVX-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
2004; X64-AVX-NEXT:    # imm = 0xFFFF7FFF
2005; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
2006; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
2007; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
2008; X64-AVX-NEXT:    retq # encoding: [0xc3]
2009  %1 = alloca i32, align 4
2010  %2 = bitcast i32* %1 to i8*
2011  call void @llvm.x86.sse.stmxcsr(i8* %2)
2012  %3 = load i32, i32* %1
2013  %4 = and i32 %3, -32769
2014  %5 = or i32 %4, %a0
2015  store i32 %5, i32* %1
2016  call void @llvm.x86.sse.ldmxcsr(i8* %2)
2017  ret void
2018}
2019
2020define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
2021; X86-SSE-LABEL: test_mm_set_ps:
2022; X86-SSE:       # %bb.0:
2023; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
2024; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2025; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
2026; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
2027; X86-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
2028; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2029; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08]
2030; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
2031; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04]
2032; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
2033; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
2034; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2035; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
2036; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
2037; X86-SSE-NEXT:    retl # encoding: [0xc3]
2038;
2039; X86-AVX1-LABEL: test_mm_set_ps:
2040; X86-AVX1:       # %bb.0:
2041; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
2042; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2043; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
2044; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
2045; X86-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
2046; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2047; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
2048; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
2049; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
2050; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2051; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
2052; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
2053; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
2054; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
2055; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2056;
2057; X86-AVX512-LABEL: test_mm_set_ps:
2058; X86-AVX512:       # %bb.0:
2059; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
2060; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2061; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
2062; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
2063; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
2064; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
2065; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
2066; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
2067; X86-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
2068; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2069; X86-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
2070; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
2071; X86-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
2072; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
2073; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2074;
2075; X64-SSE-LABEL: test_mm_set_ps:
2076; X64-SSE:       # %bb.0:
2077; X64-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
2078; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2079; X64-SSE-NEXT:    unpcklps %xmm2, %xmm3 # encoding: [0x0f,0x14,0xda]
2080; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2081; X64-SSE-NEXT:    movlhps %xmm1, %xmm3 # encoding: [0x0f,0x16,0xd9]
2082; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm1[0]
2083; X64-SSE-NEXT:    movaps %xmm3, %xmm0 # encoding: [0x0f,0x28,0xc3]
2084; X64-SSE-NEXT:    retq # encoding: [0xc3]
2085;
2086; X64-AVX1-LABEL: test_mm_set_ps:
2087; X64-AVX1:       # %bb.0:
2088; X64-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
2089; X64-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2090; X64-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
2091; X64-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2092; X64-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
2093; X64-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
2094; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2095;
2096; X64-AVX512-LABEL: test_mm_set_ps:
2097; X64-AVX512:       # %bb.0:
2098; X64-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
2099; X64-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2100; X64-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
2101; X64-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2102; X64-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
2103; X64-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
2104; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2105  %res0  = insertelement <4 x float> undef, float %a3, i32 0
2106  %res1  = insertelement <4 x float> %res0, float %a2, i32 1
2107  %res2  = insertelement <4 x float> %res1, float %a1, i32 2
2108  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
2109  ret <4 x float> %res3
2110}
2111
2112define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
2113; X86-SSE-LABEL: test_mm_set_ps1:
2114; X86-SSE:       # %bb.0:
2115; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
2116; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2117; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2118; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2119; X86-SSE-NEXT:    retl # encoding: [0xc3]
2120;
2121; X86-AVX1-LABEL: test_mm_set_ps1:
2122; X86-AVX1:       # %bb.0:
2123; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2124; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2125; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2126; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2127; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2128;
2129; X86-AVX512-LABEL: test_mm_set_ps1:
2130; X86-AVX512:       # %bb.0:
2131; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2132; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2133; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2134; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2135;
2136; X64-SSE-LABEL: test_mm_set_ps1:
2137; X64-SSE:       # %bb.0:
2138; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2139; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2140; X64-SSE-NEXT:    retq # encoding: [0xc3]
2141;
2142; X64-AVX1-LABEL: test_mm_set_ps1:
2143; X64-AVX1:       # %bb.0:
2144; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2145; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2146; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2147;
2148; X64-AVX512-LABEL: test_mm_set_ps1:
2149; X64-AVX512:       # %bb.0:
2150; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2151; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2152  %res0  = insertelement <4 x float> undef, float %a0, i32 0
2153  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
2154  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
2155  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
2156  ret <4 x float> %res3
2157}
2158
2159define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
2160; X86-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
2161; X86-SSE:       # %bb.0:
2162; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
2163; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
2164; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
2165; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
2166; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
2167; X86-SSE-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
2168; X86-SSE-NEXT:    # imm = 0x9FFF
2169; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
2170; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
2171; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
2172; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
2173; X86-SSE-NEXT:    retl # encoding: [0xc3]
2174;
2175; X86-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
2176; X86-AVX:       # %bb.0:
2177; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
2178; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
2179; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
2180; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
2181; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
2182; X86-AVX-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
2183; X86-AVX-NEXT:    # imm = 0x9FFF
2184; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
2185; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
2186; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
2187; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
2188; X86-AVX-NEXT:    retl # encoding: [0xc3]
2189;
2190; X64-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
2191; X64-SSE:       # %bb.0:
2192; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2193; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
2194; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
2195; X64-SSE-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
2196; X64-SSE-NEXT:    # imm = 0x9FFF
2197; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
2198; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
2199; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
2200; X64-SSE-NEXT:    retq # encoding: [0xc3]
2201;
2202; X64-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
2203; X64-AVX:       # %bb.0:
2204; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2205; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
2206; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
2207; X64-AVX-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
2208; X64-AVX-NEXT:    # imm = 0x9FFF
2209; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
2210; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
2211; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
2212; X64-AVX-NEXT:    retq # encoding: [0xc3]
2213  %1 = alloca i32, align 4
2214  %2 = bitcast i32* %1 to i8*
2215  call void @llvm.x86.sse.stmxcsr(i8* %2)
2216  %3 = load i32, i32* %1
2217  %4 = and i32 %3, -24577
2218  %5 = or i32 %4, %a0
2219  store i32 %5, i32* %1
2220  call void @llvm.x86.sse.ldmxcsr(i8* %2)
2221  ret void
2222}
2223
2224define <4 x float> @test_mm_set_ss(float %a0) nounwind {
2225; X86-SSE-LABEL: test_mm_set_ss:
2226; X86-SSE:       # %bb.0:
2227; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
2228; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
2229; X86-SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
2230; X86-SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
2231; X86-SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
2232; X86-SSE-NEXT:    retl # encoding: [0xc3]
2233;
2234; X86-AVX1-LABEL: test_mm_set_ss:
2235; X86-AVX1:       # %bb.0:
2236; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2237; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2238; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
2239; X86-AVX1-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
2240; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
2241; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2242;
2243; X86-AVX512-LABEL: test_mm_set_ss:
2244; X86-AVX512:       # %bb.0:
2245; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2246; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2247; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
2248; X86-AVX512-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
2249; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
2250; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2251;
2252; X64-SSE-LABEL: test_mm_set_ss:
2253; X64-SSE:       # %bb.0:
2254; X64-SSE-NEXT:    xorps %xmm1, %xmm1 # encoding: [0x0f,0x57,0xc9]
2255; X64-SSE-NEXT:    movss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0x10,0xc8]
2256; X64-SSE-NEXT:    # xmm1 = xmm0[0],xmm1[1,2,3]
2257; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
2258; X64-SSE-NEXT:    retq # encoding: [0xc3]
2259;
2260; X64-AVX-LABEL: test_mm_set_ss:
2261; X64-AVX:       # %bb.0:
2262; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
2263; X64-AVX-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
2264; X64-AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
2265; X64-AVX-NEXT:    retq # encoding: [0xc3]
2266  %res0  = insertelement <4 x float> undef, float %a0, i32 0
2267  %res1  = insertelement <4 x float> %res0, float 0.0, i32 1
2268  %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
2269  %res3  = insertelement <4 x float> %res2, float 0.0, i32 3
2270  ret <4 x float> %res3
2271}
2272
2273define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
2274; X86-SSE-LABEL: test_mm_set1_ps:
2275; X86-SSE:       # %bb.0:
2276; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
2277; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2278; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2279; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2280; X86-SSE-NEXT:    retl # encoding: [0xc3]
2281;
2282; X86-AVX1-LABEL: test_mm_set1_ps:
2283; X86-AVX1:       # %bb.0:
2284; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2285; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2286; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2287; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2288; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2289;
2290; X86-AVX512-LABEL: test_mm_set1_ps:
2291; X86-AVX512:       # %bb.0:
2292; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2293; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2294; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2295; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2296;
2297; X64-SSE-LABEL: test_mm_set1_ps:
2298; X64-SSE:       # %bb.0:
2299; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2300; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2301; X64-SSE-NEXT:    retq # encoding: [0xc3]
2302;
2303; X64-AVX1-LABEL: test_mm_set1_ps:
2304; X64-AVX1:       # %bb.0:
2305; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2306; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2307; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2308;
2309; X64-AVX512-LABEL: test_mm_set1_ps:
2310; X64-AVX512:       # %bb.0:
2311; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2312; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2313  %res0  = insertelement <4 x float> undef, float %a0, i32 0
2314  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
2315  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
2316  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
2317  ret <4 x float> %res3
2318}
2319
2320define void @test_mm_setcsr(i32 %a0) nounwind {
2321; X86-SSE-LABEL: test_mm_setcsr:
2322; X86-SSE:       # %bb.0:
2323; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
2324; X86-SSE-NEXT:    ldmxcsr (%eax) # encoding: [0x0f,0xae,0x10]
2325; X86-SSE-NEXT:    retl # encoding: [0xc3]
2326;
2327; X86-AVX-LABEL: test_mm_setcsr:
2328; X86-AVX:       # %bb.0:
2329; X86-AVX-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
2330; X86-AVX-NEXT:    vldmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x10]
2331; X86-AVX-NEXT:    retl # encoding: [0xc3]
2332;
2333; X64-SSE-LABEL: test_mm_setcsr:
2334; X64-SSE:       # %bb.0:
2335; X64-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
2336; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2337; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
2338; X64-SSE-NEXT:    retq # encoding: [0xc3]
2339;
2340; X64-AVX-LABEL: test_mm_setcsr:
2341; X64-AVX:       # %bb.0:
2342; X64-AVX-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
2343; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2344; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
2345; X64-AVX-NEXT:    retq # encoding: [0xc3]
2346  %st = alloca i32, align 4
2347  store i32 %a0, i32* %st, align 4
2348  %bc = bitcast i32* %st to i8*
2349  call void @llvm.x86.sse.ldmxcsr(i8* %bc)
2350  ret void
2351}
2352
2353define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
2354; X86-SSE-LABEL: test_mm_setr_ps:
2355; X86-SSE:       # %bb.0:
2356; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
2357; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2358; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
2359; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
2360; X86-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
2361; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2362; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
2363; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
2364; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
2365; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2366; X86-SSE-NEXT:    unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
2367; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2368; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
2369; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
2370; X86-SSE-NEXT:    retl # encoding: [0xc3]
2371;
2372; X86-AVX1-LABEL: test_mm_setr_ps:
2373; X86-AVX1:       # %bb.0:
2374; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
2375; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2376; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
2377; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
2378; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
2379; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
2380; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
2381; X86-AVX1-NEXT:    # xmm3 = mem[0],zero,zero,zero
2382; X86-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
2383; X86-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2384; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
2385; X86-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2386; X86-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
2387; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
2388; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2389;
2390; X86-AVX512-LABEL: test_mm_setr_ps:
2391; X86-AVX512:       # %bb.0:
2392; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
2393; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2394; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
2395; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
2396; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
2397; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
2398; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
2399; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
2400; X86-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
2401; X86-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2402; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
2403; X86-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2404; X86-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
2405; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
2406; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2407;
2408; X64-SSE-LABEL: test_mm_setr_ps:
2409; X64-SSE:       # %bb.0:
2410; X64-SSE-NEXT:    unpcklps %xmm3, %xmm2 # encoding: [0x0f,0x14,0xd3]
2411; X64-SSE-NEXT:    # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2412; X64-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
2413; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2414; X64-SSE-NEXT:    movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2]
2415; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0]
2416; X64-SSE-NEXT:    retq # encoding: [0xc3]
2417;
2418; X64-AVX1-LABEL: test_mm_setr_ps:
2419; X64-AVX1:       # %bb.0:
2420; X64-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
2421; X64-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2422; X64-AVX1-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
2423; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
2424; X64-AVX1-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
2425; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
2426; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2427;
2428; X64-AVX512-LABEL: test_mm_setr_ps:
2429; X64-AVX512:       # %bb.0:
2430; X64-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
2431; X64-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2432; X64-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
2433; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
2434; X64-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
2435; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
2436; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2437  %res0  = insertelement <4 x float> undef, float %a0, i32 0
2438  %res1  = insertelement <4 x float> %res0, float %a1, i32 1
2439  %res2  = insertelement <4 x float> %res1, float %a2, i32 2
2440  %res3  = insertelement <4 x float> %res2, float %a3, i32 3
2441  ret <4 x float> %res3
2442}
2443
2444define <4 x float> @test_mm_setzero_ps() {
2445; SSE-LABEL: test_mm_setzero_ps:
2446; SSE:       # %bb.0:
2447; SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
2448; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2449;
2450; AVX1-LABEL: test_mm_setzero_ps:
2451; AVX1:       # %bb.0:
2452; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
2453; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2454;
2455; AVX512-LABEL: test_mm_setzero_ps:
2456; AVX512:       # %bb.0:
2457; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0]
2458; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2459  ret <4 x float> zeroinitializer
2460}
2461
2462define void @test_mm_sfence() nounwind {
2463; CHECK-LABEL: test_mm_sfence:
2464; CHECK:       # %bb.0:
2465; CHECK-NEXT:    sfence # encoding: [0x0f,0xae,0xf8]
2466; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2467  call void @llvm.x86.sse.sfence()
2468  ret void
2469}
2470declare void @llvm.x86.sse.sfence() nounwind readnone
2471
2472define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
2473; SSE-LABEL: test_mm_shuffle_ps:
2474; SSE:       # %bb.0:
2475; SSE-NEXT:    shufps $0, %xmm1, %xmm0 # encoding: [0x0f,0xc6,0xc1,0x00]
2476; SSE-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
2477; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2478;
2479; AVX1-LABEL: test_mm_shuffle_ps:
2480; AVX1:       # %bb.0:
2481; AVX1-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
2482; AVX1-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
2483; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2484;
2485; AVX512-LABEL: test_mm_shuffle_ps:
2486; AVX512:       # %bb.0:
2487; AVX512-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
2488; AVX512-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
2489; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2490  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
2491  ret <4 x float> %res
2492}
2493
2494define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
2495; SSE-LABEL: test_mm_sqrt_ps:
2496; SSE:       # %bb.0:
2497; SSE-NEXT:    sqrtps %xmm0, %xmm0 # encoding: [0x0f,0x51,0xc0]
2498; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2499;
2500; AVX1-LABEL: test_mm_sqrt_ps:
2501; AVX1:       # %bb.0:
2502; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x51,0xc0]
2503; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2504;
2505; AVX512-LABEL: test_mm_sqrt_ps:
2506; AVX512:       # %bb.0:
2507; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
2508; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2509  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
2510  ret <4 x float> %res
2511}
2512declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
2513
2514define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
2515; SSE-LABEL: test_mm_sqrt_ss:
2516; SSE:       # %bb.0:
2517; SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
2518; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2519;
2520; AVX1-LABEL: test_mm_sqrt_ss:
2521; AVX1:       # %bb.0:
2522; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
2523; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2524;
2525; AVX512-LABEL: test_mm_sqrt_ss:
2526; AVX512:       # %bb.0:
2527; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
2528; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2529  %ext = extractelement <4 x float> %a0, i32 0
2530  %sqrt = call float @llvm.sqrt.f32(float %ext)
2531  %ins = insertelement <4 x float> %a0, float %sqrt, i32 0
2532  ret <4 x float> %ins
2533}
2534declare float @llvm.sqrt.f32(float) nounwind readnone
2535
2536define float @test_mm_sqrt_ss_scalar(float %a0) {
2537; X86-SSE-LABEL: test_mm_sqrt_ss_scalar:
2538; X86-SSE:       # %bb.0:
2539; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
2540; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
2541; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08]
2542; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2543; X86-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
2544; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
2545; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
2546; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
2547; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
2548; X86-SSE-NEXT:    retl # encoding: [0xc3]
2549;
2550; X86-AVX1-LABEL: test_mm_sqrt_ss_scalar:
2551; X86-AVX1:       # %bb.0:
2552; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
2553; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
2554; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
2555; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2556; X86-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
2557; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
2558; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
2559; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
2560; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
2561; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2562;
2563; X86-AVX512-LABEL: test_mm_sqrt_ss_scalar:
2564; X86-AVX512:       # %bb.0:
2565; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
2566; X86-AVX512-NEXT:    .cfi_def_cfa_offset 8
2567; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
2568; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2569; X86-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
2570; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
2571; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
2572; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
2573; X86-AVX512-NEXT:    .cfi_def_cfa_offset 4
2574; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2575;
2576; X64-SSE-LABEL: test_mm_sqrt_ss_scalar:
2577; X64-SSE:       # %bb.0:
2578; X64-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
2579; X64-SSE-NEXT:    retq # encoding: [0xc3]
2580;
2581; X64-AVX1-LABEL: test_mm_sqrt_ss_scalar:
2582; X64-AVX1:       # %bb.0:
2583; X64-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
2584; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2585;
2586; X64-AVX512-LABEL: test_mm_sqrt_ss_scalar:
2587; X64-AVX512:       # %bb.0:
2588; X64-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
2589; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2590  %sqrt = call float @llvm.sqrt.f32(float %a0)
2591  ret float %sqrt
2592}
2593
2594define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
2595; X86-SSE-LABEL: test_mm_store_ps:
2596; X86-SSE:       # %bb.0:
2597; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2598; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
2599; X86-SSE-NEXT:    retl # encoding: [0xc3]
2600;
2601; X86-AVX1-LABEL: test_mm_store_ps:
2602; X86-AVX1:       # %bb.0:
2603; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2604; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
2605; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2606;
2607; X86-AVX512-LABEL: test_mm_store_ps:
2608; X86-AVX512:       # %bb.0:
2609; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2610; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
2611; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2612;
2613; X64-SSE-LABEL: test_mm_store_ps:
2614; X64-SSE:       # %bb.0:
2615; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
2616; X64-SSE-NEXT:    retq # encoding: [0xc3]
2617;
2618; X64-AVX1-LABEL: test_mm_store_ps:
2619; X64-AVX1:       # %bb.0:
2620; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
2621; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2622;
2623; X64-AVX512-LABEL: test_mm_store_ps:
2624; X64-AVX512:       # %bb.0:
2625; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
2626; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2627  %arg0 = bitcast float* %a0 to <4 x float>*
2628  store <4 x float> %a1, <4 x float>* %arg0, align 16
2629  ret void
2630}
2631
2632define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
2633; X86-SSE-LABEL: test_mm_store_ps1:
2634; X86-SSE:       # %bb.0:
2635; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2636; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2637; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2638; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
2639; X86-SSE-NEXT:    retl # encoding: [0xc3]
2640;
2641; X86-AVX1-LABEL: test_mm_store_ps1:
2642; X86-AVX1:       # %bb.0:
2643; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2644; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2645; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2646; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
2647; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2648;
2649; X86-AVX512-LABEL: test_mm_store_ps1:
2650; X86-AVX512:       # %bb.0:
2651; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2652; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2653; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
2654; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2655;
2656; X64-SSE-LABEL: test_mm_store_ps1:
2657; X64-SSE:       # %bb.0:
2658; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2659; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2660; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
2661; X64-SSE-NEXT:    retq # encoding: [0xc3]
2662;
2663; X64-AVX1-LABEL: test_mm_store_ps1:
2664; X64-AVX1:       # %bb.0:
2665; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2666; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2667; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
2668; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2669;
2670; X64-AVX512-LABEL: test_mm_store_ps1:
2671; X64-AVX512:       # %bb.0:
2672; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2673; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
2674; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2675  %arg0 = bitcast float* %a0 to <4 x float>*
2676  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
2677  store <4 x float> %shuf, <4 x float>* %arg0, align 16
2678  ret void
2679}
2680
2681define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
2682; X86-SSE-LABEL: test_mm_store_ss:
2683; X86-SSE:       # %bb.0:
2684; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2685; X86-SSE-NEXT:    movss %xmm0, (%eax) # encoding: [0xf3,0x0f,0x11,0x00]
2686; X86-SSE-NEXT:    retl # encoding: [0xc3]
2687;
2688; X86-AVX1-LABEL: test_mm_store_ss:
2689; X86-AVX1:       # %bb.0:
2690; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2691; X86-AVX1-NEXT:    vmovss %xmm0, (%eax) # encoding: [0xc5,0xfa,0x11,0x00]
2692; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2693;
2694; X86-AVX512-LABEL: test_mm_store_ss:
2695; X86-AVX512:       # %bb.0:
2696; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2697; X86-AVX512-NEXT:    vmovss %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x00]
2698; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2699;
2700; X64-SSE-LABEL: test_mm_store_ss:
2701; X64-SSE:       # %bb.0:
2702; X64-SSE-NEXT:    movss %xmm0, (%rdi) # encoding: [0xf3,0x0f,0x11,0x07]
2703; X64-SSE-NEXT:    retq # encoding: [0xc3]
2704;
2705; X64-AVX1-LABEL: test_mm_store_ss:
2706; X64-AVX1:       # %bb.0:
2707; X64-AVX1-NEXT:    vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
2708; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2709;
2710; X64-AVX512-LABEL: test_mm_store_ss:
2711; X64-AVX512:       # %bb.0:
2712; X64-AVX512-NEXT:    vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
2713; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2714  %ext = extractelement <4 x float> %a1, i32 0
2715  store float %ext, float* %a0, align 1
2716  ret void
2717}
2718
2719define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
2720; X86-SSE-LABEL: test_mm_store1_ps:
2721; X86-SSE:       # %bb.0:
2722; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2723; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2724; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2725; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
2726; X86-SSE-NEXT:    retl # encoding: [0xc3]
2727;
2728; X86-AVX1-LABEL: test_mm_store1_ps:
2729; X86-AVX1:       # %bb.0:
2730; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2731; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2732; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2733; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
2734; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2735;
2736; X86-AVX512-LABEL: test_mm_store1_ps:
2737; X86-AVX512:       # %bb.0:
2738; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2739; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2740; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
2741; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2742;
2743; X64-SSE-LABEL: test_mm_store1_ps:
2744; X64-SSE:       # %bb.0:
2745; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2746; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2747; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
2748; X64-SSE-NEXT:    retq # encoding: [0xc3]
2749;
2750; X64-AVX1-LABEL: test_mm_store1_ps:
2751; X64-AVX1:       # %bb.0:
2752; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2753; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2754; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
2755; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2756;
2757; X64-AVX512-LABEL: test_mm_store1_ps:
2758; X64-AVX512:       # %bb.0:
2759; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2760; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
2761; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2762  %arg0 = bitcast float* %a0 to <4 x float>*
2763  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
2764  store <4 x float> %shuf, <4 x float>* %arg0, align 16
2765  ret void
2766}
2767
2768define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
2769; X86-SSE-LABEL: test_mm_storeh_ps:
2770; X86-SSE:       # %bb.0:
2771; X86-SSE-NEXT:    pushl %ebp # encoding: [0x55]
2772; X86-SSE-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
2773; X86-SSE-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
2774; X86-SSE-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
2775; X86-SSE-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
2776; X86-SSE-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
2777; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
2778; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
2779; X86-SSE-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
2780; X86-SSE-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
2781; X86-SSE-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
2782; X86-SSE-NEXT:    popl %ebp # encoding: [0x5d]
2783; X86-SSE-NEXT:    retl # encoding: [0xc3]
2784;
2785; X86-AVX1-LABEL: test_mm_storeh_ps:
2786; X86-AVX1:       # %bb.0:
2787; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2788; X86-AVX1-NEXT:    vmovhpd %xmm0, (%eax) # encoding: [0xc5,0xf9,0x17,0x00]
2789; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2790;
2791; X86-AVX512-LABEL: test_mm_storeh_ps:
2792; X86-AVX512:       # %bb.0:
2793; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2794; X86-AVX512-NEXT:    vmovhpd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x17,0x00]
2795; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2796;
2797; X64-SSE-LABEL: test_mm_storeh_ps:
2798; X64-SSE:       # %bb.0:
2799; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
2800; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xf0]
2801; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2802; X64-SSE-NEXT:    retq # encoding: [0xc3]
2803;
2804; X64-AVX1-LABEL: test_mm_storeh_ps:
2805; X64-AVX1:       # %bb.0:
2806; X64-AVX1-NEXT:    vpextrq $1, %xmm0, %rax # encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
2807; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2808; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2809;
2810; X64-AVX512-LABEL: test_mm_storeh_ps:
2811; X64-AVX512:       # %bb.0:
2812; X64-AVX512-NEXT:    vpextrq $1, %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
2813; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2814; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2815  %ptr = bitcast x86_mmx* %a0 to i64*
2816  %bc  = bitcast <4 x float> %a1 to <2 x i64>
2817  %ext = extractelement <2 x i64> %bc, i32 1
2818  store i64 %ext, i64* %ptr
2819  ret void
2820}
2821
2822define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
2823; X86-SSE-LABEL: test_mm_storel_ps:
2824; X86-SSE:       # %bb.0:
2825; X86-SSE-NEXT:    pushl %ebp # encoding: [0x55]
2826; X86-SSE-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
2827; X86-SSE-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
2828; X86-SSE-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
2829; X86-SSE-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
2830; X86-SSE-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
2831; X86-SSE-NEXT:    movl (%esp), %ecx # encoding: [0x8b,0x0c,0x24]
2832; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
2833; X86-SSE-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
2834; X86-SSE-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
2835; X86-SSE-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
2836; X86-SSE-NEXT:    popl %ebp # encoding: [0x5d]
2837; X86-SSE-NEXT:    retl # encoding: [0xc3]
2838;
2839; X86-AVX1-LABEL: test_mm_storel_ps:
2840; X86-AVX1:       # %bb.0:
2841; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2842; X86-AVX1-NEXT:    vmovlps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x13,0x00]
2843; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2844;
2845; X86-AVX512-LABEL: test_mm_storel_ps:
2846; X86-AVX512:       # %bb.0:
2847; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2848; X86-AVX512-NEXT:    vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
2849; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2850;
2851; X64-SSE-LABEL: test_mm_storel_ps:
2852; X64-SSE:       # %bb.0:
2853; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
2854; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
2855; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2856; X64-SSE-NEXT:    retq # encoding: [0xc3]
2857;
2858; X64-AVX1-LABEL: test_mm_storel_ps:
2859; X64-AVX1:       # %bb.0:
2860; X64-AVX1-NEXT:    vmovq %xmm0, %rax # encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
2861; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2862; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2863;
2864; X64-AVX512-LABEL: test_mm_storel_ps:
2865; X64-AVX512:       # %bb.0:
2866; X64-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
2867; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2868; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2869  %ptr = bitcast x86_mmx* %a0 to i64*
2870  %bc  = bitcast <4 x float> %a1 to <2 x i64>
2871  %ext = extractelement <2 x i64> %bc, i32 0
2872  store i64 %ext, i64* %ptr
2873  ret void
2874}
2875
2876define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
2877; X86-SSE-LABEL: test_mm_storer_ps:
2878; X86-SSE:       # %bb.0:
2879; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2880; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
2881; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
2882; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
2883; X86-SSE-NEXT:    retl # encoding: [0xc3]
2884;
2885; X86-AVX1-LABEL: test_mm_storer_ps:
2886; X86-AVX1:       # %bb.0:
2887; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2888; X86-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
2889; X86-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
2890; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
2891; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2892;
2893; X86-AVX512-LABEL: test_mm_storer_ps:
2894; X86-AVX512:       # %bb.0:
2895; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2896; X86-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
2897; X86-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
2898; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
2899; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2900;
2901; X64-SSE-LABEL: test_mm_storer_ps:
2902; X64-SSE:       # %bb.0:
2903; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
2904; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
2905; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
2906; X64-SSE-NEXT:    retq # encoding: [0xc3]
2907;
2908; X64-AVX1-LABEL: test_mm_storer_ps:
2909; X64-AVX1:       # %bb.0:
2910; X64-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
2911; X64-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
2912; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
2913; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2914;
2915; X64-AVX512-LABEL: test_mm_storer_ps:
2916; X64-AVX512:       # %bb.0:
2917; X64-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
2918; X64-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
2919; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
2920; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2921  %arg0 = bitcast float* %a0 to <4 x float>*
2922  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2923  store <4 x float> %shuf, <4 x float>* %arg0, align 16
2924  ret void
2925}
2926
2927define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
2928; X86-SSE-LABEL: test_mm_storeu_ps:
2929; X86-SSE:       # %bb.0:
2930; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2931; X86-SSE-NEXT:    movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00]
2932; X86-SSE-NEXT:    retl # encoding: [0xc3]
2933;
2934; X86-AVX1-LABEL: test_mm_storeu_ps:
2935; X86-AVX1:       # %bb.0:
2936; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2937; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00]
2938; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2939;
2940; X86-AVX512-LABEL: test_mm_storeu_ps:
2941; X86-AVX512:       # %bb.0:
2942; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2943; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
2944; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2945;
2946; X64-SSE-LABEL: test_mm_storeu_ps:
2947; X64-SSE:       # %bb.0:
2948; X64-SSE-NEXT:    movups %xmm0, (%rdi) # encoding: [0x0f,0x11,0x07]
2949; X64-SSE-NEXT:    retq # encoding: [0xc3]
2950;
2951; X64-AVX1-LABEL: test_mm_storeu_ps:
2952; X64-AVX1:       # %bb.0:
2953; X64-AVX1-NEXT:    vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07]
2954; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2955;
2956; X64-AVX512-LABEL: test_mm_storeu_ps:
2957; X64-AVX512:       # %bb.0:
2958; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
2959; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2960  %arg0 = bitcast float* %a0 to <4 x float>*
2961  store <4 x float> %a1, <4 x float>* %arg0, align 1
2962  ret void
2963}
2964
2965define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
2966; X86-SSE-LABEL: test_mm_stream_ps:
2967; X86-SSE:       # %bb.0:
2968; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2969; X86-SSE-NEXT:    movntps %xmm0, (%eax) # encoding: [0x0f,0x2b,0x00]
2970; X86-SSE-NEXT:    retl # encoding: [0xc3]
2971;
2972; X86-AVX1-LABEL: test_mm_stream_ps:
2973; X86-AVX1:       # %bb.0:
2974; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2975; X86-AVX1-NEXT:    vmovntps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x2b,0x00]
2976; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2977;
2978; X86-AVX512-LABEL: test_mm_stream_ps:
2979; X86-AVX512:       # %bb.0:
2980; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2981; X86-AVX512-NEXT:    vmovntps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x00]
2982; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2983;
2984; X64-SSE-LABEL: test_mm_stream_ps:
2985; X64-SSE:       # %bb.0:
2986; X64-SSE-NEXT:    movntps %xmm0, (%rdi) # encoding: [0x0f,0x2b,0x07]
2987; X64-SSE-NEXT:    retq # encoding: [0xc3]
2988;
2989; X64-AVX1-LABEL: test_mm_stream_ps:
2990; X64-AVX1:       # %bb.0:
2991; X64-AVX1-NEXT:    vmovntps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x2b,0x07]
2992; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2993;
2994; X64-AVX512-LABEL: test_mm_stream_ps:
2995; X64-AVX512:       # %bb.0:
2996; X64-AVX512-NEXT:    vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07]
2997; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2998  %arg0 = bitcast float* %a0 to <4 x float>*
2999  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
3000  ret void
3001}
3002
3003define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
3004; SSE-LABEL: test_mm_sub_ps:
3005; SSE:       # %bb.0:
3006; SSE-NEXT:    subps %xmm1, %xmm0 # encoding: [0x0f,0x5c,0xc1]
3007; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3008;
3009; AVX1-LABEL: test_mm_sub_ps:
3010; AVX1:       # %bb.0:
3011; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5c,0xc1]
3012; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3013;
3014; AVX512-LABEL: test_mm_sub_ps:
3015; AVX512:       # %bb.0:
3016; AVX512-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5c,0xc1]
3017; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3018  %res = fsub <4 x float> %a0, %a1
3019  ret <4 x float> %res
3020}
3021
3022define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3023; SSE-LABEL: test_mm_sub_ss:
3024; SSE:       # %bb.0:
3025; SSE-NEXT:    subss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5c,0xc1]
3026; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3027;
3028; AVX1-LABEL: test_mm_sub_ss:
3029; AVX1:       # %bb.0:
3030; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5c,0xc1]
3031; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3032;
3033; AVX512-LABEL: test_mm_sub_ss:
3034; AVX512:       # %bb.0:
3035; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5c,0xc1]
3036; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3037  %ext0 = extractelement <4 x float> %a0, i32 0
3038  %ext1 = extractelement <4 x float> %a1, i32 0
3039  %fsub = fsub float %ext0, %ext1
3040  %res = insertelement <4 x float> %a0, float %fsub, i32 0
3041  ret <4 x float> %res
3042}
3043
3044define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
3045; X86-SSE-LABEL: test_MM_TRANSPOSE4_PS:
3046; X86-SSE:       # %bb.0:
3047; X86-SSE-NEXT:    pushl %esi # encoding: [0x56]
3048; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
3049; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
3050; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
3051; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
3052; X86-SSE-NEXT:    movaps (%esi), %xmm0 # encoding: [0x0f,0x28,0x06]
3053; X86-SSE-NEXT:    movaps (%edx), %xmm1 # encoding: [0x0f,0x28,0x0a]
3054; X86-SSE-NEXT:    movaps (%ecx), %xmm2 # encoding: [0x0f,0x28,0x11]
3055; X86-SSE-NEXT:    movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18]
3056; X86-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
3057; X86-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
3058; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3059; X86-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
3060; X86-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
3061; X86-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
3062; X86-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
3063; X86-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3064; X86-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
3065; X86-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3066; X86-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
3067; X86-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
3068; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
3069; X86-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
3070; X86-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
3071; X86-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
3072; X86-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
3073; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
3074; X86-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
3075; X86-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
3076; X86-SSE-NEXT:    movaps %xmm1, (%esi) # encoding: [0x0f,0x29,0x0e]
3077; X86-SSE-NEXT:    movaps %xmm5, (%edx) # encoding: [0x0f,0x29,0x2a]
3078; X86-SSE-NEXT:    movaps %xmm3, (%ecx) # encoding: [0x0f,0x29,0x19]
3079; X86-SSE-NEXT:    movaps %xmm2, (%eax) # encoding: [0x0f,0x29,0x10]
3080; X86-SSE-NEXT:    popl %esi # encoding: [0x5e]
3081; X86-SSE-NEXT:    retl # encoding: [0xc3]
3082;
3083; X86-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
3084; X86-AVX1:       # %bb.0:
3085; X86-AVX1-NEXT:    pushl %esi # encoding: [0x56]
3086; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
3087; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
3088; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
3089; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
3090; X86-AVX1-NEXT:    vmovaps (%esi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x06]
3091; X86-AVX1-NEXT:    vmovaps (%edx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
3092; X86-AVX1-NEXT:    vmovaps (%ecx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x11]
3093; X86-AVX1-NEXT:    vmovaps (%eax), %xmm3 # encoding: [0xc5,0xf8,0x28,0x18]
3094; X86-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
3095; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3096; X86-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
3097; X86-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3098; X86-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
3099; X86-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3100; X86-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
3101; X86-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3102; X86-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
3103; X86-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
3104; X86-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
3105; X86-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
3106; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
3107; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
3108; X86-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
3109; X86-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
3110; X86-AVX1-NEXT:    vmovaps %xmm2, (%esi) # encoding: [0xc5,0xf8,0x29,0x16]
3111; X86-AVX1-NEXT:    vmovaps %xmm3, (%edx) # encoding: [0xc5,0xf8,0x29,0x1a]
3112; X86-AVX1-NEXT:    vmovaps %xmm4, (%ecx) # encoding: [0xc5,0xf8,0x29,0x21]
3113; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
3114; X86-AVX1-NEXT:    popl %esi # encoding: [0x5e]
3115; X86-AVX1-NEXT:    retl # encoding: [0xc3]
3116;
3117; X86-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
3118; X86-AVX512:       # %bb.0:
3119; X86-AVX512-NEXT:    pushl %esi # encoding: [0x56]
3120; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
3121; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
3122; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
3123; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
3124; X86-AVX512-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
3125; X86-AVX512-NEXT:    vmovaps (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0a]
3126; X86-AVX512-NEXT:    vmovaps (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x11]
3127; X86-AVX512-NEXT:    vmovaps (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x18]
3128; X86-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
3129; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3130; X86-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
3131; X86-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3132; X86-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
3133; X86-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3134; X86-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
3135; X86-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3136; X86-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
3137; X86-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
3138; X86-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
3139; X86-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
3140; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
3141; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
3142; X86-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
3143; X86-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
3144; X86-AVX512-NEXT:    vmovaps %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x16]
3145; X86-AVX512-NEXT:    vmovaps %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1a]
3146; X86-AVX512-NEXT:    vmovaps %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x21]
3147; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
3148; X86-AVX512-NEXT:    popl %esi # encoding: [0x5e]
3149; X86-AVX512-NEXT:    retl # encoding: [0xc3]
3150;
3151; X64-SSE-LABEL: test_MM_TRANSPOSE4_PS:
3152; X64-SSE:       # %bb.0:
3153; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
3154; X64-SSE-NEXT:    movaps (%rsi), %xmm1 # encoding: [0x0f,0x28,0x0e]
3155; X64-SSE-NEXT:    movaps (%rdx), %xmm2 # encoding: [0x0f,0x28,0x12]
3156; X64-SSE-NEXT:    movaps (%rcx), %xmm3 # encoding: [0x0f,0x28,0x19]
3157; X64-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
3158; X64-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
3159; X64-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3160; X64-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
3161; X64-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
3162; X64-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
3163; X64-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
3164; X64-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3165; X64-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
3166; X64-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3167; X64-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
3168; X64-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
3169; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
3170; X64-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
3171; X64-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
3172; X64-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
3173; X64-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
3174; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
3175; X64-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
3176; X64-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
3177; X64-SSE-NEXT:    movaps %xmm1, (%rdi) # encoding: [0x0f,0x29,0x0f]
3178; X64-SSE-NEXT:    movaps %xmm5, (%rsi) # encoding: [0x0f,0x29,0x2e]
3179; X64-SSE-NEXT:    movaps %xmm3, (%rdx) # encoding: [0x0f,0x29,0x1a]
3180; X64-SSE-NEXT:    movaps %xmm2, (%rcx) # encoding: [0x0f,0x29,0x11]
3181; X64-SSE-NEXT:    retq # encoding: [0xc3]
3182;
3183; X64-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
3184; X64-AVX1:       # %bb.0:
3185; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
3186; X64-AVX1-NEXT:    vmovaps (%rsi), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0e]
3187; X64-AVX1-NEXT:    vmovaps (%rdx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x12]
3188; X64-AVX1-NEXT:    vmovaps (%rcx), %xmm3 # encoding: [0xc5,0xf8,0x28,0x19]
3189; X64-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
3190; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3191; X64-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
3192; X64-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3193; X64-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
3194; X64-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3195; X64-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
3196; X64-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3197; X64-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
3198; X64-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
3199; X64-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
3200; X64-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
3201; X64-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
3202; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
3203; X64-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
3204; X64-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
3205; X64-AVX1-NEXT:    vmovaps %xmm2, (%rdi) # encoding: [0xc5,0xf8,0x29,0x17]
3206; X64-AVX1-NEXT:    vmovaps %xmm3, (%rsi) # encoding: [0xc5,0xf8,0x29,0x1e]
3207; X64-AVX1-NEXT:    vmovaps %xmm4, (%rdx) # encoding: [0xc5,0xf8,0x29,0x22]
3208; X64-AVX1-NEXT:    vmovaps %xmm0, (%rcx) # encoding: [0xc5,0xf8,0x29,0x01]
3209; X64-AVX1-NEXT:    retq # encoding: [0xc3]
3210;
3211; X64-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
3212; X64-AVX512:       # %bb.0:
3213; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
3214; X64-AVX512-NEXT:    vmovaps (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0e]
3215; X64-AVX512-NEXT:    vmovaps (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x12]
3216; X64-AVX512-NEXT:    vmovaps (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x19]
3217; X64-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
3218; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3219; X64-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
3220; X64-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3221; X64-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
3222; X64-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3223; X64-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
3224; X64-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3225; X64-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
3226; X64-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
3227; X64-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
3228; X64-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
3229; X64-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
3230; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
3231; X64-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
3232; X64-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
3233; X64-AVX512-NEXT:    vmovaps %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x17]
3234; X64-AVX512-NEXT:    vmovaps %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1e]
3235; X64-AVX512-NEXT:    vmovaps %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x22]
3236; X64-AVX512-NEXT:    vmovaps %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x01]
3237; X64-AVX512-NEXT:    retq # encoding: [0xc3]
3238  %row0 = load <4 x float>, <4 x float>* %a0, align 16
3239  %row1 = load <4 x float>, <4 x float>* %a1, align 16
3240  %row2 = load <4 x float>, <4 x float>* %a2, align 16
3241  %row3 = load <4 x float>, <4 x float>* %a3, align 16
3242  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3243  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3244  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3245  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3246  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
3247  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
3248  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
3249  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
3250  store <4 x float> %res0, <4 x float>* %a0, align 16
3251  store <4 x float> %res1, <4 x float>* %a1, align 16
3252  store <4 x float> %res2, <4 x float>* %a2, align 16
3253  store <4 x float> %res3, <4 x float>* %a3, align 16
3254  ret void
3255}
3256
3257define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3258; SSE-LABEL: test_mm_ucomieq_ss:
3259; SSE:       # %bb.0:
3260; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3261; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3262; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3263; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3264; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3265; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3266;
3267; AVX1-LABEL: test_mm_ucomieq_ss:
3268; AVX1:       # %bb.0:
3269; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3270; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3271; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3272; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3273; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3274; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3275;
3276; AVX512-LABEL: test_mm_ucomieq_ss:
3277; AVX512:       # %bb.0:
3278; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3279; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3280; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3281; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3282; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3283; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3284  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
3285  ret i32 %res
3286}
3287declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
3288
3289define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3290; SSE-LABEL: test_mm_ucomige_ss:
3291; SSE:       # %bb.0:
3292; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3293; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3294; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3295; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3296;
3297; AVX1-LABEL: test_mm_ucomige_ss:
3298; AVX1:       # %bb.0:
3299; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3300; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3301; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3302; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3303;
3304; AVX512-LABEL: test_mm_ucomige_ss:
3305; AVX512:       # %bb.0:
3306; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3307; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3308; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3309; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3310  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
3311  ret i32 %res
3312}
3313declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
3314
3315define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3316; SSE-LABEL: test_mm_ucomigt_ss:
3317; SSE:       # %bb.0:
3318; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3319; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3320; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3321; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3322;
3323; AVX1-LABEL: test_mm_ucomigt_ss:
3324; AVX1:       # %bb.0:
3325; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3326; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3327; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3328; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3329;
3330; AVX512-LABEL: test_mm_ucomigt_ss:
3331; AVX512:       # %bb.0:
3332; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3333; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3334; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3335; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3336  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
3337  ret i32 %res
3338}
3339declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
3340
3341define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3342; SSE-LABEL: test_mm_ucomile_ss:
3343; SSE:       # %bb.0:
3344; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3345; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
3346; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3347; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3348;
3349; AVX1-LABEL: test_mm_ucomile_ss:
3350; AVX1:       # %bb.0:
3351; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3352; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
3353; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3354; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3355;
3356; AVX512-LABEL: test_mm_ucomile_ss:
3357; AVX512:       # %bb.0:
3358; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3359; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
3360; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3361; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3362  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
3363  ret i32 %res
3364}
3365declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
3366
3367define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3368; SSE-LABEL: test_mm_ucomilt_ss:
3369; SSE:       # %bb.0:
3370; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3371; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
3372; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3373; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3374;
3375; AVX1-LABEL: test_mm_ucomilt_ss:
3376; AVX1:       # %bb.0:
3377; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3378; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
3379; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3380; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3381;
3382; AVX512-LABEL: test_mm_ucomilt_ss:
3383; AVX512:       # %bb.0:
3384; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3385; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
3386; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3387; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3388  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
3389  ret i32 %res
3390}
3391declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
3392
3393define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3394; SSE-LABEL: test_mm_ucomineq_ss:
3395; SSE:       # %bb.0:
3396; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3397; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
3398; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
3399; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
3400; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3401; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3402;
3403; AVX1-LABEL: test_mm_ucomineq_ss:
3404; AVX1:       # %bb.0:
3405; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3406; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
3407; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
3408; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
3409; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3410; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3411;
3412; AVX512-LABEL: test_mm_ucomineq_ss:
3413; AVX512:       # %bb.0:
3414; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3415; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
3416; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
3417; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
3418; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3419; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3420  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
3421  ret i32 %res
3422}
3423declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
3424
3425define <4 x float> @test_mm_undefined_ps() {
3426; CHECK-LABEL: test_mm_undefined_ps:
3427; CHECK:       # %bb.0:
3428; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3429  ret <4 x float> undef
3430}
3431
3432define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
3433; SSE-LABEL: test_mm_unpackhi_ps:
3434; SSE:       # %bb.0:
3435; SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
3436; SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3437; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3438;
3439; AVX1-LABEL: test_mm_unpackhi_ps:
3440; AVX1:       # %bb.0:
3441; AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
3442; AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3443; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3444;
3445; AVX512-LABEL: test_mm_unpackhi_ps:
3446; AVX512:       # %bb.0:
3447; AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
3448; AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3449; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3450  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3451  ret <4 x float> %res
3452}
3453
3454define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
3455; SSE-LABEL: test_mm_unpacklo_ps:
3456; SSE:       # %bb.0:
3457; SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
3458; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3459; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3460;
3461; AVX1-LABEL: test_mm_unpacklo_ps:
3462; AVX1:       # %bb.0:
3463; AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x14,0xc1]
3464; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3465; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3466;
3467; AVX512-LABEL: test_mm_unpacklo_ps:
3468; AVX512:       # %bb.0:
3469; AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1]
3470; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3471; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3472  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3473  ret <4 x float> %res
3474}
3475
3476define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
3477; SSE-LABEL: test_mm_xor_ps:
3478; SSE:       # %bb.0:
3479; SSE-NEXT:    xorps %xmm1, %xmm0 # encoding: [0x0f,0x57,0xc1]
3480; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3481;
3482; AVX1-LABEL: test_mm_xor_ps:
3483; AVX1:       # %bb.0:
3484; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc1]
3485; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3486;
3487; AVX512-LABEL: test_mm_xor_ps:
3488; AVX512:       # %bb.0:
3489; AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1]
3490; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3491  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
3492  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
3493  %res = xor <4 x i32> %arg0, %arg1
3494  %bc = bitcast <4 x i32> %res to <4 x float>
3495  ret <4 x float> %bc
3496}
3497
3498!0 = !{i32 1}
3499